孤不冷cing 发表于 2022-5-26 11:22:39

crawl函数中的response.json()一直出错

import requests
import time
import random
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client["spider"]
collection = db["lagou"]

def get_cookie():#因为Cookies是有时效性的,每隔一段时间Cookies就会动态地发生变化,所以每次都刷新cookie
    headers = {
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5057.3 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    }#未登录时的headers截选
   
   
    response = requests.get(
       'https://www.lagou.com/jobs/list_?city=%E4%B8%8A%E6%B5%B7&cl=false&fromSearch=true&labelWords=&suginput=',
      headers=headers,
      allow_redirects=False)# 请求原网页
    r = requests.utils.dict_from_cookiejar(response.cookies)# 获取未登录之前的cookies,并将其保存为字典形式

   
    #结尾对cookie重编码一下,防止中文乱码导致报错
    cookie='user_trace_token=20220524200705-fcb14498-7140-410b-a825-0e170e7bf93f; __lg_stoken__=274bca4300b5a592488b893dce0320355daa048bc99c19344d7ba4da54ef5411dcdf936ec1c088bdcb6a6ad767aa32ec9f9107fd6f50cdbe6bff10b150adee66e9564397a5a5; LGUID=20220524200707-7d49e914-e123-4ffa-817e-c7a98e15945f; _ga=GA1.2.1104908297.1653394025; RECOMMEND_TIP=true; index_location_city=全国; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1653394025,1653489017; _gid=GA1.2.1818049825.1653489017; sensorsdata2015session={}; gate_login_token=c92112d49c380c66a1517e71f92618c1a815d48c336cf06cc656257c6632aab5; LG_HAS_LOGIN=1; _putrc=03D4C75C01DFF59B123F89F2B170EADC; JSESSIONID=ABAAAECABFAACEABF7DDC7FFFCDD2F02CBE6E81D2A34E7A; login=true; hasDeliver=0; privacyPolicyPopup=false; WEBTJ-ID=20220525223618-180fba5a6b34e4-046eb2db15b7c8-4c607a68-3686400-180fba5a6b48a0; X_HTTP_TOKEN=15f915f7e9cfccbd862094356154d166aab6732e2b; unick=用户4184; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; PRE_UTM=; PRE_HOST=; PRE_LAND=https://www.lagou.com/; LGSID=20220526082739-36277dac-b6f8-4192-bc56-9e5c75c7e58a; PRE_SITE=; __SAFETY_CLOSE_TIME__24627981=1; TG-TRACK-CODE=index_zhaopin; SEARCH_ID=cdbc23c678b9496e8d398b7fef622572; sensorsdata2015jssdkcross={"distinct_id":"24627981","$device_id":"180f5f6a88e42c-09c9b8d39ef2b1-4c607a68-3686400-180f5f6a8904f6","props":{"$latest_traffic_source_type":"直接流量","$latest_referrer":"","$latest_search_keyword":"未取到值_直接打开","$os":"Windows","$browser":"Chrome","$browser_version":"96.0.4664.110","$latest_referrer_host":""},"first_id":"180f5f6a88e42c-09c9b8d39ef2b1-4c607a68-3686400-180f5f6a8904f6"}; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1653524890; LGRID=20220526082814-074bd393-cec1-42fd-a11a-43b48c0c3ffb'.encode("utf-8").decode("latin1")
    cookies=dict()#手动添加登录之后的cookies
   
   
    cookies.update(r)# 更新接口的cookies
    return cookies

def crawl(city = "", pn = 1, cookies = None):
    headers = {
      'Origin': 'https://www.lagou.com',
      'X-Anit-Forge-Code': '0',
      'Accept-Encoding': 'gzip, deflate, br',
      'Accept-Language': 'zh-CN,zh,q=0.9,en,q=0.8',
      'User-Agent': 'Mozilla/5.0 (Windows NT 6.1, Win64, x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
      'Content-Type': 'application/x-www-form-urlencoded, charset=UTF-8',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
      'Referer': 'https://www.lagou.com/jobs/list_java?px=new&city=%E4%B8%8A%E6%B5%B7'.encode("utf-8").decode("latin1"),#重编码一下,防止中文乱码导致报错
      'X-Requested-With': 'XMLHttpRequest',
      'Connection': 'keep-alive',
      'X-Anit-Forge-Token': 'None',
    }

    params = (
      ('px', 'default'),
      ('city',city),
      ('needAddtionalResult', 'false'),
    )

    data = {"first":"true",
            'kd': '数据分析'#岗位名称,
            'pn': pn#pn是第几页
         }
    #if pn>1:
    #    data["first"] = "false"
    response = requests.post('https://www.lagou.com/jobs/positionAjax.json', headers=headers, params=params,
                           cookies=cookies, data=data)# 请求接口
    response.encoding='utf-8'
    return response.json()

city_list = ["北京","上海","深圳","广州","杭州","成都","南京","武汉","西安","厦门","长沙","苏州","天津"]

for city in city_list:
    print("*"*60)
    print("{city} start".format(city=city))
    for i in range(1,31):
      if (i-1)%5==0:
            cookies = get_cookie()
      time.sleep(random.random()+random.randint(1,2))
      response_json = crawl(city=city,pn=i,cookies=cookies)
      try:
            position_list = response_json["content"]["positionResulft"]["result"]
            print(position_list)
      except:
            print(response_json)
      if len(position_list)<1:
            print("{city} start".format(city=city))
            print("*"*60)
            break
      collection.insert_many(position_list)
      print(cookies)
    print("{city} end".format(city=city))
    print("*"*60)
头晕了已经,看了三小时很糊涂

suchocolate 发表于 2022-5-27 00:09:26

报错贴一下,print看看返回的数据是不是json。
页: [1]
查看完整版本: crawl函数中的response.json()一直出错