鱼C论坛

 找回密码
 立即注册
查看: 1053|回复: 1

crawl函数中的response.json()一直出错

[复制链接]
发表于 2022-5-26 11:22:39 | 显示全部楼层 |阅读模式
60鱼币
  1. import requests
  2. import time
  3. import random
  4. from pymongo import MongoClient

# MongoDB connection: scraped job postings are stored in spider.lagou.
client = MongoClient('localhost', 27017)  # local MongoDB on the default port
db = client["spider"]       # database for spider results
collection = db["lagou"]    # collection holding lagou.com job listings

def get_cookie():
    """Fetch fresh cookies for the lagou.com search API.

    lagou.com cookies are short-lived and change dynamically, so this is
    called periodically to refresh them.  Returns a dict that merges a
    manually captured logged-in cookie string with the anonymous cookies
    handed out by the listing page.
    """
    # Excerpt of request headers for an anonymous (not-logged-in) visit.
    headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5057.3 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }

    # Request the listing page itself; redirects disabled so we capture the
    # cookies set on the very first response.
    response = requests.get(
        'https://www.lagou.com/jobs/list_?city=%E4%B8%8A%E6%B5%B7&cl=false&fromSearch=true&labelWords=&suginput=',
        headers=headers,
        allow_redirects=False)
    # Pre-login cookies from the listing page, as a plain dict.
    r = requests.utils.dict_from_cookiejar(response.cookies)

    # Manually captured, logged-in cookie string.  The trailing
    # encode/decode round-trip re-encodes the Chinese characters so the
    # header value does not trigger an encoding error when sent.
    cookie='user_trace_token=20220524200705-fcb14498-7140-410b-a825-0e170e7bf93f; __lg_stoken__=274bca4300b5a592488b893dce0320355daa048bc99c19344d7ba4da54ef5411dcdf936ec1c088bdcb6a6ad767aa32ec9f9107fd6f50cdbe6bff10b150adee66e9564397a5a5; LGUID=20220524200707-7d49e914-e123-4ffa-817e-c7a98e15945f; _ga=GA1.2.1104908297.1653394025; RECOMMEND_TIP=true; index_location_city=全国; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1653394025,1653489017; _gid=GA1.2.1818049825.1653489017; sensorsdata2015session={}; gate_login_token=c92112d49c380c66a1517e71f92618c1a815d48c336cf06cc656257c6632aab5; LG_HAS_LOGIN=1; _putrc=03D4C75C01DFF59B123F89F2B170EADC; JSESSIONID=ABAAAECABFAACEABF7DDC7FFFCDD2F02CBE6E81D2A34E7A; login=true; hasDeliver=0; privacyPolicyPopup=false; WEBTJ-ID=20220525223618-180fba5a6b34e4-046eb2db15b7c8-4c607a68-3686400-180fba5a6b48a0; X_HTTP_TOKEN=15f915f7e9cfccbd862094356154d166aab6732e2b; unick=用户4184; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; PRE_UTM=; PRE_HOST=; PRE_LAND=https://www.lagou.com/; LGSID=20220526082739-36277dac-b6f8-4192-bc56-9e5c75c7e58a; PRE_SITE=; __SAFETY_CLOSE_TIME__24627981=1; TG-TRACK-CODE=index_zhaopin; SEARCH_ID=cdbc23c678b9496e8d398b7fef622572; sensorsdata2015jssdkcross={"distinct_id":"24627981","$device_id":"180f5f6a88e42c-09c9b8d39ef2b1-4c607a68-3686400-180f5f6a8904f6","props":{"$latest_traffic_source_type":"直接流量","$latest_referrer":"","$latest_search_keyword":"未取到值_直接打开","$os":"Windows","$browser":"Chrome","$browser_version":"96.0.4664.110","$latest_referrer_host":""},"first_id":"180f5f6a88e42c-09c9b8d39ef2b1-4c607a68-3686400-180f5f6a8904f6"}; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1653524890; LGRID=20220526082814-074bd393-cec1-42fd-a11a-43b48c0c3ffb'.encode("utf-8").decode("latin1")
    # Split "k=v; k=v; ..." into a dict (split on the FIRST '=' only, since
    # some values themselves contain '=').
    cookies=dict([l.split("=", 1) for l in cookie.split("; ")])

    # Overlay the freshly issued anonymous cookies on top of the captured
    # logged-in ones.  NOTE(review): the captured string is hard-coded and
    # will expire — it presumably needs to be re-captured periodically.
    cookies.update(r)
    return cookies

  33. def crawl(city = "", pn = 1, cookies = None):
  34.     headers = {
  35.         'Origin': 'https://www.lagou.com',
  36.         'X-Anit-Forge-Code': '0',
  37.         'Accept-Encoding': 'gzip, deflate, br',
  38.         'Accept-Language': 'zh-CN,zh,q=0.9,en,q=0.8',
  39.         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1, Win64, x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
  40.         'Content-Type': 'application/x-www-form-urlencoded, charset=UTF-8',
  41.         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
  42.         'Referer': 'https://www.lagou.com/jobs/list_java?px=new&city=%E4%B8%8A%E6%B5%B7'.encode("utf-8").decode("latin1"),  #重编码一下,防止中文乱码导致报错
  43.         'X-Requested-With': 'XMLHttpRequest',
  44.         'Connection': 'keep-alive',
  45.         'X-Anit-Forge-Token': 'None',
  46.     }

  47.     params = (
  48.         ('px', 'default'),
  49.         ('city',city),
  50.         ('needAddtionalResult', 'false'),
  51.     )

  52.     data = {"first":"true",
  53.             'kd': '数据分析'  #岗位名称,
  54.             'pn': pn  #pn是第几页
  55.            }
  56.     #if pn>1:
  57.     #    data["first"] = "false"
  58.     response = requests.post('https://www.lagou.com/jobs/positionAjax.json', headers=headers, params=params,
  59.                              cookies=cookies, data=data)  # 请求接口
  60.     response.encoding='utf-8'
  61.     return response.json()

  62. city_list = ["北京","上海","深圳","广州","杭州","成都","南京","武汉","西安","厦门","长沙","苏州","天津"]

  63. for city in city_list:
  64.     print("*"*60)
  65.     print("{city} start".format(city=city))
  66.     for i in range(1,31):
  67.         if (i-1)%5==0:
  68.             cookies = get_cookie()
  69.         time.sleep(random.random()+random.randint(1,2))
  70.         response_json = crawl(city=city,pn=i,cookies=cookies)
  71.         try:
  72.             position_list = response_json["content"]["positionResulft"]["result"]
  73.             print(position_list)
  74.         except:
  75.             print(response_json)
  76.         if len(position_list)<1:
  77.             print("{city} start".format(city=city))
  78.             print("*"*60)
  79.             break
  80.         collection.insert_many(position_list)
  81.         print(cookies)
  82.     print("{city} end".format(city=city))
  83.     print("*"*60)
复制代码

头晕了已经,看了三小时很糊涂

想知道小甲鱼最近在做啥?请访问 -> ilovefishc.com
回复

使用道具 举报

发表于 2022-5-27 00:09:26 | 显示全部楼层
报错贴一下,print看看返回的数据是不是json。
想知道小甲鱼最近在做啥?请访问 -> ilovefishc.com
回复

使用道具 举报

您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

小黑屋|手机版|Archiver|鱼C工作室 ( 粤ICP备18085999号-1 | 粤公网安备 44051102000585号)

GMT+8, 2024-5-2 09:00

Powered by Discuz! X3.4

© 2001-2023 Discuz! Team.

快速回复 返回顶部 返回列表