鱼C论坛

 找回密码
 立即注册
查看: 1225|回复: 1

crawl函数中的response.json()一直出错

[复制链接]
发表于 2022-5-26 11:22:39 | 显示全部楼层 |阅读模式
60鱼币
import requests
import time
import random
from pymongo import MongoClient

# Connection to the local MongoDB server; scraped positions are stored in
# the "lagou" collection of the "spider" database.
client = MongoClient(host='localhost', port=27017)
db = client.get_database("spider")
collection = db.get_collection("lagou")

def get_cookie():
    """Return the cookie dict to send with the job-search API request.

    Cookies expire and change over time, so this is meant to be called
    repeatedly: each call fetches the listing page anonymously, collects
    the cookies the server sets, and merges them over a hand-pasted cookie
    string captured from a logged-in browser session. When both sources
    define the same cookie name, the freshly fetched value wins.

    Returns:
        dict mapping cookie name to cookie value.
    """
    # Header subset copied from a not-logged-in browser session.
    browser_headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5057.3 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }

    # Request the original listing page without following redirects, then
    # convert the cookies it set into a plain dict.
    landing_page = requests.get(
        'https://www.lagou.com/jobs/list_?city=%E4%B8%8A%E6%B5%B7&cl=false&fromSearch=true&labelWords=&suginput=',
        headers=browser_headers,
        allow_redirects=False,
    )
    anonymous_cookies = requests.utils.dict_from_cookiejar(landing_page.cookies)

    # Manually pasted post-login cookie header. The trailing
    # encode/decode round-trip re-expresses the non-ASCII characters as
    # latin-1 text (per the original author: to avoid errors caused by the
    # Chinese characters).
    raw_login_cookie = 'user_trace_token=20220524200705-fcb14498-7140-410b-a825-0e170e7bf93f; __lg_stoken__=274bca4300b5a592488b893dce0320355daa048bc99c19344d7ba4da54ef5411dcdf936ec1c088bdcb6a6ad767aa32ec9f9107fd6f50cdbe6bff10b150adee66e9564397a5a5; LGUID=20220524200707-7d49e914-e123-4ffa-817e-c7a98e15945f; _ga=GA1.2.1104908297.1653394025; RECOMMEND_TIP=true; index_location_city=全国; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1653394025,1653489017; _gid=GA1.2.1818049825.1653489017; sensorsdata2015session={}; gate_login_token=c92112d49c380c66a1517e71f92618c1a815d48c336cf06cc656257c6632aab5; LG_HAS_LOGIN=1; _putrc=03D4C75C01DFF59B123F89F2B170EADC; JSESSIONID=ABAAAECABFAACEABF7DDC7FFFCDD2F02CBE6E81D2A34E7A; login=true; hasDeliver=0; privacyPolicyPopup=false; WEBTJ-ID=20220525223618-180fba5a6b34e4-046eb2db15b7c8-4c607a68-3686400-180fba5a6b48a0; X_HTTP_TOKEN=15f915f7e9cfccbd862094356154d166aab6732e2b; unick=用户4184; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; PRE_UTM=; PRE_HOST=; PRE_LAND=https://www.lagou.com/; LGSID=20220526082739-36277dac-b6f8-4192-bc56-9e5c75c7e58a; PRE_SITE=; __SAFETY_CLOSE_TIME__24627981=1; TG-TRACK-CODE=index_zhaopin; SEARCH_ID=cdbc23c678b9496e8d398b7fef622572; sensorsdata2015jssdkcross={"distinct_id":"24627981","$device_id":"180f5f6a88e42c-09c9b8d39ef2b1-4c607a68-3686400-180f5f6a8904f6","props":{"$latest_traffic_source_type":"直接流量","$latest_referrer":"","$latest_search_keyword":"未取到值_直接打开","$os":"Windows","$browser":"Chrome","$browser_version":"96.0.4664.110","$latest_referrer_host":""},"first_id":"180f5f6a88e42c-09c9b8d39ef2b1-4c607a68-3686400-180f5f6a8904f6"}; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1653524890; LGRID=20220526082814-074bd393-cec1-42fd-a11a-43b48c0c3ffb'.encode("utf-8").decode("latin1")

    # Split "name=value; name=value; ..." into a dict; only the first "="
    # separates name from value, so values may themselves contain "=".
    merged = {}
    for pair in raw_login_cookie.split("; "):
        name, value = pair.split("=", 1)
        merged[name] = value

    # Overlay the freshly fetched cookies on top of the pasted ones.
    merged.update(anonymous_cookies)
    return merged

def crawl(city="", pn=1, cookies=None):
    """Fetch one page of search results from the positionAjax.json endpoint.

    Args:
        city: City name, sent as the ``city`` query parameter.
        pn: Page number, sent as the ``pn`` form field.
        cookies: Cookies to send with the request (e.g. the dict returned
            by ``get_cookie``); ``None`` sends no extra cookies.

    Returns:
        The decoded JSON body of the response.

    Raises:
        ValueError: If the response body is not valid JSON. The message
            includes the HTTP status code and the beginning of the body so
            the actual server reply can be inspected.
    """
    headers = {
        'Origin': 'https://www.lagou.com',
        'X-Anit-Forge-Code': '0',
        'Accept-Encoding': 'gzip, deflate, br',
        # Quality values are introduced by ";" (the original had ",").
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
        # Media-type parameters are separated by ";" (the original had ",").
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        # Already percent-encoded and pure ASCII, so no re-encoding is needed.
        'Referer': 'https://www.lagou.com/jobs/list_java?px=new&city=%E4%B8%8A%E6%B5%B7',
        'X-Requested-With': 'XMLHttpRequest',
        'Connection': 'keep-alive',
        'X-Anit-Forge-Token': 'None',
    }

    params = (
        ('px', 'default'),
        ('city', city),
        ('needAddtionalResult', 'false'),
    )

    data = {
        'first': 'true',
        'kd': '数据分析',  # job title keyword
        'pn': pn,  # page number
    }
    # NOTE(review): the original carried a disabled snippet that set
    # data["first"] = "false" for pn > 1 -- confirm whether the endpoint
    # needs it before enabling.

    response = requests.post(
        'https://www.lagou.com/jobs/positionAjax.json',
        headers=headers,
        params=params,
        cookies=cookies,
        data=data,
    )
    response.encoding = 'utf-8'

    try:
        return response.json()
    except ValueError as err:
        # Surface what the server actually returned instead of an opaque
        # JSON decode error.
        raise ValueError(
            "positionAjax.json did not return JSON (HTTP {status}): {body!r}".format(
                status=response.status_code,
                body=response.text[:200],
            )
        ) from err

# Cities to crawl, in order.
city_list = ["北京","上海","深圳","广州","杭州","成都","南京","武汉","西安","厦门","长沙","苏州","天津"]

# For each city, walk result pages 1..30 and store every page of positions
# in MongoDB, stopping early for that city when a page is empty or the
# response cannot be used.
for city in city_list:
    print("*"*60)
    print("{city} start".format(city=city))
    for i in range(1,31):
        # Refresh the cookies on pages 1, 6, 11, ... because they expire.
        if (i-1)%5==0:
            cookies = get_cookie()
        # Random 1-3 second pause between requests.
        time.sleep(random.random()+random.randint(1,2))

        try:
            response_json = crawl(city=city,pn=i,cookies=cookies)
        except ValueError as err:
            # Body was not JSON: report it and give up on this city
            # instead of crashing the whole run.
            print(err)
            break

        try:
            position_list = response_json["content"]["positionResult"]["result"]
        except (KeyError, TypeError):
            # Payload lacks the expected structure: show it and stop this
            # city rather than continuing with an undefined or stale list.
            print(response_json)
            break
        print(position_list)

        if not position_list:
            print("{city}: no more results on page {page}".format(city=city, page=i))
            break
        collection.insert_many(position_list)
        # Debug output; note that this prints the session cookies.
        print(cookies)
    print("{city} end".format(city=city))
    print("*"*60)
已经头晕了,看了三个小时还是很糊涂

想知道小甲鱼最近在做啥?请访问 -> ilovefishc.com
回复

使用道具 举报

发表于 2022-5-27 00:09:26 | 显示全部楼层
请把完整的报错信息贴出来,并用 print 看看返回的数据是不是 JSON。
想知道小甲鱼最近在做啥?请访问 -> ilovefishc.com
回复

使用道具 举报

您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

小黑屋|手机版|Archiver|鱼C工作室 ( 粤ICP备18085999号-1 | 粤公网安备 44051102000585号)

GMT+8, 2025-1-11 12:45

Powered by Discuz! X3.4

© 2001-2023 Discuz! Team.

快速回复 返回顶部 返回列表