import requests
import time
import random
from pymongo import MongoClient
# MongoDB connection used to persist the scraped job postings.
client = MongoClient(host='localhost', port=27017)
db = client['spider']        # target database
collection = db['lagou']     # collection holding Lagou position records
def get_cookie():
    """Build a usable cookie dict for the Lagou search API.

    Lagou's cookies are short-lived and rotate, so each refresh hits the
    public search page anonymously to obtain fresh session cookies, then
    overlays them on a hard-coded logged-in cookie string.

    Returns:
        dict: cookie name -> value, suitable for ``requests`` calls.
    """
    anon_headers = {  # excerpt of the headers a logged-out browser sends
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5057.3 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    # Fetch the plain search page without following redirects to pick up
    # fresh anonymous session cookies.
    resp = requests.get(
        'https://www.lagou.com/jobs/list_?city=%E4%B8%8A%E6%B5%B7&cl=false&fromSearch=true&labelWords=&suginput=',
        headers=anon_headers,
        allow_redirects=False)
    fresh = requests.utils.dict_from_cookiejar(resp.cookies)
    # Saved logged-in cookie string; re-encoded utf-8 -> latin1 at the end so
    # the non-ASCII characters inside it don't break header transmission.
    raw = 'user_trace_token=20220524200705-fcb14498-7140-410b-a825-0e170e7bf93f; __lg_stoken__=274bca4300b5a592488b893dce0320355daa048bc99c19344d7ba4da54ef5411dcdf936ec1c088bdcb6a6ad767aa32ec9f9107fd6f50cdbe6bff10b150adee66e9564397a5a5; LGUID=20220524200707-7d49e914-e123-4ffa-817e-c7a98e15945f; _ga=GA1.2.1104908297.1653394025; RECOMMEND_TIP=true; index_location_city=全国; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1653394025,1653489017; _gid=GA1.2.1818049825.1653489017; sensorsdata2015session={}; gate_login_token=c92112d49c380c66a1517e71f92618c1a815d48c336cf06cc656257c6632aab5; LG_HAS_LOGIN=1; _putrc=03D4C75C01DFF59B123F89F2B170EADC; JSESSIONID=ABAAAECABFAACEABF7DDC7FFFCDD2F02CBE6E81D2A34E7A; login=true; hasDeliver=0; privacyPolicyPopup=false; WEBTJ-ID=20220525223618-180fba5a6b34e4-046eb2db15b7c8-4c607a68-3686400-180fba5a6b48a0; X_HTTP_TOKEN=15f915f7e9cfccbd862094356154d166aab6732e2b; unick=用户4184; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; PRE_UTM=; PRE_HOST=; PRE_LAND=https://www.lagou.com/; LGSID=20220526082739-36277dac-b6f8-4192-bc56-9e5c75c7e58a; PRE_SITE=; __SAFETY_CLOSE_TIME__24627981=1; TG-TRACK-CODE=index_zhaopin; SEARCH_ID=cdbc23c678b9496e8d398b7fef622572; sensorsdata2015jssdkcross={"distinct_id":"24627981","$device_id":"180f5f6a88e42c-09c9b8d39ef2b1-4c607a68-3686400-180f5f6a8904f6","props":{"$latest_traffic_source_type":"直接流量","$latest_referrer":"","$latest_search_keyword":"未取到值_直接打开","$os":"Windows","$browser":"Chrome","$browser_version":"96.0.4664.110","$latest_referrer_host":""},"first_id":"180f5f6a88e42c-09c9b8d39ef2b1-4c607a68-3686400-180f5f6a8904f6"}; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1653524890; LGRID=20220526082814-074bd393-cec1-42fd-a11a-43b48c0c3ffb'.encode("utf-8").decode("latin1")
    # Parse the manually captured logged-in cookies into a dict …
    merged = {name: value
              for name, value in (pair.split("=", 1) for pair in raw.split("; "))}
    merged.update(fresh)  # … then let the fresh anonymous tokens win
    return merged
def crawl(city="", pn=1, cookies=None, kd='数据分析'):
    """Query one page of Lagou's positionAjax.json search API.

    Args:
        city: city name filter (empty string = no city filter).
        pn: 1-based page number.
        cookies: cookie dict (see ``get_cookie``); effectively required,
            otherwise the API answers with an anti-crawler payload.
        kd: job keyword to search for (new, backward-compatible
            parameter; previously hard-coded to '数据分析').

    Returns:
        dict: the parsed JSON body of the API response.
    """
    headers = {
        'Origin': 'https://www.lagou.com',
        'X-Anit-Forge-Code': '0',
        'Accept-Encoding': 'gzip, deflate, br',
        # FIX: quality parameters are separated by ';', not ',' —
        # the original 'zh-CN,zh,q=0.9,en,q=0.8' was a malformed header.
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        # FIX: platform tokens in a UA string are ';'-separated.
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
        # FIX: media-type parameters use ';' — ', charset=UTF-8' was malformed.
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        # Re-encode so the percent-escaped Chinese in the referer survives
        # header transmission without a UnicodeEncodeError.
        'Referer': 'https://www.lagou.com/jobs/list_java?px=new&city=%E4%B8%8A%E6%B5%B7'.encode("utf-8").decode("latin1"),
        'X-Requested-With': 'XMLHttpRequest',
        'Connection': 'keep-alive',
        'X-Anit-Forge-Token': 'None',
    }
    params = (
        ('px', 'default'),
        ('city', city),
        ('needAddtionalResult', 'false'),
    )
    # FIX: in the original, the comma after 'kd' was swallowed by an
    # inline comment, making this dict a SyntaxError.
    data = {
        'first': 'true',
        'kd': kd,   # job keyword
        'pn': pn,   # page number
    }
    # Hit the search API with both the query params and the form payload.
    response = requests.post('https://www.lagou.com/jobs/positionAjax.json',
                             headers=headers, params=params,
                             cookies=cookies, data=data)
    response.encoding = 'utf-8'
    return response.json()
# Cities to crawl; for each city walk up to 30 result pages.
city_list = ["北京","上海","深圳","广州","杭州","成都","南京","武汉","西安","厦门","长沙","苏州","天津"]
for city in city_list:
    print("*"*60)
    print("{city} start".format(city=city))
    for i in range(1, 31):
        # Cookies expire quickly, so refresh them every 5 pages.
        if (i-1) % 5 == 0:
            cookies = get_cookie()
        # Small randomized delay between requests to look less bot-like.
        time.sleep(random.random() + random.randint(1, 2))
        response_json = crawl(city=city, pn=i, cookies=cookies)
        try:
            # FIX: the key was misspelled "positionResulft" in the original.
            position_list = response_json["content"]["positionResult"]["result"]
            print(position_list)
        except (KeyError, TypeError):
            # Anti-crawler payload or unexpected shape: log it and skip this
            # page. (The original fell through and read `position_list`
            # while it was unbound or stale — a NameError / wrong data.)
            print(response_json)
            continue
        if len(position_list) < 1:
            # Empty page means the city is exhausted.
            # FIX: originally printed "... start" here; "end" was meant.
            print("{city} end".format(city=city))
            print("*"*60)
            break
        collection.insert_many(position_list)
        print(cookies)
    print("{city} end".format(city=city))
    print("*"*60)
# (Stray author note removed — it was a bare non-code line and a SyntaxError:
#  "头晕了已经,看了三小时很糊涂" = "dizzy already, three hours of staring".)