本帖最后由 wongyusing 于 2018-9-7 23:18 编辑
因为你没有拿到正确的数据,看我这里:
#import pymongo
import json
import time
import requests
#client=pymongo.MongoClient('localhost',27017)
#mydb=client['mydb']
#lagou=mydb['lagou']
# Default HTTP headers sent with every request made by this script.
headers = {
    # The standard request header is named "Cookie" (not "cookies").
    # "xxxxx" is a placeholder and must be replaced with a real cookie string.
    'Cookie': "xxxxx",
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
    ),
    # "keep-alive" is the valid token for a persistent connection.
    'Connection': 'keep-alive',
}
def get_page(url, params):
    """Fetch the first result page, work out the page count and crawl the pages.

    Args:
        url: Endpoint that returns the position listing as JSON.
        params: Request parameters for the first page.

    Returns:
        None. If the response carries no position data (for example the
        "too many requests" reply), the server message is printed and the
        function returns without crawling.
    """
    # NOTE(review): ``data=`` sends the parameters in the request body of a
    # GET. Confirm whether the endpoint expects ``params=`` (query string)
    # or a POST with form data instead.
    html = requests.get(url, data=params, headers=headers)
    json_data = json.loads(html.text)
    # Print the fetched data so that a blocked or failed response is visible.
    print(json_data)
    try:
        total_count = json_data['content']['positionResult']['totalCount']
    except (KeyError, TypeError):
        msg = json_data.get('msg') if isinstance(json_data, dict) else None
        print('No position data in response, aborting:', msg)
        return
    # The divisor 15 is the page size assumed by this script. Round up so a
    # trailing partial page is still fetched, and cap the crawl at 30 pages.
    page_number = min(-(-total_count // 15), 30)
    # Hand off to the per-page crawler (previously this recursed into
    # get_page with the page count passed as ``params``).
    get_info(url, page_number)
def get_info(url, page):
    """Crawl result pages 1..page and print selected fields of every position.

    Args:
        url: Endpoint that returns the position listing as JSON.
        page: Number of pages to request (pages are numbered from 1).

    Returns:
        None. A page whose request raises a connection error is reported and
        skipped. A response without position data stops the crawl.
    """
    # Keys copied from each raw result into the record that is printed.
    fields = (
        'businessZones',
        'city',
        'companyFullName',
        'companyLabelList',
        'companySize',
        'district',
        'education',
        'explain',
        'financeStage',
        'firstType',
        'formatCreateTime',
        'gradeDescription',
        'imState',
        'industryField',
        'jobNature',
        'positionAdvantage',
        'salary',
        'secondType',
        'workYear',
    )
    for pn in range(1, page + 1):
        params = {
            'first': 'true',  # was misspelled 'ture'
            'pn': str(pn),
            'kd': 'Python',
        }
        try:
            # NOTE(review): ``data=`` on a GET puts the parameters in the
            # request body. Confirm the endpoint does not expect ``params=``
            # or a POST.
            html = requests.get(url, data=params, headers=headers)
        except requests.exceptions.ConnectionError as err:
            # Report the failure instead of swallowing it, then try the next page.
            print('Connection error on page %s, skipping: %s' % (pn, err))
            continue
        print(html.text)
        json_data = json.loads(html.text)
        try:
            results = json_data['content']['positionResult']['result']
        except (KeyError, TypeError):
            msg = json_data.get('msg') if isinstance(json_data, dict) else None
            print('No position data on page %s, stopping: %s' % (pn, msg))
            return
        for result in results:
            infos = {field: result[field] for field in fields}
            print(infos)
            # MongoDB persistence is disabled: lagou.insert_one(infos)
        # Pause between page requests to avoid hammering the server.
        time.sleep(2)
if __name__ == '__main__':
    # Script entry point: start the crawl from page 1 of the "Python" search.
    params = dict(first='true', pn='1', kd='Python')
    url = 'https://www.lagou.com/jobs/positionAjax.json'
    get_page(url, params)
打印你获取到的数据,结果如下:
(py_web) ➜ amazon python spider.py
{'success': False, 'msg': '您操作太频繁,请稍后再访问', 'clientIp': '218.15.235.105'}
Traceback (most recent call last):
File "spider.py", line 72, in <module>
get_page(url,params)
File "spider.py", line 21, in get_page
total_count=json_data['content']['positionResult']['totalCount']
KeyError: 'content'
|