|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
无聊爬一爬
- import requests # 网络请求模块
- import time # 时间模块
- import random # 随机模块
- import re,json
# URL template for 51job search results ("python" keyword); {page} is the
# 1-based result-page number substituted via str.format by the caller.
json_url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,{page}.html'
class Crawl():
    """Scraper for 51job (search.51job.com) job listings.

    The search page embeds its data as a ``window.__SEARCH_RESULT__ = {...}``
    JSON blob inside a ``<script>`` tag; :meth:`get_json` extracts and parses
    that blob and prints one dict per job posting.
    """

    # Raw string: the original non-raw literal contained the invalid escape
    # sequence "\." (DeprecationWarning on modern Python).  re.S lets the
    # JSON blob span multiple lines; \s* tolerates whitespace around "=".
    _RESULT_RE = re.compile(r'window\.__SEARCH_RESULT__\s*=\s*(.*?)</script>', re.S)

    def __init__(self):
        # Browser-like User-Agent so the site serves the normal HTML page
        # instead of blocking the default requests UA.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0'}

    def get_json(self, json_url):
        """Fetch one search-result page, print each job, return the jobs.

        :param json_url: fully formatted page URL (see the module-level
            ``json_url`` template).
        :return: list of dicts with keys ``title``/``xin_xi``/``gong_si``;
            empty list when the request failed or no data could be parsed.
            (The original returned ``None``; callers that ignored the return
            value are unaffected.)
        """
        response = requests.get(json_url, headers=self.headers)
        # Guard clause: bail out early on a failed request.
        if response.status_code != 200:
            print('获取信息的请求没有成功!')
            return []
        match = self._RESULT_RE.search(response.text)
        if match is None:
            # Page layout changed or an anti-bot page was served; the
            # original code raised an unhandled IndexError here.
            print('获取信息的请求没有成功!')
            return []
        json_data = json.loads(match.group(1))
        jobs = []
        # .get with a default keeps a schema change from raising KeyError
        # on the top-level key; per-job keys are kept strict on purpose.
        for entry in json_data.get('engine_search_result', []):
            item = {
                'title': entry['job_name'],          # job title
                'xin_xi': entry['attribute_text'],   # location / experience / salary attributes
                'gong_si': entry['company_name'],    # company name
            }
            print(item)
            jobs.append(item)
        return jobs
if __name__ == '__main__':
    c = Crawl()  # crawler instance (holds the request headers)
    for page in range(1, 6):  # scrape result pages 1-5
        # Original bound the (unused) return value to `text`; dropped.
        c.get_json(json_url.format(page=page))
        # Random 2-4 s pause between requests to be polite and reduce
        # the chance of being rate-limited.  (A stray trailing ''' from
        # the original paste was removed here — it was a syntax error.)
        time.sleep(random.randint(2, 4))
复制代码 |
|