|

楼主 |
发表于 2020-7-8 11:59:26
|
显示全部楼层
增加了两个函数:
1. 查询客户营业执照上的公司名称;
2. 获取搜索结果的总页码。
已经成功爬取到数据,但还有一个问题:速度太慢了。
- import requests
- from lxml import etree
- from openpyxl import Workbook
- #####################获取当前最新总页码######################
def num_1():
    """Return the total number of result pages of the 51job search listing.

    Downloads page 1 of the listing and reads the ``value`` attribute of the
    hidden ``<input>`` inside ``<div class="p_in">``.

    Returns:
        The page count exactly as it appears in the page (a string); callers
        are expected to convert it with ``int()``.

    Raises:
        IndexError: If the hidden pager input is not present in the response
            (for example because the site layout changed or the request was
            blocked).
        requests.RequestException: On network failure or timeout.
    """
    # NOTE(review): the job-category segment here (06..10) is not the same as
    # the one main() requests (05..09), so the page count may not match the
    # pages that are actually scraped -- confirm which filter is intended.
    #
    # The query string previously contained "cotype=99°reefrom=99": the
    # "&deg" of "&degreefrom" had been decoded as an HTML entity. Restored.
    url = ('https://search.51job.com/list/030700,000000,0000,00,9,'
           '06%252C07%252C08%252C09%252C10,%2B,2,1.html'
           '?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
           '&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=')
    headers = {'user-agent': 'firefox'}

    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(url, headers=headers, timeout=10)
    response.encoding = 'gbk'

    page = etree.HTML(response.text)
    values = page.xpath('//div[@class="p_in"]/input[@type="hidden"]/@value')
    if not values:
        raise IndexError(
            'total page count not found: no hidden <input> under '
            '<div class="p_in"> in the search listing response')
    return values[0]
- #######################获取营业执照名字####################
def test(url_test):
    """Fetch a company page and return the company name found on it.

    The name is looked up in three places, first match wins:

    1. the ``title`` attribute of ``<span class="icon_det">``, with its first
       five characters removed (presumably a fixed label prefix -- confirm);
    2. the ``title`` attribute of the first ``<h1>``;
    3. the text of ``<head><title>``, with its last three characters removed
       (presumably a fixed site suffix -- confirm).

    Args:
        url_test: URL of the company detail page.

    Returns:
        The extracted name, or an empty string when none of the three
        sources is present in the page.

    Raises:
        requests.RequestException: On network failure or timeout.
    """
    # NOTE(review): Requests selects a proxy by URL scheme ("http"/"https");
    # a "url" key never matches, so this proxy is currently NOT used. Left
    # as-is on purpose: re-keying it would route every request through this
    # hard-coded address. Decide whether a proxy is really wanted.
    proxies = {"url": 'http://115.221.242.206:9999'}
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}

    # Without a timeout a stalled connection would block the script forever.
    r_test = requests.get(url_test, headers=headers, proxies=proxies,
                          timeout=10)
    r_test.encoding = 'gbk'
    html = etree.HTML(r_test.text)

    licence_titles = html.xpath('//span[@class="icon_det"]/@title')
    if licence_titles:
        return licence_titles[0][5:]

    heading_titles = html.xpath('//h1/@title')
    if heading_titles:
        return heading_titles[0]

    page_titles = html.xpath('//head/title/text()')
    if page_titles:
        return page_titles[0][:-3]

    # Previously this case raised an unhandled IndexError from inside the
    # "except IndexError" fallback and aborted the whole scrape.
    return ''
def main():
    """Scrape every page of the 51job listing into ``job_51_2.xlsx``.

    For each listing row one spreadsheet row is written with three columns:
    company name (resolved through ``test()`` from the company page URL),
    monthly salary, and job title. A field that is missing from a listing
    row is left blank instead of aborting the run.

    The workbook is saved even if the scrape stops early because of an
    error, so rows collected so far are not lost.
    """
    url_template = (
        'https://search.51job.com/list/030700,000000,0000,00,9,'
        '05%252C06%252C07%252C08%252C09,%2B,2,{page}.html'
        # "&degreefrom" had been corrupted to "°reefrom" (decoded "&deg").
        '?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
        '&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=')
    # IP proxy disguise.
    # NOTE(review): Requests selects a proxy by URL scheme ("http"/"https");
    # a "url" key never matches, so this proxy is currently NOT used.
    proxies = {"url": 'http://115.221.242.206:9999'}
    # Browser disguise.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}

    wb = Workbook()
    ws = wb.active
    total_pages = int(num_1())

    # Many listing rows point at the same company page; remembering the name
    # per URL avoids downloading that page again for every row, which is the
    # dominant cost of the scrape.
    company_names = {}

    try:
        for page in range(1, total_pages + 1):
            r = requests.get(url_template.format(page=page), headers=headers,
                             proxies=proxies, timeout=10)
            r.encoding = 'gbk'
            html = etree.HTML(r.text)

            for item in html.xpath('//div[(@class="el") and not(@id)]'):
                # Company
                cpy = None
                cpy_urls = item.xpath('./span[@class="t2"]/a/@href')
                if cpy_urls:
                    cpy_url = cpy_urls[0]
                    if cpy_url not in company_names:
                        company_names[cpy_url] = test(cpy_url)
                    cpy = company_names[cpy_url]

                # Monthly salary
                sly = item.xpath('./span[@class="t4"]/text()')

                # Job title
                pos = item.xpath('./p/span/a/@title')

                # None leaves the corresponding cell empty.
                ws.append([cpy,
                           sly[0] if sly else None,
                           pos[0] if pos else None])
    finally:
        wb.save('job_51_2.xlsx')
# Start the scrape only when this file is executed directly as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
复制代码 |
|