增加了两个函数
查询客户营业执照名字
获取总页码
成功爬取了数据,但是有个问题,就是速度太慢了。
import requests
from lxml import etree
from openpyxl import Workbook
# ---------------- Fetch the current total page count ----------------
def num_1():
    """Return the total page count shown on page 1 of the 51job search listing.

    Requests the first result page and reads the ``value`` attribute of the
    hidden ``<input>`` inside ``<div class="p_in">``.

    Returns:
        str: the raw attribute value; callers convert it with ``int``.

    Raises:
        requests.RequestException: on network failure or timeout.
        IndexError: if the hidden input cannot be found in the response.
    """
    # NOTE(review): the filter segment used here ('06%252C...%252C10') differs
    # from the one main() crawls ('05%252C...%252C09'), so this page count may
    # not match the crawled query - confirm which one is intended.
    # The query string previously contained '°reefrom=99', an entity-decoded
    # corruption of '&degreefrom=99'; the parameter is restored here.
    url = (
        'https://search.51job.com/list/030700,000000,0000,00,9,'
        '06%252C07%252C08%252C09%252C10,%2B,2,1.html'
        '?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
        '&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    )
    headers = {'user-agent': 'firefox'}
    # A timeout keeps a stalled connection from hanging the script forever.
    r = requests.get(url, headers=headers, timeout=30)
    r.encoding = 'gbk'
    html = etree.HTML(r.text)
    # etree.HTML returns None for an empty document; treat that as "not found".
    result = []
    if html is not None:
        result = html.xpath('//div[@class="p_in"]/input[@type="hidden"]/@value')
    if not result:
        raise IndexError('total page count not found in the search response')
    return result[0]
# ---------------- Fetch the business-licence name ----------------
def test(url_test):
    """Fetch a company page and return the company name found on it.

    Lookup order:
      1. ``title`` of ``<span class="icon_det">`` with its first five
         characters removed,
      2. ``title`` of the first ``<h1>`` that has one,
      3. the text of ``<head><title>`` with its last three characters removed.

    Args:
        url_test: URL of the company page.

    Returns:
        str: the extracted name, or ``''`` when none of the three sources
        is present in the page.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}
    # NOTE: this function used to pass proxies={"url": 'http://115.221.242.206:9999'}.
    # requests picks a proxy by URL scheme ("http"/"https"), so a "url" key
    # never matched and every request already went out directly. The no-op
    # argument is dropped; to really use a proxy, pass
    # proxies={"http": ..., "https": ...}.
    r_test = requests.get(url_test, headers=headers, timeout=30)
    r_test.encoding = 'gbk'
    html = etree.HTML(r_test.text)
    if html is None:
        # Empty response body: nothing to extract.
        return ''

    icon_titles = html.xpath('//span[@class="icon_det"]/@title')
    if icon_titles:
        # Presumably skips a fixed five-character label prefix - TODO confirm.
        return icon_titles[0][5:]

    h1_titles = html.xpath('//h1/@title')
    if h1_titles:
        return h1_titles[0]

    page_titles = html.xpath('//head/title/text()')
    if page_titles:
        title = page_titles[0]
        # Clamp at zero so a title shorter than three characters yields ''
        # instead of a negative slice bound.
        return title[:max(len(title) - 3, 0)]

    return ''
def main():
    """Crawl every search-result page and write the postings to an Excel file.

    For each ``<div class="el">`` without an ``id`` on each listing page, one
    worksheet row is written:
      * column 1 - company name returned by ``test()`` for the company link,
      * column 2 - text of ``<span class="t4">`` (the monthly salary), when present,
      * column 3 - ``title`` attribute of the job link.

    The workbook is saved to ``job_51_2.xlsx`` even when the crawl is
    interrupted by an error, so rows collected so far are not lost.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    # Browser disguise. Built once; it does not change between pages.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}
    # NOTE: a proxies={"url": 'http://115.221.242.206:9999'} mapping used to be
    # passed here. requests selects proxies by URL scheme ("http"/"https"),
    # so the "url" key never matched and requests were already sent directly.
    # The no-op argument is dropped; use scheme keys to enable a real proxy.

    # The query string previously contained '°reefrom=99', an entity-decoded
    # corruption of '&degreefrom=99'; the parameter is restored here.
    url_head = 'https://search.51job.com/list/030700,000000,0000,00,9,05%252C06%252C07%252C08%252C09,%2B,2,'
    url_tail = '.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='

    row = 1
    wb = Workbook()
    ws = wb.active
    last_page = int(num_1())
    # Many postings link to the same company page; remember each looked-up
    # name so every company page is downloaded only once.
    company_names = {}
    try:
        for num in range(1, last_page + 1):
            r = requests.get(url_head + str(num) + url_tail,
                             headers=headers, timeout=30)
            r.encoding = 'gbk'
            html = etree.HTML(r.text)
            if html is None:
                # Empty response body: skip this page.
                continue
            for item in html.xpath('//div[(@class="el") and not(@id)]'):
                cpy_links = item.xpath('./span[@class="t2"]/a/@href')
                positions = item.xpath('./p/span/a/@title')
                if not cpy_links or not positions:
                    # Row lacks the company link or the job title; skip it
                    # instead of aborting the whole crawl.
                    continue
                # Company
                cpy_url = cpy_links[0]
                if cpy_url not in company_names:
                    company_names[cpy_url] = test(cpy_url)
                ws.cell(row=row, column=1, value=company_names[cpy_url])
                # Monthly salary
                sly = item.xpath('./span[@class="t4"]/text()')
                if sly:
                    ws.cell(row=row, column=2, value=sly[0])
                # Position
                ws.cell(row=row, column=3, value=positions[0])
                row += 1
    finally:
        # Persist whatever was collected, even after a mid-crawl failure.
        wb.save('job_51_2.xlsx')
# Script entry point: start the crawl only when this file is executed directly.
if __name__ == "__main__":
    main()
|