|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
目的:想爬取智联招聘网站上特定关键词的招聘列表信息,以及列表中每个职位的详情信息
问题:通过 requests 模块和 XPath 定位到的内容与浏览器实际显示的内容不一致。下面是我的代码(纯小白,刚学的),有大佬能解释一下是什么原因吗?
import os

import requests
from lxml import etree

if __name__ == "__main__":
    # Request headers that imitate a desktop Edge browser.
    #
    # These must be sent through the ``headers=`` argument of requests.get().
    # Passing them as ``params=`` would append them to the query string, so the
    # server would never see the user-agent or the cookie.
    # Values carry no leading whitespace because requests rejects header values
    # that start with a space.
    # ``accept-encoding`` is intentionally not set: advertising ``br`` makes the
    # server free to answer with Brotli, which requests can only decode when an
    # optional brotli package is installed. requests supplies a safe default.
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'cache-control': 'max-age=0',
        'referer': 'https://www.zhaopin.com/',
        'sec-ch-ua': '"Microsoft Edge";v="107", "Chromium";v="107", "Not=A?Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.35',
    }

    # The logged-in session cookie is a credential, so it is read from the
    # ZHAOPIN_COOKIE environment variable instead of being hard-coded here.
    # Copy the full "cookie" request-header value from the browser's dev tools
    # into that variable (as a single line) before running the script.
    cookie = os.environ.get('ZHAOPIN_COOKIE', '').strip()
    if cookie:
        headers['cookie'] = cookie

    # Page numbers are substituted into the search URL; range(1, 2) fetches
    # only the first result page.
    for pagenum in range(1, 2):
        url = f'https://sou.zhaopin.com/?jl=763&kw=%E5%92%A8%E8%AF%A2&p={pagenum}'

        # Fetch the page; fail loudly on HTTP errors and never hang forever.
        response = requests.get(url=url, headers=headers, timeout=30)
        response.raise_for_status()

        # Keep a copy of the raw response on disk so it can be compared with
        # what the browser shows.
        page_content = response.text
        with open('page_content.html', 'w', encoding='utf-8') as fp:
            fp.write(page_content)
        print('over!')

        # Parse the saved HTML and pull the ``title`` attribute of the spans
        # under the result container (use text() for text, /@attr for attributes).
        # NOTE(review): the job list appears to be rendered client-side by
        # JavaScript, so the raw HTML may not contain the nodes this absolute
        # XPath targets - confirm against page_content.html; a browser-driven
        # fetch may be required.
        parser = etree.HTMLParser(encoding='utf-8')
        data = etree.parse('page_content.html', parser=parser)
        VR = data.xpath('/html/body/div/div[4]/div[2]/div[2]//span/@title')
        print(VR)
本帖最后由 cflying 于 2022-11-14 18:31 编辑
requests 只能拿到服务器返回的原始 HTML,爬不到 JavaScript 渲染后的页面,得用浏览器自动化工具来模拟浏览器。
下面的代码前半截是读取浏览器 cookies 的,后半截才是爬取并写入 Excel。自己看着改吧,格式乱了点,将就用。
- import sqlite3
- import requests
- import os
- import json
- import random
- import sys
- import base64
- from cryptography.hazmat.backends import default_backend
- from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
def dpapi_decrypt(encrypted):
    """Decrypt a DPAPI-protected byte string with ``CryptUnprotectData``.

    Args:
        encrypted: The protected blob as ``bytes``.

    Returns:
        The decrypted bytes, copied out of the buffer the API returned.

    Raises:
        OSError: The ``ctypes.WinError()`` built when ``CryptUnprotectData``
            reports failure.
    """
    import ctypes
    import ctypes.wintypes

    class DATA_BLOB(ctypes.Structure):
        # Byte count followed by a pointer to the data.
        _fields_ = [
            ('cbData', ctypes.wintypes.DWORD),
            ('pbData', ctypes.POINTER(ctypes.c_char)),
        ]

    # Copy the input into a ctypes buffer of exactly len(encrypted) bytes.
    in_buffer = ctypes.create_string_buffer(encrypted, len(encrypted))
    blob_in = DATA_BLOB(ctypes.sizeof(in_buffer), in_buffer)
    blob_out = DATA_BLOB()

    succeeded = ctypes.windll.crypt32.CryptUnprotectData(
        ctypes.byref(blob_in), None, None, None, None, 0, ctypes.byref(blob_out))
    if not succeeded:
        raise ctypes.WinError()

    # Copy the result into a Python bytes object before freeing the output
    # buffer with LocalFree.
    plaintext = ctypes.string_at(blob_out.pbData, blob_out.cbData)
    ctypes.windll.kernel32.LocalFree(blob_out.pbData)
    return plaintext
def aes_decrypt(encrypted_txt):
    """Decrypt an AES-GCM encrypted cookie value using Edge's stored key.

    The key is read from ``os_crypt.encrypted_key`` in Edge's ``Local State``
    JSON file under ``%LOCALAPPDATA%``, base64-decoded, stripped of its first
    5 bytes and unwrapped with ``dpapi_decrypt``.

    Args:
        encrypted_txt: The raw encrypted cookie value as ``bytes``. Bytes
            3..14 are used as the GCM nonce and everything from byte 15 on is
            fed to the decryptor (the first 3 bytes are presumably the
            ``v10`` version prefix - confirm against the caller).

    Returns:
        The decryptor output for ``encrypted_txt[15:]``. The authentication tag
        is NOT verified and the output has the same length as the input slice,
        so any trailing tag bytes are still present (as garbage) and must be
        removed by the caller.
    """
    local_state_path = os.path.join(
        os.environ['LOCALAPPDATA'], r"Microsoft\Edge\User Data\Local State")
    with open(local_state_path, encoding='utf-8', mode="r") as f:
        # Parse the whole file: reading only the first line works solely as
        # long as the browser keeps the JSON on a single line.
        local_state = json.load(f)

    encoded_key = local_state["os_crypt"]["encrypted_key"]
    # Skip the 5-byte prefix before handing the key to DPAPI
    # (presumably the literal b"DPAPI" marker - TODO confirm).
    encrypted_key = base64.b64decode(encoded_key.encode())[5:]
    key = dpapi_decrypt(encrypted_key)

    nonce = encrypted_txt[3:15]
    # Pass the mode to the constructor; assigning ``cipher.mode`` after
    # construction is not part of the documented Cipher API.
    cipher = Cipher(algorithms.AES(key), modes.GCM(nonce), backend=default_backend())
    decryptor = cipher.decryptor()
    return decryptor.update(encrypted_txt[15:])
def chrome_decrypt(encrypted_txt):
    """Decrypt one encrypted cookie value read from the browser's cookie DB.

    Two formats are recognised by their leading bytes:

    * ``b'\\x01\\x00\\x00\\x00'`` - the value is decrypted with ``dpapi_decrypt``.
    * ``b'v10'`` - the value is decrypted with ``aes_decrypt`` and the last
      16 bytes of the result are dropped before decoding.

    Args:
        encrypted_txt: The encrypted cookie value as ``bytes``.

    Returns:
        The decrypted value as ``str``, or ``None`` when the prefix is not
        recognised or decryption fails with an ``OSError``.

    Raises:
        OSError: When not running on Windows (``WindowsError`` is an alias of
            ``OSError`` on Windows and does not exist elsewhere).
    """
    if sys.platform != 'win32':
        raise OSError('cookie decryption is only supported on Windows')

    try:
        # The escape sequences matter: this is four raw bytes, not the
        # 12-character text "x01x00x00x00".
        if encrypted_txt[:4] == b'\x01\x00\x00\x00':
            return dpapi_decrypt(encrypted_txt).decode()
        if encrypted_txt[:3] == b'v10':
            # Strip the 16 trailing bytes that aes_decrypt leaves in place.
            return aes_decrypt(encrypted_txt)[:-16].decode()
    except OSError:
        return None
    return None
def get_cookies_from_chrome(domain):
    """Read and decrypt the Edge cookies whose host matches ``domain``.

    The cookie database is looked up under
    ``%USERPROFILE%\\AppData\\Local\\Microsoft\\Edge\\User Data`` in this order:
    ``Profile 3\\Cookies``, ``Default\\Cookies``, ``Profile 3\\Network\\Cookies``,
    ``Default\\Network\\Cookies``. The first existing file is used.

    Args:
        domain: Substring matched against the ``host_key`` column
            (``LIKE '%domain%'``).

    Returns:
        A string of ``name=value;`` pairs concatenated without separators other
        than the trailing ``;`` of each pair. Cookies that cannot be decrypted
        are skipped.

    Raises:
        FileNotFoundError: When none of the candidate cookie files exists.
    """
    user_data_dir = os.path.join(
        os.environ['USERPROFILE'], r'AppData\Local\Microsoft\Edge\User Data')
    candidates = (
        r'Profile 3\Cookies',
        r'Default\Cookies',
        r'Profile 3\Network\Cookies',
        r'Default\Network\Cookies',
    )
    filename = None
    for relative_path in candidates:
        candidate = os.path.join(user_data_dir, relative_path)
        if os.path.exists(candidate):
            filename = candidate
            break
    if filename is None:
        # Fail with a clear error instead of falling through to an unbound
        # ``filename`` below.
        raise FileNotFoundError('未找到cookies路径')

    # Bind the domain as a parameter rather than formatting it into the SQL.
    sql = 'SELECT name, encrypted_value as value FROM cookies where host_key like ?'
    con = sqlite3.connect(filename)
    try:
        con.row_factory = sqlite3.Row
        cur = con.cursor()
        cur.execute(sql, (f'%{domain}%',))
        pairs = []
        for row in cur:
            if row['value'] is None:
                continue
            value = chrome_decrypt(row['value'])
            if value is not None:
                pairs.append(f"{row['name']}={value};")
    finally:
        # Always release the database handle, even if decryption raises.
        con.close()
    return ''.join(pairs)
# Domain whose cookies are exported from the browser profile.
host = '.zhaopin.com'
# Raw "name=value;" string built from the browser's cookie store.
cookies_tmp = get_cookies_from_chrome(host)
# Convert the string into the list-of-dicts form passed to add_cookies().
cookies = []
for pair in cookies_tmp.split(";"):
    # Split on the FIRST "=" only: cookie values may themselves contain "="
    # (e.g. padded base64), and taking the last "="-separated piece would
    # truncate or drop them.
    name, _, value = pair.partition("=")
    if name and value:
        cookies.append({'name': name, 'value': value, 'domain': host, 'path': '/'})
- '''-----------------------------------------------------------------------------------------------------'''
- #使用前需先在 Edge 浏览器中登录账号
- from playwright.sync_api import sync_playwright
- from bs4 import BeautifulSoup
- import openpyxl
- url0=r'https://passport.zhaopin.com/login?bkUrl=%2F%2Fi.zhaopin.com%2Fblank%3Fhttps%3A%2F%2Fwww.zhaopin.com%2F%3FvalidateCampus%253D'
- url1=r'https://sou.zhaopin.com/?jl=551&kw=%E4%BC%81%E7%AE%A1&p={}'
def getinfo(uu):
    """Crawl every result page of a search URL and collect the job rows.

    Uses the module-level Playwright ``page`` to load each page, then parses
    the rendered HTML with BeautifulSoup.

    Args:
        uu: URL template containing one ``{}`` placeholder for the page number.

    Returns:
        A list of rows ``[job name, salary, 'kong', company name, link]``.
        A field whose element is missing from an item is returned as ``''``.
    """
    def _text(node):
        # find() returns None when nothing matches; treat that as empty text
        # so one incomplete item does not abort the whole crawl.
        return node.text if node is not None else ''

    rows = []
    page_number = 1
    while True:
        url = uu.format(str(page_number))
        print(url)
        page.goto(url)
        bs = BeautifulSoup(page.content(), "html.parser")

        for item in bs.find_all('div', attrs={'class': 'joblist-box__item clearfix'}):
            link = item.find('a')
            rows.append([
                _text(item.find('span', attrs={'class': 'iteminfo__line1__jobname__name'})),
                _text(item.find('p', attrs={'class': 'iteminfo__line2__jobdesc__salary'})),
                'kong',
                _text(item.find('span', attrs={'class': 'iteminfo__line1__compname__name'})),
                link.get('href', '') if link is not None else '',
            ])

        # The second pager button is checked for a ``disabled`` attribute to
        # detect the last page. Stop as well when the pager is absent or has
        # fewer than two buttons, rather than crashing.
        pager = bs.find('div', attrs={'class': 'soupager'})
        buttons = pager.find_all('button') if pager is not None else []
        if len(buttons) < 2 or 'disabled' in buttons[1].attrs:
            break
        page_number += 1
    return rows
# Open the existing workbook and the sheet that receives the scraped rows.
book = openpyxl.load_workbook('./infodata.xlsx')
sheet = book['zhilian']

with sync_playwright() as playwright:
    # Start Chromium with a fresh context and preload the exported cookies.
    browser = playwright.chromium.launch()
    context = browser.new_context()
    context.add_cookies(cookies=cookies)
    # ``page`` must stay a module-level name: getinfo() reads it as a global.
    page = context.new_page()

    for search_url in (url1,):
        data = getinfo(search_url)
        for record in data:
            sheet.append(record)
        print('写入数据条数:', len(data))

    # Persist the appended rows, then shut the browser down.
    book.save('./infodata.xlsx')
    page.close()
    browser.close()
- #input('结束')
复制代码
|
|