鱼C论坛

Views: 2428 | Replies: 3

[Solved] Help needed with a Zhaopin (智联招聘) crawler!

Posted on 2022-11-14 17:07:46

Goal: scrape the listing results for a specific keyword on Zhaopin (智联招聘), plus the job details behind each listing.
Problem: the content I locate with the requests module and XPath does not match what the browser actually displays. My code is below (I'm a complete beginner, just started learning). Could someone explain what is causing this?


import requests
from lxml import etree
if __name__ == "__main__":
    # Spoof a browser user agent (note: this headers dict is defined but never passed to requests.get below)
    headers = {'user-agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.35'}
    # Build the URL
    # Set the base URL
    # Set the paging URL: the {} in the string gets substituted per page; f'...' works the same as .format()
    for pagenum in range(1,2):
        url = f'https://sou.zhaopin.com/?jl=763&kw=%E5%92%A8%E8%AF%A2&p={pagenum}'
    # Request "parameters" -- note that these key/value pairs are actually HTTP request headers
        params = {
            'accept': ' text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'accept-encoding': ' gzip, deflate, br',
            'accept-language': ' zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            'cache-control': ' max-age=0',
            'cookie': ' x-zp-client-id=59fe1cae-f844-4a11-99bf-886b90370523; campusOperateJobUserInfo=a51e859b-1db1-4ced-b6df-4557593737b7; FSSBBIl1UgzbN7NO=5Oj3DFqFWYkxjSLGVtPQc72nArgWa90JU_JBoGEfhoz_Jlc7LsbF8tCJx2gicQuttOmssC0zdFyy7Ci6JmhK.oa; _uab_collina=166754798597534651997073; locationInfo_search={%22code%22:%22763%22%2C%22name%22:%22%E5%B9%BF%E5%B7%9E%22%2C%22message%22:%22%E5%8C%B9%E9%85%8D%E5%88%B0%E5%B8%82%E7%BA%A7%E7%BC%96%E7%A0%81%22}; LastCity=%E5%B9%BF%E5%B7%9E; LastCity%5Fid=763; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221044786621%22%2C%22first_id%22%3A%221843c1da41e95-017c0825a51e454-45647f52-1049088-1843c1da420381%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fcn.bing.com%2F%22%2C%22%24latest_utm_source%22%3A%22360sem_b%22%2C%22%24latest_utm_medium%22%3A%22CPC%22%2C%22%24latest_utm_campaign%22%3A%22pp%22%2C%22%24latest_utm_content%22%3A%22bb%22%2C%22%24latest_utm_term%22%3A%2285622%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTg0M2MxZGE0MWU5NS0wMTdjMDgyNWE1MWU0NTQtNDU2NDdmNTItMTA0OTA4OC0xODQzYzFkYTQyMDM4MSIsIiRpZGVudGl0eV9sb2dpbl9pZCI6IjEwNDQ3ODY2MjEifQ%3D%3D%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%221044786621%22%7D%2C%22%24device_id%22%3A%221843c1da41e95-017c0825a51e454-45647f52-1049088-1843c1da420381%22%7D; selectCity_search=763; ssxmod_itna2=QqUxRieGqWqeqDKitDXDnBAtG=k+WO70+r3buxA6W5edD/+7YDFoR09ZPAPo8C5QBcSv6=q1iu+rz/D3hH6mtG6kq=Y9Y0uGXjKeUqQ61q7Uuh4RIOQYPG2Gl408Deq+D===; Hm_lvt_38ba284938d5eddca645bb5e02a02006=1667547986,1667873580,1667957310,1668056889; ssxmod_itna=QqGx9Dy7eYqwG7DzxAO4=KDtei=Y3f5GCYmUDBqfT4iNDnD8x7YDvm+EBm78SYnGDcXxvKmxp4=fKDOe3qsir7SooDU4i8DCkDw3bDeW=D5xGoDPxDeDADYE6DAqiOD7qDdfhTXtkDbxi3fxDbDim8mxGCDeKD0ZbFDQKDuEF5ZAFhOpBCrKyDP4xefxG1T40H1C3xfcYffbizP4fAD+4ODlKUDCF1uEyFr4Gd66v1DRqPA0DPb74q8Yr5QxhKwixx5cQqt=DP17iNP20D8QD4ZArnT74M4D; ZL_REPORT_GLOBAL={%22jobs%22:{%22funczoneShare%22:%22dtl_best_for_you%22%2C%22recommandActionidShare%22:%224b73965e-3682-41c6-9c01-5cb9fd099e43-job%22}}; zp_passport_deepknow_sessionId=25c482d3s20ec9461481c822f78edd7234e1; at=2e4a6506ec3a4e8095dbf71e06d82e37; rt=353844b5b82b4f1ab191f55fbab1827f; BEST_EMPLOYER_SHOW_TIME=[1667547773556%2C1667873519700%2C1668059329333]; acw_tc=2760828916680593302367664eb4a8e8082758d1bdab23beea2e0e19797a42; Hm_lpvt_38ba284938d5eddca645bb5e02a02006=1668059338; FSSBBIl1UgzbN7NP=53CnASCJ79WQqqqDl3rK9vA7o5w6B4qVFKttE5ESjI634Jn_yZqRthX3InsuOyvPZ6Hxj_KtNuXjM5ZFg5CViGkStIIlKoLuewH7YPEDYDDH1Twe1T_fyWbgEHCIRO27cJEInIEw_qQ_fFQP7Um4jL5DulQBzXBnWG9WSxvpAyASg3_0_KZzuSqRVK00QdWDXAoudH6ahVwfkUMOTAamHc2Fz6bFX1g0mmkdzG11xLfSyPVMSCW6LKdeUkU31P54.PZ0cWj36engC6Kc8g9Qe81AZckoi.adqXbxoZG6T1_uZUR2.ZmhrBobjmlGh5Tz.e4YgcD2ZctwbG7bWKwA3RK',
            'referer': ' https://www.zhaopin.com/',
            'sec-ch-ua': ' "Microsoft Edge";v="107", "Chromium";v="107", "Not=A?Brand";v="24"',
            'sec-ch-ua-mobile': ' ?0',
            'sec-ch-ua-platform': ' "Windows"',
            'sec-fetch-dest': ' document',
            'sec-fetch-mode': ' navigate',
            'sec-fetch-site': ' same-origin',
            'sec-fetch-user': ' ?1',
            'upgrade-insecure-requests': ' 1',
            'user-agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.35',

        }
        # Request the page (the dict above contains headers, so pass it as headers=, not params=)
        response = requests.get(url=url, headers=params)
        # Store the response body
        page_content = response.text

        with open('page_content.html','w', encoding='utf-8') as fp:
            fp.write(page_content)
        print('over!')
        # Parse the data
        # Load the saved HTML into etree
        parser = etree.HTMLParser(encoding='utf-8')
        data = etree.parse('page_content.html', parser=parser)
        # 1. Locate the tags that hold the data and extract it (text() for text, /@xxx for an attribute value)
        VR = data.xpath('/html/body/div/div[4]/div[2]/div[2]//span/@title')
        print(VR)
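
For anyone hitting the same mismatch, here is a quick check (a minimal sketch, not part of the original post) of whether the job cards are present at all in the raw HTML that requests receives. The class name 'joblist-box__item' is taken from the accepted answer below and may change if the site is redesigned.

import requests

# Minimal user-agent spoof; the fuller header set from the post above may still be needed.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
url = 'https://sou.zhaopin.com/?jl=763&kw=%E5%92%A8%E8%AF%A2&p=1'
html = requests.get(url, headers=headers).text

# If this prints False, the job list is injected by JavaScript after the page loads,
# so requests + XPath on the raw response can never see it.
print('joblist-box__item' in html)
print('characters received:', len(html))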
Best answer
Posted 2022-11-14 18:27:12
Last edited by cflying on 2022-11-14 18:31

requests cannot fetch a page after it has been rendered; you have to simulate a browser.
The first half of the script below reads the browser's cookies; the second half does the actual crawling and writes the results to Excel. Adapt it to your needs; the formatting got a bit mangled, but it should do.
import sqlite3
import requests
import os
import json
import random

import sys
import base64
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes


# Decrypt a blob with the Windows DPAPI (used for the AES master key and for old-style cookie values)
def dpapi_decrypt(encrypted):
    import ctypes
    import ctypes.wintypes

    class DATA_BLOB(ctypes.Structure):
        _fields_ = [('cbData', ctypes.wintypes.DWORD),
                    ('pbData', ctypes.POINTER(ctypes.c_char))]

    p = ctypes.create_string_buffer(encrypted, len(encrypted))
    blobin = DATA_BLOB(ctypes.sizeof(p), p)
    blobout = DATA_BLOB()
    retval = ctypes.windll.crypt32.CryptUnprotectData(
        ctypes.byref(blobin), None, None, None, None, 0, ctypes.byref(blobout))
    if not retval:
        raise ctypes.WinError()
    result = ctypes.string_at(blobout.pbData, blobout.cbData)
    ctypes.windll.kernel32.LocalFree(blobout.pbData)
    return result


# Decrypt a "v10"-prefixed cookie value with AES-GCM, using the key stored in Edge's Local State file
def aes_decrypt(encrypted_txt):
    with open(os.path.join(os.environ['LOCALAPPDATA'],
                           r"Microsoft\Edge\User Data\Local State"), encoding='utf-8', mode="r") as f:
        jsn = json.loads(str(f.readline()))
    encoded_key = jsn["os_crypt"]["encrypted_key"]
    encrypted_key = base64.b64decode(encoded_key.encode())
    encrypted_key = encrypted_key[5:]
    key = dpapi_decrypt(encrypted_key)
    nonce = encrypted_txt[3:15]
    cipher = Cipher(algorithms.AES(key), modes.GCM(nonce), backend=default_backend())
    decryptor = cipher.decryptor()
    # The ciphertext follows the 3-byte prefix and 12-byte nonce; the caller strips the trailing 16-byte GCM tag
    return decryptor.update(encrypted_txt[15:])


# Dispatch on the value prefix: DPAPI-encrypted cookies start with b'\x01\x00\x00\x00', newer AES-GCM ones with b'v10'
def chrome_decrypt(encrypted_txt):
    if sys.platform == 'win32':
        try:
            if encrypted_txt[:4] == b'\x01\x00\x00\x00':
                decrypted_txt = dpapi_decrypt(encrypted_txt)
                return decrypted_txt.decode()
            elif encrypted_txt[:3] == b'v10':
                decrypted_txt = aes_decrypt(encrypted_txt)
                return decrypted_txt[:-16].decode()
        except WindowsError:
            return None
    else:
        raise OSError('cookie decryption is only supported on Windows')


# Read the encrypted cookies for the given domain from Edge's SQLite cookie store and decrypt them
def get_cookies_from_chrome(domain):
    sql = f'SELECT name, encrypted_value as value FROM cookies where host_key like "%{domain}%"'
    # The cookie database lives in different places depending on the Edge version and profile
    if os.path.exists(os.path.join(os.environ['USERPROFILE'], r'AppData\Local\Microsoft\Edge\User Data\Profile 3\Cookies')):
        filename = os.path.join(os.environ['USERPROFILE'], r'AppData\Local\Microsoft\Edge\User Data\Profile 3\Cookies')
    elif os.path.exists(os.path.join(os.environ['USERPROFILE'], r'AppData\Local\Microsoft\Edge\User Data\Default\Cookies')):
        filename = os.path.join(os.environ['USERPROFILE'], r'AppData\Local\Microsoft\Edge\User Data\Default\Cookies')
    elif os.path.exists(os.path.join(os.environ['USERPROFILE'], r'AppData\Local\Microsoft\Edge\User Data\Profile 3\Network\Cookies')):
        filename = os.path.join(os.environ['USERPROFILE'], r'AppData\Local\Microsoft\Edge\User Data\Profile 3\Network\Cookies')
    elif os.path.exists(os.path.join(os.environ['USERPROFILE'], r'AppData\Local\Microsoft\Edge\User Data\Default\Network\Cookies')):
        filename = os.path.join(os.environ['USERPROFILE'], r'AppData\Local\Microsoft\Edge\User Data\Default\Network\Cookies')
    else:
        raise FileNotFoundError('Edge cookie database not found')
    con = sqlite3.connect(filename)
    con.row_factory = sqlite3.Row
    cur = con.cursor()
    cur.execute(sql)
    cookie = ''
    for row in cur:
        if row['value'] is not None:
            name = row['name']
            value = chrome_decrypt(row['value'])
            if value is not None:
                cookie += name + '=' + value + ';'
    con.close()
    return cookie

# Target host
host = '.zhaopin.com'
# Pull the zhaopin cookies out of Edge and convert them to the dict format Playwright expects
cookies_tmp = get_cookies_from_chrome(host)
cookies = []
for i in cookies_tmp.split(";"):
    if i:
        name, _, value = i.partition("=")  # split on the first '=' only, so values containing '=' stay intact
        if value:
            cookies.append({'name': name, 'value': value, 'domain': host, 'path': '/'})
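
# At this point `cookies` is a list of dicts in the shape Playwright's context.add_cookies() expects,
# e.g. (values below are made-up placeholders, not real cookie data):
# [{'name': 'at', 'value': '...', 'domain': '.zhaopin.com', 'path': '/'},
#  {'name': 'acw_tc', 'value': '...', 'domain': '.zhaopin.com', 'path': '/'}, ...]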



'''-----------------------------------------------------------------------------------------------------'''
# Before running this you must be logged in to zhaopin.com in the Edge browser
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
import openpyxl


url0=r'https://passport.zhaopin.com/login?bkUrl=%2F%2Fi.zhaopin.com%2Fblank%3Fhttps%3A%2F%2Fwww.zhaopin.com%2F%3FvalidateCampus%253D'
url1=r'https://sou.zhaopin.com/?jl=551&kw=%E4%BC%81%E7%AE%A1&p={}'




# Page through the search results for one URL template and collect one row per job card
def getinfo(uu):
    p=1
    url=uu.format(str(p))
    tmp_data=[]
    while True:
        print(url)
        page.goto(url)
        bs = BeautifulSoup(page.content(),"html.parser")
        # Each row: job title, salary, placeholder column, company name, detail-page link
        for i in bs.find_all('div',attrs={'class':'joblist-box__item clearfix'}):
            tmp_data.append([i.find('span',attrs={'class':'iteminfo__line1__jobname__name'}).text,
                             i.find('p',attrs={'class':'iteminfo__line2__jobdesc__salary'}).text,
                             'kong',
                             i.find('span',attrs={'class':'iteminfo__line1__compname__name'}).text,
                             i.find('a').attrs['href']])

        # Stop when the pager's "next" button is disabled; otherwise request the next page
        if 'disabled' in bs.find('div',attrs={'class':'soupager'}).find_all('button')[1].attrs:
            break
        else:
            p+=1
            url=uu.format(str(p))

    return tmp_data


# The workbook and the 'zhilian' sheet must already exist (see the note after this script)
book = openpyxl.load_workbook('./infodata.xlsx')
sheet = book['zhilian']
with sync_playwright() as p:
    browser = p.chromium.launch()
    context = browser.new_context()
    context.add_cookies(cookies=cookies)  # inject the logged-in zhaopin cookies read from Edge
    page = context.new_page()
    for u in [url1]:
        data=getinfo(u)
        for row in data:
            sheet.append(row)
        print('rows written:', len(data))
        book.save('./infodata.xlsx')
    page.close()
    browser.close()
#input('done')
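
A note on running the script above: './infodata.xlsx' with a sheet named 'zhilian' has to exist before the first run, because openpyxl.load_workbook raises an error if the file is missing, and Playwright's Chromium must be installed (pip install playwright, then playwright install chromium). Below is a minimal one-off sketch for creating the workbook; the header labels are only a guess at the five columns getinfo() collects, they are not defined anywhere in the post.

import openpyxl

book = openpyxl.Workbook()
sheet = book.active
sheet.title = 'zhilian'
# Column order matches what getinfo() appends: job title, salary, placeholder, company, detail link
sheet.append(['job', 'salary', 'kong', 'company', 'url'])
book.save('./infodata.xlsx')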

Thread starter | Posted on 2022-11-14 20:37:47
Thanks a lot! I'll go study it.

Thread starter | Posted on 2022-11-14 20:40:27
cflying posted on 2022-11-14 18:27:
requests cannot fetch a page after it has been rendered; you have to simulate a browser.
The first half reads the browser cookies, the second half does the crawling and writes to Excel ...

Thanks a lot! I'll go study it.
