Some threads come back with no page count when I try to scrape them.
For example:
https://fishc.com.cn/thread-107659-1-1.html
Why does that happen?
 
import requests
import bs4
import re

def get_num(url):  # get the number of pages in the thread
    headers = {}
    headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
    res = requests.get(url,headers = headers)
    soup = bs4.BeautifulSoup(res.text,"html.parser")
    number  = []
    target = soup.find_all("div",class_="pg")
    for each in target:
        number.append(each.label.span.text)
    nr = re.search(r"\d+",str(number))
    print(number)
    if nr is None:  # no digits found: treat the thread as a single page
        return "1"
    else:
        return str(nr.group())
def main(url, c, running=True):  # scrape the replies on one page
    # The first "td.t_f" block is the opening post itself; the `running` flag skips it.
    print(url)
    headers = {}
    headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
    res = requests.get(url,headers = headers)
    soup = bs4.BeautifulSoup(res.text,"html.parser")
    targets = soup.find_all("td",class_="t_f")
    for i in targets:
        if running:
            running = False
        else:
            c.append(i.text)
    return c
    
def keep(file_name, what):  # save the collected comments to a file
    # Every entry of `what` is a reference to the same list c, so what[0] already holds all pages.
    content = ''.join('%s' % item for item in what[0])
    with open(file_name,"w",encoding='utf-8') as f:
        f.write(str(content))
    
def decide(url):  # check the URL format: True if it does NOT contain a "-tid-page-1" pattern
    hi = re.search(r"-\d{1,10}-\d{1,10}-\d{1,10}", url)
    if hi is None:
        return True
    else:
        return False
    
def conversion(url, n):  # trim the URL, keeping everything before index n
    return url[:n]
    
if __name__ == "__main__":
    url = input("Thread URL to scrape comments from (FishC URLs only; some pages may fail to scrape): ")
    num = get_num(url)  # total number of pages
    file_name = input("File name to save to (no extension; an existing file with the same name will be overwritten): ")
    file_name = file_name + ".txt"
    c = []
    what = []
    if decide(url=url):
        # URL without the "-tid-page-1" pattern: paging is done with "&page=N"
        for i in range(int(num)):
            page = str(i + 1)
            what.append(main(url=url + '&page=' + page, c=c))
        keep(file_name=file_name, what=what)  # save
    else:
        # "thread-tid-page-1.html" style URL: rebuild it for each page number
        for i in range(int(num)):
            page = str(i + 1)
            what.append(main(url=conversion(url=url, n=-8) + page + '-1.html', c=c))
        keep(file_name=file_name, what=what)
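
A quick way to narrow down why get_num finds nothing on that thread is to check what the server actually sends back to a request that is not logged in. A minimal probe, reusing the same User-Agent as the script above (the login check at the end is only a guess at what the forum might return, not a confirmed marker):

import requests
import bs4

def probe(url):
    # Fetch the page the same way the script does and report what came back.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'}
    res = requests.get(url, headers=headers)
    soup = bs4.BeautifulSoup(res.text, "html.parser")
    print("status code:", res.status_code)
    # The pager element get_num depends on; 0 means there is nothing to parse.
    print("div.pg elements:", len(soup.find_all("div", class_="pg")))
    # Rough hint that the server answered with a login prompt instead of the thread.
    print("mentions login:", "登录" in res.text or "login" in res.text.lower())

probe("https://fishc.com.cn/thread-107659-1-1.html")

If the div.pg count is 0 and the page mentions login, the request is being treated as a guest and the pager simply isn't in the HTML, which is what the fix below addresses.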
Try it like this: swap your get_num function for the one below (same logic, but with a logged-in Cookie added to the request headers):
 
 
def get_num(url):  # get the number of pages in the thread
    headers = {}
    headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
    headers['Cookie'] = 'oMVX_2132_saltkey=ttg5geGa; oMVX_2132_lastvisit=1624623545; oMVX_2132_auth=e37dd2PNTtpF1Tmt8%2ByP9Gr%2FAvWXweScDEHJ%2FLzFd%2BRFC41%2B81vqg6h784Xa9sG54pMFBr5hP6zGidNgcAEswvrcJ20; oMVX_2132_lastcheckfeed=854664%7C1624627154; oMVX_2132_atarget=1; oMVX_2132_smile=6D1; oMVX_2132_atlist=9; oMVX_2132_sid=ng222X; oMVX_2132_lip=112.50.189.204%2C1625758119; PHPSESSID=mgj7l3mkssq70ag6j7es6d99c6; oMVX_2132_ulastactivity=5838ck7yBFJTgL%2FfVWUcIRaHqSxszsrs5%2BXWC79MnRSl6WIDnqvg; acw_tc=781bad0816257949619514154e4035c90ea37235db295b2910fb132ac7402f; oMVX_2132_noticeTitle=1; oMVX_2132_st_t=854664%7C1625795252%7Cc6611f7934f6145aef7408b139f83a36; oMVX_2132_forum_lastvisit=D_173_1625795252; oMVX_2132_sendmail=1; oMVX_2132_visitedfid=354D173D188D38D125D149D33D171D219D84; oMVX_2132_viewid=tid_107659; oMVX_2132_checkpm=1; oMVX_2132_st_p=854664%7C1625795751%7C0c786b5b745132c2deaf8706e2812eb0; _fmdata=XqSSl%2FXqDIZ5Fsa93RNAO8xoew4KAZ%2FpoIVHewJ6UI5D5tAk0kpV1t58NSABPaVntgjj6GUetZ4oh6vFn%2BMgOCfhUZpBPvGfFCQuOJMGw3g%3D; oMVX_2132_lastact=1625795752%09misc.php%09patch'
    res = requests.get(url, headers=headers)
    soup = bs4.BeautifulSoup(res.text, "html.parser")
    number = []
    target = soup.find_all("div", class_="pg")
    for each in target:
        number.append(each.label.span.text)
    nr = re.search(r"\d+", str(number))
    print(number)
    if nr is None:  # no digits found: treat the thread as a single page
        return "1"
    else:
        return str(nr.group())
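
One caveat: that Cookie value is tied to a single logged-in session, so it will stop working once the session expires, and you would have to copy a fresh value out of your own browser's developer tools. If pasting the whole header feels clumsy, requests can also take the cookies as a dict. A minimal sketch with placeholder names and values (copy the real ones from your own logged-in session; the ones shown here are not real):

import requests
import bs4
import re

def get_num(url, cookies):  # same logic as above, with the cookies passed in separately
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'}
    res = requests.get(url, headers=headers, cookies=cookies)  # requests builds the Cookie header from the dict
    soup = bs4.BeautifulSoup(res.text, "html.parser")
    number = [each.label.span.text for each in soup.find_all("div", class_="pg")]
    nr = re.search(r"\d+", str(number))
    return "1" if nr is None else nr.group()

# Placeholder cookie entries -- replace them with the values from your own browser.
my_cookies = {"oMVX_2132_auth": "...", "PHPSESSID": "..."}
print(get_num("https://fishc.com.cn/thread-107659-1-1.html", my_cookies))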