import requests
from bs4 import BeautifulSoup

# Scrape every chapter title and chapter body of the novel
if __name__ == '__main__':
    url = 'https://www.xbiquge.la/13/13959/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
    }

    # Fetch the index page
    page_text1 = requests.get(url=url, headers=headers)
    page_text1.encoding = 'utf-8'
    page_text = page_text1.text
    #print(page_text)
    # Parse the chapter titles and detail-page URLs out of the index page
    # 1. Instantiate a BeautifulSoup object loaded with the page source
    #    (the 'lxml' parser needs the lxml package installed; importing lxml directly is unnecessary)
    soup = BeautifulSoup(page_text, 'lxml')
    # Each chapter link sits in a <dd> element under .box_con
    dd_list = soup.select('.box_con > div > dl > dd')
    print(dd_list)
    fp = open('./shengxu.txt', 'w', encoding='utf-8')
    for dd in dd_list:
        title = dd.a.string
        # href already begins with '/', so no trailing slash after the domain
        detail_url = 'https://www.xbiquge.la' + dd.a['href']
        # Request the detail page to get the chapter body
        detail_text1 = requests.get(url=detail_url, headers=headers)
        detail_text1.encoding = 'utf-8'
        detail_text = detail_text1.text
        # Parse the chapter body out of the detail page
        detail_soup = BeautifulSoup(detail_text, 'lxml')
        # The chapter body lives in <div id='content'>
        div_tag = detail_soup.find('div', id='content')
        content = div_tag.text
        fp.write(title + ':' + content + '\n')
        print(title, 'scraped successfully!')
    fp.close()
How do I fix the error below? Orz

[error screenshot from the original post not preserved]
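If the failure is a connection error or timeout caused by the site's rate limiting, one common workaround is to give every request a timeout and retry after a pause. A minimal sketch of such a helper (the retry count, delay values, and exception handling are assumptions for illustration, not part of the original script):

import time
import requests

def fetch(url, headers, retries=3, delay=3, timeout=10):
    # Retry with a growing pause; rate limiting is the usual cause of resets here
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(url=url, headers=headers, timeout=timeout)
            resp.encoding = 'utf-8'
            resp.raise_for_status()
            return resp.text
        except requests.exceptions.RequestException:
            time.sleep(delay * attempt)
    raise RuntimeError('failed to fetch ' + url + ' after ' + str(retries) + ' attempts')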
You've triggered the site's anti-scraping mechanism; leave a gap of a few seconds between fetches.
I wrote one of these too, and the same problem you describe appeared after a few pages; it went away once I added a sleep.
import requests
from bs4 import BeautifulSoup
import time

url = 'https://www.xbiquge.la/13/13959/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
}
res = requests.get(url=url, headers=headers)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
s = soup.find_all('dd')
f = open('mybook.text', 'w', encoding='utf-8')
for each in s:
    title = each.a.text
    url = 'https://www.xbiquge.la' + each.a['href']
    res = requests.get(url=url, headers=headers)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    s1 = soup.find('div', id='content')
    f.write('=' * 20 + '\n' + title + '\n' + '=' * 20 + '\n')
    f.write(s1.text + '\n')
    print(title, 'scraped successfully!')
    time.sleep(2)  # pause between chapters so the anti-scraping check is not triggered
f.close()
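A fixed two-second sleep works, but if the block comes back, two further tweaks may help: randomize the delay so requests are not evenly spaced, and reuse a single requests.Session so every chapter shares one connection and one set of headers. A sketch of that variant (the 1.5-4 s jitter range is an arbitrary assumption):

import random
import time
import requests

# One Session reuses the TCP connection and sends the same headers every time
session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
})

def get_page(url):
    # Fetch one page, then pause a random 1.5-4 s before the caller continues
    res = session.get(url, timeout=10)
    res.encoding = 'utf-8'
    time.sleep(random.uniform(1.5, 4.0))
    return res.text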
 