[Project Showcase] Scraping a novel (Python 3.7.0, urllib, xpath, Python built-in modules)

Posted on 2019-5-31 14:33:34

[Attached screenshot: 使用方法.png (how to use the script)]




# Target site: https://www.biqiuge.com/     (笔趣阁 / Biquge)

from urllib.request import Request, urlopen
from urllib.error import URLError
from lxml import etree
from time import sleep
import os, re, random


# Fetch a page and return its raw bytes, or None if the request fails
def get_html(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36'}
    req = Request(url, headers=headers)
    try:
        response = urlopen(req)
    except URLError as e:
        if hasattr(e, "reason"):
            print("We failed to reach a server.")
            print("Reason:", e.reason)
        elif hasattr(e, "code"):
            print("The server couldn't fulfill the request.")
            print("Error code:", e.code)
        return None  # signal failure to the caller
    return response.read()
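A quick standalone check of get_html, as a minimal sketch (the URL is one of the post's own examples): the function returns raw bytes on success and None on failure, and the rest of the script decodes pages as GBK, so the caller does the same.

raw = get_html("https://www.biqiuge.com/book/4772/")
if raw is not None:
    print(raw.decode("gbk")[:200])  # peek at the first 200 characters
else:
    print("Fetch failed, nothing to parse.")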


# Parse the chapter list page; return [chapter_urls, chapter_titles]
def get_all_chapter(html):
    base_url = "https://www.biqiuge.com"
    start = int(input("Enter the starting chapter number: "))
    end = int(input("Enter the ending chapter number: "))

    html_x = etree.HTML(html)
    # skip the first 6 <dd> entries, which are apparently the site's "latest chapters" preview links
    chapter_title = html_x.xpath('//div[@class="listmain"]//dd[position() > 6]/a/text()')
    chapter_title = chapter_title[start-1:end]

    all_chapter = html_x.xpath('//div[@class="listmain"]//dd[position() > 6]/a/@href')
    # list comprehension: turn the relative hrefs into complete chapter URLs
    each_chapter_url = [base_url + suffix for suffix in all_chapter]
    each_chapter_url = each_chapter_url[start-1:end]
    return [each_chapter_url, chapter_title]
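To see what the two XPath queries return, here is a self-contained sketch with an invented fragment shaped like the site's chapter list (the titles and hrefs are made up for illustration; the position() > 6 filter is omitted because this toy fragment has no preview block):

from lxml import etree

sample = '''
<div class="listmain">
  <dl>
    <dd><a href="/book/4772/1.html">Chapter 1</a></dd>
    <dd><a href="/book/4772/2.html">Chapter 2</a></dd>
  </dl>
</div>
'''
tree = etree.HTML(sample)
print(tree.xpath('//div[@class="listmain"]//dd/a/text()'))  # ['Chapter 1', 'Chapter 2']
print(tree.xpath('//div[@class="listmain"]//dd/a/@href'))   # ['/book/4772/1.html', '/book/4772/2.html']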


# Fetch each chapter's content and save it as one .txt file per chapter
def get_content_and_save(all_chapter):
    files = input("Enter a folder name to save the novel: ")
    os.makedirs(files, exist_ok=True)  # don't crash if the folder already exists
    counter = 0
    for each_chapter_url in all_chapter[0]:
        html = get_html(each_chapter_url)
        if html is None:  # fetch failed; skip this chapter
            counter += 1
            continue
        html = html.decode("gbk")  # the site serves GBK-encoded pages
        tree = etree.HTML(html)
        # collect every text node inside <div id="content"> into a list
        fiction_content = tree.xpath('//div[@id="content"]/text()')
        # join the fragments into a single string
        text_content = ""
        for each_line_content in fiction_content:
            text_content += each_line_content
        beautiful_text = text_content.replace("\r", "\n").replace(" ", "")
        # save one file per chapter, named after the chapter title
        print("Downloading:", all_chapter[1][counter])
        with open("./" + files + "/" + all_chapter[1][counter] + ".txt", "w", encoding="utf-8") as f:
            f.write(beautiful_text)
        counter += 1
        # pause 3-6 seconds between requests to go easy on the server
        wait_time = random.choice([3, 4, 5, 6])
        sleep(wait_time)
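One caveat: chapter titles scraped from the page can contain characters that are illegal in Windows file names (for example ? * : "). The script already imports re, so a small helper could be applied to the title before open(). sanitize_title is a hypothetical name, not part of the original script:

import re

# Hypothetical helper: replace characters Windows forbids in file names
def sanitize_title(title):
    return re.sub(r'[\\/:*?"<>|]', "_", title).strip()

print(sanitize_title('Chapter 10: Who is he?'))  # -> 'Chapter 10_ Who is he_'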

# Main entry point
def get_fictions(url):
    html = get_html(url)
    html = html.decode("gbk")
    all_chapter = get_all_chapter(html)
    get_content_and_save(all_chapter)


if __name__ == "__main__":
    url = input("Enter the chapter list page URL: ")  # e.g. in the format below
    # url = "https://www.biqiuge.com/book/24277/"
    # url = "https://www.biqiuge.com/book/4772/"
    get_fictions(url)
    print("Download stopped!")
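An illustrative session (the chapter range and folder name are made up; the URL is one of the commented examples above):

Enter the chapter list page URL: https://www.biqiuge.com/book/4772/
Enter the starting chapter number: 1
Enter the ending chapter number: 3
Enter a folder name to save the novel: my_novel
Downloading: (title of chapter 1)
Downloading: (title of chapter 2)
Downloading: (title of chapter 3)
Download stopped!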