再写一个爬取小说的虫虫,Python交流,编程语言专区,鱼C论坛

chenxz186 发表于 2020-6-19 13:43:26

再写一个爬取小说的虫虫

本帖最后由 chenxz186 于 2020-6-19 18:29 编辑

好久没有再写爬取小说的爬虫了，今天放一个上来，不多说了，直接上图上代码。

import datetime
import functools
import os
import time

import easygui as g
import pyquery
import requests

def catch_exception(func):
    """Decorate *func* so that selected errors open a continue/exit dialog.

    ``TimeoutError`` and ``AttributeError`` raised by *func* are caught and
    reported through an easygui button box with the buttons ``continue`` and
    ``exit``.  Pressing ``continue`` restarts the program through
    ``Fiction.main()``; pressing ``exit`` or closing the dialog ends the
    process.  Every other exception propagates unchanged.

    Args:
        func: The callable to protect.

    Returns:
        A wrapper with the same signature that passes through the return
        value of *func*.
    """
    title = '出错显示'
    choices = ['continue', 'exit']

    @functools.wraps(func)
    def error_func(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except TimeoutError:
            msg = '由于连接方在一段时间后没有正确答复或连接的主机没有反应，连接尝试失败，是否继续'
        except AttributeError:
            msg = 'NoneType object has no attribute find_all'

        # Only reached after one of the handled errors above.
        choice = g.buttonbox(msg, title, choices)
        # Compare with the individual labels: the clicked label is a string,
        # so comparing it with the whole ``choices`` list is never true.
        if choice == choices[0]:
            return Fiction.main()
        # ``exit`` was pressed or the dialog was closed (``None``).
        raise SystemExit

    return error_func

def statistics_time(func):
    """Decorate *func* so that the duration of each call is printed.

    The elapsed time is measured with ``datetime.datetime.now()`` and printed
    as a ``timedelta`` once *func* has returned normally.

    Args:
        func: The callable to time.

    Returns:
        A wrapper with the same signature that passes through the return
        value of *func*.
    """

    @functools.wraps(func)
    def test_time(*args, **kwargs):
        start_time = datetime.datetime.now()
        # Keep the result so that decorating a function does not silently
        # turn its return value into None.
        outcome = func(*args, **kwargs)
        print(datetime.datetime.now() - start_time)
        return outcome

    return test_time

class ParentFrame(object):
    """Base class holding the helpers shared by crawlers in this script.

    It bundles the request headers, page fetching, PyQuery parsing and the
    download-directory dialog so that a new crawler can inherit them instead
    of rewriting them.
    """

    def __init__(self):
        # Browser-like User-Agent sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
                          '78.0.3904.108 Safari/537.36'
        }

    def open_url(self, url, timeout=10):
        """Fetch *url* and return the ``requests`` response.

        Args:
            url: Address to request.
            timeout: Seconds to wait for the server before giving up.  Without
                a timeout ``requests`` waits indefinitely on a stalled
                connection.

        Raises:
            TimeoutError: If the server does not answer within *timeout*
                seconds.
        """
        try:
            return requests.get(url, headers=self.headers, timeout=timeout)
        except requests.exceptions.Timeout as err:
            # Re-raise as the built-in TimeoutError, which is the exception
            # type the catch_exception decorator in this script handles.
            raise TimeoutError(
                f'request to {url} timed out after {timeout} seconds'
            ) from err

    @staticmethod
    def build_doc(res):
        """Parse the text of response *res* into a PyQuery document."""
        return pyquery.PyQuery(res.text)

    @staticmethod
    def save_path():
        """Ask for the target directory, switch into it and return it.

        The process exits when the dialog is cancelled.  As a side effect the
        current working directory is changed to the chosen directory.
        """
        msg = '小说会保存在此路径下'
        title = '存放路径选择'
        dir_path = g.diropenbox(msg, title)
        if dir_path is None:
            raise SystemExit
        os.chdir(dir_path)
        return os.getcwd()

class Fiction(ParentFrame):
    """Interactive novel downloader for the site at ``self.home_url``.

    The user types a title, picks one of the search hits, and every chapter
    of that novel is appended to ``<title>.txt`` in the chosen directory.
    Running it again on an existing file only fetches chapters whose titles
    are not yet present in the file.
    """

    def __init__(self, search_url):
        """Ask for the download directory and remember *search_url*.

        Args:
            search_url: Search URL prefix; the novel name is appended to it.
        """
        super().__init__()
        self.dir_path = self.save_path()
        self.home_url = 'https://www.23txt.com'
        self.search_url = search_url

    def find_fiction(self, name, url):
        """Let the user pick a novel from the search result page at *url*.

        Args:
            name: The search term (not used by the lookup itself; kept for
                interface compatibility).
            url: Full search URL to fetch.

        Returns:
            ``(title, novel_url)`` for the selected novel, or ``(None, None)``
            when the search produced nothing selectable and the user should
            search again.  The process exits if the dialog is cancelled.
        """
        res = self.open_url(url)
        res.encoding = 'utf-8'
        doc = self.build_doc(res)

        # Map every hit's title to its URL so that the URL returned really
        # belongs to the title the user selects.
        fiction_names_urls = {}
        for each in doc('h3').items():
            link = each.find('a')
            fiction_name = link.attr('title')
            fiction_url = link.attr('href')
            if fiction_name and fiction_url:
                # Keep the first hit when a title occurs more than once.
                fiction_names_urls.setdefault(fiction_name, fiction_url)
        print(fiction_names_urls)

        if not fiction_names_urls:
            print('没有这部小说，要重新选择')
            return None, None

        msg = '选择你要下载的小说'
        title = '选择框'
        choice = g.choicebox(msg, title, list(fiction_names_urls))
        if choice is None:
            raise SystemExit
        if choice not in fiction_names_urls:
            # The dialog can hand back an entry that is not a real hit (the
            # original code tested for the label 'Add more choices').
            print('没有这部小说，要重新选择')
            return None, None
        return choice, fiction_names_urls[choice]

    def download_tool(self, filename, url):
        """Append the chapter at *url* (heading plus text) to *filename*."""
        res = self.open_url(url)
        res.encoding = 'GBK'
        doc = self.build_doc(res)
        heading = doc('h1').text()
        body = doc('#content').text()
        with open(filename, 'a', encoding='utf-8') as f:
            f.write(heading + '\n\n' + body + '\n\n\n\n')

    @statistics_time
    def fiction_chapters_download(self, filename, url):
        """Download the chapters listed at *url* into *filename*.

        If *filename* already exists, only chapters whose title does not
        occur in the file's text are fetched, and a message box is shown when
        nothing is missing.  Otherwise every listed chapter is fetched.

        NOTE: the original author's docstring says there was a bug here that
        has since been resolved.
        """
        res = self.open_url(url)
        res.encoding = 'GBK'
        doc = self.build_doc(res)

        updating = os.path.exists(filename)
        fiction_content = ''
        if updating:
            # Read and close the file before appending to it below.
            with open(filename, 'r', encoding='utf-8') as f:
                fiction_content = f.read()

        pending = []
        for each in doc('dd').items():
            href = each.find('a').attr('href')
            if href is None:
                # A <dd> without a link cannot be downloaded.
                continue
            chapter_title = each.text()
            if updating and chapter_title in fiction_content:
                continue
            pending.append((chapter_title, self.home_url + href))

        if updating and not pending:
            if g.msgbox('没有可更新的章节') is None:
                raise SystemExit
            return

        for chapter_title, chapter_url in pending:
            print(chapter_title)
            self.download_tool(filename, chapter_url)
            # Short pause between chapter requests.
            time.sleep(0.5)

    def run(self):
        """Prompt for novel names in a loop and download each selection.

        The loop ends only when the process exits, e.g. because a dialog was
        cancelled.
        """
        msg = '请输入你要查找的小说名，例如《我老婆是鬼王》,即输入我老婆是鬼王 '
        title = '小说搜索'
        while True:
            name = g.enterbox(msg, title)
            if name is None:
                raise SystemExit
            # Strip spaces first so that a blank-only entry is re-prompted
            # instead of being sent as an empty search.
            name = name.replace(' ', '')
            if not name:
                continue
            print(name)
            fiction_search_url = self.search_url + name
            fiction_name, fiction_url = self.find_fiction(name, fiction_search_url)
            if fiction_name is None:
                # Nothing selectable was found; ask for another title.
                continue
            print(fiction_name, fiction_url)
            filename = os.path.join(self.dir_path, fiction_name + '.txt')
            self.fiction_chapters_download(filename, fiction_url)

    @classmethod
    @catch_exception
    def main(cls):
        """Create a downloader for the site's search page and run it."""
        search_url = 'https://www.23txt.com/search.php?keyword='
        fiction = cls(search_url)
        fiction.run()

if __name__ == '__main__':
    # Start the interactive downloader only when run as a script.
    Fiction.main()

永恒的蓝色梦想 发表于 2020-6-19 13:52:20

20行if choice is None and choice == choices:这个条件不可能为真。

chenxz186 发表于 2020-6-19 15:10:24

永恒的蓝色梦想发表于 2020-6-19 13:52
20行这个条件不可能为真。

谢谢指正，我当时写得急了，应该用 or。

页: [1]

鱼C论坛's Archiver

再写一个爬取小说的虫虫