|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 chenxz186 于 2020-6-19 18:29 编辑
好久没有写爬取小说的程序了,今天放一个上来,不多说了,直接上图上代码。
- import requests
- import pyquery
- import os
- import easygui as g
- import time
- import datetime
def catch_exception(func):
    """Decorate *func* so selected errors open a continue/exit dialog.

    When *func* raises ``TimeoutError`` or ``AttributeError`` an easygui
    button box is shown. Choosing 'continue' calls *func* again with the
    same arguments; choosing 'exit' (or closing the dialog) terminates the
    program with ``SystemExit``. Any other exception propagates unchanged.

    Returns:
        A wrapper that returns whatever *func* returns.
    """
    # Imported locally so this decorator does not depend on the file's
    # top-level import list.
    import functools

    title = '出错显示'
    choices = ['continue', 'exit']
    timeout_msg = '由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败,是否继续'
    attribute_msg = 'NoneType object has no attribute find_all'

    @functools.wraps(func)
    def error_func(*args, **kwargs):
        # Retry in a loop rather than by recursion so repeated 'continue'
        # clicks do not deepen the call stack.
        while True:
            try:
                return func(*args, **kwargs)
            except TimeoutError:
                msg = timeout_msg
            except AttributeError:
                msg = attribute_msg
            choice = g.buttonbox(msg, title, choices)
            if choice != choices[0]:
                # None (dialog closed) or 'exit'. SystemExit is raised
                # directly because the exit() helper only exists when the
                # site module is loaded.
                raise SystemExit
    return error_func
def statistics_time(func):
    """Decorate *func* so the elapsed time of each call is printed.

    The duration is printed as a ``datetime.timedelta`` after *func*
    returns successfully; nothing is printed if *func* raises.

    Returns:
        A wrapper that returns whatever *func* returns.
    """
    # Imported locally so this decorator does not depend on the file's
    # top-level import list.
    import functools

    @functools.wraps(func)
    def test_time(*args, **kwargs):
        start_time = datetime.datetime.now()
        result = func(*args, **kwargs)
        print(datetime.datetime.now() - start_time)
        # Hand the wrapped function's result back to the caller instead of
        # silently replacing it with None.
        return result
    return test_time
class ParentFrame(object):
    """Base class holding the scraper helpers shared by concrete crawlers.

    Subclasses inherit the HTTP fetch, the HTML-to-PyQuery conversion and
    the output-folder dialog so they need not be rewritten per crawler.
    """

    def __init__(self):
        # Browser-like User-Agent sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
                          '78.0.3904.108 Safari/537.36'
        }

    def open_url(self, url, timeout=10):
        """Fetch *url* with the instance headers and return the response.

        Args:
            url: Address to request.
            timeout: Seconds to wait for the server before giving up.
                Without a timeout ``requests.get`` may block indefinitely.

        Raises:
            TimeoutError: If the request times out. requests raises its own
                ``requests.exceptions.Timeout``, which is not a subclass of
                the builtin; it is translated here so handlers written for
                ``TimeoutError`` see it.
        """
        try:
            return requests.get(url, headers=self.headers, timeout=timeout)
        except requests.exceptions.Timeout as err:
            raise TimeoutError(str(err)) from err

    @staticmethod
    def build_doc(res):
        """Parse the text of response *res* into a PyQuery document."""
        return pyquery.PyQuery(res.text)

    @staticmethod
    def save_path():
        """Ask the user for the download folder and make it the working directory.

        Returns:
            The chosen directory, as reported by ``os.getcwd()`` after the
            change.

        Raises:
            SystemExit: If the dialog is cancelled.
        """
        msg = '小说会保存在此路径下'
        title = '存放路径选择'
        dir_path = g.diropenbox(msg, title)
        if dir_path is None:
            # Raised directly: the exit() helper only exists when the site
            # module is loaded.
            raise SystemExit
        os.chdir(dir_path)
        return os.getcwd()
class Fiction(ParentFrame):
    """Interactive novel downloader for www.23txt.com.

    The user types a title, picks one of the search results, and every
    chapter listed on the novel's page is appended to ``<title>.txt`` in
    the folder chosen at construction time. If the file already exists,
    only chapters whose titles are not yet in it are fetched.
    """

    def __init__(self, search_url):
        """Store *search_url* (search endpoint ending in ``keyword=``).

        Constructing an instance opens the folder-selection dialog.
        """
        super().__init__()
        self.dir_path = self.save_path()
        self.home_url = 'https://www.23txt.com'
        self.search_url = search_url

    def find_fiction(self, name, url):
        """Show the search results found at *url* and let the user pick one.

        Args:
            name: The searched title. Unused; kept so existing callers
                keep working.
            url: Search-result page to fetch.

        Returns:
            ``(title, url)`` of the chosen novel, or ``None`` when the user
            picked an entry that is not a real search result, so the caller
            should ask for a new title.

        Raises:
            SystemExit: If the choice dialog is cancelled.
        """
        res = self.open_url(url)
        res.encoding = 'utf-8'
        doc = self.build_doc(res)
        fiction_names_urls = {}
        for each in doc('h3').items():
            link = each.find('a')
            fiction_name = link.attr('title')
            fiction_url = link.attr('href')
            # Headings without a usable link would otherwise be stored
            # under a None key and offered as a bogus choice.
            if fiction_name and fiction_url:
                fiction_names_urls[fiction_name] = fiction_url
        print(fiction_names_urls)
        msg = '选择你要下载的小说'
        title = '选择框'
        choice = g.choicebox(msg, title, list(fiction_names_urls))
        if choice is None:
            raise SystemExit
        if choice not in fiction_names_urls:
            # The dialog can hand back a placeholder entry ('Add more
            # choices') when there are too few results. Report it and
            # return, instead of recursing into run().
            print('没有这部小说,要重新选择')
            return None
        return choice, fiction_names_urls[choice]

    def download_tool(self, filename, url):
        """Fetch the chapter page at *url* and append it to *filename*.

        The page's ``<h1>`` text is written as the heading, followed by
        the text of the ``#content`` element.
        """
        res = self.open_url(url)
        res.encoding = 'GBK'
        doc = self.build_doc(res)
        title_tag = doc('h1').text()
        content_tags = doc('#content').text()
        chapter_content = title_tag + '\n\n' + content_tags + '\n\n\n\n'
        with open(filename, 'a', encoding='utf-8') as f:
            f.write(chapter_content)

    @statistics_time
    def fiction_chapters_download(self, filename, url):
        """Download the chapters listed at *url* into *filename*.

        If *filename* already exists, chapters whose list title already
        occurs in the file are skipped; when nothing is left to fetch, a
        message box says so. Otherwise every listed chapter is downloaded.

        Raises:
            SystemExit: If the "nothing to update" message box returns None.
        """
        res = self.open_url(url)
        res.encoding = 'GBK'
        doc = self.build_doc(res)

        resuming = os.path.exists(filename)
        fiction_content = ''
        if resuming:
            with open(filename, 'r', encoding='utf-8') as f:
                fiction_content = f.read()

        pending = []
        for each in doc('dd').items():
            href = each.find('a').attr('href')
            if href is None:
                # A <dd> without a link has nothing to download, and
                # concatenating None to home_url would raise TypeError.
                continue
            chapter_title = each.text()
            if resuming and chapter_title in fiction_content:
                continue
            pending.append((chapter_title, self.home_url + href))

        if resuming and not pending:
            if g.msgbox('没有可更新的章节') is None:
                raise SystemExit
            return

        for chapter_title, chapter_url in pending:
            print(chapter_title)
            self.download_tool(filename, chapter_url)
            # Short pause between chapter requests.
            time.sleep(0.5)

    def run(self):
        """Prompt for titles in a loop, downloading each selected novel.

        The loop ends only when the user cancels a dialog.

        Raises:
            SystemExit: If the title prompt is cancelled.
        """
        # Imported locally so this method does not depend on the file's
        # top-level import list.
        from urllib.parse import quote

        msg = '请输入你要查找的小说名,例如《我老婆是鬼王》,即输入 我老婆是鬼王 '
        title = '小说搜索'
        while True:
            name = g.enterbox(msg, title)
            if name is None:
                raise SystemExit
            # Strip spaces before the emptiness check so an all-space entry
            # does not trigger a search with an empty keyword.
            name = name.replace(' ', '')
            if not name:
                continue
            print(name)
            # Percent-encode the keyword so characters such as '&' or '#'
            # cannot alter the query string.
            fiction_search_url = self.search_url + quote(name, safe='')
            found = self.find_fiction(name, fiction_search_url)
            if found is None:
                continue
            fiction_name, fiction_url = found
            print(fiction_name, fiction_url)
            filename = os.path.join(self.dir_path, fiction_name + '.txt')
            self.fiction_chapters_download(filename, fiction_url)

    @classmethod
    @catch_exception
    def main(cls):
        """Build a downloader for the site's search endpoint and run it."""
        search_url = 'https://www.23txt.com/search.php?keyword='
        fiction = cls(search_url)
        fiction.run()
# Start the interactive downloader only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    Fiction.main()
复制代码 |
-
-
|