再写一个爬取小说的虫虫
本帖最后由 chenxz186 于 2020-6-19 18:29 编辑

好久没有写爬小说的程序了,今天放一个上来,不多说了,直接上图上代码。
import datetime
import functools
import os
import time

import easygui as g
import pyquery
import requests
def catch_exception(func):
    """Wrap ``func`` so selected failures open a continue/exit dialog.

    When ``func`` raises ``TimeoutError`` or ``AttributeError`` an easygui
    button box is shown. Picking ``continue`` restarts the program through
    ``Fiction.main()``; picking ``exit`` (or getting ``None`` back from the
    dialog) terminates the process.

    Args:
        func: The callable to guard.

    Returns:
        A wrapper that forwards every argument to ``func`` and returns its
        result when no handled exception occurs.
    """
    choices = ['continue', 'exit']

    def ask_to_continue(msg):
        """Show the error dialog and act on the button that was pressed."""
        choice = g.buttonbox(msg, '出错显示', choices)
        # The pasted source compared ``choice`` with the whole list, which is
        # never true; the individual button labels are what must be tested.
        if choice is None or choice == choices[1]:
            # ``raise SystemExit`` works even when the ``site`` helper
            # ``exit()`` is unavailable (e.g. frozen executables).
            raise SystemExit
        if choice == choices[0]:
            Fiction.main()

    @functools.wraps(func)
    def error_func(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except TimeoutError:
            ask_to_continue(
                '由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败,是否继续'
            )
        except AttributeError:
            ask_to_continue('NoneType object has no attribute find_all')
        return None

    return error_func
def statistics_time(func):
    """Decorate ``func`` so that each call prints its wall-clock duration.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that calls ``func`` with the same arguments, prints the
        elapsed ``datetime.timedelta`` and returns ``func``'s result.
    """
    @functools.wraps(func)
    def test_time(*args, **kwargs):
        start_time = datetime.datetime.now()
        # Keep the result so decorating a function does not silently turn
        # its return value into None.
        result = func(*args, **kwargs)
        print(datetime.datetime.now() - start_time)
        return result

    return test_time
class ParentFrame(object):
    """Reusable scraper base: HTTP fetch, HTML parsing and output-folder prompt.

    Subclasses inherit these helpers so they do not have to be rewritten
    for every new crawler.
    """

    def __init__(self):
        """Set the request headers sent with every page fetch."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
                          '78.0.3904.108 Safari/537.36'
        }

    def open_url(self, url, timeout=10):
        """Fetch ``url`` and return the ``requests`` response.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up.
                Without a timeout ``requests.get`` may block indefinitely.

        Returns:
            The response object produced by ``requests.get``.

        Raises:
            TimeoutError: When ``requests`` reports a timeout. Its own
                ``Timeout`` exception is not a builtin ``TimeoutError``, so
                it is translated here for handlers that catch the builtin.
        """
        try:
            return requests.get(url, headers=self.headers, timeout=timeout)
        except requests.exceptions.Timeout as err:
            raise TimeoutError(str(err)) from err

    @staticmethod
    def build_doc(res):
        """Parse the text of response ``res`` into a ``PyQuery`` document."""
        return pyquery.PyQuery(res.text)

    @staticmethod
    def save_path():
        """Ask for the download folder, make it the working directory.

        Returns:
            The current working directory after switching to the folder
            picked in the dialog.

        Raises:
            SystemExit: When the dialog returns ``None`` (no folder chosen).
        """
        dir_path = g.diropenbox('小说会保存在此路径下', '存放路径选择')
        if dir_path is None:
            # Does not depend on the ``site``-provided ``exit()`` helper.
            raise SystemExit
        os.chdir(dir_path)
        return os.getcwd()
class Fiction(ParentFrame):
    """Interactive novel downloader for www.23txt.com.

    The user types a title, picks one of the search hits and every chapter
    is appended to ``<title>.txt`` inside the chosen folder. Running it
    again on an existing file only fetches chapters whose title is not yet
    in the file.
    """

    def __init__(self, search_url):
        """Prompt for the output folder and store the site addresses.

        Args:
            search_url: Search URL prefix; the typed novel name is appended
                to it to build the query.
        """
        super().__init__()
        self.dir_path = self.save_path()
        self.home_url = 'https://www.23txt.com'
        self.search_url = search_url

    def find_fiction(self, name, url):
        """Let the user choose one novel from a search-result page.

        Args:
            name: The search keyword. Unused here; kept so existing callers
                keep working.
            url: Full search URL to fetch.

        Returns:
            ``(fiction_name, fiction_url)`` for the chosen entry, or
            ``None`` when there is nothing valid to choose and the caller
            should ask for another title.

        Raises:
            SystemExit: When the selection dialog returns ``None``.
        """
        res = self.open_url(url)
        res.encoding = 'utf-8'
        doc = self.build_doc(res)
        fiction_names_urls = {}
        choices = []
        for each in doc('h3').items():
            link = each.find('a')
            fiction_name = link.attr('title')
            fiction_url = link.attr('href')
            # Skip headings that are not usable result links.
            if not fiction_name or not fiction_url:
                continue
            if fiction_name not in fiction_names_urls:
                choices.append(fiction_name)
            # Map each title to its own URL. The pasted code rebound the
            # whole dict to the URL, so the last hit always won.
            fiction_names_urls[fiction_name] = fiction_url
        print(fiction_names_urls)
        if not choices:
            print('没有这部小说,要重新选择')
            return None
        choice = g.choicebox('选择你要下载的小说', '选择框', choices)
        if choice is None:
            raise SystemExit
        if choice not in fiction_names_urls:
            # Covers the 'Add more choices' entry the original code tested
            # for (presumably a filler easygui adds to short lists).
            print('没有这部小说,要重新选择')
            return None
        return choice, fiction_names_urls[choice]

    def download_tool(self, filename, url):
        """Append one chapter (heading plus body text) to ``filename``.

        Args:
            filename: Path of the UTF-8 text file to append to.
            url: Address of the chapter page, decoded as GBK.
        """
        res = self.open_url(url)
        res.encoding = 'GBK'
        doc = self.build_doc(res)
        heading = doc('h1').text()
        body = doc('#content').text()
        with open(filename, 'a', encoding='utf-8') as f:
            f.write(heading + '\n\n' + body + '\n\n\n\n')

    @statistics_time
    def fiction_chapters_download(self, filename, url):
        """Download all chapters, or only the missing ones, of a novel.

        Args:
            filename: Target text file. When it already exists, chapters
                whose title already occurs in it are skipped.
            url: Address of the novel's chapter-index page, decoded as GBK.

        Raises:
            SystemExit: When nothing needs updating and the notice dialog
                returns ``None``.
        """
        res = self.open_url(url)
        res.encoding = 'GBK'
        doc = self.build_doc(res)
        chapters = []
        for each in doc('dd').items():
            href = each.find('a').attr('href')
            if href is None:
                # A <dd> without a link cannot be downloaded; skipping it
                # avoids a TypeError from ``str + None``.
                continue
            chapters.append((each.text(), self.home_url + href))
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                fiction_content = f.read()
            chapters = [
                (chapter_title, chapter_url)
                for chapter_title, chapter_url in chapters
                if chapter_title not in fiction_content
            ]
            if not chapters:
                if g.msgbox('没有可更新的章节') is None:
                    raise SystemExit
                return
        for chapter_title, chapter_url in chapters:
            print(chapter_title)
            self.download_tool(filename, chapter_url)
            # Short pause between chapter requests.
            time.sleep(0.5)

    def run(self):
        """Loop forever: ask for a title, choose a hit, download it.

        Raises:
            SystemExit: When the title dialog returns ``None``.
        """
        msg = '请输入你要查找的小说名,例如《我老婆是鬼王》,即输入 我老婆是鬼王 '
        title = '小说搜索'
        while True:
            name = g.enterbox(msg, title)
            if name is None:
                raise SystemExit
            # Strip spaces before the emptiness test so an all-blank entry
            # does not trigger an empty search.
            name = name.replace(' ', '')
            if name == '':
                continue
            print(name)
            found = self.find_fiction(name, self.search_url + name)
            if found is None:
                # Nothing selectable: ask for another title rather than
                # recursing into run() again.
                continue
            fiction_name, fiction_url = found
            print(fiction_name, fiction_url)
            filename = os.path.join(self.dir_path, fiction_name + '.txt')
            self.fiction_chapters_download(filename, fiction_url)

    @classmethod
    @catch_exception
    def main(cls):
        """Program entry point: build a downloader and start its loop."""
        search_url = 'https://www.23txt.com/search.php?keyword='
        cls(search_url).run()
# Start the downloader only when the file is run as a script, not on import.
if __name__ == '__main__':
    Fiction.main()
20行 if choice is None and choice == choices: 这个条件不可能为真。

永恒的蓝色梦想 发表于 2020-6-19 13:52
20行这个条件不可能为真。
谢谢指正,我当时写得急了,应该用 or。
页:
[1]