With a priority queue, the ordering problem is solved. The updated code is below:
"""
First attempt at a multithreaded crawler:
fetch every chapter of the novel 赘婿, parse it, and save it as a txt file.
Target url: http://www.xbiquge.la/0/885/
@author: 昨非
"""
from threading import Thread
from queue import PriorityQueue, Empty
from fake_useragent import UserAgent
import requests
from lxml import etree

# A random User-Agent per run makes the requests look less like a bot
headers = {
    "User-Agent": UserAgent().random
}
# Crawler thread: takes (index, url) items and queues (index, html) results
class GetInfo(Thread):
    def __init__(self, url_queue, html_queue):
        Thread.__init__(self)
        self.url_queue = url_queue
        self.html_queue = html_queue

    def run(self):
        while True:
            try:
                # Non-blocking get: checking empty() first and then calling a
                # blocking get() can hang a thread when another thread drains
                # the queue in between
                item = self.url_queue.get(block=False)
            except Empty:
                break
            url = item[1]
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                response.encoding = 'utf-8'  # crucial: decode the page as utf-8
                self.html_queue.put((item[0], response.text))
# Parser thread: takes (index, html) items, extracts the chapter, appends to file
class ParseInfo(Thread):
    def __init__(self, html_queue):
        Thread.__init__(self)
        self.html_queue = html_queue

    def run(self):
        while True:
            try:
                item2 = self.html_queue.get(block=False)
            except Empty:
                break
            e = etree.HTML(item2[1])
            chapter_names = e.xpath('//div[@class="bookname"]/h1/text()')
            chapter_contents = e.xpath('//div[@id="content"]/text()')
            for chapter_name in chapter_names:
                # Strip non-breaking spaces and blank lines, then rejoin the
                # remaining text nodes one paragraph per line
                lines = (i.replace('\xa0', '').strip() for i in chapter_contents)
                txt = '\n'.join(line for line in lines if line)
                with open('赘婿.txt', 'a', encoding='utf-8') as f:
                    f.write(chapter_name + '\n' + txt + '\n')
if __name__ == '__main__':
    # Queue of (chapter_index, url) pairs; the index is the priority,
    # so chapters come back out in book order
    url_queue = PriorityQueue()
    # Queue of (chapter_index, html) pairs
    html_queue = PriorityQueue()
    first_url = 'http://www.xbiquge.la/0/885/'
    response = requests.get(first_url, headers=headers)
    e = etree.HTML(response.content.decode('utf-8'))  # decode to str before parsing
    urls = e.xpath('//div[@class="box_con"]/div[@id="list"]/dl/dd/a/@href')
    for i, url in enumerate(urls):
        chapter_url = 'http://www.xbiquge.la' + url
        url_queue.put((i, chapter_url))
    # Start 100 crawler threads and wait for all pages to be fetched
    crawl_list = []
    for i in range(100):
        crawl = GetInfo(url_queue, html_queue)
        crawl_list.append(crawl)
        crawl.start()
    for crawl in crawl_list:
        crawl.join()
    # Then start 100 parser threads and wait for them to finish
    parse_list = []
    for i in range(100):
        parse = ParseInfo(html_queue)
        parse_list.append(parse)
        parse.start()
    for parse in parse_list:
        parse.join()
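
Why does putting (index, url) tuples into a PriorityQueue restore chapter order? PriorityQueue always hands back its smallest item first, and Python compares tuples element by element, so the integer index alone decides which item comes out next. A minimal standalone sketch (the chapter names here are made up for illustration):

from queue import PriorityQueue

q = PriorityQueue()
q.put((2, 'chapter-2'))  # inserted out of order on purpose
q.put((0, 'chapter-0'))
q.put((1, 'chapter-1'))

while not q.empty():
    print(q.get())  # (0, 'chapter-0'), then (1, ...), then (2, ...)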
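One remaining caveat: the parser threads pull chapters from the priority queue in index order, but with 100 of them appending to 赘婿.txt concurrently, two writes can still interleave. A rough sketch of one way to serialize the writes, assuming a module-level threading.Lock shared by all ParseInfo threads (the names file_lock and write_chapter are mine, not from the code above):

import threading

file_lock = threading.Lock()  # one lock shared by every parser thread (assumed name)

def write_chapter(chapter_name, txt):
    # Only one thread may append at a time, so a chapter never gets split
    with file_lock:
        with open('赘婿.txt', 'a', encoding='utf-8') as f:
            f.write(chapter_name + '\n' + txt + '\n')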