OP | Posted 2021-4-21 17:38:36
Priority queue: the ordering problem is now solved. Updated code below:
- """
- 多线程爬虫首次尝试:
- 爬取赘婿小说全部章节,解析并保存为txt
- 目标url:http://www.xbiquge.la/0/885/
- @author: 昨非
- """
- from threading import Thread
- from queue import PriorityQueue
- from fake_useragent import UserAgent
- import requests
- from lxml import etree
- headers = {
- "User-Agent": UserAgent().random
- }
# Crawler thread: takes (index, url) pairs off url_queue and pushes (index, html)
class GetInfo(Thread):
    def __init__(self, url_queue, html_queue):
        Thread.__init__(self)
        self.url_queue = url_queue
        self.html_queue = html_queue

    def run(self):
        while True:
            try:
                # get_nowait() avoids the race between checking empty() and get()
                item = self.url_queue.get_nowait()
            except Empty:
                break
            url = item[1]
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                response.encoding = 'utf-8'  # this step is crucial
                # keep the chapter index so the parsers can preserve order
                self.html_queue.put((item[0], response.text))
# Parser thread: extracts the chapter title and body, appends them to the txt
class ParseInfo(Thread):
    def __init__(self, html_queue):
        Thread.__init__(self)
        self.html_queue = html_queue

    def run(self):
        while True:
            try:
                item = self.html_queue.get_nowait()
            except Empty:
                break
            e = etree.HTML(item[1])
            chapter_names = e.xpath('//div[@class="bookname"]/h1/text()')
            chapter_contents = e.xpath('//div[@id="content"]/text()')
            for chapter_name in chapter_names:
                # strip non-breaking spaces and blank lines, keep paragraph breaks
                lines = (i.replace('\xa0', '').strip() for i in chapter_contents)
                txt = '\n'.join(line for line in lines if line)
                with open('赘婿.txt', 'a', encoding='utf-8') as f:
                    f.write(chapter_name + '\n' + txt + '\n')
if __name__ == '__main__':
    # queue of (index, url) pairs to crawl
    url_queue = PriorityQueue()
    # queue of (index, html) pairs to parse
    html_queue = PriorityQueue()

    first_url = 'http://www.xbiquge.la/0/885/'
    response = requests.get(first_url, headers=headers)
    e = etree.HTML(response.content.decode('utf-8'))  # decode to str before parsing
    urls = e.xpath('//div[@class="box_con"]/div[@id="list"]/dl/dd/a/@href')
    for i, url in enumerate(urls):
        chapter_url = 'http://www.xbiquge.la' + url
        # the chapter index doubles as the priority, so gets come back in order
        url_queue.put((i, chapter_url))

    # start the crawler threads and wait for all of them to finish
    crawl_list = []
    for i in range(100):
        crawl = GetInfo(url_queue, html_queue)
        crawl_list.append(crawl)
        crawl.start()
    for crawl in crawl_list:
        crawl.join()

    # then start the parser threads
    parse_list = []
    for i in range(100):
        parse = ParseInfo(html_queue)
        parse_list.append(parse)
        parse.start()
    for parse in parse_list:
        parse.join()
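
The ordering trick is simply that PriorityQueue always hands back the entry with the smallest first tuple element, so tagging each chapter with its index restores the original order no matter which thread fetched it first. A minimal sketch of just that idea (the sample data is illustrative only):

from queue import PriorityQueue

q = PriorityQueue()
# put chapters in scrambled order, each tagged with its original index
for pair in [(2, 'chapter three'), (0, 'chapter one'), (1, 'chapter two')]:
    q.put(pair)

# get() always returns the pair with the smallest index first
while not q.empty():
    print(q.get())  # (0, 'chapter one'), then (1, ...), then (2, ...)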
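
One more detail worth showing: checking empty() and then calling get() as two separate steps is racy when several threads share one queue, because another thread can drain it in between the two calls. The get_nowait()/Empty pattern used in the classes above is the usual safe way to drain a queue; a small standalone sketch, assuming a worker should simply exit when nothing is left:

from queue import PriorityQueue, Empty

def drain(q):
    while True:
        try:
            item = q.get_nowait()  # never blocks; raises Empty when exhausted
        except Empty:
            break                  # queue drained, worker can exit
        print(item)

q = PriorityQueue()
for i in range(3):
    q.put((i, 'job %d' % i))
drain(q)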