多线程爬虫阻塞queue问题

丶霁灵 · 发表于 2020-4-5 00:49:52

import requests
from lxml import etree
import re
import threading
from queue import Queue
import time
import random
class Producer(threading.Thread):
headers = {
'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"}
def __init__(self,catalog_queue,text_queue,*args,**kwargs):
super(Producer, self).__init__(*args,**kwargs)
self.catalog_queue = catalog_queue
self.text_queue = text_queue
def run(self):
while True:
if self.catalog_queue.empty(): #---------------------------------------------------------可能有多线程通过了这个判断，导致有的线程卡在了get()，但是没关系吧
break
catalog = self.catalog_queue.get()#---------------------------------------------------这里卡不卡无所谓
url = catalog[2]
chapter_text = self.find_text(self.get_url(url))
self.text_queue.put((catalog[0], catalog[1], chapter_text))#--------------------------------------我怀疑是没有放进去，但是不知道为啥
def get_url(self,url):
response = requests.get(url, headers=self.headers)
if response.encoding == 'ISO-8859-1':
response.encoding = response.apparent_encoding
html = response.text
return (html)
def find_text(self,html):
html = etree.HTML(html)
text_list = html.xpath('//div[@id="content"]/text()')
if '精彩小说无弹窗免费阅读' in text_list[0]:
text_list = text_list[1:]
p = re.compile(r'(\S+)')
k = []
for each_line in text_list:
line = each_line.replace(u'\u3000', '')
line = line.replace(u'\xa0', '')
a = p.findall(line)
b = ''.join(a)
k.append(b)
while '' in k:
k.remove('')
chapter_text = '\n\n'.join(k) + '\n\n\n\n'
return(chapter_text)
def get_url(url):
headers ={'user-agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"}
response = requests.get(url,headers = headers)
if response.encoding == 'ISO-8859-1':
response.encoding = response.apparent_encoding
html = response.text
return(html)
def find_catalog(html,target_url,catalog_queue):
html = etree.HTML(html)
name = html.xpath('//h1/text()')[0]
catalog_names = html.xpath('//dd/a/text()')
catalog_short_urls = html.xpath('//dd/a/@href')
#len_url = len(catalog_short_urls)
catalog_num = 1
count = 0
for each in catalog_short_urls[:]:
#catalog_urls.append(target_url+each)
#print((catalog_num,catalog_names[count],target_url+each))
a = (catalog_num,catalog_names[count],target_url+each)
catalog_queue.put(a)
catalog_num +=1
count += 1
return((name,catalog_num-1))
def len_catalog(html):
html = etree.HTML(html)
catalog_short_urls = html.xpath('//dd/a/@href')
len_url = len(catalog_short_urls)
return(len_url)
def main():
t1 = time.time()
start_time = time.asctime()
emple_url = 'https://www.52bqg.com/book_'
p=re.compile(r'book_(\d+)')
target_url = input('请输入笔趣阁52bqg.com的小说地址：')
urls = p.findall(target_url)[0]
print('正在解析网页')
if urls == '':
print('输入网页格式错误,倒计时两秒')
time.sleep(2)
raise
html = get_url(emple_url+urls)
len_url = len_catalog(html)
catalog_queue = Queue(len_url+10)
text_queue = Queue(len_url+10)
novel = find_catalog(html, target_url,catalog_queue)
print('----------------------解析成功---------------------------')
print('-----------小说名：{}-----------章节数：{}-----------'.format(novel[0], novel[1]))
for i in range(100):
t =Producer(catalog_queue,text_queue)#------------------------------------------------------------从目录catalog_queue去爬取每一章的文字放在text_queue里面
t.start()
temp = input('按回车键爬取')
print('正在爬取小说内容')
novel_text = []
# print(text_queue.qsize())
# raise
while 1:
print('请勿关闭程序！当前时间：'+time.asctime()+' 正在爬取 '+str(len(novel_text)+1)+'/'+str(novel[1])+' 章当前线程数量：'+str(len(threading.enumerate())-1))
if len(novel_text) >= len_url:
break
content = text_queue.get()#-------------------------------------------肯定是卡在这里了，他拿不到这个最后几个数据。
novel_text.append(content)#------------------------------------------把章节放在列表里面，好排序
print('爬取完成，正在写入')
time.sleep(1)
def take_one(novel_text):
return (novel_text[0])
novel_text.sort(key=take_one, reverse=False)#-------------------------------------------------------------------------------排序
begin_text = '------------------ {} ------------------\n'
with open(novel[0]+'.txt','w',encoding = 'utf-8') as f:
count_0 = 0
for i in novel_text:
print('正在写入 {} .....'.format(i[1]))
f.write(begin_text.format(i[1])+i[2])
count_0 += 1
end_time = time.asctime()
t2 = time.time()
print('\n\n开始时间：'+start_time)
print('结束时间：'+end_time)
print('小说写入 '+str(count_0)+' 个章节')
print('平均下载速率 %.2f 章/秒 '%(count_0/(t2-t1)))
temp = input('\n按回车键结束')
if __name__ == '__main__':
main()

复制代码

小说爬虫
最近学习了多线程爬虫，这里面没有用锁，但是用了queue还是经常阻塞。
程序的流程是先爬取所有章节的链接，然后开启多线程把每个章节的内容放在queue里面，然后再单线程放在列表里面，然后排序写入。

但是爬小说的时候，经常时不时的阻塞，我百度了超级多的方法，但是一直解决不了问题。我也不知道为啥会阻塞。因为上面的多线程里面的get阻塞了也没事呀，主进程结束了它也就结束了呀。
到底是为啥呀，难受

FC的注册很坑 · 发表于 2020-4-9 13:38:35

我运行了下，输入target_url：https://www.52bqg.com/book_127354
会报错。。。

顺便分享下

https://zhuanlan.zhihu.com/p/25228075?utm_source=wechat_session&utm_medium=social&utm_oi=28425901309952&s_s_i=Uij1R8pnmkjCPiO4HI66ySfC%2FyMZpeB3dNV9SUmG%2FjY%3D&s_r=1

兢兢 · 发表于 2020-4-12 20:05:28

很好,1400多章的小说，10分钟爬完

hwhrr · 发表于 2020-4-14 13:28:14

兢兢发表于 2020-4-12 20:05
很好,1400多章的小说，10分钟爬完

太慢了，可以看看这个
https://url.ms/mol4x

兢兢 · 发表于 2020-4-14 21:28:20

获取小说的目录链接时没有过滤掉部分空链接
有一部小说，总共有64章，但是有68个链接

账号		自动登录	找回密码
密码			立即注册