|
5鱼币
- import requests
- from lxml import etree
- import re
- import threading
- from queue import Queue
- import time
- import random
- class Producer(threading.Thread):
- headers = {
- 'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"}
- def __init__(self,catalog_queue,text_queue,*args,**kwargs):
- super(Producer, self).__init__(*args,**kwargs)
- self.catalog_queue = catalog_queue
- self.text_queue = text_queue
- def run(self):
- while True:
- if self.catalog_queue.empty(): #---------------------------------------------------------可能有多线程通过了这个判断,导致有的线程卡在了get(),但是没关系吧
- break
- catalog = self.catalog_queue.get()#---------------------------------------------------这里卡不卡无所谓
- url = catalog[2]
- chapter_text = self.find_text(self.get_url(url))
- self.text_queue.put((catalog[0], catalog[1], chapter_text))#--------------------------------------我怀疑是没有放进去,但是不知道为啥
- def get_url(self,url):
- response = requests.get(url, headers=self.headers)
- if response.encoding == 'ISO-8859-1':
- response.encoding = response.apparent_encoding
- html = response.text
- return (html)
- def find_text(self,html):
- html = etree.HTML(html)
- text_list = html.xpath('//div[@id="content"]/text()')
- if '精彩小说无弹窗免费阅读' in text_list[0]:
- text_list = text_list[1:]
- p = re.compile(r'(\S+)')
- k = []
- for each_line in text_list:
- line = each_line.replace(u'\u3000', '')
- line = line.replace(u'\xa0', '')
- a = p.findall(line)
- b = ''.join(a)
- k.append(b)
- while '' in k:
- k.remove('')
- chapter_text = '\n\n'.join(k) + '\n\n\n\n'
- return(chapter_text)
- def get_url(url):
- headers ={'user-agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"}
- response = requests.get(url,headers = headers)
- if response.encoding == 'ISO-8859-1':
- response.encoding = response.apparent_encoding
- html = response.text
- return(html)
- def find_catalog(html,target_url,catalog_queue):
- html = etree.HTML(html)
- name = html.xpath('//h1/text()')[0]
- catalog_names = html.xpath('//dd/a/text()')
- catalog_short_urls = html.xpath('//dd/a/@href')
- #len_url = len(catalog_short_urls)
- catalog_num = 1
- count = 0
- for each in catalog_short_urls[:]:
- #catalog_urls.append(target_url+each)
- #print((catalog_num,catalog_names[count],target_url+each))
- a = (catalog_num,catalog_names[count],target_url+each)
- catalog_queue.put(a)
- catalog_num +=1
- count += 1
- return((name,catalog_num-1))
- def len_catalog(html):
- html = etree.HTML(html)
- catalog_short_urls = html.xpath('//dd/a/@href')
- len_url = len(catalog_short_urls)
- return(len_url)
- def main():
- t1 = time.time()
- start_time = time.asctime()
- emple_url = 'https://www.52bqg.com/book_'
- p=re.compile(r'book_(\d+)')
- target_url = input('请输入笔趣阁52bqg.com的小说地址:')
- urls = p.findall(target_url)[0]
- print('正在解析网页')
- if urls == '':
- print('输入网页格式错误,倒计时两秒')
- time.sleep(2)
- raise
- html = get_url(emple_url+urls)
- len_url = len_catalog(html)
- catalog_queue = Queue(len_url+10)
- text_queue = Queue(len_url+10)
- novel = find_catalog(html, target_url,catalog_queue)
- print('----------------------解析成功---------------------------')
- print('-----------小说名:{}-----------章节数:{}-----------'.format(novel[0], novel[1]))
- for i in range(100):
- t =Producer(catalog_queue,text_queue)#------------------------------------------------------------从目录catalog_queue去爬取每一章的文字放在text_queue里面
- t.start()
- temp = input('按回车键爬取')
- print('正在爬取小说内容')
- novel_text = []
- # print(text_queue.qsize())
- # raise
- while 1:
- print('请勿关闭程序! 当前时间:'+time.asctime()+' 正在爬取 '+str(len(novel_text)+1)+'/'+str(novel[1])+' 章 当前线程数量:'+str(len(threading.enumerate())-1))
- if len(novel_text) >= len_url:
- break
- content = text_queue.get()#-------------------------------------------肯定是卡在这里了,他拿不到这个最后几个数据。
- novel_text.append(content)#------------------------------------------把章节放在列表里面,好排序
- print('爬取完成,正在写入')
- time.sleep(1)
- def take_one(novel_text):
- return (novel_text[0])
- novel_text.sort(key=take_one, reverse=False)#-------------------------------------------------------------------------------排序
- begin_text = '------------------ {} ------------------\n'
- with open(novel[0]+'.txt','w',encoding = 'utf-8') as f:
- count_0 = 0
- for i in novel_text:
- print('正在写入 {} .....'.format(i[1]))
- f.write(begin_text.format(i[1])+i[2])
- count_0 += 1
- end_time = time.asctime()
- t2 = time.time()
- print('\n\n开始时间:'+start_time)
- print('结束时间:'+end_time)
- print('小说写入 '+str(count_0)+' 个章节')
- print('平均下载速率 %.2f 章/秒 '%(count_0/(t2-t1)))
- temp = input('\n按回车键结束')
- if __name__ == '__main__':
- main()
复制代码
小说爬虫
最近学习了多线程爬虫,这里面没有用锁,但是用了queue还是经常阻塞。
程序的流程是先爬取所有章节的链接,然后开启多线程把每个章节的内容放在queue里面,然后再单线程放在列表里面,然后排序写入。
但是爬小说的时候,经常时不时的阻塞,我百度了超级多的方法,但是一直解决不了问题。我也不知道为啥会阻塞。因为上面的多线程里面的get阻塞了也没事呀,主进程结束了它也就结束了呀。
到底是为啥呀,难受 |
|