[求助鱼油]一个很简单的多线程程序,十万火急!!!

MadJoker · 发表于 2019-7-24 22:01:55

一个很简单的多线程爬虫,自己改了一天了,还是各种问题,其中采集类应该是没问题的(也有点不敢确定,毕竟菜鸟),因为我单独测试过,解析类中的选择器部分是确认无误的。但是运行的的时候还是出各种问题,只有这点币了,币什么的都不重要,但是我是真的想把爬虫学好,麻烦各位路过的大神帮忙改改,最好贴一下改正后的代码,以及指正我代码中的不足，谢谢大神,代码如下:

from  bs4 import BeautifulSoup as bs
from queue import Queue
import threading
import requests
import json

class Crawl_thread(threading.Thread):
def __init__(self,thread_name,queue):
      threading.Thread.__init__(self)
      self.thread_name = thread_name
      self.queue = queue

def run(self):
      print('启动了线程{}'.format(self.thread_name))
      self.crawl_spider()
      print('退出了线程{}'.format(self.thread_name))

def crawl_spider(self):
      while True:
         if self.queue.empty():#判断队列是否为空
            break
         else:
            page = self.queue.get()#获取队列里面的元素
            print('当前正在工作的线程是,{},正在采集第{}个页面'.format(self.thread_name,str(page+1)))
            base_url = 'https://maoyan.com/board/4?offset={}'
            url = base_url.format(str(page*10))
            headers = {
                     'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
                     }
            response = requests.get(url,headers=headers)
            data = response.content.decode('utf-8')
            data_queue.put(data)

class Parser_thread(threading.Thread):
def __init__(self,thread_name,queue,file):
      threading.Thread.__init__(self)
      self.thread_name = thread_name
      self.queue = queue
      self.file = file
def run(self):
      print('启动了线程{}'.format(self.thread_name))
      while not flag:
         #get的参数为False的时候,队列为空,会抛出异常
         item = self.queue.get(False)
         if not item:
            pass
         self.parse_data(item)
         self.queue.task_done()#每当get一次之后,提示是否阻塞
      print('退出了线程{}'.format(self.thread_name))

def parse_data(self,item):
      '''
      解析网页内容的函数
      '''
      soup = bs(item,'lxml')
      dd_list = soup.select('dl dd')
      for dd in dd_list:
         film_img =dd.find('img',attrs={'class':'board-img'}).attrs['data-src']
         film_title = dd.find('p',attrs={'class':'name'}).find('a').get_text()
         film_score = dd.find('p',attrs={'class':'score'}).get_text()
         film_star = dd.find('p',attrs={'class':'star'}).get_text()
         film_releasetime = dd.find('p',attrs={'class':'releasetime'}).get_text()
         response = {
                  '电影图片':film_img,
                  '电影标题':film_title,
                  '电影得分':film_score,
                  '电影主演':film_star,
                  '上映时间':film_releasetime
                  }
         self.file.write(json.dumps(response).encode('utf-8')+'\n')

data_queue = Queue()#创建一个队列
flag = False #全局变量
def main():
output = open('maoyan.json','a')
pagequeue = Queue(50)
for page in range(10):
      pagequeue.put(page)
#初始化采集线程
crawl_threads=[]
crawl_name_list = ['crawl_1','crawl_2','crawl_3']
for thread_name in crawl_name_list:
      thread = Crawl_thread(thread_name,pagequeue)
      thread.start()#启动线程
      crawl_threads.append(thread)#list的添加方法
#初始化解析线程
parser_threads=[]
parser_name_list = ['parser_1','parser_2','parser_3']
for thread_name in parser_name_list:
      thread = Parser_thread(thread_name,data_queue,output)
      thread.start()#启动线程
      parser_threads.append(thread)#list的添加方法
#等待队列情况
while not pagequeue.empty():
      pass

#等待所有的线程结束
for t in crawl_threads:
      t.join()

while not data_queue.empty():
      pass

#通知线程退出
global flag
for t in parser_threads:
      t.join()
print('退出主线程')
output.close()

if __name__ == '__main__':
main()

wwhywhy · 发表于 2019-7-24 22:09:12

本帖最后由 wwhywhy 于 2019-7-24 22:16 编辑

class Parser_thread(threading.Thread):
def __init__(self,thread_name,queue,file):
      threading.Thread.__init__(self)
      self.thread_name = thread_name
      self.queue = queue
      self.file = file
中的threading.Thread.__init__(self) 应该在最后吧？否则self的值还没有设置，后面的run(self)怎么调用呢？

账号		自动登录	找回密码
密码			立即注册

[求助鱼油]一个很简单的多线程程序,十万火急!!!

浏览过的版块