I've recently been learning multiprocess and multithreaded crawling to improve efficiency. I've mostly figured out multiprocessing, and multithreading is about the same, but when using multiple threads to crawl the Douban movie Top 250 with three parse threads running, each thread parses its first page far faster than the later pages. I can't figure out why, and I'm hoping someone more experienced can point me in the right direction. The source code is below with basic comments. The data is stored in an Excel spreadsheet (it seems I can't attach an image here, so briefly: row 1 of columns A-O holds the headers rank, title, year, country, director, screenwriter, stars, genre, runtime, alternate title, rating, number of raters, tagline, detail page, and poster URL).
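One note on running it: the script opens '豆瓣top250_2.xlsx' with load_workbook, so the workbook has to exist beforehand. If you don't have it yet, a minimal snippet like this (using openpyxl, with header names taken from the description above) will create it first:

from openpyxl import Workbook

# Create the workbook the script expects, with the header row in cells A1-O1.
headers = ['Rank', 'Title', 'Year', 'Country', 'Director', 'Screenwriter', 'Stars',
           'Genre', 'Runtime', 'Alternate title', 'Rating', 'Number of raters',
           'Tagline', 'Detail page', 'Poster URL']
wb = Workbook()
ws = wb.active
ws.title = 'Sheet1'  # the script reads wb['Sheet1']
for col, header in enumerate(headers, start=1):
    ws.cell(row=1, column=col, value=header)
wb.save('豆瓣top250_2.xlsx')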
The code follows; you can copy it and run it directly. If you can spot where the problem is, add me on WeChat and I'll send you a 10-yuan red packet.
# Multithreaded crawler for the Douban Top 250
import re
import requests
import time
from bs4 import BeautifulSoup
from openpyxl import load_workbook as lw
import threading
from queue import Queue
# Crawl (fetch) thread class
class CrawlThread(threading.Thread):
    def __init__(self, thread_name, page_queue, html_queue):
        super(CrawlThread, self).__init__()
        self.thread_name = thread_name
        self.page_queue = page_queue
        self.html_queue = html_queue

    def run(self):
        print('———%s starting———' % self.thread_name)
        while not CRAWL_EXIT:
            page = self.page_queue.get()
            print('—%s fetching page %s—' % (self.thread_name, page))
            url = 'https://movie.douban.com/top250?start=' + str((page - 1) * 25) + '&filter='
            html = requests.get(url).text
            self.html_queue.put((page, html))  # put the page number and html into the html queue as a tuple
            print('—%s finished fetching page %s—' % (self.thread_name, page))
        print('———%s done———' % self.thread_name)
# Parse thread class
class ParseThread(threading.Thread):
    def __init__(self, thread_name, html_queue, sheet):
        super(ParseThread, self).__init__()
        self.thread_name = thread_name
        self.html_queue = html_queue
        self.sheet = sheet

    def run(self):
        print('******%s starting******' % self.thread_name)
        while not PARSE_EXIT:
            item = self.html_queue.get()  # take the page number and html off the html queue
            page, html = item[0], item[1]
            print('【%s parsing page %s】' % (self.thread_name, page))
            time1 = time.time()
            result = self.parse(html)
            for each in result:  # fill in the spreadsheet
                # Each record is written to the row matching its rank, so concurrent writes
                # from different threads never touch the same cell; no lock is used here.
                row = int(each[0]) + 1
                self.sheet['A%s' % row].value = str(each[0])
                # sheet['A%s' % row].value = each[0]  # why does this line raise an error? Baffling!!!
                # (note: 'sheet' without self. is not defined in this scope)
                self.sheet['B%s' % row].value = each[1]
                self.sheet['C%s' % row].value = each[2]
                self.sheet['D%s' % row].value = each[3]
                self.sheet['E%s' % row].value = each[4]
                self.sheet['F%s' % row].value = each[5]
                self.sheet['G%s' % row].value = each[6]
                self.sheet['H%s' % row].value = each[7]
                self.sheet['I%s' % row].value = each[8]
                self.sheet['J%s' % row].value = each[9]
                self.sheet['K%s' % row].value = each[10]
                self.sheet['L%s' % row].value = each[11]
                self.sheet['M%s' % row].value = each[12]
                self.sheet['N%s' % row].value = each[13]
                self.sheet['O%s' % row].value = each[14]
            time2 = time.time()
            print('【%s finished page %s in %s seconds】' % (self.thread_name, page, int(time2 - time1)))
        print('******%s done******' % self.thread_name)
    def parse(self, html):
        soup = BeautifulSoup(html, 'lxml')
        # Empty lists for each field
        ranks = []
        names = []
        years = []
        countrys = []
        directors = []
        scriptwriters = []
        stars = []
        types = []
        runtimes = []
        themes = []
        detail_links = []
        pic_srcs = []
        other_names = []
        scores = []
        num_of_rating_person = []
        other_name_pattern = re.compile(r'<span class="pl">又名:</span>(.+?)<br/>', re.S)  # alternate title, extracted with a regex
        # Year, country and genre are also extracted with a regex. The text here comes from the
        # document re-serialized by BeautifulSoup (prettify), which is formatted differently from
        # the raw page returned by requests, so this regex differs from the one at the same spot
        # in the 豆瓣top250_bs4_1 script.
        year_country_type_pattern = re.compile(r'<br/>\s*?(.*?)\s*?</p>')
        for li in soup.ol.find_all('li'):
            ranks.append(li.find('em').string)  # rank
            names.append(li.find('img')['alt'])  # title, taken from the poster image's alt attribute
            try:  # tagline
                themes.append(li.find('span', attrs={'class': 'inq'}).text)
            except AttributeError:
                themes.append('无')
            scores.append(li.find('span', attrs={'class': 'rating_num'}).text)  # rating
            div = li.find('div', attrs={'class': 'star'})  # number of raters
            num_of_rating_person.append(div.find_all('span', attrs={'class': ''})[1].text)
            year_country_type = re.search(year_country_type_pattern, li.prettify()).group(1).split('/')  # year, country, genre
            # Entry No. 67 lists several release years separated by '/', so only the first part (index 0)
            # is taken as the year, while country and genre are indexed from the end (-2, -1).
            years.append(year_country_type[0].strip())
            countrys.append(year_country_type[-2].strip())
            types.append(year_country_type[-1].strip())
            detail_link = li.find('a')['href']  # detail page link
            detail_links.append(detail_link)
            detail_html = requests.get(detail_link).text  # fetch the detail page for the remaining fields
            soup2 = BeautifulSoup(detail_html, 'lxml')
            pic_srcs.append(soup2.find('img', attrs={'title': '点击看更多海报'})['src'])  # poster URL
            # Director (0), screenwriter (1), stars (2); documentaries may lack the latter two, hence the check
            director_scriptwriter = soup2.find_all('span', attrs={'class': 'attrs'})
            if len(director_scriptwriter) == 3:
                directors.append(director_scriptwriter[0].text)
                scriptwriters.append(director_scriptwriter[1].text)
                stars.append(director_scriptwriter[2].text)
            else:
                directors.append(director_scriptwriter[0].text)
                scriptwriters.append('无')
                stars.append('无')
            runtimes.append(soup2.find('span', attrs={'property': 'v:runtime'}).text)  # runtime
            try:  # Alternate title: not wrapped in its own tag, so a regex is used; some films have none, hence the exception handling.
                other_name = re.search(other_name_pattern, detail_html).group(1)
                other_names.append(other_name)
            except AttributeError:
                other_names.append('无')
        # Return the combined result via zip
        return zip(ranks, names, years, countrys, directors, scriptwriters,
                   stars, types, runtimes, other_names, scores, num_of_rating_person, themes, detail_links, pic_srcs)
        # for each in zip(ranks, names, years, countrys, directors, scriptwriters,
        #                 stars, types, runtimes, other_names, scores, num_of_rating_person, themes, detail_links, pic_srcs):
        #     print(each)
# Exit flag for the crawl threads
CRAWL_EXIT = False
# Exit flag for the parse threads
PARSE_EXIT = False
def main():
    page_queue = Queue(10)  # page-number queue consumed by the crawl threads
    html_queue = Queue()    # fetched pages go here for the parse threads to process
    for page in range(1, 7):  # e.g. crawl pages 1-6
        page_queue.put(page)
    crawl_thread_names = ['Crawl thread %s' % i for i in range(1, 4)]  # names for 3 crawl threads
    parse_thread_names = ['Parse thread %s' % i for i in range(1, 4)]  # names for 3 parse threads
    crawl_list = []  # holds the crawl thread instances
    parse_list = []  # holds the parse thread instances
    wb = lw('豆瓣top250_2.xlsx')  # the crawled data is written into this existing Excel workbook
    sheet = wb['Sheet1']  # the sheet object is passed to the parse threads so they can write data
    # Start the crawl threads
    for thread_name in crawl_thread_names:
        crawl_thread = CrawlThread(thread_name, page_queue, html_queue)
        crawl_thread.start()
        crawl_list.append(crawl_thread)
    # Start the parse threads
    for thread_name in parse_thread_names:
        parse_thread = ParseThread(thread_name, html_queue, sheet)
        parse_thread.start()
        parse_list.append(parse_thread)
    # Wait until page_queue is empty, then wait for all crawl threads to finish (join)
    while not page_queue.empty():
        pass
    print('\npage_queue is empty\n')
    global CRAWL_EXIT
    CRAWL_EXIT = True
    for crawl_thread in crawl_list:
        crawl_thread.join()
    print('——————all crawl threads finished——————')
    # Wait until html_queue is empty, then wait for all parse threads to finish (join)
    while not html_queue.empty():
        pass
    print('\nhtml_queue is empty\n')
    global PARSE_EXIT
    PARSE_EXIT = True
    for parse_thread in parse_list:
        parse_thread.join()
    print('——————all parse threads finished——————')
    # Save the workbook
    wb.save('豆瓣top250_2.xlsx')

if __name__ == '__main__':
    time_start = time.time()
    main()
    time_end = time.time()
    print('Total time: %s seconds' % (time_end - time_start))
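For reference, the queue module's own task_done()/join() bookkeeping plus sentinel values is a common alternative to the CRAWL_EXIT/PARSE_EXIT flags and the `while not queue.empty(): pass` busy-wait loops used above. A rough sketch of that general pattern (placeholder work only, not the scraper itself):

import threading
from queue import Queue

def worker(q):
    while True:
        item = q.get()
        if item is None:        # sentinel: no more work, leave the loop
            q.task_done()
            break
        # ... handle item here ...
        q.task_done()

q = Queue()
threads = [threading.Thread(target=worker, args=(q,)) for _ in range(3)]
for t in threads:
    t.start()
for page in range(1, 7):
    q.put(page)
q.join()                        # blocks until every queued item has been marked task_done()
for _ in threads:
    q.put(None)                 # one sentinel per worker so each one can exit
for t in threads:
    t.join()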