I've recently been learning multiprocess and multithreaded crawling to improve efficiency. I've mostly figured out multiprocessing, and multithreading is about the same, but when using multiple threads to crawl the Douban movie Top 250 with three parse threads running, each thread parses its first page far faster than the later pages. I can't figure out why, and I'm hoping someone more experienced can point me in the right direction. The source code is below with basic comments. The data is stored in an Excel spreadsheet (it seems I can't attach an image here, so briefly: row 1 of columns A-O holds the headers rank, title, year, country, director, screenwriter, stars, genre, runtime, alternate title, rating, number of raters, tagline, detail page, and poster URL).
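One note on running it: the script opens '豆瓣top250_2.xlsx' with load_workbook, so the workbook has to exist beforehand. If you don't have it yet, a minimal snippet like this (using openpyxl, with header names taken from the description above) will create it first:

from openpyxl import Workbook

# Create the workbook the script expects, with the header row in cells A1-O1.
headers = ['Rank', 'Title', 'Year', 'Country', 'Director', 'Screenwriter', 'Stars',
           'Genre', 'Runtime', 'Alternate title', 'Rating', 'Number of raters',
           'Tagline', 'Detail page', 'Poster URL']
wb = Workbook()
ws = wb.active
ws.title = 'Sheet1'  # the script reads wb['Sheet1']
for col, header in enumerate(headers, start=1):
    ws.cell(row=1, column=col, value=header)
wb.save('豆瓣top250_2.xlsx')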
The code follows; you can copy it and run it directly. If you can spot where the problem is, add me on WeChat and I'll send you a 10-yuan red packet.
# Multithreaded crawler for the Douban Top 250
import re
import requests
import time
from bs4 import BeautifulSoup
from openpyxl import load_workbook as lw
import threading
from queue import Queue
# Crawl (fetch) thread class
class CrawlThread(threading.Thread):
    def __init__(self, thread_name, page_queue, html_queue):
        super(CrawlThread, self).__init__()
        self.thread_name = thread_name
        self.page_queue = page_queue
        self.html_queue = html_queue

    def run(self):
        print('———%s starting———' % self.thread_name)
        while not CRAWL_EXIT:
            page = self.page_queue.get()
            print('—%s fetching page %s—' % (self.thread_name, page))
            url = 'https://movie.douban.com/top250?start=' + str((page - 1) * 25) + '&filter='
            html = requests.get(url).text
            self.html_queue.put((page, html))  # put the page number and html into the html queue as a tuple
            print('—%s finished fetching page %s—' % (self.thread_name, page))
        print('———%s done———' % self.thread_name)
# Parse thread class
class ParseThread(threading.Thread):
    def __init__(self, thread_name, html_queue, sheet):
        super(ParseThread, self).__init__()
        self.thread_name = thread_name
        self.html_queue = html_queue
        self.sheet = sheet

    def run(self):
        print('******%s starting******' % self.thread_name)
        while not PARSE_EXIT:
            item = self.html_queue.get()  # take the page number and html off the html queue
            page, html = item[0], item[1]
            print('【%s parsing page %s】' % (self.thread_name, page))
            time1 = time.time()
            result = self.parse(html)
            for each in result:  # fill in the spreadsheet
                # Each record is written to the row matching its rank, so concurrent writes
                # from different threads never touch the same cell; no lock is used here.
                row = int(each[0]) + 1
                self.sheet['A%s' % row].value = str(each[0])
                # sheet['A%s' % row].value = each[0]  # why does this line raise an error? Baffling!!!
                # (note: 'sheet' without self. is not defined in this scope)
                self.sheet['B%s' % row].value = each[1]
                self.sheet['C%s' % row].value = each[2]
                self.sheet['D%s' % row].value = each[3]
                self.sheet['E%s' % row].value = each[4]
                self.sheet['F%s' % row].value = each[5]
                self.sheet['G%s' % row].value = each[6]
                self.sheet['H%s' % row].value = each[7]
                self.sheet['I%s' % row].value = each[8]
                self.sheet['J%s' % row].value = each[9]
                self.sheet['K%s' % row].value = each[10]
                self.sheet['L%s' % row].value = each[11]
                self.sheet['M%s' % row].value = each[12]
                self.sheet['N%s' % row].value = each[13]
                self.sheet['O%s' % row].value = each[14]
            time2 = time.time()
            print('【%s finished page %s in %s seconds】' % (self.thread_name, page, int(time2 - time1)))
        print('******%s done******' % self.thread_name)
    def parse(self, html):
        soup = BeautifulSoup(html, 'lxml')
        # Empty lists for each field
        ranks = []
        names = []
        years = []
        countrys = []
        directors = []
        scriptwriters = []
        stars = []
        types = []
        runtimes = []
        themes = []
        detail_links = []
        pic_srcs = []
        other_names = []
        scores = []
        num_of_rating_person = []
        other_name_pattern = re.compile(r'<span class="pl">又名:</span>(.+?)<br/>', re.S)  # alternate title, extracted with a regex
        # Year, country and genre are also extracted with a regex. The text here comes from the
        # document re-serialized by BeautifulSoup (prettify), which is formatted differently from
        # the raw page returned by requests, so this regex differs from the one at the same spot
        # in the 豆瓣top250_bs4_1 script.
        year_country_type_pattern = re.compile(r'<br/>\s*?(.*?)\s*?</p>')
        for li in soup.ol.find_all('li'):
            ranks.append(li.find('em').string)  # rank
            names.append(li.find('img')['alt'])  # title, taken from the poster image's alt attribute
            try:  # tagline
                themes.append(li.find('span', attrs={'class': 'inq'}).text)
            except AttributeError:
                themes.append('无')
            scores.append(li.find('span', attrs={'class': 'rating_num'}).text)  # rating
            div = li.find('div', attrs={'class': 'star'})  # number of raters
            num_of_rating_person.append(div.find_all('span', attrs={'class': ''})[1].text)
            year_country_type = re.search(year_country_type_pattern, li.prettify()).group(1).split('/')  # year, country, genre
            # Entry No. 67 lists several release years separated by '/', so only the first part (index 0)
            # is taken as the year, while country and genre are indexed from the end (-2, -1).
            years.append(year_country_type[0].strip())
            countrys.append(year_country_type[-2].strip())
            types.append(year_country_type[-1].strip())
            detail_link = li.find('a')['href']  # detail page link
            detail_links.append(detail_link)
            detail_html = requests.get(detail_link).text  # fetch the detail page for the remaining fields
            soup2 = BeautifulSoup(detail_html, 'lxml')
            pic_srcs.append(soup2.find('img', attrs={'title': '点击看更多海报'})['src'])  # poster URL
            # Director (0), screenwriter (1), stars (2); documentaries may lack the latter two, hence the check
            director_scriptwriter = soup2.find_all('span', attrs={'class': 'attrs'})
            if len(director_scriptwriter) == 3:
                directors.append(director_scriptwriter[0].text)
                scriptwriters.append(director_scriptwriter[1].text)
                stars.append(director_scriptwriter[2].text)
            else:
                directors.append(director_scriptwriter[0].text)
                scriptwriters.append('无')
                stars.append('无')
            runtimes.append(soup2.find('span', attrs={'property': 'v:runtime'}).text)  # runtime
            try:  # Alternate title: not wrapped in its own tag, so a regex is used; some films have none, hence the exception handling.
                other_name = re.search(other_name_pattern, detail_html).group(1)
                other_names.append(other_name)
            except AttributeError:
                other_names.append('无')
        # Return the combined result via zip
        return zip(ranks, names, years, countrys, directors, scriptwriters,
                   stars, types, runtimes, other_names, scores, num_of_rating_person, themes, detail_links, pic_srcs)
        # for each in zip(ranks, names, years, countrys, directors, scriptwriters,
        #                 stars, types, runtimes, other_names, scores, num_of_rating_person, themes, detail_links, pic_srcs):
        #     print(each)
# Exit flag for the crawl threads
CRAWL_EXIT = False
# Exit flag for the parse threads
PARSE_EXIT = False
def main():
    page_queue = Queue(10)  # page-number queue consumed by the crawl threads
    html_queue = Queue()    # fetched pages go here for the parse threads to process
    for page in range(1, 7):  # e.g. crawl pages 1-6
        page_queue.put(page)
    crawl_thread_names = ['Crawl thread %s' % i for i in range(1, 4)]  # names for 3 crawl threads
    parse_thread_names = ['Parse thread %s' % i for i in range(1, 4)]  # names for 3 parse threads
    crawl_list = []  # holds the crawl thread instances
    parse_list = []  # holds the parse thread instances
    wb = lw('豆瓣top250_2.xlsx')  # the crawled data is written into this existing Excel workbook
    sheet = wb['Sheet1']  # the sheet object is passed to the parse threads so they can write data
    # Start the crawl threads
    for thread_name in crawl_thread_names:
        crawl_thread = CrawlThread(thread_name, page_queue, html_queue)
        crawl_thread.start()
        crawl_list.append(crawl_thread)
    # Start the parse threads
    for thread_name in parse_thread_names:
        parse_thread = ParseThread(thread_name, html_queue, sheet)
        parse_thread.start()
        parse_list.append(parse_thread)
    # Wait until page_queue is empty, then wait for all crawl threads to finish (join)
    while not page_queue.empty():
        pass
    print('\npage_queue is empty\n')
    global CRAWL_EXIT
    CRAWL_EXIT = True
    for crawl_thread in crawl_list:
        crawl_thread.join()
    print('——————all crawl threads finished——————')
    # Wait until html_queue is empty, then wait for all parse threads to finish (join)
    while not html_queue.empty():
        pass
    print('\nhtml_queue is empty\n')
    global PARSE_EXIT
    PARSE_EXIT = True
    for parse_thread in parse_list:
        parse_thread.join()
    print('——————all parse threads finished——————')
    # Save the workbook
    wb.save('豆瓣top250_2.xlsx')

if __name__ == '__main__':
    time_start = time.time()
    main()
    time_end = time.time()
    print('Total time: %s seconds' % (time_end - time_start))
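For reference, the queue module's own task_done()/join() bookkeeping plus sentinel values is a common alternative to the CRAWL_EXIT/PARSE_EXIT flags and the `while not queue.empty(): pass` busy-wait loops used above. A rough sketch of that general pattern (placeholder work only, not the scraper itself):

import threading
from queue import Queue

def worker(q):
    while True:
        item = q.get()
        if item is None:        # sentinel: no more work, leave the loop
            q.task_done()
            break
        # ... handle item here ...
        q.task_done()

q = Queue()
threads = [threading.Thread(target=worker, args=(q,)) for _ in range(3)]
for t in threads:
    t.start()
for page in range(1, 7):
    q.put(page)
q.join()                        # blocks until every queued item has been marked task_done()
for _ in threads:
    q.put(None)                 # one sentinel per worker so each one can exit
for t in threads:
    t.join()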