|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
1、抓取https://www.mzitu.com/mm/page/2-33页面下图片链接,返回URL
2、统计串行、单进程、多进程同条件抓取耗时,返回数组
3、结果写入EXCEL
4、多次测试求平均值、最大值、80%分位值,横向对比,了解多进程差异
- import re
- import time
- from multiprocessing import Pool #多线程
- from bs4 import BeautifulSoup
- import requests #获取HTTP相应
- import xlwt #Excel写入
def url_open(url, timeout=10):
    """Fetch one listing page and return the links found in its ``pins`` block.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the server; passed to ``requests.get``
            so a stalled connection raises instead of blocking forever.

    Returns:
        list: Unique ``href`` values that end in a digit, in the order they
        first appear inside ``<div id="pins">``. Empty when the page has no
        such div.
    """
    # NOTE(review): the space in 'Mozilla /5.0' looks like a paste artifact;
    # it is kept as-is here -- confirm against the intended User-Agent.
    headers = {
        'Referer': 'https://www.mzitu.com/',
        'User-Agent': 'Mozilla /5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    pins = soup.find('div', attrs={'id': 'pins'})
    if pins is None:
        return []
    links = re.findall(r'<a href="([^"]+\d)"', str(pins))
    # dict keys keep insertion order, so this drops duplicates while
    # preserving first-seen order in a single linear pass.
    return list(dict.fromkeys(links))
def getTime(i):
    """Time one benchmark round: a serial crawl, then pools of 1-4 processes.

    Every variant fetches the same list of listing pages with ``url_open``.

    Args:
        i: Label of this round; returned unchanged as the first element.

    Returns:
        list: ``[i, serial, pool_1, pool_2, pool_3, pool_4]`` where every
        entry after ``i`` is an elapsed time in seconds.
    """
    urls = ['https://www.mzitu.com/mm/page/{}/'.format(page) for page in range(2, 32)]

    # Baseline: fetch every page one after another in this process.
    start = time.perf_counter()
    for url in urls:
        url_open(url)
    elapsed = time.perf_counter() - start
    print('串行爬虫耗时:', elapsed)
    timeData = [i, elapsed]

    for workers in range(1, 5):
        start = time.perf_counter()
        # The context manager shuts the worker processes down after each
        # measurement instead of leaving them alive for later rounds.
        with Pool(processes=workers) as pool:
            pool.map(url_open, urls)
            # Stop the clock before teardown: the measurement covers pool
            # start-up plus the crawl, not process shutdown.
            elapsed = time.perf_counter() - start
        print('%d进程爬虫耗时:' % workers, elapsed)
        timeData.append(elapsed)
    return timeData
if __name__ == "__main__":
    # Create the workbook and write it out once before any benchmark runs.
    workbook = xlwt.Workbook(encoding='ascii')
    worksheet = workbook.add_sheet('My Worksheet')
    workbook.save('Excel_Workbook.xls')
    for x in range(1, 2):
        timeData = getTime(x)
        # One row per round: column 0 is the round label, the rest are timings.
        for col, value in enumerate(timeData):
            worksheet.write(x, col, label=value)
        # Save before reporting success so the message matches what is on disk.
        workbook.save('Excel_Workbook.xls')
        print("已完成第%d次测试,结果写入EXCEL成功" % x)
-
复制代码
EXCEL读写
https://www.cnblogs.com/xuxaut-558/p/10166642.html
多进程爬虫
https://www.cnblogs.com/MrLJC/p/3715783.html
实用才是硬道理,先模仿,再进步 |
|