Async crawler
Hi all, how can I convert this crawler to async?

import requests
import logging
from lxml import html
etree = html.etree
from os import makedirs
from os.path import exists
RESULT = 'doutu'
exists(RESULT) or makedirs(RESULT)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')
index_url = 'https://www.doutub.com/img_lists/new/{}'
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "Referer": "https://www.doutub.com/"
}
PAGE = 2
# Send a GET request and return the response on success
def scrape_url(url):
    logging.info('scraping {}'.format(url))
    try:
        response = requests.get(url=url, headers=header)
        if response.status_code == 200:
            return response
        logging.error('request failed with status code {}'.format(response.status_code))
    except requests.RequestException:
        logging.error('error while scraping {}'.format(url))
# Build the listing-page URL, request it, and return the HTML text
def scrape_page(page):
    url = index_url.format(page)
    response = scrape_url(url)
    return response.text if response else None
# Parse the image titles and URLs out of the listing page
def parse_url(html_text):
    href_dic = []
    tree = etree.HTML(html_text)
    href_list = tree.xpath('//*[@id="__layout"]/div/div/div/div/div/div')
    for href in href_list:
        # xpath() returns lists, so join/index instead of wrapping in str()
        title = ''.join(href.xpath('./a/@title')).replace('表情包详情', '').replace('点击进入', '')
        src_list = href.xpath('./a/img/@data-src')
        if not src_list:
            continue
        href_dic.append({'title': title, 'src': src_list[0]})
    return href_dic
# Request the image itself and return the raw bytes
def parse_tu(src):
    response = scrape_url(src)
    return response.content if response else None
def save_data(title, data):
    title_path = f'{RESULT}/{title}.jpg'
    with open(title_path, 'wb') as fp:
        fp.write(data)
def main():
    for page in range(1, PAGE + 1):
        html_text = scrape_page(page)
        if not html_text:
            continue
        for t in parse_url(html_text):
            title = t.get('title')
            src = t.get('src')
            data = parse_tu(src)
            if data:
                save_data(title, data)

if __name__ == '__main__':
    main()
To make this crawler asynchronous, you can use the aiohttp library for async requests. Here is the modified code:
import asyncio
import aiohttp
import logging
from lxml import html
from os import makedirs
from os.path import exists
RESULT = 'doutu'
exists(RESULT) or makedirs(RESULT)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')
index_url = 'https://www.doutub.com/img_lists/new/{}'
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "Referer": "https://www.doutub.com/"
}
PAGE = 2
async def scrape_url(session, url):
    logging.info('scraping {}'.format(url))
    try:
        async with session.get(url=url, headers=header) as response:
            if response.status == 200:
                return await response.text()
            logging.error('request failed with status code {}'.format(response.status))
    except aiohttp.ClientError:
        logging.error('error while scraping {}'.format(url))
async def scrape_page(session, page):
    url = index_url.format(page)
    return await scrape_url(session, url)
def parse_url(html_text):
    href_dic = []
    tree = html.fromstring(html_text)
    href_list = tree.xpath('//*[@id="__layout"]/div/div/div/div/div/div')
    for href in href_list:
        # xpath() returns lists, so join/index instead of wrapping in str()
        title = ''.join(href.xpath('./a/@title')).replace('表情包详情', '').replace('点击进入', '')
        src_list = href.xpath('./a/img/@data-src')
        if not src_list:
            continue
        href_dic.append({'title': title, 'src': src_list[0]})
    return href_dic
async def parse_tu(session, src):
    # send the same headers as the sync version; the site checks the Referer
    async with session.get(src, headers=header) as response:
        if response.status == 200:
            return await response.read()

def save_data(title, data):
    title_path = f'{RESULT}/{title}.jpg'
    with open(title_path, 'wb') as fp:
        fp.write(data)
# Download one image and save it; create_task() needs a coroutine,
# so the download and the (synchronous) save are wrapped together
async def download_and_save(session, title, src):
    data = await parse_tu(session, src)
    if data:
        save_data(title, data)

async def main():
    async with aiohttp.ClientSession() as session:
        tasks = []
        for page in range(1, PAGE + 1):
            html_text = await scrape_page(session, page)
            if not html_text:
                continue
            for t in parse_url(html_text):
                tasks.append(asyncio.create_task(
                    download_and_save(session, t.get('title'), t.get('src'))))
        await asyncio.gather(*tasks)

if __name__ == '__main__':
    asyncio.run(main())
Here asyncio drives the asynchronous tasks and aiohttp sends the non-blocking HTTP requests. scrape_url() and scrape_page() become coroutines, and a single ClientSession manages the HTTP connections. parse_tu() is likewise a coroutine that awaits the response body. In main(), each image's download-and-save is wrapped into a task with asyncio.create_task(), and asyncio.gather() runs all of those tasks concurrently.
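One thing gather() does not do is limit concurrency: every download is fired at once, which can overwhelm the site or get you rate-limited. As a minimal sketch (download_and_save_limited and MAX_CONCURRENCY are illustrative names, not part of the code above), an asyncio.Semaphore can cap the number of simultaneous downloads:

import asyncio
import aiohttp

MAX_CONCURRENCY = 10  # illustrative cap, tune to taste

# Drop-in replacement for download_and_save() above: at most
# MAX_CONCURRENCY downloads run at the same time
async def download_and_save_limited(session, semaphore, title, src):
    async with semaphore:  # wait here until a slot is free
        data = await parse_tu(session, src)
        if data:
            save_data(title, data)

async def main():
    async with aiohttp.ClientSession() as session:
        # create the semaphore inside the running event loop
        semaphore = asyncio.Semaphore(MAX_CONCURRENCY)
        tasks = []
        for page in range(1, PAGE + 1):
            html_text = await scrape_page(session, page)
            if not html_text:
                continue
            for t in parse_url(html_text):
                tasks.append(asyncio.create_task(
                    download_and_save_limited(session, semaphore,
                                              t.get('title'), t.get('src'))))
        await asyncio.gather(*tasks)

All the tasks are still created up front; the semaphore just makes the excess ones wait their turn inside download_and_save_limited().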
Note that async does not make any single request faster; the crawler is IO-bound either way. What it buys you is overlap: many downloads can be in flight at once, which improves overall throughput.
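One remaining caveat: save_data() still writes files with blocking IO, which briefly stalls the event loop. The writes here are small, so that is usually fine, but for fully non-blocking IO the third-party aiofiles package can replace it. A minimal sketch, assuming aiofiles is installed (pip install aiofiles):

import aiofiles

# Async variant of save_data(): the write no longer blocks the event loop
async def save_data_async(title, data):
    title_path = f'{RESULT}/{title}.jpg'
    async with aiofiles.open(title_path, 'wb') as fp:
        await fp.write(data)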
陶远航 posted on 2023-7-11 16:20:
To make this crawler asynchronous, you can use the aiohttp library for async requests.

Great, thank you!