Hi all, how do I convert this crawler to an asynchronous one?
import requests
import logging
from lxml import html
from os import makedirs
from os.path import exists

etree = html.etree

# Output directory for downloaded images
RESULT = 'doutu'
exists(RESULT) or makedirs(RESULT)

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')

index_url = 'https://www.doutub.com/img_lists/new/{}'
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "Referer": "https://www.doutub.com/"
}
PAGE = 2

# Send a request and return the response on HTTP 200
def scrape_url(url):
    logging.info('scraping {}'.format(url))
    try:
        response = requests.get(url=url, headers=header)
        if response.status_code == 200:
            return response
        logging.error('request failed with status code {}'.format(response.status_code))
    except requests.RequestException:
        logging.error('error while scraping {}'.format(url))

# Build the listing-page URL and fetch its HTML
def scrape_page(page):
    url = index_url.format(page)
    return scrape_url(url).text

# Extract the image titles and URLs from the listing page
def parse_url(html):
    href_dic = []
    tree = etree.HTML(html)
    href_list = tree.xpath('//*[@id="__layout"]/div/div/div[2]/div[1]/div[1]/div')
    for href in href_list:
        title = str(href.xpath('./a/@title')[0]).replace('表情包详情', '').replace('点击进入', '')
        src = href.xpath('./a/img/@data-src')[0]
        href_dic.append({'title': title, 'src': src})
    return href_dic

# Fetch the raw image bytes
def parse_tu(src):
    return scrape_url(src).content

# Write the image bytes to disk
def save_data(title, data):
    title_path = f'{RESULT}/{title}.jpg'
    with open(title_path, 'wb') as fp:
        fp.write(data)

def main():
    for page in range(1, PAGE + 1):
        html = scrape_page(page)
        href_dic = parse_url(html)
        for t in href_dic:
            title = t.get('title')
            src = t.get('src')
            data = parse_tu(src)
            save_data(title, data)

if __name__ == '__main__':
    main()
To make this crawler asynchronous, you can use the aiohttp library for the HTTP requests. Here is a modified version of the code:
import asyncio
import aiohttp
import logging
from lxml import html
from os import makedirs
from os.path import exists

RESULT = 'doutu'
exists(RESULT) or makedirs(RESULT)

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')

index_url = 'https://www.doutub.com/img_lists/new/{}'
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "Referer": "https://www.doutub.com/"
}
PAGE = 2

# Fetch a page and return its HTML text on HTTP 200
async def scrape_url(session, url):
    logging.info('scraping {}'.format(url))
    try:
        async with session.get(url=url, headers=header) as response:
            if response.status == 200:
                return await response.text()
            logging.error('request failed with status code {}'.format(response.status))
    except aiohttp.ClientError:
        logging.error('error while scraping {}'.format(url))

# Build the listing-page URL; returns a coroutine for the caller to await
def scrape_page(session, page):
    url = index_url.format(page)
    return scrape_url(session, url)

# Extract the image titles and URLs (pure CPU work, stays synchronous)
def parse_url(html_text):
    href_dic = []
    tree = html.fromstring(html_text)
    href_list = tree.xpath('//*[@id="__layout"]/div/div/div[2]/div[1]/div[1]/div')
    for href in href_list:
        title = str(href.xpath('./a/@title')[0]).replace('表情包详情', '').replace('点击进入', '')
        src = href.xpath('./a/img/@data-src')[0]
        href_dic.append({'title': title, 'src': src})
    return href_dic

# Fetch the raw image bytes
async def parse_tu(session, src):
    async with session.get(src, headers=header) as response:
        if response.status == 200:
            return await response.read()

# Write the image bytes to disk (plain blocking function)
def save_data(title, data):
    title_path = f'{RESULT}/{title}.jpg'
    with open(title_path, 'wb') as fp:
        fp.write(data)

# One task per image: download, then save if the download succeeded
async def download_and_save(session, title, src):
    data = await parse_tu(session, src)
    if data:
        save_data(title, data)

async def main():
    async with aiohttp.ClientSession() as session:
        tasks = []
        for page in range(1, PAGE + 1):
            html_text = await scrape_page(session, page)
            href_dic = parse_url(html_text)
            for t in href_dic:
                title = t.get('title')
                src = t.get('src')
                tasks.append(asyncio.create_task(download_and_save(session, title, src)))
        await asyncio.gather(*tasks)

if __name__ == '__main__':
    asyncio.run(main())
Here asyncio drives the event loop and aiohttp sends the asynchronous HTTP requests. scrape_url() becomes a coroutine that goes through a shared ClientSession, and parse_tu() likewise awaits the image response. Because save_data() is an ordinary blocking function, it cannot be wrapped in asyncio.create_task() directly; instead each download-and-write pair is bundled into the download_and_save() coroutine, and main() collects one task per image and runs them all concurrently with asyncio.gather().
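One detail worth knowing about asyncio.gather(): by default it re-raises the first exception from any task, while the remaining tasks keep running in the background. If a few failed downloads should not abort the whole run, one option is to pass return_exceptions=True and inspect the results afterwards, roughly like this (a sketch against the main() above, not part of the original answer):

results = await asyncio.gather(*tasks, return_exceptions=True)
for result in results:
    if isinstance(result, Exception):
        # a download task failed; log it and move on
        logging.error('download task failed: {}'.format(result))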
Note that the workload is I/O-bound: each individual request is no faster than before, but async lets the downloads overlap instead of running one after another, so the overall run time should drop noticeably.
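If you also want to cap how hard the crawler hits the site, a common refinement (not part of the answer above) is an asyncio.Semaphore. The sketch below reuses parse_tu() and save_data() from the code above; download_and_save_limited and the cap of 5 are illustrative names and values I made up, and asyncio.to_thread() (Python 3.9+) additionally moves the blocking file write off the event loop:

CONCURRENCY = 5  # arbitrary illustrative cap; tune for the target site
semaphore = asyncio.Semaphore(CONCURRENCY)

async def download_and_save_limited(session, title, src):
    # at most CONCURRENCY downloads are in flight at any moment
    async with semaphore:
        data = await parse_tu(session, src)
    if data:
        # run the blocking open()/write() in a worker thread (Python 3.9+)
        await asyncio.to_thread(save_data, title, data)

Swapping this in for download_and_save() inside main() needs no other changes.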