本帖最后由 YunGuo 于 2020-12-22 01:42 编辑
改一下试试。

import aiohttp
from lxml import etree
import asyncio
import os
async def get_page(url, session):
    """Fetch *url* through the shared *session* and return the body decoded as GBK.

    Uses the module-level ``header`` dict for the request headers.
    """
    async with session.get(url, headers=header) as resp:
        body = await resp.text(encoding='gbk')
    return body
async def get_img_list(page, type_):
    """Collect the detail-page URLs from one listing page of a category.

    page:  page number as a string, as typed by the user.
    type_: category slug in pinyin (e.g. 'fengjing'), appended to '/4k'.
    Returns a list of absolute detail-page URLs.
    """
    # Bug fix: the site serves page 1 at index.html — index_1.html is a 404.
    if str(page) == '1':
        url = 'http://pic.netbian.com/4k' + type_ + '/index.html'
    else:
        url = 'http://pic.netbian.com/4k' + type_ + '/index_' + str(page) + '.html'
    async with aiohttp.ClientSession() as session:
        html = await get_page(url, session)
    sel = etree.HTML(html)
    # Each <li> in the third grid div is one thumbnail linking to a detail page.
    return ['http://pic.netbian.com' + li.xpath('./a/@href')[0]
            for li in sel.xpath('//div[@id="main"]/div[3]/ul/li')]
async def get_img(urls):
    """Visit each detail page and extract the full-size image URL and its title.

    urls: iterable of absolute detail-page URLs.
    Returns a list of {'img_url': ..., 'title': ...} dicts.
    """
    img_infos = []
    # Fix: reuse ONE session for all requests — the original opened a new
    # ClientSession (new connection pool) for every single URL.
    async with aiohttp.ClientSession() as session:
        for url in urls:
            html = await get_page(url, session)
            sel = etree.HTML(html)
            srcs = sel.xpath('//*[@id="img"]/img/@src')
            alts = sel.xpath('//*[@id="img"]/img/@alt')
            if not srcs:
                # Layout mismatch (e.g. error page): skip instead of IndexError.
                continue
            img_infos.append({
                'img_url': 'http://pic.netbian.com' + srcs[0],
                'title': alts[0] if alts else 'untitled',
            })
    return img_infos
async def save_img(infos):
    """Download each image and write it to bian/<title>.jpg.

    infos: iterable of {'img_url': ..., 'title': ...} dicts from get_img().
    """
    # Fix: one shared session for every download instead of one per image.
    async with aiohttp.ClientSession() as session:
        for info in infos:
            url = info.get('img_url')
            title = info.get('title')
            async with session.get(url, headers=header) as response:
                img = await response.read()
            # Scraped titles may contain characters that are illegal in
            # filenames (/, ?, :, ...) — replace them so open() can't fail.
            safe = ''.join('_' if ch in '\\/:*?"<>|' else ch for ch in str(title))
            name = 'bian/' + safe + '.jpg'
            with open(name, 'wb') as f:
                f.write(img)
async def main():
    """Drive the pipeline: listing page -> detail pages -> image downloads."""
    page = input('请输入要爬取的页码:')
    type_ = input('请输入详情页类型:(拼音方式)')
    # The three stages are strictly sequential (each consumes the previous
    # result), so plain awaits are equivalent to — and simpler than — the
    # original ensure_future/gather-on-one-task dance, which only forced
    # [0]-unwrapping of single-element gather results.
    urls = await get_img_list(page, type_)
    infos = await get_img(urls)
    await save_img(infos)
if __name__ == '__main__':
    # exist_ok avoids the check-then-create race of the original
    # os.path.exists()/os.mkdir() pair.
    os.makedirs('bian', exist_ok=True)
    # Read by get_page()/save_img() as a module-level global.
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/81.0.4044.62 Safari/537.36"
    }
    # asyncio.run (3.7+) replaces the deprecated get_event_loop()/
    # run_until_complete pair and closes the loop on exit.
    asyncio.run(main())