|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
第一个问题:
第一个列表为什么是空的呢?
第二个问题:
为什么前面抓到了后面就出错呢?
import asyncio
import aiohttp
from lxml import etree
img_list = []
#//*[@id="main"]/div[3]/ul/li[20]/a/img
#//*[@id="main"]/div[3]/ul/li[19]/a/img
async def geturl(url1):
global img_list
async with aiohttp.ClientSession() as session:
async with await session.get(url1) as res:
res.encoding = 'gbk'
tree = etree.HTML(await res.text())
print(tree.xpath('//*[@id="main"]/div[3]/ul/li/a/img/@src')[:-1],'\n\n\n\n\n')
for urlp in tree.xpath('//*[@id="main"]/div[3]/ul/li/a/img/@src')[:-1]:
img_list.append(urlp)
'''
async def getimg(url2):
name = url2.rsplit('/', 1)[1]
async with aiohttp.ClientSession() as session:
async with await session.get(url2) as res:
async with open(f'彼岸桌面壁纸下载/{name}', 'wb') as f:
await f.write(await res.content.read())'''
async def main1():
tasks = [asyncio.create_task(geturl(f'http://www.netbian.com/dongman/index_{i}.htm')) for i in range(1, 20)]
await asyncio.wait(tasks)
'''
async def main2():
tasks = [asyncio.create_task(getimg(url2)) for url2 in img_list]
await asyncio.wait(tasks)'''
if __name__ == '__main__':
asyncio.run(main1())
print(len(img_list))
报错如下:
Task exception was never retrieved
future: <Task finished name='Task-19' coro=<geturl() done, defined at D:/MyPcharmProject/爬虫/彼岸壁纸疯狂爬.py:8> exception=UnicodeDecodeError('gb2312', b'<!doctype html>\r\n<html>\r\n<head>\r\n<meta charset="gbk" />\r\n<title>\xa1\xbe\xb6\xaf\xc2\xfe\xd7\xc0\xc3\xe6\xb1\xda\xd6\xbd\xa1\xbf\xb8\xdf\xc7\xe5\xb6\xaf\xc2\xfe\xcd\xbc\xc6\xac-\xb1\xcb\xb0\xb6\xd7\xc0\xc3\xe6\xb1\xda\xd6\xbd</title>\r\n<meta name="keywords" content="\xb6\xaf\xc2\xfe\xb1\xda\xd6\xbd,\xb8\xdf\xc7\xe5\xb6\xaf\xc2\xfe\xb1\xda\xd6\xbd,\xb6\xaf\xc2\xfe\xd7\xc0\xc3\xe6\xb1\xda\xd6\xbd,\xb8\xdf\xc7\xe5\xb6\xaf\xc2\xfe\xd7\xc0\xc3\xe6\xb1\xda\xd6\xbd" />\r\n<meta name="description" content="\xb1\xcb\xb0\xb6\xd7\xc0\xc3\xe6\xc3\xe2\xb7\xd1\xcc\xe1\xb9\xa9\xc7\xe5\xd0\xc2\xce\xa8\xc3\xc0\xb5\xc4\xb6\xaf\xc2\xfe\xb1\xda\xd6\xbd\xcf\xc2\xd4\xd8,\xb0\xd1\xb8\xdf\xc7\xe5\xb5\xc4\xb6\xaf\xc2\xfe\xb1\xda\xd6\xbd\xcd\xc6\xbc\xf6\xb8\xf8\xc4\xfa,\xc8\xc3\xc4\xfa\xb8\xfc\xbf\xec\xb5\xc4\xd5\xd2\xb5\xbd\xc4\xfa\xcf\xeb\xd2\xaa\xb5\xc4\xb6\xaf\xc2\xfe\xd7\xc0\xc3\xe6\xb1\xda\xd6\xbd" />\r\n<link href="/skin/style.css" rel="stylesheet" type="text/css" />\r\n<meta http-equiv="x-ua-compatible" content="ie=7" />\r\n<script type="text/javascript" src="/skin/js/jquery.min.js"></script>\r\n<script type="text/javascript" src="/skin/js/common.js"></script>\r\n</head>\r\n<body>\r\n<div id="header"><div class="head"><a href="http://www.netbian.com/" class="logo" title="\xb1\xda\xd6\xbd">\xb1\xda\xd6\xbd</a>\r\n<ul class="menu">\r\n <li class="more"><a class="m">\xb7\xd6\xc0\xe0<i></i></a><div class="nav cate"><em></em><a href="http://pic.netbian.com/" target="_blank" style="color:#FFA800;" title="4k\xb1\xda\xd6\xbd">4k\xb1\xda\xd6\xbd</a> <a href="/rili/"
...........................................
@2010-2021 <a href="http://www.beian.miit.gov.cn/" style="color:#FFFFFF;" target="_blank">\xc3\xf6ICP\xb1\xb813013111\xba\xc5-1</a> <a href="http://www.netbian.com/sitemap/" target="_blank" style="color:#ffffff;">\xcd\xf8\xd5\xbe\xb5\xd8\xcd\xbc</a> \xbf\xcd\xb7\xfeQQ:55346968\r\n</div>\r\n</div>\r\n\r\n</body>\r\n</html>\r\n<script src=\'/e/public/onclick/?enews=doclass&classid=19\'></script>', 8728, 8729, 'illegal multibyte sequence')>
Traceback (most recent call last):
File "D:/MyPcharmProject/爬虫/彼岸壁纸疯狂爬.py", line 13, in geturl
tree = etree.HTML(await res.text())
File "C:\Users\lenovo\AppData\Local\Programs\Python\Python38\lib\site-packages\aiohttp\client_reqrep.py", line 1081, in text
return self._body.decode(encoding, errors=errors) # type: ignore
UnicodeDecodeError: 'gb2312' codec can't decode byte 0xeb in position 8728: illegal multibyte sequence |
|