这是个好网站
import os
from urllib.parse import urljoin

import requests
from lxml import etree
class ManhuaSpider:
    """Crawl a chain of gallery pages and save each page's images to disk.

    Starting at ``start_url``, every page is fetched, the images referenced
    by its ``<img src>`` attributes are downloaded into a directory named
    after the page title, and the first ``//li/a/@href`` link is followed as
    the next page. Crawling stops when a page has no such link.
    """

    # Characters Windows refuses in file and directory names; an unsanitised
    # page title containing any of them makes os.mkdir raise OSError.
    _INVALID_NAME_CHARS = frozenset('\\/:*?"<>|')

    # Seconds to wait on the server; without a timeout requests.get can
    # block forever on a stalled connection.
    _REQUEST_TIMEOUT = 30

    def __init__(self):
        self.url = 'http://www.ccnn88.com'
        self.start_url = 'http://www.ccnn88.com/htm/Pic4/15608.htm'
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}

    @classmethod
    def _safe_name(cls, name, default='untitled'):
        """Return *name* reduced to a string usable as one path component.

        Whitespace runs (including newlines) collapse to single spaces,
        forbidden and control characters become ``_``, and trailing dots and
        spaces are dropped. *default* is returned when nothing is left.
        Applying the function to its own output changes nothing.
        """
        collapsed = ' '.join(str(name).split())
        cleaned = ''.join(
            '_' if ch in cls._INVALID_NAME_CHARS or ord(ch) < 32 else ch
            for ch in collapsed
        )
        # Windows silently strips trailing dots/spaces, so remove them here
        # to keep the name we test with os.path.isdir equal to what exists.
        return cleaned.rstrip('. ') or default

    def parse_url(self, url):
        """Fetch *url* with the spider's headers and return the response."""
        return requests.get(url, headers=self.headers,
                            timeout=self._REQUEST_TIMEOUT)

    def save_img(self, imgs, title):
        """Write downloaded images into a directory named after *title*.

        Args:
            imgs: Iterable of response-like objects exposing ``url`` (str)
                and ``content`` (bytes).
            title: Desired directory name; it is sanitised before use.
        """
        folder = self._safe_name(title)
        os.makedirs(folder, exist_ok=True)
        for img in imgs:
            # Keep the text after the last '_' as before, but never let a
            # path separator from the URL leak into the file name.
            tail = img.url.split('_')[-1].rsplit('/', 1)[-1]
            filename = self._safe_name(tail, default='image')
            # Joining paths instead of os.chdir keeps the working directory
            # intact even when a write fails half-way.
            with open(os.path.join(folder, filename), 'wb') as f:
                f.write(img.content)

    def get_content_list(self, url):
        """Save the images of one fetched page and return the next page URL.

        Args:
            url: The fetched page *response* (not a URL string), as returned
                by ``parse_url``.

        Returns:
            The absolute URL of the next page, or ``None`` when the page has
            no ``//li/a/@href`` link.
        """
        html = etree.HTML(url.content.decode())
        # xpath() returns a list; take the first text node as the title.
        titles = html.xpath("/html/head/title/text()")
        title = self._safe_name(titles[0] if titles else '')
        print(title)
        # NOTE(review): the posted code concatenated self.url with the whole
        # xpath result; the first match is assumed to be the "next" link.
        hrefs = html.xpath("//li/a/@href")
        next_url = urljoin(self.url, hrefs[0]) if hrefs else None
        # An existing directory means this page was already downloaded.
        if os.path.isdir(title):
            return next_url
        # NOTE(review): this line was truncated in the posted source; it is
        # reconstructed from save_img, which needs objects with .url/.content.
        imgs = [self.parse_url(urljoin(url.url, src))
                for src in html.xpath("//img/@src")]
        self.save_img(imgs, title)
        return next_url

    def run(self):
        """Follow next-page links from ``start_url`` until none remains."""
        next_url = self.start_url
        while next_url is not None:
            res = self.parse_url(next_url)
            next_url = self.get_content_list(res)
# Script entry point: build the spider and crawl from its start page.
if __name__ == "__main__":
    manhua = ManhuaSpider()
    manhua.run()

# Output reported by the original run:
# Traceback (most recent call last):
File "D:\AA\1.py", line 44, in <module>
manhua.run()
File "D:\AA\1.py", line 39, in run
next_url = self.get_content_list(res)
File "D:\AA\1.py", line 32, in get_content_list
self.save_img(imgs,title)
File "D:\AA\1.py", line 17, in save_img
os.mkdir(title)
OSError: 文件名、目录名或卷标语法不正确。
页:
[1]