马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
import requests
from lxml import etree
import os
class ManhuaSpider:
    """Crawl a chain of picture pages and save each page's images to disk.

    Starting from ``start_url``, every page is downloaded, its images are
    written into a directory named after the page title, and the crawl
    follows the page's "next article" link until no such link is found.
    """

    # Seconds to wait on the server before giving up on a request. Without a
    # timeout, requests.get() can block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        # Site root; prepended to the "next article" href to build a full URL.
        self.url = 'http://www.ccnn88.com'
        # First page of the chain to crawl.
        self.start_url = 'http://www.ccnn88.com/htm/Pic4/15608.htm'
        # Browser-like User-Agent sent with every request.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}

    def parse_url(self, url):
        """Fetch ``url`` with the spider's headers and return the response.

        Args:
            url: Address to download (a page or an image).

        Returns:
            The ``requests`` response object; its status is not checked.
        """
        return requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)

    @staticmethod
    def _image_filename(url):
        """Return the local file name for an image URL.

        The name is the part after the last underscore, reduced to its final
        path segment so that it never contains a ``/`` (which would make
        ``open`` fail or write outside the target directory).
        """
        return url.split('_')[-1].rsplit('/', 1)[-1]

    def save_img(self, imgs, title):
        """Write every downloaded image into a new directory ``title``.

        Args:
            imgs: Iterable of response-like objects exposing ``url`` and
                ``content`` (raw bytes).
            title: Directory to create for this page's images.

        Raises:
            FileExistsError: If the directory ``title`` already exists.
        """
        os.mkdir(title)
        for img in imgs:
            # Build the path explicitly instead of chdir-ing into the
            # directory: the working directory stays untouched even when a
            # write fails half-way through.
            target = os.path.join(title, self._image_filename(img.url))
            with open(target, 'wb') as f:
                f.write(img.content)

    def get_content_list(self, url):
        """Download the images of one page and locate the next page.

        Args:
            url: Despite its name, the already-fetched response of the page
                (as returned by ``parse_url``), not a URL string.

        Returns:
            The absolute URL of the next page, or ``None`` when the page has
            no "next article" link.

        Raises:
            IndexError: If the page has no ``<title>`` text.
        """
        html = etree.HTML(url.content.decode())
        title = html.xpath("/html/head/title/text()")[0]
        print(title)
        # A path separator inside the title would make os.mkdir fail, so
        # replace it; titles without separators are left unchanged.
        for sep in (os.sep, os.altsep):
            if sep:
                title = title.replace(sep, '_')
        # Evaluate the "next article" lookup once and reuse the result.
        next_links = html.xpath("//li/a[contains(text(),'下一篇')]/@href")
        next_url = self.url + next_links[0] if next_links else None
        # An existing directory means this page was already downloaded.
        if os.path.isdir(title):
            return next_url
        img_sources = html.xpath("//div[@class='details-content text-justify']//img/@src")
        imgs = [self.parse_url(src) for src in img_sources]
        self.save_img(imgs, title)
        return next_url

    def run(self):
        """Crawl from ``start_url``, following next-page links until none is left."""
        next_url = self.start_url
        while next_url is not None:
            res = self.parse_url(next_url)
            next_url = self.get_content_list(res)
# Script entry point: build the spider and crawl from its start page.
if __name__ == "__main__":
    ManhuaSpider().run()
|