|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
import os
from urllib.parse import urljoin, urlsplit

import requests
from lxml import etree
class ManhuaSpider:
    """Crawl a chain of gallery pages and save each page's images to disk.

    Starting at ``start_url``, each page is fetched, its images are stored in
    a directory named after the page title (created in the current working
    directory), and the page's "下一篇" (next post) link is followed until a
    page has no such link or a page that was already visited comes up again.
    """

    # Seconds to wait on the server per request; without a timeout a stalled
    # connection would block the crawl forever.
    timeout = 30

    # Characters that are illegal (or dangerous) in file and directory names
    # on common filesystems, each mapped to "_" by ``_safe_name``.
    _UNSAFE_CHARS = dict.fromkeys([*range(32), *map(ord, '\\/:*?"<>|')], '_')

    def __init__(self):
        # Site root and the first page of the chain to crawl.
        self.url = 'http://www.ccnn88.com'
        self.start_url = 'http://www.ccnn88.com/htm/Pic4/15608.htm'
        # Browser-like User-Agent sent with every request.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}

    @classmethod
    def _safe_name(cls, name, fallback):
        """Return *name* as a single safe path component, or *fallback* if empty."""
        # Trim surrounding whitespace and dots first so names such as ".." or
        # "\r\nTitle" cannot escape the target directory or fail on Windows.
        cleaned = name.strip().strip(' .').translate(cls._UNSAFE_CHARS)
        return cleaned or fallback

    def parse_url(self, url):
        """Fetch *url* with the spider's headers and return the response object."""
        return requests.get(url, headers=self.headers, timeout=self.timeout)

    def save_img(self, imgs, title):
        """Write every downloaded image into a directory named after *title*.

        Args:
            imgs: iterable of response-like objects exposing ``url`` (str) and
                ``content`` (bytes).
            title: page title; it is sanitised before being used as the
                directory name.

        The file name is the part of the URL's last path segment that follows
        its final underscore (the whole segment when there is none).
        """
        folder = self._safe_name(title, 'untitled')
        # Paths are joined instead of chdir-ing in and out, so a failure in the
        # middle of the loop can no longer leave the process in the wrong
        # working directory.
        os.makedirs(folder, exist_ok=True)
        for index, img in enumerate(imgs):
            segment = urlsplit(img.url).path.rsplit('/', 1)[-1]
            filename = self._safe_name(segment.split('_')[-1], 'image_%d' % index)
            with open(os.path.join(folder, filename), 'wb') as f:
                f.write(img.content)

    def get_content_list(self, url):
        """Download the images of one page and return the next page's URL.

        Args:
            url: despite the name, the *response object* returned by
                ``parse_url`` for the page (its ``content`` and ``url`` are
                read).

        Returns:
            The absolute URL of the "下一篇" link, or ``None`` when the page has
            no such link. Pages whose directory already exists are skipped.
        """
        html = etree.HTML(url.content.decode())
        if html is None:
            # Nothing parseable came back, so there is no link to follow.
            return None

        titles = html.xpath("/html/head/title/text()")
        # Fall back to the page's file name when the document has no <title>.
        title = titles[0] if titles else url.url.rsplit('/', 1)[-1]
        print(title)

        next_links = html.xpath("//li/a[contains(text(),'下一篇')]/@href")
        # urljoin resolves root-relative, page-relative and absolute hrefs.
        next_url = urljoin(url.url, next_links[0]) if next_links else None

        # An existing directory means this page was saved by an earlier run.
        if os.path.isdir(self._safe_name(title, 'untitled')):
            return next_url

        sources = html.xpath("//div[@class='details-content text-justify']//img/@src")
        # All images are fetched before the directory is created, so a failed
        # download does not leave a directory that marks the page as done.
        imgs = [self.parse_url(urljoin(url.url, src)) for src in sources]
        self.save_img(imgs, title)
        return next_url

    def run(self):
        """Walk the page chain from ``start_url`` until it ends or repeats."""
        visited = set()
        next_url = self.start_url
        # The visited set stops the crawl when "next" links form a cycle.
        while next_url is not None and next_url not in visited:
            visited.add(next_url)
            res = self.parse_url(next_url)
            next_url = self.get_content_list(res)
if __name__ == "__main__":
    # Script entry point: build the spider and crawl from its start URL.
    spider = ManhuaSpider()
    spider.run()
复制代码 |
|