此人 发表于 2020-3-7 21:42:38

这是个好网站

import os
from urllib.parse import urljoin

import requests
from lxml import etree


class ManhuaSpider:
    """Walk a chain of gallery pages and save each page's images to disk.

    Starting at ``start_url``, every page is fetched, its ``<title>`` is
    turned into a directory name, all ``<img>`` sources are downloaded into
    that directory, and the first ``//li/a`` link is followed as the next
    page. Pages whose directory already exists are skipped.
    """

    # Characters that Windows rejects in file and directory names. A raw page
    # title containing any of them makes ``os.mkdir`` fail with
    # "OSError: the filename, directory name, or volume label syntax is
    # incorrect".
    _INVALID_CHARS = '\\/:*?"<>|'

    # Seconds to wait for the server; without a timeout ``requests.get`` can
    # block forever on a stalled connection.
    timeout = 10

    def __init__(self):
        self.url = 'http://www.ccnn88.com'
        self.start_url = 'http://www.ccnn88.com/htm/Pic4/15608.htm'
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}

    @staticmethod
    def _safe_name(name):
        """Return *name* made safe for use as a file or directory name.

        Surrounding whitespace is stripped, characters that are illegal in
        Windows paths (and control characters) become ``_``, and trailing
        dots/spaces are removed. An empty result falls back to
        ``'untitled'``.
        """
        cleaned = ''.join(
            '_' if ch in ManhuaSpider._INVALID_CHARS or ord(ch) < 32 else ch
            for ch in name.strip()
        )
        # Windows silently drops trailing dots and spaces, so remove them
        # up front to keep the on-disk name predictable.
        cleaned = cleaned.rstrip('. ')
        return cleaned or 'untitled'

    def parse_url(self, url):
        """Fetch *url* with the spider's headers and return the response."""
        return requests.get(url, headers=self.headers, timeout=self.timeout)

    def save_img(self, imgs, title):
        """Write every response in *imgs* into a directory named after *title*.

        Args:
            imgs: iterable of response-like objects exposing ``.url`` (str)
                and ``.content`` (bytes).
            title: page title; it is sanitised before being used as the
                directory name.
        """
        folder = self._safe_name(title)
        # Build paths explicitly instead of chdir-ing in and out: a failed
        # write can no longer leave the process in the wrong directory.
        os.makedirs(folder, exist_ok=True)
        for img in imgs:
            # Use only the last path segment (query string dropped), then
            # keep the part after the final underscore as the original did.
            last_segment = img.url.split('?', 1)[0].rstrip('/').rsplit('/', 1)[-1]
            filename = self._safe_name(last_segment.split('_')[-1])
            with open(os.path.join(folder, filename), 'wb') as f:
                f.write(img.content)

    def get_content_list(self, url):
        """Download the images of one page and return the next page's URL.

        Args:
            url: the *response* object for the page (the parameter name is
                kept for backward compatibility).

        Returns:
            Absolute URL of the next page, or ``None`` when the page has no
            ``//li/a`` link (or could not be parsed).
        """
        html = etree.HTML(url.content.decode())
        if html is None:
            # lxml returns None for an empty document; nothing to follow.
            return None

        # Resolve relative links against the page itself when possible.
        base = getattr(url, 'url', None) or self.url

        # xpath() always returns a list; the directory name must be a string.
        titles = html.xpath("/html/head/title/text()")
        title = titles[0].strip() if titles else ''
        print(title)

        # NOTE(review): the index on the original expression was lost in the
        # pasted source; the first matching link is assumed to be "next".
        links = html.xpath("//li/a/@href")
        next_url = urljoin(base, links[0]) if links else None

        # Check the sanitised name: that is what save_img creates on disk.
        if os.path.isdir(self._safe_name(title)):
            return next_url

        imgs = [
            self.parse_url(urljoin(base, src))
            for src in html.xpath("//img/@src")
        ]
        self.save_img(imgs, title)
        return next_url

    def run(self):
        """Crawl from ``start_url`` until there is no next page.

        Already visited URLs are remembered so that a cycle of "next" links
        ends the crawl instead of looping forever.
        """
        seen = set()
        next_url = self.start_url
        while next_url is not None and next_url not in seen:
            seen.add(next_url)
            res = self.parse_url(next_url)
            next_url = self.get_content_list(res)
      
      
if __name__ == "__main__":
    # Script entry point: build the spider and crawl from its start URL.
    ManhuaSpider().run()

g403290116 发表于 2020-8-1 21:59:28

Traceback (most recent call last):
File "D:\AA\1.py", line 44, in <module>
    manhua.run()
File "D:\AA\1.py", line 39, in run
    next_url = self.get_content_list(res)
File "D:\AA\1.py", line 32, in get_content_list
    self.save_img(imgs,title)
File "D:\AA\1.py", line 17, in save_img
    os.mkdir(title)
OSError: 文件名、目录名或卷标语法不正确。
页: [1]
查看完整版本: 这是个好网站