啊淡淡蛋蛋 发表于 2021-9-11 19:22:19

scrapy爬取妹子图(一)


看到最近大家都喜欢发妹子图，我也来发一个 QAQ






-----------------------------------------##pipelines.py

from itemadapter import ItemAdapter
import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
from mmkk.settings import IMAGES_STORE as images_store
import os


class MmkkPipeline(ImagesPipeline):
    """Image pipeline that stores each picture as ``<folder_name>/<image_name>``.

    The folder and file name come from the item and are carried on the
    download request's ``meta`` so that ``file_path`` can read them back.
    """

    def get_media_requests(self, item, info):
        """Yield the download request for the item's single image.

        Args:
            item: Item providing ``image_url``, ``folder_name`` and
                ``image_name``.
            info: Pipeline bookkeeping object supplied by Scrapy (unused).

        Yields:
            scrapy.Request: Request for ``item['image_url']`` whose ``meta``
            holds the target ``folder_name`` and file ``name``.
        """
        yield scrapy.Request(
            url=item['image_url'],
            meta={
                'folder_name': item['folder_name'],
                'name': item['image_name'],
            },
        )

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the relative storage path ``<folder_name>/<name>``.

        Both parts are read from ``request.meta`` as set by
        ``get_media_requests``.

        Args:
            request: The image download request.
            response: Unused.
            info: Unused.
            item: Unused. Accepted as a keyword-only argument because newer
                Scrapy releases pass the item to ``file_path``; older
                releases simply never supply it.

        Returns:
            str: Path of the image relative to the images store.
        """
        return os.path.join(request.meta['folder_name'], request.meta['name'])

    def item_completed(self, results, item, info):
        """Keep the item only if at least one image was downloaded.

        Args:
            results: Iterable of ``(ok, value)`` pairs, one per media
                request; for successful downloads ``value`` is a dict that
                contains the stored ``path``.
            item: The item whose downloads have finished.
            info: Pipeline bookkeeping object supplied by Scrapy (unused).

        Returns:
            The unchanged ``item``.

        Raises:
            DropItem: If no download succeeded.
        """
        image_paths = [x['path'] for ok, x in results if ok]

        if not image_paths:
            raise DropItem("Item contains no image")

        return item



-----------------------------------------##items.py

import scrapy


class MmkkItem(scrapy.Item):
    """A single picture to download, plus where to store it."""

    # URL of the image; used as the download request URL by the pipeline.
    image_url = scrapy.Field()
    # File name (including extension) the image is saved under.
    image_name = scrapy.Field()
    # Directory the image is saved into; one folder per gallery page.
    folder_name = scrapy.Field()






--------------------------------------##settings.py

# Do not consult the site's robots.txt before crawling.
ROBOTSTXT_OBEY = False

# Headers sent with every request; a desktop browser User-Agent is used
# instead of Scrapy's default one.
DEFAULT_REQUEST_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/92.0.4515.107 Safari/537.36 Edg/92.0.902.62"
    ),
}

# Enable the project's image pipeline (the number is its run order).
ITEM_PIPELINES = {"mmkk.pipelines.MmkkPipeline": 300}

# Image pipeline configuration: root directory for downloads, the item field
# that holds the image URL, and how many days a stored image counts as fresh.
IMAGES_STORE = "meizitu"
IMAGES_URLS_FIELD = "image_url"
IMAGES_EXPIRES = 90


------------------------------------## mm.py
import scrapy
from mmkk.items import MmkkItem

class MmSpider(scrapy.Spider):
    """Walk the category listing pages and emit one item per gallery image."""

    name = 'mm'
    allowed_domains = ['www.mmkk.me']
    start_urls = ['https://www.mmkk.me/category/xinggan/']

    def parse(self, response):
        """Schedule every gallery linked from a listing page, then the next page.

        Args:
            response: Response for a category listing page.

        Yields:
            scrapy.Request: One request per gallery link (handled by
            ``parse0``) and, when a "next" link exists, one request for the
            following listing page (handled by ``parse``).
        """
        html_urls = response.xpath(
            '//*/div/div/a[@class="item-link"]/@href'
        ).extract()
        for html_url in html_urls:
            self.logger.info('gallery page: %s', html_url)
            # response.follow resolves relative hrefs against the page URL,
            # which a bare scrapy.Request would reject.
            yield response.follow(url=html_url, callback=self.parse0)

        new_url = response.xpath(
            '/html/body/div/ol/li[@class = "next"]/a/@href'
        ).extract_first()
        if new_url:
            yield response.follow(url=new_url, callback=self.parse)

    def parse0(self, response):
        """Emit one ``MmkkItem`` per lazily loaded image on a gallery page.

        The folder name is the ``content`` of the first ``<meta>`` tag in the
        page head; the image URL and file name come from each ``<img>``'s
        ``data-original`` and ``alt`` attributes.

        Args:
            response: Response for a single gallery page.

        Yields:
            MmkkItem: A fresh item per image, with ``folder_name``,
            ``image_url`` (absolute) and ``image_name`` (``.jpg`` appended).
        """
        # Fall back to a fixed folder so the pipeline never receives None.
        folder_name = response.xpath(
            '/html/head/meta/@content'
        ).extract_first(default='unknown')

        for img in response.xpath('//*/div/img'):
            image_url = img.xpath('@data-original').extract_first()
            if not image_url:
                # No lazy-load source on this <img>: nothing to download.
                continue

            name = img.xpath('@alt').extract_first()
            if not name:
                # No alt text: use the URL's base name without its extension.
                name = image_url.rsplit('/', 1)[-1].rsplit('.', 1)[0]
            # NOTE(review): if every image in a gallery shares the same alt
            # text, the files will overwrite each other - confirm on the site.

            # A new item per image, so items already yielded are not mutated.
            item = MmkkItem()
            item['folder_name'] = folder_name
            item['image_url'] = response.urljoin(image_url)
            item['image_name'] = name + '.jpg'
            yield item
页: [1]
查看完整版本: scrapy爬取妹子图(一)