scrapy爬取妹子图(一)
看最近都喜欢发妹子图QAQ
-----------------------------------------##pipelines.py
from itemadapter import ItemAdapter
import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
from mmkk.settings import IMAGES_STORE as images_store
import os
class MmkkPipeline(ImagesPipeline):
    """Image pipeline that files every download under a per-gallery folder.

    Each item is expected to provide the keys ``image_url``, ``image_name``
    and ``folder_name``.
    """

    def get_media_requests(self, item, info):
        """Yield the single download request for the item's image.

        The folder and file name are carried in ``meta`` so that
        ``file_path`` can build the storage path from the request alone.
        """
        yield scrapy.Request(
            url=item['image_url'],
            meta={
                'folder_name': item['folder_name'],
                'name': item['image_name'],
            },
        )

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return ``<folder_name>/<name>`` as the storage path of the image.

        ``item`` is keyword-only and optional: recent Scrapy versions pass
        it, older ones do not, so accepting it keeps both working.
        """
        return os.path.join(request.meta['folder_name'], request.meta['name'])

    def item_completed(self, results, item, info):
        """Return the item if at least one image was downloaded.

        ``results`` is a list of ``(ok, value)`` pairs; for successful
        downloads ``value`` is a dict that includes the stored ``path``.

        Raises:
            DropItem: if no download for this item succeeded.
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no image")
        return item
-----------------------------------------##items.py
import scrapy
class MmkkItem(scrapy.Item):
    """Container for one scraped image: where it is, and how to store it."""

    # Address the image is downloaded from.
    image_url = scrapy.Field()

    # File name (including extension) the image is saved under.
    image_name = scrapy.Field()

    # Sub-folder that groups the images belonging to one gallery page.
    folder_name = scrapy.Field()
--------------------------------------##settings.py
# Do not consult robots.txt before crawling.
ROBOTSTXT_OBEY = False

# Headers sent with every request; the User-Agent is a desktop Edge string.
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/92.0.4515.107 Safari/537.36 Edg/92.0.902.62'
    ),
}

# Enabled item pipelines, keyed by import path; the value is the run order.
ITEM_PIPELINES = {
    'mmkk.pipelines.MmkkPipeline': 300,
}

# Image pipeline configuration.
IMAGES_STORE = 'meizitu'          # directory the downloaded images go into
IMAGES_URLS_FIELD = 'image_url'   # item field that holds the image URL
IMAGES_EXPIRES = 90               # per the Scrapy docs, expiry is in days
------------------------------------## mm.py
import scrapy
from mmkk.items import MmkkItem
class MmSpider(scrapy.Spider):
    """Crawl the category listing and emit one item per gallery image."""

    name = 'mm'
    allowed_domains = ['www.mmkk.me']
    start_urls = ['https://www.mmkk.me/category/xinggan/']

    def parse(self, response):
        """Queue every gallery linked from a listing page, then the next page.

        Yields:
            Requests for each gallery page (handled by ``parse0``) and, when
            a "next" link is present, a request for the following listing
            page (handled by ``parse``).
        """
        html_urls = response.xpath(
            '//*/div/div/a[@class="item-link"]/@href'
        ).extract()
        for html_url in html_urls:
            self.logger.debug('Queueing gallery page %s', html_url)
            # response.follow resolves relative hrefs; scrapy.Request would
            # reject them.
            yield response.follow(url=html_url, callback=self.parse0)

        new_url = response.xpath(
            '/html/body/div/ol/li[@class = "next"]/a/@href'
        ).extract_first()
        if new_url:
            yield response.follow(url=new_url, callback=self.parse)

    def parse0(self, response):
        """Yield one ``MmkkItem`` for each image found on a gallery page.

        The folder name is the first ``<meta content=...>`` value in the
        page head. For each image, the URL comes from ``data-original`` and
        the file name from ``alt`` with ``.jpg`` appended.
        """
        folder_name = response.xpath('/html/head/meta/@content').extract_first()
        if not folder_name:
            # Without a folder name the image has nowhere to be stored.
            self.logger.warning('No folder name found on %s', response.url)
            return

        for img in response.xpath('//*/div/img'):
            image_url = img.xpath('@data-original').extract_first()
            name = img.xpath('@alt').extract_first()
            if not image_url or not name:
                # Not a gallery image (no lazy-load source or no caption).
                continue

            # A fresh item per image: reusing one instance would make every
            # yielded item alias the same, later-overwritten object.
            item = MmkkItem()
            item['folder_name'] = folder_name
            item['image_url'] = response.urljoin(image_url)
            item['image_name'] = name + '.jpg'
            yield item
页:
[1]