|
|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 wongyusing 于 2018-10-14 19:08 编辑
代码如下:
- # -*- coding: utf-8 -*-
- import scrapy
- import re
- from scrapy.http import Request
- import os
- from mm131.items import Mm131Item
- class MeiziSpider(scrapy.Spider):
- name = 'meizi'
- #allowed_domains = ['http://www.mm131.com/']
- start_urls = ['http://www.mm131.com/xinggan/']
- base = r'/home/wongyusing/桌面/gril/mm131/mm131/download/'
- #如果需要测试,请修改自己电脑的路径
- def parse(self, response):
- items = [] #/html/body/div[5]/dl/dd
- mains = response.xpath('/html/body/div/dl//a[@target="_blank"]')
- sort_name = response.xpath('/html/body/div/dl/dt/a[2]/text()').extract()[0]
- #print(sort_name)#获取分类名,例子‘性感美女’
- for main in mains:
- item = Mm131Item()
- item['title'] = main.xpath('./text()').extract()[0]#获取妹子标题
- item['girl_url'] = main.xpath('./@href').extract()[0]#获取妹子入口图片
- item['file_name'] = self.base + sort_name + '/' + item['title']#拼接绝对路径
- items.append(item)
- for item in items:#创建文件夹
- fileName=item['file_name']
- if not os.path.exists(fileName):
- os.makedirs(fileName)
- # #用meta传入下一层
- yield Request(url=item['girl_url'],meta={'item1':item},callback=self.parse_one)
- def parse_one(self,response):
- item2 = response.meta['item1']
- item = Mm131Item()
- img_all_ = response.xpath('/html/body/div/div/span[1]/text()').extract()[0]
- req = r'共(.*?)页'
- max_img_page = re.findall(req,img_all_)[0]#获取妹子图集的总页数
- items = []
- for i in range(2,int(max_img_page)+1):
- item['file_name'] = item2['file_name']
- item['path'] = item['file_name'] + '/' + str(i) + '.jpg'#生成绝对路径
- item['img_url'] = response.url[:-5] + '_' + str(i) + '.html'#拼接每一页的url
- items.append(item)
- for item in items:
- yield Request(url=item['img_url'], meta={'item2': item}, callback=self.parse_two)
- def parse_two(self,response):
- item3 = response.meta['item2']
- item = Mm131Item()
- #获取图片的url,问题处在下面这条代码,返回的url会自动连到腾讯的图片
- url_3 = response.xpath('/html/body/div/div[2]/a/img/@src').extract()[0]
- print(url_3)
复制代码
代码网盘链接:https://pan.baidu.com/s/17AyCvpysUjxqpue04eAIUg
密码:7tjb
代码运行,运行mainmain.py就可以运行代码了
问题:
1.这网站是不是静态啊??该如何判断类型??
2.打印出来的URL不是想要的妹子图,该如何解决??是需要加上get的参数吗???如下图的链接,无法链接到真正的图片
http://img1.mm131.me/pic/3849/40.jpg
http://img1.mm131.me/pic/3855/55.jpg
http://img1.mm131.me/pic/3853/50.jpg
http://img1.mm131.me/pic/3850/50.jpg
http://img1.mm131.me/pic/3851/45.jpg
http://img1.mm131.me/pic/3852/48.jpg
http://img1.mm131.me/pic/3857/39.jpg
http://img1.mm131.me/pic/3856/39.jpg
|
|