Without further ado, straight to the code.
1. Create the Scrapy project:
- cd F:\编程\Python\Scrapy
- scrapy startproject mkz
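For reference, scrapy startproject generates roughly this skeleton (the exact files vary a little between Scrapy versions); the two spiders below go under mkz\spiders\:
- mkz/
-     scrapy.cfg
-     mkz/
-         __init__.py
-         items.py
-         middlewares.py
-         pipelines.py
-         settings.py
-         spiders/
-             __init__.py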
2. Write the chapter spider, which collects every chapter's title and URL:
- # in spiders\chapters.py
- import scrapy
- class ChapterSpider(scrapy.Spider):
-     name = "chapter"
-     start_urls = ["https://www.mkzhan.com/211692/"]
-     def parse(self, response: scrapy.http.Response):
-         for chapter in response.css("a.j-chapter-link"):
-             # There are other tags nested around the title text,
-             # so //text() is needed to collect every text node
-             title = chapter.xpath("..//text()").extract()
-             if not title:  # extract() returns a list, never None
-                 self.log("No title found!")
-                 continue
-             # The extracted pieces include whitespace like "\n        ",
-             # so strip() each one first, then drop the empty strings
-             title = [t.strip() for t in title]
-             yield {
-                 "title": [t for t in title if t != ""][0],
-                 "url": response.urljoin(chapter.css("::attr(data-hreflink)").extract_first()),
-             }
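If the selectors ever stop matching (the site can change its markup at any time), they are easy to check interactively with scrapy shell; the a.j-chapter-link and data-hreflink names here are simply the ones used in the spider above:
- scrapy shell "https://www.mkzhan.com/211692/"
- >>> response.css("a.j-chapter-link")[:3]
- >>> response.css("a.j-chapter-link::attr(data-hreflink)").extract_first()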
3. Run the chapter spider and export the results to JSON:
- scrapy crawl chapter -o ch.json
An excerpt of the scraped ch.json:
- [
- {"title": "\u7b2c553\u8bdd \u6838\u5fc3\u533a2", "url": "https://www.mkzhan.com/211692/903557.html"},
- {"title": "\u7b2c552\u8bdd \u6838\u5fc3\u533a1", "url": "https://www.mkzhan.com/211692/903553.html"},
- {"title": "\u7b2c551\u8bdd \u9053\u6b492", "url": "https://www.mkzhan.com/211692/902374.html"},
- {"title": "\u7b2c550\u8bdd \u9053\u6b491", "url": "https://www.mkzhan.com/211692/902375.html"},
- {"title": "\u7b2c549\u8bdd \u82cf\u91922", "url": "https://www.mkzhan.com/211692/901306.html"},
- …
- ]
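The \uXXXX sequences are just ASCII-escaped Chinese (the first title decodes to 第553话 核心区2). If you would rather have readable UTF-8 in the output file, Scrapy's built-in FEED_EXPORT_ENCODING setting does it:
- # in mkz\settings.py
- FEED_EXPORT_ENCODING = "utf-8"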
4. Write the image spider, which reads ch.json and downloads every page of every chapter:
- # in spiders\images.py
- import scrapy
- import json
- import os
- class ImageSpider(scrapy.Spider):
-     name = "images"
-     def start_requests(self):
-         with open("ch.json", 'r') as f:
-             chapters_list = json.load(f)
-         for chapter in chapters_list:
-             # Carry the chapter title along in scrapy.Request's meta dict;
-             # see the Scrapy docs for the details of meta
-             yield scrapy.Request(chapter["url"], callback=self.parse, meta={"title": chapter["title"]})
-     def img_parse(self, response):
-         # Write the raw image bytes to the path chosen in parse()
-         with open(response.meta["path"], 'wb') as f:
-             f.write(response.body)
-     def parse(self, response):
-         title = response.meta["title"]
-         img_tags = response.xpath('//div[@class="rd-article__pic hide"]')
-         page_ids = []
-         image_urls = {}
-         for tag in img_tags:
-             # Sort the pages by their data-page_id attribute
-             page_id = int(tag.xpath('./@data-page_id').extract_first())
-             page_ids.append(page_id)
-             image_urls[page_id] = response.urljoin(tag.xpath('./img/@data-src').extract_first())
-         page_ids.sort()
-         # Create the chapter's folder; makedirs with exist_ok=True also
-         # creates images\ itself and ignores folders that already exist
-         os.makedirs(os.path.join("images", title), exist_ok=True)
-         for page_id in page_ids:
-             url = image_urls[page_id]
-             # File layout: images\<title>\<page_id>.jpg
-             # naming the files by page_id keeps the downloaded pages in order
-             yield scrapy.Request(url, callback=self.img_parse,
-                                  meta={"path": os.path.join("images", title, str(page_id) + ".jpg")})
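Run it the same way as the first spider, from the project root and after step 3 has produced ch.json:
- scrapy crawl images
One caveat (my own note, not from the original post): naming files str(page_id) + ".jpg" sorts lexicographically in most file browsers, so 10.jpg lands before 2.jpg; zero-padding the name, e.g. str(page_id).zfill(3) + ".jpg", would keep the pages in true reading order.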
5. Scraping results:
[screenshots of the downloaded images, attached in the original post]