|
发表于 2020-7-22 14:40:36
|
显示全部楼层
回帖奖励 +1 鱼币
import scrapy

from douban.items import DoubanItem


class DoubanSpiderSpider(scrapy.Spider):
    """Spider that walks the Douban movie Top 250 listing pages.

    Crawling starts at ``start_urls`` and follows the "next" pagination
    link found on each page until no such link is present.
    """

    name = 'douban'
    allowed_domains = ['douban.com']
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        """Yield one ``DoubanItem`` per ``<li>`` entry, then the next-page request.

        Args:
            response: The downloaded listing page.

        Yields:
            ``DoubanItem`` objects whose ``rank``, ``name``, ``info`` and
            ``scort`` fields each hold the list of text nodes matched inside
            that entry, followed by a ``scrapy.Request`` for the next page
            when the page contains a next-page link.
        """
        for each in response.xpath('//ol/li'):
            item = DoubanItem()
            # The leading "." makes every query relative to the current <li>.
            # A bare "//" is evaluated against the whole document, which
            # would copy the values of every entry on the page into each item.
            item['rank'] = each.xpath('.//em/text()').extract()
            item['name'] = each.xpath(
                './/div[@class="hd"]/a/span[1]/text()'
            ).extract()
            item['info'] = each.xpath('.//p/text()').extract()
            # NOTE(review): 'scort' looks like a typo for "score", but the key
            # has to match the field declared on DoubanItem, so it is kept.
            item['scort'] = each.xpath(
                './/div[@class="star"]/span[2]/text()'
            ).extract()
            yield item

        # Evaluate the pagination link once; extract_first() returns None
        # when the page has no next link.
        next_url = response.xpath(
            '//span[@class="next"]/a/@href'
        ).extract_first()
        if next_url:
            # urljoin resolves the href against the current page URL, so both
            # relative and absolute hrefs produce a valid request URL.
            yield scrapy.Request(
                response.urljoin(next_url),
                callback=self.parse,
                dont_filter=False,
            )
复制代码 |
|