import scrapy
from jiandan.items import Jiandanitem


class duan(scrapy.Spider):
    name = "duan_spider"
    allowed_domains = ["jandan.net"]  # must be the registered domain; "jandan" alone would not match jandan.net
    start_urls = ["http://jandan.net/duan"]

    # Override start_requests() to route every request through a proxy
    # def start_requests(self):
    #     for url in self.start_urls:
    #         yield scrapy.Request(url, callback=self.parse, meta={"proxy": "http://14.118.253.22:6666"})

    def parse(self, response):
        sel = scrapy.selector.Selector(response)
        sites = sel.xpath('//div[@class="author"]/strong/text()').extract()
        contents = sel.xpath('//div[@class="text"]/p/text()').extract()
        items = []
        # item = Jiandanitem()  # created once outside the loop, the items cannot be iterated correctly
        self.count = 0
        num = len(sites)
        with open("duanzi.txt", "a", encoding="utf-8") as f:
            for i in range(num):
                item = Jiandanitem()
                self.count += 1
                name = "name" + str(self.count)
                item["name"] = name + sites[i]  # maps to the name field defined in the item
                items.append(item)
                f.write(sites[i] + ":")
                item = Jiandanitem()
                item["content"] = "content" + str(i + 1) + contents[i]
                items.append(item)  # yielded items are echoed on the command line even without print
                f.write(contents[i] + "\n\n")
                print("entry written")
        page = str(response.url)
        with open("duanzi.txt", "a", encoding="utf-8") as f:
            f.write("-----" + page + "-----" + str(num) + "\n")
        # extract()[0] is a string; wrap it in a list because the for loop below expects one
        urls = [sel.xpath('//div[@class="cp-pagenavi"]/a[@title="Older Comments"]/@href').extract()[0]]
        print("next page URL:", urls)
        for url in urls:
            print("about to crawl the next page")
            # "http:" + url and "http://" + url produce the same output here
            yield scrapy.Request("http://" + url, callback=self.parse, dont_filter=True)
            print(response.url)
            # yield response.follow("http:" + url, callback=self.parse, dont_filter=True)
            # parse recurses through the "Older Comments" link to fetch every page
        print("crawl finished")
        # parse() is a generator because of the yield above, so "return items" never
        # reaches Scrapy; yield the items one by one instead
        for item in items:
            yield item
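
The spider imports Jiandanitem from jiandan/items.py, which the post does not show. A minimal item definition consistent with the name and content fields assigned above might look like this (a sketch of the assumed file, not the poster's actual one):

# jiandan/items.py -- assumed sketch with only the two fields the spider uses
import scrapy

class Jiandanitem(scrapy.Item):
    name = scrapy.Field()     # joke author, filled in parse()
    content = scrapy.Field()  # joke text, filled in parse()

With that in place, the spider runs from the project root with "scrapy crawl duan_spider" and appends its output to duanzi.txt.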