OP | Posted 2018-12-13 16:10:19
I did get the categorization itself working; what's giving me a real headache is how to put the article pages into the right sub-category. I wrote it with Scrapy's CrawlSpider:
import os

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from xinlang.items import XinlangItem  # adjust to your project's items module


class XlSpider(CrawlSpider):
    name = 'xl'
    # allowed_domains takes bare domains, not URLs:
    # allowed_domains = ['sina.com.cn']
    start_urls = ['https://www.sina.com.cn/']

    rules = (
        # homepage: read the nav and create the category folders
        Rule(LinkExtractor(allow=r'https://www.sina.com.cn/'),
             callback='parse_parent', follow=False),
        # nav sub-sections (skip the first li, which is the parent category)
        Rule(LinkExtractor(restrict_xpaths='//div[@class="main-nav"]'
                                           '/div[@class!="nav-mod-1 nav-w nav-hasmore"]'
                                           '/ul/li[position()>1]'),
             follow=True),
        # article pages end in .shtml; raw string so \. is a literal dot
        Rule(LinkExtractor(allow=r'https:.+\.shtml'),
             follow=True, callback='parse_article'),
    )

    def parse_parent(self, response):
        a = response.xpath('//div[@class="main-nav"]'
                           '/div[@class!="nav-mod-1 nav-w nav-hasmore"]/ul')
        for each in a:
            item = XinlangItem()  # fresh item per nav block
            li1title = each.xpath('./li[1]/a/b/text()').extract_first()
            li2title = each.xpath('./li[2]/a/text()').extract_first()
            li3title = each.xpath('./li[3]/a/text()').extract_first()
            li4title = each.xpath('./li[4]/a/text()').extract_first()
            li1url = each.xpath('./li[1]/a/@href').extract_first()
            li2url = each.xpath('./li[2]/a/@href').extract_first()
            li3url = each.xpath('./li[3]/a/@href').extract_first()
            li4url = each.xpath('./li[4]/a/@href').extract_first()
            # one folder per parent category, one per sub-category;
            # exist_ok=True also fills in missing sub-folders when the
            # parent folder already exists
            pname = 'e:/小爬爬成果/新浪/' + li1title
            for sub in (li2title, li3title, li4title):
                os.makedirs(os.path.join(pname, sub), exist_ok=True)
            item['parent_title'] = li1title
            item['sub_title'] = ','.join((li2title, li3title, li4title))
            item['parent_url'] = li1url
            item['son_url'] = ','.join((li2url, li3url, li4url))
            yield item

    def parse_article(self, response):
        item = XinlangItem()
        item['head'] = response.xpath('//h1[@class="main-title"]/text()').extract_first()
        # join all paragraph texts into one string
        item['content'] = ''.join(
            response.xpath('//div[@id="article"]/p/text()').extract())
        yield item
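For reference, here is a sketch of the items.py this spider assumes; the field names come straight from the assignments above, but the module path and comments are guesses, not the real project file:

import scrapy


class XinlangItem(scrapy.Item):
    parent_title = scrapy.Field()   # top-level nav category
    sub_title = scrapy.Field()      # comma-joined sub-categories
    parent_url = scrapy.Field()
    son_url = scrapy.Field()
    head = scrapy.Field()           # article headline
    content = scrapy.Field()        # joined paragraph text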
That's what I've got so far. Why don't you try writing it too? It'll all be clear once you read the code.
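One idea I've been toying with for the sub-category problem (just a sketch, untested against sina.com.cn, and a plain Spider rather than the CrawlSpider above): instead of letting the Rule-based extractor lose the context, yield the section requests yourself and carry the category names in Request.meta, so parse_article knows which folder the page belongs to. The nav XPaths are copied from the code above; the .shtml link pattern in parse_section is an assumption.

import scrapy


class XlMetaSpider(scrapy.Spider):
    name = 'xl_meta'
    start_urls = ['https://www.sina.com.cn/']

    def parse(self, response):
        nav = response.xpath('//div[@class="main-nav"]'
                             '/div[@class!="nav-mod-1 nav-w nav-hasmore"]/ul')
        for ul in nav:
            parent = ul.xpath('./li[1]/a/b/text()').extract_first()
            for li in ul.xpath('./li[position()>1]'):
                sub = li.xpath('./a/text()').extract_first()
                url = li.xpath('./a/@href').extract_first()
                if url:
                    # meta travels with the request and is readable
                    # later as response.meta in the callback
                    yield response.follow(url, callback=self.parse_section,
                                          meta={'parent': parent, 'sub': sub})

    def parse_section(self, response):
        # assumption: article links on a section page end in .shtml;
        # hand the category down to every article request
        for href in response.xpath('//a/@href').re(r'https?://.+?\.shtml'):
            yield scrapy.Request(href, callback=self.parse_article,
                                 meta={'parent': response.meta.get('parent'),
                                       'sub': response.meta.get('sub')})

    def parse_article(self, response):
        # the category decided at the nav level is available here, so a
        # pipeline can write the file into <parent>/<sub> directly
        yield {
            'parent_title': response.meta.get('parent'),
            'sub_title': response.meta.get('sub'),
            'head': response.xpath('//h1[@class="main-title"]/text()').extract_first(),
            'content': ''.join(
                response.xpath('//div[@id="article"]/p/text()').extract()),
        }

This only tags links found directly on the section pages; if the spider follows pagination deeper, the same meta would have to be passed along at each hop.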