import scrapy
from selenium import webdriver
from wangyiPro.items import WangyiproItem

class WangyiSpider(scrapy.Spider):
    name = 'wangyi'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://news.163.com/']
    model_urls = []  # stores the URLs of the five section pages

    # instantiate a browser object, shared with the downloader middleware
    def __init__(self):
        self.bro = webdriver.Edge(executable_path='D:\\编程练习\\edge_driver\\msedgedriver')

    def parse(self, response):
        li_list = response.xpath('//div[@class="ns_area list"]/ul/li')
        alist = [3, 4, 7, 8, 9]  # indexes of the five target sections
        for a in alist:
            model_url = li_list[a].xpath('./a/@href').extract_first()
            self.model_urls.append(model_url)
        # request each section's page in turn
        for url in self.model_urls:
            yield scrapy.Request(url=url, callback=self.parse_model)

    # the news titles on each section page are loaded dynamically
    def parse_model(self, response):
        div_list = response.xpath('/html/body/div/div[3]/div[4]/div[1]/div/div/ul/li/div/div')
        for div in div_list:
            title = div.xpath('.//div[@class="news_title"]/h3/a/text()').extract_first()
            new_detail_url = div.xpath('.//div[@class="news_title"]/h3/a/@href').extract_first()
            item = WangyiproItem()
            item['title'] = title
            # hand the half-filled item to the detail callback via meta
            yield scrapy.Request(url=new_detail_url, callback=self.detail_parse, meta={'item': item})

    def detail_parse(self, response):
        content = response.xpath('//div[@class="post_text"]//text()').extract()
        content = ''.join(content)  # join the text nodes into one string
        item = response.meta['item']
        item['content'] = content
        yield item

    def closed(self, spider):
        self.bro.quit()
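
The spider imports WangyiproItem and fills in title and content, so items.py presumably defines just those two fields. A minimal sketch, inferred only from the fields assigned above:

# items.py -- inferred from the two fields the spider assigns
import scrapy

class WangyiproItem(scrapy.Item):
    title = scrapy.Field()
    content = scrapy.Field()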
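The comment in parse_model notes that each section page is rendered dynamically, which is why the spider keeps a Selenium browser in self.bro. The post does not show the downloader middleware that actually uses it, so the following is a minimal sketch under the usual Scrapy project layout; the class name WangyiproDownloaderMiddleware and the 2-second sleep are assumptions, not from the original post. The idea: intercept the responses for the five section URLs and swap in the browser-rendered page source.

# middlewares.py -- minimal sketch, not part of the original post
from time import sleep
from scrapy.http import HtmlResponse

class WangyiproDownloaderMiddleware:
    def process_response(self, request, response, spider):
        # only the five dynamically loaded section pages need the browser;
        # everything else (home page, detail pages) passes through untouched
        if request.url in spider.model_urls:
            bro = spider.bro  # reuse the browser created in the spider's __init__
            bro.get(request.url)
            sleep(2)  # crude wait for the JS-rendered news list to appear
            # replace the original response body with the rendered page source
            return HtmlResponse(url=request.url, body=bro.page_source,
                                encoding='utf-8', request=request)
        return response

For this to take effect, the middleware must be enabled under DOWNLOADER_MIDDLEWARES in settings.py, and an item pipeline enabled under ITEM_PIPELINES if the yielded items are to be persisted.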