|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
- # -*- coding: utf-8 -*-
- import scrapy
- from amazoncar import items
- from amazoncar.items import AmazoncarItem
- import re
class AmazonSpider(scrapy.Spider):
    """Crawl Amazon's car listings, one search category per body style.

    ``start_requests`` enumerates every result page of eleven hard-coded
    body-style category URLs; ``parse`` yields one ``AmazoncarItem`` per
    product result on each page.
    """

    name = 'amazon'
    allowed_domains = ['amazon.com']
    # start_urls = ['https://www.amazon.com/b/ref=sr_aj?node=10677469011&bbn=10677469011&ajr=0']
    # ^ used once, by hand, to discover the per-body-style category URLs below.

    def start_requests(self):
        """Yield one request per result page for each body-style category."""
        urls = [
            'https://www.amazon.com/s/ref=lp_10677469011_nr_p_n_feature_four_bro_0?fst=as%3Aoff&rh=n%3A10677469011%2Cp_n_feature_four_browse-bin%3A11710192011&ie=UTF8',
            'https://www.amazon.com/s/ref=lp_10677469011_nr_p_n_feature_four_bro_1?fst=as%3Aoff&rh=n%3A10677469011%2Cp_n_feature_four_browse-bin%3A11710193011&ie=UTF8',
            'https://www.amazon.com/s/ref=lp_10677469011_nr_p_n_feature_four_bro_2?fst=as%3Aoff&rh=n%3A10677469011%2Cp_n_feature_four_browse-bin%3A11710199011&ie=UTF8',
            'https://www.amazon.com/s/ref=lp_10677469011_nr_p_n_feature_four_bro_3?fst=as%3Aoff&rh=n%3A10677469011%2Cp_n_feature_four_browse-bin%3A11710198011&ie=UTF8',
            'https://www.amazon.com/s/ref=lp_10677469011_nr_p_n_feature_four_bro_4?fst=as%3Aoff&rh=n%3A10677469011%2Cp_n_feature_four_browse-bin%3A11710200011&ie=UTF8',
            'https://www.amazon.com/s/ref=lp_10677469011_nr_p_n_feature_four_bro_5?fst=as%3Aoff&rh=n%3A10677469011%2Cp_n_feature_four_browse-bin%3A11710195011&ie=UTF8',
            'https://www.amazon.com/s/ref=lp_10677469011_nr_p_n_feature_four_bro_6?fst=as%3Aoff&rh=n%3A10677469011%2Cp_n_feature_four_browse-bin%3A11710201011&ie=UTF8',
            'https://www.amazon.com/s/ref=lp_10677469011_nr_p_n_feature_four_bro_7?fst=as%3Aoff&rh=n%3A10677469011%2Cp_n_feature_four_browse-bin%3A11710197011&ie=UTF8',
            'https://www.amazon.com/s/ref=lp_10677469011_nr_p_n_feature_four_bro_8?fst=as%3Aoff&rh=n%3A10677469011%2Cp_n_feature_four_browse-bin%3A11710194011&ie=UTF8',
            'https://www.amazon.com/s/ref=lp_10677469011_nr_p_n_feature_four_bro_9?fst=as%3Aoff&rh=n%3A10677469011%2Cp_n_feature_four_browse-bin%3A11710196011&ie=UTF8',
            'https://www.amazon.com/s/ref=lp_10677469011_nr_p_n_feature_four_bro_10?fst=as%3Aoff&rh=n%3A10677469011%2Cp_n_feature_four_browse-bin%3A11710202011&&ie=UTF8',
        ]
        # Result-page counts observed for each category (same order as urls).
        pages = [26, 71, 20, 20, 16, 9, 14, 21, 140, 85, 19]
        # pages = [3] * 11  # small counts for a quick smoke test

        # BUG FIX: the original used ``i`` for both nested loops; use distinct
        # names so the category index cannot be confused with the page number.
        for cat_idx, url in enumerate(urls):
            for page in range(1, pages[cat_idx] + 1):
                yield scrapy.Request(url='{}&page={}'.format(url, page),
                                     callback=self.parse)

    def parse(self, response):
        """Extract one AmazoncarItem per product on a search-result page."""
        # Sanity check: a genuine result page exposes the pagination widget.
        # Amazon sometimes answers with a navigation/robot page instead; log
        # the URL and bail out rather than crash on the index lookups below.
        maxpage = response.xpath('//*[@id="pagn"]/span[6]').re(r'>.*<')
        if not maxpage:
            self.logger.warning('Not a result page (possible redirect): %s',
                                response.url)
            return

        # "Showing X-Y of Z results": results on this page are result_<n>
        # for n in [X-1, Y).
        counts = response.xpath('//*[@id="s-result-count"]/text()').re(r'\d{1,3}')
        startnum = int(counts[0]) - 1
        endnum = int(counts[1])

        for n in range(startnum, endnum):
            # BUG FIX: create a fresh item for every result. The original
            # instantiated one AmazoncarItem outside the loop and mutated it,
            # so every yielded item shared (and could overwrite) field state.
            amzitem = AmazoncarItem()
            amzitem['body_Style'] = response.xpath(
                '//*[@id="s-result-count"]/span/span/text()').extract_first('无显示')
            amzitem['link'] = response.xpath(
                '//*[@id="result_{}"]/div/div[3]/div[1]/a/@href'.format(n)).extract_first('无显示')
            maker = response.xpath(
                '//*[@id="result_{}"]/div/div[3]/div[2]/span[2]/text()'.format(n)).extract_first('无显示')
            amzitem['maker'] = maker

            # The anchor's title attribute reads '<year> <maker> <model>';
            # pull it out via the title="..." regex and split year/model.
            try:
                title = response.xpath(
                    '//*[@id="result_{}"]/div/div[3]/div[1]/a'.format(n)
                ).re(r'title=".*" h')[0][7:-3]
                year, remainder = title.split(' ', 1)
                amzitem['year'] = year
                amzitem['model'] = remainder.split(maker + ' ')[1]
            except (IndexError, ValueError):
                # Layout did not match (sponsored slot, missing title, ...);
                # skip this result instead of aborting the whole page.
                self.logger.debug('Could not parse title for result_%s on %s',
                                  n, response.url)
                continue

            amzitem['reviews'] = response.xpath(
                '//*[@id="result_{}"]/div/div[6]/a/text()'.format(n)).extract_first('无显示')
            # Rating text looks like '4.5 out of 5 stars'; keep the number.
            rating = response.xpath(
                '//*[@id="result_{}"]/div/div[6]/span/span/a/i[1]/span/text()'.format(n)
            ).extract_first()
            amzitem['rating'] = rating.split(' ')[0] if rating else '无显示'
            yield amzitem
复制代码 amazon.py
- from openpyxl import Workbook
class AmazoncarPipeline(object):
    """Write every scraped item as one row of an Excel workbook.

    The workbook is created when the spider opens and saved to
    ``amazoncar.xlsx`` when it closes.
    """

    def open_spider(self, spider):
        """Create the workbook and emit the header row."""
        self.wb = Workbook()
        # A new Workbook comes with one sheet already active; use it.
        self.ws = self.wb.active
        self.ws.append(['bodyStyle', 'Link', 'Maker', 'Year', 'Model',
                        'Reviews', 'Rating'])

    def process_item(self, item, spider):
        """Append the item's fields as one worksheet row, then pass it on."""
        row = [item[key] for key in ('body_Style', 'link', 'maker', 'year',
                                     'model', 'reviews', 'rating')]
        self.ws.append(row)
        return item

    def close_spider(self, spider):
        """Persist the workbook to disk."""
        self.wb.save('amazoncar.xlsx')
复制代码 pipelines.py
用这个爬虫爬取亚马逊页面时,开始没有问题,但一个网址爬到大约 41 页之后就开始报错。目测估计是亚马逊返回了其他导航页面,而不是我要爬的结果页面,导致索引超出范围。
有办法能得到导航网址时重新爬取当前页,不进入下一界面么?
另外我已经设置了 User-Agent 头,爬虫抓取间隔是 3 秒一次。请问返回错误页面的原因可能是什么,该怎么检测?麻烦大神指教一下。
你先确认亚马逊商品的页面大概有多少页。
或者你直接获取41页,看一下status_code是多少。
又或者说亚马逊会在41页的时候会突然改变页面或url逻辑来反爬虫。
|
|