|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
历时一个月零10天,功夫不负有心人
用Scrapy框架写的爬取淘宝商品的爬虫,开放搜索哦
爬取的商品名有英文有中文,不知道怎么把中文提取出来
大神勿喷
- # -*- coding: utf-8 -*-
- import scrapy
- import json
- from Taobao.items import TaobaoItem
- # url编码
- from urllib.parse import quote
- # url解码
- from urllib.parse import unquote
class TaobaoSpider(scrapy.Spider):
    """Spider that saves Taobao search results for a user-supplied keyword.

    For every result page it appends one CSV row per product (name, price,
    shop) to ``<keyword>.csv`` and yields a ``TaobaoItem`` carrying the
    product's picture URL.

    Pages 1..``page`` (inclusive) are fetched: page 1 comes from
    ``start_urls`` and the remaining pages are scheduled once, from the
    callback that handles page 1.
    """

    name = 'taobao'
    # allowed_domains = ['taobao.com/']

    # NOTE: both prompts run when the class body is executed (i.e. when the
    # module is imported), not when the crawl starts.  ``page`` stays a
    # string; it is converted with int() where a number is needed, so
    # non-numeric input raises ValueError in parse().
    page = input('请输入打印页数:')
    Quote = input('请输入要搜索的商品名')

    # Search endpoint with two placeholders: page number, then the
    # URL-encoded keyword.  The token/pid/pvid values are hard-coded
    # (presumably captured from a browser session -- they may expire).
    url_template = (
        'https://ai.taobao.com/search/getItem.htm'
        '?_tb_token_=e3d450b1e33e&__ajax__=1'
        '&pid=mm_33793785_3431230_471812702&unid=&clk1='
        '&page={}&pageSize=60'
        '&pvid=200_11.224.194.119_358_1541678031255'
        '&squareFlag=&sourceId=search&ppathName=&supportCod=&city=&ppath='
        '&dc12=&pageNav=false&itemAssurance=&fcatName=&price=&cat=&from='
        '&tmall=&key={}&fcat=&ppage=0&debug=false&maxPageSize=200&sort='
        '&exchange7=&custAssurance=&postFree=&npx=50&location='
    )

    # Always start at page 1.  quote()'s second positional parameter is
    # ``safe`` (not the encoding); UTF-8 is already the default encoding.
    start_urls = [url_template.format(1, quote(Quote, safe=''))]

    # Not used by the visible code; kept so existing references keep working.
    a = 1

    def parse(self, response):
        """Store one result page in the CSV and yield its picture items.

        Args:
            response: JSON response of the search endpoint.  The product
                list is read from ``['result']['auction']``; each entry
                must provide ``description``, ``nick``, ``realPrice`` and
                ``origPicUrl``.

        Yields:
            One ``TaobaoItem`` per product with ``origPicUrl`` set, and --
            only while handling page 1 -- one ``scrapy.Request`` for each
            remaining page up to ``self.page``.

        Raises:
            KeyError: if the response lacks one of the expected keys.
            ValueError: if ``self.page`` is not a valid integer.
        """
        import csv  # local import: the file's import block is left untouched

        # Follow-up requests carry their page number in meta; the initial
        # request generated from start_urls has none and is page 1.
        page_no = response.meta.get('page', 1)
        auctions = json.loads(response.body)['result']['auction']

        # Page 1 (re)creates the file and writes the header; later pages
        # append, so earlier pages are no longer overwritten.  The file is
        # fully written and closed before anything is yielded, so the handle
        # is never held open across a suspension of this generator.
        mode = 'w' if page_no == 1 else 'a'
        with open('{}.csv'.format(self.Quote), mode,
                  encoding='utf-8', newline='') as csv_file:
            # csv.writer quotes fields containing commas or quotes, which
            # plain string formatting did not.
            writer = csv.writer(csv_file, lineterminator='\n')
            if page_no == 1:
                writer.writerow(['商品名', '价格', '店名'])
            for entry in auctions:
                writer.writerow(
                    [entry['description'], entry['realPrice'], entry['nick']]
                )

        for entry in auctions:
            item = TaobaoItem()
            # Prefix the scheme; assumes the API returns protocol-relative
            # URLs ("//host/path") -- TODO confirm.
            item['origPicUrl'] = 'https:' + entry['origPicUrl']
            yield item

        print('=' * 40 + '第' + str(page_no) + '页下载完毕' + '=' * 40)

        # Schedule the remaining pages exactly once, using the keyword the
        # user actually entered.
        if page_no == 1:
            keyword = quote(self.Quote, safe='')
            for n in range(2, int(self.page) + 1):
                yield scrapy.Request(
                    self.url_template.format(n, keyword),
                    self.parse,
                    meta={'page': n},
                )
复制代码
|
评分
-
查看全部评分
|