|

楼主 |
发表于 2020-3-25 21:07:19
|
显示全部楼层
- import re
- import os
- import json
- import time
- import requests
- import jsonpath
- from openpyxl import Workbook, load_workbook
# Interactive start-up: greet the user and collect the search keyword and the
# accepted price range. These statements run at import time and define the
# module-level names ``good``, ``low_price`` and ``high_price`` that the
# ``__main__`` entry point passes to the spider.


def _ask_price(prompt):
    """Prompt repeatedly until the user types a whole number; return it as int.

    A plain ``int(input(...))`` aborts the whole program with a traceback on
    the first typo, so invalid input is reported and asked for again instead.
    """
    while True:
        try:
            return int(input(prompt))
        except ValueError:
            print('请输入整数价格')


print('欢迎使用淘宝查询助理')
print('本程序旨在帮助您更容易挑选到您所需要的商品')
good = input('请输入要查询的商品名称:')
low_price = _ask_price('请输入您所能接受的商品最低价格:')
high_price = _ask_price('请输入您所能接受的商品最高价格:')
# A range entered as (max, min) would produce an empty price filter, so
# normalise it to (min, max).
if low_price > high_price:
    low_price, high_price = high_price, low_price
print('请稍等片刻,数据正在获取中')
print('{0}.xlsx正在创建中,您可以在本程序同目录下找到'.format(good))
print('请等待程序关闭后再打开表格,谢谢配合!')
print('祝您购物愉快!\n')
print('-------------------------------------------------------------------')
class TaoBaoSpider(object):
    """Crawl Taobao search results for one keyword into ``<key>.xlsx``.

    The spider queries the search endpoint sorted by sales, walks every result
    page, fetches per-item detail and review statistics, and appends one row
    per item to the workbook, saving after every row. Items whose ID is
    already present in an existing workbook are skipped.

    Requires Python 3.6+ (the code uses f-strings) plus the third-party
    ``requests``, ``jsonpath`` and ``openpyxl`` packages imported by the file.
    """

    USER_AGENT = (
        'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/603.1.30'
        ' (KHTML, like Gecko) Version/10.0 Mobile/14E277 Safari/602.1'
    )
    # NOTE(review): hard-coded browser session cookie. It will expire and
    # should not live in source control; consider loading it from the
    # environment or a config file.
    SEARCH_COOKIE = (
        't=85db5e7cb0133f23f29f98c7d6955615; '
        'cna=3uklFEhvXUoCAd9H6ovaVLTG; '
        'isg=BM3NGT0Oqmp6Mg4qfcGPnvDY3-pNqzF2joji8w9SGWTYBu241_taTS6UdFrF3Rk0; '
        'miid=983575671563913813; '
        'thw=cn; '
        'um=535523100CBE37C36EEFF761CFAC96BC4CD04CD48E6631C3112393F438E181DF6B34171FDA66B2C2CD43AD3E795C914C34A100CE538767508DAD6914FD9E61CE; '
        '_cc_=W5iHLLyFfA%3D%3D; '
        'tg=0; '
        'enc=oRI1V9aX5p%2BnPbULesXvnR%2BUwIh9CHIuErw0qljnmbKe0Ecu1Gxwa4C4%2FzONeGVH9StU4Isw64KTx9EHQEhI2g%3D%3D; '
        'hng=CN%7Czh-CN%7CCNY%7C156; '
        'mt=ci=0_0; '
        'hibext_instdsigdipv2=1; '
        'JSESSIONID=EC33B48CDDBA7F11577AA9FEB44F0DF3'
    )
    SEARCH_REFERER = (
        'https://s.taobao.com/search?q=%E5%86%B0%E7%AE%B1&imgfile=&js=1'
        '&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20190804'
        '&ie=utf8&sort=sale-desc&bcoffset=0&p4ppushleft=%2C48&s=1364'
    )
    # Column titles of the output sheet; every appended row follows this order.
    SHEET_HEADER = [
        '序号', '商品ID', '店铺ID', '商城类型', '标题', '链接', '价格', '月销', '地区', '店铺名称',
        '店铺等级', '评论总数', '店铺评分', '相关描述', '图片评论', '追加评论', '好评', '中评', '差评', '与描述相符',
    ]
    REQUEST_TIMEOUT = 15       # seconds before a single HTTP request is abandoned
    MAX_CAPTCHA_RETRIES = 10   # attempts per search page before giving up
    PAGE_SIZE = 44             # step of the ``s`` offset between search pages
    DETAIL_DELAY = 5           # seconds slept before each item-detail request
    # Extracts the JSON argument from a JSONP response such as ``cb({...})``.
    _JSONP_RE = re.compile(r'\((.+}?)\)')

    def __init__(self, key, start_price, end_price):
        """Open (or create) ``<key>.xlsx`` and remember the search parameters.

        Args:
            key: search keyword; also used as the workbook file name.
            start_price: lower bound of the price filter.
            end_price: upper bound of the price filter.
        """
        self.headers = {'User-Agent': self.USER_AGENT}
        self.start_price = start_price
        self.end_price = end_price
        self.count = 1
        self.rank_dict = self._build_rank_table()
        self.key = key
        self.file_path = f"{self.key}.xlsx"
        self.shop_set = set()
        if os.path.exists(self.file_path):
            self.wb = load_workbook(self.file_path)
            self.ws = self.wb.active
            # Column 2 holds the item ID (written with a trailing tab); collect
            # the IDs so already-saved items are not fetched again.
            for row in range(2, self.ws.max_row + 1):
                value = self.ws.cell(row=row, column=2).value
                if value is None:
                    continue  # blank row: nothing to remember
                self.shop_set.add(str(value).strip())
        else:
            self.wb = Workbook()
            self.ws = self.wb.active
            self.ws.append(list(self.SHEET_HEADER))

    @staticmethod
    def _build_rank_table():
        """Return the credit-level table: '0' -> '无', '1'..'20' -> '1心'..'5金冠'."""
        table = {'0': '无'}
        for tier_index, tier in enumerate(('心', '钻', '蓝冠', '金冠')):
            for step in range(1, 6):
                table[str(tier_index * 5 + step)] = f'{step}{tier}'
        return table

    @staticmethod
    def _dig(obj, *path, default=''):
        """Follow ``path`` through nested dicts/lists; return ``default`` if any step is missing."""
        for step in path:
            try:
                obj = obj[step]
            except (KeyError, IndexError, TypeError):
                return default
        return obj

    @classmethod
    def _load_jsonp(cls, text):
        """Decode the JSON argument of a JSONP response body.

        Raises:
            ValueError: if ``text`` has no parenthesised payload or the payload
                is not valid JSON.
        """
        match = cls._JSONP_RE.search(text)
        if match is None:
            raise ValueError('response is not a JSONP payload')
        return json.loads(match.group(1))

    @classmethod
    def _parse_rate_summary(cls, text):
        """Turn a ``detailCommon.htm`` JSONP body into the seven review columns.

        The result is ordered like the last seven entries of ``SHEET_HEADER``:
        impression tags, picture reviews, additional reviews, good, neutral,
        bad, and the "matches description" value.
        """
        data = cls._load_jsonp(text)['data']
        tags = ','.join(f"{tag['title']}({tag['count']})" for tag in data['impress'])
        counts = data['count']
        return [
            tags,
            counts['pic'],
            counts['additional'],
            counts['good'],
            counts['normal'],
            counts['bad'],
            data['correspond'],
        ]

    @staticmethod
    def _classify_listing(url, item_id):
        """Return ``(marketplace label, canonical item URL)`` for a search-result URL."""
        if 'tmall' in url or 'mclick' in url:
            return '天猫', 'https://detail.tmall.com/item.htm?id={}'.format(item_id)
        if 'taobao' in url:
            return '淘宝', 'http://item.taobao.com/item.htm?id={}'.format(item_id)
        return '未知', url

    def _build_search_url(self, offset):
        """Build the search URL for the result offset ``offset`` with every value percent-encoded."""
        from urllib.parse import quote, urlencode

        query = [
            ('rec_type', 1),
            ('q', self.key),
            ('sort', 'sale-desc'),
            ('s', offset),
            ('ajax', 'true'),
            ('data-value', 396),
            ('filter', 'reserve_price[{},{}]'.format(self.start_price, self.end_price)),
        ]
        return 'https://s.taobao.com/search?' + urlencode(query, quote_via=quote)

    def get_img_add_key(self, item_id, user_id, s_type):
        """Fetch the review statistics of one item.

        Args:
            item_id: item ID.
            user_id: seller ID.
            s_type: marketplace label; accepted for compatibility, the same
                fields are read for every marketplace.

        Returns:
            The seven review columns described in ``_parse_rate_summary``.

        Raises:
            ValueError, KeyError: if the response is not the expected JSONP.
        """
        url = f'https://rate.taobao.com/detailCommon.htm?auctionNumId={item_id}&userNumId={user_id}'
        # Copy the shared headers so the per-item Referer does not leak into
        # every later request made with ``self.headers``.
        headers = dict(self.headers)
        headers['Referer'] = f'https://item.taobao.com/item.htm?id={item_id}'
        response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        return self._parse_rate_summary(response.text)

    def get_comment_key(self, item_id):
        """Return the review tag cloud of an item as ``'tag(count),tag(count),...'``."""
        url = f'https://rate.tmall.com/listTagClouds.htm?itemId={item_id}'
        response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        info = self._load_jsonp(response.text)
        return ','.join(f"{tag['tag']}({tag['count']})" for tag in info['tags']['tagClouds'])

    def get_info(self, data):
        """Fetch the details of one search result and append it to the workbook.

        Args:
            data: one entry of the search response's ``auctions`` list.

        Items already recorded, items whose detail request fails, and items
        without a shop name are skipped. The workbook is saved after each row.
        """
        title = data['raw_title']
        city = data['item_loc']
        price = data['view_price']
        item_id = data['nid']
        url = data['detail_url']
        if item_id in self.shop_set:
            print(title, '存在')
            self.count += 1
            return
        user_id = data['user_id']
        s_type, url = self._classify_listing(url, item_id)
        time.sleep(self.DETAIL_DELAY)  # throttle between detail requests
        x_url = 'https://h5api.m.taobao.com/h5/mtop.taobao.detail.getdetail/6.0/?data=%7B%22itemNumId%22%3A%22{}%22%7D'.format(
            item_id)
        try:
            x_data = requests.get(x_url, headers=self.headers, timeout=self.REQUEST_TIMEOUT).json()['data']
        except (requests.RequestException, ValueError, KeyError, TypeError):
            print(title, '详情获取失败')
            return
        shop_name = self._dig(x_data, 'seller', 'shopName')
        if shop_name == '':
            # Checked before the review request so no call is wasted on
            # listings that will be discarded anyway.
            print('闲鱼?')
            return
        rank = self.rank_dict.get(str(self._dig(x_data, 'seller', 'creditLevel')), '无')
        try:
            promo = json.loads(self._dig(x_data, 'apiStack', 0, 'value', default='{}'))
        except (ValueError, TypeError):
            promo = {}
        month_sales = self._dig(promo, 'item', 'sellCount')
        # Total number of reviews.
        comment_num = self._dig(x_data, 'rate', 'totalCount')
        # Shop scores, formatted as "title:score,title:score".
        evaluates = self._dig(x_data, 'seller', 'evaluates', default=[]) or []
        shop_source = ','.join(f"{entry['title']}:{entry['score']}" for entry in evaluates)
        try:
            line = self.get_img_add_key(item_id, user_id, s_type)
        except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
            # Best effort, like the other optional fields: keep the row and
            # leave the seven review columns blank.
            print(title, '评论统计获取失败')
            line = [''] * 7
        # The IDs get a trailing tab, presumably so spreadsheet tools keep the
        # long numbers as text. TODO confirm.
        item = [str(self.count), item_id + '\t', user_id + '\t', s_type, title, url, price,
                month_sales, city, shop_name, rank, comment_num, shop_source]
        item += line
        print(item)
        self.ws.append(item)
        self.wb.save(self.file_path)
        self.count += 1
        self.shop_set.add(item_id)

    def _fetch_search_page(self, session, url):
        """Return the decoded JSON of one search page, or ``None`` after too many failures.

        A response containing a ``url`` key is treated as a captcha redirect.
        Captcha solving is not implemented, so the request is simply retried
        after a short pause, at most ``MAX_CAPTCHA_RETRIES`` times.
        """
        for _ in range(self.MAX_CAPTCHA_RETRIES):
            try:
                req = session.get(url, timeout=self.REQUEST_TIMEOUT).json()
            except (requests.RequestException, ValueError) as error:
                print('搜索请求失败', error)
                time.sleep(1)
                continue
            if isinstance(req, dict) and 'url' not in req:
                return req
            if isinstance(req, dict):
                print('验证码', req['url'])
            time.sleep(1)
        print('验证码次数过多,已停止抓取')
        return None

    def _save_auctions(self, req):
        """Run ``get_info`` on every auction listed in one search response."""
        auctions = self._dig(req, 'mods', 'itemlist', 'data', 'auctions', default=[]) or []
        for data in auctions:
            self.get_info(data)

    def search(self):
        """Walk all result pages for the keyword and price range and record every item."""
        session = requests.session()
        session.headers = {
            'User-Agent': self.USER_AGENT,
            'cookie': self.SEARCH_COOKIE,
            'referer': self.SEARCH_REFERER,
        }
        req = self._fetch_search_page(session, self._build_search_url(0))
        if req is None:
            return
        # jsonpath.jsonpath returns False when nothing matches; fall back to a
        # single page in that case.
        found = jsonpath.jsonpath(req, '$..totalPage')
        page = int(found[0]) if found else 1
        self._save_auctions(req)
        for p in range(1, page):
            url = self._build_search_url(p * self.PAGE_SIZE)
            print(url)
            req = self._fetch_search_page(session, url)
            if req is None:
                return
            self._save_auctions(req)
# Script entry point: build a spider from the values collected by the prompts
# at the top of the file and crawl every result page.
if __name__ == "__main__":
    spider = TaoBaoSpider(key=good, start_price=low_price, end_price=high_price)
    spider.search()
复制代码
每次遇到类似 f"{self.key}.xlsx" 这种写法时,双引号处就会报错。 |
|