|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
给个思路,我想不出怎么弄
- <script>
- window.PAGE_START_LOAD_TIME = new Date().getTime();
- window.LOGIN_UMID_LOAD = true;
- window.viewConfig = {"api":{"smsLoginApi":"/newlogin/sms/login.do?appName=taobao&fromSite=0","loginApi":"/newlogin/login.do?appName=taobao&fromSite=0","smsLoginRegAp......
- window.viewData = {"appEntrance":"taobao_pc","appName":"taobao","awscCdn":"//g.alicdn.com","countryAreaConfig":{"countryList":[{"areaName":"中国大陆","checkKey":"^(86){.......
- window._lang = {"error-login-mobile-empty":"请输入手机号码","view-mobile-country-area-popup-cancel-btn-title":"取消","view-pwdlogin-mobile-loginid-title":"手机号","view-qrcodelog.....
复制代码
下面源码是学习其他作者模仿写的
这里的 url = 'https://list.tmall.com/search_product.htm?q=' + item 如果再加上 + '&s=' + str(num) 就不行,就会出现上面的问题。
- import requests
- import re
- import bs4
def getHtmlText(url):
    """Fetch *url* and return the decoded response body.

    Returns an empty string when the request fails (connection error,
    timeout, invalid URL, or an HTTP error status), so callers can treat
    a failed fetch as an empty page.

    NOTE: a successful (2xx) response is returned as-is; this function
    does not detect whether the site served a login page instead of the
    requested content.
    """
    header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4209.400'}
    try:
        # Give up if the server does not respond within 30 seconds.
        res = requests.get(url, headers=header, timeout=30)
        # Raise requests.HTTPError for 4xx/5xx status codes.
        res.raise_for_status()
        # Decode with the encoding guessed from the body rather than the
        # one declared in the HTTP headers.
        res.encoding = res.apparent_encoding
        return res.text
    except requests.RequestException:
        # Only network/HTTP failures are turned into an empty result;
        # KeyboardInterrupt and programming errors now propagate.
        return ''
def parsPage(html, report):
    """Collect [price, title] pairs from a result page into *report*.

    The page is parsed with BeautifulSoup; every ``<p class="productPrice">``
    is paired positionally with a ``<p class="productTitle">``. Each tag is
    rendered back to markup and stripped of tags, spaces, newlines and tabs
    before being appended to *report* (which is modified in place).
    """
    soup = bs4.BeautifulSoup(html, 'html.parser')
    # Matches any HTML tag, or a single space / newline / tab character.
    strip_pattern = re.compile('<[^>]+>| |\n|\t')
    price_tags = soup.find_all('p', class_='productPrice')
    title_tags = soup.find_all('p', class_='productTitle')
    report.extend(
        [strip_pattern.sub('', str(price_tag)),
         strip_pattern.sub('', str(title_tag))]
        for price_tag, title_tag in zip(price_tags, title_tags)
    )
def printGoodList(report, temp):
    """Print one page of results as a table, then yield.

    This is a generator. The header row is printed once, when iteration
    starts. Every ``next()`` call prints all rows of *report* and then
    suspends; the generator never finishes on its own.

    Args:
        report: sequence of rows; element 0 of each row is the price and
            element 1 is the product title.
        temp: zero-based page index. Rows are numbered starting at
            ``temp * 60 + 1``, i.e. the numbering assumes 60 rows per page.

    Yields:
        None, after the rows have been printed.
    """
    tplt = '{:4}\t{:8}\t{:4}'
    print(tplt.format('序号', '价格', '商品名称'))
    first_number = temp * 60 + 1
    while True:
        for number, row in enumerate(report, start=first_number):
            print(tplt.format(str(number), row[0], row[1]))
        yield
def main():
    """Fetch Tmall search results for a fixed keyword and print them.

    For each requested page the search URL is printed, the page is
    downloaded and dumped to stdout, price/title pairs are extracted,
    and the resulting rows are printed both raw and as a table.
    """
    # The interactive prompts (item to search for / number of pages) are
    # disabled; both values are hard-coded below.
    item = '书包'
    page = '1'
    total_pages = int(page)
    for page_index in range(total_pages):
        rows = []
        # Result offset of this page (60 rows per page). Currently unused:
        # appending '&s=' + str(offset) to the URL is reported not to work.
        offset = page_index * 60
        url = 'https://list.tmall.com/search_product.htm?q=' + item
        print(url)
        page_html = getHtmlText(url)
        print(page_html)
        parsPage(page_html, rows)
        try:
            print(rows)
            # Advance the generator once to print the table for this page.
            next(printGoodList(rows, page_index))
        except StopIteration:
            break


if __name__ == '__main__':
    main()
复制代码
淘宝没这么简单爬的吧,应该要登录、cookie之类,爬第二页就没拿到数据,出来的是登录页面
用selenium先登录,再获取数据试试吧
|
|