|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 2022@lif 于 2022-1-25 08:30 编辑
爬取网址https://list.szlcsc.com/catalog/439.html中的所有产品的信息,发现在此页面翻页时网址的url不会变。查看网络请求时发现,页数信息存放在请求 Payload 的表单数据中:
但是我发起POST请求时携带了这些参数,并不能得到这个页面的页面源码。
# Form payload for the product-list endpoint; pageNumber selects the page
# since the page URL itself never changes when paginating.
data = {
    'catalogNodeId': 439,
    'pageNumber': 2,
    'querySortBySign': 0,
    'showOutSockProduct': 1,
    'showDiscountProduct': 1,
    'queryBeginPrice': '',
    'queryEndPrice': '',
    'queryProductArrange': '',
    'queryProductGradePlateId': '',
    'queryProductTypeCode': '',
    'queryParameterValue': '',
    'queryProductStandard': '',
    'querySmtLabel': '',
    'queryReferenceEncap': '',
    'queryProductLabel': '',
    'lastParamName': '',
    'baseParameterCondition': '',
    'parameterCondition': ''
}
session = requests.session()
# BUG FIX: pass the dict directly so requests sends it URL-encoded
# (Content-Type: application/x-www-form-urlencoded), which is what the
# server expects for this form payload. The original
# `data=json.dumps(data)` sent a single JSON string as the body, so the
# server never saw the form fields and returned the wrong content.
response = session.post(lit_url, data=data, headers=headers).content.decode('utf-8')
复制代码 完整代码如下:
- import requests
- from lxml import etree
- import json
- import pandas as pd
- import sys
- # Landing page listing every product category on szlcsc.com.
- url = 'https://list.szlcsc.com/'
- # Minimal request headers; a browser User-Agent avoids trivial bot blocking.
- headers = {
- 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36 Edg/96.0.1054.34'
- }
# Fetch the page source of a URL.
def get_page(url, proxy=None):
    """Return the HTML of `url`, decoded as UTF-8.

    A requests Session is used so that cookies set by the server are
    kept and re-sent on subsequent requests. Decoding `.content` as
    UTF-8 explicitly avoids mojibake in the scraped Chinese text.

    url:   page to fetch.
    proxy: optional requests-style proxies mapping; defaults to None so
           existing call sites (`get_page(url)`) keep working.

    BUG FIX: the original body passed `proxies=proxy` where the global
    `proxy` was never defined anywhere in the posted code, so the very
    first call raised NameError. It is now an optional parameter.
    """
    session = requests.session()
    page_text = session.get(url=url, headers=headers, proxies=proxy).content.decode('utf-8')
    return page_text
# Extract the nodes under an XPath from an HTML document string.
def get_content(text, path):
    """Parse `text` as HTML and return the list of matches for XPath `path`."""
    document = etree.HTML(text)
    return document.xpath(path)
- # Fetch the category landing page once and reuse it for all XPath queries.
- resp = get_page(url)
- # Print the 18 top-level category names (li[1]..li[18] in the sidebar).
- for num in range(1,19):
- Manul_list = get_content(resp,f'/html/body/div[1]/div/div[1]/ul/li[{num}]/div/dl/dt/a[1]/text()')
- Manul = '/'.join(Manul_list)
- print(str(num)+' '+Manul)
- # choose the top-level category to query
- choice = input('please select a number you would like to choice:')
- # Sub-category display names and their catalog-page URLs, in parallel lists.
- lit_list = get_content(resp,f'/html/body/div[1]/div/div[1]/ul/li[{choice}]/div/dl/dd/a[1]/text()')
- lit_url_list = get_content(resp,f'/html/body/div[1]/div/div[1]/ul/li[{choice}]/div/dl/dd/a[1]/@href')
- # Map sub-category name -> URL.
- lit_dict = {}
- for i in range(0,len(lit_list)):
- lit_dict.update({lit_list[i]:lit_url_list[i]})
- # Map menu number -> sub-category name, printing the numbered menu.
- name_dict = {}
- for n in range(1,len(lit_list)+1):
- print(str(n)+' '+lit_list[n-1])
- name_dict.update({n:lit_list[n-1]})
- # choose the specific sub-category to query
- cho = int(input('please select a number you would like to choice:'))
- # Resolve the chosen number to the sub-category's catalog URL.
- lit_url = lit_dict[name_dict[cho]]
- data = {
- 'catalogNodeId': 439,
复制代码 运行代码后选1,然后选18.
得到的却不是我想要的这个页面的页面源码,哪里出现了问题呢?
看看这些东西是不是你要的,这个页面太专业了,有点看不懂
- import requests,json
- # Form payload: the server expects URL-encoded form fields, so this dict is
- # passed to requests.post via data= (NOT json.dumps) below.
- data = {
- 'catalogNodeId': 439,
- 'pageNumber': 2,
- 'querySortBySign': 0,
- 'showOutSockProduct': 1,
- 'showDiscountProduct': 1,
- 'queryBeginPrice':'',
- 'queryEndPrice':'' ,
- 'queryProductArrange':'',
- 'queryProductGradePlateId': '',
- 'queryProductTypeCode': '',
- 'queryParameterValue': '',
- 'queryProductStandard': '',
- 'querySmtLabel': '',
- 'queryReferenceEncap': '',
- 'queryProductLabel':'',
- 'lastParamName': '',
- 'baseParameterCondition': '',
- 'parameterCondition': ''
- }
- # Headers copied from the browser's request: a session cookie plus
- # origin/referer matching the catalog page. NOTE(review): the hard-coded
- # cookie is tied to one browser session and will expire — re-capture it
- # (or obtain one with a requests Session) before reusing this script.
- headers = {
- 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36 Edg/96.0.1054.34',
- "cookie": "acw_tc=da3dd31c16430728279798128e080b8460c74973023561a97ab6c006d3; SID=471d893b-2380-43e5-ba2f-7c8cf55e7592; SID.sig=bPEGcnKdLpFDiL5G5xuKHDFqRnS7DjKR0uoun7RX0Cg; Qs_lvt_290854=1643072833; Qs_pv_290854=4487896694418019300; cpx=1; guidePage=true; noLoginCustomerFlag=929acea54e81e00b7866; noLoginCustomerFlag2=a0c9da953af694050bd3; PRO_NEW_SID=8920f7cd-dc56-4f97-bd00-f3cb9b98596b; computerKey=d87e0a87824fc7096b6a; AGL_USER_ID=85617b33-f9fb-4d6e-9d65-1b1878037a9d; Hm_lvt_e2986f4b6753d376004696a1628713d2=1643072840; Hm_lpvt_e2986f4b6753d376004696a1628713d2=1643072840; show_out_sock_product=1",
- "origin":"https://list.szlcsc.com",
- "referer": "https://list.szlcsc.com/catalog/439.html"
- }
- # POST to the list endpoint itself (not the catalog HTML page): the response
- # is JSON, and productRecordList holds the products for the requested page.
- url="https://list.szlcsc.com/products/list"
- txt=requests.post(url,headers=headers,data=data).text
- js=json.loads(txt)
- print(js["productRecordList"])
复制代码
|
|