新手练习爬虫,但是报错麻烦大佬帮忙看看代码,不吝赐教
import requestsimport json
name=input("请输入关键词:")
a=input("请输入搜索开始日期,如2022-10-10:")
startTime=a+' 00:00:00'
b=input("请输入搜索开始日期,如2022-12-10:")
endTime=b+' 23:59:59'
print("正在为您搜索请稍后...")
url='https://www.cqggzy.com/interface/rest/esinteligentsearch/getFullTextDataNew'
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.61'}
data={
"token": "",
"pn": "0",
"rn": "999",
"sdt": "",
"edt": "",
"wd": "",
"inc_wd": "",
"exc_wd": "",
"fields": "",
"cnum": "001",
"sort": "{\"istop\":\"0\",\"ordernum\":\"0\",\"webdate\":\"0\",\"rowid\":\"0\"}",
"ssort": "",
"cl": "10000",
"terminal": "",
"condition": [
{
"fieldName": "categorynum",
"equal": "004",
"notEqual": "null",
"equalList": "null",
"notEqualList": [
"014001018",
"004002005",
"014001015",
"014005014",
"014008011"
],
"isLike": "true",
"likeType": "2"
},
{
"fieldName": "titlenew",
"equal": name,
"notEqual": "null",
"equalList": "null",
"notEqualList": "null",
"isLike": "true",
"likeType": "0"
}
],
"time": [
{
"fieldName": "webdate",
"startTime": startTime,
"endTime": endTime
}
],
"highlights": "",
"statistics": "null",
"unionCondition": [],
"accuracy": "",
"noParticiple": "1",
"searchRange": "null",
"noWd": "true"
}
response=requests.post(url=url,data=data,headers=headers)
id=response.json()
filename=startTime+'至'+endTime+'.json'
fp=open(filename,'w',encoding='utf-8')
json.dump(id,fp=fp,ensure_ascii=False)
print("over!!!")
uupppo 发表于 2023-1-25 19:29
我已经根据大佬的建议优化过一次了,已经尽力,烦请各位大佬再帮我看看
不好意思啊,我忘了说一点,data末尾那个冒号不要加
还有,如果你的name是中文的话,直接传会报错,得在headers里面加一项,还得把data编码成utf-8
还有你的filename,里面包括冒号这种特殊字符,会报错,得把逗号替换成其他符号,我用短横杠,你也可以用其他的
import requests
import json
name = input("请输入关键词:")
a = input("请输入搜索开始日期,如2022-10-10:")
startTime = a + ' 00:00:00'
b = input("请输入搜索开始日期,如2022-12-10:")
endTime = b + ' 23:59:59'
print("正在为您搜索请稍后...")
url = 'https://www.cqggzy.com/interface/rest/esinteligentsearch/getFullTextDataNew'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.61',
'Content-Type': 'text/text; charset=utf-8'
}
data = r'{"token":"","pn":0,"rn":20,"sdt":"","edt":"","wd":"","inc_wd":"","exc_wd":"","fields":"","cnum":"001","sort":"{\"istop\":\"0\",\"ordernum\":\"0\",\"webdate\":\"0\",\"rowid\":\"0\"}","ssort":"","cl":10000,"terminal":"","condition":[{"fieldName":"categorynum","equal":"004","notEqual":null,"equalList":null,"notEqualList":["014001018","004002005","014001015","014005014","014008011"],"isLike":true,"likeType":2},{"fieldName":"titlenew","equal":"'+name+'","notEqual":null,"equalList":null,"notEqualList":null,"isLike":true,"likeType":0}],"time":[{"fieldName":"webdate","startTime":"'+startTime+'","endTime":"'+endTime+'"}],"highlights":"","statistics":null,"unionCondition":[],"accuracy":"","noParticiple":"1","searchRange":null,"noWd":true}'
response = requests.post(url=url, data=data.encode('utf-8'), headers=headers)
data_id = response.json()
filename = f"{startTime}至{endTime}.json".replace(":", "-")
fp = open(filename, 'w', encoding='utf-8')
json.dump(data_id, fp=fp, ensure_ascii=False)
print("over!!!")
https://www.cqggzy.com/interface/rest/esinteligentsearch/getFullTextDataNew
都无法访问。又如何爬取?
调试一下,获取到的数据一目了然。
response=requests.post(url=url,data=data,headers=headers)
print(response.content) ba21 发表于 2023-1-24 23:53
https://www.cqggzy.com/interface/rest/esinteligentsearch/getFullTextDataNew
都无法访问。又如何爬取 ...
大佬,这个确实是post请求的url,萌新求教 ba21 发表于 2023-1-24 23:53
https://www.cqggzy.com/interface/rest/esinteligentsearch/getFullTextDataNew
都无法访问。又如何爬取 ...
而且这个请求确实有我需要的响应信息。 这个好坑人,它的data是一个字符串而不是json,我第一眼没看出来,后面扒js才看出来
代码如下
import requests
import json
name = input("请输入关键词:")
a = input("请输入搜索开始日期,如2022-10-10:")
startTime = a + ' 00:00:00'
b = input("请输入搜索开始日期,如2022-12-10:")
endTime = b + ' 23:59:59'
print("正在为您搜索请稍后...")
url = 'https://www.cqggzy.com/interface/rest/esinteligentsearch/getFullTextDataNew'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.61',
'Cookie': "cookie_www=19398923;__jsluid_s=c909e6e6b4e5bd96fcf32a454560ae15;Hm_lvt_3b83938a8721dadef0b185225769572a=1674614068;Hm_lpvt_3b83938a8721dadef0b185225769572a=1674614110",
'Host': 'www.cqggzy.com',
'Origin': 'https://www.cqggzy.com',
'Referer': 'https://www.cqggzy.com/xxhz/transaction_detail.html'}
data = '{"token":"","pn":0,"rn":20,"sdt":"","edt":"","wd":"","inc_wd":"","exc_wd":"","fields":"","cnum":"001","sort":"{\\"istop\\":\\"0\\",\\"ordernum\\":\\"0\\",\\"webdate\\":\\"0\\",\\"rowid\\":\\"0\\"}","ssort":"","cl":10000,"terminal":"","condition":[{"fieldName":"categorynum","equal":"014","notEqual":null,"equalList":null,"notEqualList":["014001018","004002005","014001015","014005014","014008011"],"isLike":true,"likeType":2}],"time":[{"fieldName":"webdate","startTime":"' + \
startTime + '","endTime":"' + endTime + '"}],"highlights":"","statistics":null,"unionCondition":[],"accuracy":"","noParticiple":"1","searchRange":null,"noWd":true}'
response = requests.post(url=url, data=data, headers=headers)
data_id = response.json()
filename = f"{startTime}至{endTime}.json".replace(":", "-")
fp = open(filename, 'w', encoding='utf-8')
json.dump(data_id, fp=fp, ensure_ascii=False)
print("over!!!")
鱼cpython学习者 发表于 2023-1-25 11:22
这个好坑人,它的data是一个字符串而不是json,我第一眼没看出来,后面扒js才看出来
代码如下
这么说已解决了?恭喜 鱼cpython学习者 发表于 2023-1-25 11:22
这个好坑人,它的data是一个字符串而不是json,我第一眼没看出来,后面扒js才看出来
代码如下
大佬,我复制了你的代码,但是还是报错,我在想会不会是字符串最后有个:号,但是我加上去了依旧报错 uupppo 发表于 2023-1-25 18:22
大佬,我复制了你的代码,但是还是报错,我在想会不会是字符串最后有个:号,但是我加上去了依旧报错
刚刚试了一下,在data = '...'那里改成data = r'...'就好了
很奇怪,我也不知道刚才为什么可以运行,现在又不行了
import requests
import json
name = input("请输入关键词:")
a = input("请输入搜索开始日期,如2022-10-10:")
startTime = a + ' 00:00:00'
b = input("请输入搜索开始日期,如2022-12-10:")
endTime = b + ' 23:59:59'
print("正在为您搜索请稍后...")
url = 'https://www.cqggzy.com/interface/rest/esinteligentsearch/getFullTextDataNew'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.61',
'Cookie': "cookie_www=19398923; __jsluid_s=c909e6e6b4e5bd96fcf32a454560ae15; Hm_lvt_3b83938a8721dadef0b185225769572a=1674614068; Hm_lpvt_3b83938a8721dadef0b185225769572a=1674642615",
'Host': 'www.cqggzy.com',
'Origin': 'https://www.cqggzy.com',
'Referer': 'https://www.cqggzy.com/xxhz/transaction_detail.html'}
data = r'{"token":"","pn":0,"rn":20,"sdt":"","edt":"","wd":"","inc_wd":"","exc_wd":"","fields":"","cnum":"001","sort":"{\"istop\":\"0\",\"ordernum\":\"0\",\"webdate\":\"0\",\"rowid\":\"0\"}","ssort":"","cl":10000,"terminal":"","condition":[{"fieldName":"categorynum","equal":"014","notEqual":null,"equalList":null,"notEqualList":["014001018","004002005","014001015","014005014","014008011"],"isLike":true,"likeType":2}],"time":[{"fieldName":"webdate","startTime":"' + startTime + '","endTime":"' + endTime + '"}],"highlights":"","statistics":null,"unionCondition":[],"accuracy":"","noParticiple":"1","searchRange":null,"noWd":true}'
response = requests.post(url=url, data=data, headers=headers)
data_id = response.json()
filename = f"{startTime}至{endTime}.json".replace(":", "-")
fp = open(filename, 'w', encoding='utf-8')
json.dump(data_id, fp=fp, ensure_ascii=False)
print("over!!!")
鱼cpython学习者 发表于 2023-1-25 19:04
刚刚试了一下,在data = '...'那里改成data = r'...'就好了
很奇怪,我也不知道刚才为什么可以运行,现 ...
大佬,不好意思,我复制运行你的代码还是报错,是我这边的运行环境问题吗? 我已经根据大佬的建议优化过一次了,已经尽力,烦请各位大佬再帮我看看
import requests
import json
name = input("请输入关键词:")
a = input("请输入搜索开始日期,如2022-10-10:")
startTime = a+' 00:00:00'
b = input("请输入搜索结束日期,如2022-12-10:")
endTime = b+' 23:59:59'
print("正在为您搜索请稍后...")
url = 'https://www.cqggzy.com/interface/rest/esinteligentsearch/getFullTextDataNew'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.61'}
data = '{"token":"","pn":0,"rn":20,"sdt":"","edt":"","wd":"","inc_wd":"","exc_wd":"","fields":"","cnum":"001","sort":"{\\"istop\\":\\"0\\",\\"ordernum\\":\\"0\\",\\"webdate\\":\\"0\\",\\"rowid\\":\\"0\\"}","ssort":"","cl":10000,"terminal":"","condition":[{"fieldName":"categorynum","equal":"004","notEqual":null,"equalList":null,"notEqualList":["014001018","004002005","014001015","014005014","014008011"],"isLike":true,"likeType":2},{"fieldName":"titlenew","equal":"'+name+'","notEqual":null,"equalList":null,"notEqualList":null,"isLike":true,"likeType":0}],"time":[{"fieldName":"webdate","startTime":"'+startTime+'","endTime":"'+endTime+'"}],"highlights":"","statistics":null,"unionCondition":[],"accuracy":"","noParticiple":"1","searchRange":null,"noWd":true}: '
print(data)
response = requests.post(url=url,data=data,headers=headers)
data_id = response.json()
filename = a+'至'+b+'.json'
fp = open(filename,'w',encoding = 'utf-8')
json.dump(data_id,fp = fp,ensure_ascii = False)
print("over!!!")
论坛挺热闹的。 鱼cpython学习者 发表于 2023-1-24 23:28
不好意思啊,我忘了说一点,data末尾那个冒号不要加
还有,如果你的name是中文的话,直接传会报错,得在 ...
谢谢大佬。已经解决了
页:
[1]