uupppo 发表于 2023-1-24 23:28:35

新手练习爬虫,但是报错麻烦大佬帮忙看看代码,不吝赐教

import requests
import json
name=input("请输入关键词:")
a=input("请输入搜索开始日期,如2022-10-10:")
startTime=a+' 00:00:00'
b=input("请输入搜索开始日期,如2022-12-10:")
endTime=b+' 23:59:59'
print("正在为您搜索请稍后...")
url='https://www.cqggzy.com/interface/rest/esinteligentsearch/getFullTextDataNew'
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.61'}
data={
      "token": "",
      "pn": "0",
      "rn": "999",
      "sdt": "",
      "edt": "",
      "wd": "",
      "inc_wd": "",
      "exc_wd": "",
      "fields": "",
      "cnum": "001",
      "sort": "{\"istop\":\"0\",\"ordernum\":\"0\",\"webdate\":\"0\",\"rowid\":\"0\"}",
      "ssort": "",
      "cl": "10000",
      "terminal": "",
      "condition": [
            {
                "fieldName": "categorynum",
                "equal": "004",
                "notEqual": "null",
                "equalList": "null",
                "notEqualList": [
                  "014001018",
                  "004002005",
                  "014001015",
                  "014005014",
                  "014008011"
                ],
                "isLike": "true",
                "likeType": "2"
            },
            {
                "fieldName": "titlenew",
                "equal": name,
                "notEqual": "null",
                "equalList": "null",
                "notEqualList": "null",
                "isLike": "true",
                "likeType": "0"
            }
      ],
      "time": [
            {
                "fieldName": "webdate",
                "startTime": startTime,
                "endTime": endTime
            }
      ],
      "highlights": "",
      "statistics": "null",
      "unionCondition": [],
      "accuracy": "",
      "noParticiple": "1",
      "searchRange": "null",
      "noWd": "true"
}
response=requests.post(url=url,data=data,headers=headers)
id=response.json()
filename=startTime+'至'+endTime+'.json'
fp=open(filename,'w',encoding='utf-8')
json.dump(id,fp=fp,ensure_ascii=False)
print("over!!!")

鱼cpython学习者 发表于 2023-1-24 23:28:36

uupppo 发表于 2023-1-25 19:29
我已经根据大佬的建议优化过一次了,已经尽力,烦请各位大佬再帮我看看

不好意思啊,我忘了说一点,data末尾那个冒号不要加
还有,如果你的name是中文的话,直接传会报错,得在headers里面加一项,还得把data编码成utf-8
还有你的filename,里面包括冒号这种特殊字符,会报错,得把逗号替换成其他符号,我用短横杠,你也可以用其他的
import requests
import json
name = input("请输入关键词:")
a = input("请输入搜索开始日期,如2022-10-10:")
startTime = a + ' 00:00:00'
b = input("请输入搜索开始日期,如2022-12-10:")
endTime = b + ' 23:59:59'
print("正在为您搜索请稍后...")
url = 'https://www.cqggzy.com/interface/rest/esinteligentsearch/getFullTextDataNew'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.61',
    'Content-Type': 'text/text; charset=utf-8'
}
data = r'{"token":"","pn":0,"rn":20,"sdt":"","edt":"","wd":"","inc_wd":"","exc_wd":"","fields":"","cnum":"001","sort":"{\"istop\":\"0\",\"ordernum\":\"0\",\"webdate\":\"0\",\"rowid\":\"0\"}","ssort":"","cl":10000,"terminal":"","condition":[{"fieldName":"categorynum","equal":"004","notEqual":null,"equalList":null,"notEqualList":["014001018","004002005","014001015","014005014","014008011"],"isLike":true,"likeType":2},{"fieldName":"titlenew","equal":"'+name+'","notEqual":null,"equalList":null,"notEqualList":null,"isLike":true,"likeType":0}],"time":[{"fieldName":"webdate","startTime":"'+startTime+'","endTime":"'+endTime+'"}],"highlights":"","statistics":null,"unionCondition":[],"accuracy":"","noParticiple":"1","searchRange":null,"noWd":true}'
response = requests.post(url=url, data=data.encode('utf-8'), headers=headers)
data_id = response.json()
filename = f"{startTime}至{endTime}.json".replace(":", "-")
fp = open(filename, 'w', encoding='utf-8')
json.dump(data_id, fp=fp, ensure_ascii=False)
print("over!!!")

ba21 发表于 2023-1-24 23:53:54

https://www.cqggzy.com/interface/rest/esinteligentsearch/getFullTextDataNew
都无法访问。又如何爬取?

调试一下,获取到的数据一目了然。
response=requests.post(url=url,data=data,headers=headers)
print(response.content)

uupppo 发表于 2023-1-25 00:38:53

ba21 发表于 2023-1-24 23:53
https://www.cqggzy.com/interface/rest/esinteligentsearch/getFullTextDataNew
都无法访问。又如何爬取 ...

大佬,这个确实是post请求的url,萌新求教

uupppo 发表于 2023-1-25 00:41:47

ba21 发表于 2023-1-24 23:53
https://www.cqggzy.com/interface/rest/esinteligentsearch/getFullTextDataNew
都无法访问。又如何爬取 ...

而且这个请求确实有我需要的响应信息。

鱼cpython学习者 发表于 2023-1-25 11:22:16

这个好坑人,它的data是一个字符串而不是json,我第一眼没看出来,后面扒js才看出来
代码如下
import requests
import json
name = input("请输入关键词:")
a = input("请输入搜索开始日期,如2022-10-10:")
startTime = a + ' 00:00:00'
b = input("请输入搜索开始日期,如2022-12-10:")
endTime = b + ' 23:59:59'
print("正在为您搜索请稍后...")
url = 'https://www.cqggzy.com/interface/rest/esinteligentsearch/getFullTextDataNew'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.61',
    'Cookie': "cookie_www=19398923;__jsluid_s=c909e6e6b4e5bd96fcf32a454560ae15;Hm_lvt_3b83938a8721dadef0b185225769572a=1674614068;Hm_lpvt_3b83938a8721dadef0b185225769572a=1674614110",
    'Host': 'www.cqggzy.com',
    'Origin': 'https://www.cqggzy.com',
    'Referer': 'https://www.cqggzy.com/xxhz/transaction_detail.html'}
data = '{"token":"","pn":0,"rn":20,"sdt":"","edt":"","wd":"","inc_wd":"","exc_wd":"","fields":"","cnum":"001","sort":"{\\"istop\\":\\"0\\",\\"ordernum\\":\\"0\\",\\"webdate\\":\\"0\\",\\"rowid\\":\\"0\\"}","ssort":"","cl":10000,"terminal":"","condition":[{"fieldName":"categorynum","equal":"014","notEqual":null,"equalList":null,"notEqualList":["014001018","004002005","014001015","014005014","014008011"],"isLike":true,"likeType":2}],"time":[{"fieldName":"webdate","startTime":"' + \
    startTime + '","endTime":"' + endTime + '"}],"highlights":"","statistics":null,"unionCondition":[],"accuracy":"","noParticiple":"1","searchRange":null,"noWd":true}'
response = requests.post(url=url, data=data, headers=headers)
data_id = response.json()
filename = f"{startTime}至{endTime}.json".replace(":", "-")
fp = open(filename, 'w', encoding='utf-8')
json.dump(data_id, fp=fp, ensure_ascii=False)
print("over!!!")

ba21 发表于 2023-1-25 16:44:37

鱼cpython学习者 发表于 2023-1-25 11:22
这个好坑人,它的data是一个字符串而不是json,我第一眼没看出来,后面扒js才看出来
代码如下

这么说已解决了?恭喜

uupppo 发表于 2023-1-25 18:22:59

鱼cpython学习者 发表于 2023-1-25 11:22
这个好坑人,它的data是一个字符串而不是json,我第一眼没看出来,后面扒js才看出来
代码如下

大佬,我复制了你的代码,但是还是报错,我在想会不会是字符串最后有个:号,但是我加上去了依旧报错

鱼cpython学习者 发表于 2023-1-25 19:04:31

uupppo 发表于 2023-1-25 18:22
大佬,我复制了你的代码,但是还是报错,我在想会不会是字符串最后有个:号,但是我加上去了依旧报错

刚刚试了一下,在data = '...'那里改成data = r'...'就好了
很奇怪,我也不知道刚才为什么可以运行,现在又不行了
import requests
import json
name = input("请输入关键词:")
a = input("请输入搜索开始日期,如2022-10-10:")
startTime = a + ' 00:00:00'
b = input("请输入搜索开始日期,如2022-12-10:")
endTime = b + ' 23:59:59'
print("正在为您搜索请稍后...")
url = 'https://www.cqggzy.com/interface/rest/esinteligentsearch/getFullTextDataNew'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.61',
    'Cookie': "cookie_www=19398923; __jsluid_s=c909e6e6b4e5bd96fcf32a454560ae15; Hm_lvt_3b83938a8721dadef0b185225769572a=1674614068; Hm_lpvt_3b83938a8721dadef0b185225769572a=1674642615",
    'Host': 'www.cqggzy.com',
    'Origin': 'https://www.cqggzy.com',
    'Referer': 'https://www.cqggzy.com/xxhz/transaction_detail.html'}
data = r'{"token":"","pn":0,"rn":20,"sdt":"","edt":"","wd":"","inc_wd":"","exc_wd":"","fields":"","cnum":"001","sort":"{\"istop\":\"0\",\"ordernum\":\"0\",\"webdate\":\"0\",\"rowid\":\"0\"}","ssort":"","cl":10000,"terminal":"","condition":[{"fieldName":"categorynum","equal":"014","notEqual":null,"equalList":null,"notEqualList":["014001018","004002005","014001015","014005014","014008011"],"isLike":true,"likeType":2}],"time":[{"fieldName":"webdate","startTime":"' + startTime + '","endTime":"' + endTime + '"}],"highlights":"","statistics":null,"unionCondition":[],"accuracy":"","noParticiple":"1","searchRange":null,"noWd":true}'
response = requests.post(url=url, data=data, headers=headers)
data_id = response.json()
filename = f"{startTime}至{endTime}.json".replace(":", "-")
fp = open(filename, 'w', encoding='utf-8')
json.dump(data_id, fp=fp, ensure_ascii=False)
print("over!!!")

uupppo 发表于 2023-1-25 19:27:39

鱼cpython学习者 发表于 2023-1-25 19:04
刚刚试了一下,在data = '...'那里改成data = r'...'就好了
很奇怪,我也不知道刚才为什么可以运行,现 ...

大佬,不好意思,我复制运行你的代码还是报错,是我这边的运行环境问题吗?

uupppo 发表于 2023-1-25 19:29:05

我已经根据大佬的建议优化过一次了,已经尽力,烦请各位大佬再帮我看看
import requests
import json
name = input("请输入关键词:")
a = input("请输入搜索开始日期,如2022-10-10:")
startTime = a+' 00:00:00'
b = input("请输入搜索结束日期,如2022-12-10:")
endTime = b+' 23:59:59'
print("正在为您搜索请稍后...")
url = 'https://www.cqggzy.com/interface/rest/esinteligentsearch/getFullTextDataNew'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.61'}
data = '{"token":"","pn":0,"rn":20,"sdt":"","edt":"","wd":"","inc_wd":"","exc_wd":"","fields":"","cnum":"001","sort":"{\\"istop\\":\\"0\\",\\"ordernum\\":\\"0\\",\\"webdate\\":\\"0\\",\\"rowid\\":\\"0\\"}","ssort":"","cl":10000,"terminal":"","condition":[{"fieldName":"categorynum","equal":"004","notEqual":null,"equalList":null,"notEqualList":["014001018","004002005","014001015","014005014","014008011"],"isLike":true,"likeType":2},{"fieldName":"titlenew","equal":"'+name+'","notEqual":null,"equalList":null,"notEqualList":null,"isLike":true,"likeType":0}],"time":[{"fieldName":"webdate","startTime":"'+startTime+'","endTime":"'+endTime+'"}],"highlights":"","statistics":null,"unionCondition":[],"accuracy":"","noParticiple":"1","searchRange":null,"noWd":true}: '
print(data)
response = requests.post(url=url,data=data,headers=headers)
data_id = response.json()
filename = a+'至'+b+'.json'
fp = open(filename,'w',encoding = 'utf-8')
json.dump(data_id,fp = fp,ensure_ascii = False)
print("over!!!")

aabb8899 发表于 2023-1-26 07:03:08

论坛挺热闹的。

uupppo 发表于 2023-1-29 13:41:47

鱼cpython学习者 发表于 2023-1-24 23:28
不好意思啊,我忘了说一点,data末尾那个冒号不要加
还有,如果你的name是中文的话,直接传会报错,得在 ...

谢谢大佬。已经解决了
页: [1]
查看完整版本: 新手练习爬虫,但是报错麻烦大佬帮忙看看代码,不吝赐教