马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 鸷爱之殇 于 2020-7-10 11:13 编辑
哪位大神能帮忙看一下吗?为什么这段代码一直报错?
这个代码是想利用爬虫获取国家数字植物标本馆中物种的采集地信息。
# -*- coding: UTF-8 -*-
import requests
import json
import pandas as pd
import time
### Search for a plant: query the first endpoint to obtain the specimen IDs
def getPlantINFO(name="大叶相思"):
    """Collect every search-result page for one taxon into a single DataFrame.

    The search endpoint is queried repeatedly with ``offset`` advanced by 30
    each time. Paging stops when the decoded JSON payload has exactly three
    top-level entries or when its ``rows`` entry is empty.

    Args:
        name: Taxon name sent as the ``taxonName`` query parameter.

    Returns:
        A DataFrame built from the concatenated ``rows`` of all pages, or an
        empty DataFrame when no page yielded rows. The result is also printed.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If a payload that does not have three entries lacks ``rows``.
    """
    # NOTE(review): a reply in this thread reports that the request only works
    # once "/cvh6/view" is removed from the path - confirm before changing.
    url = "http://www.cvh.ac.cn/cvh6/view/controller/search/spms.php"
    # NOTE(review): the cookie is a captured browser session and has probably
    # expired; confirm whether the server needs it at all.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36",
        "Cookie": "_pk_ses.1.2cf1=1; _pk_ses.43.2cf1=1; PHPSESSID=6623lkaj0679eo25lmr0okris7; ASPSESSIONIDCCCBDDQC=LFKBJDADMAHAFGMDBOOINCDA; _pk_id.43.2cf1=ae54e87ea20fbba7.1591011679.2.1591174487.1591173253.; _pk_id.1.2cf1=8fad1f8b96701ecc.1591011679.2.1591174487.1591173253.",
    }
    page_size = 30
    offset = 0
    pages = []
    while True:
        # Let requests build the query string so names containing spaces,
        # "&" or non-ASCII characters are escaped correctly.
        response = requests.get(
            url,
            params={"taxonName": name, "offset": offset},
            headers=headers,
            timeout=30,  # without a timeout a stalled server hangs forever
        )
        response.raise_for_status()
        json_data = json.loads(response.text)

        # Presumably a three-entry payload is the server's "no more results"
        # answer - TODO confirm against the API.
        if len(json_data) == 3:
            break

        rows = json_data["rows"]
        if not rows:
            # An empty page would otherwise make the loop spin forever.
            break

        pages.append(pd.DataFrame(rows))
        offset += page_size

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    ALL = pd.concat(pages) if pages else pd.DataFrame()
    print(ALL)
    return ALL
### Use the IDs to fetch each specimen's record from the second endpoint
def getSampleINFO(plantInfo, delay=0.5):
    """Fetch the detail record of every specimen listed in ``plantInfo``.

    Args:
        plantInfo: DataFrame with a ``collectionID`` column, as returned by
            the search step. An empty DataFrame yields an empty result.
        delay: Seconds to sleep after each request. Defaults to the original
            0.5 s; a reply in this thread reports that 0.5 s is too short and
            requests tend to fail, so callers may want a larger value.

    Returns:
        A DataFrame with one row per specimen, built from the ``rows`` entry
        of each response, or an empty DataFrame when there is nothing to fetch.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If a non-empty ``plantInfo`` has no ``collectionID`` column
            or a response has no ``rows`` entry.
    """
    # A search with no hits produces a frame without any columns; treat that
    # as "nothing to fetch" instead of failing on the missing column.
    if plantInfo.empty:
        return pd.DataFrame()

    # NOTE(review): a reply in this thread reports that the request only works
    # once "/cvh6/view" is removed from the path - confirm before changing.
    url = "http://www.cvh.ac.cn/cvh6/view/controller/search/spms_record.php"
    # NOTE(review): the cookie is a captured browser session and has probably
    # expired; confirm whether the server needs it at all.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36",
        "Cookie": "_pk_ses.1.2cf1=1; _pk_ses.43.2cf1=1; PHPSESSID=6623lkaj0679eo25lmr0okris7; ASPSESSIONIDCCCBDDQC=LFKBJDADMAHAFGMDBOOINCDA; _pk_id.43.2cf1=ae54e87ea20fbba7.1591011679.2.1591174487.1591173253.; _pk_id.1.2cf1=8fad1f8b96701ecc.1591011679.2.1591174487.1591173253.",
    }

    records = []
    for specimen_id in plantInfo["collectionID"].tolist():
        # params= also accepts non-string IDs, which "url + id" could not.
        response = requests.get(
            url,
            params={"id": specimen_id},
            headers=headers,
            timeout=30,  # without a timeout a stalled server hangs forever
        )
        response.raise_for_status()
        json_data = json.loads(response.text)
        # "rows" is wrapped in a list so it becomes a single DataFrame row.
        records.append(pd.DataFrame([json_data["rows"]]))
        time.sleep(delay)  # pause after each specimen query

    if not records:
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(records)
def main(name=("大叶相思",)):
    """Scrape each plant in ``name`` and write its specimen records to CSV.

    For every plant the search results are fetched, the per-specimen records
    are downloaded, and the result is written to ``"E:/test" + plant + ".csv"``.
    A completion message is printed and the loop sleeps 2 s after each plant.

    Args:
        name: Iterable of plant names. A single string is treated as one
            plant name rather than being iterated character by character.
    """
    # The default is an immutable tuple so it cannot be mutated between calls.
    if isinstance(name, str):
        name = [name]

    # NOTE(review): the prefix is concatenated directly with the plant name,
    # so files are named "E:/test<plant>.csv" - confirm whether a directory
    # separator was intended.
    outputPath = "E:/test"
    for plant in name:
        plant_info = getPlantINFO(name=plant)
        samples = getSampleINFO(plant_info)
        samples.to_csv(outputPath + plant + ".csv", index=False)
        print(plant + "查询结束")
        time.sleep(2)  # pause 2 s after finishing each plant
# Add the plants you want to query to this list.
listOfPlants = ["大叶相思", "阿拉伯婆婆纳"]

# Only start scraping when executed as a script, so that importing this
# module does not trigger network requests and file writes.
if __name__ == "__main__":
    main(listOfPlants)
两个网址的路径(path)里都要去掉 /cvh6/view;另外,循环访问的间隔时间 0.5 秒太短,容易获取失败。
|