|
60鱼币
侄女今年高考,想帮她爬取山东省所有学校各专业的数据。代码写好运行之后,发现一些学校每一年的评论 JSON 数据的 URL 是不一样的,但我没找到规律,所以来求助一下。
举个例子
我发现一些学校的评论 JSON 的 URL 在 2020 年之前以 /37/1/14/ 结尾,2020 年及之后以 /37/3/1570/ 结尾,但是运行的时候发现很多学校并不符合这个规律。应该怎么找规律呢?这些参数是不是可以从哪里获取?
- from time import sleep
- import math
- import requests
- import json
- from fake_useragent import UserAgent
# Fetch school ids (plus a few descriptive fields) from the eol.cn school list.
def get_id(page=3, size=20, province_id=37):
    """Return the schools found on one page of the eol.cn school-list API.

    Args:
        page: Value sent as the ``page`` query parameter. Defaults to 3, the
            page that was previously hard-coded.
        size: Value sent as the ``size`` query parameter (default 20).
        province_id: Value sent as the ``province_id`` query parameter.
            Defaults to 37, the id this script uses for its Shandong crawl.

    Returns:
        A list of dicts, one per school, each holding the keys ``school_id``,
        ``nature_name``, ``city_name``, ``level_name`` and ``name`` copied from
        the API response.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
        KeyError / ValueError: If the response is not the expected JSON shape.
    """
    url = (
        'https://api.eol.cn/web/api/'
        f'?&page={page}&province_id={province_id}&ranktype=&school_type='
        f'&size={size}&uri=apidata/api/gk/school/lists'
    )
    # Without an explicit timeout, requests waits forever on a stalled
    # connection, which would freeze the whole crawl.
    wb_data = requests.get(
        url,
        headers={'User-Agent': str(UserAgent().random)},
        timeout=10,
    ).text
    print(wb_data)
    data = json.loads(wb_data)['data']['item']
    print(data)

    # Keep only the fields the later steps read.
    wanted = ('school_id', 'nature_name', 'city_name', 'level_name', 'name')
    school_id = [{key: item[key] for key in wanted} for item in data]
    print(school_id)
    return school_id
# Build the base URL for each school and year, then fetch its data.
def get_url(link):
    """Crawl every school in ``link`` for each year from 2017 through 2021.

    Args:
        link: Iterable of school dicts as returned by ``get_id``; each must
            contain ``school_id`` and is passed on unchanged to ``get_content``.
    """
    for school in link:
        school_id = school['school_id']
        for year in range(2017, 2022):
            # The trailing path segments differ by year. This split at 2020 is
            # the pattern the author observed; the author notes that many
            # schools do not follow it.
            url = (
                f'https://static-data.gaokao.cn/www/2.0/schoolspecialindex/{year}/{school_id}/37/1/14/'
                if year < 2020
                else f'https://static-data.gaokao.cn/www/2.0/schoolspecialindex/{year}/{school_id}/37/3/1570/'
            )
            print(url)
            get_content(url, school, year)
def _fetch_page(base_url, page_no):
    """Download ``<base_url><page_no>.json`` and return the response text."""
    # NOTE(review): verify=False (kept from the original) disables TLS
    # certificate checking - confirm it is really required for this host.
    return requests.get(
        f'{base_url}{page_no}.json',
        headers={'User-Agent': str(UserAgent().random)},
        timeout=2,
        verify=False,
    ).text


def _page_count(raw_text, page_size=10):
    """Return the number of pages announced by a listing response, or 0."""
    try:
        num_found = int(json.loads(raw_text)['data']['numFound'])
    except (ValueError, KeyError, TypeError):
        # Some schools have no data for a given year; treat that as no pages.
        return 0
    # Divide first and round up afterwards so a partial last page is counted
    # (25 rows -> 3 pages). Truncating before ceil() would drop it.
    return math.ceil(num_found / page_size)


# Append the page number to the base URL and fetch every page of records.
def get_content(url, i, year):
    """Fetch all pages below ``url`` and print one record per listed major.

    Args:
        url: Base URL ending in ``/``; ``<page>.json`` is appended to it.
        i: School dict; ``name``, ``school_id``, ``nature_name`` and
            ``city_name`` are copied into every printed record.
        year: Year stored in every printed record.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
        KeyError / ValueError: If a data page is not the expected JSON shape.
    """
    print(url)
    first_page = _fetch_page(url, 1)
    page = _page_count(first_page)
    print(page)

    for c_page in range(1, page + 1):
        print(url + str(c_page) + '.json')
        # Page 1 was already downloaded to read the total; reuse it instead of
        # sending the same request a second time.
        wb_data = first_page if c_page == 1 else _fetch_page(url, c_page)
        # Throttle so the next request does not follow immediately.
        sleep(6)
        data = json.loads(wb_data)['data']['item']
        for one in data:
            dic = {
                '学校名称': i['name'],
                '专业': one['spname'],
                '层次': one['level1_name'],
                'school_id': i['school_id'],
                '类型': i['nature_name'],
                '城市': i['city_name'],
                '年份': year,
                '最低分': one['min'],
                '最低排名': one['min_section'],
                '选课': one['sp_info'],
            }
            print(dic)
            print("插入完成")
            # Persistence hook, currently disabled: col.insert_one(dic)
# Script entry point: list the schools, then crawl each of them.
if __name__ == '__main__':
    # For a quick single-school run, pass a hand-written list instead, e.g.
    # [{'school_id': 2132, 'nature_name': '公办', 'city_name': '东营市', 'level_name': '普通本科', 'name': '山东石油化工学院'}]
    get_url(get_id())
复制代码
代码应该是可以运行的,不过我已经被封了 |
|