|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
我想爬取深交所官网上市公司的年报,只要年度报告,但是爬取出来了很多季度报告和报告摘要,求大佬指导,这个代码应该怎么修改才能只爬出年度报告。
import os
import math
import json
import requests
from copy import deepcopy
# Endpoint that lists the announcements disclosed on szse.cn.
URL = 'http://www.szse.cn/api/disc/announcement/annList'

# Headers sent with every listing request; the body is posted as JSON.
HEADER = {
    'Host': 'www.szse.cn',
    'Origin': 'http://www.szse.cn',
    'Referer': 'http://www.szse.cn/disclosure/listed/fixed/index.html',
    'User-Agent': (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
    ),
    'Content-Type': 'application/json',
    'Connection': 'keep-alive',
    'X-Request-Type': 'ajax',
    'X-Requested-With': 'XMLHttpRequest',
}

# Number of announcements requested per page.
PAGE_SIZE = 30

# Template request body; callers copy it and override 'stock', 'seDate'
# and 'pageNum' before posting.
PAYLOAD = {
    'channelCode': ["fixed_disc"],
    'pageNum': 1,
    'pageSize': PAGE_SIZE,
    'seDate': ["", ""],
    'stock': ["000001"],
}

# Prefix joined with an announcement's 'attachPath' to form its download URL.
PDF_URL_PREFIX = 'http://disc.static.szse.cn/download'
# Characters removed from generated file names: '*' (as before) plus the
# other characters that cannot appear in a file name on common file systems.
_FORBIDDEN_FILE_NAME_CHARS = str.maketrans('', '', '\\/:*?"<>|')

# Seconds to wait for the server before giving up on a request.
_REQUEST_TIMEOUT = 30


def _collect_pdf_entries(announcements):
    """Turn one page of announcement records into (file_name, pdf_url) pairs."""
    entries = []
    for item in announcements or []:
        # 'publishTime' looks like "YYYY-MM-DD hh:mm:ss"; keep the date digits only.
        publish_date = item['publishTime'].split()[0].replace('-', '')
        file_name = '_'.join([item['title'], publish_date])
        entries.append((file_name, PDF_URL_PREFIX + item['attachPath']))
    return entries


def _sanitize_file_name(name):
    """Drop characters that are not allowed in a file name."""
    return name.translate(_FORBIDDEN_FILE_NAME_CHARS)


def get_pdf_url(code, begin_date, end_date, big_category_id=None):
    """Return (file_name, pdf_url) pairs for every announcement of a stock.

    :param code: stock code, e.g. '000001'; surrounding whitespace is ignored
    :param begin_date: start of the publication window ('YYYY-MM-DD' or '')
    :param end_date: end of the publication window ('YYYY-MM-DD' or '')
    :param big_category_id: optional report category code, or an iterable of
        codes, sent as the request's 'bigCategoryId' filter. The answer
        quoted later on this page lists '010301' as annual report, '010303'
        as semi-annual, '010305' as first-quarter and '010307' as
        third-quarter. When omitted, no category filter is sent.
    :return: list of (file_name, pdf_url) tuples across all result pages
    """
    payload = deepcopy(PAYLOAD)
    payload['stock'] = [code.strip()]
    payload['seDate'] = [begin_date, end_date]
    if big_category_id:
        if isinstance(big_category_id, str):
            payload['bigCategoryId'] = [big_category_id]
        else:
            payload['bigCategoryId'] = list(big_category_id)

    pdf_urls = []
    page_num = 1
    while True:
        payload['pageNum'] = page_num
        response = requests.post(
            URL,
            data=json.dumps(payload),
            headers=HEADER,
            timeout=_REQUEST_TIMEOUT,
        )
        # Fail loudly on an HTTP error instead of trying to parse an error page.
        response.raise_for_status()
        res = response.json()
        pdf_urls.extend(_collect_pdf_entries(res.get('data')))

        # The total count is reported on every page; stop after the last one.
        page_count = math.ceil((res.get('announceCount') or 0) / PAGE_SIZE)
        if page_num >= page_count:
            break
        page_num += 1
    return pdf_urls


def save_pdf(code, path='./', begin_date='', end_date='', big_category_id=None):
    """Download every matching announcement of a stock into ``path/code``.

    :param code: stock code; surrounding whitespace is ignored
    :param path: directory under which a per-stock folder is created
    :param begin_date: start of the publication window ('YYYY-MM-DD' or '')
    :param end_date: end of the publication window ('YYYY-MM-DD' or '')
    :param big_category_id: optional report category filter, passed through
        to :func:`get_pdf_url` (e.g. '010301' to keep annual reports only)
    """
    code = code.strip()
    pdf_urls = get_pdf_url(code, begin_date, end_date, big_category_id)
    file_path = os.path.join(path, code)
    os.makedirs(file_path, exist_ok=True)

    for file_name, url in pdf_urls:
        extension = url.split('.')[-1]
        # Sanitize only the generated name, never the caller's directory.
        safe_name = _sanitize_file_name('.'.join([file_name, extension]))
        file_full_name = os.path.join(file_path, safe_name)
        with requests.get(url, stream=True, timeout=_REQUEST_TIMEOUT) as rs:
            # Check the status before creating the file so an error page is
            # never saved under a .pdf name.
            rs.raise_for_status()
            with open(file_full_name, "wb") as fp:
                for chunk in rs.iter_content(chunk_size=10240):
                    if chunk:
                        fp.write(chunk)
if __name__ == '__main__':
    # Imported here because the throttling below is the only user of these
    # modules and the file's import block does not provide them.
    import random
    import time

    # Stock codes to download; no padding, so the request and the output
    # folder use the exact code.
    stock_codes = ['300500', '300499']
    for stock_code in stock_codes:
        save_pdf(stock_code, begin_date='2005-12-27', end_date='2019-12-27')
        # Pause 1-2 seconds between stocks to avoid hammering the server.
        time.sleep(random.uniform(1, 2))
本帖最后由 YunGuo 于 2021-1-12 04:14 编辑
- import requests
- import json
- import os
def save_pdf(title, ts, download_url, folder_name):
    """
    Download one report and store it as ``./<folder_name>/<title>_<ts>.pdf``.

    :param title: title of the PDF report
    :param ts: publication date of the report
    :param download_url: URL the report is downloaded from
    :param folder_name: name of the folder the file is written to
    :return: None
    """
    print('正在下载:', title)
    os.makedirs(f'./{folder_name}', exist_ok=True)
    # A path separator inside the title would otherwise be read as a directory.
    safe_title = title.replace('/', '_').replace('\\', '_')
    with requests.get(download_url, stream=True, timeout=30) as res:
        # Check the status first so an error page is never saved as a PDF.
        res.raise_for_status()
        with open(f'./{folder_name}/{safe_title}_{ts}.pdf', 'wb') as f:
            for chunk in res.iter_content(chunk_size=10240):
                if chunk:
                    f.write(chunk)
def get_item(page, big_category_id='010301', se_date=('', ''), page_size=30):
    """
    Request one page of the report listing and download every report on it.

    :param page: page number to request
    :param big_category_id: report type code; 010301: annual report,
        010303: semi-annual report, 010305: first-quarter report,
        010307: third-quarter report
    :param se_date: publication window as (start, end), e.g.
        ("2017-01-01", "2021-01-01"); empty strings leave it unset
    :param page_size: number of reports requested per page
    :return: None
    """
    url = 'http://www.szse.cn/api/disc/announcement/annList'
    headers = {
        "Content-Type": "application/json",
        "Host": "www.szse.cn",
        "Origin": "http://www.szse.cn",
        "Referer": "http://www.szse.cn/disclosure/listed/fixed/index.html",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36",
    }
    data = {
        "bigCategoryId": [big_category_id],  # report type
        "channelCode": ["fixed_disc"],  # fixed value
        "pageNum": page,  # page number
        "pageSize": page_size,  # reports per page
        "seDate": list(se_date),  # publication window
    }
    res = requests.post(url, headers=headers, data=json.dumps(data), timeout=30)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    res.raise_for_status()
    # A response without 'data' is treated as an empty page.
    datas = res.json().get('data') or []
    # The report type id doubles as the output folder name.
    folder_name = ''.join(data.get('bigCategoryId'))
    for info in datas:
        title = info.get('title').replace('*', '')
        ts = info.get('publishTime').split(' ')[0]
        download_url = 'http://disc.static.szse.cn/download' + info.get('attachPath')
        save_pdf(title, ts, download_url, folder_name)
if __name__ == '__main__':
    # Number of listing pages to download.
    total_pages = 10
    footer = '=' * 50
    # NOTE(review): the pasted code lost its indentation; the footer is
    # assumed to be printed once per page, inside the loop - confirm.
    for page in range(1, total_pages + 1):
        header = f'第{page}页'.center(50, '=')
        print(header)
        get_item(page)
        print(footer)
复制代码
|
|