# Last edited by wrpython on 2024-7-5 02:47
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Step 1: read every link on today's listing page and store it in a list.

# Browser-like User-Agent sent with every request made by this script.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0'}

# requests has no default timeout, so without one a stalled connection would
# hang the script forever; raise_for_status() stops early on an HTTP error
# instead of parsing an error page as if it were the listing.
response = requests.get(
    'https://www.sec.gov/cgi-bin/current?q1=0&q2=2&q3=144',
    headers=headers,
    timeout=30,
)
response.raise_for_status()
content = response.text

soup = BeautifulSoup(content, 'html.parser')
test = soup.find_all('a')

ls = []  # links read from the listing page
# Only every second anchor is wanted, starting with the first one, and the
# last anchor on the page is left out (same selection as the original
# counter-based loop).
# NOTE(review): presumably each listing row carries two anchors and the final
# anchor is not a filing link - confirm against the live page.
for anchor in test[:-1][::2]:
    href = anchor.get('href')
    if not href:
        # An anchor without an href has nothing to follow.
        continue
    # urljoin copes with root-relative ('/path'), relative and absolute hrefs;
    # plain concatenation produced 'https://www.sec.gov//path'.
    link = urljoin('https://www.sec.gov/', href)
    print(link)
    ls.append(link)

n1 = len(ls)  # total number of links collected
# Step 2: open every link collected in `ls` and pull one further link out of
# each page.
ls2 = []  # new links read from the pages listed in ls
total = len(ls)  # denominator for the progress output

for n2, page_url in enumerate(ls, start=1):  # n2: 1-based position in ls
    # Pause between requests so the server is not hit in a tight loop.
    time.sleep(1)

    try:
        response2 = requests.get(page_url, headers=headers, timeout=30)
        response2.raise_for_status()
    except requests.RequestException as err:
        # One unreachable page should not end the whole crawl; report it and
        # move on. ls2 therefore may hold fewer entries than ls.
        print('request failed for %s: %s' % (page_url, err))
        print('%d/%d' % (n2, total))
        continue

    content2 = response2.text
    soup2 = BeautifulSoup(content2, 'html.parser')
    print(soup2)  # full page dump, kept from the original script

    test2 = soup2.find_all('tr')
    # The wanted link is the first <a> inside the second table row. Guard both
    # lookups: a page with fewer than two rows, or a row without a link, used
    # to crash with IndexError / TypeError.
    row_anchor = test2[1].a if len(test2) > 1 else None
    href2 = row_anchor.get('href') if row_anchor is not None else None
    if href2:
        link2 = urljoin('https://www.sec.gov/', href2)
        ls2.append(link2)
    else:
        print('no link found in the second table row of %s' % page_url)

    print('%d/%d' % (n2, total))