# ok,这次加了循环,可以爬以前所有的新闻了
# 可以给一个最佳答案吗
from bs4 import BeautifulSoup
import requests,chardet
def get(url, timeout=10):
    """Fetch one news-list page and print each headline with its publish time.

    The page is downloaded, decoded as UTF-8 and parsed with the
    ``html.parser`` backend. Headlines are the non-empty ``title``
    attributes of the page's ``<a>`` tags; publish times are the text of
    ``<span class="time">`` elements. The two lists are paired by position.

    Args:
        url: Address of the page to download.
        timeout: Seconds handed to ``requests.get`` so a stalled server
            cannot block the crawl indefinitely.

    Returns:
        None. The result is written to standard output, one line per
        headline/time pair.

    Raises:
        requests.RequestException: If the download fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    # Force UTF-8 instead of relying on the encoding guessed from the headers.
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, "html.parser")

    # Anchors without a title (or with an empty one) are not headlines.
    titles = [link["title"] for link in soup.find_all("a") if link.get("title")]
    times = soup.find_all("span", attrs={"class": "time"})

    # NOTE(review): pairing is purely positional; this assumes every titled
    # anchor on the page has its time span at the same index -- confirm
    # against the real markup.
    # zip() stops at the shorter list, so a headline with no matching time
    # span is skipped rather than raising IndexError.
    for index, (title, time_tag) in enumerate(zip(titles, times)):
        print("学校新闻版块第", index, "条标题名称:\t", title, " 发布时间 ", time_tag.text)
# Start with the index page of the school-news section.
get("https://news.wtu.edu.cn/xxxw.htm")

# Then walk the numbered pages from 294 down to 1; page 0 is never requested.
for page in range(294, 0, -1):
    get("https://news.wtu.edu.cn/xxxw/" + str(page) + ".htm")