求助:爬取这个网站的新闻链接和标题时爬不到内容,只能爬到导航栏。
from urllib.request import urlopen

import pandas as pd
from bs4 import BeautifulSoup
def get_response(url, tag):
    """Fetch *url* and collect the text and ``href`` of every *tag* element.

    Args:
        url: Address of the page to download.
        tag: Tag name handed to ``BeautifulSoup.find_all`` (e.g. ``'a'``).

    Returns:
        A list of dicts, one per matching element whose text is longer than
        5 characters. Each dict has the keys ``'title'`` (the element text)
        and ``'href'`` (the element's ``href`` attribute, ``None`` if the
        element has none).
    """
    # Close the HTTP response deterministically instead of leaking it.
    with urlopen(url) as response:
        soup = BeautifulSoup(response.read(), "lxml")

    data = []
    for element in soup.find_all(tag):
        text = element.text
        # Short texts are treated as navigation-bar entries and skipped.
        if len(text) <= 5:
            continue
        data.append({'title': text, 'href': element.get('href')})
    return data
# Page to scrape and the tag name whose elements should be collected.
url, tag = 'http://jnxxq.jinan.gov.cn/col/col39742/index.html', 'a'
# Run the scraper and show the collected rows.
data = get_response(url=url, tag=tag)
print(data)
解决的话加微信,发一盒烟钱的红包。(本帖最后由 suchocolate 于 2021-8-30 17:22 编辑)
页面把新闻链接和标题信息放到了<datastore>,用re可以取出来。具体存放位置看一下生成的r.txt。
from urllib import request
import re
def main():
    """Download the news column page, dump it to ``r.txt`` and print matches.

    The page is requested with a custom ``User-Agent`` header, decoded as
    UTF-8, written unchanged to ``r.txt`` in the current directory, and then
    searched with a regular expression. Every captured group is printed on
    its own line.
    """
    headers = {'User-Agent': 'Firefox'}
    url = 'http://jnxxq.jinan.gov.cn/col/col39742/index.html'
    req = request.Request(url, headers=headers)
    # Close the HTTP response as soon as the body has been read.
    with request.urlopen(req) as r:
        txt = r.read().decode('utf-8')
    # Keep a copy of the raw page so its markup can be inspected by hand.
    with open('r.txt', 'w', encoding='utf-8') as f:
        f.write(txt)
    # Capture the text between "</span><a " and "'target" for each match.
    # NOTE(review): presumably these anchors sit in data embedded in the
    # page rather than in the rendered HTML; confirm against r.txt.
    result = re.findall(r'</span><a (.*?)\'target', txt)
    for i in result:
        print(i)


if __name__ == "__main__":
    main()
页:
[1]