|

楼主 |
发表于 2024-4-12 19:05:48
|
显示全部楼层
那我现有代码:
import re
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
url = "https://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/index.html"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
"Accept-Encoding": "gzip, deflate, br",
"Connection": "keep-alive",
"Cookie": "wzws_sessionid=gWRmZGNjZaBmFfeqgjdlZDJkMIA2MC4xNjAuMTU2LjUw"
}
res = requests.get(url, headers=headers)
res.encoding = res.apparent_encoding
text = res.text
# 使用BeautifulSoup解析网页内容
soup = BeautifulSoup(text, 'lxml')
# htmltext = soup.prettify()
# print(htmltext)
# 获取地址前缀(用于相对地址)
def get_prefix(url):
return url[0:url.rindex("/") + 1]
#创建工作簿
wb = Workbook()
sheet = wb.active
sheet.title = '行政区划'
headers = ['编码', '名称']
sheet.append(headers)
#省级代码
province_Num = []
city_Num = []
province_list = soup.select('tr.provincetr a')
for province in province_list:
href = province.get("href")
province_code = href[0: 2]
print('province_code:', province_code)
province_Num.append(province_code)
province_name = province.text
# print('province_Num:', province_Num)
# 将省级代码和名称作为列表传递给append()方法
sheet.append([province_code + "0000000000", province_name])
wb.save('E:/Temp/行政区划1.xlsx')
print('省级区划已入库~~')
for num in province_Num:
url1 = get_prefix(url) + num + '.html'
soup1 = BeautifulSoup(requests.get(url1).content, 'lxml')
htmltext = soup1.prettify()
print(htmltext)
city_list = soup1.select('tr.citytr a')
for city in city_list:
href = city.get("href")
city_code = href[3: 7]
print('city_code:', city_code)
city_Num.append(city_code)
city_name = city.text
print('city_name:', city_name)
# 将省级代码和名称作为列表传递给append()方法
sheet.append([city_code, city_name])
wb.save('E:/Temp/行政区划1.xlsx')
print('市级区划已入库~~')
为什么连<a href="13/1303.html" 130300000000、a href="13/1301.html"130100000000也取出来了?
我只要取出来href="13/1303.html" 秦皇岛和 href="13/1301.html" 石家庄市啊
|
|