import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
# Cookies sent along with the request to www.stats.gov.cn (hard-coded values).
# NOTE(review): 'wzws_sessionid' looks like a session token and presumably
# expires — confirm, and refresh it if the site starts rejecting requests.
cookies = dict(
    _trs_uv='luroger7_6267_dg9s',
    wzws_sessionid='gjdlZDJkMIA2MC4xNjAuMTI4LjUxgTEzZjliM6BmFOXW',
)
# Request headers describing a desktop Microsoft Edge 123 browser on Windows
# (see the User-Agent and sec-ch-ua values below).
headers = {
    # Content negotiation.
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/avif,image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.7'
    ),
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Connection': 'keep-alive',
    # The 2023 index page is given as the referring page.
    'Referer': 'https://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/index.html',
    # Fetch metadata: a user-initiated, same-origin document navigation.
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    # Browser identification.
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0'
    ),
    'sec-ch-ua': '"Microsoft Edge";v="123", "Not:A-Brand";v="8", "Chromium";v="123"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}
# Send a GET request for the page that should hold the division table.
# The timeout keeps the script from hanging forever on an unresponsive
# server, and raise_for_status() stops on an HTTP error instead of parsing
# an error page as if it were data.
response = requests.get(
    'https://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/11.html',
    cookies=cookies,
    headers=headers,
    timeout=30,
)
response.raise_for_status()
# Decode the body with the encoding detected from its content rather than
# the one derived from the response headers.
response.encoding = response.apparent_encoding

# Parse the page content with BeautifulSoup.
soup = BeautifulSoup(response.text, 'html.parser')

# Locate the administrative-division table.
# NOTE(review): confirm that 11.html really contains a table with class
# 'countytable' and rows of exactly three cells; if the page uses another
# table class or row layout, no rows will be collected here.
table = soup.find('table', class_='countytable')

# List of (code, name) tuples; bound even when no table is found so that
# later code can always rely on the name existing.
data = []
if table is None:
    print("table: not found (no <table class='countytable'> in the response)")
else:
    for tr in table.find_all('tr'):
        tds = tr.find_all('td')
        # Only rows with exactly three cells are kept: the first cell is
        # the code and the third cell is the name.
        if len(tds) == 3:
            code = tds[0].text.strip()
            name = tds[2].text.strip()
            data.append((code, name))
def save_to_excel(data, filename):
    """Write *data* into a new Excel workbook and save it as *filename*.

    The workbook's active sheet is titled '行政区划'. Its first row is the
    header row ['编码', '名称'] (code, name); every item of *data* is then
    appended as one further row, in iteration order.

    Args:
        data: Iterable of rows; each row is handed to the sheet's
            ``append`` method unchanged.
        filename: Destination passed to ``Workbook.save``.
    """
    workbook = Workbook()
    worksheet = workbook.active
    worksheet.title = '行政区划'

    # Header row first, followed by one worksheet row per data item.
    column_titles = ['编码', '名称']
    for record in [column_titles, *data]:
        worksheet.append(record)

    workbook.save(filename)
if __name__ == "__main__":
    # Index page of the 2023 division codes; not used by the code below.
    url = 'https://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/index.html'
    # Export the rows collected by the module-level scraping code instead of
    # replacing them with a fresh empty list, which made the saved workbook
    # contain nothing but the header row. The name is looked up defensively
    # in case the scraping step did not bind `data`.
    data = globals().get('data', [])
    # Save the collected rows to the Excel file.
    save_to_excel(data, 'E:/Temp/行政区划.xlsx')