Posted on 2024-4-12 17:22:50
Apologies, my previous reply was incorrect. The data-saving part contained an error; please refer to the corrected code below:

# -*- coding:utf-8 -*-
import time

import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook


def get_html(url):
    """Fetch a page and return a parsed BeautifulSoup object, retrying until it succeeds."""
    while True:
        try:
            response = requests.get(url, timeout=1)
            response.encoding = "GBK"  # the stats.gov.cn pages are GBK-encoded
            if response.status_code == 200:
                return BeautifulSoup(response.text, "lxml")
        except Exception:
            pass
        time.sleep(1)  # brief pause before retrying, to avoid hammering the server


def get_prefix(url):
    # Directory part of the URL, used to resolve the relative links on each page
    return url.rsplit("/", 1)[0] + "/"


def spider_next(url, lev, ws):
    # CSS class of the table rows differs by administrative level:
    # 2 = city, 3 = county, 4 = town, 5 = village
    if lev == 2:
        spider_class = "city"
    elif lev == 3:
        spider_class = "county"
    elif lev == 4:
        spider_class = "town"
    else:
        spider_class = "village"
    for item in get_html(url).select("tr." + spider_class + "tr"):
        item_td = item.select("td")
        item_td_code = item_td[0].select_one("a")
        item_td_name = item_td[-1].text
        if item_td_code is None:
            # No link: a leaf row, take the code from the cell text
            item_href = None
            item_code = item_td[0].text
            item_name = item_td_name
        else:
            item_href = item_td_code.get("href")
            item_code = item_td_code.text
            item_name = item_td_name
        # Row content: level, division code, name
        content = [lev, item_code, item_name]
        ws.append(content)
        if item_href is not None:
            # Descend into the next administrative level
            spider_next(get_prefix(url) + item_href, lev + 1, ws)


if __name__ == '__main__':
    province_url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html"
    province_list = get_html(province_url).select('tr.provincetr a')
    wb = Workbook()
    sheet = wb.active
    sheet.title = '行政区划'  # "Administrative divisions"
    headers = ['级别', '区划代码', '名称']  # level, division code, name
    sheet.append(headers)
    try:
        for province in province_list:
            href = province.get("href")  # e.g. "11.html"
            # Strip the ".html" suffix before padding to the 12-digit division code
            province_code = href.replace(".html", "") + "0000000000"
            province_name = province.text
            content = [1, province_code, province_name]
            sheet.append(content)
            spider_next(get_prefix(province_url) + href, 2, sheet)
        file_path = r"E:\Temp\999.xlsx"
        wb.save(file_path)
        print("数据保存成功!")  # data saved successfully
    except Exception as e:
        print("数据保存失败:", str(e))  # saving the data failed
This version fixes the data-saving problem: it uses level, division code, and name as the column headers, and correctly appends each scraped row to the worksheet before saving the workbook. I hope this meets your needs! If you have any other questions, feel free to ask.
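If you want to spot-check the saved data, here is a minimal sketch that reads the workbook back with openpyxl and prints the first few rows. It assumes the same E:\Temp\999.xlsx path used above; change it to wherever you saved the file.

# -*- coding:utf-8 -*-
from openpyxl import load_workbook

# Path written by the scraper above; adjust if you saved the workbook elsewhere
file_path = r"E:\Temp\999.xlsx"

wb = load_workbook(file_path, read_only=True)
ws = wb.active  # the '行政区划' sheet written by the scraper

# Print the header row plus the first five data rows as tuples
for row in ws.iter_rows(min_row=1, max_row=6, values_only=True):
    print(row)

wb.close()  # read-only workbooks keep the file handle open until closed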
The reply above is from the FishC AI assistant. If it did not correctly answer your question, please continue to ask.