My apologies, that was a careless mistake on my part. At the end of the `spider_next` function I forgot to append the scraped data to `sheet`. Please add this line at the end of `spider_next`: `sheet.append([lev, item_code, item_name])`
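For reference, openpyxl's `Worksheet.append` writes one row per call from a list of cell values, which is all the missing line does. A minimal standalone sketch of the pattern (the sample row and file name here are purely illustrative):

```python
from openpyxl import Workbook

wb = Workbook()
sheet = wb.active
sheet.append(["级别", "区划代码", "名称"])   # header row
sheet.append([1, "110000000000", "北京市"])  # one data row per append() call
wb.save("demo.xlsx")                         # illustrative file name
```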
That way each scraped record is added to the worksheet. The complete, fixed code:

```python
from openpyxl import Workbook
import requests
from bs4 import BeautifulSoup

# Fetch a page and return it parsed with BeautifulSoup
def get_html(url):
    # Retry until the request succeeds with HTTP 200
    while True:
        try:
            res = requests.get(url, timeout=1)
            res.encoding = res.apparent_encoding
            if res.status_code == 200:
                return BeautifulSoup(res.text, "lxml")
            else:
                continue
        except Exception:
            continue

# Get the URL prefix (used to resolve relative links)
def get_prefix(url):
    # Keep everything up to and including the last "/" (the directory of
    # the current page) so relative hrefs like "11/1101.html" resolve correctly
    return url[:url.rindex("/") + 1]

# Recursively scrape the next administrative level
def spider_next(url, lev, sheet):
    if lev == 2:
        spider_class = "citytr"
    elif lev == 3:
        spider_class = "countytr"
    elif lev == 4:
        spider_class = "towntr"
    else:
        spider_class = "villagetr"
    for item in get_html(url).select("tr." + spider_class):
        item_td = item.select("td")
        item_td_code = item_td[0]
        # The name is the last cell; village rows carry an extra
        # urban-rural classification column in between
        item_td_name = item_td[-1]
        item_href = item_td_code.select_one("a")
        if item_href is not None:
            href = item_href.get("href")
            spider_next(get_prefix(url) + href, lev + 1, sheet)
        item_code = item_td_code.text.strip()
        item_name = item_td_name.text.strip()
        # Print: level, division code, name
        content = f"{lev}\t{item_code}\t{item_name}"
        print(content)
        sheet.append([lev, item_code, item_name])

if __name__ == '__main__':
    province_url = "https://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/index.html"
    province_list = get_html(province_url).select('tr.provincetr a')
    wb = Workbook()
    sheet = wb.active
    sheet.title = "行政区划"
    # Write the header row
    sheet.append(["级别", "区划代码", "名称"])
    try:
        for province in province_list:
            href = province.get("href")
            # Province links look like "11.html"; drop the suffix and pad
            # to the full 12-digit division code
            province_code = href.replace(".html", "") + "0000000000"
            province_name = province.text
            # Print: level, division code, name
            content = "1\t" + province_code + "\t" + province_name
            print(content)
            sheet.append([1, province_code, province_name])
            spider_next(get_prefix(province_url) + href, 2, sheet)
        # Save the xlsx file
        wb.save(r'E:\Temp\代码库.xlsx')
    finally:
        wb.close()
```
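One design note: `get_html` retries forever, which can hang the whole run if a URL is permanently broken. A bounded-retry variant might look like this (just a sketch; the `retries` parameter, retry count, and pause are my own choices, not part of the original script):

```python
import time

import requests
from bs4 import BeautifulSoup

def get_html(url, retries=5):
    # Try a few times, then give up instead of looping forever
    for _ in range(retries):
        try:
            res = requests.get(url, timeout=5)
            res.encoding = res.apparent_encoding
            if res.status_code == 200:
                return BeautifulSoup(res.text, "lxml")
        except requests.RequestException:
            pass
        time.sleep(1)  # brief pause between attempts
    return None  # callers must handle a failed fetch
```

If you adopt this, `spider_next` and the main block would also need to check for `None` before calling `.select(...)`.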
Now when you run the program again, it should scrape the data and save it to `E:\Temp\代码库.xlsx`. If anything else comes up, just let me know.
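If you want to double-check the result, a short read-back sketch (assuming the same save path as above):

```python
from openpyxl import load_workbook

# Open the workbook produced by the scraper and print the first few rows
wb = load_workbook(r"E:\Temp\代码库.xlsx", read_only=True)
sheet = wb.active
for row in sheet.iter_rows(min_row=1, max_row=5, values_only=True):
    print(row)
wb.close()
```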
This reply comes from the FishC AI assistant; if it didn't fully answer your question, feel free to follow up.