2962615177 发表于 2020-4-13 13:44:15

爬取城市历史天气数据并保存在Excel中

输入城市名称和年份,获取2011~2019年中任意一年的历史天气并保存在Excel中
感觉用正则表达式有点繁杂了
城市和年份太多,只测试了几个城市,不保证每个城市每个年份都能查到

import requests
from bs4 import BeautifulSoup
import openpyxl
import re

# Pre-compiled patterns, one per column of the history table.
# All use DOTALL so that "." also matches newlines inside the HTML.

# Regex for the date: captures the last path segment of each day's link.
find_date = re.compile(
    r'<div><a href="//lishi.tianqi.com/.*?/(.*?).html"',
    re.DOTALL,
)
# Regex for the highest temperature (1-2 digits, optionally negative).
find_htemp = re.compile(
    r'<div style="width: 100px">(\d{1,2}|-\d{1,2})</div>',
    re.DOTALL,
)
# Regex for the lowest temperature (1-2 digits, optionally negative).
find_dtemp = re.compile(
    r'<div>(\d{1,2}|-\d{1,2})</div>',
    re.DOTALL,
)
# Regex for the weather description: a plain <div> whose text has no digits.
find_tianqi = re.compile(
    r'<div>([^0-9]*?)</div>',
    re.DOTALL,
)
# Regex for the wind: the content of the 200px-wide cell.
find_fengli = re.compile(
    r'<div style="width:200px;">(.*?)</div>',
    re.DOTALL,
)

# Build the list of monthly query URLs.
def genHTML(cityname, year):
    """Return the twelve monthly history-page URLs for one city and year.

    Args:
        cityname: City identifier used as a URL path segment (main() asks the
            user for the lowercase full pinyin spelling).
        year: The year to query, as an int.

    Returns:
        A list of 12 URL strings ordered from January to December.
    """
    # "YYYYMM" keys, e.g. 201901 ... 201912.
    months = ["%d%02d" % (year, month) for month in range(1, 13)]
    # NOTE(review): the right-hand side of this assignment was missing in the
    # source. The path layout "<host>/<city>/<YYYYMM>.html" is reconstructed
    # from the link shape matched by find_date; the https scheme is an
    # assumption -- confirm against the live site.
    todo_urls = [
        "https://lishi.tianqi.com/%s/%s.html" % (cityname, month)
        for month in months
    ]
    return todo_urls


# Fetch a web page.
def askURL(url, timeout=10):
    """Download *url* and return its body decoded as UTF-8 text.

    Args:
        url: The page address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns:
        The response body as a str.

    Raises:
        requests.exceptions.RequestException: On connection errors or when the
            timeout expires.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Send a browser-like User-Agent header with the request.
    headers = {
      "User-Agent": "Mozilla / 5.0(Windows NT 10.0; WOW64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome/78.0.3904.108 Safari / 537.36"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # The HTTP status is deliberately not checked: an error page simply yields
    # no table rows downstream, which keeps the lookup best-effort.
    return response.content.decode("utf-8")

# Parse the fetched page.
def getdata(res):
    """Extract the daily weather rows from one month's history page.

    Args:
        res: HTML text of a monthly history page.

    Returns:
        A list of rows, each
        ``[date, lowest temperature, highest temperature, weather, wind]``
        with every value being the string captured by the matching regex.
        Returns an empty list when the page contains no matching table.
    """
    soup = BeautifulSoup(res, "html.parser")
    tables = soup.find_all('ul', class_="lishitable_content clearfix")
    # The regexes work on text, so serialise the matched tags back to a string.
    html = str(tables)

    dates = find_date.findall(html)
    dtemp = find_dtemp.findall(html)
    htemp = find_htemp.findall(html)
    tianqi = find_tianqi.findall(html)
    fengli = find_fengli.findall(html)

    # The source line here had lost its "[i]" subscripts and was not valid
    # Python; pair the i-th entry of each column instead. zip() stops at the
    # shortest column, so a page where one pattern matched fewer times yields
    # fewer rows rather than raising IndexError.
    return [list(row) for row in zip(dates, dtemp, htemp, tianqi, fengli)]

# Save the collected data to a file.
def saveData(datalist, cityname, year):
    """Write all rows to "<year>年<cityname>历史天气.xlsx" in the working directory.

    Args:
        datalist: Two-level list: one entry per month, each entry being a list
            of rows as produced by getdata().
        cityname: City name, used only in the output file name.
        year: Year as an int, used only in the output file name.
    """
    wb = openpyxl.Workbook()
    ws = wb.active

    # Header row; on a fresh worksheet the first append() lands on row 1.
    ws.append(["日期", "最低温度", "最高温度", "天气", "风力"])

    # Flatten month by month. The previous loop walked the outer list twelve
    # times and handed whole months (lists of rows) to ws.append(), which
    # expects a flat row of cell values.
    for month_rows in datalist:
        for row in month_rows:
            ws.append(row)

    wb.save("%d年%s历史天气.xlsx" % (year, cityname))

def main():
    """Prompt for a city and a year, scrape every month, and save to Excel."""
    print("-------历史天气查询系统-------")
    cityname = input("请输入城市名称(小写全拼):")
    year = int(input("请输入查询年份:"))
    urls = genHTML(cityname, year)
    # Fetch and parse each monthly page in turn. The previous loop passed the
    # whole URL list to askURL() on every iteration instead of a single URL.
    result = [getdata(askURL(url)) for url in urls]

    saveData(result, cityname, year)


if __name__ == '__main__':
    main()

Mike_python小 发表于 2020-4-13 20:51:48

沙发
页: [1]
查看完整版本: 爬取城市历史天气数据并保存在Excel中