|
发表于 2020-2-16 17:12:17
|
显示全部楼层
- from selenium import webdriver
- import re
- import pandas as pd
- import openpyxl as op
- import time
class feiyan():
    """Scrape epidemic statistics from a rendered web page and log them to Excel.

    Typical use is ``FindWeb()`` -> ``FindData()`` -> ``save()``.

    Attributes:
        html: Page URL supplied at construction; used by ``FindWeb`` when no
            URL is passed to it directly.
        source: Full HTML of the page after JavaScript has run.
        confirmed, suspected, cure, died: ``re.Match`` objects whose
            ``.group()`` is the digit string of the respective figure.
        timess: ``re.Match`` whose ``.group()`` is the statistics timestamp
            (``Y-M-D H:M:S``).
    """

    # (attribute name, label text that follows the figure in the page).
    _LABELS = (
        ('confirmed', '累计确诊'),
        ('suspected', '现有疑似'),
        ('cure', '治愈人数'),
        ('died', '死亡人数'),
    )

    def __init__(self, html=None, source=None, confirmed=None, suspected=None,
                 cure=None, died=None, timess=None):
        self.html = html
        self.source = source
        self.confirmed = confirmed
        self.suspected = suspected
        self.cure = cure
        self.died = died
        self.timess = timess

    def FindWeb(self, html=None):
        """Load a page in headless Chrome and return its rendered HTML.

        Args:
            html: URL to open. Falls back to ``self.html`` when omitted.

        Returns:
            The page's ``outerHTML`` as a string (also stored in
            ``self.source``).

        Raises:
            ValueError: If no URL was given here or at construction.
        """
        url = self.html if html is None else html
        if url is None:
            raise ValueError('no URL given: pass html= or set it in __init__')

        option = webdriver.ChromeOptions()
        option.add_argument('--headless')
        driver = webdriver.Chrome(options=option)
        try:
            driver.get(url)
            # Fixed pause so the page's scripts can fill in the figures.
            time.sleep(5)
            # Ask the browser for the DOM after JS has executed, i.e. the
            # whole rendered HTML document.
            self.source = driver.execute_script(
                'return document.documentElement.outerHTML')
        finally:
            # Always shut the browser down, otherwise every run leaves a
            # headless Chrome process behind.
            driver.quit()
        return self.source

    def _find_number(self, label):
        """Return the match for the first digit run in the block ending at *label*."""
        block = re.search(r'number.*\S\s*.*' + label, self.source)
        digits = re.search(r'\d+', block.group()) if block else None
        if digits is None:
            raise ValueError(
                'could not find a figure for {!r} in the page source'.format(label))
        return digits

    def FindData(self):
        """Extract the four figures and the timestamp from ``self.source``.

        Stores the resulting ``re.Match`` objects on the instance and prints
        a summary. Attributes are only updated once every figure was found.

        Raises:
            ValueError: If ``self.source`` is empty or any figure / the
                timestamp cannot be located in it.
        """
        if not self.source:
            raise ValueError('no page source available; call FindWeb() first')

        found = {attr: self._find_number(label) for attr, label in self._LABELS}

        span = re.search(r'统计截至.*?</span>', self.source)
        stamp = (re.search(r'\d+-\d+-\d+\s+\d+:\d+:\d+', span.group())
                 if span else None)
        if stamp is None:
            raise ValueError('could not find the statistics timestamp in the page source')

        for attr, match in found.items():
            setattr(self, attr, match)
        self.timess = stamp

        print('截止{}:\n全国确诊人数为:{}人\n'.format(self.timess.group(), self.confirmed.group()))
        print('疑似病例为:{}人\n'.format(self.suspected.group()))
        print('治愈人数为:{}人\n'.format(self.cure.group()))
        print('死亡人数为:{}人\n'.format(self.died.group()))

    def save(self):
        """Append the current figures as a new row to the Excel workbook.

        Reads the existing data from the workbook, adds today's figures as
        one more row, and writes the frame to the ``alldata`` sheet.
        Requires ``FindData()`` to have populated the match attributes.

        NOTE(review): ``write.book = ...`` and ``write.save()`` rely on the
        ExcelWriter API of older pandas releases -- confirm against the
        installed pandas version.
        """
        # Read the previous days' data, merge in today's data and save.
        target = r'E:\Python\肺炎数据\肺炎数据.xlsx'
        excel = pd.read_excel(target)
        # Next free row label; presumably the frame has a default 0..n-1 index.
        num = len(excel.loc[:, '时间'])
        excel.loc[num, '时间'] = self.timess.group()
        excel.loc[num, '确诊人数'] = self.confirmed.group()
        excel.loc[num, '死亡人数'] = self.died.group()
        excel.loc[num, '治愈人数'] = self.cure.group()
        excel.loc[num, '疑似病例'] = self.suspected.group()
        book = op.load_workbook(target)
        write = pd.ExcelWriter(target, engine='openpyxl')
        write.book = book
        excel.to_excel(write, sheet_name='alldata')
        write.save()
        write.close()
if __name__ == '__main__':
    # Fetch the live statistics page, print the figures, then persist them.
    page_url = 'https://news.qq.com//zt2020/page/feiyan.htm'
    scraper = feiyan()
    scraper.FindWeb(html=page_url)
    scraper.FindData()
    scraper.save()
复制代码 |
|