| 
 | 
 
 
发表于 2020-3-23 15:23:11
|
显示全部楼层
 
 
 
给你重写了一下 
- import requests
 
 - from bs4 import BeautifulSoup
 
 - import re
 
 - import openpyxl
 
  
def open_url(url, timeout=10):
    """Download *url* while presenting a desktop-browser User-Agent.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely, so a stalled server
            would hang the whole script.

    Returns:
        The ``requests.Response`` for the page, with ``encoding`` forced
        to UTF-8 so that ``res.text`` is decoded as UTF-8.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            *timeout* seconds.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
    }

    res = requests.get(url, headers=headers, timeout=timeout)
    # Override whatever encoding requests detected from the HTTP headers.
    res.encoding = 'utf-8'
    return res
 
  
 
def find_data(res):
    """Extract the ranking records from the downloaded article page.

    The ``<p>`` elements inside ``<article class="article-content">`` are
    sliced with ``[8:-13]`` and then consumed in groups of five. The first
    paragraph of each group is ignored; the other four supply, in order,
    the text found between square brackets and three values that each
    start at the first digit of their paragraph. A trailing incomplete
    group is dropped.

    Args:
        res: Response-like object whose ``text`` attribute holds the HTML.

    Returns:
        A list of four-item lists of strings, one per group. The columns
        presumably correspond to the header row written by ``to_excel``
        (city, average house price, average wage, price/wage ratio).

    Raises:
        ValueError: If the page has no ``article.article-content`` element.
        IndexError: If a paragraph lacks the expected bracket or digit text.
    """
    soup = BeautifulSoup(res.text, 'lxml')
    content = soup.find('article', class_="article-content")
    if content is None:
        # find() returns None when nothing matches; fail with a clear
        # message instead of an AttributeError on NoneType further down.
        raise ValueError(
            'no <article class="article-content"> element found in the page'
        )

    # NOTE(review): the slice bounds are hard-coded for this page's layout;
    # presumably they skip intro and trailing paragraphs. Re-check if the
    # page changes.
    targets = content.find_all('p')[8:-13]

    bracket_pattern = re.compile(r'\[(.+)\]')
    number_pattern = re.compile(r'\d.*')

    group_size = 5
    complete = len(targets) - len(targets) % group_size

    data = []
    for start in range(0, complete, group_size):
        group = targets[start:start + group_size]
        data.append([
            bracket_pattern.findall(group[1].text)[0],
            number_pattern.findall(group[2].text)[0],
            number_pattern.findall(group[3].text)[0],
            number_pattern.findall(group[4].text)[0],
        ])
    return data
 
  
 
def to_excel(data, filename="2019全国各大主要城市房价、工资排行榜.xlsx"):
    """Write the records to an Excel workbook, header row first.

    Cells holding plain numeric text (for example ``"12"`` or ``"3.5"``)
    are stored as numbers so the sheet can be sorted and summed in Excel.
    Anything else is written unchanged.

    Args:
        data: Iterable of rows; each row is an iterable of cell values.
        filename: Path of the ``.xlsx`` file to create. Defaults to the
            name used by the original script.
    """
    wb = openpyxl.Workbook()
    # Kept for older openpyxl releases that honoured this flag. Recent
    # releases no longer infer cell types from strings (see the openpyxl
    # changelog), so numeric text is also converted explicitly below.
    wb.guess_types = True
    ws = wb.active
    ws.append(['城市', '平均房价', '平均工资', '房价工资比'])
    for each in data:
        ws.append([_to_number(value) for value in each])

    wb.save(filename)


def _to_number(value):
    """Return *value* as int/float if it is a plain decimal string, else as is."""
    if not isinstance(value, str):
        return value
    text = value.strip()
    # Strict pattern: optional sign, digits, optional fraction. This keeps
    # strings such as "inf", "nan" or "1e5" from being turned into numbers.
    if re.fullmatch(r'[+-]?(?:\d+\.?\d*|\.\d+)', text) is None:
        return value
    return float(text) if '.' in text else int(text)
 
  
 
def main():
    """Fetch the ranking article, parse its records and export them to Excel."""
    # Alternative URL kept for reference:
    # https://news.house.qq.com/a/20170702/003985.htm
    page = open_url('http://www.szfce.com/gn/27107.html')
    to_excel(find_data(page))
 
  
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
 
 
  复制代码 |   
 
 
 
 |