# 用 Python 爬取浙江省统计局网站上的表格
import re

import bs4
import openpyxl
import requests
def open_url(url, timeout=10):
    """Fetch *url* and return the response with its encoding forced to UTF-8.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The ``requests.Response`` for the page; its ``.text`` is decoded as
        UTF-8.

    Raises:
        requests.RequestException: On a connection failure, a timeout, or an
            HTTP error status (4xx/5xx).
    """
    # The request is sent with a desktop Chrome User-Agent string.
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error page instead of parsing it as if it were data.
    response.raise_for_status()
    response.encoding = 'utf-8'
    return response
def save_to_excel(data, name):
    """Write *data* to a new Excel workbook and save it as *name*.

    *data* must be a sequence of rows, each row itself a sequence of cell
    values (a list of lists). Rows are appended to the active sheet in the
    order given.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    for row in data:
        sheet.append(row)
    workbook.save(name)
def find_data(html):
    """Extract the table rows from a fetched page.

    Args:
        html: Response-like object whose ``.text`` holds the page's HTML.

    Returns:
        A list with one entry per matching ``<tr>``. Each entry is the list
        of strings obtained by stripping the row's text and splitting it on
        three consecutive newlines.
    """
    soup = bs4.BeautifulSoup(html.text, 'html.parser')
    # Only <tr> elements carrying this exact inline style are selected.
    rows = soup.find_all('tr', style="height: 14.25pt")
    # find_all returns a collection of tags, so .text has to be read from
    # each row individually rather than from the collection as a whole.
    return [row.text.strip().split('\n\n\n') for row in rows]
def main():
    """Download the hard-coded statistics page and save its table to Excel."""
    source_url = 'http://tjj.zj.gov.cn/tjsj/ydsj/gy/2017/201702/t20170224_192095.html'
    rows = find_data(open_url(source_url))
    save_to_excel(rows, '浙江工业.xlsx')
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()
# 页: [1]