|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
还是继续复制过来以前的代码:
- import pandas as pd
- import os
- import numpy as np
- from sklearn.pipeline import Pipeline
- from sklearn.preprocessing import StandardScaler
- from sklearn.preprocessing import Imputer
- from sklearn.base import BaseEstimator,TransformerMixin
- from sklearn.preprocessing import LabelBinarizer
- from sklearn.pipeline import FeatureUnion
HOUSING_PATH = "datasets/housing"


def load_housing_data(housing_path=HOUSING_PATH):
    """Read the housing dataset CSV under *housing_path* into a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))
housing = load_housing_data()

# Bucket median income into ~5 categories so the split can stratify on it;
# everything above category 5 is capped at 5.0.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# FIX: plain assignment instead of chained `.where(..., inplace=True)` on a
# column — the chained in-place form mutates a possible copy and is
# deprecated/broken in modern pandas; the result is identical.
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)

from sklearn.model_selection import StratifiedShuffleSplit

# Single stratified 80/20 split keyed on income_cat, so the test set
# mirrors the overall income distribution.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

# income_cat was only needed for the split; drop it from both sets.
# FIX: loop variable renamed — the original used `set`, shadowing the builtin.
for subset in (strat_train_set, strat_test_set):
    subset.drop(["income_cat"], axis=1, inplace=True)

# Separate predictors from the label on the training set; housing_num is
# the numeric-only view (drops the one text column).
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()
housing_num = housing.drop("ocean_proximity", axis=1)

# Column positions (in the numeric array) used by CombinedAttributesAdder.
rooms_ix, bedroom_ix, population_ix, household_ix = 3, 4, 5, 6
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append derived ratio features to the numeric feature matrix.

    Adds rooms_per_household and population_per_household, plus (optionally)
    bedrooms_per_room, as extra columns on the right of ``X``.
    Assumes ``X`` is a 2-D array whose columns 3..6 are total_rooms,
    total_bedrooms, population, households (see the module-level *_ix names).
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedroom_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        # BUG FIX: the original wrote ``np,c_[...]`` (comma instead of dot),
        # which evaluated as a tuple and raised NameError on ``c_`` whenever
        # add_bedrooms_per_room=False.
        return np.c_[X, rooms_per_household, population_per_household]
-
class MyLabelBinarizer(TransformerMixin):
    # Pipeline-friendly adapter around sklearn's LabelBinarizer.
    # LabelBinarizer.fit/transform accept only one data argument, while
    # Pipeline calls its transformers as fit(X, y) / transform(X, y); this
    # wrapper swallows the extra ``y`` so LabelBinarizer can sit in a Pipeline.
    def __init__(self, *args, **kwargs):
        # All constructor arguments are forwarded unchanged to LabelBinarizer.
        self.encoder = LabelBinarizer(*args, **kwargs)
    def fit(self, x, y=0):
        # ``y`` is ignored; present only to satisfy the Pipeline calling convention.
        self.encoder.fit(x)
        return self
    def transform(self, x, y=0):
        # Returns the one-hot (binarized) encoding of ``x``.
        return self.encoder.transform(x)
-
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pick a fixed list of DataFrame columns and return them as a NumPy array.

    Lets a Pipeline start from a pandas DataFrame: each branch selects the
    columns it needs and hands plain arrays to the downstream steps.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        # Nothing to learn — the column list is fixed at construction time.
        return self

    def transform(self, X):
        selected = X[self.attribute_names]
        return selected.values
# Column lists feeding the two sub-pipelines below.
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

# Numeric branch: select columns -> fill missing values with the median ->
# append engineered ratio features -> standardize.
num_pipline = Pipeline(steps=[
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy="median")),
    ('attribus_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: select the text column and one-hot encode it.
cat_pipline = Pipeline(steps=[
    ('selector', DataFrameSelector(cat_attribs)),
    ('label_binarizer', MyLabelBinarizer()),
])

# Run both branches on the same input and concatenate their outputs
# side by side into one feature matrix.
full_pipline = FeatureUnion(transformer_list=[
    ("num_pipline", num_pipline),
    ("cat_pipline", cat_pipline),
])

housing_prepared = full_pipline.fit_transform(housing)
复制代码
然后为我们的模型设置超参数(就是随机森林里树的个数,训练集有几组特征值等等一系列的参数):
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Two grids: 3x4 combinations with default bootstrap, plus 2x3 with
# bootstrap disabled — 18 candidates in total, each cross-validated 5-fold.
hyperparam_grids = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]

# neg_mean_squared_error: sklearn scorers are "higher is better", so the
# MSE is negated; take sqrt(-score) later to recover the RMSE.
grid_search = GridSearchCV(
    RandomForestRegressor(),
    hyperparam_grids,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_search.fit(housing_prepared, housing_labels)
复制代码
然后看看网格搜索为我们找出的最优超参数是什么,执行 grid_search.best_params_ 这一行后会输出如下数据:
{'max_features': 8, 'n_estimators': 30}
这就是我们要用的最优参数:每次分裂最多考虑 8 个特征(max_features=8),森林中决策树的个数是 30 棵(n_estimators=30),用它训练出的模型预测得更准确。继续看下各组参数对比后的结果:
# Print the cross-validated RMSE (sqrt of the negated score) next to each
# hyperparameter combination the grid search tried.
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    rmse = np.sqrt(-mean_score)
    print(rmse, params)
复制代码
执行后会显示以下结果:
64539.9065965 {'n_estimators': 3, 'max_features': 2}
55401.3638545 {'n_estimators': 10, 'max_features': 2}
52849.5504873 {'n_estimators': 30, 'max_features': 2}
60488.5008218 {'n_estimators': 3, 'max_features': 4}
53228.6746741 {'n_estimators': 10, 'max_features': 4}
50746.7001414 {'n_estimators': 30, 'max_features': 4}
59350.5028588 {'n_estimators': 3, 'max_features': 6}
52568.7835547 {'n_estimators': 10, 'max_features': 6}
50013.8405865 {'n_estimators': 30, 'max_features': 6}
59451.5995633 {'n_estimators': 3, 'max_features': 8}
52092.3880668 {'n_estimators': 10, 'max_features': 8}
49857.1401575 {'n_estimators': 30, 'max_features': 8}
61897.0944086 {'n_estimators': 3, 'bootstrap': False, 'max_features': 2}
54548.6814397 {'n_estimators': 10, 'bootstrap': False, 'max_features': 2}
60637.0802455 {'n_estimators': 3, 'bootstrap': False, 'max_features': 3}
53100.3544235 {'n_estimators': 10, 'bootstrap': False, 'max_features': 3}
58487.3207429 {'n_estimators': 3, 'bootstrap': False, 'max_features': 4}
51741.4498429 {'n_estimators': 10, 'bootstrap': False, 'max_features': 4}
果然最优的就是刚才那组参数(均方根误差 RMSE 最小),而第一组 RMSE 最大、效果最差。然后看看各个特征的重要程度:
# Importance score of every input feature, taken from the best estimator
# found by the grid search (a fitted RandomForestRegressor); the bare
# expression on the second line just displays the array in a REPL/notebook.
feature_importances=grid_search.best_estimator_.feature_importances_
feature_importances
复制代码
这个输出结果不是太容易看,只是一组数据这里就不复制了,然后我们用更直观的方法看看:
from sklearn.preprocessing import LabelEncoder

# Reconstruct a human-readable name for every column of the prepared
# matrix: the numeric columns, then the three engineered ratios, then one
# one-hot column per ocean_proximity category (in LabelEncoder class order).
encoder = LabelEncoder()
housing_cat = housing["ocean_proximity"]
housing_cat_encoded = encoder.fit_transform(housing_cat)

extra_attribs = ["rooms_per_household", "pop_per_household", "bedrooms_per_room"]
cat_one_hot_attribs = list(encoder.classes_)
attribus = num_attribs + extra_attribs + cat_one_hot_attribs

# Pair each importance with its name, most important first.
sorted(zip(feature_importances, attribus), reverse=True)
复制代码
输出为:
[(0.35216413184624645, 'median_income'),
(0.1600064373659322, 'INLAND'),
(0.10973914260172225, 'pop_per_household'),
(0.073196611077795654, 'longitude'),
(0.069993781986234183, 'bedrooms_per_room'),
(0.063532534714522193, 'latitude'),
(0.050927579668676295, 'rooms_per_household'),
(0.043936553336933998, 'housing_median_age'),
(0.015821739675359821, 'total_rooms'),
(0.014974729578331942, 'total_bedrooms'),
(0.014716810447382345, 'households'),
(0.014198191755119289, 'population'),
(0.01174446919617937, '<1H OCEAN'),
(0.0028095259721804206, 'NEAR OCEAN'),
(0.0021732837648423087, 'NEAR BAY'),
(6.4477012541252311e-05, 'ISLAND')]
这里面重要性只有 0.00 几的就不算什么好特征了,可以考虑删掉,这里先不处理。下面我们用网格搜索得到的最佳模型,在测试集上做最终评估:
from sklearn.metrics import mean_squared_error

# Final evaluation: score the best model from the grid search on the
# held-out test set.
final_model = grid_search.best_estimator_

x_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

# Reuse the already-fitted pipeline: transform only — never fit on test data.
x_test_prepared = full_pipline.transform(x_test)
final_prediction = final_model.predict(x_test_prepared)

# Root-mean-squared error in the label's own units (dollars).
final_mse = mean_squared_error(y_test, final_prediction)
final_rmse = np.sqrt(final_mse)
复制代码
然后我们再看看均方根误差(RMSE,即上面 final_rmse 的值):
我这里显示的是47568.875366719221,比之前交叉验证时的误差更低,模型又比以前的强了。
|