马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
# One-hot encode the non-numeric (object dtype) columns of the combined dataset.
# The encoded frame must be assigned: pd.get_dummies returns a new DataFrame
# and does not modify its input. Only the object columns are passed in, so the
# numeric columns are not carried through un-normalized and then duplicated by
# the concat below.
nonnumerical = full_dataset.select_dtypes(include='object').columns
df_nonnumerical = pd.get_dummies(full_dataset[nonnumerical], dummy_na=True)

# Standardize the numeric features: subtract the column mean, divide by the
# column standard deviation. 'Id' and the 'SalePrice' label are excluded.
numeric_cols = full_dataset.select_dtypes(np.number).columns
numeric_cols = numeric_cols.drop(['Id', "SalePrice"], errors='ignore')
df_numerical = full_dataset[numeric_cols].apply(lambda x: (x - x.mean()) / (x.std()))
# After standardization every column has mean 0, so filling missing values
# with 0 is the same as imputing the column mean.
df_numerical = df_numerical.fillna(0)
df_numerical.shape

# Join the encoded categorical columns and the standardized numeric columns.
full_df = pd.concat([df_nonnumerical, df_numerical], axis=1)
full_df.shape
这样得到的结果是 330 列,
但是别人的代码(如下)得到的结果是 331 列:
# Preprocessing function
def preprocess(df: "pd.DataFrame") -> "pd.DataFrame":
    """Build a feature frame from ``df`` without modifying ``df``.

    Steps:
      1. Drop the ``Id`` column if present.
      2. Standardize every numeric column except ``SalePrice`` (subtract the
         column mean, divide by the column standard deviation), then fill
         missing values with 0.
      3. One-hot encode every object-dtype column, adding an extra indicator
         column per feature for missing values (``dummy_na=True``).

    Args:
        df: Input frame. ``Id`` and ``SalePrice`` are optional.

    Returns:
        A new frame with the standardized numeric columns first, followed by
        the one-hot encoded categorical columns. ``Id`` and ``SalePrice`` are
        not included.
    """
    # drop() returns a new frame, so the caller's DataFrame is left untouched.
    features = df.drop(columns=["Id"], errors="ignore")

    # Numeric feature names, excluding the label column.
    numeric_cols = features.select_dtypes(np.number).columns
    numeric_cols = numeric_cols.drop("SalePrice", errors="ignore")

    # Categorical feature names (object dtype).
    categorical_cols = features.select_dtypes(include=["object"]).columns

    # One-hot encoding; dummy_na adds a column flagging missing cells.
    df_categorical = pd.get_dummies(features[categorical_cols], dummy_na=True)

    # Standardize each numeric column independently.
    df_numerical = features[numeric_cols].apply(
        lambda col: (col - col.mean()) / col.std()
    )
    # Standardized columns have mean 0, so 0 stands in for the column mean.
    df_numerical = df_numerical.fillna(0)

    # Numeric columns first, then the encoded categorical columns.
    return pd.concat([df_numerical, df_categorical], axis=1)
|