|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
def node_mean(X):
return np.mean(X[:, -1])
# 计算方差和
def node_variance(X):
return np.var(X[:, -1]) * X.shape[0]
# 计算均值
def get_thresholds(X, i):
return set(X[:, i].tolist())
# 获取阙值
from collections import defaultdict
def split_dataset(X, idx, thred):
split_data = defaultdict(list)
for x in X:
split_data[x[idx] < thred].append(x)
return list(split_data.values()), list(split_data.keys())
# 用阙值拆分数据
# 每次迭代方差优化的底线
MINIMUM_IMPROVE = 2.0
# 每个叶子节点最少样本数
MINIMUM_SAMPLES = 1
# 根据阙值对数据进行拆分 返回拆分后的[方差和]
def split_variance(dataset, idx, threshold):
left, right = [], []
n = dataset.shape[0]
for data in dataset:
if data[idx] < threshold:
left.append(data)
else:
right.append(data)
left, right = np.array(left), np.array(right)
# 预剪枝
# 如果拆分结果有一边过少,则返回None,防止过拟合
if len(left) < MINIMUM_SAMPLES or len(right) < MINIMUM_SAMPLES:
return None
# 拆分之后的方差和等于左子树的方差和加上右子树的方差和
# 因为是方差和而不是均方差,所以可以累加
return node_variance(left) + node_variance(right)
# 筛选特征值和阙值 找出最佳拆分可能
def choose_feature_to_split(dataset):
n = len(dataset[0])-1
m = len(dataset)
# 记录最佳方差,特征和阈值
var_ = node_variance(dataset)
bestVar = float('inf')
feature = -1
thred = None
for i in range(n):
threds = get_thresholds(dataset, i)
for t in threds:
# 遍历所有的阈值,计算每个阈值的variance
v = split_variance(dataset, i, t)
# 如果v等于None,说明拆分过拟合了,跳过
if v is None:
continue
if v < bestVar:
bestVar, feature, thred = v, i, t
# 如果最好的拆分效果达不到要求,那么就不拆分,控制子树的数量
# if var_ - bestVar < MINIMUM_IMPROVE:
# return None, None
return feature, thred
# 建树咯
def create_decision_tree(dataset):
dataset = np.array(dataset)
# 如果当前数量小于2,那么就不再继续划分了
if dataset.shape[0] < MINIMUM_SAMPLES:
return node_mean(dataset)
# 记录最佳拆分的特征和阈值
fidx, th = choose_feature_to_split(dataset)
if fidx is None:
return th
node = {}
node['feature'] = fidx
node['threshold'] = th
# 递归建树
split_data, vals = split_dataset(dataset, fidx, th)
for data, val in zip(split_data, vals):
node[val] = create_decision_tree(data)
return node
def classify(node, data):
key = node['feature']
pred = None
thred = node['threshold']
# 检查阈值是否为None
if thred is None:
# 如果是None,那么这个节点是一个叶子节点,直接返回它的值
return node
if isinstance(node[data[key] < thred], dict):
pred = classify(node[data[key] < thred], data)
else:
pred = node[data[key] < thred]
# 放置pred为空,挑选一个叶子节点作为替补
if pred is None:
for key in node:
if not isinstance(node[key], dict):
pred = node[key]
break
return pred
def predict(node, X):
y_pred = []
for x in X:
y = classify(node, x)
y_pred.append(y)
return np.array(y_pred)
dataset = pd.read_excel(r'C:\Users\指针\Desktop\低空经济\data.xlsx')
X = dataset.iloc[:, 1:12] # 所有特征值
y = dataset.iloc[:, 13] # 标签
X_np = X.values
X_train, X_test, y_train, y_test = train_test_split(X_np, y, test_size=0.2, random_state=42)
train_data = np.hstack((X_train, y_train.values.reshape(-1, 1)))
tree = create_decision_tree(train_data)
y_pred = predict(tree, X_test)
# 计算均方误差
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print('MSE:', mse)
print('R-Squared:', r2)
TypeError: '<' not supported between instances of 'float' and 'NoneType'
抱歉,我之前的回答有误。根据您提供的代码,问题出现在split_variance函数中的left和right变量没有初始化为空列表。在使用append方法添加数据时,需要先将它们初始化为空列表。您可以将以下代码添加到split_variance函数中:
这样就可以解决该错误。同时,还要注意修改创建决策树时调用的create_decision_tree函数中的判断条件,改为dataset.shape[0] < MINIMUM_SAMPLES。
另外,您的代码还有几处错误,例如定义node变量时多次赋值,导致无法正确保存特征索引和阈值。请参考下面的修正后的代码:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from collections import defaultdict
def node_mean(X):
return np.mean(X)
def node_variance(X):
return np.var(X) * X.shape
def get_thresholds(X, i):
return set(X[:, i])
def split_dataset(X, idx, thred):
split_data = defaultdict(list)
for x in X:
split_data[x[idx] < thred].append(x)
return list(split_data.values()), list(split_data.keys())
MINIMUM_IMPROVE = 2.0
MINIMUM_SAMPLES = 2
def split_variance(dataset, idx, threshold):
left, right = [], []
n = dataset.shape[0]
for data in dataset:
if data[idx] < threshold:
left.append(data)
else:
right.append(data)
left, right = np.array(left), np.array(right)
if len(left) < MINIMUM_SAMPLES or len(right) < MINIMUM_SAMPLES:
return None
return node_variance(left) + node_variance(right)
def choose_feature_to_split(dataset):
n = dataset.shape[1] - 1
bestVar = float('inf')
feature = -1
thred = None
for i in range(n):
threds = get_thresholds(dataset, i)
for t in threds:
v = split_variance(dataset, i, t)
if v is None:
continue
if v < bestVar:
bestVar, feature, thred = v, i, t
return feature, thred
def create_decision_tree(dataset):
dataset = np.array(dataset)
if dataset.shape[0] < MINIMUM_SAMPLES:
return node_mean(dataset)
fidx, th = choose_feature_to_split(dataset)
if fidx is None:
return th
node = {}
node['feature'] = fidx
node['threshold'] = th
split_data, vals = split_dataset(dataset, fidx, th)
node['left'] = create_decision_tree(split_data[0])
node['right'] = create_decision_tree(split_data[1])
return node
def classify(node, data):
if 'threshold' not in node:
return node
feature = node['feature']
threshold = node['threshold']
if data[feature] < threshold:
return classify(node['left'], data)
else:
return classify(node['right'], data)
def predict(node, X):
y_pred = []
for x in X:
y = classify(node, x)
y_pred.append(y)
return np.array(y_pred)
dataset = pd.read_excel(r'C:\Users\指针\Desktop\低空经济\data.xlsx')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_data = np.hstack((X_train, y_train.reshape(-1, 1)))
tree = create_decision_tree(train_data)
y_pred = predict(tree, X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print('MSE:', mse)
print('R-Squared:', r2)
这个修正后的代码应该能够正确构建回归树并进行预测和评估了。希望能对您有所帮助!
以上回复来自 FishC AI 小助理,如未能正确解答您的问题,请继续追问。
|
|