|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
数据集一共有3种花,每种花有50个样本。目前的划分方式是:每种花的前30个样本作为训练集,后20个样本作为测试集。
test1 要求从这30个训练样本中随机挑选20个作为训练集来构建 KNN,这个过程重复20次,并且这20次抽到的训练样本不能完全相同。
请问这个过程该如何实现?
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.spatial.distance import cdist
def SplitData_1(data):
    """Split a three-class data set into shuffled training rows and fixed test rows.

    For each class label 0, 1 and 2 (read from the last column of ``data``),
    the rows at positions 0-29 within that class become training rows, in a
    freshly shuffled order on every call, and the rows at positions 30-49
    become test rows in their original order.

    Args:
        data: ``pandas.DataFrame`` whose last column holds the numeric class
            label. Every class needs at least 50 rows, otherwise the
            positional indexing raises ``IndexError``.

    Returns:
        Tuple ``(Xtrain, Xtest, ytrain, ytest)`` of NumPy arrays: features
        are all columns but the last, labels are the last column.
    """
    labels = data.iloc[:, -1]
    positions = np.arange(50)
    train_pool, test_pos = positions[:30], positions[30:]

    train_parts, test_parts = [], []
    for label in range(3):
        # Shuffle the training positions so every call yields a new row order.
        shuffled = np.random.permutation(train_pool)
        class_rows = data.loc[labels == label, :]
        train_parts.append(class_rows.iloc[shuffled, :])
        test_parts.append(class_rows.iloc[test_pos, :])

    # ``axis`` is passed by keyword: pandas 2.x accepts only ``objs``
    # positionally, so ``pd.concat(parts, 0)`` raises TypeError there.
    traindata = pd.concat(train_parts, axis=0)
    testdata = pd.concat(test_parts, axis=0)

    Xtrain = traindata.iloc[:, :-1]
    ytrain = traindata.iloc[:, -1]
    Xtest = testdata.iloc[:, :-1]
    ytest = testdata.iloc[:, -1]
    return Xtrain.values, Xtest.values, ytrain.values, ytest.values
def SplitData_2(data, n_train=20):
    """Split a three-class data set, training on a random subset of each class.

    For each class label 0, 1 and 2 (read from the last column of ``data``),
    ``n_train`` rows are drawn at random, without replacement, from the rows
    at positions 0-29 within that class; the rows at positions 30-49 become
    test rows in their original order.

    Args:
        data: ``pandas.DataFrame`` whose last column holds the numeric class
            label. Every class needs at least 50 rows, otherwise the
            positional indexing raises ``IndexError``.
        n_train: Number of training rows drawn per class, between 1 and 30.
            Defaults to 20.

    Returns:
        Tuple ``(Xtrain, Xtest, ytrain, ytest)`` of NumPy arrays: features
        are all columns but the last, labels are the last column.

    Raises:
        ValueError: If ``n_train`` is outside the range 1..30.
    """
    if not 1 <= n_train <= 30:
        raise ValueError("n_train must be between 1 and 30, got %r" % (n_train,))

    labels = data.iloc[:, -1]
    positions = np.arange(50)
    train_pool, test_pos = positions[:30], positions[30:]

    train_parts, test_parts = [], []
    for label in range(3):
        # A prefix of a random permutation is a random subset without repeats.
        chosen = np.random.permutation(train_pool)[:n_train]
        class_rows = data.loc[labels == label, :]
        train_parts.append(class_rows.iloc[chosen, :])
        test_parts.append(class_rows.iloc[test_pos, :])

    # ``axis`` is passed by keyword: pandas 2.x accepts only ``objs``
    # positionally, so ``pd.concat(parts, 0)`` raises TypeError there.
    traindata = pd.concat(train_parts, axis=0)
    testdata = pd.concat(test_parts, axis=0)

    Xtrain = traindata.iloc[:, :-1]
    ytrain = traindata.iloc[:, -1]
    Xtest = testdata.iloc[:, :-1]
    ytest = testdata.iloc[:, -1]
    return Xtrain.values, Xtest.values, ytrain.values, ytest.values
def SplitData_3(data, n_train=25):
    """Split a three-class data set, training on a random subset of each class.

    For each class label 0, 1 and 2 (read from the last column of ``data``),
    ``n_train`` rows are drawn at random, without replacement, from the rows
    at positions 0-29 within that class; the rows at positions 30-49 become
    test rows in their original order.

    Args:
        data: ``pandas.DataFrame`` whose last column holds the numeric class
            label. Every class needs at least 50 rows, otherwise the
            positional indexing raises ``IndexError``.
        n_train: Number of training rows drawn per class, between 1 and 30.
            Defaults to 25.

    Returns:
        Tuple ``(Xtrain, Xtest, ytrain, ytest)`` of NumPy arrays: features
        are all columns but the last, labels are the last column.

    Raises:
        ValueError: If ``n_train`` is outside the range 1..30.
    """
    if not 1 <= n_train <= 30:
        raise ValueError("n_train must be between 1 and 30, got %r" % (n_train,))

    labels = data.iloc[:, -1]
    positions = np.arange(50)
    train_pool, test_pos = positions[:30], positions[30:]

    train_parts, test_parts = [], []
    for label in range(3):
        # A prefix of a random permutation is a random subset without repeats.
        chosen = np.random.permutation(train_pool)[:n_train]
        class_rows = data.loc[labels == label, :]
        train_parts.append(class_rows.iloc[chosen, :])
        test_parts.append(class_rows.iloc[test_pos, :])

    # ``axis`` is passed by keyword: pandas 2.x accepts only ``objs``
    # positionally, so ``pd.concat(parts, 0)`` raises TypeError there.
    traindata = pd.concat(train_parts, axis=0)
    testdata = pd.concat(test_parts, axis=0)

    Xtrain = traindata.iloc[:, :-1]
    ytrain = traindata.iloc[:, -1]
    Xtest = testdata.iloc[:, :-1]
    ytest = testdata.iloc[:, -1]
    return Xtrain.values, Xtest.values, ytrain.values, ytest.values
class KNN(object):
    """Brute-force k-nearest-neighbours classifier with majority voting."""

    def __init__(self, k):
        """Store ``k``, the number of neighbours consulted per prediction."""
        self.k = k

    def fit(self, Xtrain, ytrain):
        """Remember the training samples and labels; return ``self`` for chaining."""
        self.Xtrain = Xtrain
        self.ytrain = ytrain
        return self

    def predict(self, Xtest):
        """Predict a label for every row of ``Xtest``.

        Args:
            Xtest: 2-D array with one sample per row.

        Returns:
            Float array of shape ``(N, 1)`` holding the most frequent label
            among the ``k`` training samples closest to each test row. When
            several labels tie for most frequent, the smallest one is used.
        """
        n_test = Xtest.shape[0]
        y_pred = np.zeros([n_test, 1])
        if n_test == 0:
            return y_pred

        ytrain = np.asarray(self.ytrain)
        # One distance matrix for all test rows: entry (i, j) is the distance
        # (cdist's default metric) from test row i to training row j.
        dists = cdist(Xtest, self.Xtrain)
        # Stable sort so equally distant training rows keep their original
        # order, which makes the neighbour choice deterministic.
        nearest = np.argsort(dists, axis=1, kind="stable")[:, :self.k]

        for row, neighbour_idx in enumerate(nearest):
            labels, counts = np.unique(ytrain[neighbour_idx], return_counts=True)
            # np.unique returns sorted labels and argmax takes the first
            # maximum, so a tied vote resolves to the smallest label instead
            # of failing on a multi-valued mode.
            y_pred[row] = labels[np.argmax(counts)]
        return y_pred
# %% Load and split data
# The file has no header row; the last column holds the species name.
data = pd.read_csv('iris.data', header=None)
class_map = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
# Replace the whole label column instead of assigning through ``.iloc``:
# on pandas 2.x an ``.iloc`` assignment writes into the existing object-dtype
# column, so the labels would stay ``object`` rather than becoming integers.
data[data.columns[-1]] = data.iloc[:, -1].map(class_map)
# %% KNN
from sklearn.metrics import accuracy_score

# One accuracy slot per repetition (20 runs) for each of the three experiments.
acc_knn_1 = np.zeros(20)
acc_knn_2 = np.zeros(20)
acc_knn_3 = np.zeros(20)

# Baseline experiment: split with SplitData_1, fit a 3-NN and score it.
for i in range(20):
    Xtrain, Xtest, ytrain, ytest = SplitData_1(data)
    knn = KNN(k=3).fit(Xtrain, ytrain)
    ypred = knn.predict(Xtest)
    # accuracy_score is documented as (y_true, y_pred); predict returns an
    # (N, 1) column, so flatten it to match the 1-D label vector.
    acc_knn_1[i] = accuracy_score(ytest, ypred.ravel())
print('KNN Result:')
print(acc_knn_1)
print('Acc mean:', np.mean(acc_knn_1))
print('Acc std:', np.std(acc_knn_1))
# %% Test1
# Repeat the experiment 20 times; each run draws a new split via SplitData_2.
for j in range(20):
    Xtrain, Xtest, ytrain, ytest = SplitData_2(data)
    knn = KNN(k=3).fit(Xtrain, ytrain)
    ypred = knn.predict(Xtest)
    # accuracy_score is documented as (y_true, y_pred); predict returns an
    # (N, 1) column, so flatten it to match the 1-D label vector.
    acc_knn_2[j] = accuracy_score(ytest, ypred.ravel())
print('KNN Result test1:')
print(acc_knn_2)
print('Acc mean:', np.mean(acc_knn_2))
print('Acc std:', np.std(acc_knn_2))
# %% Test2
# Repeat the experiment 20 times; each run draws a new split via SplitData_3.
# NOTE: the loop index ``k`` is only the run counter; the neighbour count is
# the ``k=3`` passed to KNN.
for k in range(20):
    Xtrain, Xtest, ytrain, ytest = SplitData_3(data)
    knn = KNN(k=3).fit(Xtrain, ytrain)
    ypred = knn.predict(Xtest)
    # accuracy_score is documented as (y_true, y_pred); predict returns an
    # (N, 1) column, so flatten it to match the 1-D label vector.
    acc_knn_3[k] = accuracy_score(ytest, ypred.ravel())
print('KNN Result test2:')
print(acc_knn_3)
print('Acc mean:', np.mean(acc_knn_3))
print('Acc std:', np.std(acc_knn_3))
本帖最后由 qq1151985918 于 2021-6-3 15:40 编辑
参考代码:
from random import sample
# Collect 20 pairwise-different random subsets, each holding 20 of the
# positions 0..29, then print them.
data_rand_history = []
while True:
    candidate = set(sample(range(30), 20))
    # Discard a draw that repeats a subset kept earlier.
    if any(candidate == earlier for earlier in data_rand_history):
        continue
    data_rand_history.append(candidate)
    if len(data_rand_history) == 20:
        break
print(data_rand_history)
|
|