乌克大喵喵 发表于 2021-6-3 14:47:19

求助:怎么实现从固定的数组中随机抽取几个数,并把这个过程重复若干次,且各次抽取的结果不能重复?


一共有3种花,每种花有50条数据。目前的划分方式是:每种花的前30条为训练组,后20条为测试组。
test1 是从这30条里随机挑选20条作为训练组,然后用它训练 KNN;这个过程重复20次,但要求这20次抽到的训练数据不能完全相同。
这个过程该如何实现?


import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.spatial.distance import cdist


def SplitData_1(data):
    """Split the data per class into a shuffled training part and a fixed test part.

    For each class label 0, 1 and 2 (read from the last column of ``data``)
    the first 30 rows of that class become training rows, in random order,
    and rows 30-49 of that class become test rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns followed by one label column containing 0, 1 or 2.
        Rows of a class beyond its 50th are ignored.

    Returns
    -------
    tuple of numpy.ndarray
        ``(Xtrain, Xtest, ytrain, ytest)``; the ``X`` arrays hold every
        column except the last, the ``y`` arrays hold the last column.

    Raises
    ------
    IndexError
        If a class has fewer than 50 rows.
    """
    labels = data.iloc[:, -1]
    traindata, testdata = [], []
    for i in range(3):
        idx = np.arange(50)
        trainidx, testidx = idx[:30], idx[30:]
        trainidx = np.random.permutation(trainidx)
        # Positions in trainidx/testidx are relative to the rows of class i.
        datai = data.loc[labels == i, :]
        traindata.append(datai.iloc[trainidx])
        testdata.append(datai.iloc[testidx])

    # ``axis`` must be passed by keyword: pandas 2.x rejects it positionally.
    traindata = pd.concat(traindata, axis=0)
    testdata = pd.concat(testdata, axis=0)

    Xtrain = traindata.iloc[:, :-1]
    ytrain = traindata.iloc[:, -1]
    Xtest = testdata.iloc[:, :-1]
    ytest = testdata.iloc[:, -1]

    return Xtrain.values, Xtest.values, ytrain.values, ytest.values

def SplitData_2(data, seen=None):
    """Split the data per class, training on 20 random rows out of the first 30.

    For each class label 0, 1 and 2 (read from the last column of ``data``)
    20 of the first 30 rows of that class are drawn at random as training
    rows, and rows 30-49 of that class become test rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns followed by one label column containing 0, 1 or 2.
        Rows of a class beyond its 50th are ignored.
    seen : set, optional
        Record of selections already used. When given, the draw is repeated
        until it produces a selection that is not in ``seen``, and the new
        selection is added to it. Passing the same set to every call
        guarantees that no two calls train on the same rows. When omitted,
        a single unconstrained draw is made.

    Returns
    -------
    tuple of numpy.ndarray
        ``(Xtrain, Xtest, ytrain, ytest)``; the ``X`` arrays hold every
        column except the last, the ``y`` arrays hold the last column.

    Raises
    ------
    IndexError
        If a class has fewer than 50 rows.
    """
    labels = data.iloc[:, -1]
    idx = np.arange(50)
    pool, testidx = idx[:30], idx[30:]

    while True:
        # One independent draw per class, in class order.
        picks = [np.random.permutation(pool)[:20] for _ in range(3)]
        # Order inside a class does not matter for "same training data",
        # so the key is built from the sorted positions.
        key = tuple(tuple(sorted(int(j) for j in pick)) for pick in picks)
        if seen is None or key not in seen:
            break
    if seen is not None:
        seen.add(key)

    traindata, testdata = [], []
    for i, trainidx in enumerate(picks):
        # Positions in trainidx/testidx are relative to the rows of class i.
        datai = data.loc[labels == i, :]
        traindata.append(datai.iloc[trainidx])
        testdata.append(datai.iloc[testidx])

    # ``axis`` must be passed by keyword: pandas 2.x rejects it positionally.
    traindata = pd.concat(traindata, axis=0)
    testdata = pd.concat(testdata, axis=0)

    Xtrain = traindata.iloc[:, :-1]
    ytrain = traindata.iloc[:, -1]
    Xtest = testdata.iloc[:, :-1]
    ytest = testdata.iloc[:, -1]

    return Xtrain.values, Xtest.values, ytrain.values, ytest.values

def SplitData_3(data, seen=None):
    """Split the data per class, training on 25 random rows out of the first 30.

    For each class label 0, 1 and 2 (read from the last column of ``data``)
    25 of the first 30 rows of that class are drawn at random as training
    rows, and rows 30-49 of that class become test rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns followed by one label column containing 0, 1 or 2.
        Rows of a class beyond its 50th are ignored.
    seen : set, optional
        Record of selections already used. When given, the draw is repeated
        until it produces a selection that is not in ``seen``, and the new
        selection is added to it. Passing the same set to every call
        guarantees that no two calls train on the same rows. When omitted,
        a single unconstrained draw is made.

    Returns
    -------
    tuple of numpy.ndarray
        ``(Xtrain, Xtest, ytrain, ytest)``; the ``X`` arrays hold every
        column except the last, the ``y`` arrays hold the last column.

    Raises
    ------
    IndexError
        If a class has fewer than 50 rows.
    """
    labels = data.iloc[:, -1]
    idx = np.arange(50)
    pool, testidx = idx[:30], idx[30:]

    while True:
        # One independent draw per class, in class order.
        picks = [np.random.permutation(pool)[:25] for _ in range(3)]
        # Order inside a class does not matter for "same training data",
        # so the key is built from the sorted positions.
        key = tuple(tuple(sorted(int(j) for j in pick)) for pick in picks)
        if seen is None or key not in seen:
            break
    if seen is not None:
        seen.add(key)

    traindata, testdata = [], []
    for i, trainidx in enumerate(picks):
        # Positions in trainidx/testidx are relative to the rows of class i.
        datai = data.loc[labels == i, :]
        traindata.append(datai.iloc[trainidx])
        testdata.append(datai.iloc[testidx])

    # ``axis`` must be passed by keyword: pandas 2.x rejects it positionally.
    traindata = pd.concat(traindata, axis=0)
    testdata = pd.concat(testdata, axis=0)

    Xtrain = traindata.iloc[:, :-1]
    ytrain = traindata.iloc[:, -1]
    Xtest = testdata.iloc[:, :-1]
    ytest = testdata.iloc[:, -1]

    return Xtrain.values, Xtest.values, ytrain.values, ytest.values


class KNN(object):
    """k-nearest-neighbours classifier using ``scipy`` ``cdist`` distances.

    A test sample is assigned the most frequent label among its ``k``
    closest training samples. When several labels are equally frequent the
    smallest one is chosen.
    """

    def __init__(self, k):
        """Store the number of neighbours ``k`` (must be at least 1)."""
        if k < 1:
            raise ValueError("k must be at least 1")
        self.k = k

    def fit(self, Xtrain, ytrain):
        """Remember the training samples and their labels; return ``self``.

        Raises ``ValueError`` when the training set is empty or when the
        number of samples and labels differ.
        """
        if len(Xtrain) == 0:
            raise ValueError("Xtrain must contain at least one sample")
        if len(Xtrain) != len(ytrain):
            raise ValueError("Xtrain and ytrain must have the same length")
        self.Xtrain = Xtrain
        self.ytrain = ytrain

        return self

    def predict(self, Xtest):
        """Predict one label per row of ``Xtest``.

        Parameters
        ----------
        Xtest : array-like, two-dimensional
            One sample per row, with the same number of columns as the
            training samples.

        Returns
        -------
        numpy.ndarray
            One predicted label per row of ``Xtest``.
        """
        k = self.k
        Xtrain = np.asarray(self.Xtrain, dtype=float)
        ytrain = np.asarray(self.ytrain)
        Xtest = np.asarray(Xtest, dtype=float)

        # Distance from every test sample (rows) to every training sample
        # (columns), computed in a single call.
        dists = cdist(Xtest, Xtrain)
        # Sort each row ascending and keep the positions of the k closest
        # training samples.
        nearest = np.argsort(dists, axis=1)[:, :k]

        y_pred = []
        for neighbours in nearest:
            # np.unique returns the labels sorted, and argmax returns the
            # first maximum, so ties resolve to the smallest label.
            values, counts = np.unique(ytrain[neighbours], return_counts=True)
            y_pred.append(values[np.argmax(counts)])

        return np.array(y_pred)



# %% Load and split data
data = pd.read_csv('iris.data', header=None)
class_map = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
# Replace the label column as a whole instead of assigning through .iloc:
# an .iloc assignment may write into the existing string column and leave
# the numeric labels stored with object dtype.
data[data.columns[-1]] = data.iloc[:, -1].map(class_map)

# %% KNN
from sklearn.metrics import accuracy_score

N_RUNS = 20  # number of repetitions per experiment


def run_experiment(split, n_runs=N_RUNS, k=3):
    """Repeat split -> fit -> predict ``n_runs`` times and collect accuracies.

    ``split`` is called with the module-level ``data`` and must return
    ``(Xtrain, Xtest, ytrain, ytest)``. Returns an array holding one
    accuracy value per run.
    """
    acc = np.zeros(n_runs)
    for run in range(n_runs):
        Xtrain, Xtest, ytrain, ytest = split(data)
        knn = KNN(k=k).fit(Xtrain, ytrain)
        ypred = knn.predict(Xtest)
        # Store the result of this run in its own slot; assigning to the
        # whole array would keep only the last run.
        acc[run] = accuracy_score(ytest, ypred)
    return acc


acc_knn_1 = run_experiment(SplitData_1)
print('KNN Result:')
print(acc_knn_1)
print('Acc mean:', np.mean(acc_knn_1))
print('Acc std:', np.std(acc_knn_1))


# %% Test1
acc_knn_2 = run_experiment(SplitData_2)
print('KNN Result test1:')
print(acc_knn_2)
print('Acc mean:', np.mean(acc_knn_2))
print('Acc std:', np.std(acc_knn_2))


# %% Test2
acc_knn_3 = run_experiment(SplitData_3)
print('KNN Result test2:')
print(acc_knn_3)
print('Acc mean:', np.mean(acc_knn_3))
print('Acc std:', np.std(acc_knn_3))

qq1151985918 发表于 2021-6-3 15:24:43

本帖最后由 qq1151985918 于 2021-6-3 15:40 编辑

参考代码
from random import sample

# Collect 20 distinct 20-element subsets of range(30).
data_rand_history = []
while True:
    if len(data_rand_history) >= 20:
        break
    rand = set(sample(range(30), 20))
    # A set ignores draw order, so the same numbers drawn in a different
    # order count as a repeat and are skipped.
    if rand in data_rand_history:
        continue
    data_rand_history.append(rand)
print(data_rand_history)

乌克大喵喵 发表于 2021-6-4 14:06:10

在SplitData_2中加了以下条件
trainidx = sorted(trainidx)
    if trainidx not in data_rand_history2:
      data_rand_history2.append(trainidx)
    else:
      SplitData_2(data)
页: [1]
查看完整版本: 求助,怎么实现在固定的数组中随机抽取几个数,重复这个过程几次,这几次不能重复