Python实现kNN,Python交流,编程语言专区,鱼C论坛

糖逗发表于 2020-11-3 21:40:46

Python实现kNN

本帖最后由糖逗于 2020-11-4 12:54 编辑

参考书籍《机器学习实战》
本地测试环境：Python 3.7

import numpy as np
import operator

class kNN():
    """k-nearest-neighbours classifier for a single query point.

    Computes the Euclidean distance from the query to every training row and
    lets the ``k`` closest rows vote on the class label.
    """

    def __init__(self, inX, dataSet, labels, k):
        """Store the query point and the training data.

        Args:
            inX: feature vector to classify; one value per column of
                ``dataSet``.
            dataSet: 2-D NumPy array with one training sample per row.
            labels: sequence of class labels, one per row of ``dataSet``.
            k: number of nearest neighbours that take part in the vote.
        """
        self.inX = inX
        self.dataSet = dataSet
        self.labels = labels
        self.k = k

    def classify0(self):
        """Print and return the neighbour votes, most frequent label first.

        Returns:
            A list of ``(label, votes)`` tuples sorted by descending vote
            count. The label of the first tuple is the prediction. Labels
            with equal votes keep the order in which they were first seen,
            i.e. the label of the closer neighbour comes first.
        """
        dataSetSize = self.dataSet.shape[0]
        # np.tile repeats the query once per training row so the subtraction
        # below is row-by-row.
        # https://blog.csdn.net/laobai1015/article/details/85719724
        diffMat = np.tile(self.inX, (dataSetSize, 1)) - self.dataSet
        sqDiffMat = diffMat ** 2
        sqDistances = sqDiffMat.sum(axis=1)
        distances = sqDistances ** 0.5
        # Row indices ordered from nearest to farthest.
        sortedDistIndicies = distances.argsort()
        classCount = {}
        # Slicing (instead of range(k)) keeps this safe when k exceeds the
        # number of training rows.
        for neighbourIndex in sortedDistIndicies[:self.k]:
            voteIlabel = self.labels[neighbourIndex]
            # https://blog.csdn.net/weixin_45683963/article/details/103898093
            classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
        # https://www.runoob.com/python3/python3-func-sorted.html
        sortedClassCount = sorted(classCount.items(),
                                  # https://www.cnblogs.com/zhoufankui/p/6274172.html
                                  key=operator.itemgetter(1),
                                  reverse=True)  # reverse=True: descending
        print(sortedClassCount)
        return sortedClassCount

if __name__ == '__main__':
    # Demo: four 2-D training samples and their class labels.
    # NOTE(review): the numeric values were lost in the page extraction; these
    # are the sample points from the book's createDataSet() -- confirm.
    group = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])  # features
    labels = ['A', 'A', 'B', 'B']  # class labels
    kNN([0, 0], group, labels, 3).classify0()  # [0, 0] is the point to classify

import numpy as np
import operator

class kNN():
    """kNN classifier for a tab-separated data file with three features.

    Each line of the file holds three numeric feature columns followed by an
    integer class label in the last column.
    """

    def __init__(self, filename, k):
        """Remember the data file and the neighbour count.

        Args:
            filename: path of the tab-separated data file.
            k: number of nearest neighbours that take part in the vote.
        """
        self.filename = filename
        self.k = k

    # Load the data
    @staticmethod
    def file2matrix(filename):
        """Parse the data file into a feature matrix and a label list.

        Args:
            filename: path of the tab-separated data file.

        Returns:
            ``(returnMat, classLabelVector)``: an ``(n, 3)`` float array built
            from the first three columns, and a list of ``n`` int labels taken
            from the last column. Blank lines are skipped.
        """
        with open(filename) as fr:
            arrayOLines = [line.strip() for line in fr if line.strip()]
        returnMat = np.zeros((len(arrayOLines), 3))
        classLabelVector = []
        for index, line in enumerate(arrayOLines):
            listFromLine = line.split('\t')
            returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
            classLabelVector.append(int(listFromLine[-1]))
        return returnMat, classLabelVector

    # Normalisation
    @staticmethod
    def autoNorm(dataSet):
        """Rescale every column with ``(value - min) / (max - min)``.

        Args:
            dataSet: 2-D NumPy array with one sample per row.

        Returns:
            ``(normDataSet, ranges, minVals)``: the rescaled array plus the
            per-column range and minimum, which are needed to rescale new
            samples the same way.
        """
        minVals = dataSet.min(0)
        maxVals = dataSet.max(0)
        ranges = maxVals - minVals
        m = dataSet.shape[0]
        normDataSet = dataSet - np.tile(minVals, (m, 1))
        # NOTE(review): a column whose values are all equal has range 0 and
        # divides by zero here -- confirm whether the data can contain one.
        normDataSet = normDataSet / np.tile(ranges, (m, 1))
        return normDataSet, ranges, minVals

    # kNN
    def classify0(self, inX, dataSet, labels):
        """Return the majority label among the ``self.k`` nearest rows.

        Args:
            inX: feature vector to classify.
            dataSet: 2-D NumPy array with one training sample per row.
            labels: sequence of class labels, one per row of ``dataSet``.

        Returns:
            The label with the most votes; on a tie, the label that was seen
            first while walking from the nearest neighbour outwards.
        """
        dataSetSize = dataSet.shape[0]
        diffMat = np.tile(inX, (dataSetSize, 1)) - dataSet
        sqDiffMat = diffMat ** 2
        sqDistances = sqDiffMat.sum(axis=1)
        distances = sqDistances ** 0.5
        # Row indices ordered from nearest to farthest.
        sortedDistIndicies = distances.argsort()
        classCount = {}
        # Slicing keeps this safe when k exceeds the number of training rows.
        for neighbourIndex in sortedDistIndicies[:self.k]:
            voteIlabel = labels[neighbourIndex]
            classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
        sortedClassCount = sorted(classCount.items(),
                                  key=operator.itemgetter(1),
                                  reverse=True)  # reverse=True: descending
        return sortedClassCount[0][0]

    # Measure accuracy
    def datingClassTest(self, hoRatio):
        """Hold out the first ``hoRatio`` share of rows and report the errors.

        The held-out rows are classified against the remaining rows; each
        prediction and the final error rate are printed.

        Args:
            hoRatio: fraction of the rows (taken from the top of the file)
                used as test samples, e.g. ``0.10`` for 10%.

        Returns:
            The error rate as a float; ``0.0`` when no rows are held out.
        """
        datingDataMat, datingLabels = kNN.file2matrix(self.filename)
        normMat, ranges, minVals = kNN.autoNorm(datingDataMat)
        m = normMat.shape[0]
        numTestVecs = int(m * hoRatio)
        errorCount = 0.0
        for i in range(numTestVecs):
            # Rows [0, numTestVecs) are test samples, the rest is training data.
            classifierResult = self.classify0(normMat[i, :],
                                              normMat[numTestVecs:m, :],
                                              datingLabels[numTestVecs:m])
            print("the classifier came back with: %d, the real answer is %d"
                  % (classifierResult, datingLabels[i]))
            if classifierResult != datingLabels[i]:
                errorCount += 1
        # Avoid ZeroDivisionError when hoRatio selects no test rows.
        errorRate = errorCount / float(numTestVecs) if numTestVecs else 0.0
        print("the total error rate is: %f" % errorRate)
        return errorRate

    # Prediction
    def classifyPerson(self):
        """Ask for three feature values on the console and print a prediction."""
        resultList = ["not at all", "in small doses", "in large doses"]
        # input() reads the value typed on the console.
        percentTats = float(input("percentage of time spent playing video games?"))
        ffMiles = float(input("frequent filer miles earned per year?"))
        iceCream = float(input("liters of ice cream consumed per year?"))
        datingDataMat, datingLabels = kNN.file2matrix(self.filename)
        normMat, ranges, minVals = kNN.autoNorm(datingDataMat)
        # NOTE(review): the array contents were lost in the page extraction;
        # this column order follows the book's listing -- confirm it matches
        # the column order of the data file.
        inArr = np.array([ffMiles, percentTats, iceCream])
        classifierResult = self.classify0((inArr - minVals) / ranges,
                                          normMat, datingLabels)
        # Class labels are 1, 2, 3 while list indices start at 0, hence the -1.
        print("You will probably like this person:",
              resultList[classifierResult - 1])

if __name__ == '__main__':
    # Evaluate on a 10% hold-out of the data file, then classify one person
    # from values typed on the console.
    path = 'datingTestSet2.txt'
    test1 = kNN(path, 3)
    test1.datingClassTest(0.1)
    test1.classifyPerson()

import numpy as np
import operator
import os

class kNN():
    """kNN classifier for 32x32 text images of handwritten digits.

    Image files are named ``<digit>_<n>.txt`` and live in the ``testDigits``
    directory below ``path``.
    """

    def __init__(self, path):
        """Remember the directory that contains the ``testDigits`` folder."""
        self.path = path

    # Flatten the image matrix into a one-dimensional vector
    @staticmethod
    def img2vector(filname):
        """Read a 32x32 grid of digit characters into a ``(1, 1024)`` array.

        Args:
            filname: path of a text file whose first 32 lines each start with
                32 single-digit characters.

        Returns:
            A ``(1, 1024)`` float array; the character at row ``i``, column
            ``j`` is stored at index ``32 * i + j``.
        """
        returnVect = np.zeros((1, 1024))
        with open(filname) as fr:
            for i in range(32):
                lineStr = fr.readline()
                for j in range(32):
                    returnVect[0, 32 * i + j] = int(lineStr[j])
        return returnVect

    @staticmethod
    def classify0(inX, dataSet, labels, k):
        """Return the majority label among the ``k`` nearest rows.

        Args:
            inX: feature vector to classify.
            dataSet: 2-D NumPy array with one training sample per row.
            labels: sequence of class labels, one per row of ``dataSet``.
            k: number of nearest neighbours that take part in the vote.

        Returns:
            The label with the most votes; on a tie, the label that was seen
            first while walking from the nearest neighbour outwards.
        """
        dataSetSize = dataSet.shape[0]
        diffMat = np.tile(inX, (dataSetSize, 1)) - dataSet
        sqDiffMat = diffMat ** 2
        sqDistances = sqDiffMat.sum(axis=1)
        distances = sqDistances ** 0.5
        # Row indices ordered from nearest to farthest.
        sortedDistIndicies = distances.argsort()
        classCount = {}
        # Slicing keeps this safe when k exceeds the number of training rows.
        for neighbourIndex in sortedDistIndicies[:k]:
            voteIlabel = labels[neighbourIndex]
            classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
        sortedClassCount = sorted(classCount.items(),
                                  key=operator.itemgetter(1),
                                  reverse=True)  # reverse=True: descending
        return sortedClassCount[0][0]

    # Train on the whole data set, then evaluate on the same whole data set
    def handwritingClassTest(self):
        """Classify every image in ``testDigits`` against that same folder.

        Prints each prediction, the number of errors and the error rate.

        Returns:
            The error rate as a float; ``0.0`` when the folder is empty.
        """
        # os.path.join works whether or not self.path ends with a separator
        # and avoids the Windows-only "\" that one of the two paths used.
        digitsDir = os.path.join(self.path, "testDigits")
        hwLabels = []
        # All file names in the directory.
        trainingFileList = os.listdir(digitsDir)
        m = len(trainingFileList)
        trainingMat = np.zeros((m, 1024))
        for i, fileNameStr in enumerate(trainingFileList):
            # "<digit>_<n>.txt" -> "<digit>_<n>" -> <digit>
            fileStr = fileNameStr.split('.')[0]
            classNumStr = int(fileStr.split('_')[0])
            hwLabels.append(classNumStr)
            trainingMat[i, :] = kNN.img2vector(os.path.join(digitsDir, fileNameStr))
        testFileList = os.listdir(digitsDir)
        errorCount = 0
        mTest = len(testFileList)
        for fileNameStr in testFileList:
            fileStr = fileNameStr.split('.')[0]
            classNumStr = int(fileStr.split('_')[0])
            vectorUnderTest = kNN.img2vector(os.path.join(digitsDir, fileNameStr))
            classifierResult = kNN.classify0(vectorUnderTest, trainingMat, hwLabels, 3)
            print("the classifier came back with:%d, the real answer if :%d" % (classifierResult, classNumStr))
            if classifierResult != classNumStr:
                errorCount += 1
        print("\nthe total numner of errors is: %d" % errorCount)
        # Avoid ZeroDivisionError when the directory holds no files.
        errorRate = errorCount / float(mTest) if mTest else 0.0
        print("\nthe total error rate is %f" % errorRate)
        return errorRate

if __name__ == '__main__':
    # Directory that contains the "testDigits" folder of the book's data set.
    path = "C:/.../machinelearninginaction/Ch02/digits/"
    kNN(path).handwritingClassTest()

糖逗发表于 2020-11-4 10:55:46

每个代码框都是独立的，可以单独运行。

糖逗发表于 2020-11-4 12:55:50

该书数据代码配套网站：https://www.manning.com/books/machine-learning-in-action

页: [1]

鱼C论坛's Archiver

Python实现kNN