|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 糖逗 于 2020-11-4 12:54 编辑
参考书籍《机器学习实战》
本地测试环境:python3.7
- import numpy as np
- import operator
class kNN():
    """k-nearest-neighbours classifier for one query point.

    Attributes:
        inX: feature vector to classify (sequence or 1-D array).
        dataSet: 2-D NumPy array of training samples, one row per sample.
        labels: class label for each row of ``dataSet``.
        k: number of nearest neighbours that take part in the vote.
    """

    def __init__(self, inX, dataSet, labels, k):
        self.inX = inX
        self.dataSet = dataSet
        self.labels = labels
        self.k = k

    def classify0(self):
        """Print and return the majority label among the k nearest samples.

        Distances are Euclidean.  When two labels receive the same number
        of votes, the one whose neighbour is closest wins, because the
        sort below is stable and votes are recorded nearest-first.
        """
        # The original subtracted a bare ``dataSet`` name here, which is
        # not defined anywhere and raised NameError; the training data is
        # stored on the instance.  Broadcasting replaces the explicit
        # np.tile copy of the query vector.
        diffMat = np.asarray(self.inX) - self.dataSet
        distances = ((diffMat ** 2).sum(axis=1)) ** 0.5
        sortedDistIndicies = distances.argsort()

        # Tally one vote per neighbour, nearest first.
        classCount = {}
        for i in range(self.k):
            voteIlabel = self.labels[sortedDistIndicies[i]]
            classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

        # Sort (label, votes) pairs by vote count, highest first.
        sortedClassCount = sorted(classCount.items(),
                                  key=operator.itemgetter(1),
                                  reverse=True)
        winner = sortedClassCount[0][0]
        print(winner)
        return winner
-
if __name__ == '__main__':
    # Training features: two samples near (1, 1) and two near the origin.
    group = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    # Class label of each training row.
    labels = ['A', 'A', 'B', 'B']
    # [0, 0] is the feature vector to classify; 3 neighbours vote.
    classifier = kNN([0, 0], group, labels, 3)
    classifier.classify0()
复制代码
- import numpy as np
- import operator
class kNN():
    """k-nearest-neighbours classifier for a tab-separated data file.

    Each line of the file holds three numeric features followed by an
    integer class label as its last field.

    Attributes:
        filename: path of the data file.
        k: number of nearest neighbours that take part in the vote.
    """

    def __init__(self, filename, k):
        self.filename = filename
        self.k = k

    @staticmethod
    def file2matrix(filename):
        """Load the data file into a feature matrix and a label list.

        Declared static so that it works both as ``kNN.file2matrix(f)``
        and on an instance.  Blank lines are skipped.

        Returns:
            (returnMat, classLabelVector): an ``(n, 3)`` float array built
            from the first three fields of every line, and the list of
            integer labels taken from the last field.
        """
        # The context manager closes the file even if parsing fails.
        with open(filename) as fr:
            rows = [line.strip() for line in fr]
        rows = [row for row in rows if row]

        returnMat = np.zeros((len(rows), 3))
        classLabelVector = []
        for index, row in enumerate(rows):
            listFromLine = row.split('\t')
            returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
            classLabelVector.append(int(listFromLine[-1]))
        return returnMat, classLabelVector

    @staticmethod
    def autoNorm(dataSet):
        """Rescale every column with ``(value - min) / (max - min)``.

        Returns:
            (normDataSet, ranges, minVals): the rescaled data together
            with the per-column range and minimum that were used, so a
            new sample can be rescaled the same way.

        Note: a column whose values are all equal has a zero range, so
        the division is undefined for that column.
        """
        minVals = dataSet.min(0)
        maxVals = dataSet.max(0)
        ranges = maxVals - minVals
        # Broadcasting applies the per-column vectors to every row.
        normDataSet = (dataSet - minVals) / ranges
        return normDataSet, ranges, minVals

    def classify0(self, inX, dataSet, labels):
        """Return the majority label among the ``self.k`` nearest rows.

        Args:
            inX: feature vector to classify.
            dataSet: 2-D array of training samples, one row per sample.
            labels: class label for each row of ``dataSet``.

        Distances are Euclidean.  On a tie in votes the label whose
        neighbour is closest wins (stable sort, votes added nearest-first).
        """
        diffMat = np.asarray(inX) - dataSet
        distances = ((diffMat ** 2).sum(axis=1)) ** 0.5
        sortedDistIndicies = distances.argsort()

        classCount = {}
        for i in range(self.k):
            voteIlabel = labels[sortedDistIndicies[i]]
            classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

        # Highest vote count first.
        sortedClassCount = sorted(classCount.items(),
                                  key=operator.itemgetter(1),
                                  reverse=True)
        return sortedClassCount[0][0]

    def datingClassTest(self, hoRatio):
        """Measure the hold-out error rate, print it and return it.

        The first ``int(m * hoRatio)`` rows of the normalised data are
        classified against all remaining rows.

        Args:
            hoRatio: fraction of the rows used as test samples (e.g. 0.10).
        """
        datingDataMat, datingLabels = self.file2matrix(self.filename)
        normMat, ranges, minVals = self.autoNorm(datingDataMat)
        m = normMat.shape[0]
        numTestVecs = int(m * hoRatio)
        errorCount = 0.0
        for i in range(numTestVecs):
            classifierResult = self.classify0(normMat[i, :],
                                              normMat[numTestVecs:m, :],
                                              datingLabels[numTestVecs:m])
            print("the classifier came back with: %d, the real answer is %d"
                  % (classifierResult, datingLabels[i]))
            if classifierResult != datingLabels[i]:
                errorCount += 1
        errorRate = errorCount / float(numTestVecs)
        print("the total error rate is: %f" % errorRate)
        return errorRate

    def classifyPerson(self):
        """Read three feature values from the console and print a prediction."""
        resultList = ["not at all", "in small doses", "in large doses"]
        # input() reads one line from the console.
        percentTats = float(input("percentage of time spent playing video games?"))
        ffMiles = float(input("frequent filer miles earned per year?"))
        iceCream = float(input("liters of ice cream consumed per year?"))
        datingDataMat, datingLabels = self.file2matrix(self.filename)
        normMat, ranges, minVals = self.autoNorm(datingDataMat)
        inArr = np.array([ffMiles, percentTats, iceCream])
        # Rescale the new sample with the training minimum and range.
        classifierResult = self.classify0((inArr - minVals) / ranges,
                                          normMat, datingLabels)
        # Labels are 1, 2, 3 while list indices start at 0, hence the -1.
        print("You will probably like this person:", resultList[classifierResult - 1])
if __name__ == '__main__':
    # Data file with three features and an integer label per line.
    path = 'datingTestSet2.txt'
    # 3 nearest neighbours take part in every vote.
    test1 = kNN(path, 3)
    # Report the error rate with the first 10% of the rows held out.
    test1.datingClassTest(0.1)
    # Then classify one sample entered on the console.
    test1.classifyPerson()
复制代码
- import numpy as np
- import operator
- import os
class kNN():
    """k-nearest-neighbours classifier for 32x32 text-encoded digit images.

    Image files live in the ``testDigits`` directory under ``path``.  The
    digit shown in a file is the part of its name before the first
    underscore.

    Attributes:
        path: directory that contains the ``testDigits`` folder.
    """

    def __init__(self, path):
        self.path = path

    @staticmethod
    def img2vector(filname):
        """Flatten a 32x32 text image into a ``(1, 1024)`` float array.

        The first 32 characters of each of the first 32 lines are read
        and each character is converted with ``int``.
        """
        returnVect = np.zeros((1, 1024))
        # The context manager closes the file; the original leaked it.
        with open(filname) as fr:
            for i in range(32):
                lineStr = fr.readline()
                for j in range(32):
                    returnVect[0, 32 * i + j] = int(lineStr[j])
        return returnVect

    @staticmethod
    def classify0(inX, dataSet, labels, k):
        """Return the majority label among the ``k`` rows nearest to ``inX``.

        Distances are Euclidean.  On a tie in votes the label whose
        neighbour is closest wins (stable sort, votes added nearest-first).
        """
        diffMat = np.asarray(inX) - dataSet
        distances = ((diffMat ** 2).sum(axis=1)) ** 0.5
        sortedDistIndicies = distances.argsort()

        classCount = {}
        for i in range(k):
            voteIlabel = labels[sortedDistIndicies[i]]
            classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

        # Highest vote count first.
        sortedClassCount = sorted(classCount.items(),
                                  key=operator.itemgetter(1),
                                  reverse=True)
        return sortedClassCount[0][0]

    @staticmethod
    def _classFromFileName(fileNameStr):
        """Return the integer before the first underscore of the file stem."""
        fileStr = fileNameStr.split('.')[0]
        return int(fileStr.split('_')[0])

    def handwritingClassTest(self):
        """Classify every image in ``testDigits`` and report the error rate.

        The same directory is used both as training data and as test
        data, as in the original listing.  The error rate is printed and
        returned.
        """
        digitsDir = os.path.join(self.path, "testDigits")

        # Build the training matrix, one flattened image per row.
        trainingFileList = os.listdir(digitsDir)
        m = len(trainingFileList)
        hwLabels = []
        trainingMat = np.zeros((m, 1024))
        for i, fileNameStr in enumerate(trainingFileList):
            hwLabels.append(self._classFromFileName(fileNameStr))
            trainingMat[i, :] = self.img2vector(os.path.join(digitsDir, fileNameStr))

        testFileList = os.listdir(digitsDir)
        errorCount = 0
        mTest = len(testFileList)
        for fileNameStr in testFileList:
            classNumStr = self._classFromFileName(fileNameStr)
            # The original built this path with "testDigits\%s": an invalid
            # escape that produced a literal backslash, so the file was
            # only found on Windows.
            vectorUnderTest = self.img2vector(os.path.join(digitsDir, fileNameStr))
            classifierResult = self.classify0(vectorUnderTest, trainingMat, hwLabels, 3)
            print("the classifier came back with:%d, the real answer if :%d" % (classifierResult, classNumStr))
            if classifierResult != classNumStr:
                errorCount += 1
        print("\nthe total numner of errors is: %d" % errorCount)
        errorRate = errorCount / float(mTest)
        print("\nthe total error rate is %f" % errorRate)
        return errorRate
if __name__ == '__main__':
    # Directory that holds the "testDigits" folder; note the trailing slash.
    path = "C:/.../machinelearninginaction/Ch02/digits/"
    digitClassifier = kNN(path)
    digitClassifier.handwritingClassTest()
复制代码 |
|