|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 糖逗 于 2020-11-17 15:44 编辑
参考书籍:《机器学习实战》
1.K均值聚类
- import numpy as np
def loadDataSet(fileName):
    """Load a tab-separated numeric text file into a NumPy matrix.

    Every non-blank line is treated as one sample whose tab-separated
    fields are parsed as floats.

    Args:
        fileName: Path of the text file to read.

    Returns:
        np.matrix with one row per non-blank line of the file.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    dataMat = []
    # The context manager closes the handle even when parsing raises.
    with open(fileName) as fr:
        for line in fr:  # stream the file instead of materialising readlines()
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no sample.
                continue
            dataMat.append([float(field) for field in stripped.split("\t")])
    # np.asmatrix is the spelling that exists in both NumPy 1.x and 2.x
    # (the np.mat alias was removed in NumPy 2.0).
    return np.asmatrix(dataMat)
def distance(vecA, vecB):
    """Return the Euclidean distance between two vectors.

    Args:
        vecA: First vector (list, tuple, ndarray or matrix row).
        vecB: Second vector, broadcast-compatible with vecA.

    Returns:
        The square root of the sum of squared element-wise differences,
        as a NumPy scalar.
    """
    # asarray lets plain Python sequences be subtracted; ndarray and matrix
    # inputs are viewed, not copied, and give the same result as before.
    diff = np.asarray(vecA) - np.asarray(vecB)
    return np.sqrt(np.sum(np.power(diff, 2)))
# Randomly generate k centroids inside the min/max range of every feature.
def randCent(dataSet, k):
    """Create k random centroids that lie within the bounds of the data.

    Args:
        dataSet: 2-D matrix/array with one sample per row and one feature
            per column.
        k: Number of centroids to generate.

    Returns:
        np.matrix of shape (k, n_features); column j holds values drawn
        uniformly between the minimum and maximum of feature j.
    """
    n = np.shape(dataSet)[1]
    centroids = np.asmatrix(np.zeros((k, n)))
    for j in range(n):
        column = dataSet[:, j]
        # The vectorised min()/max() return scalars directly, avoiding a
        # Python-level scan and the deprecated float(1x1 matrix) conversion.
        minJ = float(column.min())
        rangeJ = float(column.max()) - minJ
        # rand(k, 1) is uniform on [0, 1); scale and shift it into the
        # observed range of this feature.
        # Background: https://blog.csdn.net/sinat_38944746/article/details/89140276
        centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1)
    return centroids
def kMeans(dataSet, k, distMeas = distance, createCent = randCent):
    """Cluster the rows of dataSet into k groups with iterative k-means.

    Args:
        dataSet: 2-D matrix with one sample per row.
        k: Number of clusters.
        distMeas: Callable (vecA, vecB) -> distance between two row vectors.
        createCent: Callable (dataSet, k) -> (k, n_features) matrix of
            initial centroids.

    Returns:
        (centroids, clusterAssment) where centroids is the (k, n_features)
        matrix produced by createCent and refined in place, and
        clusterAssment is an (m, 2) matrix whose row i holds
        [index of the nearest centroid, squared distance to it].
    """
    m = np.shape(dataSet)[0]
    # Column 0: nearest-centroid id, column 1: squared distance to it.
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    # Start from an impossible label so the very first assignment always
    # counts as a change; otherwise a first pass that puts every sample in
    # cluster 0 would end the loop with distances measured to the initial
    # random centroid instead of the final one.
    clusterAssment[:, 0] = -1
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):
            minDist = np.inf
            minIndex = -1
            for j in range(k):  # find the centroid nearest to sample i
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True  # nearest centroid of this sample changed
            clusterAssment[i, :] = minIndex, minDist ** 2
        labels = np.asarray(clusterAssment[:, 0]).ravel()
        for cent in range(k):  # recompute every centroid from its members
            ptsInClust = dataSet[np.nonzero(labels == cent)[0]]
            # The mean of zero rows is NaN; keep the previous centroid when a
            # cluster received no samples.
            if ptsInClust.shape[0] > 0:
                centroids[cent, :] = np.mean(ptsInClust, axis = 0)
    return centroids, clusterAssment
-
if __name__ == "__main__":
    # Demo: load the tab-separated sample file and cluster it into 4 groups.
    dataMat = loadDataSet(r"C:\...\testSet.txt")
    # kMeans returns the final centroids and the per-sample assignment matrix.
    myCentroids, clustAssing = kMeans(dataMat, k = 4)
复制代码
2.二分K均值聚类
- import numpy as np
def loadDataSet(fileName):
    """Load a tab-separated numeric text file into a NumPy matrix.

    Every non-blank line is treated as one sample whose tab-separated
    fields are parsed as floats.

    Args:
        fileName: Path of the text file to read.

    Returns:
        np.matrix with one row per non-blank line of the file.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    dataMat = []
    # The context manager closes the handle even when parsing raises.
    with open(fileName) as fr:
        for line in fr:  # stream the file instead of materialising readlines()
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no sample.
                continue
            dataMat.append([float(field) for field in stripped.split("\t")])
    # np.asmatrix is the spelling that exists in both NumPy 1.x and 2.x
    # (the np.mat alias was removed in NumPy 2.0).
    return np.asmatrix(dataMat)
def distance(vecA, vecB):
    """Return the Euclidean distance between two vectors.

    Args:
        vecA: First vector (list, tuple, ndarray or matrix row).
        vecB: Second vector, broadcast-compatible with vecA.

    Returns:
        The square root of the sum of squared element-wise differences,
        as a NumPy scalar.
    """
    # asarray lets plain Python sequences be subtracted; ndarray and matrix
    # inputs are viewed, not copied, and give the same result as before.
    diff = np.asarray(vecA) - np.asarray(vecB)
    return np.sqrt(np.sum(np.power(diff, 2)))
# Randomly generate k centroids inside the min/max range of every feature.
def randCent(dataSet, k):
    """Create k random centroids that lie within the bounds of the data.

    Args:
        dataSet: 2-D matrix/array with one sample per row and one feature
            per column.
        k: Number of centroids to generate.

    Returns:
        np.matrix of shape (k, n_features); column j holds values drawn
        uniformly between the minimum and maximum of feature j.
    """
    n = np.shape(dataSet)[1]
    centroids = np.asmatrix(np.zeros((k, n)))
    for j in range(n):
        column = dataSet[:, j]
        # The vectorised min()/max() return scalars directly, avoiding a
        # Python-level scan and the deprecated float(1x1 matrix) conversion.
        minJ = float(column.min())
        rangeJ = float(column.max()) - minJ
        # rand(k, 1) is uniform on [0, 1); scale and shift it into the
        # observed range of this feature.
        # Background: https://blog.csdn.net/sinat_38944746/article/details/89140276
        centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1)
    return centroids
def kMeans(dataSet, k, distMeas = distance, createCent = randCent):
    """Cluster the rows of dataSet into k groups with iterative k-means.

    Args:
        dataSet: 2-D matrix with one sample per row.
        k: Number of clusters.
        distMeas: Callable (vecA, vecB) -> distance between two row vectors.
        createCent: Callable (dataSet, k) -> (k, n_features) matrix of
            initial centroids.

    Returns:
        (centroids, clusterAssment) where centroids is the (k, n_features)
        matrix produced by createCent and refined in place, and
        clusterAssment is an (m, 2) matrix whose row i holds
        [index of the nearest centroid, squared distance to it].
    """
    m = np.shape(dataSet)[0]
    # Column 0: nearest-centroid id, column 1: squared distance to it.
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    # Start from an impossible label so the very first assignment always
    # counts as a change; otherwise a first pass that puts every sample in
    # cluster 0 would end the loop with distances measured to the initial
    # random centroid instead of the final one.
    clusterAssment[:, 0] = -1
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):
            minDist = np.inf
            minIndex = -1
            for j in range(k):  # find the centroid nearest to sample i
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True  # nearest centroid of this sample changed
            clusterAssment[i, :] = minIndex, minDist ** 2
        labels = np.asarray(clusterAssment[:, 0]).ravel()
        for cent in range(k):  # recompute every centroid from its members
            ptsInClust = dataSet[np.nonzero(labels == cent)[0]]
            # The mean of zero rows is NaN; keep the previous centroid when a
            # cluster received no samples.
            if ptsInClust.shape[0] > 0:
                centroids[cent, :] = np.mean(ptsInClust, axis = 0)
    return centroids, clusterAssment
def biKmeans(dataSet, k, distMeas = distance):
    """Cluster the rows of dataSet with bisecting k-means.

    All samples start in one cluster. While fewer than k clusters exist,
    each current cluster is tentatively split in two with kMeans and the
    split giving the lowest total SSE is committed.

    Args:
        dataSet: 2-D matrix (or array-like convertible by np.asmatrix) with
            one sample per row.
        k: Desired number of clusters.
        distMeas: Callable (vecA, vecB) -> distance between two row vectors.

    Returns:
        (centroids, clusterAssment) where centroids is a matrix with one row
        per cluster and clusterAssment is an (m, 2) matrix whose row i holds
        [cluster index, squared distance to that cluster's centroid].
        Fewer than k centroids are returned when no remaining cluster has at
        least two samples to split.
    """
    dataSet = np.asmatrix(dataSet)
    m = np.shape(dataSet)[0]
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    # Initially every sample belongs to a single cluster whose centroid is
    # the per-feature mean of the whole data set.
    centroid0 = np.mean(dataSet, axis = 0).tolist()[0]
    centList = [centroid0]
    for j in range(m):
        clusterAssment[j, 1] = distMeas(np.asmatrix(centroid0), dataSet[j, :]) ** 2
    while len(centList) < k:  # stop once k clusters exist
        lowestSSE = np.inf
        bestCentToSplit = None
        labels = np.asarray(clusterAssment[:, 0]).ravel()
        for i in range(len(centList)):
            inCluster = labels == i
            ptsInCurrCluster = dataSet[np.nonzero(inCluster)[0], :]
            # A cluster with fewer than two samples cannot be bisected;
            # trying would yield an empty half with a NaN centroid.
            if ptsInCurrCluster.shape[0] < 2:
                continue
            centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas)
            # SSE of cluster i after splitting it in two.
            sseSplit = float(np.sum(splitClustAss[:, 1]))
            # SSE of all samples that are not part of cluster i.
            sseNotSplit = float(np.sum(clusterAssment[np.nonzero(~inCluster)[0], 1]))
            # The total SSE before any split is the same for every candidate,
            # so minimising the total SSE after the split is equivalent to
            # maximising the SSE reduction achieved by the split.
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        if bestCentToSplit is None:
            # No cluster can be split any further.
            break
        # Relabel the two halves: local label 1 becomes a brand-new cluster
        # id, local label 0 keeps the id of the cluster that was split.
        # Both index sets are taken before either assignment is made.
        splitLabels = np.asarray(bestClustAss[:, 0]).ravel()
        toNew = np.nonzero(splitLabels == 1)[0]
        toOld = np.nonzero(splitLabels == 0)[0]
        bestClustAss[toNew, 0] = len(centList)
        bestClustAss[toOld, 0] = bestCentToSplit
        # Replace the centroid of the split cluster and append the new one.
        centList[bestCentToSplit] = bestNewCents[0, :].tolist()[0]
        centList.append(bestNewCents[1, :].tolist()[0])
        # Write the refined assignments back for the samples of the split cluster.
        clusterAssment[np.nonzero(labels == bestCentToSplit)[0], :] = bestClustAss
    return np.asmatrix(centList), clusterAssment
-
if __name__ == "__main__":
    # Demo: load the tab-separated sample file and bisect it into 3 clusters.
    dataMat = loadDataSet(r"C:\...\testSet2.txt")
    myCentroids, clustAssing = biKmeans(dataMat, k = 3)
    import matplotlib.pyplot as plt
    # Samples are drawn with the default scatter style (first two features).
    plt.scatter(dataMat[:, 0].tolist(), dataMat[:, 1].tolist())
    # Final centroids are overlaid as red squares.
    plt.plot(myCentroids[:, 0].tolist(), myCentroids[:, 1].tolist(),'rs')
复制代码
|
|