鱼C论坛

 找回密码
 立即注册
查看: 2131|回复: 1

[技术交流] Python实现KMeans

[复制链接]
发表于 2020-11-17 14:48:04 | 显示全部楼层 |阅读模式

马上注册,结交更多好友,享用更多功能^_^

您需要 登录 才可以下载或查看,没有账号?立即注册

x
本帖最后由 糖逗 于 2020-11-17 15:44 编辑

参考书籍:《机器学习实战》



1.K均值聚类
  1. import numpy as np

  2. def loadDataSet(fileName):
  3.     dataMat = []
  4.     fr = open(fileName)
  5.     for line in fr.readlines():
  6.         curLine = line.strip().split("\t")
  7.         fltLine = list(map(float, curLine))
  8.         dataMat.append(fltLine)
  9.     return np.mat(dataMat)



  10. def distance(vecA, vecB):
  11.     return np.sqrt(np.sum(np.power(vecA - vecB, 2)))


  12. #根据每个特征的最大值最小值范围随机生成k个质心
  13. def randCent(dataSet, k):
  14.     n = np.shape(dataSet)[1]
  15.     centroids = np.mat(np.zeros((k, n)))
  16.     for j in range(n):
  17.         minJ = min(dataSet[:, j])
  18.         rangeJ = float(max(dataSet[:, j]) - minJ)
  19.         #https://blog.csdn.net/sinat_38944746/article/details/89140276
  20.         centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1)
  21.     return centroids


  22. def kMeans(dataSet, k, distMeas = distance, createCent = randCent):
  23.     m = np.shape(dataSet)[0]
  24.     clusterAssment = np.mat(np.zeros((m, 2)))#存储每个样本点最近质心的ID和距离
  25.     centroids = createCent(dataSet, k)
  26.     clusterChanged = True
  27.     while clusterChanged:
  28.         clusterChanged = False
  29.         for i in range(m):
  30.             minDist = np.inf
  31.             minIndex = -1
  32.             for j in range(k):#对于每个样本点找其最近的质心
  33.                 distJI = distMeas(centroids[j, :], dataSet[i, :])
  34.                 if distJI < minDist:
  35.                     minDist = distJI
  36.                     minIndex = j
  37.             if clusterAssment[i, 0] != minIndex:
  38.                 clusterChanged = True#该样本的最近质心改变了
  39.             clusterAssment[i, :] = minIndex, minDist ** 2
  40.         for cent in range(k):#重新计算质心
  41.             ptsInClust = dataSet[np.nonzero(clusterAssment[:, 0] == cent)[0]]
  42.             centroids[cent, :] = np.mean(ptsInClust, axis = 0)
  43.     return centroids, clusterAssment
  44.    

  45. if __name__ == "__main__":
  46.     dataMat = loadDataSet(r"C:\...\testSet.txt")
  47.     myCentroids, clustAssing = kMeans(dataMat, 4)
复制代码



2.二分K均值聚类
  1. import numpy as np

  2. def loadDataSet(fileName):
  3.     dataMat = []
  4.     fr = open(fileName)
  5.     for line in fr.readlines():
  6.         curLine = line.strip().split("\t")
  7.         fltLine = list(map(float, curLine))
  8.         dataMat.append(fltLine)
  9.     return np.mat(dataMat)



  10. def distance(vecA, vecB):
  11.     return np.sqrt(np.sum(np.power(vecA - vecB, 2)))


  12. #根据每个特征的最大值最小值范围随机生成k个质心
  13. def randCent(dataSet, k):
  14.     n = np.shape(dataSet)[1]
  15.     centroids = np.mat(np.zeros((k, n)))
  16.     for j in range(n):
  17.         minJ = min(dataSet[:, j])
  18.         rangeJ = float(max(dataSet[:, j]) - minJ)
  19.         #https://blog.csdn.net/sinat_38944746/article/details/89140276
  20.         centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1)
  21.     return centroids

  22. def kMeans(dataSet, k, distMeas = distance, createCent = randCent):
  23.     m = np.shape(dataSet)[0]
  24.     clusterAssment = np.mat(np.zeros((m, 2)))#存储每个样本点最近质心的ID和距离
  25.     centroids = createCent(dataSet, k)
  26.     clusterChanged = True
  27.     while clusterChanged:
  28.         clusterChanged = False
  29.         for i in range(m):
  30.             minDist = np.inf
  31.             minIndex = -1
  32.             for j in range(k):#对于每个样本点找其最近的质心
  33.                 distJI = distMeas(centroids[j, :], dataSet[i, :])
  34.                 if distJI < minDist:
  35.                     minDist = distJI
  36.                     minIndex = j
  37.             if clusterAssment[i, 0] != minIndex:
  38.                 clusterChanged = True#该样本的最近质心改变了
  39.             clusterAssment[i, :] = minIndex, minDist ** 2
  40.         for cent in range(k):#重新计算质心
  41.             ptsInClust = dataSet[np.nonzero(clusterAssment[:, 0] == cent)[0]]
  42.             centroids[cent, :] = np.mean(ptsInClust, axis = 0)
  43.     return centroids, clusterAssment

  44. def biKmeans(dataSet, k, distMeas = distance):
  45.     m = np.shape(dataSet)[0]
  46.     clusterAssment = np.mat(np.zeros((m, 2)))
  47.     centroid0 = np.mean(dataSet, axis = 0).tolist()[0]#一开始将虽有样本都聚成一个类,质心就是各特征的均值中心
  48.     centList =[centroid0]
  49.     for j in range(m):
  50.         clusterAssment[j,1] = distMeas(np.mat(centroid0), dataSet[j,:])**2
  51.     while (len(centList) < k):#当分类达到k类后停止
  52.         lowestSSE = np.inf
  53.         for i in range(len(centList)):
  54.             ptsInCurrCluster = dataSet[np.nonzero(clusterAssment[:,0] == i)[0],:]
  55.             centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas)
  56.             sseSplit = sum(splitClustAss[:,1])#将当前类进一步划分为两类后的sse
  57.             sseNotSplit = sum(clusterAssment[np.nonzero(clusterAssment[:,0] != i)[0],1])
  58.             #print("sseSplit, and notSplit: ",sseSplit,sseNotSplit)
  59.             if (sseSplit + sseNotSplit) < lowestSSE:#这里的SSE为什么是换分前和划分后的总sse,为什么不是差值看变化多少呢?
  60.                 bestCentToSplit = i
  61.                 bestNewCents = centroidMat
  62.                 bestClustAss = splitClustAss.copy()
  63.                 lowestSSE = sseSplit + sseNotSplit
  64.         bestClustAss[np.nonzero(bestClustAss[:, 0] == 1)[0],0] = len(centList)#给样本重新标记质心
  65.         bestClustAss[np.nonzero(bestClustAss[:, 0] == 0)[0],0] = bestCentToSplit
  66.         #print('the bestCentToSplit is: ', bestCentToSplit)
  67.         #print('the len of bestClustAss is: ', len(bestClustAss))
  68.         centList[bestCentToSplit] = bestNewCents[0, :].tolist()[0]#修改原质心
  69.         centList.append(bestNewCents[1, :].tolist()[0])#添加新置信
  70.         clusterAssment[np.nonzero(clusterAssment[:, 0] == bestCentToSplit)[0], :]= bestClustAss
  71.     return np.mat(centList), clusterAssment
  72.    

  73. if __name__ == "__main__":
  74.     dataMat = loadDataSet(r"C:\...\testSet2.txt")
  75.     myCentroids, clustAssing = biKmeans(dataMat, 3)
  76.     import matplotlib.pyplot as plt
  77.     plt.scatter(dataMat[:, 0].tolist(), dataMat[:, 1].tolist())
  78.     plt.plot(myCentroids[:, 0].tolist(), myCentroids[:, 1].tolist(),'rs')
复制代码


本帖被以下淘专辑推荐:

小甲鱼最新课程 -> https://ilovefishc.com
回复

使用道具 举报

 楼主| 发表于 2020-11-17 15:45:26 | 显示全部楼层
小甲鱼最新课程 -> https://ilovefishc.com
回复

使用道具 举报

您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

小黑屋|手机版|Archiver|鱼C工作室 ( 粤ICP备18085999号-1 | 粤公网安备 44051102000585号)

GMT+8, 2025-5-20 07:44

Powered by Discuz! X3.4

© 2001-2023 Discuz! Team.

快速回复 返回顶部 返回列表