糖逗 发表于 2020-11-17 14:48:04

Python实现KMeans

本帖最后由 糖逗 于 2020-11-17 15:44 编辑

参考书籍:《机器学习实战》



1.K均值聚类
import numpy as np

def loadDataSet(fileName):
    """Load a tab-separated numeric text file into a NumPy matrix.

    Every non-blank line is split on tab characters and each field is
    converted with ``float``.

    Args:
        fileName: Path of the text file to read.

    Returns:
        numpy.matrix with one row per non-blank line of the file.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    dataMat = []
    # The context manager closes the handle even when a line fails to parse.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # A blank (e.g. trailing) line would otherwise reach float("").
                continue
            dataMat.append([float(field) for field in stripped.split("\t")])
    # np.asmatrix instead of np.mat: the np.mat alias was removed in NumPy 2.0.
    return np.asmatrix(dataMat)



def distance(vecA, vecB):
    """Return the Euclidean distance between ``vecA`` and ``vecB``.

    The element-wise difference is squared, summed over all elements and
    the square root of that sum is returned.
    """
    delta = vecA - vecB
    squaredSum = np.sum(np.square(delta))
    return np.sqrt(squaredSum)


#根据每个特征的最大值最小值范围随机生成k个质心
def randCent(dataSet, k):
    """Generate ``k`` random centroids inside the bounding box of the data.

    For every column (feature) of ``dataSet`` the minimum and the value
    range are computed, and ``k`` values are drawn uniformly between the
    column minimum and maximum using ``np.random.rand``.

    Args:
        dataSet: 2-D data, one sample per row (converted with np.asmatrix).
        k: Number of centroids to generate.

    Returns:
        numpy.matrix of shape (k, number of columns of ``dataSet``).
    """
    dataSet = np.asmatrix(dataSet)
    # np.shape returns the (rows, columns) tuple; only the column count is
    # needed here (passing the whole tuple to np.zeros raises TypeError).
    n = np.shape(dataSet)[1]
    centroids = np.asmatrix(np.zeros((k, n)))
    for j in range(n):
        column = dataSet[:, j]
        minJ = float(column.min())
        rangeJ = float(column.max()) - minJ
        # rand(k, 1) yields a (k, 1) column of values in [0, 1), which is
        # scaled and shifted into this feature's observed range.
        centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1)
    return centroids


def kMeans(dataSet, k, distMeas = distance, createCent = randCent):
    """Cluster the rows of ``dataSet`` into ``k`` groups with k-means.

    Starting from the centroids returned by ``createCent``, every sample is
    assigned to its nearest centroid and every centroid is then moved to the
    mean of the samples assigned to it. This repeats until no sample changes
    cluster during a full pass.

    Args:
        dataSet: 2-D data, one sample per row (converted with np.asmatrix).
        k: Number of clusters.
        distMeas: Callable ``(centroidRow, sampleRow) -> distance``.
        createCent: Callable ``(dataSet, k) -> (k, n) matrix`` of initial
            centroids.

    Returns:
        Tuple ``(centroids, clusterAssment)``. ``centroids`` is the matrix
        returned by ``createCent`` updated in place. ``clusterAssment`` is an
        (m, 2) matrix whose column 0 holds the index of the nearest centroid
        and column 1 the squared distance to it.
    """
    dataSet = np.asmatrix(dataSet)
    m = np.shape(dataSet)[0]  # number of samples
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):
            # Find the centroid closest to sample i.
            minDist = np.inf
            minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                # This sample moved to another cluster: run one more pass.
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        for cent in range(k):
            # Recompute each centroid as the mean of its assigned samples.
            ptsInClust = dataSet[np.nonzero(clusterAssment[:, 0].A == cent)[0]]
            if ptsInClust.shape[0] > 0:
                centroids[cent, :] = np.mean(ptsInClust, axis = 0)
            # An empty cluster keeps its previous centroid; taking the mean
            # of zero rows would turn it into NaN.
    return centroids, clusterAssment
   

if __name__ == "__main__":
    # Demo entry point: runs only when the file is executed as a script.
    # The path is a placeholder and must point to a tab-separated data file.
    dataMat = loadDataSet(r"C:\...\testSet.txt")
    # Cluster the loaded samples into 4 groups; the results are only bound
    # to names here, nothing is printed or plotted.
    myCentroids, clustAssing = kMeans(dataMat, 4)


2.二分K均值聚类
import numpy as np

def loadDataSet(fileName):
    """Read a tab-separated file of numbers and return it as a matrix.

    Blank lines are ignored; every remaining line is split on tabs and each
    field is parsed with ``float``.

    Args:
        fileName: Path of the text file to read.

    Returns:
        numpy.matrix with one row per non-blank line of the file.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    rows = []
    # "with" guarantees the file is closed, including on a parse error.
    with open(fileName) as fr:
        for line in fr:
            text = line.strip()
            if text:
                rows.append([float(item) for item in text.split("\t")])
    # np.mat no longer exists in NumPy 2.0; np.asmatrix is the replacement.
    return np.asmatrix(rows)



def distance(vecA, vecB):
    """Compute the Euclidean distance between two vectors.

    Equivalent to the square root of the sum of squared element-wise
    differences of ``vecA`` and ``vecB``.
    """
    squaredDiffs = np.square(vecA - vecB)
    return np.sqrt(np.sum(squaredDiffs))


#根据每个特征的最大值最小值范围随机生成k个质心
def randCent(dataSet, k):
    """Create ``k`` random centroids within the per-feature range of the data.

    Each column of the result is drawn with ``np.random.rand`` and scaled so
    that the values fall between the minimum and maximum of the matching
    column of ``dataSet``.

    Args:
        dataSet: 2-D data, one sample per row (converted with np.asmatrix).
        k: Number of centroids to generate.

    Returns:
        numpy.matrix of shape (k, number of columns of ``dataSet``).
    """
    dataSet = np.asmatrix(dataSet)
    # Index [1] selects the column count; np.shape alone is a tuple and
    # cannot be used as a dimension for np.zeros or as a range() bound.
    n = np.shape(dataSet)[1]
    centroids = np.asmatrix(np.zeros((k, n)))
    for j in range(n):
        feature = dataSet[:, j]
        minJ = float(feature.min())
        rangeJ = float(feature.max()) - minJ
        # Scale the [0, 1) samples into this feature's observed range.
        centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1)
    return centroids

def kMeans(dataSet, k, distMeas = distance, createCent = randCent):
    """Partition the rows of ``dataSet`` into ``k`` clusters with k-means.

    The centroids from ``createCent`` are refined by alternating two steps
    until a full pass leaves every assignment unchanged: assign each sample
    to its nearest centroid, then move each centroid to the mean of its
    samples.

    Args:
        dataSet: 2-D data, one sample per row (converted with np.asmatrix).
        k: Number of clusters.
        distMeas: Callable ``(centroidRow, sampleRow) -> distance``.
        createCent: Callable ``(dataSet, k) -> (k, n) matrix`` of initial
            centroids.

    Returns:
        Tuple ``(centroids, clusterAssment)``. ``clusterAssment`` is an
        (m, 2) matrix: column 0 is the index of the nearest centroid,
        column 1 the squared distance to that centroid.
    """
    dataSet = np.asmatrix(dataSet)
    m = np.shape(dataSet)[0]  # number of samples
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):
            # Search for the centroid nearest to sample i.
            minDist = np.inf
            minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                # The nearest centroid of this sample changed.
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        for cent in range(k):
            # Move each centroid to the mean of the samples assigned to it.
            ptsInClust = dataSet[np.nonzero(clusterAssment[:, 0].A == cent)[0]]
            if ptsInClust.shape[0] > 0:
                centroids[cent, :] = np.mean(ptsInClust, axis = 0)
            # With no assigned samples the old centroid is kept, because the
            # mean of zero rows is NaN.
    return centroids, clusterAssment

def biKmeans(dataSet, k, distMeas = distance):
    """Cluster ``dataSet`` into ``k`` groups with bisecting k-means.

    All samples start in one cluster whose centroid is the column-wise mean.
    While fewer than ``k`` clusters exist, every current cluster is
    tentatively split in two with ``kMeans`` and the split giving the lowest
    total SSE (sum of squared distances) is kept.

    Args:
        dataSet: 2-D data, one sample per row (converted with np.asmatrix).
        k: Desired number of clusters.
        distMeas: Callable ``(vecA, vecB) -> distance``.

    Returns:
        Tuple ``(centroids, clusterAssment)``: a matrix with one centroid per
        row, and an (m, 2) matrix holding the cluster index (column 0) and
        the squared distance to that cluster's centroid (column 1). Fewer
        than ``k`` centroids are returned when no remaining cluster has at
        least two samples to split.
    """
    dataSet = np.asmatrix(dataSet)
    m = np.shape(dataSet)[0]
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    # Initially every sample belongs to cluster 0, centred on the feature
    # means; tolist() of the (1, n) mean matrix is nested, hence the [0].
    centroid0 = np.mean(dataSet, axis = 0).tolist()[0]
    centList = [centroid0]
    for j in range(m):
        clusterAssment[j, 1] = distMeas(np.asmatrix(centroid0), dataSet[j, :]) ** 2
    while len(centList) < k:  # stop once k clusters exist
        lowestSSE = np.inf
        bestCentToSplit = None
        for i in range(len(centList)):
            inCluster = clusterAssment[:, 0].A == i
            ptsInCurrCluster = dataSet[np.nonzero(inCluster)[0], :]
            if ptsInCurrCluster.shape[0] < 2:
                # Fewer than two samples cannot form two non-empty clusters.
                continue
            centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas)
            # SSE of cluster i after splitting it in two.
            sseSplit = np.sum(splitClustAss[:, 1])
            # SSE of all samples that are not in cluster i.
            sseNotSplit = np.sum(clusterAssment[np.nonzero(~inCluster)[0], 1])
            # Comparing the total SSE after the split is equivalent to
            # comparing the SSE reduction: the total before the split is the
            # same for every candidate cluster.
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        if bestCentToSplit is None:
            # No cluster could be split any further.
            break
        # Relabel the two halves. Label 1 must be rewritten first: otherwise
        # rows relabelled to bestCentToSplit == 1 would be caught by it.
        bestClustAss[np.nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(centList)
        bestClustAss[np.nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit
        # Replace the split centroid and append the newly created one.
        centList[bestCentToSplit] = bestNewCents[0, :].tolist()[0]
        centList.append(bestNewCents[1, :].tolist()[0])
        # Write the new assignments back into the rows of the split cluster.
        clusterAssment[np.nonzero(clusterAssment[:, 0].A == bestCentToSplit)[0], :] = bestClustAss
    return np.asmatrix(centList), clusterAssment
   

if __name__ == "__main__":
    # Demo entry point: cluster a sample file into 3 groups and plot it.
    import matplotlib.pyplot as plt

    # The path is a placeholder and must point to a tab-separated data file.
    dataMat = loadDataSet(r"C:\...\testSet2.txt")
    myCentroids, clustAssing = biKmeans(dataMat, 3)
    # Samples as a scatter plot, centroids as red squares ('rs').
    plt.scatter(dataMat[:, 0].tolist(), dataMat[:, 1].tolist())
    plt.plot(myCentroids[:, 0].tolist(), myCentroids[:, 1].tolist(),'rs')
    # Without show() the figure is never displayed when run as a script.
    plt.show()

糖逗 发表于 2020-11-17 15:45:26

{:10_287:}
页: [1]
查看完整版本: Python实现KMeans