鱼C论坛

 找回密码
 立即注册
查看: 1675|回复: 1

[技术交流] Python实现KMeans

[复制链接]
发表于 2020-11-17 14:48:04 | 显示全部楼层 |阅读模式

马上注册,结交更多好友,享用更多功能^_^

您需要 登录 才可以下载或查看,没有账号?立即注册

x
本帖最后由 糖逗 于 2020-11-17 15:44 编辑

参考书籍:《机器学习实战》



1.K均值聚类
import numpy as np

def loadDataSet(fileName):
    """Load a tab-separated numeric text file into a NumPy matrix.

    Every non-blank line is split on tab characters and each field is
    converted to ``float``.

    Args:
        fileName: Path of the text file to read.

    Returns:
        ``np.matrix`` with one row per non-blank line of the file.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    dataMat = []
    # The context manager closes the handle even when parsing raises.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing empty line) would otherwise
                # reach float("") and raise ValueError.
                continue
            dataMat.append([float(field) for field in stripped.split("\t")])
    # np.mat was removed in NumPy 2.0; np.asmatrix exists in every version.
    return np.asmatrix(dataMat)



def distance(vecA, vecB):
    """Return the Euclidean (L2) distance between vecA and vecB."""
    difference = vecA - vecB
    squaredTerms = np.power(difference, 2)
    return np.sqrt(np.sum(squaredTerms))


# Generate k random centroids inside the [min, max] range of each feature.
def randCent(dataSet, k):
    """Create k random centroids bounded by the data's per-feature range.

    Args:
        dataSet: 2-D matrix/array with one sample per row and one feature
            per column.
        k: Number of centroids to generate.

    Returns:
        ``np.matrix`` of shape (k, n_features); column j is drawn uniformly
        from [min(column j), max(column j)).

    Raises:
        ValueError: If dataSet has no rows (min/max of an empty column).
    """
    n = np.shape(dataSet)[1]
    # np.mat was removed in NumPy 2.0; np.asmatrix exists in every version.
    centroids = np.asmatrix(np.zeros((k, n)))
    for j in range(n):
        column = dataSet[:, j]
        # .min()/.max() return plain scalars, avoiding both a Python-level
        # iteration over the rows and float() on a 1x1 matrix (deprecated).
        minJ = float(column.min())
        rangeJ = float(column.max()) - minJ
        # Reference link kept from the original post:
        # https://blog.csdn.net/sinat_38944746/article/details/89140276
        centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1)
    return centroids


def kMeans(dataSet, k, distMeas = distance, createCent = randCent):
    """Cluster the rows of dataSet into k groups with Lloyd's k-means.

    Args:
        dataSet: 2-D matrix with one sample per row.
        k: Number of clusters.
        distMeas: Callable(centroidRow, sampleRow) returning a distance.
        createCent: Callable(dataSet, k) returning the initial (k, n) centroids.

    Returns:
        Tuple ``(centroids, clusterAssment)``. ``centroids`` is the object
        returned by ``createCent`` updated in place. ``clusterAssment`` is an
        (m, 2) matrix whose column 0 holds the index of the nearest centroid
        and column 1 the squared distance to it.
    """
    m = np.shape(dataSet)[0]
    # np.mat was removed in NumPy 2.0; np.asmatrix exists in every version.
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    # Start from an impossible label so the first pass always registers as a
    # change. With an initial label of 0, a first pass that puts every point
    # in cluster 0 would end the loop after a single centroid update.
    clusterAssment[:, 0] = -1
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):
            minDist = np.inf
            minIndex = -1
            for j in range(k):  # find the nearest centroid for sample i
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True  # this sample moved to another cluster
            clusterAssment[i, :] = minIndex, minDist ** 2
        for cent in range(k):  # recompute each centroid from its members
            ptsInClust = dataSet[np.nonzero(clusterAssment[:, 0] == cent)[0]]
            # np.mean of an empty selection is NaN; keep the previous
            # centroid when a cluster currently has no members.
            if np.shape(ptsInClust)[0] > 0:
                centroids[cent, :] = np.mean(ptsInClust, axis = 0)
    return centroids, clusterAssment
    

if __name__ == "__main__":
    # Demo run: load the sample points and cluster them into four groups.
    dataMat = loadDataSet(fileName = r"C:\...\testSet.txt")
    myCentroids, clustAssing = kMeans(dataSet = dataMat, k = 4)


2.二分K均值聚类
import numpy as np

def loadDataSet(fileName):
    """Load a tab-separated numeric text file into a NumPy matrix.

    Every non-blank line is split on tab characters and each field is
    converted to ``float``.

    Args:
        fileName: Path of the text file to read.

    Returns:
        ``np.matrix`` with one row per non-blank line of the file.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    dataMat = []
    # The context manager closes the handle even when parsing raises.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing empty line) would otherwise
                # reach float("") and raise ValueError.
                continue
            dataMat.append([float(field) for field in stripped.split("\t")])
    # np.mat was removed in NumPy 2.0; np.asmatrix exists in every version.
    return np.asmatrix(dataMat)



def distance(vecA, vecB):
    """Return the Euclidean (L2) distance between vecA and vecB."""
    difference = vecA - vecB
    squaredTerms = np.power(difference, 2)
    return np.sqrt(np.sum(squaredTerms))


# Generate k random centroids inside the [min, max] range of each feature.
def randCent(dataSet, k):
    """Create k random centroids bounded by the data's per-feature range.

    Args:
        dataSet: 2-D matrix/array with one sample per row and one feature
            per column.
        k: Number of centroids to generate.

    Returns:
        ``np.matrix`` of shape (k, n_features); column j is drawn uniformly
        from [min(column j), max(column j)).

    Raises:
        ValueError: If dataSet has no rows (min/max of an empty column).
    """
    n = np.shape(dataSet)[1]
    # np.mat was removed in NumPy 2.0; np.asmatrix exists in every version.
    centroids = np.asmatrix(np.zeros((k, n)))
    for j in range(n):
        column = dataSet[:, j]
        # .min()/.max() return plain scalars, avoiding both a Python-level
        # iteration over the rows and float() on a 1x1 matrix (deprecated).
        minJ = float(column.min())
        rangeJ = float(column.max()) - minJ
        # Reference link kept from the original post:
        # https://blog.csdn.net/sinat_38944746/article/details/89140276
        centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1)
    return centroids

def kMeans(dataSet, k, distMeas = distance, createCent = randCent):
    """Cluster the rows of dataSet into k groups with Lloyd's k-means.

    Args:
        dataSet: 2-D matrix with one sample per row.
        k: Number of clusters.
        distMeas: Callable(centroidRow, sampleRow) returning a distance.
        createCent: Callable(dataSet, k) returning the initial (k, n) centroids.

    Returns:
        Tuple ``(centroids, clusterAssment)``. ``centroids`` is the object
        returned by ``createCent`` updated in place. ``clusterAssment`` is an
        (m, 2) matrix whose column 0 holds the index of the nearest centroid
        and column 1 the squared distance to it.
    """
    m = np.shape(dataSet)[0]
    # np.mat was removed in NumPy 2.0; np.asmatrix exists in every version.
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    # Start from an impossible label so the first pass always registers as a
    # change. With an initial label of 0, a first pass that puts every point
    # in cluster 0 would end the loop after a single centroid update.
    clusterAssment[:, 0] = -1
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):
            minDist = np.inf
            minIndex = -1
            for j in range(k):  # find the nearest centroid for sample i
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True  # this sample moved to another cluster
            clusterAssment[i, :] = minIndex, minDist ** 2
        for cent in range(k):  # recompute each centroid from its members
            ptsInClust = dataSet[np.nonzero(clusterAssment[:, 0] == cent)[0]]
            # np.mean of an empty selection is NaN; keep the previous
            # centroid when a cluster currently has no members.
            if np.shape(ptsInClust)[0] > 0:
                centroids[cent, :] = np.mean(ptsInClust, axis = 0)
    return centroids, clusterAssment

def biKmeans(dataSet, k, distMeas = distance):
    """Cluster the rows of dataSet with bisecting k-means.

    Starts with one cluster containing every sample and repeatedly splits,
    with a 2-means run, the cluster whose split yields the lowest total SSE,
    until k clusters exist.

    Args:
        dataSet: 2-D ``np.matrix`` with one sample per row.
        k: Desired number of clusters.
        distMeas: Callable(centroidRow, sampleRow) returning a distance.

    Returns:
        Tuple ``(centroids, clusterAssment)``. ``centroids`` is a matrix with
        one row per cluster. ``clusterAssment`` is an (m, 2) matrix whose
        column 0 holds the cluster index of each sample and column 1 the
        squared distance to that cluster's centroid. Fewer than k centroids
        are returned when no remaining cluster has at least two samples.
    """
    m = np.shape(dataSet)[0]
    # np.mat was removed in NumPy 2.0; np.asmatrix exists in every version.
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    # Initially all samples form one cluster whose centroid is the mean of
    # every feature.
    centroid0 = np.mean(dataSet, axis = 0).tolist()[0]
    centList = [centroid0]
    for j in range(m):
        clusterAssment[j, 1] = distMeas(np.asmatrix(centroid0), dataSet[j, :]) ** 2
    while len(centList) < k:  # stop once k clusters exist
        lowestSSE = np.inf
        bestCentToSplit = None
        for i in range(len(centList)):
            ptsInCurrCluster = dataSet[np.nonzero(clusterAssment[:, 0] == i)[0], :]
            # A cluster with fewer than two samples cannot be bisected:
            # an empty one makes centroid creation fail and a singleton
            # yields an empty half.
            if np.shape(ptsInCurrCluster)[0] < 2:
                continue
            centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas)
            # SSE of cluster i after splitting it in two.
            sseSplit = float(np.sum(splitClustAss[:, 1]))
            # SSE of all the samples that are not in cluster i.
            sseNotSplit = float(
                np.sum(clusterAssment[np.nonzero(clusterAssment[:, 0] != i)[0], 1])
            )
            # The pre-split total SSE is the same for every candidate, so
            # minimising the post-split total is equivalent to maximising
            # the SSE reduction.
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        if bestCentToSplit is None:
            # No cluster can be split any further.
            break
        # Relabel the two halves: label 1 becomes a brand-new cluster index,
        # label 0 keeps the index of the cluster that was split. Label 1 must
        # be rewritten first so the second rewrite cannot collide with it.
        bestClustAss[np.nonzero(bestClustAss[:, 0] == 1)[0], 0] = len(centList)
        bestClustAss[np.nonzero(bestClustAss[:, 0] == 0)[0], 0] = bestCentToSplit
        centList[bestCentToSplit] = bestNewCents[0, :].tolist()[0]  # replace the old centroid
        centList.append(bestNewCents[1, :].tolist()[0])  # append the new centroid
        clusterAssment[np.nonzero(clusterAssment[:, 0] == bestCentToSplit)[0], :] = bestClustAss
    return np.asmatrix(centList), clusterAssment
    

if __name__ == "__main__":
    import matplotlib.pyplot as plt

    # Demo run: cluster the sample points into three groups and plot them.
    dataMat = loadDataSet(r"C:\...\testSet2.txt")
    myCentroids, clustAssing = biKmeans(dataMat, 3)
    # Samples as default scatter markers, centroids as red squares.
    plt.scatter(dataMat[:, 0].tolist(), dataMat[:, 1].tolist())
    plt.plot(myCentroids[:, 0].tolist(), myCentroids[:, 1].tolist(),'rs')
    # Without show() nothing is displayed when run as a plain script.
    plt.show()

本帖被以下淘专辑推荐:

想知道小甲鱼最近在做啥?请访问 -> ilovefishc.com
回复

使用道具 举报

 楼主| 发表于 2020-11-17 15:45:26 | 显示全部楼层
想知道小甲鱼最近在做啥?请访问 -> ilovefishc.com
回复

使用道具 举报

您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

小黑屋|手机版|Archiver|鱼C工作室 ( 粤ICP备18085999号-1 | 粤公网安备 44051102000585号)

GMT+8, 2024-11-22 15:27

Powered by Discuz! X3.4

© 2001-2023 Discuz! Team.

快速回复 返回顶部 返回列表