Python实现KMeans
本帖最后由 糖逗 于 2020-11-17 15:44 编辑
参考书籍:《机器学习实战》
1.K均值聚类
import numpy as np
def loadDataSet(fileName):
    """Load a tab-separated numeric text file into a matrix.

    Every non-blank line is stripped, split on tab characters, and each
    field is converted to ``float``.

    Args:
        fileName: Path of the text file to read.

    Returns:
        numpy.matrix with one row per non-blank line of the file.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
    """
    dataMat = []
    # The context manager closes the handle even when parsing raises.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Skip blank lines (e.g. a trailing newline) rather than
                # failing on float('').
                continue
            dataMat.append([float(field) for field in stripped.split("\t")])
    # np.asmatrix replaces the np.mat alias, which was removed in NumPy 2.0.
    return np.asmatrix(dataMat)
def distance(vecA, vecB):
    """Return the Euclidean distance between two vectors.

    Args:
        vecA: First vector (matrix row, array, or plain sequence of numbers).
        vecB: Second vector, with a shape compatible with ``vecA``.

    Returns:
        The square root of the sum of squared element-wise differences.
    """
    # np.asarray lets plain lists/tuples be passed in addition to matrices.
    diff = np.asarray(vecA, dtype=float) - np.asarray(vecB, dtype=float)
    return np.sqrt(np.sum(np.square(diff)))
# Randomly generate k centroids inside the min/max range of every feature.
def randCent(dataSet, k):
    """Create ``k`` random centroids within the bounding box of ``dataSet``.

    For each feature (column) the centroid coordinate is drawn uniformly
    from ``[min, max)`` of that column (``np.random.rand`` samples [0, 1)).

    Args:
        dataSet: 2-D data, one sample per row and one feature per column.
        k: Number of centroids to generate.

    Returns:
        numpy.matrix of shape ``(k, n)`` where ``n`` is the number of columns
        of ``dataSet``.
    """
    dataSet = np.asmatrix(dataSet)
    # The number of features is the column count of the data matrix.
    n = dataSet.shape[1]
    centroids = np.asmatrix(np.zeros((k, n)))
    for j in range(n):
        # .min()/.max() return scalars, avoiding float() on a 1x1 matrix.
        minJ = float(dataSet[:, j].min())
        rangeJ = float(dataSet[:, j].max()) - minJ
        # rand(k, 1) is in [0, 1); scale and shift it into the feature range.
        centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1)
    return centroids
def kMeans(dataSet, k, distMeas=distance, createCent=randCent):
    """Cluster ``dataSet`` into ``k`` groups with the K-means algorithm.

    Repeats two steps until no sample changes cluster: assign every sample
    to its nearest centroid, then move every centroid to the mean of the
    samples assigned to it.

    Args:
        dataSet: 2-D data, one sample per row.
        k: Number of clusters.
        distMeas: Callable ``(centroid_row, sample_row) -> distance``.
        createCent: Callable ``(dataSet, k) -> (k, n) matrix`` of initial
            centroids.

    Returns:
        Tuple ``(centroids, clusterAssment)``. ``centroids`` is the matrix
        returned by ``createCent`` updated in place. ``clusterAssment`` is an
        ``(m, 2)`` matrix holding, per sample, the index of its nearest
        centroid and the squared distance to it.
    """
    dataSet = np.asmatrix(dataSet)
    m = dataSet.shape[0]
    # Column 0: index of the nearest centroid; column 1: squared distance.
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    # Start from an impossible label so the first pass always registers as a
    # change; with a zero start, samples whose nearest centroid is 0 would
    # look "unchanged" and the loop could stop before converging.
    clusterAssment[:, 0] = -1
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):
            minDist = np.inf
            minIndex = -1
            # Find the centroid nearest to sample i.
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                # The nearest centroid of this sample changed.
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        # Recompute every centroid as the mean of its assigned samples.
        for cent in range(k):
            ptsInClust = dataSet[np.nonzero(clusterAssment[:, 0].A == cent)[0]]
            # An empty cluster keeps its previous centroid; the mean of zero
            # rows would otherwise turn it into NaN.
            if ptsInClust.shape[0] > 0:
                centroids[cent, :] = np.mean(ptsInClust, axis=0)
    return centroids, clusterAssment
if __name__ == "__main__":
    # Demo: load the sample data set and cluster it into 4 groups.
    dataMat = loadDataSet(r"C:\...\testSet.txt")
    myCentroids, clustAssing = kMeans(dataMat, 4)
2.二分K均值聚类
import numpy as np
def loadDataSet(fileName):
    """Load a tab-separated numeric text file into a matrix.

    Every non-blank line is stripped, split on tab characters, and each
    field is converted to ``float``.

    Args:
        fileName: Path of the text file to read.

    Returns:
        numpy.matrix with one row per non-blank line of the file.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
    """
    dataMat = []
    # The context manager closes the handle even when parsing raises.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Skip blank lines (e.g. a trailing newline) rather than
                # failing on float('').
                continue
            dataMat.append([float(field) for field in stripped.split("\t")])
    # np.asmatrix replaces the np.mat alias, which was removed in NumPy 2.0.
    return np.asmatrix(dataMat)
def distance(vecA, vecB):
    """Return the Euclidean distance between two vectors.

    Args:
        vecA: First vector (matrix row, array, or plain sequence of numbers).
        vecB: Second vector, with a shape compatible with ``vecA``.

    Returns:
        The square root of the sum of squared element-wise differences.
    """
    # np.asarray lets plain lists/tuples be passed in addition to matrices.
    diff = np.asarray(vecA, dtype=float) - np.asarray(vecB, dtype=float)
    return np.sqrt(np.sum(np.square(diff)))
# Randomly generate k centroids inside the min/max range of every feature.
def randCent(dataSet, k):
    """Create ``k`` random centroids within the bounding box of ``dataSet``.

    For each feature (column) the centroid coordinate is drawn uniformly
    from ``[min, max)`` of that column (``np.random.rand`` samples [0, 1)).

    Args:
        dataSet: 2-D data, one sample per row and one feature per column.
        k: Number of centroids to generate.

    Returns:
        numpy.matrix of shape ``(k, n)`` where ``n`` is the number of columns
        of ``dataSet``.
    """
    dataSet = np.asmatrix(dataSet)
    # The number of features is the column count of the data matrix.
    n = dataSet.shape[1]
    centroids = np.asmatrix(np.zeros((k, n)))
    for j in range(n):
        # .min()/.max() return scalars, avoiding float() on a 1x1 matrix.
        minJ = float(dataSet[:, j].min())
        rangeJ = float(dataSet[:, j].max()) - minJ
        # rand(k, 1) is in [0, 1); scale and shift it into the feature range.
        centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1)
    return centroids
def kMeans(dataSet, k, distMeas=distance, createCent=randCent):
    """Cluster ``dataSet`` into ``k`` groups with the K-means algorithm.

    Repeats two steps until no sample changes cluster: assign every sample
    to its nearest centroid, then move every centroid to the mean of the
    samples assigned to it.

    Args:
        dataSet: 2-D data, one sample per row.
        k: Number of clusters.
        distMeas: Callable ``(centroid_row, sample_row) -> distance``.
        createCent: Callable ``(dataSet, k) -> (k, n) matrix`` of initial
            centroids.

    Returns:
        Tuple ``(centroids, clusterAssment)``. ``centroids`` is the matrix
        returned by ``createCent`` updated in place. ``clusterAssment`` is an
        ``(m, 2)`` matrix holding, per sample, the index of its nearest
        centroid and the squared distance to it.
    """
    dataSet = np.asmatrix(dataSet)
    m = dataSet.shape[0]
    # Column 0: index of the nearest centroid; column 1: squared distance.
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    # Start from an impossible label so the first pass always registers as a
    # change; with a zero start, samples whose nearest centroid is 0 would
    # look "unchanged" and the loop could stop before converging.
    clusterAssment[:, 0] = -1
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):
            minDist = np.inf
            minIndex = -1
            # Find the centroid nearest to sample i.
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                # The nearest centroid of this sample changed.
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        # Recompute every centroid as the mean of its assigned samples.
        for cent in range(k):
            ptsInClust = dataSet[np.nonzero(clusterAssment[:, 0].A == cent)[0]]
            # An empty cluster keeps its previous centroid; the mean of zero
            # rows would otherwise turn it into NaN.
            if ptsInClust.shape[0] > 0:
                centroids[cent, :] = np.mean(ptsInClust, axis=0)
    return centroids, clusterAssment
def biKmeans(dataSet, k, distMeas=distance):
    """Cluster ``dataSet`` into ``k`` groups with bisecting K-means.

    Starts with a single cluster containing every sample and repeatedly
    splits one cluster in two (via ``kMeans`` with ``k=2``), each time
    choosing the split that yields the lowest total SSE, until ``k``
    clusters exist.

    Args:
        dataSet: 2-D data, one sample per row.
        k: Target number of clusters.
        distMeas: Callable ``(centroid_row, sample_row) -> distance``.

    Returns:
        Tuple ``(centroids, clusterAssment)``: a matrix with one centroid per
        row, and an ``(m, 2)`` matrix holding, per sample, its cluster index
        and the squared distance to that cluster's centroid.
    """
    dataSet = np.asmatrix(dataSet)
    m = dataSet.shape[0]
    # Column 0: cluster index (all 0 at the start); column 1: squared error.
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    # Initially all samples form one cluster whose centroid is the
    # per-feature mean; [0] unwraps the single row of the 1 x n result.
    centroid0 = np.mean(dataSet, axis=0).tolist()[0]
    centList = [centroid0]
    for j in range(m):
        clusterAssment[j, 1] = distMeas(np.asmatrix(centroid0), dataSet[j, :]) ** 2
    # Stop once k clusters exist.
    while len(centList) < k:
        lowestSSE = np.inf
        for i in range(len(centList)):
            ptsInCurrCluster = dataSet[np.nonzero(clusterAssment[:, 0].A == i)[0], :]
            if ptsInCurrCluster.shape[0] == 0:
                # A cluster without samples cannot be split; kMeans would
                # fail when computing feature ranges on empty data.
                continue
            centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas)
            # SSE of cluster i after splitting it in two.
            sseSplit = np.sum(splitClustAss[:, 1])
            # SSE of all samples that are not in cluster i.
            sseNotSplit = np.sum(
                clusterAssment[np.nonzero(clusterAssment[:, 0].A != i)[0], 1]
            )
            # The total SSE before splitting is the same for every candidate,
            # so minimising the total after the split is equivalent to
            # maximising the SSE reduction.
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        # Relabel the two halves: label 1 becomes a brand-new cluster index,
        # label 0 keeps the index of the cluster that was split. Label 1 is
        # rewritten first so that a split of cluster 1 does not collide.
        bestClustAss[np.nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(centList)
        bestClustAss[np.nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit
        # Replace the centroid of the split cluster with the first new one.
        centList[bestCentToSplit] = bestNewCents[0, :].tolist()[0]
        # Append the second new centroid.
        centList.append(bestNewCents[1, :].tolist()[0])
        # Write the new labels and errors back for the samples of the
        # cluster that was split.
        clusterAssment[
            np.nonzero(clusterAssment[:, 0].A == bestCentToSplit)[0], :
        ] = bestClustAss
    return np.asmatrix(centList), clusterAssment
if __name__ == "__main__":
    # Demo: bisecting K-means with 3 clusters, then plot samples and centroids.
    dataMat = loadDataSet(r"C:\...\testSet2.txt")
    myCentroids, clustAssing = biKmeans(dataMat, 3)
    import matplotlib.pyplot as plt
    # .A1 flattens a matrix column into a 1-D array for plotting.
    plt.scatter(dataMat[:, 0].A1, dataMat[:, 1].A1)
    # Centroids are drawn as red squares.
    plt.plot(myCentroids[:, 0].A1, myCentroids[:, 1].A1, 'rs')
    # Without show() a plain script run exits without displaying the figure.
    plt.show()
{:10_287:}
页:
[1]