import numpy as np

def loadSimpData():
    # Toy data set: five samples with two features each, plus their class labels.
    datMat = np.matrix([[1, 2.1], [2, 1.1], [1.3, 1], [1, 1], [2, 1]])
    classLabels = [1, 1, -1, -1, 1]
    return datMat, classLabels
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    # Classify every sample by thresholding a single feature:
    # 'lt' marks values <= threshVal as -1, 'gt' marks values > threshVal as -1.
    retArray = np.ones((np.shape(dataMatrix)[0], 1))
    if threshIneq == "lt":
        retArray[dataMatrix[:, dimen] <= threshVal] = -1
    else:
        retArray[dataMatrix[:, dimen] > threshVal] = -1
    return retArray
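
# A minimal sanity check for stumpClassify (an illustrative sketch, not part of the
# original script; the helper name _demo_stump_classify is hypothetical).
# On the toy data, a 'lt' stump on feature 0 with threshold 1.5 marks every sample
# whose feature-0 value is <= 1.5 as -1 and the rest as +1.
def _demo_stump_classify():
    datMat, classLabels = loadSimpData()
    preds = stumpClassify(np.mat(datMat), 0, 1.5, 'lt')
    print(preds.T)  # expected: [[-1.  1. -1. -1.  1.]]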
def buildStump(dataArr, classLabels, D):  # D is the vector of sample weights
    dataMatrix = np.mat(dataArr)
    labelMat = np.mat(classLabels).T
    m, n = np.shape(dataMatrix)
    numSteps = 10
    bestStump = {}
    bestClassEst = np.mat(np.zeros((m, 1)))
    minError = np.inf
    for i in range(n):  # loop over the features
        rangeMin = dataMatrix[:, i].min()  # smallest value of this feature
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        for j in range(-1, int(numSteps) + 1):  # sweep candidate thresholds across the feature's range to find the best split point
            for inequal in ['lt', 'gt']:
                threshVal = (rangeMin + float(j) * stepSize)
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                errArr = np.mat(np.ones((m, 1)))
                errArr[predictedVals == labelMat] = 0
                weightedError = D.T * errArr  # error rate under the current sample weights
                if weightedError < minError:
                    minError = weightedError
                    bestClassEst = predictedVals.copy()
                    bestStump["dim"] = i
                    bestStump["thresh"] = threshVal
                    bestStump["ineq"] = inequal
    return bestStump, minError, bestClassEst
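
# A quick illustration of buildStump (a sketch for reference only; the helper name
# _demo_build_stump is not in the original). With uniform weights D over the toy
# data, the search should settle on a 'lt' stump over feature 0 with threshold 1.3
# and a weighted error of 0.2, i.e. one of the five samples misclassified.
def _demo_build_stump():
    datMat, classLabels = loadSimpData()
    D = np.mat(np.ones((len(classLabels), 1)) / len(classLabels))  # uniform sample weights
    bestStump, minError, bestClassEst = buildStump(datMat, classLabels, D)
    print(bestStump, minError)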
def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    weakClassArr = []
    m = np.shape(dataArr)[0]
    D = np.mat(np.ones((m, 1)) / m)  # every sample starts with the same weight
    aggClassEst = np.mat(np.zeros((m, 1)))
    for i in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        print("D:", D.T)
        # alpha = 0.5 * ln((1 - error) / error) is the weight of this weak classifier;
        # max(error, 1e-16) guards against division by zero when the stump is perfect.
        alpha = float(0.5 * np.log((1 - error) / max(error, 1e-16)))
        bestStump['alpha'] = alpha  # record alpha for this classifier (the best single-feature split under the current weighting)
        weakClassArr.append(bestStump)
        print("classEst:", classEst.T)
        # On the difference between np.mat and np.matrix, see:
        # https://blog.csdn.net/qq_43212169/article/details/101679293
        # Weight update: D_i <- D_i * exp(-alpha * y_i * h(x_i)), then renormalize,
        # so misclassified samples gain weight and correctly classified ones lose it.
        expon = np.multiply(-1 * alpha * np.mat(classLabels).T, classEst)
        D = np.multiply(D, np.exp(expon))
        D = D / D.sum()
        aggClassEst += alpha * classEst  # running (cumulative) class estimate for every sample
        print("aggClassEst:", aggClassEst.T)
        arrErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabels).T,
                                np.ones((m, 1)))
        errorRate = arrErrors.sum() / m
        print("total error:", errorRate, "\n")
        if errorRate == 0:
            break
    return weakClassArr
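
# Worked example of the first boosting round on the toy data (a sketch for
# illustration only; the helper name is not in the original). The best stump
# misclassifies one of five equally weighted samples, so error = 0.2 and
# alpha = 0.5 * ln(0.8 / 0.2) ~= 0.6931; after the update and renormalization,
# the misclassified sample's weight grows from 0.2 to 0.5.
def _demo_first_round_alpha():
    error = 0.2
    alpha = 0.5 * np.log((1 - error) / error)
    print(alpha)  # ~= 0.6931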
# Combine the weak classifiers to make the final prediction
def adaClassify(dataToClass, classifierArr):
    dataMatrix = np.mat(dataToClass)
    m = np.shape(dataMatrix)[0]
    aggClassEst = np.mat(np.zeros((m, 1)))
    for i in range(len(classifierArr)):
        classEst = stumpClassify(dataMatrix, classifierArr[i]["dim"],
                                 classifierArr[i]["thresh"],
                                 classifierArr[i]["ineq"])
        aggClassEst += classifierArr[i]["alpha"] * classEst  # weighted vote of each stump
        print(aggClassEst)
    return np.sign(aggClassEst)
datMat, classLabels = loadSimpData()
classifierArray = adaBoostTrainDS(datMat, classLabels)
adaClassify([0, 0], classifierArray)  # predicted class for the feature vector [0, 0]
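
# An additional usage example (an illustrative sketch, not in the original script):
# classify two points in one call. On a typical run over this toy set the ensemble
# should assign +1 to [5, 5] and -1 to [0, 0], so the final printed signs should be
# [[1.], [-1.]].
print(adaClassify([[5, 5], [0, 0]], classifierArray))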