Reference book: Machine Learning in Action (《机器学习实战》)
import numpy as np

def loadSimpData():
    # toy data set: five 2-D points with labels +1 / -1
    datMat = np.matrix([[1, 2.1], [2, 1.1], [1.3, 1], [1, 1], [2, 1]])
    classLabels = [1, 1, -1, -1, 1]
    return datMat, classLabels

def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    retArray = np.ones((np.shape(dataMatrix)[0], 1))
    if threshIneq == "lt":
        retArray[dataMatrix[:, dimen] <= threshVal] = -1
    else:
        retArray[dataMatrix[:, dimen] > threshVal] = -1
    return retArray
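# Note: a decision stump splits on a single feature. With 'lt', samples whose value
# is <= threshVal are labeled -1 and the rest +1; 'gt' flips the two sides, so both
# orientations of the one-feature split get tried in buildStump below.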
def buildStump(dataArr, classLabels, D):  # D is the vector of sample weights
    dataMatrix = np.mat(dataArr)
    labelMat = np.mat(classLabels).T
    m, n = np.shape(dataMatrix)
    numSteps = 10
    bestStump = {}
    bestClassEst = np.mat(np.zeros((m, 1)))
    minError = np.inf
    for i in range(n):  # loop over every feature
        rangeMin = dataMatrix[:, i].min()  # smallest value of this feature
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        for j in range(-1, int(numSteps) + 1):  # candidate thresholds stepped across the feature's range, to find the best split point
            for inequal in ['lt', 'gt']:
                threshVal = (rangeMin + float(j) * stepSize)
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                errArr = np.mat(np.ones((m, 1)))
                errArr[predictedVals == labelMat] = 0
                weightedError = D.T * errArr  # error weighted by the current sample weights D
                if weightedError < minError:
                    minError = weightedError
                    bestClassEst = predictedVals.copy()
                    bestStump["dim"] = i
                    bestStump["thresh"] = threshVal
                    bestStump["ineq"] = inequal
    return bestStump, minError, bestClassEst
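# Standard AdaBoost update rules, which adaBoostTrainDS below implements:
#   alpha_t    = 0.5 * ln((1 - eps_t) / eps_t)                   weight of the t-th weak classifier
#   D_{t+1}(i) = D_t(i) * exp(-alpha_t * y_i * h_t(x_i)) / Z_t   sample re-weighting (Z_t renormalizes D to sum to 1)
#   H(x)       = sign(sum_t alpha_t * h_t(x))                    final strong classifier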
def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    weakClassArr = []
    m = np.shape(dataArr)[0]
    D = np.mat(np.ones((m, 1)) / m)  # every sample starts with the same weight
    aggClassEst = np.mat(np.zeros((m, 1)))
    for i in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        print("D:", D.T)
        alpha = float(0.5 * np.log((1 - error) / max(error, 1e-16)))  # this classifier's weight alpha; 1e-16 guards against division by zero
        bestStump['alpha'] = alpha  # record alpha for this classifier (the best single-feature split on the current weighted data)
        weakClassArr.append(bestStump)
        print("classEst:", classEst.T)
        # on the difference between np.mat and np.matrix:
        # https://blog.csdn.net/qq_43212169/article/details/101679293
        expon = np.multiply(-1 * alpha * np.mat(classLabels).T, classEst)
        D = np.multiply(D, np.exp(expon))
        D = D / D.sum()
        aggClassEst += alpha * classEst  # running, weighted class estimate for every data point
        print("aggClassEst:", aggClassEst.T)
        arrErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabels).T,
                                np.ones((m, 1)))
        errorRate = arrErrors.sum() / m
        print("total error:", errorRate, "\n")
        if errorRate == 0:
            break
    return weakClassArr
# combine the weak classifiers to make the final classification
def adaClassify(dataToClass, classifierArr):
    dataMatrix = np.mat(dataToClass)
    m = np.shape(dataMatrix)[0]
    aggClassEst = np.mat(np.zeros((m, 1)))
    for i in range(len(classifierArr)):
        classEst = stumpClassify(dataMatrix, classifierArr[i]["dim"],
                                 classifierArr[i]["thresh"],
                                 classifierArr[i]["ineq"])
        aggClassEst += classifierArr[i]["alpha"] * classEst
        print(aggClassEst)
    return np.sign(aggClassEst)


datMat, classLabels = loadSimpData()
classifierArray = adaBoostTrainDS(datMat, classLabels)
adaClassify([0, 0], classifierArray)  # predict the class of the feature vector [0, 0]
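As a cross-check (a minimal sketch, not part of the original code, and it assumes scikit-learn is installed): sklearn's AdaBoostClassifier also boosts depth-1 decision stumps by default, so on the same toy data its prediction for [0, 0] should agree with the sign of aggClassEst returned by adaClassify above, even though the exact thresholds and alpha values may differ.

from sklearn.ensemble import AdaBoostClassifier
import numpy as np

X = np.array([[1, 2.1], [2, 1.1], [1.3, 1], [1, 1], [2, 1]])
y = np.array([1, 1, -1, -1, 1])

clf = AdaBoostClassifier(n_estimators=40).fit(X, y)
print(clf.predict([[0, 0]]))  # compare with adaClassify([0, 0], classifierArray) above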