|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 糖逗 于 2020-11-7 08:54 编辑
参考书籍:《机器学习实战》
- import numpy as np
def createVocabList(dataSet):
    """Build the vocabulary: every distinct token occurring in ``dataSet``.

    Args:
        dataSet: Iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list containing each distinct token exactly once, in the order the
        tokens are first encountered.
    """
    # dict.fromkeys de-duplicates just like a set union would, but keeps
    # insertion order, so the vocabulary (and every vector indexed by it) is
    # reproducible between runs instead of depending on hash randomisation.
    return list(
        dict.fromkeys(token for document in dataSet for token in document)
    )
def setOfWords2Vec(vocabList, inputSet):
    """Encode ``inputSet`` as a 0/1 presence vector over ``vocabList``.

    Args:
        vocabList: Sequence of known (hashable) words; position ``i`` of the
            result corresponds to ``vocabList[i]``.
        inputSet: Iterable of words to encode.

    Returns:
        A list of ``len(vocabList)`` ints, 1 where the vocabulary word occurs
        in ``inputSet`` and 0 elsewhere. Words missing from the vocabulary
        are reported on stdout and otherwise ignored.
    """
    # Build the word -> index lookup once instead of scanning the list twice
    # (``in`` + ``index``) for every input word. setdefault keeps the first
    # position of a duplicated vocabulary entry, matching list.index().
    position = {}
    for idx, word in enumerate(vocabList):
        position.setdefault(word, idx)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = position.get(word)
        if idx is None:
            print("the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[idx] = 1
    return returnVec
def trainNB0(trainMatrix, trainCategory):
    """Estimate the parameters of a two-class naive Bayes model.

    Args:
        trainMatrix: 2-D array-like with one row per training document and
            one column per vocabulary word (word presence/count values).
        trainCategory: 1-D array-like with one label per row. Rows labelled
            ``1`` are counted as class 1; every other row as class 0.

    Returns:
        A tuple ``(p0Vect, p1Vect, pAbusive)``:
        ``p0Vect`` / ``p1Vect`` are arrays of per-word log probabilities for
        class 0 / class 1, and ``pAbusive`` is ``sum(trainCategory)`` divided
        by the number of training documents (the class-1 share for 0/1
        labels).

    Raises:
        ValueError: If ``trainMatrix`` is not a non-empty 2-D array or the
            number of labels differs from the number of rows.
    """
    matrix = np.asarray(trainMatrix)
    labels = np.asarray(trainCategory)
    if matrix.ndim != 2 or matrix.shape[0] == 0:
        raise ValueError(
            "trainMatrix must be a non-empty 2-D array of word vectors"
        )
    if labels.shape[0] != matrix.shape[0]:
        raise ValueError(
            "trainCategory must provide exactly one label per training row"
        )

    numTrainDocs = matrix.shape[0]
    pAbusive = labels.sum() / float(numTrainDocs)

    # Split the rows by class once; the per-class sums below replace the
    # explicit Python loop over documents.
    isClass1 = labels == 1
    class1Rows = matrix[isClass1]
    class0Rows = matrix[~isClass1]

    # Word counts start at 1 and denominators at 2 so that a word never seen
    # in a class does not get probability 0, which would zero out the whole
    # product of probabilities.
    p1Num = 1.0 + class1Rows.sum(axis=0)
    p0Num = 1.0 + class0Rows.sum(axis=0)
    p1Denom = 2 + class1Rows.sum()
    p0Denom = 2 + class0Rows.sum()

    # Work in log space to avoid floating-point underflow when many small
    # probabilities are combined.
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the more likely of two classes for one word vector.

    Args:
        vec2Classify: 1-D array-like word vector of the document to classify.
        p0Vec: Per-word log probabilities for class 0.
        p1Vec: Per-word log probabilities for class 1.
        pClass1: Prior probability of class 1; ``1 - pClass1`` is used as the
            prior of class 0.

    Returns:
        ``1`` if the class-1 log score is strictly greater than the class-0
        log score, otherwise ``0`` (ties go to class 0).
    """
    features = np.asarray(vec2Classify)
    # Log scores: the dot product sums the log probabilities of the words
    # present in the document, then the log prior is added.
    p1 = np.dot(features, p1Vec) + np.log(pClass1)
    p0 = np.dot(features, p0Vec) + np.log(1 - pClass1)
    return 1 if p1 > p0 else 0
-
def testingNB(testEntry, listOPosts, listClasses):
    """Train on the labelled posts and print the predicted class of ``testEntry``.

    Args:
        testEntry: List of words to classify.
        listOPosts: Training documents, each a list of words.
        listClasses: Class label for each training document.
    """
    # Vocabulary built from the training posts.
    vocabulary = createVocabList(listOPosts)
    # Each training post becomes one row of the 2-D training matrix.
    encodedPosts = [setOfWords2Vec(vocabulary, post) for post in listOPosts]
    class0LogProbs, class1LogProbs, class1Prior = trainNB0(
        np.array(encodedPosts), np.array(listClasses)
    )
    encodedEntry = np.array(setOfWords2Vec(vocabulary, testEntry))
    predicted = classifyNB(
        encodedEntry, class0LogProbs, class1LogProbs, class1Prior
    )
    print(testEntry, "classified as:", predicted)
-
-
if __name__ == "__main__":
    # Small demo corpus: each entry pairs a class label with a tokenised post.
    # NOTE(review): label 1 presumably marks abusive posts (cf. the name
    # ``pAbusive`` in trainNB0) -- confirm against the reference book.
    labelledPosts = [
        (0, ["my", "dog", "has", "flea", "problems", "help", "please"]),
        (1, ["maybe", "not", "take", "him", "to", "dog", "park", "stupid"]),
        (0, ["my", "dalmation", "is", "so", "cute", "I", "love", "him"]),
        (1, ["stop", "posting", "stupid", "worthless", "garbage"]),
        (0, ["mr", "licks", "ate", "my", "steak", "how", "to", "stop", "him"]),
        (1, ["quit", "buying", "worthless", "dog", "food", "stupid"]),
    ]
    postingList = [words for _, words in labelledPosts]
    classVec = [label for label, _ in labelledPosts]
    testEntry = ["love", "my", "dalmation"]
    testingNB(testEntry, postingList, classVec)
复制代码 |
|