#coding:utf-8
import numpy as np
class NaiveBayes():
    """Bernoulli naive Bayes classifier for 0/1-labeled word-list documents.

    Documents are represented as set-of-words presence vectors. Laplace
    smoothing keeps every conditional probability non-zero, and all
    probabilities are combined in log space to avoid float underflow.
    """

    def __init__(self):
        pass

    def createVocabList(self, train_x):
        """Return the list of unique words across all training documents."""
        vocabSet = set()
        for wordList in train_x:
            vocabSet |= set(wordList)
        return list(vocabSet)

    def listOfWords2Vec(self, vocabList, wordList):
        """Convert a word list to a 0/1 presence vector over vocabList.

        Words not in the vocabulary are silently ignored.
        """
        wordsVec = [0] * len(vocabList)
        for word in wordList:
            if word in vocabList:
                wordsVec[vocabList.index(word)] = 1  # set-of-words model
                # wordsVec[vocabList.index(word)] += 1  # bag-of-words model
        return wordsVec

    def fit(self, train_x, train_y):
        """Estimate class priors and per-word conditional log-probabilities.

        Parameters
        ----------
        train_x : list of word lists, one per document
        train_y : list of 0/1 labels

        Returns
        -------
        (vocabList, param) where param = (p0, p1, p0Vec, p1Vec):
        the priors p(c0), p(c1) and vectors of log p(wi|c0), log p(wi|c1).
        """
        vocabList = self.createVocabList(train_x)
        trainMatrix = np.array([self.listOfWords2Vec(vocabList, wordList)
                                for wordList in train_x])
        trainLabel = np.array(train_y)
        numTrainDocs = len(trainMatrix)   # number of samples
        numWords = len(trainMatrix[0])    # number of features (vocab size)
        # Class priors p(c1), p(c0); labels are 0/1 so the mean is p(c1).
        p1 = sum(trainLabel) / float(numTrainDocs)
        p0 = 1 - p1
        # Laplace smoothing: start every word count at 1 so no conditional
        # probability is zero (which would wipe out the whole product).
        p0Num = np.ones(numWords)
        p1Num = np.ones(numWords)
        p0InAll = 2.0  # denominator starts at 2: one pseudo-count per class
        p1InAll = 2.0
        for i in range(numTrainDocs):
            if trainLabel[i] == 1:
                p1Num += trainMatrix[i]
                p1InAll += sum(trainMatrix[i])
            else:
                p0Num += trainMatrix[i]
                p0InAll += sum(trainMatrix[i])
        p0Vec = np.log(p0Num / p0InAll)  # log p(wi|c0)
        p1Vec = np.log(p1Num / p1InAll)  # log p(wi|c1)
        param = p0, p1, p0Vec, p1Vec
        return vocabList, param

    def predict(self, test_X, vocabList, param):
        """Classify each document in test_X; return an array of 0/1 labels."""
        p0, p1, p0Vec, p1Vec = param
        testMatrix = np.array([self.listOfWords2Vec(vocabList, wordList)
                               for wordList in test_X])
        predict_y = []
        for vec in testMatrix:
            # log[p(c) * prod p(wi|c)] = log p(c) + sum log p(wi|c)
            prob_y0 = sum(vec * p0Vec) + np.log(p0)
            prob_y1 = sum(vec * p1Vec) + np.log(p1)
            predict_y.append(1 if prob_y0 < prob_y1 else 0)
        return np.array(predict_y)

    def predict1(self, test_X, test_y, vocabList, param):
        """Classify test_X, print accuracy against test_y, return predictions."""
        predictLabel = self.predict(test_X, vocabList, param)
        testLabel = np.array(test_y)
        m = len(predictLabel)
        print('accuracy:', sum(testLabel == predictLabel) / float(m))
        return predictLabel
class LogisticRegression():  # binary 0/1 classifier
    """Binary logistic-regression classifier for word-list documents.

    Documents are converted to 0/1 presence vectors over the training
    vocabulary; weights are learned by full-batch gradient ascent on the
    log-likelihood.
    """

    def __init__(self):
        pass

    def createVocabList(self, train_x):
        """Return the list of unique words across all training documents."""
        vocabSet = set()
        for wordList in train_x:
            vocabSet |= set(wordList)
        return list(vocabSet)

    def listOfWords2Vec(self, vocabList, wordList):
        """Convert a word list to a 0/1 presence vector over vocabList.

        Words not in the vocabulary are silently ignored.
        """
        wordsVec = [0] * len(vocabList)
        for word in wordList:
            if word in vocabList:
                wordsVec[vocabList.index(word)] = 1  # set-of-words model
                # wordsVec[vocabList.index(word)] += 1  # bag-of-words model
        return wordsVec

    def sigmoid(self, inX):
        """Logistic function: map a real score into (0, 1)."""
        return 1.0 / (1 + np.exp(-inX))

    def fit(self, train_x, train_y, alpha=0.01, maxCycles=100):
        """Train weights with full-batch gradient ascent.

        Parameters
        ----------
        train_x : list of word lists, one per document
        train_y : list of 0/1 labels
        alpha : learning rate (step size)
        maxCycles : number of iterations

        Returns
        -------
        (vocabList, weigh) where weigh is an n*1 np.matrix of weights.
        """
        vocabList = self.createVocabList(train_x)
        trainMat = [self.listOfWords2Vec(vocabList, wordList)
                    for wordList in train_x]
        trainMatrix = np.matrix(trainMat)     # m*n
        trainLabel = np.matrix(train_y).T     # m*1
        m, n = trainMatrix.shape
        weigh = np.matrix(np.ones((n, 1)))    # n*1
        for _ in range(maxCycles):
            hx = self.sigmoid(trainMatrix * weigh)  # m*1 predicted p(y=1|x)
            error = trainLabel - hx                 # m*1 residual
            # Gradient-ascent step on the log-likelihood (NOTE: this is
            # ascent, not descent — the sign is folded into `error`).
            weigh += alpha * trainMatrix.T * error  # n*1
        return vocabList, weigh

    def predict(self, test_X, vocabList, weigh):
        """Classify each document in test_X; return an array of 0/1 labels."""
        testMat = [self.listOfWords2Vec(vocabList, wordList)
                   for wordList in test_X]
        testMatrix = np.matrix(testMat)
        m = testMatrix.shape[0]
        hx = self.sigmoid(testMatrix * weigh)  # m*1 probabilities of class 1
        # hx[i, 0] extracts the scalar; hx[i][0] on an np.matrix yields a
        # 1x1 matrix and only works by accident of truth-testing.
        predict_y = [1 if hx[i, 0] > 0.5 else 0 for i in range(m)]
        return np.array(predict_y)

    def predict1(self, test_X, test_y, vocabList, weigh):
        """Classify test_X, print accuracy against test_y, return predictions."""
        predictLabel = self.predict(test_X, vocabList, weigh)
        testLabel = np.array(test_y)
        m = len(predictLabel)
        print('accuracy:', sum(testLabel == predictLabel) / float(m))
        return predictLabel
def loadTrainDataSet():
    """Return a toy training corpus as (documents, labels); 0 = good, 1 = bad."""
    documents = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', ' and', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    labels = [0, 1, 0, 1, 0, 1]
    return documents, labels
def loadTestDataSet():
    """Return a toy evaluation corpus as (documents, labels); 0 = good, 1 = bad."""
    documents = [
        ['love', 'my', 'girl', 'friend'],
        ['stupid', 'garbage'],
        ['Haha', 'I', 'really', 'Love', 'You'],
        ['This', 'is', 'my', 'dog'],
    ]
    labels = [0, 1, 0, 0]
    return documents, labels
if __name__ == '__main__':
    # Demo: train and evaluate both classifiers on the toy corpus.
    train_X, train_y = loadTrainDataSet()
    test_X, test_y = loadTestDataSet()

    # Naive Bayes
    clf = NaiveBayes()
    vocabList, param = clf.fit(train_X, train_y)
    results = clf.predict(test_X, vocabList, param)
    print(results)
    results1 = clf.predict1(test_X, test_y, vocabList, param)
    print(results1)

    # Logistic regression
    clf = LogisticRegression()
    vocabList, weigh = clf.fit(train_X, train_y)
    results = clf.predict(test_X, vocabList, weigh)
    print(results)
    results1 = clf.predict1(test_X, test_y, vocabList, weigh)
    print(results1)
# Originally published at 宁哥的小站 (reposts should credit the source):
# "朴素贝叶斯与逻辑回归算法实现" — Naive Bayes and Logistic Regression implementations.