|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
import os
import pandas as pd
from sklearn.model_selection import train_test_split
# Numeric class labels attached to each mail.
SPAM = 1
HAM = 0

# Value handed to train_test_split() as ``test_size``.
TEST_SIZE = 0.3

# (directory, label) pairs: every file in a directory receives that label.
FILE_PATHS = [
    ('./spam_2/', SPAM),
    ('./easy_ham/', HAM),
    ('./hard_ham/', HAM),
]

# File names that are skipped while walking the directories above.
SKIP_FILES = ['cmd']
def read_datas(file_paths=None, skip_files=None):
    """Read the mail files from disk and return them as a pandas DataFrame.

    Args:
        file_paths: Iterable of ``(directory, label)`` pairs to read.
            Defaults to the module-level ``FILE_PATHS``.
        skip_files: File names to ignore inside each directory.
            Defaults to the module-level ``SKIP_FILES``.

    Returns:
        A DataFrame with one row per file and two columns: ``data`` (the
        decoded text) and ``type`` (the label paired with the directory).
        Both columns exist even when no file was read.
    """
    if file_paths is None:
        file_paths = FILE_PATHS
    if skip_files is None:
        skip_files = SKIP_FILES

    result = []
    for path, mail_type in file_paths:
        # os.listdir() returns names in arbitrary order; sorting keeps the
        # row order (and so any seeded split of the frame) reproducible.
        for name in sorted(os.listdir(path)):
            if name in skip_files:
                continue
            # Read raw bytes and decode explicitly. bytes.decode() exists on
            # both Python 2 and 3, whereas decoding a text-mode line fails on
            # Python 3; latin-1 maps every byte, so decoding cannot raise.
            with open(os.path.join(path, name), 'rb') as mail:
                lines = [line.decode('latin-1') for line in mail]
            # Each line keeps its own terminator; the '\n' join is preserved
            # so the returned text is unchanged from the previous behaviour.
            result.append({'data': '\n'.join(lines), 'type': mail_type})
    # Naming the columns guarantees they are present for an empty result.
    return pd.DataFrame(result, columns=['data', 'type'])
def get_train_test_split():
    """Load the mails and divide them into a training and a test set.

    Returns:
        The result of ``train_test_split`` applied to the ``data`` and
        ``type`` columns of the frame from ``read_datas()``, using
        ``TEST_SIZE`` as the test fraction and a fixed ``random_state`` of 42.
    """
    frame = read_datas()
    texts = frame['data']
    labels = frame['type']
    return train_test_split(
        texts,
        labels,
        test_size=TEST_SIZE,
        random_state=42,
    )
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB #调用多项式贝叶斯模型
from sklearn.feature_extraction.text import CountVectorizer
def get_naive_bayes_model():
    """Build the unfitted text-classification pipeline.

    Returns:
        A ``Pipeline`` with two steps: ``'vectorizer'`` (a default
        ``CountVectorizer``) followed by ``'classifier'`` (a default
        ``MultinomialNB``).
    """
    steps = [
        ('vectorizer', CountVectorizer()),
        ('classifier', MultinomialNB()),
    ]
    return Pipeline(steps)
from data import get_train_test_split
from model import *
from sklearn.metrics import accuracy_score,confusion_matrix,recall_score, f1_score
from sklearn.model_selection import cross_val_score
# Train the naive Bayes pipeline, then report a confusion matrix together
# with cross-validation scores to give a fuller picture of model performance.
pipeline = get_naive_bayes_model()
X_train, X_test, y_train, y_test = get_train_test_split()
pipeline.fit(X_train, y_train)

# 5-fold cross-validation over the training split only.
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5)
print(cv_scores)

# Predictions use the same label values the pipeline was fitted on.
predicted = pipeline.predict(X_test)
print(confusion_matrix(y_test, predicted))

# print() with a single pre-formatted string emits the same text on Python 2
# and Python 3; the former print statements were a syntax error on Python 3.
print('accuracy_score %s' % accuracy_score(y_test, predicted))
print('f1_score %s' % f1_score(y_test, predicted))
print('recall_score %s' % recall_score(y_test, predicted))
|
|