AC1966 发表于 2021-5-20 22:06:06

我有一段邮件过滤的代码,要怎么把它和邮件测试样本关联起来,或者让它联网获取样本?

import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Numeric class labels stored in the 'type' field of every loaded message.
HAM = 0
SPAM = 1

# Passed as test_size when the corpus is split into train and test parts.
TEST_SIZE = 0.3

# (directory, label) pairs: each directory is scanned for mail files and
# every file found in it receives the paired label.
FILE_PATHS = [
    ('./spam_2/', SPAM),
    ('./easy_ham/', HAM),
    ('./hard_ham/', HAM),
]

# Directory entries with these names are skipped by read_datas().
SKIP_FILES = ['cmd']

def read_datas():
    """Read every mail file listed in FILE_PATHS into a pandas DataFrame.

    Each directory in FILE_PATHS is scanned. Entries whose name appears in
    SKIP_FILES, and entries that are not regular files, are ignored. Files
    are visited in sorted name order so the row order (and therefore any
    seeded train/test split built on it) is reproducible between runs.

    Returns:
        A DataFrame with one row per mail and two columns: 'data' (the mail
        text decoded as latin-1) and 'type' (the label paired with the
        directory in FILE_PATHS). Both columns exist even when no mail
        file was found.
    """
    result = []
    for path, mail_type in FILE_PATHS:
        for name in sorted(os.listdir(path)):
            if name in SKIP_FILES:
                continue
            full_path = os.path.join(path, name)
            if not os.path.isfile(full_path):
                # Sub-directories cannot be opened as mail files.
                continue
            # Read raw bytes and decode once. Calling .decode() on lines of
            # a text-mode file raises AttributeError on Python 3, and
            # re-joining lines that still end in '\n' doubled every newline.
            # latin-1 maps every byte value, so decoding cannot fail.
            with open(full_path, 'rb') as mail:
                content = mail.read().decode('latin-1')
            result.append({'data': content, 'type': mail_type})
    # Naming the columns keeps result['data'] / result['type'] valid for an
    # empty corpus as well.
    return pd.DataFrame(result, columns=['data', 'type'])

def get_train_test_split():
    """Load the mail corpus and split it into training and test subsets.

    Returns:
        Whatever train_test_split returns for the 'data' column (inputs) and
        the 'type' column (labels) of read_datas(), called with
        test_size=TEST_SIZE and a fixed random_state of 42.
    """
    frame = read_datas()
    texts = frame['data']
    labels = frame['type']
    return train_test_split(
        texts,
        labels,
        test_size=TEST_SIZE,
        random_state=42,
    )




from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB       #调用多项式贝叶斯模型
from sklearn.feature_extraction.text import CountVectorizer

def get_naive_bayes_model():
    """Build an unfitted pipeline: CountVectorizer followed by MultinomialNB.

    Returns:
        A Pipeline with the steps 'vectorizer' and 'classifier', in that
        order, each constructed with default arguments.
    """
    steps = [
        ('vectorizer', CountVectorizer()),
        ('classifier', MultinomialNB()),
    ]
    return Pipeline(steps)





from data import get_train_test_split
from model import *
from sklearn.metrics import accuracy_score,confusion_matrix,recall_score, f1_score
from sklearn.model_selection import cross_val_score

# Train the naive Bayes pipeline, then report cross-validation scores and
# held-out metrics so the model is judged by more than a single number.
pipeline = get_naive_bayes_model()
X_train, X_test, y_train, y_test = get_train_test_split()
pipeline.fit(X_train, y_train)

# 5-fold cross-validation over the training portion only.
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5)
# Single-argument print(...) calls produce the same output on Python 2 and
# Python 3; the former print statements were a syntax error on Python 3.
print(cv_scores)

# Predictions use the same label values as y_train.
# NOTE(review): presumably the numeric 0/1 ham/spam labels, not strings -
# confirm against the data module.
predicted = pipeline.predict(X_test)
print(confusion_matrix(y_test, predicted))
print('%s %s' % ('accuracy_score', accuracy_score(y_test, predicted)))
print('%s %s' % ('f1_score', f1_score(y_test, predicted)))
print('%s %s' % ('recall_score', recall_score(y_test, predicted)))

nahongyan1997 发表于 2021-6-23 15:51:00

用 urllib 直接访问服务器网站
页: [1]
查看完整版本: 有个邮件过滤的代码,我要怎么把他和邮件测试样本链接,或者联网?