麻烦大神们帮我看看这个朴素贝叶斯分类垃圾邮件的程序哪里有问题，谢谢你们

meili-li · 发表于 2019-3-30 14:05:37

马上注册，结交更多好友，享用更多功能^_^

您需要登录才可以下载或查看，没有账号？立即注册

x

import os
def readtxt(path,encoding):
with open(path, 'r', encoding ) as f:
      lines = f.readlines()
return lines
def fileWalker(path):#遍历所有文件
fileArray = []
for root, dirs, files in os.walk(path):
      for fn in files:
         eachpath = str(root+'\\'+fn)
         fileArray.append(eachpath)
      return fileArray
def email_parser(email_path):#得到所有词的列表
punctuations = []
content_list = readtxt(email_path, 'utf8')
content = (' '.join(content_list)).replace('\r\n', ' ').replace('\t', ' ')
clean_word = []
for punctuation in punctuations:
      content = (' '.join(content.split(punctuation))).replace('  ', ' ')
      clean_word = [word.lower()
      for word in content.split(' ') if len(word) > 2]
      return clean_word
def get_word(email_file):
word_list = []
word_set = []
email_paths = fileWalker(email_file)
for email_file in email_paths:
      clean_word = email_parser(email_paths)
      word_list.append(clean_word)
      word_set.extend(clean_word)
return word_list, set(word_set)
def count_word_prob(email_list, union_set):
word_prob = {} #建立一个字典，统计每一个词的词频，如出现，计数。未出现，即为0.01
for word in union_set:
      counter = 0
      for email in email_list:
         if word in email:
            counter += 1  #在所有文件中出现的次数
         else:
            continue
      prob = 0.0
      if counter != 0:
         prob = counter/len(email_list)
      else:
         prob = 0.01
         word_prob[word] = prob
         return word_prob #大概意思是，谁谁，出现在邮件中的次数/所有邮件数
def filter(ham_word_pro, spam_word_pro, test_file):
         test_paths = fileWalker(test_file)
         for test_path in test_paths:
            email_spam_prob = 0.0
            spam_prob = 0.5
            ham_prob = 0.5
            file_name = test_path.split('\\')[-1]
            prob_dict = {}
            words = set(email_parser(test_path))  #当前测试集中某一邮件分词集合
            for word in words:
                  Psw = 0.0
                  if word not in spam_word_pro:
                     Psw = 0.4 #如果词语未出现在所有邮件中，则记为0.4
                  else:
                     Pws = spam_word_pro[word]#该词在垃圾邮件中的频率
                     Pwh = ham_word_pro[word] #该词在正常邮件中的频率
                     Psw = spam_prob*(Pws/(Pwh*ham_prob+Pws*spam_prob))#该词的贝叶斯概率
                     prob_dict[word] = Psw  #加入到字典中
                     numerator = 1
                     denominator_h = 1
                     for k, v in prob_dict.items():
                        numerator *= v
                        denominator_h *= (1-v)
                     email_spam_prob = round(numerator/(numerator+denominator_h), 4)
                     if email_spam_prob > 0.5:
                        print(file_name, 'spam', 'psw is',email_spam_prob)
                     else:
                        print(file_name, 'ham', 'psw is',email_spam_prob)
                        print(prob_dict)
                        print('----------------------------------------我是分界线---------------------------------')
def main():
ham_file = (r'D:\Program Files (x86)\python文件\贝叶斯垃圾邮件分类\email\ham')
spam_file = (r'D:\Program Files (x86)\python文件\贝叶斯垃圾邮件分类\email\spam')
test_file = (r'D:\Program Files (x86)\python文件\贝叶斯垃圾邮件分类\email\test')
ham_list, ham_set = get_word(ham_file)
spam_list, spam_set = get_word(spam_file)
union_set = ham_set | spam_set
ham_word_pro = count_word_prob(ham_list, union_set)
spam_word_pro = count_word_prob(spam_list, union_set)
print(ham_set)
filter(ham_word_pro, spam_word_pro, test_file)
if __name__ == '__main__':
main()

jackz007 · 发表于 2019-3-30 15:23:54

你应该说明程序用途，输入样例，预期输出，当前问题

meili-li · 发表于 2019-3-31 13:00:40

jackz007 发表于 2019-3-30 15:23
你应该说明程序用途，输入样例，预期输出，当前问题

一直显示错误，运行不了呀

塔利班 · 发表于 2019-3-31 13:05:09

你发错误出来，我们又没运行环境，就干瞪眼看累不累

账号		自动登录	找回密码
密码			立即注册