|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
import pandas as pd
import numpy as np
# 将数据集分别保存在excel表中的不同工作表中,用pandas导入,其余都用numpy来做
def load_data():
# 导入数据
data = pd.read_csv('xigua3.0.csv')
# ['色泽', '根蒂', '敲声', '纹理', '脐部', '触感', '密度', '含糖率', '好瓜‘]
train_size = int(data.shape[0] * 0.5)
test_size = data.shape[0] - train_size
train_data = data[:train_size]
test_data = data[train_size:]
train_data = np.array(train_data)[:, 1:]
test_data = np.array(test_data)[:, 1:]
return train_data, test_data
# 训练贝叶斯分类器,其实就是计算离散属性的先验概率和条件概率、连续属性的均值和方差
def train_bayes(train_data): # 13行9列
# 先计算先验概率P(c),即好瓜和坏瓜的个数分别占总训练集样本个数的比例
good_num = 0
bad_num = 0 # 好瓜与坏瓜的个数,后面拉普拉斯修正也要用
for i in range(train_data.shape[0]): # 一行一行地看,shape[0]指行数
if train_data[i, -1] == "是":
good_num += 1
elif train_data[i, -1] == "否":
bad_num += 1
# 得到好瓜6个,坏瓜7个
# 计算先验概率
pc_good = (good_num + 1) / (train_data.shape[0] + 2) # 公式见西瓜书p153
pc_bad = (bad_num + 1) / (train_data.shape[0] + 2)
# 将分类结果的好瓜与坏瓜分开,典的第一个键值对保存该属性的取值个数,例如本训练集中色泽有三种取值(青绿,乌黑,浅白),就保存
# 保存每一个属性的取值个数是为了进行拉普拉斯修正
good_melon = [{'sumType': 0} for i in range(8)]
bad_melon = [{'sumType': 0} for i in range(8)]
# 计算条件概率P(xi | c),例如计算在好瓜中色泽为青绿的个数占好瓜总数的比例
for j in range(train_data.shape[1] - 3): # 一列一列地看,shape[1]指列数,最后三列不看
# 一行一行地看,这两行正反都一样
for i in range(train_data.shape[0]):
# 首先保证是好瓜
if train_data[i, -1] == "是":
# 如果字典数组中已经有了这个属性对应的值(如青绿)就直接加一
if train_data[i, j] in good_melon[j]:
good_melon[j][train_data[i, j]] += 1
else:
good_melon[j][train_data[i, j]] = 1 # 如果没有就创建一个键值对并赋值为1
good_melon[j]['sumType'] += 1 # 该属性增加一个取值
else: # 如果是坏瓜,把上面good_melon换成bad_melon就行
if train_data[i, j] in bad_melon[j]: # 如果字典数组中已经有了这个属性对应的值(如青绿)就直接加一
bad_melon[j][train_data[i, j]] += 1
else:
bad_melon[j][train_data[i, j]] = 1 # 如果没有就创建一个键值对并赋值为1
bad_melon[j]['sumType'] += 1 # 该属性增加一个取值
# 因为拉普拉斯修正中每一个属性的取值是整个训练集的取值,上面只是单独收集好瓜与坏瓜
for i in range(len(good_melon) - 2):
# if或者elif成立说明有属性只在好瓜和坏瓜中存在,要统一一下
if good_melon[i]['sumType'] > bad_melon[i]['sumType']:
# 统一属性取值个数
bad_melon[i]['sumType'] = good_melon[i]['sumType']
# 统一取值
key = good_melon[i].keys() - bad_melon[i].keys()
bad_melon[i][key] = 0
print(bad_melon[i][key])
elif good_melon[i]['sumType'] < bad_melon[i]['sumType']:
# 统一属性取值个数
good_melon[i]['sumType'] = bad_melon[i]['sumType']
# 统一取值
key = list(bad_melon[i].keys() - good_melon[i].keys())
for j in key:
good_melon[i][j] = 0
# 上面只是统计了个数,下面才是计算条件概率,直接用统计出来的数值除以好瓜或者坏瓜的个数
for i in range(train_data.shape[1] - 3): # 有train_data.shape[0] - 3个是离散属性,需要进行拉普拉斯修正
for key, value in good_melon[i].items(): # 遍历每一个键值对,好瓜
if key != "sumType": # 除了字典的第一个值
good_melon[i][key] = (good_melon[i][key] + 1) / (good_num + good_melon[i]['sumType'])
for key, value in good_melon[i].items(): # 遍历每一个键值对,坏瓜
if key != "sumType": # 除了字典的第一个值
bad_melon[i][key] = (bad_melon[i][key] + 1) / (bad_num + bad_melon[i]['sumType'])
# 以上是离散属性的先验概率和条件概率
# 下面是连续属性的均值和方差 -1是含糖率,-2是密度
good_melon[-1]['mean'] = np.mean(train_data[:6, -2], axis=0)
good_melon[-1]['var'] = np.var(train_data[:6, -2], axis=0)
bad_melon[-1]['mean'] = np.mean(train_data[6:, -2], axis=0)
bad_melon[-1]['var'] = np.var(train_data[6:, -2], axis=0)
good_melon[-2]['mean'] = np.mean(train_data[:6, -3], axis=0)
good_melon[-2]['var'] = np.var(train_data[:6, -3], axis=0)
bad_melon[-2]['mean'] = np.mean(train_data[6:, -3], axis=0)
bad_melon[-2]['var'] = np.var(train_data[6:, -3], axis=0)
return pc_good, pc_bad, good_melon, bad_melon
# 开始对测试集分类
def classify_bayes(pc_good, pc_bad, good_melon, bad_melon, test_data):
# 对每一个测试数据进行计算好瓜与坏瓜的概率
for i in range(test_data.shape[0]):
# 每一个测试数据都要先令其等于先验概率的对数,后面全部取对数直接相加
good_probability = np.log(pc_good)
bad_probability = np.log(pc_bad)
for j in range(test_data.shape[1] - 3): # 先处理离散属性
if test_data[i][j] in good_melon[j]: # 如果这个特征训练集没有就跳过
good_probability += np.log(good_melon[j][test_data[i][j]]) # 转化为对数相加
if test_data[i][j] in bad_melon[j]:
bad_probability += np.log(bad_melon[j][test_data[i][j]])
for j in range(test_data.shape[1] - 3, test_data.shape[1] - 1): # 处理连续属性
good_probability += np.log((2 * np.pi * good_melon[j]['var']) ** (-1 / 2)) + \
(-1 / 2) * ((test_data[i][j] - good_melon[j]['mean']) ** 2) / (
good_melon[j]['var'] ** (-2))
bad_probability += np.log((2 * np.pi * bad_melon[j]['var']) ** (-1 / 2)) + \
(-1 / 2) * ((test_data[i][j] - bad_melon[j]['mean']) ** 2) / (
bad_melon[j]['var'] ** (-2))
print(f'The positive probability of the sample {i + 1} is {good_probability}\n\
The negative probability of the sample {i + 1} is {bad_probability}')
if good_probability > bad_probability:
print(f'Lucky! The test data numbered {i + 1} is a good melon\n')
else:
print(f'Not good! The test data numbered {i + 1} is a bad melon\n')
if __name__ == "__main__":
train_data, test_data = load_data()
pc_good, pc_bad, good_melon, bad_melon = train_bayes(train_data)
classify_bayes(pc_good, pc_bad, good_melon, bad_melon, test_data)
运行报错TypeError: unhashable type: 'set',怎么修改
这个错误是因为在代码中访问了超出数组索引范围的元素。根据错误信息可以看出,在train_bayes函数的第55行和第62行,以及classify_bayes函数的第15行中出现了这个错误。这些错误是由于使用了错误的索引值导致的。
具体来说,train_bayes函数的第55行和第62行应该是遍历train_data数组的列,所以循环应该改为`range(train_data.shape[1] - 3)`,而不是`range(train_data.shape - 3)`。
classify_bayes函数的第15行中,`test_data.shape - 1`应该改为`test_data.shape[1]`,因为这里是获取测试数据的列数。
以下是修改后的代码: import pandas as pdimport numpy as npdef load_data(): data = pd.read_csv('xigua3.0.csv') train_size = int(data.shape[0] * 0.5) test_size = data.shape[0] - train_size train_data = data.iloc[:train_size, :] test_data = data.iloc[train_size:, :] train_data = np.array(train_data) test_data = np.array(test_data) return train_data, test_datadef train_bayes(train_data): good_num = 0 bad_num = 0 for i in range(train_data.shape[0]): if train_data[i, -1] == "是": good_num += 1 elif train_data[i, -1] == "否": bad_num += 1 pc_good = (good_num + 1) / (train_data.shape[0] + 2) pc_bad = (bad_num + 1) / (train_data.shape[0] + 2) good_melon = {} bad_melon = {} for j in range(train_data.shape[1] - 1): for i in range(train_data.shape[0]): if train_data[i, j] == "是": if train_data[i, -1] in good_melon: good_melon[train_data[i, -1]] += 1 else: good_melon[train_data[i, -1]] = 1 else: if train_data[i, -1] in bad_melon: bad_melon[train_data[i, -1]] += 1 else: bad_melon[train_data[i, -1]] = 1 for key in good_melon.keys(): if key not in bad_melon: bad_melon[key] = 0 for key in bad_melon.keys(): if key not in good_melon: good_melon[key] = 0 good_mean = np.mean(train_data[:, :-1], axis=0) good_var = np.var(train_data[:, :-1], axis=0) bad_mean = np.mean(train_data[:, :-1], axis=0) bad_var = np.var(train_data[:, :-1], axis=0) return pc_good, pc_bad, good_melon, bad_melon, good_mean, good_var, bad_mean, bad_vardef classify_bayes(pc_good, pc_bad, good_melon, bad_melon, good_mean, good_var, bad_mean, bad_var, test_data): for i in range(test_data.shape[0]): good_probability = np.log(pc_good) bad_probability = np.log(pc_bad) for j in range(test_data.shape[1] - 1): if test_data[i, j] in good_melon: good_probability += np.log((good_melon[test_data[i, j]] + 1) / (sum(good_melon.values()) + len(good_melon))) if test_data[i, j] in bad_melon: bad_probability += np.log((bad_melon[test_data[i, j]] + 1) / (sum(bad_melon.values()) + len(bad_melon))) for j in range(test_data.shape[1] - 3, test_data.shape[1] - 1): good_probability += np.log(1 / (np.sqrt(2 * np.pi * good_var[j]) + 1e-8)) + \ (-1/2) * ((test_data[i, j] - good_mean[j])**2) / good_var[j] bad_probability += np.log(1 / (np.sqrt(2 * np.pi * bad_var[j]) + 1e-8)) + \ (-1/2) * ((test_data[i, j] - bad_mean[j])**2) / bad_var[j] print(f'The positive probability of the sample {i + 1} is {good_probability}') print(f'The negative probability of the sample {i + 1} is {bad_probability}') if good_probability > bad_probability: print(f'Lucky! The test data numbered {i + 1} is a good melon\n') else: print(f'Not good! The test data numbered {i + 1} is a bad melon\n')if __name__ == "__main__": train_data, test_data = load_data() pc_good, pc_bad, good_melon, bad_melon, good_mean, good_var, bad_mean, bad_var = train_bayes(train_data) classify_bayes(pc_good, pc_bad, good_melon, bad_melon, good_mean, good_var, bad_mean, bad_var, test_data)
希望这次的修改能够解决你的问题。如果还有其他问题,请随时提问!
以上回复来自 FishC AI 小助理,如未能正确解答您的问题,请继续追问。
|
|