江东的笔记

To overcome difficulties is victory

0%

朴素贝叶斯+拉普拉斯平滑代码实现-方法二

朴素贝叶斯有多种实现方式,这是另一种实现方式!

首先导入包:

1
2
3
4
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import scorer
import numpy as np

数据的读取:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22


# Training table: 17 rows, six categorical feature columns followed by the
# label column "好瓜" ("是" for the first 8 rows, "否" for the remaining 9).
datasets = pd.DataFrame(
    data=[
        ["青绿", "蜷缩", "浊响", "清晰", "凹陷", "硬滑", "是"],
        ["乌黑", "蜷缩", "沉闷", "清晰", "凹陷", "硬滑", "是"],
        ["乌黑", "蜷缩", "浊响", "清晰", "凹陷", "硬滑", "是"],
        ["青绿", "蜷缩", "沉闷", "清晰", "凹陷", "硬滑", "是"],
        ["浅白", "蜷缩", "浊响", "清晰", "凹陷", "硬滑", "是"],
        ["青绿", "稍蜷", "浊响", "清晰", "稍凹", "软粘", "是"],
        ["乌黑", "稍蜷", "浊响", "稍糊", "稍凹", "软粘", "是"],
        ["乌黑", "稍蜷", "浊响", "清晰", "稍凹", "硬滑", "是"],
        ["乌黑", "稍蜷", "沉闷", "稍糊", "稍凹", "硬滑", "否"],
        ["青绿", "硬挺", "清脆", "清晰", "平坦", "软粘", "否"],
        ["浅白", "硬挺", "清脆", "模糊", "平坦", "硬滑", "否"],
        ["浅白", "蜷缩", "浊响", "模糊", "平坦", "软粘", "否"],
        ["青绿", "稍蜷", "浊响", "稍糊", "凹陷", "硬滑", "否"],
        ["浅白", "稍蜷", "沉闷", "稍糊", "凹陷", "硬滑", "否"],
        ["乌黑", "稍蜷", "浊响", "清晰", "稍凹", "软粘", "否"],
        ["浅白", "蜷缩", "浊响", "模糊", "平坦", "硬滑", "否"],
        ["青绿", "蜷缩", "沉闷", "稍糊", "稍凹", "硬滑", "否"],
    ],
    columns=["色泽", "根蒂", "敲声", "纹理", "脐部", "触感", "好瓜"],
)


计算出好瓜的概率:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
def fit_fun(datasets):
    """Estimate Laplace-smoothed P(value | 好瓜 == "是") and the good-melon prior.

    Args:
        datasets: DataFrame whose last column is the label column "好瓜"
            (values "是" / "否"); every other column is treated as a
            categorical feature.

    Returns:
        A tuple ``(good_el_dic, P_dic, P_good)``:

        * ``good_el_dic`` maps each feature value to
          ``(count(value, "是") + 1) / (count("是") + n_values_in_column)``,
          rounded to 3 decimals.
        * ``P_dic`` is always an empty dict; it is kept only so existing
          callers that unpack three values keep working.
        * ``P_good`` is ``(count("是") + 1) / (n_rows + 2)`` rounded to
          2 decimals.

    Note:
        ``good_el_dic`` is keyed by the value alone, so a value string that
        occurs in two different columns would be overwritten by the later
        column.
    """
    good_el_dic = {}  # P(*|好瓜)
    P_dic = {}  # P(*), never filled; returned for interface compatibility

    # Loop invariants: the label mask and the number of good melons.
    is_good = datasets["好瓜"] == "是"
    good_total = int(is_good.sum())

    for column in datasets.columns.to_list()[0:-1]:
        values = datasets[column].unique()
        denominator = good_total + len(values)
        for value in values:
            # A value never seen with "是" simply has zero hits, which gives
            # the 1 / denominator Laplace floor without a separate branch.
            hits = int(((datasets[column] == value) & is_good).sum())
            good_el_dic[value] = round((hits + 1) / denominator, 3)

    P_good = round((good_total + 1) / (len(datasets) + 2), 2)
    print("P(*|好瓜):", good_el_dic)
    print("\nP:", P_good)
    return good_el_dic, P_dic, P_good
# Fit on the full table. P is the smoothed good-melon prior (rounded to
# 2 decimals); P_dic comes back empty because fit_fun never fills it.
good_el_dic, P_dic, P = fit_fun(datasets)

输出:

1
2
3
P(*|好瓜): {'青绿': 0.364, '乌黑': 0.455, '浅白': 0.182, '蜷缩': 0.545, '稍蜷': 0.364, '硬挺': 0.091, '浊响': 0.636, '沉闷': 0.273, '清脆': 0.091, '清晰': 0.727, '稍糊': 0.182, '模糊': 0.091, '凹陷': 0.545, '稍凹': 0.364, '平坦': 0.091, '硬滑': 0.7, '软粘': 0.3}

P: 0.47

计算坏瓜的概率:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
def fit_fun(datasets):
    """Estimate Laplace-smoothed P(value | 好瓜 == "否") and the bad-melon prior.

    Args:
        datasets: DataFrame whose last column is the label column "好瓜"
            (values "是" / "否"); every other column is treated as a
            categorical feature.

    Returns:
        A tuple ``(bad_el_dic, P_bad)``:

        * ``bad_el_dic`` maps each feature value to
          ``(count(value, "否") + 1) / (count("否") + n_values_in_column)``,
          rounded to 3 decimals.
        * ``P_bad`` is ``(count("否") + 1) / (n_rows + 2)`` rounded to
          3 decimals.

    Note:
        ``bad_el_dic`` is keyed by the value alone, so a value string that
        occurs in two different columns would be overwritten by the later
        column.
    """
    bad_el_dic = {}  # P(*|坏瓜)

    # Loop invariants: the label mask and the number of bad melons.
    is_bad = datasets["好瓜"] == "否"
    bad_total = int(is_bad.sum())

    for column in datasets.columns.to_list()[0:-1]:
        values = datasets[column].unique()
        denominator = bad_total + len(values)
        for value in values:
            # A value never seen with "否" has zero hits and gets the
            # 1 / denominator floor. The earlier version wrote this case to
            # the not-yet-assigned local P_bad and raised UnboundLocalError.
            hits = int(((datasets[column] == value) & is_bad).sum())
            bad_el_dic[value] = round((hits + 1) / denominator, 3)

    P_bad = round((bad_total + 1) / (len(datasets) + 2), 3)
    print("P(*|坏瓜):", bad_el_dic)
    print("\nP_bad:", P_bad)
    return bad_el_dic, P_bad
# Call the fit_fun defined just above to get the bad-melon likelihood table and prior.
bad_el_dic, P_bad = fit_fun(datasets)
# Complement of the good-melon prior P. The prediction code uses P_ as the
# bad-melon prior; since P was rounded to 2 decimals, P_ (0.53) is not
# exactly the P_bad printed by fit_fun (0.526).
P_ = 1-P

输出:

1
2
3
P(*|坏瓜): {'青绿': 0.333, '乌黑': 0.25, '浅白': 0.417, '蜷缩': 0.333, '稍蜷': 0.417, '硬挺': 0.25, '浊响': 0.417, '沉闷': 0.333, '清脆': 0.25, '清晰': 0.25, '稍糊': 0.417, '模糊': 0.333, '凹陷': 0.25, '稍凹': 0.333, '平坦': 0.417, '硬滑': 0.636, '软粘': 0.364}

P_bad: 0.526

单个数据的预测:

好瓜概率预测:

1
2
3
4
5
6
7
# Score one sample as a good melon: the prior P multiplied by P(value|好瓜)
# for each feature value. The result is an unnormalised score, meant to be
# compared against the bad-melon score rather than read as a probability.
test = ["乌黑", "稍蜷", "浊响", "清晰", "稍凹", "软粘"]
P_good = P
for value in test:
    P_good = P_good * good_el_dic[value]
print("预测为好瓜的概率:", P_good)
# With the P(*|好瓜) table and P = 0.47 printed earlier this is about 0.00393.

坏瓜概率预测:

1
2
3
4
5
6
# Score the same sample as a bad melon: the prior P_ multiplied by
# P(value|坏瓜) for each feature value. Note that this rebinds P_bad, which
# previously held the prior returned by fit_fun.
P_bad = P_
for value in test:
    P_bad = P_bad * bad_el_dic[value]
print("预测为坏瓜的概率:", P_bad)
# Output: 预测为坏瓜的概率: 0.0006981899836275

整体预测:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# Classify every row of the training table and compare with its label.
y_pre = []
for row_idx in range(len(datasets)):
    # Feature values of this row (everything except the trailing label).
    test = datasets.iloc[row_idx, :-1].to_list()
    P_g = P
    P_b = P_
    # Accumulate both class scores in a single pass over the values.
    for value in test:
        P_g = P_g * good_el_dic[value]
        P_b = P_b * bad_el_dic[value]
    # Ties go to "否", because only a strictly larger good score yields "是".
    if P_g > P_b:
        y_pre.append("是")
    else:
        y_pre.append("否")
y_test = datasets["好瓜"]
# Notebook-style display: element-wise True where the prediction matches.
y_test == pd.Series(y_pre)

输出:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
0      True
1 True
2 True
3 True
4 True
5 True
6 False
7 True
8 True
9 True
10 True
11 True
12 False
13 True
14 False
15 True
16 True
dtype: bool

如有大佬看出错误,请指正。