-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnaive_bayes.py
147 lines (112 loc) · 4.46 KB
/
naive_bayes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import math
import os
import re
# Maps each word to the number of ham e-mails it was counted in
counter_ham = {}
# Maps each word to the number of spam e-mails it was counted in
counter_spam = {}
def readWords(ham_loc, spam_loc):
    """Rebuild the per-word document counts from the ham and spam e-mails.

    Every e-mail contributes at most 1 to a word's count, so after the call
    ``counter_ham[word]`` / ``counter_spam[word]`` hold the number of e-mails
    of that class in which ``word`` appears.

    Messages are split on single whitespace characters, so a run of
    whitespace yields an empty-string token that is counted like any other
    word. ``classify`` tokenises the same way, so the two stay consistent.

    Args:
        ham_loc: Directory holding the ham e-mails, as a string that is
            appended verbatim to ``os.getcwd()`` (e.g. ``"/train/ham/"``).
        spam_loc: Directory holding the spam e-mails, same convention.

    Returns:
        A ``(ham_total, spam_total)`` tuple: the number of entries found in
        the ham and in the spam directory.
    """
    ham_dir = os.getcwd() + ham_loc
    spam_dir = os.getcwd() + spam_loc

    # List both directories before touching the counters so that a missing
    # directory fails without leaving the counters half rebuilt.
    ham_files = os.listdir(ham_dir)
    spam_files = os.listdir(spam_dir)

    # The counters are module-level state. Start from scratch so the counts
    # always agree with the totals returned below, even when this function
    # is called more than once in the same process.
    counter_ham.clear()
    counter_spam.clear()

    _count_document_words(ham_dir, ham_files, counter_ham)
    _count_document_words(spam_dir, spam_files, counter_spam)
    return len(ham_files), len(spam_files)


def _count_document_words(directory, fnames, counter):
    """Add 1 to ``counter[word]`` for each distinct word of each listed file."""
    for fname in fnames:
        # os.path.join works whether or not ``directory`` ends in a separator.
        with open(os.path.join(directory, fname), "r") as handle:
            message = handle.read()
        # dict.fromkeys drops repeated words in linear time while keeping the
        # order in which they were first seen.
        for word in dict.fromkeys(re.split(r"\s", message)):
            counter[word] = counter.get(word, 0) + 1
def classify(filename, ham_total, spam_total):
    """Classify one e-mail as ham or spam with the Naive Bayes word counts.

    The score of each class is its prior times, for every token of the
    message, the smoothed fraction of that class's training e-mails that
    contain the token. Scores are accumulated as sums of logarithms: a plain
    product of many fractions underflows to 0.0 on long messages, which
    would make both scores tie and every long e-mail come out as ham.

    Args:
        filename: Path of the e-mail to classify.
        ham_total: Number of ham e-mails the counters were built from.
        spam_total: Number of spam e-mails the counters were built from.

    Returns:
        1 when ham is at least as likely as spam, otherwise 0.
    """
    with open(filename, "r") as opened_file:
        message = opened_file.read()
    contents = re.split(r"\s", message)

    # Add-one smoothing: every count below gets +1 and every total +2, so a
    # word never seen in a class still has a non-zero probability.
    ham_total += 2
    spam_total += 2

    # Class priors, computed from the smoothed totals.
    log_ham = math.log(ham_total / (spam_total + ham_total))
    log_spam = math.log(spam_total / (spam_total + ham_total))

    for word in contents:
        log_ham += math.log((counter_ham.get(word, 0) + 1) / ham_total)
        log_spam += math.log((counter_spam.get(word, 0) + 1) / spam_total)

    # Ties go to ham.
    return 1 if log_ham >= log_spam else 0
def init(ham_train, spam_train, test, validation):
    """Train on the given e-mails and score the classifier on the test set.

    Called from the analysis code. The true class of a test file is taken
    from its name: a name containing "spam" is spam, a name containing "ham"
    is ham. Spam is the positive class, so:

    * true positive  - spam file classified as spam (``classify`` returns 0)
    * false positive - ham file classified as spam
    * false negative - spam file classified as ham (``classify`` returns 1)

    All confusion counts come from the test set, the same set accuracy is
    measured on. Previously the counts mixed test and validation results and
    both classes, so precision and recall did not describe any single
    confusion matrix.

    Args:
        ham_train: Ham training directory, appended verbatim to
            ``os.getcwd()``.
        spam_train: Spam training directory, same convention.
        test: Test directory, same convention.
        validation: Validation directory. Kept for backward compatibility;
            none of the returned metrics is computed from it, so it is not
            read.

    Returns:
        ``(accuracy, error, precision, recall, f1)``, all on a 0-100 scale.
        A metric whose denominator is zero is reported as 0.0.
    """
    ham_total, spam_total = readWords(ham_train, spam_train)

    test_dir = os.getcwd() + test
    test_files = os.listdir(test_dir)
    test_size = len(test_files)

    correct_test = 0
    true_positive = 0
    false_positive = 0
    false_negative = 0
    for fname in test_files:
        res = classify(os.path.join(test_dir, fname), ham_total, spam_total)
        if "spam" in fname and res == 0:
            correct_test += 1
            true_positive += 1
        if "ham" in fname and res == 1:
            correct_test += 1
        if "spam" in fname and res == 1:
            false_negative += 1
        if "ham" in fname and res == 0:
            false_positive += 1

    # Calculate accuracy, error, precision, recall & F1; guard every division
    # so an empty test set or a class that is never predicted cannot crash.
    accuracy = correct_test / test_size * 100 if test_size else 0.0
    error = 100 - accuracy

    predicted_spam = true_positive + false_positive
    actual_spam = true_positive + false_negative
    precision = true_positive / predicted_spam * 100 if predicted_spam else 0.0
    recall = true_positive / actual_spam * 100 if actual_spam else 0.0

    if precision + recall:
        f1 = 2 * ((precision * recall) / (precision + recall))
    else:
        f1 = 0.0
    return accuracy, error, precision, recall, f1