-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
100 lines (90 loc) · 4.4 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import re
import os
from sklearn.metrics import precision_score, f1_score, recall_score
import json
import numpy
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
# Re-merges words that the tokenizer split at hyphens, slashes and possessive ’s
def reconstruct_hyphenated_words(corpus):
    """Merge tokens that were split around "-", "/" or a possessive "’s".

    Walks the doc from left to right:

    * a "-" or "/" token with no trailing whitespace is merged with the
      token before it and the token after it;
    * a "’s" token whose preceding token has no trailing whitespace is
      merged with that preceding token.

    Args:
        corpus: Tokenized document; must support ``len()``, indexing, slicing
            and ``retokenize()`` (used like a spaCy ``Doc`` — confirm with
            callers), with tokens exposing ``text`` and ``whitespace_``.

    Returns:
        The same ``corpus`` object, retokenized in place.
    """
    i = 0
    while i < len(corpus):
        token = corpus[i]
        # Both merges need a token to the left. Without this guard, i - 1 is
        # -1 at the start of the doc and wraps around to the *last* token,
        # so the wrong (or an empty) span would be merged.
        has_previous = i > 0
        if has_previous and token.text in ("-", "/") and token.whitespace_ == "":
            # Joiner inside a word: merge left part, joiner and right part.
            with corpus.retokenize() as retokenizer:
                retokenizer.merge(corpus[i - 1:i + 2])
            # Do not advance: the merged token now sits at i - 1 and
            # corpus[i] is already the next unseen token.
        elif has_previous and token.text == "’s" and corpus[i - 1].whitespace_ == "":
            # Possessive glued to the previous word: merge the two tokens.
            with corpus.retokenize() as retokenizer:
                retokenizer.merge(corpus[i - 1:i + 1])
        else:
            i += 1
    return corpus
# Reconstructs noun chunks that correspond to keywords:
# merges each multi-word keyword given in the `keywords` parameter into a single token
def reconstruct_noun_chunks(corpus, keywords):
    """Merge runs of tokens that spell out a multi-word keyword.

    At each position the keywords are tried in the given order. The first
    keyword whose lower-cased text equals the texts of two or more
    consecutive tokens (joined with single spaces) is merged into one token.
    Token text is compared as-is, so a match requires the token text to be
    lower-case already.

    Args:
        corpus: Tokenized document; must support ``len()``, indexing, slicing
            and ``retokenize()`` (used like a spaCy ``Doc`` — confirm with
            callers).
        keywords: Iterable of keyword strings.

    Returns:
        The same ``corpus`` object, retokenized in place.
    """
    # Lower-case once instead of once per token position.
    lowered_keywords = [keyword.lower() for keyword in keywords]
    i = 0
    while i < len(corpus):
        end = _keyword_span_end(corpus, i, lowered_keywords)
        if end is None:
            i += 1
        else:
            with corpus.retokenize() as retokenizer:
                retokenizer.merge(corpus[i:end + 1])
            # Stay on i: the merged token may itself be the beginning of a
            # longer keyword and is examined again on the next pass.
    return corpus


def _keyword_span_end(corpus, start, lowered_keywords):
    """Return the last token index of the first keyword matching at ``start``.

    Returns ``None`` when no keyword spans at least two tokens from ``start``.
    """
    last_index = len(corpus) - 1
    for keyword in lowered_keywords:
        # Start every keyword from a clean state so a partial match against
        # one keyword cannot leak into the comparison with the next one.
        end = start
        candidate = corpus[start].text
        # Keep extending while the text so far can still grow into the
        # keyword; every prefix of a full match is a prefix of the keyword.
        while candidate != keyword and keyword.startswith(candidate) and end < last_index:
            end += 1
            candidate = candidate + " " + corpus[end].text
        # Single-token matches are left alone: there is nothing to merge.
        if end != start and candidate == keyword:
            return end
    return None
def clean_corpus(corpus):
    """Normalize raw text before tokenization.

    Steps, in order: lower-case the text; replace newlines with spaces;
    rewrite "<whitespace>-" as " - "; insert a space between a run of ASCII
    letters and an adjacent run of digits (in either order); surround each of
    the characters ``( ) ! , ; : . ? [ ] |`` with spaces; collapse runs of
    spaces into a single space.

    Args:
        corpus: Text to clean.

    Returns:
        The cleaned text. Leading or trailing single spaces are not stripped.
    """
    corpus = corpus.lower()
    corpus = corpus.replace("\n", " ")  # Removing \n
    # Raw strings keep the regex escapes (\s, \-, \., ...) from being parsed
    # as invalid string escape sequences.
    corpus = re.sub(r"(\s+\-)", r" - ", corpus)
    corpus = re.sub(r"([a-zA-Z]+)([0-9]+)", r"\1 \2", corpus)
    corpus = re.sub(r"([0-9]+)([a-zA-Z]+)", r"\1 \2", corpus)
    corpus = re.sub(r"([()!,;:\.\?\[\]\|])", r" \1 ", corpus)
    corpus = re.sub(r" +", " ", corpus)
    return corpus
# Creates dictionary of a set, associating sentence with label
def create_sent_label_dict(sents, labels):
    """Pair every sentence with the label found at the same position.

    Newlines inside a sentence are replaced by spaces and surrounding
    whitespace is stripped.

    Args:
        sents: Iterable of sentence strings.
        labels: Container indexed by position (``labels[0]``, ``labels[1]``, ...).

    Returns:
        A list of ``{"text": ..., "label": ...}`` dicts, one per sentence.
    """
    return [
        {"text": re.sub("\n", " ", sentence).strip(), "label": labels[position]}
        for position, sentence in enumerate(sents)
    ]
# For both classifiers
# WRITE OUTPUT STATISTICS FILE
def write_output_stats_file(path, name, ref_labels, pred_labels, labels):
    """Append precision, recall and F1 statistics for one data set to a file.

    Args:
        path: File to append to (opened in ``'a'`` mode).
        name: Name of the set, written as the section title.
        ref_labels: Reference labels.
        pred_labels: Predicted labels.
        labels: Label list passed to the per-class ("Individually") scores.
    """
    def averaged(metric, average):
        # Single averaged score, rounded to three decimals.
        return round(metric(ref_labels, pred_labels, average=average), 3)

    def per_class(metric):
        # One score per entry of `labels`, rounded to three decimals.
        return numpy.round(metric(ref_labels, pred_labels, average=None, labels=labels), 3)

    # (caption, thunk) pairs; each score is computed right before its line
    # is written, in this order.
    report = (
        ("Precision macro:", lambda: averaged(precision_score, "macro")),
        ("Precision Individually:", lambda: per_class(precision_score)),
        ("Recall macro:", lambda: averaged(recall_score, "macro")),
        ("Recall Individually:", lambda: per_class(recall_score)),
        ("F1 Score micro:", lambda: averaged(f1_score, "micro")),
        ("F1 Score macro:", lambda: averaged(f1_score, "macro")),
        ("F1 Score weighted:", lambda: averaged(f1_score, "weighted")),
        ("F1 Score Individually:", lambda: per_class(f1_score)),
    )

    with open(path, 'a') as out:
        print(name, "set:\n", file=out)  # Title
        for caption, compute in report:
            print(caption, compute(), file=out)
        print("\n", file=out)
# WRITE OUTPUT PREDICTIONS IN JSON FORMAT
def write_predictions_file(pred_dict, path):
    """Write predictions to ``path`` as indented JSON.

    Missing parent directories are created. The file is written as UTF-8
    because ``ensure_ascii=False`` emits non-ASCII characters verbatim, which
    a platform-default encoding may be unable to encode.

    Args:
        pred_dict: JSON-serializable object to write.
        path: Destination file; overwritten if it exists.
    """
    parent_dir = os.path.dirname(path)
    # A bare file name has an empty dirname, and os.makedirs("") raises.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, 'w', encoding="utf-8") as file:
        file.write(json.dumps(pred_dict, indent=4, ensure_ascii=False))
# Creates a confusion matrix
def create_confusion_matrix(refs, preds, normalize, path, labels, display_labels):
    """Plot a confusion matrix and save it to ``path``.

    Args:
        refs: Reference labels.
        preds: Predicted labels.
        normalize: Forwarded to ``ConfusionMatrixDisplay.from_predictions``.
        path: Destination passed to ``plt.savefig``.
        labels: Forwarded to ``from_predictions`` as ``labels``.
        display_labels: Forwarded to ``from_predictions`` as ``display_labels``.
    """
    display = ConfusionMatrixDisplay.from_predictions(
        refs, preds, normalize=normalize, labels=labels, display_labels=display_labels
    )
    try:
        # Rotate the x tick labels and leave room for them below the plot.
        plt.xticks(rotation=45, ha="right")
        plt.subplots_adjust(bottom=0.4)
        # obs.: either show or save the confusion matrix
        plt.savefig(path)
    finally:
        # Each call creates a new figure; close it so repeated calls do not
        # accumulate open figures in pyplot.
        plt.close(display.figure_)