This repository has been archived by the owner on Jan 8, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
utils.py
executable file
·188 lines (137 loc) · 5.69 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
#!/usr/bin/env python3
"""
General utils
"""
from os.path import exists
from shutil import copyfile
from subprocess import run
from urllib.request import urlretrieve
from urllib.error import HTTPError
from zipfile import ZipFile
from keras import backend as kerasbackend
from collections import Counter
def simplify_ratio(list_a, list_b):
    """
    Reduce two numbers to their (approximate) simplest integer ratio.

    Both values are divided by the smaller of the two and rounded, so the
    result is an integer tuple with the smaller component first.

    :param list_a: first quantity
    :param list_b: second quantity
    :return: tuple (smaller, larger) of the rounded ratio parts
    """
    smallest = min(list_a, list_b)
    scaled_a = round(list_a / smallest)
    scaled_b = round(list_b / smallest)
    return (min(scaled_a, scaled_b), max(scaled_a, scaled_b))
def download_vuamc_xml(url='http://ota.ahds.ac.uk/text/2541.zip'):
    """
    Downloads and extracts the original VUAMC.zip if necessary.
    http://ota.ahds.ac.uk/headers/2541.xml

    Skips work already done: returns immediately if the XML is extracted,
    and skips the download if the zip archive is already present.

    :param url: source URL for the zipped corpus
    """
    zipped_vuamc_file = 'starterkits/2541.zip'
    unzipped_vuamc_file = 'starterkits/2541/VUAMC.xml'
    if exists(unzipped_vuamc_file):
        return
    if not exists(zipped_vuamc_file):
        try:
            print('Downloading {url}'.format(url=url))
            urlretrieve(url, zipped_vuamc_file)
        except HTTPError:
            # Best-effort: report and bail out rather than crash the pipeline.
            print('Could not download VUAMC.zip')
            return
    # Context manager guarantees the archive handle is closed even if
    # extractall() raises (the original leaked the handle in that case).
    with ZipFile(zipped_vuamc_file, 'r') as zipped_vuamc:
        zipped_vuamc.extractall('starterkits/')
    print('Successfully extracted {url}'.format(url=url))
def generate_vuamc_csv():
    """
    Generates the CSV files used in the Shared Task, using the scripts provided by NAACL
    https://github.com/EducationalTestingService/metaphor/tree/master/NAACL-FLP-shared-task

    Each (csv, parser script) pair is processed identically: run the parser
    inside starterkits/, copy the result into source/, and report success.
    Existing files in source/ are left untouched.
    """
    tasks = (
        ('vuamc_corpus_test.csv', 'vua_xml_parser_test.py'),
        ('vuamc_corpus_train.csv', 'vua_xml_parser.py'),
    )
    for csv_name, parser_script in tasks:
        target = 'source/{}'.format(csv_name)
        if exists(target):
            continue
        run(['python3', parser_script], cwd='starterkits')
        copyfile('starterkits/{}'.format(csv_name), target)
        print('Successfully generated {}'.format(csv_name))
def recall(y_true, y_pred):
    """
    Only computes a batch-wise average of recall.
    Computes the recall, a metric for multi-label classification of
    how many relevant items are selected.
    """
    # Round clipped products so soft predictions count as hard hits.
    hit_mask = kerasbackend.round(kerasbackend.clip(y_true * y_pred, 0, 1))
    relevant_mask = kerasbackend.round(kerasbackend.clip(y_true, 0, 1))
    true_positives = kerasbackend.sum(hit_mask)
    possible_positives = kerasbackend.sum(relevant_mask)
    # epsilon() keeps the division safe when there are no positives at all.
    return true_positives / (possible_positives + kerasbackend.epsilon())
def precision(y_true, y_pred):
    """
    Only computes a batch-wise average of precision.
    Computes the precision, a metric for multi-label classification of
    how many selected items are relevant.
    """
    # Round clipped products so soft predictions count as hard hits.
    hit_mask = kerasbackend.round(kerasbackend.clip(y_true * y_pred, 0, 1))
    selected_mask = kerasbackend.round(kerasbackend.clip(y_pred, 0, 1))
    true_positives = kerasbackend.sum(hit_mask)
    predicted_positives = kerasbackend.sum(selected_mask)
    # epsilon() keeps the division safe when nothing was predicted positive.
    return true_positives / (predicted_positives + kerasbackend.epsilon())
def f1(y_true, y_pred):
    """
    Batch-wise F1 score (harmonic mean of precision and recall).

    Keras 2.0 doesn't ship the F1 Metric anymore.
    https://github.com/keras-team/keras/issues/6507

    :param y_true: ground-truth labels
    :param y_pred: predicted labels
    :return: F1 score tensor
    """
    prec = precision(y_true, y_pred)
    reca = recall(y_true, y_pred)
    # epsilon() in the denominator avoids a 0/0 -> NaN when both precision
    # and recall are zero (e.g. a batch with no positive predictions).
    return 2 * ((prec * reca) / (prec + reca + kerasbackend.epsilon()))
def get_class_weights(y, smooth_factor=0):
    """
    Returns the weights for each class based on the frequencies of the samples.

    Each class receives the ratio majority_count / class_count, optionally
    smoothed by adding smooth_factor * majority_count to every count first.

    :param y: list of true labels (the labels must be hashable)
    :param smooth_factor: factor that smooths extremely uneven weights
    :return: dictionary with the weight for each class
    """
    frequencies = Counter(y)
    if smooth_factor > 0:
        smoothing = max(frequencies.values()) * smooth_factor
        for label in frequencies:
            frequencies[label] += smoothing
    majority_count = max(frequencies.values())
    return {label: float(majority_count / count)
            for label, count in frequencies.items()}
def weighted_categorical_crossentropy(weights):
    """
    A weighted version of keras.objectives.categorical_crossentropy.
    https://github.com/keras-team/keras/issues/6261

    Variables:
        weights: numpy array of shape (C,) where C is the number of classes

    Usage:
        # Class one at 0.5, class 2 twice the normal weights, class 3 10x.
        weights = np.array([0.5,2,10])
        loss = weighted_categorical_crossentropy(weights)
        model.compile(loss=loss, optimizer='adam')
    """
    weights = kerasbackend.variable(weights)

    def loss(y_true, y_pred):
        """
        Calculate categorical_crossentropy including the weights
        """
        # Scale predictions so that the class probas of each sample sum to 1.
        normalized = y_pred / kerasbackend.sum(y_pred, axis=-1, keepdims=True)
        # Clip to prevent NaN's and Inf's from log(0).
        clipped = kerasbackend.clip(
            normalized, kerasbackend.epsilon(), 1 - kerasbackend.epsilon())
        # Per-class cross-entropy terms scaled by the class weights.
        weighted_terms = y_true * kerasbackend.log(clipped) * weights
        return -kerasbackend.sum(weighted_terms, -1)

    return loss
from csv import reader, DictReader, writer
import spacy
def create_postags(filename, outputfile):
    """
    Read a corpus CSV and write one POS-tag sequence per sentence.

    Each input row's sentence_txt is tagged with spaCy's small English model;
    the output CSV rows are [txt_id, sentence_id, space-joined POS tags].

    :param filename: input CSV with txt_id, sentence_id, sentence_txt columns
    :param outputfile: path of the CSV to write
    """
    nlp = spacy.load('en_core_web_sm')
    tagged_rows = []
    with open(filename) as infile:
        for entry in DictReader(infile, delimiter=',', quotechar='"'):
            pos_sequence = ' '.join(
                token.pos_ for token in nlp(entry['sentence_txt']))
            tagged_rows.append(
                [entry['txt_id'], entry['sentence_id'], pos_sequence])
    with open(outputfile, 'w', newline='') as outfile:
        writer(outfile, delimiter=',', quotechar='"').writerows(tagged_rows)