# preprocess.py
import os, sys
#import time
#import requests, shutil
import json
import re
from pathlib import Path
import numpy as np
import pandas as pd
"""
path_data = "tg2019task/worldtree_corpus_textgraphs2019sharedtask_withgraphvis"
if not Path(path_data).exists():
# Download data
!git clone -q https://github.com/umanlp/tg2019task.git
!cd tg2019task/ && make dataset
# Run baseline tfidf (expected MAP: 0.054)
!cd {path_data} && python ../baseline_tfidf.py annotation/expl-tablestore-export-2017-08-25-230344/tables questions/ARC-Elementary+EXPL-Dev.tsv > predict.txt
!cd {path_data} && python ../evaluate.py --gold=questions/ARC-Elementary+EXPL-Dev.tsv predict.txt
"""
import nltk
nltk.download("stopwords")
import spacy
nlp = spacy.load("en_core_web_sm")
whitelist_words="""
most least much few
all nothing
full empty
once used
front back
below above
bottom top
down up
less more
part whole
see move
first one two
show
something
""".strip().lower().split()
# use make
# 'something' is a bit of a wild-card here...

def convert_texts(texts, remove_stop=True, remove_punct=True):
    def prepreprocess(arr):
        # Sometimes spacy doesn't handle punctuation well eg "work;life"
        # But completely removing all punctuation worsens score
        return [txt.replace(";", "; ") for txt in arr]
    stops = set(nltk.corpus.stopwords.words("english"))
    tokens, lemmas = [], []
    for doc in nlp.pipe(prepreprocess(texts), disable=["ner", "tagger", "parser"]):
        _tokens, _lemmas = [], []
        for token in doc:
            #print(token.text.lower())
            if not token.text.lower() in whitelist_words:  # These get waved through
                # if token.is_stop and remove_stop:
                #     continue
                if token.text in stops and remove_stop:
                    continue
                if token.is_punct and remove_punct:
                    continue
                if len(token.lemma_.strip()) == 0:
                    continue  # Kill spaces
            _tokens.append(token.text)
            #_lemmas.append(token.lemma_)
            _lemmas.append(token.lemma_.strip().lower())
        tokens.append(_tokens)
        lemmas.append(_lemmas)
    return tokens, lemmas
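
# Illustrative sketch of convert_texts() output, guarded so it never executes on import.
# The example sentence is an assumption; the exact tokens/lemmas depend on the spaCy
# model and NLTK stopword list loaded above, so the commented values are approximate.
if False:
    ex_tokens, ex_lemmas = convert_texts(["Plants need sunlight; water to grow."])
    print(ex_tokens)  # e.g. [['Plants', 'need', 'sunlight', 'water', 'grow']]
    print(ex_lemmas)  # e.g. [['plant', 'need', 'sunlight', 'water', 'grow']]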

def get_questions(path_questions, fname):
    df = pd.read_csv(Path(path_questions).joinpath(fname), sep="\t")
    tokens, lemmas = convert_texts(df.Question)
    df["tokens"] = tokens
    df["lemmas"] = lemmas
    print(df.shape)
    return df
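
# Illustrative call (guarded sketch): the directory and filename below follow the
# tg2019task layout referenced in the commented-out download block at the top, and
# are assumptions rather than anything this file defines.
if False:
    path_data = "tg2019task/worldtree_corpus_textgraphs2019sharedtask_withgraphvis"
    df_dev = get_questions(path_data + "/questions", "ARC-Elementary+EXPL-Dev.tsv")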

def read_explanations_with_permutations(path, uids_existing):  # uids_existing is modified-in-place
    df = pd.read_csv(path, sep='\t')
    header, uid_column, dep_column = [], None, None
    for name in df.columns:
        if name.startswith('[SKIP]'):
            if 'UID' in name and not uid_column:
                uid_column = name  # This is the column header
            if 'DEP' in name and not dep_column:
                dep_column = name  # This is the column header
        else:
            header.append(name)  # These are all those not marked '[SKIP]'
    if not uid_column or len(df) == 0:
        print('Possibly misformatted file: ' + path)
        return []
    arr = []
    for idx in df.index:
        if dep_column is not None:
            dep = df.loc[idx][dep_column]
            if not pd.isna(dep) and len(str(dep).strip()) > 0:
                #print(f"Skipping : '{df.loc[idx][dep_column]}' for")
                #print(f"  {df.loc[idx]}")
                if False:  # Actually this hasn't been done properly in the dataset
                    continue
        uid_raw = df.at[idx, uid_column]
        if uid_raw in uids_existing:
            print(f"Skipping duplicate uid : '{uid_raw}'")
            continue
        uids_existing.add(uid_raw)
        cells, combos, combo_tot = dict(), [], 1
        for h, v in zip(header, df.loc[idx][header]):
            s = '' if pd.isna(v) else v
            options = [o.strip() for o in str(s).split(';')]
            options = [o for o in options if len(o) > 0]
            if len(options) == 0: options = ['']
            #print(options)
            cells[h] = options
            combos += [len(options)]
            combo_tot *= len(options)  # Count up the number of combos this contributes
        for i in range(combo_tot):
            # Go through all the columns, figuring out which combo we're on
            combo, lemmas, residual = [], [], i
            for j, h in enumerate(header):
                # Find the relevant part for this specific combo
                c = cells[h][residual % combos[j]]  # Works even if only 1 combo
                if len(cells[h]) > 1: lemmas.append(c)  # This is when there are choices
                combo.append(c)
                residual = residual // combos[j]  # TeeHee
            # Order : uid, text, musthave, orig
            arr.append([
                f"{uid_raw}_{i}",  # uid_i
                ' '.join([c for c in combo if len(c) > 0]),  # text for this combo
                lemmas,
                ' '.join([(f"{{{'; '.join(cells[h])}}}" if len(cells[h]) > 1 else cells[h][0])
                          for h in header
                          if len(cells[h][0]) > 0]).strip(),  # 'orig' for debug
                os.path.basename(path).replace('.tsv', ''),  # 'table'
                uid_raw,  # 'uid_raw'
            ])
    return arr
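
# Worked example of the expansion above (comments only; the cell values are
# illustrative, not taken from a specific table): a row whose non-[SKIP] cells hold
# ["water"] and ["ice", "snow"] (from the cell text "ice; snow") gives
# combo_tot = 1 * 2 = 2, so it emits "<uid>_0" with text "water ice" and "<uid>_1"
# with text "water snow", while 'musthave' records the alternative that
# distinguishes each row (["ice"] / ["snow"]).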

def get_df_explanations(path_tables):
    explanations, uids_existing = [], set()
    for p in path_tables.iterdir():
        #if 'USEDFOR.tsv' not in str(p): continue
        explanations += read_explanations_with_permutations(str(p), uids_existing)
    #print(len(uids_existing))  # Check that uids_existing is being modified-in-place
    df = pd.DataFrame(explanations, columns=("uid", "text", "musthave", "orig", "table", "uid_raw",))
    #print( df[ df.duplicated("uid") ]['uid'] )  # have a look at the problem rows
    # this problem is eliminated in the parse above using the '[SKIP] DEP' column
    #df = df.drop_duplicates("uid")  # NOO!
    #return df
    tokens, lemmas = convert_texts(df.text)
    #df["tokens"] = tokens
    df["lemmas"] = lemmas
    for idx in df.index:
        musthave = df.at[idx, 'musthave']
        for i, musthave_lemma in enumerate(convert_texts(musthave)[1]):
            if len(musthave_lemma) == 0:
                print(f"Need to have lemma of '{musthave[i]}' in :\n  '{df.at[idx, 'orig']}'")
    print(df.shape)
    return df
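
# Illustrative call (guarded sketch): the tables directory follows the layout used
# in the commented-out baseline command at the top of this file; the exact path is
# an assumption here.
if False:
    path_tables = Path("tg2019task/worldtree_corpus_textgraphs2019sharedtask_withgraphvis"
                       "/annotation/expl-tablestore-export-2017-08-25-230344/tables")
    df_exp = get_df_explanations(path_tables)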

def flatten(nested_list):
    return [item for lst in nested_list for item in lst]

def get_flattened_items(dfs, field):
    all_items = []
    for df in dfs:
        all_items.extend(flatten(df[field]))
    return all_items

#all_lemmas = get_flattened_items([df_trn, df_dev, df_test, df_exp], "lemmas")
#80307 original
#80061 all lower-cased, no white-space
# all_tokens = get_flattened_items([df_trn, df_dev, df_test, df_exp], "tokens")
#88152 with 'combos' within exp unravelled
#unique_lemmas = sorted(list(set(all_lemmas)))
#node_lemma = unique_lemmas
#n_nodes=len(node_lemma)
#print(n_nodes)
#5262 with upper+lower case
#4754 all lower-cased
#4751 all lower-cased, no white-space
#4758 with whitelist_words
#4766 with whitelist_words including some required to lengthen explanations
#4717 as above with ';'->'; ' fix
#4716 with 'combos' within exp unravelled
#lemma2node = {lemma:idx for idx, lemma in enumerate(node_lemma)}
#def nodes_to_sentence(nodes):
# return ' '.join([ node_lemma[n] for n in nodes ])
#df_lemma = pd.DataFrame({
# "node": unique_lemmas,
# "embedding": None,
#})
#df_lemma.sample(5, random_state=42)

def get_node_fns(dfs):
    all_lemmas = get_flattened_items(dfs, "lemmas")
    unique_lemmas = sorted(list(set(all_lemmas)))
    print(f"Total number of lemmas found : {len(all_lemmas):d}, unique : {len(unique_lemmas)}")
    node_lemma = unique_lemmas
    #n_nodes=len(node_lemma)
    lemma2node = {lemma: idx for idx, lemma in enumerate(node_lemma)}
    def nodes_to_sentence(nodes):
        return ' '.join([node_lemma[n] for n in nodes])
    return node_lemma, lemma2node
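
# Illustrative sketch (guarded): the node vocabulary is built from the 'lemmas'
# columns of the question and explanation dataframes together, so questions and
# explanations share a single lemma -> node-id mapping. The dataframe names below
# come from the commented-out notes above and are not defined in this file.
if False:
    node_lemma, lemma2node = get_node_fns([df_trn, df_dev, df_test, df_exp])
    print(len(node_lemma))  # unique lemma count (~4716 according to the notes above)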

def decompose_questions(df, lemma2node):
    def get_nodes(lemmas):
        return [lemma2node[lemma] for lemma in lemmas]
    df['q_lem'], df['a_lem'] = None, None
    for prob in df.index:
        multi = re.split(r'\([ABCDEF]\)\s*', df.at[prob, 'Question'])
        j_ans = 'ABCDEF'.find(df.at[prob, 'AnswerKey']) + 1
        q_lem, a_lem = None, []
        _, lemmas = convert_texts(multi)
        for j, ls in enumerate(lemmas):
            #print(ls)
            ids = get_nodes(ls)
            #print(ls, ids, j_ans)
            if 0 == len(ids):
                # Just show that there's a problem (fortunately, no correct answer has 0 lemma terms)
                print(prob, multi, lemmas, j, j_ans, ids)
            if 0 == j:
                q_lem = ids  # This is the question
            else:
                if j == j_ans:  # This is the correct answer (reorder to first in list)
                    a_lem.insert(0, ids)
                else:  # Wrong answers come after correct one
                    a_lem.append(ids)
        if False:
            print(q_lem)
            for a in a_lem:
                print("  ", a)
        df.at[prob, 'q_lem'] = q_lem
        df.at[prob, 'a_lem'] = a_lem
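
# Illustrative sketch (guarded), using the df_dev and lemma2node names from the
# sketches above: after decompose_questions(), each row has the question's node ids
# in 'q_lem' and a list of per-option node-id lists in 'a_lem', with the correct
# answer moved to position 0. The printed values are made up for illustration.
if False:
    decompose_questions(df_dev, lemma2node)
    first = df_dev.index[0]
    print(df_dev.at[first, 'q_lem'])     # e.g. [12, 345, 678]
    print(df_dev.at[first, 'a_lem'][0])  # node ids of the correct answer option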

def add_tfidf_questions(df, node_lemma):
    df['q_tfidf'] = None  # pre-create the column, as done for 'q_lem'/'a_lem' above
    for prob in df.index:
        question_with_ans = df.at[prob, 'q_lem'] + df.at[prob, 'a_lem'][0]
        df.at[prob, 'q_tfidf'] = ' '.join(
            [node_lemma[n] for n in question_with_ans]
        )

def add_tfidf_explanation(dfe, node_lemma):
    # Expects a 'nodes' column on dfe (presumably added elsewhere) holding node ids per explanation
    dfe['e_tfidf'] = None  # pre-create the column
    for e in dfe.index:
        ex_nodes = dfe.at[e, 'nodes']
        dfe.at[e, 'e_tfidf'] = ' '.join([node_lemma[n] for n in ex_nodes])
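
# Illustrative sketch (guarded): the two helpers above rebuild whitespace-joined
# lemma strings from node ids so a standard TF-IDF vectoriser can score questions
# against explanation rows. sklearn is an assumption here; this file does not import it.
if False:
    from sklearn.feature_extraction.text import TfidfVectorizer
    add_tfidf_questions(df_dev, node_lemma)
    add_tfidf_explanation(df_exp, node_lemma)
    vect = TfidfVectorizer()
    vect.fit(list(df_exp['e_tfidf']) + list(df_dev['q_tfidf']))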

def add_gold_explanation_idx(df, df_exp):
    df['ex_gold'] = None
    df['ex_reason'] = None
    for prob in df.index:
        exs_str = df.at[prob, 'explanation']
        ex_arr, reason_arr = [], []
        if type(exs_str) is str:
            for ex_str in exs_str.split(' '):
                uid, reason = ex_str.split('|')
                ex_uid_rows = df_exp[ df_exp['uid_raw'] == uid ]
                exs_arr = []
                for i in range(len(ex_uid_rows)):
                    exs_arr.append( df_exp.index.get_loc(ex_uid_rows.iloc[i].name) )
                if 0 == len(exs_arr):
                    print("Missing ID '%s' in '%s'" % (uid, exs_str,))
                    continue  # missing uid, somehow
                #print(exs_arr)
                ex_arr.append(exs_arr)
                reason_arr.append(reason)
        df.at[prob, 'ex_gold'] = ex_arr        # Each explanation is an array of uid_raw aliases
        df.at[prob, 'ex_reason'] = reason_arr  # Only for those found
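
# Illustrative sketch (guarded): in the shared-task question TSVs the 'explanation'
# column is a space-separated list of "uid|ROLE" pairs; add_gold_explanation_idx()
# maps each gold uid to the integer row positions of its permuted aliases in df_exp.
# The printed values below are made up for illustration.
if False:
    add_gold_explanation_idx(df_dev, df_exp)
    first = df_dev.index[0]
    print(df_dev.at[first, 'ex_gold'])    # e.g. [[101, 102], [2045]]
    print(df_dev.at[first, 'ex_reason'])  # e.g. ['CENTRAL', 'GROUNDING']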