-
Notifications
You must be signed in to change notification settings - Fork 1
/
utils.py
95 lines (78 loc) · 2.56 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import string
# Set of ASCII punctuation characters, used for fast membership tests.
punct = set(string.punctuation)

# Morphology rules: word endings used to assign a coarse class to
# unknown word tokens.
noun_suffix = [
    "action",
    "age",
    "ance",
    "cy",
    "dom",
    "ee",
    "ence",
    "er",
    "hood",
    "ion",
    "ism",
    "ist",
    "ity",
    "ling",
    "ment",
    "ness",
    "or",
    "ry",
    "scape",
    "ship",
    "ty",
]
verb_suffix = [
    "ate",
    "ify",
    "ise",
    "ize",
]
adj_suffix = [
    "able",
    "ese",
    "ful",
    "i",
    "ian",
    "ible",
    "ic",
    "ish",
    "ive",
    "less",
    "ly",
    "ous",
]
adv_suffix = [
    "ward",
    "wards",
    "wise",
]
def get_word_tag(line, vocab):
    """Split one "word tag" line into a (word, tag) pair.

    Args:
        line: A string expected to hold a word and its tag separated by
            whitespace, or nothing but whitespace.
        vocab: Container of known words; only membership (`in`) is used.

    Returns:
        ("--n--", "--s--") when the line contains only whitespace.
        Otherwise (word, tag), where a word missing from `vocab` is
        replaced by the placeholder returned by `assign_unk`.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            whitespace-separated fields.
    """
    fields = line.split()

    # A blank line (e.g. just "\n" or "\t") yields the sentinel pair.
    if not fields:
        return "--n--", "--s--"

    # Exactly two fields are expected; unpacking raises ValueError otherwise.
    word, tag = fields
    if word not in vocab:
        word = assign_unk(word)
    return word, tag
def preprocess(vocab, tokens):
    """Normalise a sequence of raw tokens against a vocabulary.

    Args:
        vocab: Container of known words; only membership (`in`) is used.
        tokens: Iterable of strings, each possibly carrying surrounding
            whitespace such as a trailing newline.

    Returns:
        A tuple (orig, prep) of two lists with one entry per input token:
        `orig` holds each token with surrounding whitespace stripped;
        `prep` holds "--n--" for whitespace-only tokens, the stripped
        token when it is in `vocab`, and otherwise the placeholder
        returned by `assign_unk`.
    """
    orig = []
    prep = []
    for raw in tokens:
        token = raw.strip()
        orig.append(token)

        if not token:
            # Whitespace-only entry.
            prep.append("--n--")
        elif token in vocab:
            prep.append(token)
        else:
            # Classify the *stripped* token: a trailing newline would
            # otherwise defeat every suffix-based rule in assign_unk.
            prep.append(assign_unk(token))

    # Exactly one entry is appended to each list per token, so both
    # outputs always have the same length as the input.
    return orig, prep
def processing(vocab, text):
    """Map each token of `text` to its vocabulary-normalised form.

    Args:
        vocab: Container of known words; only membership (`in`) is used.
        text: Iterable of strings, each possibly carrying surrounding
            whitespace.

    Returns:
        A list with one entry per input token: "--n--" for
        whitespace-only tokens, the stripped token when it is in
        `vocab`, and otherwise the placeholder returned by `assign_unk`.
    """
    prep_sentence = []
    for raw in text:
        token = raw.strip()

        if not token:
            # Whitespace-only entry.
            prep_sentence.append("--n--")
        elif token in vocab:
            prep_sentence.append(token)
        else:
            # Classify the *stripped* token so surrounding whitespace
            # cannot defeat the suffix-based rules in assign_unk.
            prep_sentence.append(assign_unk(token))

    # Exactly one entry is appended per token, so the output length
    # always equals the input length.
    return prep_sentence
def assign_unk(tok):
    """Return a placeholder label describing an unknown token.

    Rules are tried in order and the first match wins: any digit, any
    punctuation character, any upper-case character, then noun, verb,
    adjective and adverb suffixes. "--unk--" is returned when no rule
    matches.
    """
    # Character-level rules take priority over the suffix rules.
    if any(ch.isdigit() for ch in tok):
        return "--unk_digit--"
    if any(ch in punct for ch in tok):
        return "--unk_punct--"
    if any(ch.isupper() for ch in tok):
        return "--unk_upper--"

    # Suffix rules, listed in priority order.
    suffix_rules = (
        (noun_suffix, "--unk_noun--"),
        (verb_suffix, "--unk_verb--"),
        (adj_suffix, "--unk_adj--"),
        (adv_suffix, "--unk_adv--"),
    )
    for suffixes, label in suffix_rules:
        # str.endswith accepts a tuple and is true if any entry matches.
        if tok.endswith(tuple(suffixes)):
            return label

    return "--unk--"