-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathparse_definitions.py
executable file
·136 lines (103 loc) · 4.33 KB
/
parse_definitions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env python3
#
# This script searches for definitions for those words already in the database.
#
# - Each wiktionary entry is for one headword
# - Each headword can have multiple etymologies
# - Each etymology can have multiple definitions
# - Each definition can have multiple "texts"
#
# So if x is the entry for "με":
# x[0]['definitions'][0]['text'][0] is
# first etymology, list of definitions, first definition, list of definition texts, first definition text
import re, sqlite3, json, yaml
class Word:
    """A headword together with the Anki card text built for it.

    Attributes:
        word: the headword; replaced by the supplemental ``new_word`` when a
            manually-entered definition exists for it.
        wiki: the decoded ``raw_wiki`` JSON, or ``None`` when there is none.
        definitions: ``Definition`` objects parsed from ``wiki``; empty when
            the wiki data was not parsed.
        keep: ``True`` when the word has card text and should be exported.
        anki: the card text, or ``None`` when ``keep`` is ``False``.
    """

    def __init__(self, word, raw_wiki, supplemental_definitions):
        """Build the word from its raw wiki JSON and the manual definitions.

        Args:
            word: the headword as stored in the database.
            raw_wiki: JSON text for the wiktionary entry (a list of
                etymologies, each with a ``'definitions'`` list), or ``None``.
            supplemental_definitions: mapping of word to a dict providing
                ``'anki'`` and ``'new_word'``; takes precedence over the
                wiki data.
        """
        self.word = word
        # json.loads(None) raises TypeError; treat a missing payload the same
        # way as a JSON ``null`` so the "no wiki data" path below handles it.
        self.wiki = json.loads(raw_wiki) if raw_wiki is not None else None
        # Always define these so callers can read them on every code path.
        self.definitions = []
        self.anki = None
        self.keep = False

        if self.word in supplemental_definitions:
            # don't parse, just look in the manually-entered data
            manual = supplemental_definitions[self.word]
            self.anki = manual['anki']
            # change the word to the new display
            self.word = manual['new_word']
            self.keep = True
            return

        if self.wiki is None:
            # drop if no wiki data
            return

        self.definitions = [
            Definition(self.word, definition)
            for etymology in self.wiki
            for definition in etymology['definitions']
        ]
        # Keep the word only if at least one definition survived parsing.
        if any(definition.keep for definition in self.definitions):
            self.keep = True
            self.anki = self.ankiize()

    def ankiize(self):
        """Return the kept definitions' card texts joined by a blank line."""
        return "<br><br>".join(
            definition.anki for definition in self.definitions if definition.keep
        )
class Definition:
    """One wiktionary definition of a headword, reduced to Anki card text.

    Attributes:
        word: the headword this definition belongs to.
        part_of_speech: value of the definition's ``'partOfSpeech'`` key.
        keep: ``True`` when the definition parsed and should be exported.
        first_text_rest: whatever follows the ``(pronunciation)`` group in the
            headline text; ``''`` when nothing follows or nothing was parsed.
        other_texts: the remaining texts with whitespace runs collapsed.
        texts: return value of ``clean_texts`` (always ``None``); retained
            only because the attribute has always existed.
        anki: the card text, or ``None`` when ``keep`` is ``False``.
    """

    # Headline shape: "<word> • (<pronunciation>) <rest>".
    _HEADLINE = re.compile(
        r"(?P<word>.+)\s+•\s+\((?P<pronunciation>.+)\)\s*(?P<rest>.*)"
    )
    _WHITESPACE = re.compile(r'\s+')

    def __init__(self, word, definition):
        """Parse *definition* (a dict with ``'partOfSpeech'`` and ``'text'``).

        Raises:
            RuntimeError: if the headline contains a bullet but does not
                match the expected ``word • (pronunciation)`` shape.
        """
        self.word = word
        self.part_of_speech = definition['partOfSpeech']
        # Always define these so callers can read them on every code path.
        self.keep = False
        self.anki = None
        self.first_text_rest = ''
        self.other_texts = []
        self.texts = None

        # Numerals and letters are never exported.
        if self.part_of_speech in ('numeral', 'letter'):
            return

        self.texts = self.clean_texts(definition['text'])
        if self.keep:
            self.anki = self.ankiize()

    @staticmethod
    def useless_text(x):
        """Return True for texts containing 'dated' or starting with 'form of'."""
        return ('dated' in x) or (x.startswith('form of'))

    def clean_texts(self, texts):
        """Filter *texts*, parse the headline, and record the result on self.

        Sets ``keep``; on success also sets ``first_text_rest`` and
        ``other_texts``. Always returns ``None``.

        Raises:
            RuntimeError: if the first remaining text contains a bullet but
                cannot be parsed as a headline.
        """
        texts = [x for x in texts if not self.useless_text(x)]
        if not texts:
            self.keep = False
            return None

        first_text = texts[0]
        # Without a bullet the first text is not a headline; drop the definition.
        if '•' not in first_text:
            self.keep = False
            return None

        m = self._HEADLINE.match(first_text)
        if m is None:
            # Report the texts that follow the headline; the name used here
            # previously was undefined and raised NameError instead.
            raise RuntimeError('unparsed definition:', self.word, first_text, texts[1:])

        self.first_text_rest = m.group('rest')
        self.other_texts = [self._WHITESPACE.sub(' ', x) for x in texts[1:]]
        self.keep = True
        return None

    def ankiize(self):
        """Return the card text: a headline line followed by the other texts."""
        first_line = "{} {}".format(self.word, self.part_of_speech)
        if self.first_text_rest != '':
            # NOTE(review): the remainder is appended with no separating space
            # (e.g. "noun" + "m" -> "nounm"); looks unintended - confirm before
            # changing, since it alters the exported card text.
            first_line += self.first_text_rest
        lines = [first_line] + self.other_texts
        return "<br>".join(lines)
def load_supplemental_definitions(fn):
    """Load the manually-entered definitions from the YAML file *fn*.

    Each entry maps a word to a dict with a ``'def'`` string and an optional
    ``'new_word'`` display form.

    Returns:
        The loaded dict, with every entry augmented in place:
        ``'new_word'`` defaults to the entry's key, and ``'anki'`` is
        ``new_word``, a space, then ``def`` with each ``;`` turned into
        ``<br>``. An empty file yields an empty dict.
    """
    with open(fn, encoding='utf-8') as f:
        # yaml.load() without an explicit Loader is rejected by PyYAML >= 6
        # and can construct arbitrary objects on older versions; this file
        # only needs plain mappings and strings, so safe_load is sufficient.
        # An empty document loads as None, hence the fallback.
        defs = yaml.safe_load(f) or {}
    for word, data in defs.items():
        data.setdefault('new_word', word)
        data['anki'] = data['new_word'] + " " + data['def'].replace(";", "<br>")
    return defs
if __name__ == '__main__':
    # Reads lemmas from the sqlite DB and writes three files:
    #   words.txt         - every lemma that has wiki data, most frequent first
    #   dropped_words.txt - lemmas for which no usable definition was found
    #   anki.tsv          - "<word>\t<card text>" for every kept lemma
    # All text files are opened as UTF-8 explicitly because they carry Greek
    # text and the locale default encoding is not reliable across platforms.

    # load up DB
    connection = sqlite3.connect('greek_db')
    try:
        cursor = connection.cursor()

        # load up words to skip (a set, since it is probed once per row)
        with open('words_to_skip.txt', encoding='utf-8') as f:
            words_to_skip = {line.rstrip() for line in f}

        # load up supplemental definitions
        supplemental_definitions = load_supplemental_definitions('supplemental_definitions.yaml')

        rows = list(cursor.execute('''select word, wiki from lemmas where wiki is not null order by frequency desc'''))
    finally:
        # Everything needed is in memory now; release the DB handle.
        connection.close()

    with open('words.txt', 'w', encoding='utf-8') as f:
        for row in rows:
            print(row[0], file=f)

    words = [
        Word(word, wiki, supplemental_definitions)
        for word, wiki in rows
        if word not in words_to_skip
    ]

    # look for dropped words
    dropped_words = [x for x in words if not x.keep]
    with open('dropped_words.txt', 'w', encoding='utf-8') as f:
        for x in dropped_words:
            print(x.word, file=f)

    with open('anki.tsv', 'w', encoding='utf-8') as f:
        for word in words:
            if word.keep:
                print(word.word, word.anki, sep='\t', file=f)