pre_build.py
import re
import os
import shutil


def collect_lemmata(dirName):
    """
    Concatenate all lexeme and lexical rule files found in dirName.
    Return the deduplicated, sorted lexemes and the lexical rules as strings.
    """
    lemmata = ''
    lexrules = ''
    for fname in os.listdir(dirName):
        if fname.endswith('.txt') and fname.startswith('hye-lexemes-'):
            with open(os.path.join(dirName, fname), 'r', encoding='utf-8-sig') as f:
                lemmata += f.read() + '\n'
        elif fname.endswith('.txt') and fname.startswith('hye-lexrules-'):
            with open(os.path.join(dirName, fname), 'r', encoding='utf-8-sig') as f:
                lexrules += f.read() + '\n'
    # Each lexeme entry starts with a "-lexeme" line followed by indented field lines.
    lemmataSet = set(re.findall('-lexeme\n(?: [^\r\n]*\n)+', lemmata, flags=re.DOTALL))
    lemmata = '\n'.join(sorted(lemmataSet))
    return lemmata, lexrules
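
# A minimal sketch of the kind of entry the regex above is meant to match
# (an assumption based on the general uniparser-morph lexeme conventions;
# the actual fields in the hye-lexemes-*.txt files may differ):
#
#   -lexeme
#    lex: գիրք
#    stem: գիրք.|գրք.
#    gramm: N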


def prepare_files():
    """
    Put all the lemmata into lexemes.txt and all the lexical
    rules into lex_rules.txt.
    Copy all grammar files to uniparser_eastern_armenian/data/.
    """
    lemmata, lexrules = collect_lemmata('.')
    with open('uniparser_eastern_armenian/data/lexemes.txt', 'w', encoding='utf-8') as fOutLemmata:
        fOutLemmata.write(lemmata)
    with open('paradigms.txt', 'r', encoding='utf-8-sig') as fInParadigms:
        paradigms = fInParadigms.read()
    with open('uniparser_eastern_armenian/data/paradigms.txt', 'w', encoding='utf-8') as fOutParadigms:
        fOutParadigms.write(paradigms)
    with open('uniparser_eastern_armenian/data/lex_rules.txt', 'w', encoding='utf-8') as fOutLexrules:
        fOutLexrules.write(lexrules)
    shutil.copy2('bad_analyses.txt', 'uniparser_eastern_armenian/data/')
    shutil.copy2('armenian_disambiguation.cg3', 'uniparser_eastern_armenian/data/')
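
# After prepare_files() runs, uniparser_eastern_armenian/data/ should contain
# lexemes.txt, paradigms.txt, lex_rules.txt, bad_analyses.txt and
# armenian_disambiguation.cg3 -- presumably everything the packaged analyzer loads.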


def parse_wordlists():
    """
    Analyze wordlists/eanc_wordlist.csv.
    """
    from uniparser_eastern_armenian import EasternArmenianAnalyzer
    a = EasternArmenianAnalyzer()
    a.analyze_wordlist(freqListFile='wordlists/eanc_wordlist.csv',
                       parsedFile='wordlists/eanc_wordlist_analyzed.txt',
                       unparsedFile='wordlists/eanc_wordlist_unanalyzed.txt',
                       verbose=True)


if __name__ == '__main__':
    prepare_files()
    parse_wordlists()
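

# A minimal usage sketch of the packaged analyzer once the build is done.
# This assumes analyze_words() is available with the behaviour of other
# uniparser-morph based analyzers (a single string yields a list of Wordform
# objects); check the package documentation before relying on it.
#
#   from uniparser_eastern_armenian import EasternArmenianAnalyzer
#   a = EasternArmenianAnalyzer()
#   analyses = a.analyze_words('գիրք')
#   for ana in analyses:
#       print(ana.wf, ana.lemma, ana.gramm)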