forked from Nuclear-Squid/ergol
-
Notifications
You must be signed in to change notification settings - Fork 0
/
chardict.py
executable file
·76 lines (66 loc) · 2.62 KB
/
chardict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/bin/env python3
""" Turn corpus texts into dictionaries of symbols and digrams. """
import json
from os import path, listdir
from sys import argv
# Characters that are never counted; meeting one also breaks the running
# digram/trigram chain in parse_corpus().
IGNORED_CHARS = "1234567890 \t\r\n\ufeff"


def parse_corpus(file_path):
    """Count symbols, digrams and trigrams in a UTF-8 text file.

    Every character is lowercased before being counted. Characters listed
    in ``IGNORED_CHARS`` are skipped and reset the n-gram chain, so no
    digram or trigram ever spans one of them.

    Args:
        file_path: Path of the corpus text file (read as UTF-8).

    Returns:
        A dict with four keys: ``"corpus"`` (the given ``file_path``) and
        ``"symbols"``, ``"digrams"``, ``"trigrams"``. Each of the three
        tables maps an n-gram to its frequency, expressed as a percentage
        of the number of counted (non-ignored) characters, ordered from
        most to least frequent. Entries whose rounded frequency is 0 are
        dropped.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    symbols = {}
    digrams = {}
    trigrams = {}
    char_count = 0
    prev_symbol = None
    prev_prev_symbol = None

    # The context manager guarantees the handle is closed even when
    # decoding fails half-way through the read.
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()

    # get a dictionary of all symbols (letters, punctuation marks...)
    for char in text:
        if char in IGNORED_CHARS:
            # Break the chain: the next symbol starts a fresh n-gram.
            # prev_prev_symbol is cleared implicitly on the next counted
            # character, when it takes the value of prev_symbol (None).
            prev_symbol = None
            continue

        symbol = char.lower()
        char_count += 1
        symbols[symbol] = symbols.get(symbol, 0) + 1

        if prev_symbol is not None:
            digram = prev_symbol + symbol
            digrams[digram] = digrams.get(digram, 0) + 1
            if prev_prev_symbol is not None:
                trigram = prev_prev_symbol + digram
                trigrams[trigram] = trigrams.get(trigram, 0) + 1

        prev_prev_symbol = prev_symbol
        prev_symbol = symbol

    # sort the dictionary by symbol frequency (requires CPython 3.6+)
    def sort_by_frequency(table, precision=3):
        """Turn raw counts into percentages, most frequent first."""
        sorted_dict = {}
        # sorted() is stable, so ties keep their first-seen order.
        for (key, count) in sorted(table.items(), key=lambda x: -x[1]):
            freq = round(100 * count / char_count, precision)
            if freq > 0:
                sorted_dict[key] = freq
        return sorted_dict

    return {
        "corpus": file_path,
        "symbols": sort_by_frequency(symbols),
        "digrams": sort_by_frequency(digrams, 4),
        "trigrams": sort_by_frequency(trigrams),
    }
# Command-line entry point.
#   chardict.py FILE  -> print the JSON statistics for FILE on stdout
#   chardict.py       -> convert every *.txt file next to this script into
#                        ../corpus/<name>.json
if __name__ == "__main__":
    if len(argv) == 2:  # convert one file
        data = parse_corpus(argv[1])
        print(json.dumps(data, indent=4, ensure_ascii=False))
    else:  # converts all *.txt files in the script directory
        # Resolve to an absolute path first: on Python < 3.9 `__file__` may
        # be a bare relative name, whose dirname is "" and would make
        # listdir() fail.
        rootdir = path.dirname(path.abspath(__file__))
        destdir = path.join(rootdir, "..", "corpus")
        for filename in listdir(rootdir):
            if filename.endswith(".txt"):
                print(f" {filename}...")
                data = parse_corpus(path.join(rootdir, filename))
                # filename[:-4] strips the ".txt" extension
                destfile = path.join(destdir, filename[:-4] + ".json")
                with open(destfile, "w", encoding="utf-8") as outfile:
                    json.dump(data, outfile, indent=4, ensure_ascii=False)