#!/usr/bin/env python3
import spacy
import statistics
import math
import pandas as pd
import pickle

# Spanish Billion Word Corpus and Embeddings
# https://github.com/crscardellino/sbwce
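
# This script builds, from a line-per-sentence text file:
#   vocabulary:   word -> frequency
#   word_POS:     (word, POS tag) -> frequency
#   sentence_POS: sentence index -> (spaCy Doc, list of POS tags)
#   POS2word:     POS tag -> set of words seen with that tag
# plus positional POS-tag frequency tables (see __main__ below).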


class Vocabulary:
    def __init__(self, file, nlp):
        self.nlp = nlp
        self.vocabulary = {}
        self.word_POS = {}
        self.sentence_POS = {}
        self.POS2word = {}
        self.read_file(file)
        self.mean_POS_length = self.get_mean_POS_length()

    def read_file(self, file):
        with open(file, 'r') as f:
            for i, line in enumerate(f):
                # creating vocabulary
                self.get_vocabulary(line)
                # creating POS
                sentence = self.nlp(line.lower().strip())
                self.get_word_POS(sentence)
                # creating sentence POS
                self.get_sentence_POS(i, sentence)
                # creating POS2word
                self.get_POS2word(sentence)

    def get_vocabulary(self, line):
        for word in line.split():
            word = word.lower()
            try:
                self.vocabulary[word] += 1
            except KeyError:
                self.vocabulary[word] = 1

    def get_word_POS(self, sentence):
        for word in sentence:
            try:
                self.word_POS[(word.text, word.pos_)] += 1
            except KeyError:
                self.word_POS[(word.text, word.pos_)] = 1

    def get_sentence_POS(self, i, sentence):
        current_sentence_POS = []
        for word in sentence:
            current_sentence_POS.append(word.pos_)
        self.sentence_POS[i] = (sentence, current_sentence_POS)

    def get_POS2word(self, sentence):
        for word in sentence:
            try:
                self.POS2word[word.pos_].add(word.text)
            except KeyError:
                self.POS2word[word.pos_] = {word.text}

    def get_mean_POS_length(self):
        lengths = [len(self.sentence_POS[i][1]) for i in range(len(self.sentence_POS))]
        m = statistics.mean(lengths)
        return math.ceil(m)  # round up to a whole number of tokens
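

# Note: mean_POS_length (the mean sentence length in tokens, rounded up)
# determines how many position columns the frequency tables below have.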
if __name__ == '__main__':
    # file_txt = 'sbwce.clean.small.txt'
    file_txt = 'decimas_data_small.txt'
    nlp = spacy.load("es_core_news_sm")
    vocabulary = Vocabulary(file_txt, nlp)
    # vocabulary.vocabulary = dict(sorted(vocabulary.vocabulary.items(), key=lambda x: x[1], reverse=True))
    # word_POS = dict(sorted(vocabulary.word_POS.items(), key=lambda x: x[0][0], reverse=True))
    # print(vocabulary.POS2word)

    # POS tag frequencies: rows are POS tags, columns are token positions
    raw_count = pd.DataFrame(0, columns=range(vocabulary.mean_POS_length), index=vocabulary.POS2word)
    postag_freq = pd.DataFrame(0.0, columns=range(vocabulary.mean_POS_length), index=vocabulary.POS2word)
    position_freq = pd.DataFrame(0.0, columns=range(vocabulary.mean_POS_length), index=vocabulary.POS2word)
    for sentence in vocabulary.sentence_POS:
        seq = vocabulary.sentence_POS[sentence][1]
        for position in range(len(seq)):
            POS = seq[position]
            try:
                raw_count.loc[POS, position] += 1
            except KeyError:
                break
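    # Positions at or beyond mean_POS_length have no column in raw_count, so
    # .loc raises KeyError above and the tail of longer sentences is skipped.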
    print('raw count\n', raw_count)
    for index, row in raw_count.iterrows():
        total = raw_count.loc[index].sum()
        for col in range(len(row)):
            count = raw_count.loc[index, col]
            postag_freq.loc[index, col] = count / total * 100  # percentage
        row_sum = round(postag_freq.loc[index].sum())
        if row_sum != 100:
            raise ValueError(f'row {index} sums to {row_sum}; it should be 100')
    print('POS tag\n', postag_freq)
    for col_name, column in raw_count.items():
        total = column.sum()
        for row_name in raw_count.index:
            count = raw_count.loc[row_name, col_name]
            position_freq.loc[row_name, col_name] = count / total * 100
        col_sum = round(position_freq[col_name].sum())
        if col_sum != 100:
            raise ValueError(f'column {col_name} sums to {col_sum}; it should be 100')
    print('position\n', position_freq)
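
    # Interpretation: each row of postag_freq sums to 100 (how one POS tag's
    # occurrences distribute over positions), and each column of position_freq
    # sums to 100 (how the POS tags share a given position).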

    # uncomment these lines to save the files
    """
    raw_count.to_json('raw_count.json')
    postag_freq.to_json('postag_freq.json')
    position_freq.to_json('position_freq.json')
    with open('vocabulary.pk', 'wb') as picklefile:
        pickle.dump(vocabulary, picklefile)
    """