-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpos.py
303 lines (274 loc) · 11 KB
/
pos.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
# -----------------------------------
# POS Tagging
# -----------------------------------
from collections import Counter
import pandas as pd
import numpy
# -----------------------------------
# POS TAGS + FREQUENCIES
# -----------------------------------
# GET_POS_TAGS()
# -----------------------------------
# SENTS = iterable of passages; each passage is an iterable of spaCy sentences as yielded by Doc.sents (https://spacy.io/api/doc#sents)
# POS_TAGGED_DICT = dict(); all individual terms (= each term once per passage) and their associated POS Tags
# POS_TAGGED_LIST = list(); all POS Tags per passage in their original order
def get_pos_tags(sents):
    """Collect the POS tags of every passage in ``sents``.

    ``sents`` is iterated on three levels: passages, the sentences inside a
    passage, and the tokens inside a sentence. Each token must expose
    ``.text`` and ``.pos_`` (presumably spaCy tokens -- confirm with caller).

    Returns a tuple ``(pos_tagged_dict, pos_tagged_list)``:

    * ``pos_tagged_dict`` maps the passage number (starting at 0) to a dict
      ``{token text: POS tag}``. A text occurring several times in one
      passage keeps the tag of its last occurrence.
    * ``pos_tagged_list`` holds one list per passage with the POS tags of
      all tokens in their original order.
    """
    pos_tagged_dict = {}
    pos_tagged_list = []
    for passage_no, passage in enumerate(sents):
        # Flatten in a single pass so one-shot iterables are consumed once.
        tokens = [token for sentence in passage for token in sentence]
        pos_tagged_dict[passage_no] = {token.text: token.pos_ for token in tokens}
        pos_tagged_list.append([token.pos_ for token in tokens])
    return pos_tagged_dict, pos_tagged_list
# COUNT_TAG_FREQS()
# -----------------------------------
# POS_TAGGED = list(); output POS_TAGGED_LIST from GET_POS_TAGS
# SORTED_TAG_FREQS = pd.DataFrame(); where rows = tag name, columns = index of passage, values = relative frequencies
# TAGS_USED = list(); all individual POS Tags used within an input text/document
def count_tag_freqs(pos_tagged):
    """Compute relative POS tag frequencies for every passage.

    ``pos_tagged`` is a list with one list of POS tags per passage (the
    second return value of ``get_pos_tags``).

    Returns a tuple ``(sorted_tag_freqs, tags_used)``:

    * ``sorted_tag_freqs`` is a DataFrame with one row per tag (sorted by
      tag name) and one column per passage number. A cell holds the tag's
      count divided by the number of tags in that passage; tags absent from
      a passage are NaN.
    * ``tags_used`` lists every distinct tag in order of first appearance.
    """
    freqs_per_passage = {}
    tags_used = []
    for passage_no, passage in enumerate(pos_tagged):
        tally = Counter(passage)
        # Remember tags the first time they show up anywhere in the input.
        for tag in tally:
            if tag not in tags_used:
                tags_used.append(tag)
        total = sum(tally.values())
        freqs_per_passage[passage_no] = {
            tag: hits / total for tag, hits in tally.items()
        }
    sorted_tag_freqs = pd.DataFrame(freqs_per_passage).sort_index()
    return sorted_tag_freqs, tags_used
# -----------------------------------
# RETURN POS N-GRAMS
# -----------------------------------
# FIND_NGRAM_INDEX()
# -----------------------------------
# POS_TAGGED = list(); output POS_TAGGED_LIST from GET_POS_TAGS
# NGRAM = str(); in the form "[pos_tag],[pos_tag_2],[pos_tag_n]"
# INDEX_LIST = list()
def find_ngram_index(pos_tagged, ngram):
    """Return the indices of all passages that contain the POS n-gram.

    ``pos_tagged`` is a list with one list of POS tags per passage.
    ``ngram`` is a string of comma-separated tags such as ``"ADJ, NOUN"``;
    spaces are ignored and empty items (e.g. from a trailing comma) are
    dropped.

    A passage matches when the requested tags occur as a contiguous run of
    whole tags. Tags are compared as units rather than as raw substrings, so
    ``"X,NOUN"`` does not match ``AUX, NOUN`` and ``"CONJ"`` does not match
    ``CCONJ``. An n-gram without any tag matches every passage.

    Returns the list of matching passage indices in ascending order.
    """
    wanted = [tag for tag in ngram.replace(" ", "").split(",") if tag]
    size = len(wanted)
    index_list = []
    for index, passage in enumerate(pos_tagged):
        tags = list(passage)
        # Slide a window of ``size`` tags over the passage.
        last_start = len(tags) - size
        if any(tags[start:start + size] == wanted for start in range(last_start + 1)):
            index_list.append(index)
    return index_list
# FIND_NGRAMS()
# -----------------------------------
# POS_TAGGED = list(); output POS_TAGGED_LIST from GET_POS_TAGS
# N = int(); n as in n-Gram
# NGRAMS_LIST = list(); all the corresponding n-Grams found
def find_ngrams(pos_tagged, n):
    """Return every POS n-gram of length ``n`` found in the passages.

    ``pos_tagged`` is a list with one list of POS tags per passage. N-grams
    never span two passages. Passages shorter than ``n`` contribute nothing.

    The window runs up to and including the last possible start position,
    so the final n-gram of each passage is part of the result.

    Returns a flat list of n-grams, each a list of ``n`` tags, in order of
    appearance. Raises ``ValueError`` when ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer, got %r" % (n,))
    ngrams_list = []
    for nested_list in pos_tagged:
        # +1 so the window starting at len - n (the last full one) is kept.
        for start in range(len(nested_list) - n + 1):
            ngrams_list.append(list(nested_list[start:start + n]))
    return ngrams_list
# NGRAM_COUNT()
# -----------------------------------
# NGRAMS = list(); output NGRAMS_LIST from FIND_NGRAMS
# DF = pd.DataFrame(); cols = ["ngram", "count"]
def ngram_count(ngrams):
    """Count how often each n-gram occurs.

    ``ngrams`` is a list of n-grams, each a sequence of POS tags.

    Returns a DataFrame with the columns ``"ngram"`` (the n-gram tuple
    rendered as a string, e.g. ``"('ADJ', 'NOUN')"``) and ``"count"``,
    sorted by count in descending order. The sort is stable, so n-grams with
    equal counts stay in order of first appearance. The index is not reset:
    each row keeps the position at which its n-gram was first seen.

    An empty input yields an empty DataFrame with the same two columns.
    """
    ngram_counter = Counter(tuple(item) for item in ngrams)
    df = pd.DataFrame(
        {
            "ngram": [str(gram) for gram in ngram_counter],
            "count": list(ngram_counter.values()),
        }
    )
    # mergesort is stable, which keeps the order of ties deterministic.
    return df.sort_values(by=["count"], ascending=False, kind="mergesort")
# GET_N_NGRAMS()
# -----------------------------------
# N = list([int_1, int_2, int_n]); n as in n-Gram
# TOPN = int(); how many highest values to return
# POS_TAGGED = list(); output POS_TAGGED_LIST from GET_POS_TAGS
# NGRAMS = list() of pd.DataFrames(); for each n returns a df similar to output DF from NGRAM_COUNT()
# NAMES = list(); contains names for each of the nested dfs to use for vis
def get_n_ngrams(n, topn, pos_tagged):
    """Return the ``topn`` most frequent POS n-grams for one or more ``n``.

    ``n`` is either a single n-gram length or a list/tuple of lengths.
    ``topn`` is the number of leading rows to keep from each count table.
    ``pos_tagged`` is a list with one list of POS tags per passage.

    Returns a tuple ``(ngrams, names)``:

    * for a list/tuple ``n``: ``ngrams`` is a list with one DataFrame (as
      produced by ``ngram_count``) per length, ``names`` the matching labels
      ``"<n>-grams"``;
    * for a single ``n``: ``ngrams`` is that one DataFrame and ``names`` a
      one-element list with its label.

    ``topn`` is applied in both cases.
    """
    if isinstance(n, (list, tuple)):
        ngrams = []
        names = []
        for number in n:
            counted = ngram_count(find_ngrams(pos_tagged, number))
            ngrams.append(counted[:topn])
            names.append(str(number) + "-grams")
    else:
        # Honour topn here as well; it used to be ignored for a single n.
        ngrams = ngram_count(find_ngrams(pos_tagged, n))[:topn]
        names = [str(n) + "-grams"]
    return ngrams, names
# -----------------------------------
# COMPARE POS N-GRAMS
# -----------------------------------
# LIST_INDIVIDUAL_GRAMS()
# -----------------------------------
# DF_LISTS = list() of pandas.DataFrames(); output NGRAMS created by GET_N_NGRAMS()
# ALL_GRAMS_LIST = list(); all individual ngrams over the different input DataFrames
def list_individual_grams(df_lists):
    """Collect the distinct n-grams found across several lists of tables.

    ``df_lists`` is a list whose items are lists of DataFrames with an
    ``ngram`` column (the first return value of ``get_n_ngrams`` when it was
    called with a list of lengths).

    An n-gram is skipped when the index label of its row is 0 or 1.
    NOTE(review): this compares the row's index label, not its count --
    confirm that this is the intended filter.

    Returns the remaining n-grams without duplicates, in order of first
    appearance.
    """
    all_grams_list = []
    for frames in df_lists:
        for frame in frames:
            column = frame.ngram
            for gram in column:
                # Index label(s) of the row(s) holding this n-gram.
                labels = column[column == gram].index.values
                if labels <= 1:
                    continue
                if gram not in all_grams_list:
                    all_grams_list.append(gram)
    return all_grams_list
# GRAMS_MATRIX_PREP()
# -----------------------------------
# GRAMS = list() of pandas.DataFrames(); output NGRAMS created by GET_N_NGRAMS()
# ALL_GRAMS = list(); output ALL_GRAMS_LIST from LIST_INDIVIDUAL_GRAMS()
# TYPE = str(); either "binary" or "count" -> add check type before execution in the future
# CHECK_GRAMS = returns list of binary values/count for each n-gram in all_grams
def grams_matrix_prep(grams, all_grams, type):
    """Build one matrix row telling which n-grams occur in ``grams``.

    ``grams`` is a list of DataFrames with the columns ``"ngram"`` and
    ``"count"``; they are concatenated before the lookup.
    ``all_grams`` is the list of n-grams that defines the row's columns.
    ``type`` selects the cell value: ``"binary"`` gives 1 for a present
    n-gram, ``"count"`` gives its count. When an n-gram occurs in several
    rows, the count of the first occurrence is used. Absent n-grams give 0.

    Returns a list with exactly one entry per item of ``all_grams``.
    Raises ``ValueError`` for any other ``type``, because the result would
    otherwise be shorter than ``all_grams`` and no longer aligned with it.
    """
    if type not in ("binary", "count"):
        raise ValueError('type must be "binary" or "count", got %r' % (type,))
    grams = list(grams)
    if not grams:
        # Nothing to look up in: every n-gram is absent.
        return [0 for _ in all_grams]
    df = pd.concat(grams, ignore_index=True)
    # Build the lookup once instead of rescanning the column per n-gram.
    first_count = {}
    for gram, count in zip(df["ngram"], df["count"].values):
        first_count.setdefault(gram, count)
    if type == "binary":
        return [1 if gram in first_count else 0 for gram in all_grams]
    return [first_count.get(gram, 0) for gram in all_grams]
# -----------------------------------
# POS DIVERSITY
# -----------------------------------
# GET_POS_DIVERSITY()
# -----------------------------------
# DF = pandas.DataFrame; output SORTED_TAG_FREQS from COUNT_TAG_FREQS()
# DIV_DF = pandas.DataFrame; contains column "pos_diversity" (Shannon Entropy for all POS Tag frequencies per passage)
def get_pos_diversity(df):
    """Compute the Shannon entropy (base 2) of each column of ``df``.

    ``df`` holds relative POS tag frequencies: one row per tag, one column
    per passage (the first return value of ``count_tag_freqs``). NaN cells
    are ignored. Zero frequencies are ignored as well, because
    ``0 * log2(0)`` would otherwise turn the whole sum into NaN.

    Returns a DataFrame with the single column ``"pos_diversity"`` and one
    row per column of ``df``. A column without usable values yields 0.0.
    """
    list_vals = []
    for col in df.columns:
        weighted_logs = [
            share * numpy.log2(share)
            for share in df[col]
            if not numpy.isnan(share) and share > 0
        ]
        total = sum(weighted_logs)
        # Negate the sum, but report a plain 0.0 instead of -0.0.
        list_vals.append(-total if total != 0 else 0.0)
    div_df = pd.DataFrame(list_vals, columns=['pos_diversity'])
    return div_df
# -----------------------------------
# POS TAGS GLOSSARY
# -----------------------------------
# look up all existing possible pos tags: https://github.com/explosion/spaCy/blob/master/spacy/glossary.py
# used here: only universal & german pos tags so far
def get_pos_glossary():
    """Return a dict mapping POS tag names to short English descriptions.

    The glossary lists the universal POS tags first, followed by the German
    (TIGER Treebank) tags. A new dict is built on every call.
    """
    # Universal POS Tags
    # http://universaldependencies.org/u/pos/
    universal_tags = {
        "ADJ": "adjective",
        "ADP": "adposition",
        "ADV": "adverb",
        "AUX": "auxiliary",
        "CONJ": "conjunction",
        "CCONJ": "coordinating conjunction",
        "DET": "determiner",
        "INTJ": "interjection",
        "NOUN": "noun",
        "NUM": "numeral",
        "PART": "particle",
        "PRON": "pronoun",
        "PROPN": "proper noun",
        "PUNCT": "punctuation",
        "SCONJ": "subordinating conjunction",
        "SYM": "symbol",
        "VERB": "verb",
        "X": "other",
        "EOL": "end of line",
        "SPACE": "space",
    }
    # POS Tags (German)
    # TIGER Treebank
    # http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
    german_tags = {
        "$(": "other sentence-internal punctuation mark",
        "$,": "comma",
        "$.": "sentence-final punctuation mark",
        "ADJA": "adjective, attributive",
        "ADJD": "adjective, adverbial or predicative",
        "APPO": "postposition",
        "APPR": "preposition; circumposition left",
        "APPRART": "preposition with article",
        "APZR": "circumposition right",
        "ART": "definite or indefinite article",
        "CARD": "cardinal number",
        "FM": "foreign language material",
        "ITJ": "interjection",
        "KOKOM": "comparative conjunction",
        "KON": "coordinate conjunction",
        "KOUI": 'subordinate conjunction with "zu" and infinitive',
        "KOUS": "subordinate conjunction with sentence",
        "NE": "proper noun",
        "NNE": "proper noun",
        "PAV": "pronominal adverb",
        "PROAV": "pronominal adverb",
        "PDAT": "attributive demonstrative pronoun",
        "PDS": "substituting demonstrative pronoun",
        "PIAT": "attributive indefinite pronoun without determiner",
        "PIDAT": "attributive indefinite pronoun with determiner",
        "PIS": "substituting indefinite pronoun",
        "PPER": "non-reflexive personal pronoun",
        "PPOSAT": "attributive possessive pronoun",
        "PPOSS": "substituting possessive pronoun",
        "PRELAT": "attributive relative pronoun",
        "PRELS": "substituting relative pronoun",
        "PRF": "reflexive personal pronoun",
        "PTKA": "particle with adjective or adverb",
        "PTKANT": "answer particle",
        "PTKNEG": "negative particle",
        "PTKVZ": "separable verbal particle",
        "PTKZU": '"zu" before infinitive',
        "PWAT": "attributive interrogative pronoun",
        "PWAV": "adverbial interrogative or relative pronoun",
        "PWS": "substituting interrogative pronoun",
        "TRUNC": "word remnant",
        "VAFIN": "finite verb, auxiliary",
        "VAIMP": "imperative, auxiliary",
        "VAINF": "infinitive, auxiliary",
        "VAPP": "perfect participle, auxiliary",
        "VMFIN": "finite verb, modal",
        "VMINF": "infinitive, modal",
        "VMPP": "perfect participle, modal",
        "VVFIN": "finite verb, full",
        "VVIMP": "imperative, full",
        "VVINF": "infinitive, full",
        "VVIZU": 'infinitive with "zu", full',
        "VVPP": "perfect participle, full",
        "XY": "non-word containing non-letter",
    }
    pos_glossary = dict(universal_tags)
    pos_glossary.update(german_tags)
    return pos_glossary