-
Notifications
You must be signed in to change notification settings - Fork 1
/
text_dispersion.py
143 lines (126 loc) · 5.31 KB
/
text_dispersion.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# NOTE: the reference corpus must live in the same directory ("data/") as the target corpus.
import glob
import re
import math
import sys
import gzip
from collections import Counter
from common import load_key_data,text_count,word_count,total_wc
import statistics
from statistics import mean
from analyze import print_textv2
from signal import signal, SIGPIPE, SIG_DFL
signal(SIGPIPE,SIG_DFL)
# Command-line arguments: target register and reference register
# (e.g. NA or NA_ne); the script compares the two corpora.
target_reg = sys.argv[1]
ref_reg = sys.argv[2]

# Keyness scores per word: word -> (text-dispersion G2, word-frequency G2).
keyness = {}
counts_T = {}
counts_R = {}

# Per-word tallies: the "text" dicts track how many texts a word appears in,
# the "word" dicts track how many times a word appears in a corpus.
text_count_T = {}
text_count_R = {}
word_count_T = {}
word_count_R = {}

# Corpus-level totals: number of texts and number of running words.
total_T = total_R = 0
words_total_T = words_total_R = 0
#print(glob.glob("data/*"))
# Read every sub-corpus file under data/ (run from the corpus directory),
# keeping only the files named on the command line, and accumulate per-word
# text-dispersion and raw-frequency counts for target vs reference.
for file in glob.glob("data/*"): # read all files from path -> run from corpus directory
    word_list_T = []  # running words of this file when it belongs to the target register
    word_list_R = []  # running words of this file otherwise
    print("in folder", file)
    print("argvs", sys.argv[1:])
    # Only process the corpora listed as arguments (target + reference).
    if file in sys.argv[1:]:
        print("match", file)
        print("Reading a conllu file named", file, flush=True)
        f = print_textv2(file, "FORM", 1000000)
        f = f.split("\n")
    else:
        continue
    # Count the texts in the target and reference corpora (one text per line).
    for line in f:
        line = line.strip()
        if sys.argv[1] == file:
            total_T += 1
        else:
            total_R += 1
        line = line.lower()
        # Strip everything except word chars, whitespace, apostrophes, hyphens.
        # NOTE(review): the '|' inside the character class is a literal pipe,
        # so '|' characters survive the cleanup — confirm this is intended.
        line = re.sub('[^\w|\s|\'|\-]', '', line)
        words = line.split()
        # If the file comes from the target register(s) add to the target word
        # count, otherwise to the reference corpus count.
        # NOTE(review): selection above uses exact equality ('==' / 'in'), but
        # here a regex PREFIX match decides target vs reference — e.g. target
        # "data/NA" would also claim "data/NA_ne". Kept as-is; confirm intent.
        if re.match(sys.argv[1], file):
            word_list_T.extend(words)
            words_total_T += len(words)
        else:
            word_list_R.extend(words)
            words_total_R += len(words)
    # For every word type in this file: bump its text count by one and add its
    # in-file frequency to the corpus-wide word-frequency count.
    # Fix: build the Counter ONCE per file instead of re-counting the whole
    # word list for every unique word (the original Counter(word_list_T)[x]
    # inside the loop was accidentally O(n^2)). Same resulting dict contents.
    file_freq_T = Counter(word_list_T)
    for x in file_freq_T:
        text_count_T[x] = text_count_T.get(x, 0) + 1
        word_count_T[x] = word_count_T.get(x, 0) + file_freq_T[x]
    file_freq_R = Counter(word_list_R)
    for x in file_freq_R:
        text_count_R[x] = text_count_R.get(x, 0) + 1
        word_count_R[x] = word_count_R.get(x, 0) + file_freq_R[x]
    word_list_T = []
    word_list_R = []
#calculate log-likelihood formula for each word in target dictionary
# For every word seen in the target corpus, compute two signed log-likelihood
# (G2) keyness scores against the reference corpus:
#   G2  - based on text-dispersion counts (number of texts containing the word)
#   cG2 - based on raw word-frequency counts
# A negative score marks a word relatively MORE frequent in the reference.
for i in text_count_T:
    freq_T = text_count_T[i]   # texts in target containing word i
    freq_R = 0                 # texts in reference containing word i
    cfreq_T = word_count_T[i]  # occurrences of i in the target corpus
    cfreq_R = 0                # occurrences of i in the reference corpus
    # If the current word is in the reference corpus then use the two-term formula
    if i in text_count_R:
        freq_R = text_count_R[i]
        cfreq_R = word_count_R[i]
        # Expected text counts under the null hypothesis of equal dispersion.
        E_T = total_T * ((freq_T + freq_R) / (total_T + total_R))
        E_R = total_R * ((freq_T + freq_R) / (total_T + total_R))
        G2 = 2 * ((freq_T * math.log(freq_T / E_T)) + (freq_R * math.log(freq_R / E_R)))
        if (freq_R / E_R) > (freq_T / E_T):
            G2 = (G2 * -1)
        # Expected raw-frequency counts.
        cE_T = words_total_T * ((cfreq_T + cfreq_R) / (words_total_T + words_total_R))
        cE_R = words_total_R * ((cfreq_T + cfreq_R) / (words_total_T + words_total_R))
        # Fix: the factor 2 must cover BOTH log terms, matching the G2 line
        # above — the original multiplied only the first term by 2.
        cG2 = 2 * ((cfreq_T * math.log(cfreq_T / cE_T)) + (cfreq_R * math.log(cfreq_R / cE_R)))
        if (cfreq_R / cE_R) > (cfreq_T / cE_T):
            cG2 = (cG2 * -1)
        keyness[i] = (G2, cG2)
    # If the current word is not in the reference corpus the reference terms
    # vanish (freq_R == cfreq_R == 0), leaving the single target term.
    else:
        E_T = total_T * (freq_T + freq_R) / (total_T + total_R)
        E_R = total_R * (freq_T + freq_R) / (total_T + total_R)
        G2 = 2 * (freq_T * math.log(freq_T / E_T))
        cE_T = words_total_T * (cfreq_T + cfreq_R) / (words_total_T + words_total_R)
        cE_R = words_total_R * (cfreq_T + cfreq_R) / (words_total_T + words_total_R)
        cG2 = 2 * (cfreq_T * math.log(cfreq_T / cE_T))
        keyness[i] = (G2, cG2)
# Report corpus sizes (texts and running words) for both corpora.
print("Total_T:", total_T)
print("Total_R:", total_R)
print("words_total_t:", words_total_T) # number of words in a corpus
print("words_total_r:", words_total_R)

# Accumulators for keyword listings (populated later / reserved).
count = 0
target_100 = []
ref_100 = []
text_target_100 = []
text_ref_100 = []
word_fr_t = 0
word_fr_r = 0

print()
print("Text dispersion keywords for", sys.argv[1])
dists = {}
# Emit every word with its text-dispersion G2 score, strongest keyness first.
# Sorting items by their (G2, cG2) value tuple matches sorting the keys with
# keyness.get, and the stable sort keeps the same tie order.
for word, scores in sorted(keyness.items(), key=lambda kv: kv[1], reverse=True):
    print(word, scores[0])