-
Notifications
You must be signed in to change notification settings - Fork 1
/
text_dispersion.py
143 lines (126 loc) · 5.31 KB
/
text_dispersion.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# NOTE: the reference corpus must live in the same directory ("data/") as the target corpus.
import glob
import re
import math
import sys
import gzip
from collections import Counter
from common import load_key_data,text_count,word_count,total_wc
import statistics
from statistics import mean
from analyze import print_textv2
from signal import signal, SIGPIPE, SIG_DFL
signal(SIGPIPE,SIG_DFL)
# Command-line arguments: target register and reference register
# (e.g. NA or NA_ne); the script compares the two corpora.
target_reg = sys.argv[1]
ref_reg = sys.argv[2]

# Keyness scores per word: word -> (text-dispersion G2, word-frequency G2).
keyness = {}
counts_T = {}
counts_R = {}

# Per-word tallies: the "text" dicts track how many texts a word appears in,
# the "word" dicts track how many times a word appears in a corpus.
text_count_T = {}
text_count_R = {}
word_count_T = {}
word_count_R = {}

# Corpus-level totals: number of texts and number of running words.
total_T = total_R = 0
words_total_T = words_total_R = 0
#print(glob.glob("data/*"))
# Read every sub-corpus file under data/ (run from the corpus directory),
# keeping only the files named on the command line, and accumulate per-word
# text-dispersion and raw-frequency counts for target vs reference.
for file in glob.glob("data/*"): # read all files from path -> run from corpus directory
    word_list_T = []  # running words of this file when it belongs to the target register
    word_list_R = []  # running words of this file otherwise
    print("in folder", file)
    print("argvs", sys.argv[1:])
    # Only process the corpora listed as arguments (target + reference).
    if file in sys.argv[1:]:
        print("match", file)
        print("Reading a conllu file named", file, flush=True)
        f = print_textv2(file, "FORM", 1000000)
        f = f.split("\n")
    else:
        continue
    # Count the texts in the target and reference corpora (one text per line).
    for line in f:
        line = line.strip()
        if sys.argv[1] == file:
            total_T += 1
        else:
            total_R += 1
        line = line.lower()
        # Strip everything except word chars, whitespace, apostrophes, hyphens.
        # NOTE(review): the '|' inside the character class is a literal pipe,
        # so '|' characters survive the cleanup — confirm this is intended.
        line = re.sub('[^\w|\s|\'|\-]', '', line)
        words = line.split()
        # If the file comes from the target register(s) add to the target word
        # count, otherwise to the reference corpus count.
        # NOTE(review): selection above uses exact equality ('==' / 'in'), but
        # here a regex PREFIX match decides target vs reference — e.g. target
        # "data/NA" would also claim "data/NA_ne". Kept as-is; confirm intent.
        if re.match(sys.argv[1], file):
            word_list_T.extend(words)
            words_total_T += len(words)
        else:
            word_list_R.extend(words)
            words_total_R += len(words)
    # For every word type in this file: bump its text count by one and add its
    # in-file frequency to the corpus-wide word-frequency count.
    # Fix: build the Counter ONCE per file instead of re-counting the whole
    # word list for every unique word (the original Counter(word_list_T)[x]
    # inside the loop was accidentally O(n^2)). Same resulting dict contents.
    file_freq_T = Counter(word_list_T)
    for x in file_freq_T:
        text_count_T[x] = text_count_T.get(x, 0) + 1
        word_count_T[x] = word_count_T.get(x, 0) + file_freq_T[x]
    file_freq_R = Counter(word_list_R)
    for x in file_freq_R:
        text_count_R[x] = text_count_R.get(x, 0) + 1
        word_count_R[x] = word_count_R.get(x, 0) + file_freq_R[x]
    word_list_T = []
    word_list_R = []
#calculate log-likelihood formula for each word in target dictionary
# For every word seen in the target corpus, compute two signed log-likelihood
# (G2) keyness scores against the reference corpus:
#   G2  - based on text-dispersion counts (number of texts containing the word)
#   cG2 - based on raw word-frequency counts
# A negative score marks a word relatively MORE frequent in the reference.
for i in text_count_T:
    freq_T = text_count_T[i]   # texts in target containing word i
    freq_R = 0                 # texts in reference containing word i
    cfreq_T = word_count_T[i]  # occurrences of i in the target corpus
    cfreq_R = 0                # occurrences of i in the reference corpus
    # If the current word is in the reference corpus then use the two-term formula
    if i in text_count_R:
        freq_R = text_count_R[i]
        cfreq_R = word_count_R[i]
        # Expected text counts under the null hypothesis of equal dispersion.
        E_T = total_T * ((freq_T + freq_R) / (total_T + total_R))
        E_R = total_R * ((freq_T + freq_R) / (total_T + total_R))
        G2 = 2 * ((freq_T * math.log(freq_T / E_T)) + (freq_R * math.log(freq_R / E_R)))
        if (freq_R / E_R) > (freq_T / E_T):
            G2 = (G2 * -1)
        # Expected raw-frequency counts.
        cE_T = words_total_T * ((cfreq_T + cfreq_R) / (words_total_T + words_total_R))
        cE_R = words_total_R * ((cfreq_T + cfreq_R) / (words_total_T + words_total_R))
        # Fix: the factor 2 must cover BOTH log terms, matching the G2 line
        # above — the original multiplied only the first term by 2.
        cG2 = 2 * ((cfreq_T * math.log(cfreq_T / cE_T)) + (cfreq_R * math.log(cfreq_R / cE_R)))
        if (cfreq_R / cE_R) > (cfreq_T / cE_T):
            cG2 = (cG2 * -1)
        keyness[i] = (G2, cG2)
    # If the current word is not in the reference corpus the reference terms
    # vanish (freq_R == cfreq_R == 0), leaving the single target term.
    else:
        E_T = total_T * (freq_T + freq_R) / (total_T + total_R)
        E_R = total_R * (freq_T + freq_R) / (total_T + total_R)
        G2 = 2 * (freq_T * math.log(freq_T / E_T))
        cE_T = words_total_T * (cfreq_T + cfreq_R) / (words_total_T + words_total_R)
        cE_R = words_total_R * (cfreq_T + cfreq_R) / (words_total_T + words_total_R)
        cG2 = 2 * (cfreq_T * math.log(cfreq_T / cE_T))
        keyness[i] = (G2, cG2)
# Report corpus sizes (texts and running words) for both corpora.
print("Total_T:", total_T)
print("Total_R:", total_R)
print("words_total_t:", words_total_T) # number of words in a corpus
print("words_total_r:", words_total_R)

# Accumulators for keyword listings (populated later / reserved).
count = 0
target_100 = []
ref_100 = []
text_target_100 = []
text_ref_100 = []
word_fr_t = 0
word_fr_r = 0

print()
print("Text dispersion keywords for", sys.argv[1])
dists = {}
# Emit every word with its text-dispersion G2 score, strongest keyness first.
# Sorting items by their (G2, cG2) value tuple matches sorting the keys with
# keyness.get, and the stable sort keeps the same tie order.
for word, scores in sorted(keyness.items(), key=lambda kv: kv[1], reverse=True):
    print(word, scores[0])