-
Notifications
You must be signed in to change notification settings - Fork 2
/
generate_text_from_clusters.py
52 lines (46 loc) · 1.63 KB
/
generate_text_from_clusters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import gzip, json, os, tqdm, sys, argparse, random, pickle
from operator import itemgetter
'''
Generates random texts from clustered word frequencies obtained from clusters with > 20 hits
'''
def generate_texts(cluster_location, min_frequency):
    """Build [most_frequent_word, observed_word] pair lists for every cluster.

    Every entry in ``cluster_location`` is read as a gzip-compressed JSON file
    holding a mapping ``cluster_id -> [word_dict, ...]``, where each
    ``word_dict`` maps a word to its integer frequency.

    A ``word_dict`` is used only when its most frequent word occurs more than
    ``min_frequency`` times and is longer than one character.  Each word in a
    used dict then contributes ``freq`` copies of the pair
    ``[most_frequent_word, word]``.  The count of the most frequent word
    itself is adjusted before it is emitted:

    * if other words exist and it outnumbers all of them combined, it is
      capped at their combined count;
    * if it is the only word and occurs more than 10 times, it is emitted
      exactly 20 times.

    The total number of pairs emitted for most-frequent words ("Correct") and
    for all other words ("Wrong") is printed to stdout once all files are
    processed.

    Args:
        cluster_location: Directory containing the gzipped JSON cluster files.
        min_frequency: Count that the most frequent word of a ``word_dict``
            must strictly exceed for that dict to be used.

    Returns:
        A list of ``[cluster_id, pairs]`` entries, one per cluster per file,
        where ``pairs`` is the list of ``[most_frequent_word, word]`` pairs
        generated for that cluster (possibly empty).
    """
    data = []
    corr = 0
    miss = 0
    for filename in tqdm.tqdm(os.listdir(cluster_location)):
        path = os.path.join(cluster_location, filename)
        # Close each file as soon as it is parsed instead of leaking one
        # handle per cluster file; JSON text is UTF-8, so say so explicitly
        # rather than relying on the locale default.
        with gzip.open(path, "rt", encoding="utf-8") as cluster_file:
            cluster_data = json.load(cluster_file)

        for cluster_id, cluster_word_freqs in cluster_data.items():
            words = []
            for word_dict in cluster_word_freqs:
                if not word_dict:
                    # max() raises ValueError on an empty mapping; an empty
                    # frequency table has nothing to contribute anyway.
                    continue
                max_k, max_v = max(word_dict.items(), key=itemgetter(1))
                if max_v <= min_frequency or len(max_k) <= 1:
                    continue
                total = sum(word_dict.values())
                for word, freq in word_dict.items():
                    if word == max_k:
                        if len(word_dict) > 1 and freq > total - freq:
                            # Cap the dominant word at the combined count of
                            # all other words.
                            freq = total - freq
                        elif len(word_dict) == 1 and freq > 10:
                            # A lone word seen more than 10 times is emitted
                            # a fixed 20 times.
                            freq = 20
                        corr += freq
                    else:
                        miss += freq
                    words.extend([max_k, word] for _ in range(freq))
            # NOTE: pairs are kept in generation order; no shuffling is done.
            data.append([cluster_id, words])
    print("Correct: {}\nWrong: {}".format(corr, miss))
    return data
if __name__ == "__main__":
    # Command-line entry point: generate the word pairs for every cluster file
    # in --input and either write them as gzipped JSON to --output or print
    # them to stdout.
    #
    # The text is passed as ``description``; the first positional parameter of
    # ArgumentParser is ``prog``, which would put the whole sentence in the
    # usage line as the program name.
    parser = argparse.ArgumentParser(
        description="Generate 'texts' from clustered word frequencies"
    )
    parser.add_argument(
        "--input",
        help="Location of folder with the cluster files",
        required=True,
    )
    parser.add_argument(
        "--min_freq",
        default=10,
        type=int,
        help="Count the most frequent word must exceed to be used (default: 10)",
    )
    parser.add_argument(
        "--output",
        help="Path of the gzipped JSON file to write; prints to stdout if omitted",
    )
    args = parser.parse_args()

    data = generate_texts(args.input, args.min_freq)
    if args.output:
        with gzip.open(args.output, "wt") as pf:
            pf.write(json.dumps(data))
    else:
        print(data)