-
Notifications
You must be signed in to change notification settings - Fork 0
/
find_mention_heads.py
48 lines (47 loc) · 1.89 KB
/
find_mention_heads.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import spacy
import json
import sys
import util
import os
def _collect_cluster_names(head_indices, people, words):
    """Gather the names whose spans cover any mention head of one cluster.

    Args:
        head_indices: token index of the root token of each mention in the
            cluster.
        people: entries of ``example['people']``; each is indexed as
            ``[start, end]`` token offsets with an inclusive end.
        words: the document's tokens as a flat list of strings.

    Returns:
        A ``(longest_name, names, name_indices)`` tuple:
        ``longest_name`` is the matched name with the most characters
        (``""`` when nothing matched; the first one seen wins ties),
        ``names`` is the set of distinct matched names, and
        ``name_indices`` lists the matching ``people`` entries in encounter
        order, duplicates included.
    """
    names = set()
    name_indices = []
    longest_name = ""
    for head in head_indices:
        for person in people:
            start, end = person[0], person[1]
            if start <= head <= end:
                name = ' '.join(words[start:end + 1])
                names.add(name)
                name_indices.append(person)
                if len(name) > len(longest_name):
                    longest_name = name
    return longest_name, names, name_indices


def main(input_path, output_path):
    """Annotate every JSON-lines example with per-entity name information.

    Each line of ``input_path`` is parsed as one JSON object. For every
    cluster in ``example['clusters']`` the root token of each mention is
    looked up, and the ``example['people']`` spans containing that token
    supply candidate names. The cluster is keyed by its longest name, and
    four dictionaries are added to the example before it is written as one
    JSON line to ``output_path``:

    * ``entities``: name -> all names matched for that cluster
    * ``entity_indices``: name -> the matching ``people`` spans
    * ``entity_cluster``: name -> index of the cluster in ``clusters``
    * ``clusters_dict``: name -> the cluster's mentions

    Args:
        input_path: path of the JSON-lines file to read.
        output_path: path of the JSON-lines file to (over)write.
    """
    # NOTE(review): the 'en' shortcut and the manual pipeline loop below
    # look like the spaCy 2 API -- confirm before upgrading spaCy.
    nlp = spacy.load('en')

    # Parse the whole input first: the output file is only created or
    # truncated once every input line has been decoded successfully.
    with open(input_path) as f:
        examples = [json.loads(line) for line in f]

    with open(output_path, 'w+') as f_out:
        for example in examples:
            # presumably flattens the per-sentence token lists into one
            # document-level token list -- verify against util.flatten
            words = util.flatten(example['sentences'])

            # Build the Doc from the given tokens, so spaCy does not
            # re-tokenize, then run each pipeline component over it.
            doc = spacy.tokens.Doc(vocab=nlp.vocab, words=words)
            for _, proc in nlp.pipeline:
                doc = proc(doc)

            new_clusters = {}
            entities = {}
            entity_indices = {}
            entity_cluster = {}
            for j, clust in enumerate(example['clusters']):
                # Mentions are [start, end] with an inclusive end; the root
                # token is computed once per mention, not once per person.
                heads = [doc[m[0]:m[1] + 1].root.i for m in clust]
                if not heads:
                    # A cluster without mentions cannot match any name.
                    continue
                longest_name, names, name_indices = _collect_cluster_names(
                    heads, example['people'], words)

                # The first cluster to claim a name keeps it; later clusters
                # resolving to the same longest name are dropped.
                if longest_name and longest_name not in new_clusters:
                    new_clusters[longest_name] = clust
                    # Sorted so the output does not depend on set ordering,
                    # which varies between runs under hash randomization.
                    entities[longest_name] = sorted(names)
                    entity_indices[longest_name] = list(name_indices)
                    entity_cluster[longest_name] = j

            example['entities'] = entities
            example['entity_indices'] = entity_indices
            example['entity_cluster'] = entity_cluster
            example['clusters_dict'] = new_clusters
            f_out.write(json.dumps(example) + '\n')
            print(example['doc_key'])


if __name__ == '__main__':
    main(sys.argv[1], sys.argv[2])