-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathsplit_clauses.py
113 lines (95 loc) · 3.74 KB
/
split_clauses.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import codecs
import sys
import os
from collections import defaultdict
try:
import argparse
except ImportError:
import compat.argparse as argparse
# Absolute path of the directory that contains this script.
SCRIPTDIR = os.path.split(os.path.abspath(__file__))[0]

# Zero-based positions of the ten tab-separated columns of a token line.
(
    ID,
    FORM,
    LEMMA,
    CPOS,
    POS,
    FEAT,
    HEAD,
    DEPREL,
    DEPS,
    MISC,
) = range(10)
class Tree(object):
    """Dependency tree kept as four lookup tables keyed by token id.

    Attributes:
        gov: dependent id -> governor id.
        deps: governor id -> set of dependent ids; looking up an id that
            has no dependents yields (and stores) an empty set.
        dtypes: token id -> dependency type. Id 0 is preset to u"ROOTDEP".
        morpho: token id -> morphology value passed to add_dep().
    """

    def __init__(self):
        self.morpho = {}
        self.dtypes = {0: u"ROOTDEP"}
        self.deps = defaultdict(set)
        self.gov = {}

    def add_dep(self, g, d, t, morpho):
        """Register token ``d`` as a dependent of ``g``.

        Args:
            g: id of the governor.
            d: id of the dependent.
            t: dependency type of ``d``.
            morpho: morphology value stored for ``d``.
        """
        self.gov[d] = g
        self.dtypes[d] = t
        self.morpho[d] = morpho
        self.deps[g].add(d)
def read_conll(inp):
    """Read CoNLL-formatted input and yield one sentence at a time.

    Each sentence is a list of tokens and each token is the list of its
    tab-separated columns (unicode strings). Blank lines and lines starting
    with ``#`` are skipped; a sentence boundary is detected solely from a
    line whose first column is ``1``.

    Args:
        inp: If it is a string it is interpreted as a file name (decoded as
            UTF-8); any other value means the data is read from sys.stdin.

    Yields:
        list of list of unicode: the tokens of one sentence.
    """
    try:
        string_types = basestring  # Python 2: matches both str and unicode
    except NameError:
        string_types = str  # Python 3
    opened_here = isinstance(inp, string_types)
    if opened_here:
        # codecs.open() always appends "b" to the mode itself, so plain "r"
        # keeps the resulting mode string valid on Python 3 as well.
        f = codecs.open(inp, u"r", u"utf-8")
    else:
        # On Python 3 sys.stdin is already a text stream; decode its
        # underlying byte stream instead. Python 2 has no .buffer attribute.
        f = codecs.getreader("utf-8")(getattr(sys.stdin, "buffer", sys.stdin))
    try:
        sent = []
        for line in f:
            line = line.strip()
            # Do not rely on empty lines in conll files, ignore comments.
            if not line or line.startswith(u"#"):
                continue
            # New sentence, and there is an old one to hand out first.
            if line.startswith(u"1\t") and sent:
                yield sent
                sent = []
            sent.append(line.split(u"\t"))
        if sent:
            yield sent
    finally:
        # Runs even when the consumer abandons the generator early or an
        # error occurs, so a file opened here is never leaked.
        if opened_here:
            f.close()
# Dependency types that are treated as starting a new clause.
# (A previous variant of this list used u"rcmod" instead of u"acl:relcl".)
types = [u"ccomp", u"advcl", u"acl:relcl"]
def orig_dtype(tree, node):
    """Find the dependency type that a chain of ``conj`` governors hangs from.

    Starting at ``node``, climb from governor to governor for as long as the
    governor's own dependency type is u"conj".

    Args:
        tree: object exposing ``gov``, ``dtypes`` and ``morpho`` mappings.
        node: id of the token to start from.

    Returns:
        A pair ``(dtype, morpho)``: the dependency type of the first
        governor that is not a u"conj", and the morphology entry of the
        token directly below that governor.
    """
    current = node
    while True:
        governor = tree.gov[current]
        governor_type = tree.dtypes[governor]
        if governor_type != u"conj":
            return governor_type, tree.morpho[current]
        current = governor
def DFS(tree, node, idx, indexes, hier):
    """Assign a dotted clause index to ``node`` and, depth first, to its subtree.

    A token inherits the index of its governor unless it opens a clause:

    * If its dependency type is listed in ``types`` it starts a clause nested
      under ``idx``: the new index is ``idx`` plus u"." plus the next unused
      number for ``idx`` (numbering starts at 1).
    * If it is a u"conj" whose chain of conj governors hangs from a type in
      ``types`` or from u"root", it starts a sibling clause: the last
      component of ``idx`` is dropped and replaced by the next unused number
      for the remaining prefix (numbering starts at 2 when the prefix has no
      recorded numbers yet).

    Args:
        tree: Tree-like object with ``dtypes`` and ``deps`` (and whatever
            orig_dtype() reads).
        node: id of the token to process.
        idx: clause index inherited from the governor; must contain a u".".
        indexes: dict filled in place, token id -> clause index.
        hier: mapping filled in place, clause index -> set of numbers already
            handed out below it. Missing keys must be created on access
            (the caller in this file passes a defaultdict of sets).
    """
    dtype = tree.dtypes[node]
    new_idx = idx
    # NOTE(review): both branches used to start with a statement of the form
    # `if dtype is ccomp and Case=Gen/Case=Tra in morpho: pass`. It had no
    # effect (and could never be true in the conj branch), so it was dropped.
    if dtype in types:
        # This is a new clause, use idx + '.' + next number.
        suffix = max(hier[idx]) if idx in hier else 0
        new_idx = idx + u"." + (u"%d" % (suffix + 1))
        hier[idx].add(suffix + 1)
    elif dtype == u"conj":
        # Coordination of two subclauses or coordination from the sentence
        # root: split from the last '.' and add the next number.
        governor_type = orig_dtype(tree, node)[0]
        if governor_type in types or governor_type == u"root":
            # Unpacking (rather than indexing) keeps the ValueError for an
            # index without any '.'.
            parent_idx, _ = idx.rsplit(u".", 1)
            suffix = max(hier[parent_idx]) if parent_idx in hier else 1
            new_idx = parent_idx + u"." + (u"%d" % (suffix + 1))
            hier[parent_idx].add(suffix + 1)
    indexes[node] = new_idx
    # Visit dependents in ascending id order so numbering is deterministic.
    for dep in sorted(tree.deps[node]):
        DFS(tree, dep, new_idx, indexes, hier)
def split(args):
    """Write the input sentences to stdout with a clause index in the MISC column.

    Sentences are read with read_conll(args.input). For each one a Tree is
    built from the ID, HEAD and DEPREL columns, DFS() numbers its clauses
    starting from u"<sentence number>.1" (sentences are counted from 1), and
    every token line is written back, UTF-8 encoded and tab-separated, with
    its MISC column replaced by the clause index. An empty line follows each
    sentence.

    Args:
        args: object whose ``input`` attribute is passed to read_conll().
    """
    # Encoded bytes go to the byte stream: sys.stdout.buffer on Python 3,
    # sys.stdout itself on Python 2 (which has no .buffer attribute).
    out = getattr(sys.stdout, "buffer", sys.stdout)
    for count, sent in enumerate(read_conll(args.input), 1):
        tree = Tree()
        for token in sent:
            head = int(token[HEAD])
            if head == 0:
                tree.add_dep(0, int(token[ID]), u"root", u"_")
            else:
                morpho = u"|".join((token[CPOS], token[FEAT]))
                tree.add_dep(head, int(token[ID]), token[DEPREL], morpho)
        # Now the dictionaries are ready, start the search from the root.
        indexes = {}
        hier = defaultdict(set)  # fresh numbering state for every sentence
        DFS(tree, 0, u"%d.1" % count, indexes, hier)
        for token in sent:
            token[MISC] = u"%s" % indexes[int(token[ID])]
            out.write((u"\t".join(token) + u"\n").encode(u"utf-8"))
        out.write(b"\n")
if __name__ == u"__main__":
    # Command-line entry point: one optional positional argument naming the
    # input file; the parsed arguments are handed straight to split().
    parser = argparse.ArgumentParser(
        description='Split parsed sentences into clauses.'
    )
    io_group = parser.add_argument_group("Input/Output")
    io_group.add_argument(
        'input',
        nargs='?',
        help='Parser output file name, or nothing for reading on stdin'
    )
    split(parser.parse_args())