-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcorpus.py
79 lines (59 loc) · 1.67 KB
/
corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import glob
from common import DependencyGraph, Node
from nltk.tree import Tree
from treeutil import filterLexical
def idg(filename):
    """Iterate over the dependency graphs stored in *filename*.

    The file is read line by line.  Every non-blank line is turned into a
    node with ``Node.byline`` and added to the current graph; a blank line
    closes the current graph.

    Empty graphs are never yielded: runs of consecutive blank lines (or a
    leading blank line) are treated as a single separator, matching the
    ``length() > 0`` guard already applied at end of file.

    filename -- path of the file to read.
    Yields one ``DependencyGraph`` per non-empty group of lines.
    """
    dg = DependencyGraph()
    with open(filename) as fp:
        for line in fp:
            trimmed = line.strip()
            if trimmed:
                dg.addNode(Node.byline(trimmed))
            elif dg.length() > 0:
                # Separator reached: emit the finished graph and start a
                # fresh one.  Skipped when nothing was accumulated so that
                # repeated blank lines do not produce empty graphs.
                yield dg
                dg = DependencyGraph()
    # The last graph may not be followed by a blank line.
    if dg.length() > 0:
        yield dg
def itree_stream(stream, removeLeaves=False):
    """Iterate over the bracketed trees found in an open text stream.

    The whole stream is read at once and newline characters are dropped.
    Characters are accumulated while tracking the parenthesis depth; each
    time the depth returns to zero the accumulated text is handed to
    ``Tree`` and the result is yielded.

    Whitespace found between top-level trees (spaces, tabs, ``'\\r'`` from
    CRLF files) is skipped, so it is never passed to ``Tree`` on its own.

    stream       -- file-like object exposing ``read()``.
    removeLeaves -- when true, ``filterLexical`` is called on each tree
                    before it is yielded (presumably stripping the lexical
                    leaves in place; its return value is ignored).
    Yields one ``Tree`` per balanced bracketed expression.
    """
    chars = []
    depth = 0
    for c in stream.read():
        if c == '\n':
            # Newlines are discarded everywhere, inside trees as well.
            continue
        if depth == 0 and c.isspace():
            # Padding between two trees: not part of any bracketing.
            continue
        if c == '(':
            depth += 1
        elif c == ')':
            depth -= 1
        chars.append(c)
        if depth == 0:
            t = Tree(''.join(chars))
            if removeLeaves:
                filterLexical(t)
            yield t
            chars = []
def itree(filename, removeLeaves=False):
    """Iterate over the trees stored in the file *filename*.

    Opens the file and delegates the parsing to ``itree_stream``; the file
    stays open until the generator is exhausted or closed.

    filename     -- path of the file to read.
    removeLeaves -- forwarded unchanged to ``itree_stream``.
    Yields the trees produced by ``itree_stream``.
    """
    with open(filename) as fp:
        for t in itree_stream(fp, removeLeaves):
            yield t
def idgcorpus(iwildcard):
    """Yield every graph produced by ``idg`` for each matching file.

    iwildcard -- glob pattern selecting the files to read; files are
                 visited in the order returned by ``glob.glob``.
    """
    paths = glob.glob(iwildcard)
    for path in paths:
        for graph in idg(path):
            yield graph
def itreecorpus(iwildcard, removeLeaves=False):
    """Yield every tree produced by ``itree`` for each matching file.

    iwildcard    -- glob pattern selecting the files to read; files are
                    visited in the order returned by ``glob.glob``.
    removeLeaves -- forwarded to ``itree``; defaults to False, which keeps
                    the behaviour of the one-argument call.
    """
    for source in glob.glob(iwildcard):
        for tree in itree(source, removeLeaves):
            yield tree
def isdgcorpus(iwildcard):
    """Check whether the files matching *iwildcard* are 10-column tabular.

    A file passes when each of its lines is either blank (after stripping
    surrounding whitespace) or splits into exactly 10 tab-separated
    fields.  At most the first five files returned by ``glob.glob`` are
    inspected.

    Returns False as soon as an offending line is found, True otherwise
    (including when the pattern matches no file).
    """
    for path in glob.glob(iwildcard)[:5]:
        with open(path) as handle:
            for raw in handle:
                stripped = raw.strip()
                if stripped and len(stripped.split('\t')) != 10:
                    return False
    return True