forked from szroland/nlu
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtext_parser.py
224 lines (194 loc) · 7.94 KB
/
text_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
import spacy
import time
from concept import Concept
from typing import Iterable
from relation import Relation
import logging
logger = logging.getLogger(__name__)
class TextToMentalase:
    """Interface for translators that turn raw text into ``Concept`` objects."""

    def parse(self, text: str) -> Iterable[Concept]:
        """Translate *text* into concepts.

        This base implementation does nothing and returns ``None``;
        subclasses in this file override it.
        """
        return None
# Cache of already-loaded spaCy pipelines, keyed by the name given to spacy.load().
__spacy_parsers = {}


def get_spacy_parser(language='en'):
    """Return the spaCy pipeline for *language*, loading it on first use.

    The loaded pipeline is stored in the module-level cache, so later calls
    with the same *language* return the cached object without reloading.

    :param language: name passed unchanged to ``spacy.load``
    :return: whatever ``spacy.load(language)`` returned
    """
    try:
        return __spacy_parsers[language]
    except KeyError:
        pass  # not loaded yet; fall through to the (slow) load below

    # Logged at WARNING level so the potentially long pause is visible
    # even with default logging configuration.
    logger.warning("Loading text parsers for language '%s' (this may take a while...)", language)
    start = time.time()
    result = spacy.load(language)
    end = time.time()
    logger.warning('Models loaded, took %r seconds', end - start)

    __spacy_parsers[language] = result
    return result
class SpacyTranslator(TextToMentalase):
    """Translate text into ``Concept`` structures by walking a spaCy dependency parse."""

    def __init__(self, language='en', nlp=None):
        """Create a translator.

        :param language: name handed to ``get_spacy_parser`` when no pipeline is supplied
        :param nlp: optional ready-made pipeline; when ``None`` one is loaded lazily
        """
        self.language = language
        self.__nlp = nlp

    @property
    def nlp(self):
        """The pipeline used for parsing; loaded on first access if none was given."""
        if self.__nlp is None:
            self.__nlp = get_spacy_parser(self.language)
        return self.__nlp

    def dep(self, root, type):
        """Return the first child of *root* whose ``dep_`` equals *type*, else ``None``.

        ``root`` may be ``None`` (the result is then ``None``), which lets
        callers chain lookups without checking intermediate results.
        """
        if root is None:
            return None
        return next((child for child in root.children if child.dep_ == type), None)

    def deps(self, root, type):
        """Yield every child of *root* whose ``dep_`` equals *type* (nothing if *root* is ``None``)."""
        if root is None:
            return
        for child in root.children:
            if child.dep_ == type:
                yield child

    def subj(self, sentence) -> Iterable[Concept]:
        """Yield one concept per subject of *sentence*.

        Starts at the ``nsubj`` child and follows the chain of ``conj``
        children. A subject that has a ``poss`` child is yielded as a
        ``Relation.Part`` concept of (possessor, subject); any other subject
        is yielded as a plain word concept.
        """
        s = self.dep(sentence, 'nsubj')
        while s is not None:
            p = self.dep(s, 'poss')
            if p is None:
                yield Concept.word(self.name(s))
            else:
                yield Concept(None, Relation.Part, [
                    Concept.word(self.name(p)),
                    Concept.word(self.name(s)),
                ])
            s = self.dep(s, 'conj')

    @staticmethod
    def name(noun):
        """Return the word used to represent token *noun* in a concept.

        Proper nouns keep their original spelling, WH-words (tags ``WP``,
        ``WP$`` and ``WRB``) become ``'?'``, everything else is lemmatized.
        """
        if noun.pos_ == 'PROPN':  # keep case for proper nouns
            return noun.orth_
        if noun.tag_ in ('WP', 'WP$', 'WRB'):  # replace WH-words with ?
            return '?'
        return noun.lemma_

    def _parse_copula(self, root) -> Iterable:
        """Yield the concepts for a clause whose root lemma is ``be``."""
        attr = self.dep(root, 'attr')
        if attr is not None and attr.pos_ in ('NOUN', 'PROPN'):
            rel = Relation.Class
            prep = self.dep(root, 'prep')
            if prep is not None and prep.lemma_ == 'like':
                rel = Relation.Feature
            attr_concept = Concept.word(self.name(attr))
            poss = self.dep(attr, "poss")
            if poss is not None:
                # A possessed attribute is wrapped in a Part concept and the
                # clause is then treated as an identity statement.
                attr_concept = Concept(None, Relation.Part, [
                    Concept.word(self.name(poss)),
                    attr_concept,
                ])
                rel = Relation.Identical
            for subj in self.subj(root):
                if subj.relation == Relation.Part:
                    # Possessed subject: identity, with the attribute listed first.
                    yield Concept(None, Relation.Identical, [attr_concept, subj])
                else:
                    yield Concept(None, rel, [subj, attr_concept])

        acomp = self.dep(root, 'acomp')
        if acomp is not None and acomp.pos_ == 'ADJ':
            for subj in self.subj(root):
                yield Concept(None, Relation.Feature, [
                    subj,
                    Concept.word(self.name(acomp)),
                ])

    def _verb_relation(self, root):
        """Return ``Relation.Action`` or ``Relation.Feature`` for the verb *root*."""
        aux = self.dep(root, 'aux')
        mark = self.dep(root, 'mark')
        if aux is not None and aux.lemma_ == 'be':  # continuous
            return Relation.Action
        if aux is not None and aux.lemma_ == 'do' and aux.tag_ == 'VBD':  # past tense with aux do
            return Relation.Action
        if root.tag_ == 'VBD':  # past tense
            return Relation.Action
        if mark is not None and mark.lemma_ == 'if':  # conditional
            return Relation.Action
        return Relation.Feature

    def _verb_features(self, root):
        """Build a fresh list of time/place modifier concepts attached to the verb *root*."""
        features = []
        npadvmod = self.dep(root, 'npadvmod')
        if npadvmod is not None:
            features.append(Concept(None, Relation.Time, [
                Concept.word(self.name(npadvmod)),
            ]))
        advmod = self.dep(root, 'advmod')
        if advmod is not None:
            if advmod.lemma_ == 'where':
                features.append(Concept(None, Relation.Relative, [Concept.word('?'), Concept.word('?')]))
            if advmod.lemma_ == 'when':
                features.append(Concept(None, Relation.Time, [Concept.word('?')]))
        for prep in self.deps(root, 'prep'):
            # Follow chains of nested prepositions (prep -> prep -> ...).
            while prep is not None:
                obj = self.dep(prep, 'pobj')
                obj_name = '?' if obj is None else self.name(obj)
                features.append(Concept(None, Relation.Relative, [
                    Concept.word(self.name(prep)),
                    Concept.word(obj_name),
                ]))
                prep = self.dep(prep, 'prep')
        return features

    def _parse_verb(self, root) -> Iterable:
        """Yield one concept per subject for a clause whose root is a verb."""
        rel = self._verb_relation(root)  # does not depend on the subject
        for subj in self.subj(root):
            concept = Concept(None, rel, [
                subj,
                Concept.word(self.name(root)),
            ])
            # Features are rebuilt for every subject so that the yielded
            # concepts do not share child Concept instances.
            features = self._verb_features(root)
            if features:
                concept = Concept(None, Relation.Feature, [concept] + features)
            yield concept

    def parse_simple(self, root) -> Iterable:
        """Yield the concepts expressed by the single clause rooted at *root*.

        Clauses whose root lemma is ``be`` and clauses whose root is a
        ``VERB`` are handled; any other root yields nothing.
        """
        if root.lemma_ == 'be':
            yield from self._parse_copula(root)
        elif root.pos_ == 'VERB':
            yield from self._parse_verb(root)

    def parse_sentence(self, root) -> Iterable:
        """Yield the concepts for the sentence rooted at *root*.

        When the sentence has an ``advcl`` child marked with ``if``, each
        concept of the main clause is wrapped in a ``Relation.Implication``
        whose condition is the first concept parsed from that clause.
        Otherwise the concepts of ``parse_simple(root)`` are yielded as-is.
        """
        advcl = self.dep(root, 'advcl')
        mark = self.dep(advcl, 'mark')
        if advcl is None or mark is None or mark.lemma_ != 'if':
            yield from self.parse_simple(root)
            return

        # next() without a default would raise StopIteration inside this
        # generator, which Python turns into a RuntimeError (PEP 479).
        condition = next(self.parse_simple(advcl), None)
        if condition is None:
            # The main clause must not be emitted as an unconditional fact,
            # so the whole sentence is skipped.
            logger.debug('Could not parse the condition of %r; sentence skipped', root)
            return
        for action in self.parse_simple(root):
            yield Concept(None, Relation.Implication, [
                condition, action,
            ])

    def dump_sentence(self, token, indent=" "):
        """Log the dependency subtree below *token* at DEBUG level, one token per line."""
        if not logger.isEnabledFor(logging.DEBUG):
            return  # skip the tree walk entirely when nothing would be logged
        logger.debug("%s%s (%s %s %s)", indent, token.lemma_, token.pos_, token.tag_, token.dep_)
        for child in token.children:
            self.dump_sentence(child, indent + " ")

    def parse(self, text: str):
        """Parse *text* and yield the concepts of every sentence, in document order."""
        doc = self.nlp(text)
        for sentence in doc.sents:
            logger.debug('Parsing: %r ROOT: %s (%s)', sentence, sentence.root, sentence.root.lemma_)
            self.dump_sentence(sentence.root)
            yield from self.parse_sentence(sentence.root)

    def explore(self, text):
        """Parse *text*, log sentences, parsed concepts and tokens, and return the doc.

        :return: the object produced by calling the pipeline on *text*
        """
        doc = self.nlp(text)
        logger.info('Doc %r', doc)
        for sentence in doc.sents:
            logger.debug(' - Sentence: %r ROOT: %s (%s)', sentence, sentence.root, sentence.root.lemma_)
            # parse_sentence is a generator: it must be iterated to do any work.
            for concept in self.parse_sentence(sentence.root):
                logger.debug(' - Concept: %r', concept)
        for token in doc:
            logger.debug(' - Token: %r: %s %s %s (shape=%s, entity=%s, lemma=%s, head=%s)',
                         token, token.pos_, token.tag_, token.dep_,
                         token.shape_, token.ent_type_, token.lemma_, token.head)
        return doc
if __name__ == '__main__':
    # Manual demo: log the parse of two small example texts at DEBUG level.
    logging.basicConfig(level=logging.DEBUG)
    parser = SpacyTranslator()
    parser.explore(
        "If Joe is tired, Joe becomes slow. "
        "If Joe is slow, Joe becomes angry. "
        "Joe is tired. "
        "Is Joe angry?"
    )
    # Every fragment but the last ends with a space so that the implicitly
    # concatenated literals do not fuse two sentences ("Ring.She").
    parser.explore(
        "Budapest, Tokyo and Vienna are cities. "
        "She arrived yesterday from Vienna to Budapest. "
        "She stayed at the Hilton on the Ring. "
        "She only had 2 bags and a backpack and $100 in her valet"
    )