Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update parsing scripts to work for stanford-corenlp-3.6.0 #45

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions client.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ class StanfordNLP:
def __init__(self):
self.server = ServerProxy(JsonRpc20(),
TransportTcpIp(addr=("127.0.0.1", 8080)))

def parse(self, text):
return json.loads(self.server.parse(text))

Expand All @@ -15,5 +15,5 @@ def parse(self, text):
pprint(result)

from nltk.tree import Tree
tree = Tree.parse(result['sentences'][0]['parsetree'])
tree = Tree.fromstring(result['sentences'][0]['parsetree'])
pprint(tree)
88 changes: 47 additions & 41 deletions corenlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,17 @@
# corenlp - Python interface to Stanford Core NLP tools
# Copyright (c) 2014 Dustin Smith
# https://github.com/dasmith/stanford-corenlp-python
#
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
Expand Down Expand Up @@ -74,30 +74,33 @@ def parse_parser_results(text):
state = STATE_START
for line in text.encode('utf-8').split("\n"):
line = line.strip()

if line.startswith("Sentence #"):
sentence = {'words':[], 'parsetree':[], 'dependencies':[]}
results["sentences"].append(sentence)
state = STATE_TEXT

elif state == STATE_TEXT:
sentence['text'] = line
state = STATE_WORDS

elif state == STATE_WORDS:
if not line.startswith("[Text="):
raise Exception('Parse error. Could not find "[Text=" in: %s' % line)
for s in WORD_PATTERN.findall(line):
sentence['words'].append(parse_bracketed(s))
state = STATE_TREE

if line.startswith("[Text="):
# now each line is a word
for s in WORD_PATTERN.findall(line):
sentence['words'].append(parse_bracketed(s))
else:
state = STATE_TREE
if len(line) != 0:
sentence['parsetree'].append(line)

elif state == STATE_TREE:
if len(line) == 0:
state = STATE_DEPENDENCY
sentence['parsetree'] = " ".join(sentence['parsetree'])
else:
sentence['parsetree'].append(line)

elif state == STATE_DEPENDENCY:
if len(line) == 0:
state = STATE_COREFERENCE
Expand All @@ -106,7 +109,7 @@ def parse_parser_results(text):
if len(split_entry) == 3:
rel, left, right = map(lambda x: remove_id(x), split_entry)
sentence['dependencies'].append(tuple([rel,left,right]))

elif state == STATE_COREFERENCE:
if "Coreference set" in line:
if 'coref' not in results:
Expand All @@ -118,7 +121,7 @@ def parse_parser_results(text):
src_i, src_pos, src_l, src_r = int(src_i)-1, int(src_pos)-1, int(src_l)-1, int(src_r)-1
sink_i, sink_pos, sink_l, sink_r = int(sink_i)-1, int(sink_pos)-1, int(sink_l)-1, int(sink_r)-1
coref_set.append(((src_word, src_i, src_pos, src_l, src_r), (sink_word, sink_i, sink_pos, sink_l, sink_r)))

return results


Expand All @@ -132,36 +135,39 @@ def __init__(self, corenlp_path=None):
Checks the location of the jar files.
Spawns the server as a process.
"""
jars = ["stanford-corenlp-3.4.1.jar",
"stanford-corenlp-3.4.1-models.jar",
jars = ["stanford-corenlp-3.6.0.jar",
"stanford-corenlp-3.6.0-models.jar",
"joda-time.jar",
"xom.jar",
"jollyday.jar"]

"jollyday.jar",
"ejml-0.23.jar",
"slf4j-api.jar",
"slf4j-simple.jar"]

# if CoreNLP libraries are in a different directory,
# change the corenlp_path variable to point to them
if not corenlp_path:
corenlp_path = "./stanford-corenlp-full-2014-08-27/"
corenlp_path = "./stanford-corenlp-full-2015-12-09/"

java_path = "java"
classname = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
# include the properties file, so you can change defaults
# but any changes in output format will break parse_parser_results()
props = "-props default.properties"
props = "-props default.properties"

# add and check classpaths
jars = [corenlp_path + jar for jar in jars]
for jar in jars:
if not os.path.exists(jar):
logger.error("Error! Cannot locate %s" % jar)
sys.exit(1)

# spawn the server
start_corenlp = "%s -Xmx1800m -cp %s %s %s" % (java_path, ':'.join(jars), classname, props)
if VERBOSE:
start_corenlp = "%s -Xmx3600m -cp %s %s %s" % (java_path, ':'.join(jars), classname, props)
if VERBOSE:
logger.debug(start_corenlp)
self.corenlp = pexpect.spawn(start_corenlp)

# show progress bar while loading the models
widgets = ['Loading Models: ', Fraction()]
pbar = ProgressBar(widgets=widgets, maxval=5, force_update=True).start()
Expand All @@ -177,11 +183,11 @@ def __init__(self, corenlp_path=None):
pbar.update(5)
self.corenlp.expect("Entering interactive shell.")
pbar.finish()

def _parse(self, text):
"""
This is the core interaction with the parser.

It returns a Python data-structure, while the parse()
function returns a JSON object
"""
Expand All @@ -191,11 +197,11 @@ def _parse(self, text):
self.corenlp.read_nonblocking (4000, 0.3)
except pexpect.TIMEOUT:
break

self.corenlp.sendline(text)

# How much time should we give the parser to parse it?
# the idea here is that you increase the timeout as a
# the idea here is that you increase the timeout as a
# function of the text's length.
# anything longer than 5 seconds requires that you also
# increase timeout=5 in jsonrpc.py
Expand All @@ -207,7 +213,7 @@ def _parse(self, text):
# Time left, read more data
try:
incoming += self.corenlp.read_nonblocking(2000, 1)
if "\nNLP>" in incoming:
if "\nNLP>" in incoming:
break
time.sleep(0.0001)
except pexpect.TIMEOUT:
Expand All @@ -218,20 +224,20 @@ def _parse(self, text):
continue
except pexpect.EOF:
break
if VERBOSE:

if VERBOSE:
logger.debug("%s\n%s" % ('='*40, incoming))
try:
results = parse_parser_results(incoming)
except Exception, e:
if VERBOSE:
if VERBOSE:
logger.debug(traceback.format_exc())
raise e

return results

def parse(self, text):
"""
"""
This function takes a text string, sends it to the Stanford parser,
reads in the result, parses the results and returns a list
with one dictionary entry for each parsed sentence, in JSON format.
Expand All @@ -253,9 +259,9 @@ def parse(self, text):
options, args = parser.parse_args()
server = jsonrpc.Server(jsonrpc.JsonRpc20(),
jsonrpc.TransportTcpIp(addr=(options.host, int(options.port))))

nlp = StanfordCoreNLP()
server.register_function(nlp.parse)

logger.info('Serving on http://%s:%s' % (options.host, options.port))
server.serve()