From cf5fb559df38c6b842213ca78db7a95154e21bef Mon Sep 17 00:00:00 2001 From: danieljamieson Date: Fri, 24 Nov 2017 17:14:51 +0000 Subject: [PATCH 1/2] Adding a parameter to StanfordCoreNLP class to pass a string containing the version of the library. Enables the use of versions higher than 3.4.1 --- corenlp.py | 68 +++++++++++++++++++++++++++--------------------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/corenlp.py b/corenlp.py index 753e51c..154e7d8 100644 --- a/corenlp.py +++ b/corenlp.py @@ -3,17 +3,17 @@ # corenlp - Python interface to Stanford Core NLP tools # Copyright (c) 2014 Dustin Smith # https://github.com/dasmith/stanford-corenlp-python -# +# # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. -# +# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. -# +# # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. @@ -74,30 +74,30 @@ def parse_parser_results(text): state = STATE_START for line in text.encode('utf-8').split("\n"): line = line.strip() - + if line.startswith("Sentence #"): sentence = {'words':[], 'parsetree':[], 'dependencies':[]} results["sentences"].append(sentence) state = STATE_TEXT - + elif state == STATE_TEXT: sentence['text'] = line state = STATE_WORDS - + elif state == STATE_WORDS: if not line.startswith("[Text="): raise Exception('Parse error. 
Could not find "[Text=" in: %s' % line) for s in WORD_PATTERN.findall(line): sentence['words'].append(parse_bracketed(s)) state = STATE_TREE - + elif state == STATE_TREE: if len(line) == 0: state = STATE_DEPENDENCY sentence['parsetree'] = " ".join(sentence['parsetree']) else: sentence['parsetree'].append(line) - + elif state == STATE_DEPENDENCY: if len(line) == 0: state = STATE_COREFERENCE @@ -106,7 +106,7 @@ def parse_parser_results(text): if len(split_entry) == 3: rel, left, right = map(lambda x: remove_id(x), split_entry) sentence['dependencies'].append(tuple([rel,left,right])) - + elif state == STATE_COREFERENCE: if "Coreference set" in line: if 'coref' not in results: @@ -118,7 +118,7 @@ def parse_parser_results(text): src_i, src_pos, src_l, src_r = int(src_i)-1, int(src_pos)-1, int(src_l)-1, int(src_r)-1 sink_i, sink_pos, sink_l, sink_r = int(sink_i)-1, int(sink_pos)-1, int(sink_l)-1, int(sink_r)-1 coref_set.append(((src_word, src_i, src_pos, src_l, src_r), (sink_word, sink_i, sink_pos, sink_l, sink_r))) - + return results @@ -127,41 +127,41 @@ class StanfordCoreNLP(object): Command-line interaction with Stanford's CoreNLP java utilities. Can be run as a JSON-RPC server or imported as a module. """ - def __init__(self, corenlp_path=None): + def __init__(self, corenlp_path=None, version_str="3.4.1"): """ Checks the location of the jar files. Spawns the server as a process. 
""" - jars = ["stanford-corenlp-3.4.1.jar", - "stanford-corenlp-3.4.1-models.jar", + jars = ["stanford-corenlp-%s.jar" % version_str, + "stanford-corenlp-%s-models.jar" % version_str, "joda-time.jar", "xom.jar", "jollyday.jar"] - + # if CoreNLP libraries are in a different directory, # change the corenlp_path variable to point to them if not corenlp_path: corenlp_path = "./stanford-corenlp-full-2014-08-27/" - + java_path = "java" classname = "edu.stanford.nlp.pipeline.StanfordCoreNLP" # include the properties file, so you can change defaults # but any changes in output format will break parse_parser_results() - props = "-props default.properties" - + props = "-props default.properties" + # add and check classpaths jars = [corenlp_path + jar for jar in jars] for jar in jars: if not os.path.exists(jar): logger.error("Error! Cannot locate %s" % jar) sys.exit(1) - + # spawn the server start_corenlp = "%s -Xmx1800m -cp %s %s %s" % (java_path, ':'.join(jars), classname, props) - if VERBOSE: + if VERBOSE: logger.debug(start_corenlp) self.corenlp = pexpect.spawn(start_corenlp) - + # show progress bar while loading the models widgets = ['Loading Models: ', Fraction()] pbar = ProgressBar(widgets=widgets, maxval=5, force_update=True).start() @@ -177,11 +177,11 @@ def __init__(self, corenlp_path=None): pbar.update(5) self.corenlp.expect("Entering interactive shell.") pbar.finish() - + def _parse(self, text): """ This is the core interaction with the parser. - + It returns a Python data-structure, while the parse() function returns a JSON object """ @@ -191,11 +191,11 @@ def _parse(self, text): self.corenlp.read_nonblocking (4000, 0.3) except pexpect.TIMEOUT: break - + self.corenlp.sendline(text) - + # How much time should we give the parser to parse it? - # the idea here is that you increase the timeout as a + # the idea here is that you increase the timeout as a # function of the text's length. 
# anything longer than 5 seconds requires that you also # increase timeout=5 in jsonrpc.py @@ -207,7 +207,7 @@ def _parse(self, text): # Time left, read more data try: incoming += self.corenlp.read_nonblocking(2000, 1) - if "\nNLP>" in incoming: + if "\nNLP>" in incoming: break time.sleep(0.0001) except pexpect.TIMEOUT: @@ -218,20 +218,20 @@ def _parse(self, text): continue except pexpect.EOF: break - - if VERBOSE: + + if VERBOSE: logger.debug("%s\n%s" % ('='*40, incoming)) try: results = parse_parser_results(incoming) except Exception, e: - if VERBOSE: + if VERBOSE: logger.debug(traceback.format_exc()) raise e - + return results - + def parse(self, text): - """ + """ This function takes a text string, sends it to the Stanford parser, reads in the result, parses the results and returns a list with one dictionary entry for each parsed sentence, in JSON format. @@ -253,9 +253,9 @@ def parse(self, text): options, args = parser.parse_args() server = jsonrpc.Server(jsonrpc.JsonRpc20(), jsonrpc.TransportTcpIp(addr=(options.host, int(options.port)))) - + nlp = StanfordCoreNLP() server.register_function(nlp.parse) - + logger.info('Serving on http://%s:%s' % (options.host, options.port)) server.serve() From af6cddb8bc67a3908e28ecdf984ca6e0e1cca148 Mon Sep 17 00:00:00 2001 From: danieljamieson Date: Sat, 25 Nov 2017 15:57:54 +0000 Subject: [PATCH 2/2] Adding absolute path for properties file so that StanfordCoreNLP can be run and imported from files outside of the root folder. 
Added command line arguments for corenlp path & version string --- corenlp.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/corenlp.py b/corenlp.py index 154e7d8..90834ab 100644 --- a/corenlp.py +++ b/corenlp.py @@ -147,7 +147,7 @@ def __init__(self, corenlp_path=None, version_str="3.4.1"): classname = "edu.stanford.nlp.pipeline.StanfordCoreNLP" # include the properties file, so you can change defaults # but any changes in output format will break parse_parser_results() - props = "-props default.properties" + props = "-props %s/default.properties" % os.path.dirname(os.path.abspath(__file__)) # add and check classpaths jars = [corenlp_path + jar for jar in jars] @@ -250,12 +250,16 @@ def parse(self, text): help='Port to serve on (default: 8080)') parser.add_option('-H', '--host', default='127.0.0.1', help='Host to serve on (default: 127.0.0.1. Use 0.0.0.0 to make public)') + parser.add_option('-c', '--corenlp', default='./stanford-corenlp-full-2014-08-27/', + help='Path to directory containing Core NLP binaries (default: 3.4.1 folder in root)') + parser.add_option('-v', '--version', default='3.4.1', + help='Version of CoreNLP to run (default: 3.4.1)') options, args = parser.parse_args() server = jsonrpc.Server(jsonrpc.JsonRpc20(), jsonrpc.TransportTcpIp(addr=(options.host, int(options.port)))) - nlp = StanfordCoreNLP() + nlp = StanfordCoreNLP(options.corenlp, options.version) server.register_function(nlp.parse) - logger.info('Serving on http://%s:%s' % (options.host, options.port)) + logger.info('Serving version %s on http://%s:%s' % (options.version, options.host, options.port)) server.serve()