diff --git a/README.md b/README.md
index 96b068f..cfd1051 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# Python interface to Stanford Core NLP tools v3.4.1
+# Python interface to Stanford Core NLP tools v3.5.2
 
 This is a Python wrapper for Stanford University's NLP group's Java-based [CoreNLP tools](http://nlp.stanford.edu/software/corenlp.shtml). It can either be imported as a module or run as a JSON-RPC server. Because it uses many large trained models (requiring 3GB RAM on 64-bit machines and usually a few minutes loading time), most applications will probably want to run it as a server.
 
@@ -10,7 +10,7 @@ This is a Python wrapper for Stanford University's NLP group's Java-based [CoreN
 
 It depends on [pexpect](http://www.noah.org/wiki/pexpect) and includes and uses code from [jsonrpc](http://www.simple-is-better.org/rpc/) and [python-progressbar](http://code.google.com/p/python-progressbar/).
 
-It runs the Stanford CoreNLP jar in a separate process, communicates with the java process using its command-line interface, and makes assumptions about the output of the parser in order to parse it into a Python dict object and transfer it using JSON. The parser will break if the output changes significantly, but it has been tested on **Core NLP tools version 3.4.1** released 2014-08-27.
+It runs the Stanford CoreNLP jar in a separate process, communicates with the java process using its command-line interface, and makes assumptions about the output of the parser in order to parse it into a Python dict object and transfer it using JSON. The parser will break if the output changes significantly, but it has been tested on **Core NLP tools version 3.5.2**, released 2015-04-20.
 
 ## Download and Usage
 
@@ -19,8 +19,8 @@ To use this program you must [download](http://nlp.stanford.edu/software/corenlp
     sudo pip install pexpect unidecode
     git clone git://github.com/dasmith/stanford-corenlp-python.git
     cd stanford-corenlp-python
-    wget http://nlp.stanford.edu/software/stanford-corenlp-full-2014-08-27.zip
-    unzip stanford-corenlp-full-2014-08-27.zip
+    wget http://nlp.stanford.edu/software/stanford-corenlp-full-2015-04-20.zip
+    unzip stanford-corenlp-full-2015-04-20.zip
 
 Then launch the server:
@@ -110,7 +110,7 @@ To use it in a regular script (useful for debugging), load the module instead:
 
     corenlp = StanfordCoreNLP() # wait a few minutes...
     corenlp.parse("Parse this sentence.")
 
-The server, `StanfordCoreNLP()`, takes an optional argument `corenlp_path` which specifies the path to the jar files. The default value is `StanfordCoreNLP(corenlp_path="./stanford-corenlp-full-2014-08-27/")`.
+The server, `StanfordCoreNLP()`, takes an optional argument `corenlp_path`, which specifies the path to the jar files. The default value is `StanfordCoreNLP(corenlp_path="./stanford-corenlp-full-2015-04-20/")`.
 
 ## Coreference Resolution
 
@@ -139,7 +139,7 @@ tar xvfz WNprolog-3.0.tar.gz
 
 **Stanford CoreNLP tools require a large amount of free memory**. Java 5+ uses about 50% more RAM on 64-bit machines than 32-bit machines. 32-bit machine users can lower the memory requirements by changing `-Xmx3g` to `-Xmx2g` or even less.
 If pexpect times out while loading models, check to make sure you have enough memory and can run the server alone without your kernel killing the java process:
 
-    java -cp stanford-corenlp-2014-08-27.jar:stanford-corenlp-3.4.1-models.jar:xom.jar:joda-time.jar -Xmx3g edu.stanford.nlp.pipeline.StanfordCoreNLP -props default.properties
+    java -cp stanford-corenlp-3.5.2.jar:stanford-corenlp-3.5.2-models.jar:xom.jar:joda-time.jar:jollyday.jar -Xmx3g edu.stanford.nlp.pipeline.StanfordCoreNLP -props default.properties
 
 You can reach me, Dustin Smith, by sending a message on GitHub or through email (contact information is available [on my webpage](http://web.media.mit.edu/~dustin)).
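As a quick sanity check of the JSON-RPC server described in the README changes above, a minimal client might look like the following sketch. It assumes the `jsonrpc` module bundled with this repository and the README's default address of 127.0.0.1:8080; `simplejson` can be swapped for the standard-library `json`.

    import jsonrpc
    from simplejson import loads

    # Connect to the wrapper's JSON-RPC server (default host/port assumed).
    server = jsonrpc.ServerProxy(jsonrpc.JsonRpc20(),
                                 jsonrpc.TransportTcpIp(addr=("127.0.0.1", 8080)))

    # parse() returns a JSON string; decode it into a Python dict.
    result = loads(server.parse("Hello world. It is so beautiful."))
    print "Result:", result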
diff --git a/corenlp.py b/corenlp.py
index 3eefa94..a497637 100644
--- a/corenlp.py
+++ b/corenlp.py
@@ -74,30 +74,30 @@ def parse_parser_results(text):
     state = STATE_START
     for line in text.split("\n"):
         line = line.strip()
-        
+        if line.startswith("(ROOT"):
+            state = STATE_TREE
         if line.startswith("Sentence #"):
-            sentence = {'words':[], 'parsetree':[], 'dependencies':[]}
+            sentence = {'words': [], 'parsetree': [], 'dependencies': []}
             results["sentences"].append(sentence)
             state = STATE_TEXT
-        
+
         elif state == STATE_TEXT:
             sentence['text'] = line
             state = STATE_WORDS
-        
+
         elif state == STATE_WORDS:
             if not line.startswith("[Text="):
                 raise Exception('Parse error. Could not find "[Text=" in: %s' % line)
             for s in WORD_PATTERN.findall(line):
                 sentence['words'].append(parse_bracketed(s))
-            state = STATE_TREE
-        
+
         elif state == STATE_TREE:
             if len(line) == 0:
                 state = STATE_DEPENDENCY
                 sentence['parsetree'] = " ".join(sentence['parsetree'])
             else:
                 sentence['parsetree'].append(line)
-        
+
         elif state == STATE_DEPENDENCY:
             if len(line) == 0:
                 state = STATE_COREFERENCE
@@ -105,8 +105,8 @@ def parse_parser_results(text):
                 split_entry = re.split("\(|, ", line[:-1])
                 if len(split_entry) == 3:
                     rel, left, right = map(lambda x: remove_id(x), split_entry)
-                    sentence['dependencies'].append(tuple([rel,left,right]))
-        
+                    sentence['dependencies'].append(tuple([rel, left, right]))
+
         elif state == STATE_COREFERENCE:
             if "Coreference set" in line:
                 if 'coref' not in results:
@@ -118,7 +118,7 @@ def parse_parser_results(text):
                     src_i, src_pos, src_l, src_r = int(src_i)-1, int(src_pos)-1, int(src_l)-1, int(src_r)-1
                     sink_i, sink_pos, sink_l, sink_r = int(sink_i)-1, int(sink_pos)-1, int(sink_l)-1, int(sink_r)-1
                     coref_set.append(((src_word, src_i, src_pos, src_l, src_r), (sink_word, sink_i, sink_pos, sink_l, sink_r)))
-    
+
     return results
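For context on the state machine in `parse_parser_results()`: it consumes the CoreNLP interactive shell's plain-text output, which per sentence is a `Sentence #` header, the raw text, a line of bracketed `[Text=...]` tokens, a parse tree beginning with `(ROOT`, and one `relation(governor, dependent)` line per dependency. The sketch below is illustrative only; the exact token attributes depend on the annotators configured in `default.properties`.

    Sentence #1 (4 tokens):
    Parse this sentence.
    [Text=Parse CharacterOffsetBegin=0 CharacterOffsetEnd=5 PartOfSpeech=VB Lemma=parse NamedEntityTag=O] [Text=this ...] [Text=sentence ...] [Text=. ...]
    (ROOT
      (S
        (VP (VB Parse)
          (NP (DT this) (NN sentence)))
        (. .)))

    root(ROOT-0, Parse-1)
    det(sentence-3, this-2)
    dobj(Parse-1, sentence-3)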
Cannot locate %s" % jar) sys.exit(1) - + # spawn the server start_corenlp = "%s -Xmx1800m -cp %s %s %s" % (java_path, ':'.join(jars), classname, props) - if VERBOSE: + if VERBOSE: logger.debug(start_corenlp) self.corenlp = pexpect.spawn(start_corenlp) - + # show progress bar while loading the models widgets = ['Loading Models: ', Fraction()] pbar = ProgressBar(widgets=widgets, maxval=5, force_update=True).start() - self.corenlp.expect("done.", timeout=20) # Load pos tagger model (~5sec) + self.corenlp.expect("done.", timeout=20) # Load pos tagger model (~5sec) pbar.update(1) - self.corenlp.expect("done.", timeout=200) # Load NER-all classifier (~33sec) + self.corenlp.expect("done.", timeout=200) # Load NER-all classifier (~33sec) pbar.update(2) - self.corenlp.expect("done.", timeout=600) # Load NER-muc classifier (~60sec) + self.corenlp.expect("done.", timeout=600) # Load NER-muc classifier (~60sec) pbar.update(3) - self.corenlp.expect("done.", timeout=600) # Load CoNLL classifier (~50sec) + self.corenlp.expect("done.", timeout=600) # Load CoNLL classifier (~50sec) pbar.update(4) - self.corenlp.expect("done.", timeout=200) # Loading PCFG (~3sec) + self.corenlp.expect("done.", timeout=200) # Loading PCFG (~3sec) pbar.update(5) self.corenlp.expect("Entering interactive shell.") pbar.finish() - + def _parse(self, text): """ This is the core interaction with the parser. - + It returns a Python data-structure, while the parse() function returns a JSON object """ # clean up anything leftover while True: try: - self.corenlp.read_nonblocking (4000, 0.3) + self.corenlp.read_nonblocking(4000, 0.3) except pexpect.TIMEOUT: break - + self.corenlp.sendline(text) - + # How much time should we give the parser to parse it? - # the idea here is that you increase the timeout as a + # the idea here is that you increase the timeout as a # function of the text's length. # anything longer than 5 seconds requires that you also # increase timeout=5 in jsonrpc.py @@ -207,7 +207,7 @@ def _parse(self, text): # Time left, read more data try: incoming += self.corenlp.read_nonblocking(2000, 1) - if "\nNLP>" in incoming: + if "\nNLP>" in incoming: break time.sleep(0.0001) except pexpect.TIMEOUT: @@ -218,20 +218,20 @@ def _parse(self, text): continue except pexpect.EOF: break - - if VERBOSE: + + if VERBOSE: logger.debug("%s\n%s" % ('='*40, incoming)) try: results = parse_parser_results(incoming) except Exception, e: - if VERBOSE: + if VERBOSE: logger.debug(traceback.format_exc()) raise e - + return results - + def parse(self, text): - """ + """ This function takes a text string, sends it to the Stanford parser, reads in the result, parses the results and returns a list with one dictionary entry for each parsed sentence, in JSON format. @@ -253,9 +253,9 @@ def parse(self, text): options, args = parser.parse_args() server = jsonrpc.Server(jsonrpc.JsonRpc20(), jsonrpc.TransportTcpIp(addr=(options.host, int(options.port)))) - + nlp = StanfordCoreNLP() server.register_function(nlp.parse) - + logger.info('Serving on http://%s:%s' % (options.host, options.port)) server.serve()
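Putting the pieces together, here is a rough sketch of using the wrapper as a module and of the structure `parse()` returns. The keys (`sentences`, `text`, `words`, `parsetree`, `dependencies`) come from `parse_parser_results()` above; the token attribute names and the example values in the comments are illustrative and vary with the CoreNLP version and the annotators in `default.properties`.

    import json
    from corenlp import StanfordCoreNLP

    corenlp = StanfordCoreNLP()  # spawns the java process; models take minutes to load
    result = json.loads(corenlp.parse("Parse this sentence."))

    sentence = result["sentences"][0]
    print sentence["text"]          # u"Parse this sentence."
    print sentence["parsetree"]     # u"(ROOT (S (VP (VB Parse) (NP ..." (one string)
    print sentence["dependencies"]  # e.g. [[u"root", u"ROOT", u"Parse"], ...]
    for word, attrs in sentence["words"]:
        print word, attrs.get("PartOfSpeech"), attrs.get("NamedEntityTag")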