Merge pull request #44 from AmyOlex/lukedev

Lukedev
AmyOlex · Jun 16, 2018 · 9504ca6 · 9504ca6
2 parents cacf009 + 86f09fd
commit 9504ca6
Show file tree

Hide file tree

Showing 3 changed files with 139 additions and 25 deletions.
diff --git a/.gitignore b/.gitignore
@@ -67,3 +67,6 @@ kfold_files_new_official.csv
 official_train_MLmatrix_Win3_012618_class.csv
 official_train_MLmatrix_Win3_012618_data.csv
 runit.sh
+*.xml
+Chrono_SemEval2018_PostEvalSubmission_NB_NewswireModel_060818/Chrono_TempEval2018_PostEvalSubmission_NB_060818.zip
+*.tiff
diff --git a/Chrono/TimePhrase_to_Chrono.py b/Chrono/TimePhrase_to_Chrono.py
@@ -2495,38 +2495,37 @@ def hasYear(tpentity, flags):
         for text in text_list:
             # get start coordinate of this token in the full string so we can calculate the position of the temporal matches.
             text_start, text_end = getSpan(text_norm, text)
-
+
+            result = re.search('([0-9]{1,2})[-/:]([0-9]{1,2})[-/:]([0-9]{4})', text)
             #define regular expression to find a 4-digit year from the date format
-            if(re.search('([0-9]{1,2})[-/:]([0-9]{1,2})[-/:]([0-9]{4})',text)):
-                result = re.search('([0-9]{1,2})[-/:]([0-9]{1,2})[-/:]([0-9]{4})',text).group(0)
-                if  len(result.split("/")) == 3:
-                    start_idx, end_idx = getSpan(result,re.compile("/").split(result)[2])    
-                    return True, re.compile("/").split(result)[2], text_start+start_idx, text_start+end_idx, flags
-                elif len(result.split("-")) == 3:
-                    start_idx, end_idx = getSpan(result,re.compile("-").split(result)[2])    
-                    return True, re.compile("-").split(result)[2], text_start+start_idx, text_start+end_idx, flags
+            if result :
+                result = result.group(0)
+                split_result = re.split("/-:", result)
+                if len(split_result) == 3:
+                    start_idx, end_idx = getSpan(result,split_result[2])
+                    return True, split_result[2], text_start+start_idx, text_start+end_idx, flags
                 else:
                    return False, None, None, None, flags
             ## look for year at start of date
             ## added by Amy Olex
-            elif(re.search('([0-9]{4})[-/:]([0-9]{1,2})[-/:]([0-9]{1,2})',text)):
-                result = re.search('([0-9]{4})[-/:]([0-9]{1,2})[-/:]([0-9]{1,2})',text).group(0)
-                if  len(result.split("/")) == 3:
-                    start_idx, end_idx = getSpan(result,re.compile("/").split(result)[0])    
-                    return True, re.compile("/").split(result)[0], text_start+start_idx, text_start+end_idx, flags
-                elif len(result.split("-")) == 3:
-                    start_idx, end_idx = getSpan(result,re.compile("-").split(result)[0])    
-                    return True, re.compile("-").split(result)[0], text_start+start_idx, text_start+end_idx, flags
-                else:
-                   return False, None, None, None, flags
+            elif len(text) > 7:
+                result = re.search('([0-9]{4})[-/:]([0-9]{1,2})[-/:]([0-9]{1,2})',text)
+                if result :
+                    result = result.group(0)
+                    split_result = re.split("/-:", result)
+                    if len(split_result) == 3:
+                        start_idx, end_idx = getSpan(result, split_result[0])
+                        return True, split_result[0], text_start + start_idx, text_start + end_idx, flags
+                    else:
+                        return False, None, None, None, flags
             ## special case to look for c.yyyy
-            elif len(text)==6 is not None:
-                r = re.search("c\.([0-9]{4})", text)
-                if r is not None:
-                    rval = utils.getNumberFromText(r.group(1))
-                    if rval is not None:
+            elif len(text) == 6 :
+                result = re.search("c\.([0-9]{4})", text)
+                if result :
+                    rval = utils.getNumberFromText(result.group(1))
+                    if rval :
                         if rval >=1500 and rval<=2050:
-                            start_idx, end_idx = r.span(1)
+                            start_idx, end_idx = result.span(1)
                             return True, rval, start_idx, end_idx, flags
 
         return False, None, None, None, flags #if no 4 digit year expressions were found return false            

diff --git a/ExtractParsedContext.py b/ExtractParsedContext.py
@@ -0,0 +1,112 @@
+# Copyright (c) 2018 
+# Amy L. Olex, Virginia Commonwealth University
+# alolex at vcu.edu
+#
+# Luke Maffey, Virginia Commonwealth University
+# maffeyl at vcu.edu
+#
+# Nicholas Morton,  Virginia Commonwealth University 
+# nmorton at vcu.edu
+#
+# Bridget T. McInnes, Virginia Commonwealth University
+# btmcinnes at vcu.edu
+#
+# This file is part of Chrono
+#
+# Chrono is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 3
+# of the License, or (at your option) any later version.
+#
+# Chrono is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Chrono; if not, write to 
+#
+# The Free Software Foundation, Inc., 
+# 59 Temple Place - Suite 330, 
+# Boston, MA  02111-1307, USA.
+
+## This program extracts the specified context around each SCATE Entity of a certain type.  
+## Example Usage:  python ExtractParsedContext.py -x data/SemEval-OfficialTrain -i results/newswire.list -t data/SemEval-OfficialTrain -o results/output1-Gold-MinuteOfHour.txt -e Minute-Of-Hour -f gold -c 25
+
+import argparse
+from xml.dom import minidom
+import os.path
+
+if __name__ == "__main__":  # everything below, including the helper defs, runs only when executed as a script
+
+    ## Parse input arguments
+    parser = argparse.ArgumentParser(description='Extract entity context information from text using the AnaforaXML Annotation files.')
+    parser.add_argument('-x', metavar='xmldir', type=str, help='path to the input xml directory that holds the annotated files.', required=True)  # XML paths are built below as <xmldir>/<doc>/<doc><suffix>
+    parser.add_argument('-i', metavar='filelist', type=str, help='File with list of documents to parse.', required=True)  # read whole and split on "\n", one document name per line
+    parser.add_argument('-t', metavar='textfiledir', type=str, help='Path to directory holding the raw text files.', required=True)  # text paths are built below as <textfiledir>/<doc>/<doc>
+    parser.add_argument('-o', metavar='outputfile', type=str, help='Name of the output file to save results to.', required=True)  # opened with mode 'w', so an existing file is overwritten
+    parser.add_argument('-e', metavar='entity', type=str, help='The name of the entity we want to extract.', required=True)  # compared for exact equality with each entity's <type> text
+    parser.add_argument('-f', metavar='flag', type=str, help='gold or chrono', required=True)  # only "gold" is tested for; any other value selects the plain .completed.xml file name
+    parser.add_argument('-c', metavar='context', type=str, help='The number of characters before and after for context.', required=False, default=20)  # declared as str with an int default; converted with int() where it is used
+
+    args = parser.parse_args()
+    ## Now we can access each argument as args.x, args.i, args.t, args.o, args.e, args.f, args.c
+
+    def getTargetSpans(xmlfile, entity):  # return [id, type, start, end] for every <entity> in xmlfile whose <type> text equals entity
+        xmldoc = minidom.parse(xmlfile)
+        itemlist = xmldoc.getElementsByTagName('entity')
+        entitylist = []
+        for item in itemlist:
+            eid = item.getElementsByTagName('id')[0].firstChild.data  # text of the first <id> element under this entity; assumes the tag exists and is non-empty
+            espan = item.getElementsByTagName('span')[0].firstChild.data  # span text, split below as "start,end"
+            etype = item.getElementsByTagName('type')[0].firstChild.data
+            if etype == entity:
+                start, end = espan.split(",")  # unpacking needs exactly one comma; NOTE(review): confirm spans are never multi-part
+                entitylist.append([eid, etype, int(start), int(end)])
+        return(entitylist)  # list of [id, type, start, end] lists, in itemlist order; empty if nothing matched
+
+
+    def writeTargetSpans(infile, entitylist, context, outfile):  # for each entity, write a header line and the surrounding text window from infile to outfile
+        linestring = open(infile, 'r').read()  # whole file as one string; NOTE(review): the file handle is never explicitly closed
+
+        for entity in entitylist:
+            start = max(0,int(entity[2])-context)  # window start, clamped to the beginning of the text
+            end = min(len(linestring), int(entity[3])+context)  # window end, clamped to the end of the text
+            outfile.write("\n\nID: " + entity[0] + ", Type: " + entity[1] + ", Span: (" + str(entity[2]) + "," + str(entity[3]) + "), Value: " + linestring[entity[2]:entity[3]] + "\n")  # header: id, type, span, and the exact text the span covers
+            outfile.write(linestring[start:end])  # the span plus up to `context` characters on each side
+
+
+
+
+    ## Loop over each file in the file list and parse it
+    out = open(args.o, 'w')
+    inputfiles = open(args.i, 'r').read().split("\n")  # NOTE(review): input handle is never closed, and a trailing newline yields an empty document name
+    for f in inputfiles:
+
+        ## Open the XML file and parse it
+        if args.f == "gold":
+            path = args.x + "/" + f + "/" + f + ".TimeNorm.gold.completed.xml"
+        else:  # any flag value other than "gold" lands here
+            path = args.x + "/" + f + "/" + f + ".completed.xml"
+
+        if(os.path.isfile(path)):
+            myElist = getTargetSpans(path, args.e)
+
+            ## Pass this information to extract the text segments and write to file
+            path2 = args.t + "/" + f + "/" + f  # raw text file is named after the document, with no extension
+            if(os.path.isfile(path2)):
+                out.write("\n\n*****\nFile: " + f)
+                writeTargetSpans(path2, myElist, int(args.c), out)
+        else:
+            out.write("\n\n*****\nSkipping File: " + f)  # only a missing XML file is reported; a missing text file (path2) is skipped silently
+    out.close()  # not reached if anything above raises (no with / try-finally)
+    print("Completed!")
+
+
+
+
+
+
+
+
+