Skip to content

Commit

Permalink
Merge pull request #44 from AmyOlex/lukedev
Browse files Browse the repository at this point in the history
Lukedev
  • Loading branch information
maffeyl authored Jun 16, 2018
2 parents cacf009 + 86f09fd commit 9504ca6
Show file tree
Hide file tree
Showing 3 changed files with 139 additions and 25 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -67,3 +67,6 @@ kfold_files_new_official.csv
official_train_MLmatrix_Win3_012618_class.csv
official_train_MLmatrix_Win3_012618_data.csv
runit.sh
*.xml
Chrono_SemEval2018_PostEvalSubmission_NB_NewswireModel_060818/Chrono_TempEval2018_PostEvalSubmission_NB_060818.zip
*.tiff
49 changes: 24 additions & 25 deletions Chrono/TimePhrase_to_Chrono.py
Original file line number Diff line number Diff line change
Expand Up @@ -2495,38 +2495,37 @@ def hasYear(tpentity, flags):
for text in text_list:
# get start coordinate of this token in the full string so we can calculate the position of the temporal matches.
text_start, text_end = getSpan(text_norm, text)


result = re.search('([0-9]{1,2})[-/:]([0-9]{1,2})[-/:]([0-9]{4})', text)
#define regular expression to find a 4-digit year from the date format
if(re.search('([0-9]{1,2})[-/:]([0-9]{1,2})[-/:]([0-9]{4})',text)):
result = re.search('([0-9]{1,2})[-/:]([0-9]{1,2})[-/:]([0-9]{4})',text).group(0)
if len(result.split("/")) == 3:
start_idx, end_idx = getSpan(result,re.compile("/").split(result)[2])
return True, re.compile("/").split(result)[2], text_start+start_idx, text_start+end_idx, flags
elif len(result.split("-")) == 3:
start_idx, end_idx = getSpan(result,re.compile("-").split(result)[2])
return True, re.compile("-").split(result)[2], text_start+start_idx, text_start+end_idx, flags
if result :
result = result.group(0)
split_result = re.split("/-:", result)
if len(split_result) == 3:
start_idx, end_idx = getSpan(result,split_result[2])
return True, split_result[2], text_start+start_idx, text_start+end_idx, flags
else:
return False, None, None, None, flags
## look for year at start of date
## added by Amy Olex
elif(re.search('([0-9]{4})[-/:]([0-9]{1,2})[-/:]([0-9]{1,2})',text)):
result = re.search('([0-9]{4})[-/:]([0-9]{1,2})[-/:]([0-9]{1,2})',text).group(0)
if len(result.split("/")) == 3:
start_idx, end_idx = getSpan(result,re.compile("/").split(result)[0])
return True, re.compile("/").split(result)[0], text_start+start_idx, text_start+end_idx, flags
elif len(result.split("-")) == 3:
start_idx, end_idx = getSpan(result,re.compile("-").split(result)[0])
return True, re.compile("-").split(result)[0], text_start+start_idx, text_start+end_idx, flags
else:
return False, None, None, None, flags
elif len(text) > 7:
result = re.search('([0-9]{4})[-/:]([0-9]{1,2})[-/:]([0-9]{1,2})',text)
if result :
result = result.group(0)
split_result = re.split("/-:", result)
if len(split_result) == 3:
start_idx, end_idx = getSpan(result, split_result[0])
return True, split_result[0], text_start + start_idx, text_start + end_idx, flags
else:
return False, None, None, None, flags
## special case to look for c.yyyy
elif len(text)==6 is not None:
r = re.search("c\.([0-9]{4})", text)
if r is not None:
rval = utils.getNumberFromText(r.group(1))
if rval is not None:
elif len(text) == 6 :
result = re.search("c\.([0-9]{4})", text)
if result :
rval = utils.getNumberFromText(result.group(1))
if rval :
if rval >=1500 and rval<=2050:
start_idx, end_idx = r.span(1)
start_idx, end_idx = result.span(1)
return True, rval, start_idx, end_idx, flags

return False, None, None, None, flags #if no 4 digit year expressions were found return false
Expand Down
112 changes: 112 additions & 0 deletions ExtractParsedContext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
# Copyright (c) 2018
# Amy L. Olex, Virginia Commonwealth University
# alolex at vcu.edu
#
# Luke Maffey, Virginia Commonwealth University
# maffeyl at vcu.edu
#
# Nicholas Morton, Virginia Commonwealth University
# nmorton at vcu.edu
#
# Bridget T. McInnes, Virginia Commonwealth University
# btmcinnes at vcu.edu
#
# This file is part of Chrono
#
# Chrono is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 3
# of the License, or (at your option) any later version.
#
# Chrono is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Chrono; if not, write to
#
# The Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.

## This program extracts the specified context around each SCATE Entity of a certain type.
## Example Usage: python ExtractParsedContext.py -x data/SemEval-OfficialTrain -i results/newswire.list -t data/SemEval-OfficialTrain -o results/output1-Gold-MinuteOfHour.txt -e Minute-Of-Hour -f gold -c 25

import argparse
from xml.dom import minidom
import os.path

def build_arg_parser():
    """Build the command-line parser for the context-extraction script.

    Options:
        -x  directory holding the annotated Anafora XML files (required)
        -i  file listing the documents to process, one name per line (required)
        -t  directory holding the raw text files (required)
        -o  output file the extracted contexts are written to (required)
        -e  name of the entity type to extract (required)
        -f  annotation source flag, "gold" or "chrono" (required)
        -c  number of context characters before and after each entity
            (optional, defaults to 20)

    Returns:
        The configured argparse.ArgumentParser.
    """
    parser = argparse.ArgumentParser(description='Extract entity context information from text using the AnaforaXML Annotation files.')
    parser.add_argument('-x', metavar='xmldir', type=str, help='path to the input xml directory that holds the annotated files.', required=True)
    parser.add_argument('-i', metavar='filelist', type=str, help='File with list of documents to parse.', required=True)
    parser.add_argument('-t', metavar='textfiledir', type=str, help='Path to directory holding the raw text files.', required=True)
    parser.add_argument('-o', metavar='outputfile', type=str, help='Name of the output file to save results to.', required=True)
    parser.add_argument('-e', metavar='entity', type=str, help='The name of the entity we want to extract.', required=True)
    parser.add_argument('-f', metavar='flag', type=str, help='gold or chrono', required=True)
    # Parsed as int so the value has the same type as the default and a
    # non-numeric window size is rejected by argparse with a usage message.
    parser.add_argument('-c', metavar='context', type=int, help='The number of characters before and after for context.', required=False, default=20)
    return parser


if __name__ == "__main__":

    ## Parse input arguments
    args = build_arg_parser().parse_args()
    ## Now we can access each argument as args.x, args.i, args.t, args.o, args.e, args.f, args.c

def getTargetSpans(xmlfile, entity):
    """Collect the spans of every annotated entity of one type.

    Args:
        xmlfile: Path (or open file object) of the annotation XML to parse
            with xml.dom.minidom.
        entity: The entity type name to keep; compared against the text of
            each <entity>'s first <type> element.

    Returns:
        A list of [id, type, start, end] lists, one per matching <entity>,
        in document order. start and end are ints.

    Raises:
        ValueError: if a span piece is not of the form "start,end".
    """
    xmldoc = minidom.parse(xmlfile)
    entitylist = []
    for item in xmldoc.getElementsByTagName('entity'):
        etype = item.getElementsByTagName('type')[0].firstChild.data
        if etype != entity:
            continue
        eid = item.getElementsByTagName('id')[0].firstChild.data
        espan = item.getElementsByTagName('span')[0].firstChild.data

        # A span is normally "start,end", but discontinuous annotations are
        # written as several such pairs joined by ";". Record the overall
        # extent so the context window covers every piece.
        starts = []
        ends = []
        for piece in espan.split(";"):
            lo, hi = piece.split(",")
            starts.append(int(lo))
            ends.append(int(hi))
        entitylist.append([eid, etype, min(starts), max(ends)])
    return entitylist


def writeTargetSpans(infile, entitylist, context, outfile):
    """Write each entity's text and its surrounding context to outfile.

    Args:
        infile: Path of the raw text file the spans index into.
        entitylist: Iterable of [id, type, start, end] entries.
        context: Number of characters to include before and after the span;
            the window is clipped to the bounds of the text.
        outfile: Writable text stream that receives the records.

    For every entity a header line with the id, type, span and covered text
    is written, followed by the context window.
    """
    # Read the whole document once and release the handle immediately.
    with open(infile, 'r') as textfile:
        linestring = textfile.read()

    for entity in entitylist:
        span_start = int(entity[2])
        span_end = int(entity[3])
        window_start = max(0, span_start - context)
        window_end = min(len(linestring), span_end + context)
        outfile.write("\n\nID: " + entity[0] + ", Type: " + entity[1] + ", Span: (" + str(entity[2]) + "," + str(entity[3]) + "), Value: " + linestring[span_start:span_end] + "\n")
        outfile.write(linestring[window_start:window_end])




## Loop over each file in the file list and parse it
if __name__ == "__main__":
    # Read the document names up front. Blank lines (for example the one left
    # by a trailing newline) are dropped so they do not show up as bogus
    # "Skipping File" records.
    with open(args.i, 'r') as listfile:
        inputfiles = [name.strip() for name in listfile if name.strip()]

    # The context manager guarantees the output is flushed and closed even if
    # parsing one of the documents raises.
    with open(args.o, 'w') as out:
        for f in inputfiles:

            ## Locate the annotation XML for this document
            if args.f == "gold":
                path = args.x + "/" + f + "/" + f + ".TimeNorm.gold.completed.xml"
            else:
                path = args.x + "/" + f + "/" + f + ".completed.xml"

            ## Locate the raw text the spans index into
            path2 = args.t + "/" + f + "/" + f

            # Both the annotations and the raw text are required; a document
            # missing either one is reported as skipped rather than ignored.
            if os.path.isfile(path) and os.path.isfile(path2):
                myElist = getTargetSpans(path, args.e)

                ## Pass this information to extract the text segments and write to file
                out.write("\n\n*****\nFile: " + f)
                writeTargetSpans(path2, myElist, int(args.c), out)
            else:
                out.write("\n\n*****\nSkipping File: " + f)

    print("Completed!")









0 comments on commit 9504ca6

Please sign in to comment.