Skip to content

Commit

Permalink
Merge pull request #143 from NLPatVCU/development
Browse files Browse the repository at this point in the history
Development
  • Loading branch information
jvargas2 authored Jul 10, 2019
2 parents 0f0ced0 + 2b83d29 commit 52314c9
Show file tree
Hide file tree
Showing 30 changed files with 2,140 additions and 543 deletions.
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
# medaCy
*.ann
*.txt

# macOS
.DS_Store

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
1 change: 1 addition & 0 deletions docs/source/medacy.ner.model.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ medacy.ner.model package
.. toctree::

medacy.ner.model.model
medacy.ner.model.spacy_model
medacy.ner.model.stratified_k_fold
7 changes: 7 additions & 0 deletions docs/source/medacy.ner.model.spacy_model.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
medacy.ner.model.spacy_model module
===================================

.. automodule:: medacy.ner.model.spacy_model
:members:
:undoc-members:
:show-inheritance:
2 changes: 1 addition & 1 deletion medacy/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
__version__ = '0.1.0'
__version__ = '0.1.1'
__authors__ = "Andriy Mulyar, Corey Sutphin, Bobby Best, Steele Farnsworth, Bridget McInnes"
113 changes: 113 additions & 0 deletions medacy/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import argparse
import logging
from datetime import datetime
import time
import importlib

from medacy.data import Dataset
from medacy.ner import Model
from medacy.ner import SpacyModel

def setup(args):
    """Build the dataset and model selected on the command line.

    :param args: Parsed argparse namespace carrying ``dataset``,
        ``pipeline`` and ``word_embeddings``.
    :return: Tuple of (Dataset, model) handed to the sub-command function.
    """
    dataset = Dataset(args.dataset)

    if args.pipeline == 'spacy':
        # NOTE(review): this returns the SpacyModel class object itself,
        # while the other branch returns a Model *instance* — confirm
        # SpacyModel's fit/predict API supports being used unbound.
        return dataset, SpacyModel

    labels = list(dataset.get_labels())

    # Parse the pipeline argument as a class name in medacy.ner.pipelines.
    pipelines_module = importlib.import_module("medacy.ner.pipelines")
    pipeline_class = getattr(pipelines_module, args.pipeline)

    if args.word_embeddings is None:
        pipeline = pipeline_class(entities=labels)
    else:
        pipeline = pipeline_class(entities=labels,
                                  word_embeddings=args.word_embeddings)

    return dataset, Model(pipeline)

def train(args, dataset, model):
    """Fit *model* on *dataset*, saving it afterwards when a filename was given.

    When no ``--filename`` was supplied, asks the user whether to train
    anyway without persisting the result.
    """
    if args.filename is not None:
        model.fit(dataset)
        model.dump(args.filename)
        return

    answer = input('No filename given. Continue without saving the model at the end? (y/n) ')
    if answer.lower() == 'y':
        model.fit(dataset)
    else:
        print('Cancelling. Add filename with -f or --filename.')

def predict(args, dataset, model):
    """Load a trained model from ``args.model_path`` and predict over *dataset*."""
    model.load(args.model_path)
    model.predict(
        dataset,
        prediction_directory=args.predictions,
        groundtruth_directory=args.groundtruth,
    )

def cross_validate(args, dataset, model):
    """Run k-fold cross-validation of *model* over *dataset*."""
    model.cross_validate(
        num_folds=args.k_folds,
        training_dataset=dataset,
        prediction_directory=args.predictions,
        groundtruth_directory=args.groundtruth,
    )

def main():
    """Command-line entry point: parse arguments, configure logging, and
    dispatch to the train/predict/validate sub-command.

    Logs start/end timestamps and elapsed time to ``medacy.log`` (and to the
    console when ``--print_logs`` is given).
    """
    # Argparse setup
    parser = argparse.ArgumentParser(prog='medacy', description='Train and evaluate medaCy pipelines.')
    parser.add_argument('-p', '--print_logs', action='store_true', help='Use to print logs to console.')
    parser.add_argument('-pl', '--pipeline', default='ClinicalPipeline', help='Pipeline to use for training. Write the exact name of the class.')
    parser.add_argument('-d', '--dataset', required=True, help='Directory of dataset to use for training.')
    parser.add_argument('-w', '--word_embeddings', help='Path to word embeddings.')
    subparsers = parser.add_subparsers()

    # Train arguments
    parser_train = subparsers.add_parser('train', help='Train a new model.')
    parser_train.add_argument('-f', '--filename', help='Filename to use for saved model.')
    parser_train.set_defaults(func=train)

    # Predict arguments
    parser_predict = subparsers.add_parser('predict', help='Run predictions on the dataset using a trained model.')
    parser_predict.add_argument('-m', '--model_path', required=True, help='Trained model to load.')
    parser_predict.add_argument('-gt', '--groundtruth', action='store_true', help='Use to store groundtruth files.')
    parser_predict.add_argument('-pd', '--predictions', action='store_true', help='Use to store prediction files.')
    parser_predict.set_defaults(func=predict)

    # Cross Validation arguments
    parser_validate = subparsers.add_parser('validate', help='Cross validate a model on a given dataset.')
    parser_validate.add_argument('-k', '--k_folds', default=5, type=int, help='Number of folds to use for cross-validation.')
    parser_validate.add_argument('-gt', '--groundtruth', action='store_true', help='Use to store groundtruth files.')
    parser_validate.add_argument('-pd', '--predictions', action='store_true', help='Use to store prediction files.')
    parser_validate.set_defaults(func=cross_validate)

    # Parse initial args
    args = parser.parse_args()

    # Bug fix: when no sub-command is given, add_subparsers() never sets a
    # 'func' default, so the dispatch below crashed with AttributeError.
    # Fail with a proper usage error instead.
    if not hasattr(args, 'func'):
        parser.error('a sub-command is required (train, predict, or validate)')

    # Logging
    logging.basicConfig(filename='medacy.log', format='%(asctime)-15s: %(message)s', level=logging.INFO)
    if args.print_logs:
        logging.getLogger().addHandler(logging.StreamHandler())
    start_time = time.time()
    current_time = datetime.fromtimestamp(start_time).strftime('%Y_%m_%d_%H.%M.%S')
    # Lazy %-style args so the message is only formatted if emitted.
    logging.info('\nSTART TIME: %s', current_time)

    # Run proper function
    dataset, model = setup(args)
    args.func(args, dataset, model)

    # Calculate/print end time
    end_time = time.time()
    current_time = datetime.fromtimestamp(end_time).strftime('%Y_%m_%d_%H.%M.%S')
    logging.info('END TIME: %s', current_time)

    # Calculate/print time elapsed
    seconds_elapsed = end_time - start_time
    minutes_elapsed, seconds_elapsed = divmod(seconds_elapsed, 60)
    hours_elapsed, minutes_elapsed = divmod(minutes_elapsed, 60)

    logging.info('H:M:S ELAPSED: %d:%d:%d', hours_elapsed, minutes_elapsed, seconds_elapsed)

if __name__ == '__main__':
    main()
57 changes: 57 additions & 0 deletions medacy/data/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,63 @@ def get_data_files(self):
def __iter__(self):
return self.get_data_files().__iter__()

def get_labels(self):
    """
    Get all of the entities/labels used in the dataset.
    :return: A set of strings. Each string is a label used.
    """
    all_labels = set()

    # Collect the labels from every annotation file in the dataset.
    for data_file in self.all_data_files:
        annotations = Annotations(data_file.get_annotation_path())
        all_labels.update(annotations.get_labels())

    return all_labels

def get_training_data(self, data_format='spacy'):
    """
    Get training data in a specified format.

    :param data_format: The specified format as a string. Only 'spacy' is
        currently supported.
    :return: The requested data in the requested format.
    :raises TypeError: If data_format is not a supported format.
    """
    # Only spaCy format is currently supported.
    if data_format != 'spacy':
        # Bug fix: the message previously interpolated the *builtin*
        # ``format`` function ("%s" % format) instead of the argument.
        raise TypeError("Format %s not supported" % data_format)

    training_data = []

    # Add each entry in dataset with annotation to train_data
    for data_file in self.all_data_files:
        txt_path = data_file.get_text_path()
        ann_path = data_file.get_annotation_path()
        annotations = Annotations(ann_path, source_text_path=txt_path)
        training_data.append(annotations.get_entity_annotations(format='spacy'))

    return training_data

def get_subdataset(self, indices):
    """
    Get a subdataset of data files based on indices.

    :param indices: Iterable of ints that represent the indexes of the data
        files to split off.
    :return: Dataset object with only the specified data files.
    """
    subdataset = Dataset(self.data_directory)

    # Use a set for O(1) membership tests; the original scanned the
    # indices list once per file (O(n*k)).
    wanted = set(indices)
    subdataset.all_data_files = [
        data_file
        for i, data_file in enumerate(subdataset.get_data_files())
        if i in wanted
    ]

    return subdataset

def metamap(self, metamap, n_jobs=multiprocessing.cpu_count() - 1, retry_possible_corruptions=True):
"""
Expand Down
1 change: 1 addition & 0 deletions medacy/ner/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from .model.model import Model
from .model.spacy_model import SpacyModel
2 changes: 1 addition & 1 deletion medacy/ner/model/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
from .model import Model
from .stratified_k_fold import SequenceStratifiedKFold
from .stratified_k_fold import SequenceStratifiedKFold
12 changes: 9 additions & 3 deletions medacy/ner/model/_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,14 @@ def construct_annotations_from_tuples(doc, predictions):
annotations = {'entities': {}, 'relations': []}
T_num = 1
for prediction in predictions:
(entity, start, end) = prediction
labeled_text = doc.text[start:end]
if len(prediction) == 3:
(entity, start, end) = prediction
labeled_text = doc.text[start:end]
elif len(prediction) == 4:
(entity, start, end, labeled_text) = prediction
else:
raise ValueError("Incorrect prediction length.")

annotations['entities']['T%i' % T_num] = (entity, start, end, labeled_text)
T_num += 1
return Annotations(annotations)
return Annotations(annotations)
Loading

0 comments on commit 52314c9

Please sign in to comment.