From 140b575594ebe10c953f1c50a1121bb250fc788c Mon Sep 17 00:00:00 2001 From: Mikkel Christensen Date: Fri, 26 Mar 2021 12:37:57 +0000 Subject: [PATCH] adding solution for complex split-merge. Making sure a gene is only created once even if it is part of more than one event. Also making sure a gene/event is only submitted once to the database and written once to the history file by adding status flags to the feature. --- allocation_service/annotation_events.py | 15 +++-- allocation_service/event_output.py | 12 ++-- allocation_service/genomic_features.py | 4 +- .../tests/test_allocation_pipeline.py | 56 +++++++++++++++++-- 4 files changed, 72 insertions(+), 15 deletions(-) diff --git a/allocation_service/annotation_events.py b/allocation_service/annotation_events.py index 1ce93a8..5a8344d 100644 --- a/allocation_service/annotation_events.py +++ b/allocation_service/annotation_events.py @@ -70,13 +70,16 @@ def setup(self, index): def _create_genes(self, event, index): genes = list() for gene_model in event: - gene = ProteinCodingGene(gene_model, index) + if gene_model['id'] not in index: + gene = ProteinCodingGene(gene_model, index) + if gene.source != 'reference': + self.new_gene_count += 1 + self.created_genes.append(gene) + else: + gene = index[gene_model['id']] genes.append(gene) self.gene_event_index[gene.source_id] = genes - if gene.source != 'reference': - self.new_gene_count += 1 - self.created_genes.append(gene) return genes def _allocate_to_gene(self, osid_id, gene_id): @@ -98,10 +101,12 @@ def update_ancestors(self): ancestors = list() for gene in event: if gene.source == 'reference': + gene.known_events.add(self.event_type) ancestors.append(gene) for gene in event: if gene.source != 'reference': - gene.ancestors = ancestors + for ancestor_gene in ancestors: + gene.ancestors.add(ancestor_gene) class CreateGeneModelEvent(AnnotationEvent): diff --git a/allocation_service/event_output.py b/allocation_service/event_output.py index 30f2586..bca0c39 100644 --- 
a/allocation_service/event_output.py +++ b/allocation_service/event_output.py @@ -95,15 +95,18 @@ def write_event_file(self): for annotation_event_type in self.event_collection.annotation_event_list: event_type = annotation_event_type.event_type for event in annotation_event_type.event_list: + flag = event_type + '_written_to_history_file' for gene in event: - if gene.source_id != 'reference': + if gene.source != 'reference' and flag not in gene.status_flags: + gene.status_flags.add(flag) if len(gene.ancestors) == 0 and gene.allocated_id: self.file_handle.write(gene.allocated_id + "\t" + event_type + "\t" + '' + "\n") else: for ancestor in gene.ancestors: - self.file_handle.write(gene.allocated_id - + "\t" + event_type + "\t" + ancestor.source_id + "\n") + if event_type in ancestor.known_events: + self.file_handle.write(gene.allocated_id + + "\t" + event_type + "\t" + ancestor.source_id + "\n") self.file_handle.close() @@ -121,8 +124,9 @@ def __init__(self, session_database, application_id, production_database_id, com for event in annotation_event_type.event_list: for gene in event: - if gene.source_id != 'reference' and gene.allocated_id: + if gene.allocated_id and "written_to_session_database" not in gene.status_flags: self.add_feature(gene, 'gene') + gene.status_flags.add("written_to_session_database") for mrna in gene.mrnas: self.add_feature(mrna, 'transcript') diff --git a/allocation_service/genomic_features.py b/allocation_service/genomic_features.py index 6b53659..473f8d6 100644 --- a/allocation_service/genomic_features.py +++ b/allocation_service/genomic_features.py @@ -22,7 +22,9 @@ def __init__(self, model, index): self.source_id = str() self.allocated_id = str() self.osid_id = int - self.ancestors = list() + self.ancestors = set() + self.status_flags = set() + self.known_events = set() self.setup_model(model) self._register_my_self(index) diff --git a/allocation_service/tests/test_allocation_pipeline.py 
b/allocation_service/tests/test_allocation_pipeline.py index bf694d3..b697acc 100644 --- a/allocation_service/tests/test_allocation_pipeline.py +++ b/allocation_service/tests/test_allocation_pipeline.py @@ -19,19 +19,28 @@ def get_organism_id(organism_name): @staticmethod def get_gene_id(organism_id, generate_genes): _ = organism_id - - if generate_genes == 2: + if generate_genes == 3: + return 1, [{"geneId": "ABC00015", "transcripts": [], "proteins": []}, + {"geneId": "ABC00016", "transcripts": [], "proteins": []}, + {"geneId": "ABC00017", "transcripts": [], "proteins": []}] + elif generate_genes == 2: return 1, [{"geneId": "ABC00015", "transcripts": [], "proteins": []}, - {"geneId": "ABC00016", "transcripts": [], "proteins": []}] + {"geneId": "ABC00016", "transcripts": [], "proteins": []}] elif generate_genes == 1: return 1, [{"geneId": "ABC00015", "transcripts": [], "proteins": []}] + else: + print("Error " + str(generate_genes)) @staticmethod def get_transcripts(id_set_id, transcript_patch): _ = id_set_id - if len(transcript_patch) == 2: + if len(transcript_patch) == 3: + return [{"geneId": "ABC00015", "transcripts": ['ABC00015_R001'], "proteins": ['ABC00015_P001']}, + {"geneId": "ABC00016", "transcripts": ['ABC00016_R001'], "proteins": ['ABC00016_P001']}, + {"geneId": "ABC00017", "transcripts": ['ABC00017_R001'], "proteins": ['ABC00017_P001']}] + elif len(transcript_patch) == 2: return [{"geneId": "ABC00015", "transcripts": ['ABC00015_R001'], "proteins": ['ABC00015_P001']}, - {"geneId": "ABC00016", "transcripts": ['ABC00016_R001'], "proteins": ['ABC00016_P001']}] + {"geneId": "ABC00016", "transcripts": ['ABC00016_R001'], "proteins": ['ABC00016_P001']}] elif len(transcript_patch) == 1: return [{"geneId": "ABC00015", "transcripts": ['ABC00015_R001'], "proteins": ['ABC00015_P001']}] @@ -54,6 +63,39 @@ def get_annotations_events(event_type): merge_1 = [ref_model1, ref_model2, merge_model1] events.append(merge_1) + return events + elif event_type == 
'complex_split': + events = list() + ref_model_s1 = {"source": "reference", "id": "AARA004952", "children": [{"id": "AARA004952_R0001", "version": 2, + "children": [{"id": "AARA004952_P0001", "version": 2}]}]} + split_model_s1a = {"source": "apollo", "id": "dd6f006e-613d-4507-84ec-d00e2097cd88", "children": [{"id": "DHEYODH-DHYERS-dd6f006e", "version": 2, + "children": [{"id": None, "version": 2}]}]} + split_model_s1b = {"source": "apollo", "id": "5d6f2e78-566e-4a3b-8534-d3422b77734d", "children": [{"id": "DHEYODH-DHYERS-5d6f2e78", "version": 2, + "children": [{"id": None, "version": 2}]}]} + complex_split1 = [ref_model_s1, split_model_s1a, split_model_s1b] + events.append(complex_split1) + + ref_model_s2 = {"source": "reference", "id": "AARA004953", "children": [{"id": "AARA004953_R0001", "version": 2, + "children": [{"id": "AARA004953_P0001", "version": 2}]}]} + split_model_s2a = {"source": "apollo", "id": "fd03de20-5f52-49a7-88b8-6f79443ff90b", "children": [{"id": "DHEYODH-DHYERS-fd03de20", "version": 2, + "children": [{"id": None, "version": 2}]}]} + split_model_s2b = {"source": "apollo", "id": "5d6f2e78-566e-4a3b-8534-d3422b77734d", "children": [{"id": "DHEYODH-DHYERS-5d6f2e78", "version": 2, + "children": [{"id": None, "version": 2}]}]} + complex_split2 = [ref_model_s2, split_model_s2a, split_model_s2b] + events.append(complex_split2) + + return events + + elif event_type == 'complex_merge': + events = list() + ref_model_m1 = {"source": "reference", "id": "AARA004952", "children": [{"id": "AARA004952_R0001", "version": 2, + "children": [{"id": "AARA004952_P0001", "version": 2}]}]} + ref_model_m2 = {"source": "reference", "id": "AARA004953", "children": [{"id": "AARA004953_R0001", "version": 2, + "children": [{"id": "AARA004953_P0001", "version": 2}]}]} + merge_model_m12 = {"source": "apollo", "id": "5d6f2e78-566e-4a3b-8534-d3422b77734d", "children": [{"id": "DHEYODH-DHYERS-5d6f2e78", "version": 2, + "children": [{"id": None, "version": 2}]}]} + complex_merge 
= [ref_model_m1, ref_model_m2, merge_model_m12] + events.append(complex_merge) return events else: return False @@ -81,6 +123,10 @@ def test_create_event_collection(self): self.assertEqual('ABC00015_R001', event_collection.get_allocated_id('DHEYODH-DHYERS')) self.assertEqual('ABC00015_P001', event_collection.get_allocated_id('DHEYODH-DHYERS-CDS')) + event_collection = EventCollection('test', event_connection, stable_id_service) + event_collection.event_types = {'complex_split', 'complex_merge'} + event_collection.create() + class EventFileTestCase(unittest.TestCase):