diff --git a/README.rst b/README.rst index e0c28cb7..bc74f9dd 100644 --- a/README.rst +++ b/README.rst @@ -85,6 +85,10 @@ Or with the Arrow client: History ------- +- 4.3.0 + - Biopython requirements are now stricter; there is a maximum version that bcbio_gff supports + - Loading GFF3 now has a new flag, --cds_cleaning, to enable more Apollo-conformant behaviour for multi-exon transcripts + (https://github.com/galaxy-genome-annotation/python-apollo/issues/60, https://github.com/galaxy-genome-annotation/python-apollo/pull/62) - 4.2.13 - Relax biopython requirements - 4.2.12 diff --git a/apollo/annotations/__init__.py b/apollo/annotations/__init__.py index 10163b97..4ac593e6 100644 --- a/apollo/annotations/__init__.py +++ b/apollo/annotations/__init__.py @@ -1247,10 +1247,15 @@ def _get_type(self, rec): def _get_subfeature_type(self, rec): return rec.features[0].type - def _process_gff_entry(self, rec, source=None, disable_cds_recalculation=False, use_name=False): + def _process_gff_entry(self, rec, source=None, disable_cds_recalculation=False, use_name=False, cds_cleaning=False): new_feature_list = [] new_transcript_list = [] + kwargs = { + 'use_name': use_name, + 'disable_cds_recalculation': disable_cds_recalculation, + 'cds_cleaning': cds_cleaning, + } type = self._get_type(rec) log.debug("type " + str(type)) @@ -1260,8 +1265,7 @@ def _process_gff_entry(self, rec, source=None, disable_cds_recalculation=False, if type in util.gene_types: log.debug("is gene type") if len(feature.sub_features) > 0: - feature_data = util.yieldApolloData(feature, use_name=use_name, - disable_cds_recalculation=disable_cds_recalculation) + feature_data = util.yieldApolloData(feature, **kwargs) log.debug("output feature data" + str(feature_data)) if isinstance(feature_data, list): new_transcript_list += feature_data @@ -1269,30 +1273,25 @@ def _process_gff_entry(self, rec, source=None, disable_cds_recalculation=False, new_transcript_list.append(feature_data) else: log.debug("NO sub features, just 
adding directly") - feature_data = util.yieldApolloData(feature, use_name=use_name, - disable_cds_recalculation=disable_cds_recalculation) + feature_data = util.yieldApolloData(feature, **kwargs) log.debug("output feature data" + str(feature_data)) new_feature_list.append(feature_data) elif type in util.pseudogenes_types: - feature_data = util.yieldApolloData(feature, use_name=use_name, - disable_cds_recalculation=disable_cds_recalculation) + feature_data = util.yieldApolloData(feature, **kwargs) if isinstance(feature_data, list): new_feature_list += feature_data else: new_feature_list.append(feature_data) elif type in util.coding_transcript_types: - feature_data = util.yieldApolloData(feature, use_name=use_name, - disable_cds_recalculation=disable_cds_recalculation) + feature_data = util.yieldApolloData(feature, **kwargs) new_transcript_list.append(feature_data) elif type in util.noncoding_transcript_types: log.debug("a non-coding transcript") - feature_data = util.yieldApolloData(feature, use_name=use_name, - disable_cds_recalculation=disable_cds_recalculation) + feature_data = util.yieldApolloData(feature, **kwargs) new_feature_list.append(feature_data) log.debug("new feature list " + str(new_feature_list)) elif type in util.single_level_feature_types: - feature_data = util.yieldApolloData(feature, use_name=use_name, - disable_cds_recalculation=disable_cds_recalculation) + feature_data = util.yieldApolloData(feature, **kwargs) new_feature_list.append(feature_data) else: log.debug("unknown type " + type + " ") @@ -1303,6 +1302,7 @@ def load_gff3(self, organism, gff3, source=None, batch_size=1, test=False, use_name=False, disable_cds_recalculation=False, + cds_cleaning=False, timing=False, ): """ @@ -1329,6 +1329,13 @@ def load_gff3(self, organism, gff3, source=None, batch_size=1, :type disable_cds_recalculation: bool :param disable_cds_recalculation: Disable CDS recalculation and instead use the one provided + :type cds_cleaning: bool + :param cds_cleaning: This 
changes the behaviour of creating GFF3 + features in apollo to match more closely to what it expects. Generally + you'll probably want this on if you have transcripts with multiple + exons and CDSs, but we don't want to change existing scripts + so we are not defaulting this on. + :type timing: bool :param timing: Output loading performance metrics @@ -1361,7 +1368,8 @@ def load_gff3(self, organism, gff3, source=None, batch_size=1, log.info("Processing %s with features: %s" % (rec.id, rec.features)) processed = self._process_gff_entry(rec, source=source, disable_cds_recalculation=disable_cds_recalculation, - use_name=use_name + use_name=use_name, + cds_cleaning=cds_cleaning ) all_processed['top-level'].extend(processed['top-level']) all_processed['transcripts'].extend(processed['transcripts']) diff --git a/apollo/util.py b/apollo/util.py index 802625be..8741a961 100644 --- a/apollo/util.py +++ b/apollo/util.py @@ -99,7 +99,7 @@ def _tnType(feature): return 'exon' -def _yieldGeneData(gene, disable_cds_recalculation=False, use_name=False): +def _yieldGeneData(gene, disable_cds_recalculation=False, use_name=False, cds_cleaning=False): current = _yieldSubFeatureData(gene, disable_cds_recalculation=disable_cds_recalculation, use_name=use_name) if gene.sub_features: @@ -122,7 +122,30 @@ def _yieldGeneData(gene, disable_cds_recalculation=False, use_name=False): # # TODO: handle GO, Gene Product, Provenance if 'children' in current and gene.type == 'gene': + if not cds_cleaning: + return current['children'] + # Only sending mRNA level as apollo is more comfortable with orphan mRNAs + for mRNA in current['children']: + new_mRNA_children = [] + new_cds = None + for feature in mRNA['children']: + if feature['type']['name'] == 'CDS': + if new_cds: + new_cds_start = new_cds['location']['fmin'] + new_cds_end = new_cds['location']['fmax'] + this_cds_start = feature['location']['fmin'] + this_cds_end = feature['location']['fmax'] + new_cds['location']['fmin'] = min(new_cds_start, 
this_cds_start) + new_cds['location']['fmax'] = max(new_cds_end, this_cds_end) + else: + new_cds = feature + else: + new_mRNA_children.append(feature) + if new_cds: + mRNA['children'] = new_mRNA_children + mRNA['children'].append(new_cds) + return current['children'] else: # No children, return a generic gene feature @@ -205,21 +228,27 @@ def _yieldNonCodingTranscriptData(features, disable_cds_recalculation=False, use # return _yieldSubFeatureData(features[0]) -def yieldApolloData(feature, use_name=False, disable_cds_recalculation=False): +def yieldApolloData(feature, use_name=False, disable_cds_recalculation=False, cds_cleaning=False): + # manually created a kwargs so we don't lose the actual method signature on yieldApolloData + kwargs = { + 'use_name': use_name, + 'disable_cds_recalculation': disable_cds_recalculation, + 'cds_cleaning': cds_cleaning, + } feature_type = _tnType(feature) if feature_type in gene_types: - return _yieldGeneData(feature) + return _yieldGeneData(feature, **kwargs) elif feature_type in pseudogenes_types: - return _yieldGeneData(feature) + return _yieldGeneData(feature, **kwargs) elif feature_type in coding_transcript_types: - return _yieldCodingTranscriptData(feature) + return _yieldCodingTranscriptData(feature, **kwargs) elif feature_type in noncoding_transcript_types: - return _yieldNonCodingTranscriptData(feature) + return _yieldNonCodingTranscriptData(feature, **kwargs) elif feature_type in single_level_feature_types: # return _yieldSingleLevelFeatureData(current_feature) - return _yieldSubFeatureData(feature) + return _yieldSubFeatureData(feature, **kwargs) else: - return _yieldSubFeatureData(feature) + return _yieldSubFeatureData(feature, **kwargs) # # if OGS: # # TODO: handle comments @@ -288,17 +317,23 @@ def add_property_to_feature(feature, property_key, property_value): return feature -def features_to_apollo_schema(features, use_name=False, disable_cds_recalculation=False): +def features_to_apollo_schema(features, 
use_name=False, disable_cds_recalculation=False, cds_cleaning=False): """ - :param disable_cds_recalculation: :param use_name: :param features: + :param cds_cleaning: :return: """ + kwargs = { + 'use_name': use_name, + 'disable_cds_recalculation': disable_cds_recalculation, + 'cds_cleaning': cds_cleaning, + } + compiled = [] for f in features: - compiled.append(yieldApolloData(f, use_name=use_name, disable_cds_recalculation=disable_cds_recalculation)) + compiled.append(yieldApolloData(f, **kwargs)) return compiled diff --git a/arrow/__init__.py b/arrow/__init__.py index aa33e3a7..5ee6158c 100644 --- a/arrow/__init__.py +++ b/arrow/__init__.py @@ -1 +1 @@ -__version__ = '4.2.13' +__version__ = '4.3.0' diff --git a/arrow/commands/annotations/load_gff3.py b/arrow/commands/annotations/load_gff3.py index 0559664c..949b68e0 100644 --- a/arrow/commands/annotations/load_gff3.py +++ b/arrow/commands/annotations/load_gff3.py @@ -33,6 +33,11 @@ help="Disable CDS recalculation and instead use the one provided", is_flag=True ) +@click.option( + "--cds_cleaning", + help="This changes the behaviour of creating GFF3 features in apollo to match more closely to what it expects. 
Generally you'll probably want this on if you have transcripts with multiple exons and CDSs, but we don't want to change existing scripts so we are not defaulting this on.", + is_flag=True +) @click.option( "--timing", help="Output loading performance metrics", @@ -41,11 +46,11 @@ @pass_context @custom_exception @str_output -def cli(ctx, organism, gff3, source="", batch_size=1, test=False, use_name=False, disable_cds_recalculation=False, timing=False): +def cli(ctx, organism, gff3, source="", batch_size=1, test=False, use_name=False, disable_cds_recalculation=False, cds_cleaning=False, timing=False): """Load a full GFF3 into annotation track Output: Loading report """ - return ctx.gi.annotations.load_gff3(organism, gff3, source=source, batch_size=batch_size, test=test, use_name=use_name, disable_cds_recalculation=disable_cds_recalculation, timing=timing) + return ctx.gi.annotations.load_gff3(organism, gff3, source=source, batch_size=batch_size, test=test, use_name=use_name, disable_cds_recalculation=disable_cds_recalculation, cds_cleaning=cds_cleaning, timing=timing) diff --git a/setup.py b/setup.py index e4cc11f4..0bea9cc5 100644 --- a/setup.py +++ b/setup.py @@ -16,11 +16,11 @@ setup( name="apollo", - version='4.2.13', + version='4.3.0', description="Apollo API library", long_description=readme, author="Helena Rasche;Anthony Bretaudeau;Nathan Dunn", - author_email="hxr@hx42.org", + author_email="hexylena@galaxians.org", url='https://github.com/galaxy-genome-annotation/python-apollo', packages=['apollo', 'arrow'] + subpackages, entry_points='''