Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: automatically retrieve references/annotations and cache them #531

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions snappy_pipeline/workflows/reference/Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
"""CUBI Pipeline reference step Snakefile"""

import os

from snappy_pipeline import expand_ref
from snappy_pipeline.workflows.reference import ReferenceWorkflow

__author__ = "Till Hartmann <[email protected]>"


# Configuration ===============================================================


# Load the project configuration; it is available as ``config`` afterwards.
configfile: "config.yaml"


# Expand "$ref" JSON pointers in configuration (also works for YAML).
# ``expand_ref`` returns the expanded configuration plus the lookup and config
# paths, all three of which are handed to the workflow object below.
config, lookup_paths, config_paths = expand_ref("config.yaml", config)

# WorkflowImpl Object Setup ===================================================

# Step implementation object, bound to the current working directory; the
# rules in this file ask it for the files they should produce.
wf = ReferenceWorkflow(workflow, config, lookup_paths, config_paths, os.getcwd())

# Rules =======================================================================


# Default target of this Snakefile: requests every file returned by
# ``wf.get_result_files()``.
rule reference_all:
    input:
        wf.get_result_files(),
    default_target: True


# Create the FASTA file for one reference, selected by the ``{reference}``
# wildcard.
# NOTE(review): the shell command only touches the output, i.e. it creates an
# empty placeholder file; no sequence data is retrieved by this rule.
rule reference_retrieve_fasta_run:
    output:
        # Marked ``protected`` so the file is write-protected once created.
        fasta=protected("work/reference/{reference}/reference.fasta"),
    params:
        # Expose the wildcard value as a parameter (not used by the shell
        # command below).
        reference=lambda wildcards: wildcards.reference,
    # Make the rule eligible for Snakemake's between-workflow cache;
    # presumably "omit-software" leaves the software environment out of the
    # cache key -- confirm against the Snakemake caching documentation.
    cache: "omit-software"
    shell:
        """
        touch {output.fasta}
        """
99 changes: 99 additions & 0 deletions snappy_pipeline/workflows/reference/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# -*- coding: utf-8 -*-
"""Implementation of the ``reference`` step

=====================
Default Configuration
=====================

The default configuration is as follows.

.. include:: DEFAULT_CONFIG_reference.rst

"""

from biomedsheets.shortcuts import GenericSampleSheet

from snappy_pipeline.utils import dictify, listify
from snappy_pipeline.workflows.abstract import BaseStepPart, BaseStep, LinkInStep
from .model import ReferenceModel as ReferenceConfigModel

#: Default configuration for the ``reference`` step, generated from the
#: defaults declared on ``ReferenceConfigModel``.
DEFAULT_CONFIG = ReferenceConfigModel.default_config_yaml_string()


class ReferenceStepPart(BaseStepPart):
    """Reference retrieval common features.

    Provides the path templates shared by the sub steps of the ``reference``
    step. ``name`` is interpolated as ``{source}`` into every template; it is
    empty here and presumably overridden by concrete sub steps. The doubled
    braces (``{{reference_name}}``) survive ``str.format`` and leave a literal
    ``{reference_name}`` placeholder in the resulting path.
    """

    #: Step name
    name = ""

    #: Class available actions
    actions = ("run",)

    def __init__(self, parent):
        super().__init__(parent)
        # Use the same ``{reference_name}`` wildcard as the log templates in
        # ``_get_log_file``: Snakemake requires that the output and log files
        # of a rule contain the same set of wildcards.
        self.base_path_out = "work/{source}.{{reference_name}}"

    @dictify
    def get_output_files(self, action):
        """Return output files for ``action`` as ``out_done`` -> path template.

        The action is checked with ``self._validate_action`` first. The pairs
        returned here are passed through the ``dictify`` decorator.
        """
        # Validate action
        self._validate_action(action)
        return (("out_done", self.base_path_out.format(source=self.name) + "/out/.done"),)

    @dictify
    def _get_log_file(self, action):
        """Return dict of log files.

        Yields a ``done`` marker plus, for each of ``log``, ``conda_info`` and
        ``conda_list``, the file itself and its ``.md5`` companion (key suffix
        ``_md5``). The action is checked with ``self._validate_action`` first.
        """
        # Validate action
        self._validate_action(action)
        prefix = "work/{source}/log/{source}.{{reference_name}}".format(source=self.name)
        key_ext = (
            ("log", ".log"),
            ("conda_info", ".conda_info.txt"),
            ("conda_list", ".conda_list.txt"),
        )
        yield (
            "done",
            "work/{source}.{{reference_name}}/log/.done".format(source=self.name),
        )
        for key, ext in key_ext:
            yield key, prefix + ext
            yield key + "_md5", prefix + ext + ".md5"

    def get_args(self, action):
        """Return function that maps wildcards to a dict of arguments.

        The returned function ignores the wildcards and always returns an
        empty dict. The action is checked with ``self._validate_action``
        before the function is handed out.
        """

        def args_function(wildcards):
            return {}

        # Validate action
        self._validate_action(action)
        return args_function


class ReferenceWorkflow(BaseStep):
    """Automatically retrieve reference data"""

    #: Step name
    name = "reference"

    #: Default biomed sheet class
    sheet_shortcut_class = GenericSampleSheet

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ``BaseStep`` and register the sub steps.

        ``ReferenceConfigModel`` is always passed as ``config_model_class``.
        """
        super().__init__(*args, **kwargs, config_model_class=ReferenceConfigModel)
        self.register_sub_step_classes((LinkInStep,))

    @classmethod
    def default_config_yaml(cls):
        """Return default config YAML, to be overwritten by project-specific one"""
        return DEFAULT_CONFIG

    @listify
    def get_result_files(self):
        """Return list of result files for the reference workflow.

        One ``.done`` marker path is produced per configured reference, built
        from the reference's ``source`` and its name (the key in the
        ``references`` mapping of the configuration).
        """
        tpls = ("output/{source}/{reference_name}/out/.done",)
        # ``.items()`` is required: iterating the mapping itself yields only
        # the keys, so unpacking into ``name, reference`` would fail.
        for name, reference in self.config["references"].items():
            for tpl in tpls:
                yield tpl.format(source=reference.source, reference_name=name)
80 changes: 80 additions & 0 deletions snappy_pipeline/workflows/reference/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
from enum import StrEnum
from typing import Annotated

from pydantic import Field

from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel


# Origins a reference can come from. ``Custom`` pairs with
# ``Reference.custom_url``, whose documentation says it is only used for this
# source.
class Source(StrEnum):
    Ensembl = "Ensembl"
    NCBI = "NCBI"
    Custom = "Custom"


# Data type of a reference (see ``Reference.datatype``).
# NOTE(review): the values look like sequence categories (DNA, CDS, cDNA,
# ncRNA, peptide) as offered by sequence providers -- confirm the intended
# mapping for each ``Source``.
class DataType(StrEnum):
    dna = "dna"
    cds = "cds"
    cdna = "cdna"
    ncrna = "ncrna"
    pep = "pep"


# A named region of a reference (see ``Reference.regions``).
class Region(SnappyModel):
    # Name of the region; always required.
    name: str
    # Optional bounds. They default to ``None`` so a region can be given by
    # name alone; without a default, pydantic treats ``int | None`` fields as
    # required and rejects entries that omit them.
    # NOTE(review): coordinate convention (0- or 1-based, inclusive or
    # exclusive end) is not defined here -- confirm with the consumer.
    start: int | None = None
    end: int | None = None


# One annotation entry of a reference (value type of ``Reference.annotations``).
class Annotation(SnappyModel):
    # Optional list of strings, ``None`` by default.
    # NOTE(review): the meaning of the entries (e.g. reference names this
    # annotation applies to) is not visible here -- confirm.
    reference: list[str] | None = None


# Definition of a single reference as configured for the ``reference`` step.
# Required fields (no default): description, source, species, taxon_id,
# datatype and release. All other fields are optional.
class Reference(SnappyModel):
    description: str
    """Description of the reference."""

    source: Annotated[Source, EnumField(Source)]
    """Source of the reference."""

    # NOTE(review): nothing in this class enforces that ``custom_url`` is set
    # when ``source`` is ``Custom``.
    custom_url: str | None = Field(
        None, examples=["file:///path/to/reference.fa", "http://example.com/reference.fa"]
    )
    """URL to custom reference. Only used when source is 'Custom'."""

    species: str = Field(examples=["Homo Sapiens"])
    """Species name."""

    taxon_id: str | int = Field(examples=[9606])
    """Taxon ID."""

    datatype: Annotated[DataType, EnumField(DataType)]
    """Data type of the reference."""

    release: str | int = Field(examples=[112])
    """Release of the reference."""

    build: str | None = Field(None, examples=["GRCh37", "GRCh38"])
    """Build of the reference."""

    branch: str | None = Field(None, examples=["grch37"])
    """Branch of the reference."""

    exclude_contigs: str | None = None
    """Regular expression to exclude contigs with"""

    regions: list[Region] | None = None
    """Regions of the reference."""

    additional_sequences: list[str] | None = None
    """List of local fasta files to add to the reference"""

    # Annotations attached to this reference, keyed by a string; empty by
    # default.
    annotations: dict[str, Annotation] = {}


# Configuration model of the ``reference`` step.
class ReferenceModel(SnappyStepModel):
    # Mapping from reference name to its definition. The default holds a single
    # example entry. ``description`` has to be passed because it is a required
    # field of ``Reference``; leaving it out fails validation while this class
    # body is evaluated, i.e. as soon as the module is imported.
    references: dict[str, Reference] = {
        "GRCh38-foo": Reference(
            description="Example reference: Homo Sapiens DNA from Ensembl release 112",
            source="Ensembl",
            species="Homo Sapiens",
            taxon_id=9606,
            datatype="dna",
            release=112,
        )
    }
Loading