Skip to content

Commit

Permalink
Merge pull request #615 from nextstrain/drop-backward-compatibility
Browse files Browse the repository at this point in the history
Drop backward compatibility
  • Loading branch information
huddlej authored May 7, 2021
2 parents a9b66a2 + 98d51a6 commit c7a707f
Show file tree
Hide file tree
Showing 14 changed files with 264 additions and 461 deletions.
23 changes: 22 additions & 1 deletion Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ from getpass import getuser
from snakemake.logging import logger
from snakemake.utils import validate
from collections import OrderedDict
import textwrap
import time

# Store the user's configuration prior to loading defaults, so we can check for
Expand Down Expand Up @@ -74,6 +75,26 @@ if "builds" not in config:

include: "workflow/snakemake_rules/reference_build_definitions.smk"

# Check for old-style input file references and alert users to the new format.
if "sequences" in config or "metadata" in config:
    # Use .get() with example fallbacks: the guard above is an OR, so only
    # one of the two deprecated keys may be present, and indexing the other
    # directly would raise a KeyError while we are trying to report an error.
    old_metadata = config.get("metadata", "data/example_metadata.tsv")
    old_sequences = config.get("sequences", "data/example_sequences.fasta.gz")
    logger.error("ERROR: Your configuration file includes references to an unsupported specification of input files (e.g., `config['sequences']` or `config['metadata']`).")
    logger.error("Update your configuration file (e.g., 'builds.yaml') to define your inputs as follows and try running the workflow again:")
    logger.error(textwrap.indent(
        f"\ninputs:\n name: local-data\n metadata: {old_metadata}\n sequences: {old_sequences}\n",
        "  "
    ))
    sys.exit(1)

# Check for missing inputs.
if "inputs" not in config:
    logger.error("ERROR: Your workflow does not define any input files to start with.")
    logger.error("Update your configuration file (e.g., 'builds.yaml') to define at least one input dataset as follows and try running the workflow again:")
    logger.error(textwrap.indent(
        # No placeholders here, so a plain string literal suffices.
        "\ninputs:\n name: local-data\n metadata: data/example_metadata.tsv\n sequences: data/example_sequences.fasta.gz\n",
        "  "
    ))
    sys.exit(1)

# Allow users to specify a list of active builds from the command line.
if config.get("active_builds"):
BUILD_NAMES = config["active_builds"].split(",")
Expand All @@ -93,7 +114,7 @@ wildcard_constraints:
# but not special strings used for Nextstrain builds.
build_name = r'(?:[_a-zA-Z-](?!(tip-frequencies)))+',
date = r"[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]",
origin = r"(_[a-zA-Z0-9-]+)?" # origin starts with an underscore _OR_ it's the empty string
origin = r"[a-zA-Z0-9-_]+"

localrules: download_metadata, download_sequences, clean

Expand Down
8 changes: 3 additions & 5 deletions defaults/parameters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,6 @@
# This must be a relative path to the top-level Snakefile directory (e.g., `ncov/`).
conda_environment: "workflow/envs/nextstrain.yaml"

# These are the two main starting files for the run.
# If they do not exist, we will attempt to fetch them from a S3 bucket (see below)
sequences: "data/sequences.fasta"
metadata: "data/metadata.tsv"

strip_strain_prefixes:
- hCoV-19/
- SARS-CoV-2/
Expand All @@ -36,6 +31,9 @@ files:
clades: "defaults/clades.tsv"
emerging_lineages: "defaults/emerging_lineages.tsv"

# Define genes to translate during alignment by nextalign.
genes: ["ORF1a", "ORF1b", "S", "ORF3a", "M", "N"]

# Filter settings
filter:
# Require nearly full-length genomes.
Expand Down
15 changes: 15 additions & 0 deletions docs/change_log.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,21 @@
As of April 2021, we use major version numbers (e.g. v2) to reflect backward incompatible changes to the workflow that likely require you to update your Nextstrain installation.
We also use this change log to document new features that maintain backward compatibility, indicating these features by the date they were added.

## v5 (7 May 2021)

[See the corresponding pull request](https://github.com/nextstrain/ncov/pull/615) for more details about this release.

### Major changes

- Drop support for old sequence/metadata inputs
- Use nextalign for alignment instead of mafft

### Minor changes

- Drop unused haplotype status rule and script
- Remove unused nucleotide mutation frequencies rule
- Use augur distance for mutation counts

## v4 (5 May 2021)

[See the corresponding pull request](https://github.com/nextstrain/ncov/pull/605) for more details about changes in this release.
Expand Down
20 changes: 9 additions & 11 deletions docs/multiple_inputs.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,15 +51,17 @@ my_profiles/example_multiple_inputs/my_auspice_config.json

## Setting up the config

You can define a single input dataset in `builds.yaml` as follows.

```yaml
# traditional syntax for specifying starting files
sequences: "data/sequences.fasta"
metadata: "data/metadata.tsv"
inputs:
- name: my-data
metadata: "data/metadata.tsv"
sequences: "data/sequences.fasta"
```
For multiple inputs, you can add another entry to the `inputs` config list.
Here, we will give them the names "aus" and "worldwide":

```yaml
# my_profiles/example_multiple_inputs/builds.yaml
Expand All @@ -72,15 +74,11 @@ inputs:
sequences: "data/example_sequences_worldwide.fasta"
```

> Note that if you also specify `sequences` or `metadata` as top level entries in the config, they will be ignored.

### Snakemake terminology

Inside the Snakemake rules, we use a wildcard `origin` to define different starting points.

For instance, if we ask for the file `results/aligned_worldwide.fasta` then `wildcards.origin="worldwide"` and we expect that the config has defined
a sequences input as shown above.

## How is metadata combined?

Expand Down
6 changes: 6 additions & 0 deletions my_profiles/example/builds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,12 @@

# In this example, we use these default methods. See other templates for examples of how to customize this subsampling scheme.

# Define input files.
inputs:
- name: example-data
metadata: data/example_metadata.tsv
sequences: data/example_sequences.fasta

builds:
# Focus on King County (location) in Washington State (division) in the USA (country)
# with a build name that will produce the following URL fragment on Nextstrain/auspice:
Expand Down
4 changes: 0 additions & 4 deletions my_profiles/example/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,6 @@ configfile:
- defaults/parameters.yaml # Pull in the default values
- my_profiles/example/builds.yaml # Pull in our list of desired builds

config:
- sequences=data/example_sequences.fasta
- metadata=data/example_metadata.tsv

# Set the maximum number of cores you want Snakemake to use for this pipeline.
cores: 2

Expand Down
6 changes: 6 additions & 0 deletions my_profiles/getting_started/builds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@
# These subsample primarily from the area of interest ("focus"), and add in background ("contextual") sequences from the rest of the world.
# Contextual sequences that are genetically similar to (hamming distance) and geographically near the focal sequences are heavily prioritized.

# Define input files.
inputs:
- name: example-data
metadata: data/example_metadata.tsv
sequences: data/example_sequences.fasta.gz

# In this example, we use these default methods. See other templates for examples of how to customize this subsampling scheme.
builds:
# This build samples evenly from the globe
Expand Down
4 changes: 0 additions & 4 deletions my_profiles/getting_started/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,6 @@ configfile:
- defaults/parameters.yaml # Pull in the default values
- my_profiles/getting_started/builds.yaml # Pull in our list of desired builds

config:
- sequences=data/example_sequences.fasta
- metadata=data/example_metadata.tsv

# Set the maximum number of cores you want Snakemake to use for this pipeline.
cores: 1

Expand Down
36 changes: 0 additions & 36 deletions scripts/annotate-haplotype-status.py

This file was deleted.

55 changes: 0 additions & 55 deletions scripts/mutation_counts.py

This file was deleted.

62 changes: 21 additions & 41 deletions workflow/snakemake_rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -25,89 +25,69 @@ def numeric_date(dt=None):

return res

def _trim_origin(origin):
"""the origin wildcard includes a leading `_`. This function returns the value without this `_`"""
if origin=="":
return ""
return origin[1:]

def _get_subsampling_scheme_by_build_name(build_name):
    """Return the subsampling scheme configured for *build_name*.

    Falls back to the build name itself when the build does not declare an
    explicit ``subsampling_scheme`` entry.
    """
    build_params = config["builds"][build_name]
    return build_params.get("subsampling_scheme", build_name)

def _get_filter_value(wildcards, key):
    """Return the filter setting *key* for the current origin.

    Looks up an origin-specific override in ``config["filter"]`` keyed by
    ``wildcards["origin"]`` and falls back to the top-level value for *key*
    (or the empty string) when the origin does not override it.

    Note: diff-extraction residue removed — the old empty-origin branch and
    its call to the deleted ``_trim_origin`` helper belonged to the removed
    backward-compatibility code path, leaving one unreachable duplicate
    return statement.
    """
    default = config["filter"].get(key, "")
    return config["filter"].get(wildcards["origin"], {}).get(key, default)

def _get_path_for_input(stage, origin_wildcard):
    """
    A function called to define an input for a Snakemake rule.

    This function always returns a local filepath, the format of which
    decides whether rules should create this by downloading from a remote
    resource, or create it by a local compute rule.

    Parameters
    ----------
    stage : str
        One of "metadata", "sequences", "aligned", "to-exclude", "masked",
        or "filtered".
    origin_wildcard : str
        The name of an entry under ``config["inputs"]``.

    Raises
    ------
    Exception
        If the input uses an unsupported URL scheme, if a required
        metadata/sequences path is missing from the config, or if *stage*
        is unknown.
    """
    path_or_url = config.get("inputs", {}).get(origin_wildcard, {}).get(stage, "")
    scheme = urlsplit(path_or_url).scheme
    # Any URL scheme (currently only s3) means the file must be downloaded.
    remote = bool(scheme)

    # Following checking should be the remit of the rule which downloads the remote resource
    if scheme and scheme!="s3":
        raise Exception(f"Input defined scheme {scheme} which is not yet supported.")

    ## Basic checking which could be taken care of by the config schema
    ## If asking for metadata/sequences, the config _must_ supply a `path_or_url`
    if path_or_url=="" and stage in ["metadata", "sequences"]:
        raise Exception(f"ERROR: config->input->{origin_wildcard}->{stage} is not defined.")

    if stage=="metadata":
        return f"data/downloaded_{origin_wildcard}.tsv" if remote else path_or_url
    if stage=="sequences":
        return f"data/downloaded_{origin_wildcard}.fasta" if remote else path_or_url
    if stage=="aligned":
        return f"results/precomputed-aligned_{origin_wildcard}.fasta" if remote else f"results/aligned_{origin_wildcard}.fasta"
    if stage=="to-exclude":
        return f"results/precomputed-to-exclude_{origin_wildcard}.txt" if remote else f"results/to-exclude_{origin_wildcard}.txt"
    if stage=="masked":
        return f"results/precomputed-masked_{origin_wildcard}.fasta" if remote else f"results/masked_{origin_wildcard}.fasta"
    if stage=="filtered":
        if remote:
            return f"results/precomputed-filtered_{origin_wildcard}.fasta"
        elif path_or_url:
            # A local, precomputed filtered file may be supplied directly.
            return path_or_url
        else:
            return f"results/filtered_{origin_wildcard}.fasta"

    raise Exception(f"_get_path_for_input with unknown stage \"{stage}\"")


def _get_unified_metadata(wildcards):
    """
    Returns a single metadata file representing the input metadata file(s).

    If there was only one supplied metadata file in the ``config["inputs"]``
    dict, then the sanitized metadata for that single origin is returned.
    Else "results/combined_metadata.tsv" is returned, which will run the
    `combine_input_metadata` rule to make it.
    """
    inputs = config["inputs"]
    if len(inputs) == 1:
        # Exactly one origin: no combining needed, use its sanitized file.
        origin = next(iter(inputs))
        return f"results/sanitized_metadata_{origin}.tsv"
    return "results/combined_metadata.tsv"

def _get_unified_alignment(wildcards):
    """
    Returns a single alignment file representing the filtered input(s).

    If exactly one input is defined in ``config["inputs"]``, the filtered
    file for that origin is returned (possibly a precomputed remote file).
    Otherwise the combined-sequences file is returned, triggering the rule
    that concatenates the per-origin filtered sequences.
    """
    inputs = config["inputs"]
    if len(inputs) == 1:
        return _get_path_for_input("filtered", next(iter(inputs)))
    # NOTE(review): the trailing comma makes this a 1-tuple, which Snakemake
    # accepts as an input — kept as-is to preserve behavior.
    return "results/combined_sequences_for_subsampling.fasta",

def _get_metadata_by_build_name(build_name):
Expand Down
Loading

0 comments on commit c7a707f

Please sign in to comment.