Skip to content

Commit

Permalink
Merge pull request #842 from nextstrain/nextclade-align
Browse files Browse the repository at this point in the history
Nextclade align
  • Loading branch information
huddlej authored Feb 3, 2022
2 parents 59da03e + 46fa71e commit c5699a6
Show file tree
Hide file tree
Showing 6 changed files with 204 additions and 13 deletions.
3 changes: 1 addition & 2 deletions defaults/parameters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,7 @@ sanitize_metadata:
- "Lineage=pango_lineage"
- "Pangolin version=pangolin_version"
- "Variant=variant"
- "AA Substitutions=aa_substitutions"
- "aaSubstitutions=aa_substitutions"
- "AA Substitutions=aaSubstitutions"
- "Submission date=date_submitted"
- "Is reference?=is_reference"
- "Is complete?=is_complete"
Expand Down
4 changes: 4 additions & 0 deletions docs/src/reference/change_log.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
As of April 2021, we use major version numbers (e.g. v2) to reflect backward incompatible changes to the workflow that likely require you to update your Nextstrain installation.
We also use this change log to document new features that maintain backward compatibility, indicating these features by the date they were added.

## v11 (3 February 2022)

- Run Nextclade QC and filtering on the final sample set before building a tree. Nextclade also runs `nextalign` under the hood. Importantly, this enables filtering the final sample set to omit strains with many reversions and/or possible contaminants, significantly improving the quality of Omicron trees. [See the original pull request for more details](https://github.com/nextstrain/ncov/pull/842). To disable this filtering by Nextclade quality control metrics, set `skip_diagnostics: true` in [the `filter` section of your build configuration file](https://docs.nextstrain.org/projects/ncov/en/latest/reference/configuration.html#filter).

## New features since last version update

- 29 January 2022: Update "mutational fitness" coloring based on latest results from [Obermeyer et al model](https://www.medrxiv.org/content/10.1101/2021.09.07.21263228v1) via [github.com/broadinstitute/pyro-cov/](https://github.com/broadinstitute/pyro-cov/blob/master/paper/mutations.tsv).
Expand Down
5 changes: 5 additions & 0 deletions docs/src/reference/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,11 @@ Builds support any named attributes that can be referenced by subsampling scheme
* description: Minimum collection date for strains to include in the analysis used by `augur filter --min-date`. Dates can be numeric floating point values (e.g., `2019.74`) or ISO 8601-style strings (e.g., `2019-10-01`).
* default: `2019.74`

### skip_diagnostics
* type: boolean
* description: Skip filtering by Nextclade quality control metrics like clock rate deviation, number of SNP clusters, possible contaminations, etc.
* default: `false`

## frequencies
### min_date
* type: float or string
Expand Down
147 changes: 147 additions & 0 deletions scripts/join-metadata-and-clades.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
#!/usr/bin/env python3
"""
Copied from https://github.com/nextstrain/ncov-ingest/blob/master/bin/join-metadata-and-clades
"""
import argparse
import sys
from datetime import datetime
import pandas as pd
import numpy as np

INSERT_BEFORE_THIS_COLUMN = "pango_lineage"
METADATA_JOIN_COLUMN_NAME = 'strain'
NEXTCLADE_JOIN_COLUMN_NAME = 'seqName'
VALUE_MISSING_DATA = '?'

rate_per_day = 0.0007 * 29903 / 365
reference_day = datetime(2020,1,1).toordinal()

column_map = {
"clade": "Nextstrain_clade",
"totalMissing": "missing_data",
"totalSubstitutions": "divergence",
"totalNonACGTNs": "nonACGTN",
"privateNucMutations.totalUnlabeledSubstitutions": "rare_mutations",
"privateNucMutations.totalReversionSubstitutions": "reversion_mutations",
"privateNucMutations.totalLabeledSubstitutions": "potential_contaminants",
"qc.missingData.status": "QC_missing_data",
"qc.mixedSites.status": "QC_mixed_sites",
"qc.privateMutations.status": "QC_rare_mutations",
"qc.snpClusters.status": "QC_snp_clusters",
"qc.frameShifts.status": "QC_frame_shifts",
"qc.stopCodons.status": "QC_stop_codons",
"frameShifts": "frame_shifts",
"deletions": "deletions",
"insertions": "insertions",
"substitutions": "substitutions",
"aaSubstitutions": "aaSubstitutions"
}

preferred_types = {
"divergence": "int32",
"nonACGTN": "int32",
"missing_data": "int32",
"snp_clusters": "int32",
"rare_mutations": "int32"
}

def reorder_columns(result: pd.DataFrame):
"""
Moves the new clade column after a specified column
"""
columns = list(result.columns)
columns.remove(column_map['clade'])
insert_at = columns.index(INSERT_BEFORE_THIS_COLUMN)
columns.insert(insert_at, column_map['clade'])
return result[columns]


def parse_args():
parser = argparse.ArgumentParser(
description="Joins metadata file with Nextclade clade output",
)
parser.add_argument("first_file")
parser.add_argument("second_file")
parser.add_argument("-o", default=sys.stdout)
return parser.parse_args()

def datestr_to_ordinal(x):
try:
return datetime.strptime(x,"%Y-%m-%d").toordinal()
except:
return np.nan

def isfloat(value):
try:
float(value)
return True
except ValueError:
return False

def main():
args = parse_args()

metadata = pd.read_csv(args.first_file, index_col=METADATA_JOIN_COLUMN_NAME,
sep='\t', low_memory=False)

# Check for existing annotations in the given metadata. Skip join with
# Nextclade QC file, if those annotations already exist and none of the
# columns have empty values. In the case where metadata were combined from
# different sources with and without annotations, the "clock_deviation"
# column will exist but some values will be missing. We handle this case as
# if the annotations do not exist at all and reannotate all columns. We
# cannot look for missing values across all expected columns as evidence of
# incomplete annotations, since a complete annotation by Nextclade will
# include missing values for some columns by design.
expected_columns = list(column_map.values()) + ["clock_deviation"]
existing_annotation_columns = metadata.columns.intersection(expected_columns)
if len(existing_annotation_columns) == len(expected_columns):
if metadata["clock_deviation"].isnull().sum() == 0:
print(f"Metadata file '{args.first_file}' has already been annotated with Nextclade QC columns. Skipping re-annotation.")
metadata.to_csv(args.o, sep="\t")
return

# Read and rename clade column to be more descriptive
clades = pd.read_csv(args.second_file, index_col=NEXTCLADE_JOIN_COLUMN_NAME,
sep='\t', low_memory=False, na_filter = False) \
.rename(columns=column_map)

clade_columns = clades.columns.intersection(list(column_map.values()))
clades = clades[clade_columns]

# Concatenate on columns
result = pd.merge(
metadata, clades,
left_index=True,
right_index=True,
how='left',
suffixes=["_original", ""],
)
all_clades = result.Nextstrain_clade.unique()
t = result["date"].apply(datestr_to_ordinal)
div_array = np.array([float(x) if isfloat(x) else np.nan for x in result.divergence])
offset_by_clade = {}
for clade in all_clades:
ind = result.Nextstrain_clade==clade
if ind.sum()>100:
deviation = div_array[ind] - (t[ind] - reference_day)*rate_per_day
offset_by_clade[clade] = np.mean(deviation[~np.isnan(deviation)])

# extract divergence, time and offset information into vectors or series
offset = result["Nextstrain_clade"].apply(lambda x: offset_by_clade.get(x, 2.0))
# calculate divergence
result["clock_deviation"] = np.array(div_array - ((t-reference_day)*rate_per_day + offset), dtype=int)
result.loc[np.isnan(div_array)|np.isnan(t), "clock_deviation"] = np.nan

for col in list(column_map.values()) + ["clock_deviation"]:
result[col] = result[col].fillna(VALUE_MISSING_DATA)

# Move the new column so that it's next to other clade columns
if INSERT_BEFORE_THIS_COLUMN in result.columns:
result = reorder_columns(result) #.astype(preferred_types)

result.to_csv(args.o, index_label=METADATA_JOIN_COLUMN_NAME, sep='\t')


if __name__ == '__main__':
main()
1 change: 1 addition & 0 deletions workflow/envs/nextstrain.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ dependencies:
- epiweeks=2.1.2
- iqtree=2.1.4-beta
- nextalign=1.9.0
- nextclade=1.10.1
- pangolin=3.1.17
- pangolearn=2021.12.06
- python>=3.7*
57 changes: 46 additions & 11 deletions workflow/snakemake_rules/main_workflow.smk
Original file line number Diff line number Diff line change
Expand Up @@ -444,19 +444,35 @@ rule combine_samples:
--output-metadata {output.metadata} 2>&1 | tee {log}
"""

rule prepare_nextclade:
message:
"""
Downloading reference files for nextclade (used for alignment and qc).
"""
output:
nextclade_dataset = directory("data/sars-cov-2-nextclade-defaults"),
params:
name = "sars-cov-2",
shell:
"""
nextclade dataset get --name {params.name} --output-dir {output.nextclade_dataset}
"""

rule build_align:
message:
"""
Aligning sequences to {input.reference}
Running nextclade QC and aligning sequences to {input.reference}
- gaps relative to reference are considered real
"""
input:
sequences = rules.combine_samples.output.sequences,
genemap = config["files"]["annotation"],
reference = config["files"]["alignment_reference"]
reference = config["files"]["alignment_reference"],
nextclade_dataset = rules.prepare_nextclade.output.nextclade_dataset,
output:
alignment = "results/{build_name}/aligned.fasta",
insertions = "results/{build_name}/insertions.tsv",
nextclade_qc = 'results/{build_name}/nextclade_qc.tsv',
translations = expand("results/{{build_name}}/translations/aligned.gene.{gene}.fasta", gene=config.get('genes', ['S']))
params:
outdir = "results/{build_name}/translations",
Expand All @@ -472,22 +488,41 @@ rule build_align:
mem_mb=3000
shell:
"""
xz -c -d {input.sequences} | nextalign \
--jobs={threads} \
xz -c -d {input.sequences} | nextclade run \
--jobs {threads} \
--input-fasta /dev/stdin \
--reference {input.reference} \
--genemap {input.genemap} \
--genes {params.genes} \
--sequences /dev/stdin \
--input-dataset {input.nextclade_dataset} \
--output-tsv {output.nextclade_qc} \
--output-dir {params.outdir} \
--output-basename {params.basename} \
--output-fasta {output.alignment} \
--output-insertions {output.insertions} > {log} 2>&1
--output-insertions {output.insertions} 2>&1 | tee {log}
"""

rule join_metadata_and_nextclade_qc:
input:
metadata = "results/{build_name}/{build_name}_subsampled_metadata.tsv.xz",
nextclade_qc = "results/{build_name}/nextclade_qc.tsv",
output:
metadata = "results/{build_name}/metadata_with_nextclade_qc.tsv",
log:
"logs/join_metadata_and_nextclade_qc_{build_name}.txt",
benchmark:
"benchmarks/join_metadata_and_nextclade_qc_{build_name}.txt",
conda: config["conda_environment"]
shell:
"""
python3 scripts/join-metadata-and-clades.py \
{input.metadata} \
{input.nextclade_qc} \
-o {output.metadata} 2>&1 | tee {log}
"""

rule diagnostic:
message: "Scanning metadata {input.metadata} for problematic sequences. Removing sequences with >{params.clock_filter} deviation from the clock and with more than {params.snp_clusters}."
input:
metadata = "results/{build_name}/{build_name}_subsampled_metadata.tsv.xz",
metadata = "results/{build_name}/metadata_with_nextclade_qc.tsv",
output:
to_exclude = "results/{build_name}/excluded_by_diagnostics.txt"
params:
Expand Down Expand Up @@ -596,7 +631,7 @@ rule index:

rule annotate_metadata_with_index:
input:
metadata="results/{build_name}/{build_name}_subsampled_metadata.tsv.xz",
metadata="results/{build_name}/metadata_with_nextclade_qc.tsv",
sequence_index = "results/{build_name}/sequence_index.tsv",
output:
metadata="results/{build_name}/metadata_with_index.tsv",
Expand Down Expand Up @@ -716,7 +751,7 @@ rule adjust_metadata_regions:
Adjusting metadata for build '{wildcards.build_name}'
"""
input:
metadata="results/{build_name}/{build_name}_subsampled_metadata.tsv.xz",
metadata="results/{build_name}/metadata_with_index.tsv",
output:
metadata = "results/{build_name}/metadata_adjusted.tsv.xz"
params:
Expand Down

0 comments on commit c5699a6

Please sign in to comment.