From 97d3d40063421fd087a04ef21107157bf14fe488 Mon Sep 17 00:00:00 2001 From: Sidney Bell Date: Tue, 25 Jan 2022 09:47:24 -0800 Subject: [PATCH 1/6] Align subsampled sequences with Nextclade Replaces Nextalign with Nextclade in the build alignment rule, to get Nextclade QC values that get used by the diagnostics script. This change allows all users to benefit from diagnostics filters instead of just users who had previously updated their metadata to include Nextclade annotations. Notably, this change adds and uses a script from ncov-ingest to join metadata and Nextclade QC. This script adds annotations like the clock rate divergence that the diagnostics script looks for and that do not otherwise get annotated by Nextclade alone. This script also includes the mapping of all Nextclade output columns to what we use in Nextstrain builds, so it's easier to copy these details from ncov-ingest. --- scripts/join-metadata-and-clades.py | 129 +++++++++++++++++++++ workflow/envs/nextstrain.yaml | 1 + workflow/snakemake_rules/main_workflow.smk | 57 +++++++-- 3 files changed, 176 insertions(+), 11 deletions(-) create mode 100644 scripts/join-metadata-and-clades.py diff --git a/scripts/join-metadata-and-clades.py b/scripts/join-metadata-and-clades.py new file mode 100644 index 000000000..93bf47301 --- /dev/null +++ b/scripts/join-metadata-and-clades.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python3 +""" +Copied from https://github.com/nextstrain/ncov-ingest/blob/master/bin/join-metadata-and-clades +""" +import argparse +import sys +from datetime import datetime +import pandas as pd +import numpy as np + +INSERT_BEFORE_THIS_COLUMN = "pango_lineage" +METADATA_JOIN_COLUMN_NAME = 'strain' +NEXTCLADE_JOIN_COLUMN_NAME = 'seqName' +VALUE_MISSING_DATA = '?' + +rate_per_day = 0.0007 * 29903 / 365 +reference_day = datetime(2020,1,1).toordinal() + +column_map = { + "clade": "Nextstrain_clade", + "totalMissing": "missing_data", + "totalSubstitutions": "divergence", + "totalNonACGTNs": "nonACGTN", + "privateNucMutations.totalUnlabeledSubstitutions": "rare_mutations", + "privateNucMutations.totalReversionSubstitutions": "reversion_mutations", + "privateNucMutations.totalLabeledSubstitutions": "potential_contaminants", + "qc.missingData.status": "QC_missing_data", + "qc.mixedSites.status": "QC_mixed_sites", + "qc.privateMutations.status": "QC_rare_mutations", + "qc.snpClusters.status": "QC_snp_clusters", + "qc.frameShifts.status": "QC_frame_shifts", + "qc.stopCodons.status": "QC_stop_codons", + "frameShifts": "frame_shifts", + "deletions": "deletions", + "insertions": "insertions", + "substitutions": "substitutions", + "aaSubstitutions": "aaSubstitutions" +} + +preferred_types = { + "divergence": "int32", + "nonACGTN": "int32", + "missing_data": "int32", + "snp_clusters": "int32", + "rare_mutations": "int32" +} + +def reorder_columns(result: pd.DataFrame): + """ + Moves the new clade column after a specified column + """ + columns = list(result.columns) + columns.remove(column_map['clade']) + insert_at = columns.index(INSERT_BEFORE_THIS_COLUMN) + columns.insert(insert_at, column_map['clade']) + return result[columns] + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Joins metadata file with Nextclade clade output", + ) + parser.add_argument("first_file") + parser.add_argument("second_file") + parser.add_argument("-o", default=sys.stdout) + return parser.parse_args() + +def datestr_to_ordinal(x): + try: + return datetime.strptime(x,"%Y-%m-%d").toordinal() + except: + return np.nan + +def isfloat(value): + try: + float(value) + return True + except ValueError: + return False + +def main(): + args = parse_args() + + metadata = pd.read_csv(args.first_file, index_col=METADATA_JOIN_COLUMN_NAME, + sep='\t', low_memory=False, na_filter = False) + + # Read and rename clade column to be more descriptive + clades = pd.read_csv(args.second_file, index_col=NEXTCLADE_JOIN_COLUMN_NAME, + sep='\t', low_memory=False, na_filter = False) \ + .rename(columns=column_map) + + clades = clades[list(column_map.values())] + + # Concatenate on columns + result = pd.merge( + metadata, clades, + left_index=True, + right_index=True, + how='left', + suffixes=["_original", ""], + ) + all_clades = result.Nextstrain_clade.unique() + t = result["date"].apply(datestr_to_ordinal) + div_array = np.array([float(x) if isfloat(x) else np.nan for x in result.divergence]) + offset_by_clade = {} + for clade in all_clades: + ind = result.Nextstrain_clade==clade + if ind.sum()>100: + deviation = div_array[ind] - (t[ind] - reference_day)*rate_per_day + offset_by_clade[clade] = np.mean(deviation[~np.isnan(deviation)]) + + # extract divergence, time and offset information into vectors or series + offset = result["Nextstrain_clade"].apply(lambda x: offset_by_clade.get(x, 2.0)) + # calculate divergence + result["clock_deviation"] = np.array(div_array - ((t-reference_day)*rate_per_day + offset), dtype=int) + result.loc[np.isnan(div_array)|np.isnan(t), "clock_deviation"] = np.nan + + for col in list(column_map.values()) + ["clock_deviation"]: + result[col] = result[col].fillna(VALUE_MISSING_DATA) + + # Move the new column so that it's next to other clade columns + if INSERT_BEFORE_THIS_COLUMN in result.columns: + result = reorder_columns(result) #.astype(preferred_types) + + result.to_csv(args.o, index_label=METADATA_JOIN_COLUMN_NAME, sep='\t') + + +if __name__ == '__main__': + main() diff --git a/workflow/envs/nextstrain.yaml b/workflow/envs/nextstrain.yaml index 28b54c5a2..4122ceb3d 100644 --- a/workflow/envs/nextstrain.yaml +++ b/workflow/envs/nextstrain.yaml @@ -8,6 +8,7 @@ dependencies: - epiweeks=2.1.2 - iqtree=2.1.4-beta - nextalign=1.9.0 + - nextclade=1.10.1 - pangolin=3.1.17 - pangolearn=2021.12.06 - python>=3.7* diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk index 5817b8eae..a6b0c17c6 100644 --- a/workflow/snakemake_rules/main_workflow.smk +++ b/workflow/snakemake_rules/main_workflow.smk @@ -444,19 +444,35 @@ rule combine_samples: --output-metadata {output.metadata} 2>&1 | tee {log} """ +rule prepare_nextclade: + message: + """ + Downloading reference files for nextclade (used for alignment and qc). + """ + output: + nextclade_dataset = directory("data/sars-cov-2-nextclade-defaults"), + params: + name = "sars-cov-2", + shell: + """ + nextclade dataset get --name {params.name} --output-dir {output.nextclade_dataset} + """ + rule build_align: message: """ - Aligning sequences to {input.reference} + Running nextclade QC and aligning sequences to {input.reference} - gaps relative to reference are considered real """ input: sequences = rules.combine_samples.output.sequences, genemap = config["files"]["annotation"], - reference = config["files"]["alignment_reference"] + reference = config["files"]["alignment_reference"], + nextclade_dataset = rules.prepare_nextclade.output.nextclade_dataset, output: alignment = "results/{build_name}/aligned.fasta", insertions = "results/{build_name}/insertions.tsv", + nextclade_qc = 'results/{build_name}/nextclade_qc.tsv', translations = expand("results/{{build_name}}/translations/aligned.gene.{gene}.fasta", gene=config.get('genes', ['S'])) params: outdir = "results/{build_name}/translations", @@ -472,22 +488,41 @@ rule build_align: mem_mb=3000 shell: """ - xz -c -d {input.sequences} | nextalign \ - --jobs={threads} \ + xz -c -d {input.sequences} | nextclade run \ + --jobs {threads} \ + --input-fasta /dev/stdin \ --reference {input.reference} \ - --genemap {input.genemap} \ - --genes {params.genes} \ - --sequences /dev/stdin \ + --input-dataset {input.nextclade_dataset} \ + --output-tsv {output.nextclade_qc} \ --output-dir {params.outdir} \ --output-basename {params.basename} \ --output-fasta {output.alignment} \ - --output-insertions {output.insertions} > {log} 2>&1 + --output-insertions {output.insertions} 2>&1 | tee {log} + """ + +rule join_metadata_and_nextclade_qc: + input: + metadata = "results/{build_name}/{build_name}_subsampled_metadata.tsv.xz", + nextclade_qc = "results/{build_name}/nextclade_qc.tsv", + output: + metadata = "results/{build_name}/metadata_with_nextclade_qc.tsv", + log: + "logs/join_metadata_and_nextclade_qc_{build_name}.txt", + benchmark: + "benchmarks/join_metadata_and_nextclade_qc_{build_name}.txt", + conda: config["conda_environment"] + shell: + """ + python3 scripts/join-metadata-and-clades.py \ + {input.metadata} \ + {input.nextclade_qc} \ + -o {output.metadata} 2>&1 | tee {log} """ rule diagnostic: message: "Scanning metadata {input.metadata} for problematic sequences. Removing sequences with >{params.clock_filter} deviation from the clock and with more than {params.snp_clusters}." input: - metadata = "results/{build_name}/{build_name}_subsampled_metadata.tsv.xz", + metadata = "results/{build_name}/metadata_with_nextclade_qc.tsv", output: to_exclude = "results/{build_name}/excluded_by_diagnostics.txt" params: @@ -596,7 +631,7 @@ rule index: rule annotate_metadata_with_index: input: - metadata="results/{build_name}/{build_name}_subsampled_metadata.tsv.xz", + metadata="results/{build_name}/metadata_with_nextclade_qc.tsv", sequence_index = "results/{build_name}/sequence_index.tsv", output: metadata="results/{build_name}/metadata_with_index.tsv", @@ -716,7 +751,7 @@ rule adjust_metadata_regions: Adjusting metadata for build '{wildcards.build_name}' """ input: - metadata="results/{build_name}/{build_name}_subsampled_metadata.tsv.xz", + metadata="results/{build_name}/metadata_with_index.tsv", output: metadata = "results/{build_name}/metadata_adjusted.tsv.xz" params: From 4edbb0721a0762edbf80b6fd4ac088e12dffefce Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Fri, 28 Jan 2022 16:36:50 -0800 Subject: [PATCH 2/6] Skip Nextclade annotations if they exist already Checks for existing Nextclade QC columns in the given metadata and skips merge of the metadata and given Nextclade QC file when these existing annotations are complete. This check prevents overwriting of the Nextstrain team's Nextclade annotations from ncov-ingest. The ingest workflow always downloads and runs the latest Nextclade release, while this workflow depends on the version in the Docker image or conda environment that could be slightly outdated. The check for existing annotations was originally complicated by the renaming of columns by the sanitize metadata step where an original column like `aaSubstitution` would be renamed to `aa_substitution`. We could update the `column_map` in the join script to map `aaSubstitution` to `aa_substitution`, but the production Nextstrain builds _skip_ sanitizing metadata, so they would retain the `aaSubstitution` column and end up getting reannotated Nextclade QC metrics here. To simplify this check a bit, this commit changes the sanitized column name for amino acid substitutions to `aaSubstitution`. --- defaults/parameters.yaml | 3 +-- scripts/join-metadata-and-clades.py | 19 ++++++++++++++++++- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/defaults/parameters.yaml b/defaults/parameters.yaml index 2369f9f50..7f699feea 100644 --- a/defaults/parameters.yaml +++ b/defaults/parameters.yaml @@ -37,8 +37,7 @@ sanitize_metadata: - "Lineage=pango_lineage" - "Pangolin version=pangolin_version" - "Variant=variant" - - "AA Substitutions=aa_substitutions" - - "aaSubstitutions=aa_substitutions" + - "AA Substitutions=aaSubstitutions" - "Submission date=date_submitted" - "Is reference?=is_reference" - "Is complete?=is_complete" diff --git a/scripts/join-metadata-and-clades.py b/scripts/join-metadata-and-clades.py index 93bf47301..12dfc2ee1 100644 --- a/scripts/join-metadata-and-clades.py +++ b/scripts/join-metadata-and-clades.py @@ -82,7 +82,24 @@ def main(): args = parse_args() metadata = pd.read_csv(args.first_file, index_col=METADATA_JOIN_COLUMN_NAME, - sep='\t', low_memory=False, na_filter = False) + sep='\t', low_memory=False) + + # Check for existing annotations in the given metadata. Skip join with + # Nextclade QC file, if those annotations already exist and none of the + # columns have empty values. In the case where metadata were combined from + # different sources with and without annotations, the "clock_deviation" + # column will exist but some values will be missing. We handle this case as + # if the annotations do not exist at all and reannotate all columns. We + # cannot look for missing values across all expected columns as evidence of + # incomplete annotations, since a complete annotation by Nextclade will + # include missing values for some columns by design. + expected_columns = list(column_map.values()) + ["clock_deviation"] + existing_annotation_columns = metadata.columns.intersection(expected_columns) + if len(existing_annotation_columns) == len(expected_columns): + if metadata["clock_deviation"].isnull().sum() == 0: + print(f"Metadata file '{args.first_file}' has already been annotated with Nextclade QC columns. Skipping re-annotation.") + metadata.to_csv(args.o, sep="\t") + return # Read and rename clade column to be more descriptive clades = pd.read_csv(args.second_file, index_col=NEXTCLADE_JOIN_COLUMN_NAME, From 7e00b750354093ca959d2514a90690f9bd41b8d0 Mon Sep 17 00:00:00 2001 From: Sidney Bell Date: Thu, 27 Jan 2022 17:11:16 -0800 Subject: [PATCH 3/6] Add note to changelog --- docs/src/reference/change_log.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/src/reference/change_log.md b/docs/src/reference/change_log.md index cce47574d..1be9cfc20 100644 --- a/docs/src/reference/change_log.md +++ b/docs/src/reference/change_log.md @@ -5,6 +5,8 @@ We also use this change log to document new features that maintain backward comp ## New features since last version update +- 31 January 2022: Run Nextclade QC and filtering on the final sample set before building a tree. Nextclade also runs `nextalign` under the hood. Importantly, this enables filtering the final sample set to omit strains with many reversions and/or possible contaminants, significantly improving the quality of omicron trees. + - 29 January 2022: Update "mutational fitness" coloring based on latest results from [Obermeyer et al model](https://www.medrxiv.org/content/10.1101/2021.09.07.21263228v1) via [github.com/broadinstitute/pyro-cov/](https://github.com/broadinstitute/pyro-cov/blob/master/paper/mutations.tsv). ## v10 (5 January 2022) From 71a12e18d1f61c5e7ece463c7a352b4527b0e82a Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Wed, 2 Feb 2022 15:41:57 -0800 Subject: [PATCH 4/6] Document the `skip_diagnostics` filter parameter --- docs/src/reference/configuration.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/src/reference/configuration.md b/docs/src/reference/configuration.md index f637ba962..805d986a6 100644 --- a/docs/src/reference/configuration.md +++ b/docs/src/reference/configuration.md @@ -209,6 +209,11 @@ Builds support any named attributes that can be referenced by subsampling scheme * description: Minimum collection date for strains to include in the analysis used by `augur filter --min-date`. Dates can be numeric floating point values (e.g., `2019.74`) or ISO 8601-style strings (e.g., `2019-10-01`). * default: `2019.74` +### skip_diagnostics +* type: boolean +* description: Skip filtering by Nextclade quality control metrics like clock rate deviation, number of SNP clusters, possible contaminations, etc. +* default: `false` + ## frequencies ### min_date * type: float or string From 574411e84f92cdfb3762de258cde8fcee244f068 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Wed, 2 Feb 2022 16:01:28 -0800 Subject: [PATCH 5/6] Support older versions of Nextclade Although we'd prefer to use the latest version of Nextclade, users may have older versions installed, so we cannot assume their QC output will match all of the columns found in the latest release. This commit uses the intersection of QC columns and expected columns to support older Nextclade versions. --- scripts/join-metadata-and-clades.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/join-metadata-and-clades.py b/scripts/join-metadata-and-clades.py index 12dfc2ee1..87cce026f 100644 --- a/scripts/join-metadata-and-clades.py +++ b/scripts/join-metadata-and-clades.py @@ -106,7 +106,8 @@ def main(): sep='\t', low_memory=False, na_filter = False) \ .rename(columns=column_map) - clades = clades[list(column_map.values())] + clade_columns = clades.columns.intersection(list(column_map.values())) + clades = clades[clade_columns] # Concatenate on columns result = pd.merge( From 46fa71eab905ab7206ddfc46cc6f05de71b4e394 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Wed, 2 Feb 2022 16:06:14 -0800 Subject: [PATCH 6/6] Update change log to reflect new release --- docs/src/reference/change_log.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/src/reference/change_log.md b/docs/src/reference/change_log.md index 1be9cfc20..e363512ea 100644 --- a/docs/src/reference/change_log.md +++ b/docs/src/reference/change_log.md @@ -3,9 +3,11 @@ As of April 2021, we use major version numbers (e.g. v2) to reflect backward incompatible changes to the workflow that likely require you to update your Nextstrain installation. We also use this change log to document new features that maintain backward compatibility, indicating these features by the date they were added. -## New features since last version update +## v11 (3 February 2022) + +- Run Nextclade QC and filtering on the final sample set before building a tree. Nextclade also runs `nextalign` under the hood. Importantly, this enables filtering the final sample set to omit strains with many reversions and/or possible contaminants, significantly improving the quality of Omicron trees. [See the original pull request for more details](https://github.com/nextstrain/ncov/pull/842). To disable this filtering by Nextclade quality control metrics, set `skip_diagnostics: true` in [the `filter` section of your build configuration file](https://docs.nextstrain.org/projects/ncov/en/latest/reference/configuration.html#filter). -- 31 January 2022: Run Nextclade QC and filtering on the final sample set before building a tree. Nextclade also runs `nextalign` under the hood. Importantly, this enables filtering the final sample set to omit strains with many reversions and/or possible contaminants, significantly improving the quality of omicron trees. +## New features since last version update - 29 January 2022: Update "mutational fitness" coloring based on latest results from [Obermeyer et al model](https://www.medrxiv.org/content/10.1101/2021.09.07.21263228v1) via [github.com/broadinstitute/pyro-cov/](https://github.com/broadinstitute/pyro-cov/blob/master/paper/mutations.tsv).