From 97d3d40063421fd087a04ef21107157bf14fe488 Mon Sep 17 00:00:00 2001
From: Sidney Bell <sidneymbell@gmail.com>
Date: Tue, 25 Jan 2022 09:47:24 -0800
Subject: [PATCH 1/6] Align subsampled sequences with Nextclade

Replaces Nextalign with Nextclade in the build alignment rule, to get
Nextclade QC values that get used by the diagnostics script. This change
allows all users to benefit from diagnostics filters instead of just
users who had previously updated their metadata to include Nextclade
annotations.

Notably, this change adds and uses a script from ncov-ingest to join
metadata and Nextclade QC. This script adds annotations like the clock
rate divergence that the diagnostics script looks for and that do not
otherwise get annotated by Nextclade alone. This script also includes
the mapping of all Nextclade output columns to what we use in Nextstrain
builds, so it's easier to copy these details from ncov-ingest.
---
 scripts/join-metadata-and-clades.py        | 129 +++++++++++++++++++++
 workflow/envs/nextstrain.yaml              |   1 +
 workflow/snakemake_rules/main_workflow.smk |  57 +++++++--
 3 files changed, 176 insertions(+), 11 deletions(-)
 create mode 100644 scripts/join-metadata-and-clades.py

diff --git a/scripts/join-metadata-and-clades.py b/scripts/join-metadata-and-clades.py
new file mode 100644
index 000000000..93bf47301
--- /dev/null
+++ b/scripts/join-metadata-and-clades.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python3
+"""
+Copied from https://github.com/nextstrain/ncov-ingest/blob/master/bin/join-metadata-and-clades
+"""
+import argparse
+import sys
+from datetime import datetime
+import pandas as pd
+import numpy as np
+
+INSERT_BEFORE_THIS_COLUMN = "pango_lineage"
+METADATA_JOIN_COLUMN_NAME = 'strain'
+NEXTCLADE_JOIN_COLUMN_NAME = 'seqName'
+VALUE_MISSING_DATA = '?'
+
+rate_per_day = 0.0007 * 29903 / 365
+reference_day = datetime(2020,1,1).toordinal()
+
+column_map = {
+    "clade": "Nextstrain_clade",
+    "totalMissing": "missing_data",
+    "totalSubstitutions": "divergence",
+    "totalNonACGTNs": "nonACGTN",
+    "privateNucMutations.totalUnlabeledSubstitutions":  "rare_mutations",
+    "privateNucMutations.totalReversionSubstitutions": "reversion_mutations",
+    "privateNucMutations.totalLabeledSubstitutions": "potential_contaminants",
+    "qc.missingData.status": "QC_missing_data",
+    "qc.mixedSites.status": "QC_mixed_sites",
+    "qc.privateMutations.status": "QC_rare_mutations",
+    "qc.snpClusters.status": "QC_snp_clusters",
+    "qc.frameShifts.status": "QC_frame_shifts",
+    "qc.stopCodons.status": "QC_stop_codons",
+    "frameShifts": "frame_shifts",
+    "deletions": "deletions",
+    "insertions": "insertions",
+    "substitutions": "substitutions",
+    "aaSubstitutions": "aaSubstitutions"
+}
+
+preferred_types = {
+    "divergence": "int32",
+    "nonACGTN": "int32",
+    "missing_data": "int32",
+    "snp_clusters": "int32",
+    "rare_mutations": "int32"
+}
+
+def reorder_columns(result: pd.DataFrame):
+    """
+    Moves the new clade column after a specified column
+    """
+    columns = list(result.columns)
+    columns.remove(column_map['clade'])
+    insert_at = columns.index(INSERT_BEFORE_THIS_COLUMN)
+    columns.insert(insert_at, column_map['clade'])
+    return result[columns]
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Joins metadata file with Nextclade clade output",
+    )
+    parser.add_argument("first_file")
+    parser.add_argument("second_file")
+    parser.add_argument("-o", default=sys.stdout)
+    return parser.parse_args()
+
+def datestr_to_ordinal(x):
+    try:
+        return datetime.strptime(x,"%Y-%m-%d").toordinal()
+    except:
+        return np.nan
+
+def isfloat(value):
+  try:
+    float(value)
+    return True
+  except ValueError:
+    return False
+
+def main():
+    args = parse_args()
+
+    metadata = pd.read_csv(args.first_file, index_col=METADATA_JOIN_COLUMN_NAME,
+                           sep='\t', low_memory=False, na_filter = False)
+
+    # Read and rename clade column to be more descriptive
+    clades = pd.read_csv(args.second_file, index_col=NEXTCLADE_JOIN_COLUMN_NAME,
+                         sep='\t', low_memory=False, na_filter = False) \
+            .rename(columns=column_map)
+
+    clades = clades[list(column_map.values())]
+
+    # Concatenate on columns
+    result = pd.merge(
+        metadata, clades,
+        left_index=True,
+        right_index=True,
+        how='left',
+        suffixes=["_original", ""],
+    )
+    all_clades = result.Nextstrain_clade.unique()
+    t = result["date"].apply(datestr_to_ordinal)
+    div_array = np.array([float(x) if isfloat(x) else np.nan for x in result.divergence])
+    offset_by_clade = {}
+    for clade in all_clades:
+        ind = result.Nextstrain_clade==clade
+        if ind.sum()>100:
+            deviation = div_array[ind] - (t[ind] - reference_day)*rate_per_day
+            offset_by_clade[clade] = np.mean(deviation[~np.isnan(deviation)])
+
+    # extract divergence, time and offset information into vectors or series
+    offset = result["Nextstrain_clade"].apply(lambda x: offset_by_clade.get(x, 2.0))
+    # calculate divergence
+    result["clock_deviation"] = np.array(div_array - ((t-reference_day)*rate_per_day + offset), dtype=int)
+    result.loc[np.isnan(div_array)|np.isnan(t), "clock_deviation"] = np.nan
+
+    for col in list(column_map.values()) + ["clock_deviation"]:
+        result[col] = result[col].fillna(VALUE_MISSING_DATA)
+
+    # Move the new column so that it's next to other clade columns
+    if INSERT_BEFORE_THIS_COLUMN in result.columns:
+        result = reorder_columns(result) #.astype(preferred_types)
+
+    result.to_csv(args.o, index_label=METADATA_JOIN_COLUMN_NAME, sep='\t')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/workflow/envs/nextstrain.yaml b/workflow/envs/nextstrain.yaml
index 28b54c5a2..4122ceb3d 100644
--- a/workflow/envs/nextstrain.yaml
+++ b/workflow/envs/nextstrain.yaml
@@ -8,6 +8,7 @@ dependencies:
   - epiweeks=2.1.2
   - iqtree=2.1.4-beta
   - nextalign=1.9.0
+  - nextclade=1.10.1
   - pangolin=3.1.17
   - pangolearn=2021.12.06
   - python>=3.7*
diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk
index 5817b8eae..a6b0c17c6 100644
--- a/workflow/snakemake_rules/main_workflow.smk
+++ b/workflow/snakemake_rules/main_workflow.smk
@@ -444,19 +444,35 @@ rule combine_samples:
             --output-metadata {output.metadata} 2>&1 | tee {log}
         """
 
+rule prepare_nextclade:
+    message:
+        """
+        Downloading reference files for nextclade (used for alignment and qc).
+        """
+    output:
+        nextclade_dataset = directory("data/sars-cov-2-nextclade-defaults"),
+    params:
+        name = "sars-cov-2",
+    shell:
+        """
+        nextclade dataset get --name {params.name} --output-dir {output.nextclade_dataset}
+        """
+
 rule build_align:
     message:
         """
-        Aligning sequences to {input.reference}
+        Running nextclade QC and aligning sequences to {input.reference}
             - gaps relative to reference are considered real
         """
     input:
         sequences = rules.combine_samples.output.sequences,
         genemap = config["files"]["annotation"],
-        reference = config["files"]["alignment_reference"]
+        reference = config["files"]["alignment_reference"],
+        nextclade_dataset = rules.prepare_nextclade.output.nextclade_dataset,
     output:
         alignment = "results/{build_name}/aligned.fasta",
         insertions = "results/{build_name}/insertions.tsv",
+        nextclade_qc = 'results/{build_name}/nextclade_qc.tsv',
         translations = expand("results/{{build_name}}/translations/aligned.gene.{gene}.fasta", gene=config.get('genes', ['S']))
     params:
         outdir = "results/{build_name}/translations",
@@ -472,22 +488,41 @@ rule build_align:
         mem_mb=3000
     shell:
         """
-        xz -c -d {input.sequences} | nextalign \
-            --jobs={threads} \
+        xz -c -d {input.sequences} |  nextclade run \
+            --jobs {threads} \
+            --input-fasta /dev/stdin \
             --reference {input.reference} \
-            --genemap {input.genemap} \
-            --genes {params.genes} \
-            --sequences /dev/stdin \
+            --input-dataset {input.nextclade_dataset} \
+            --output-tsv {output.nextclade_qc} \
             --output-dir {params.outdir} \
             --output-basename {params.basename} \
             --output-fasta {output.alignment} \
-            --output-insertions {output.insertions} > {log} 2>&1
+            --output-insertions {output.insertions} 2>&1 | tee {log}
+        """
+
+rule join_metadata_and_nextclade_qc:
+    input:
+        metadata = "results/{build_name}/{build_name}_subsampled_metadata.tsv.xz",
+        nextclade_qc = "results/{build_name}/nextclade_qc.tsv",
+    output:
+        metadata = "results/{build_name}/metadata_with_nextclade_qc.tsv",
+    log:
+        "logs/join_metadata_and_nextclade_qc_{build_name}.txt",
+    benchmark:
+        "benchmarks/join_metadata_and_nextclade_qc_{build_name}.txt",
+    conda: config["conda_environment"]
+    shell:
+        """
+        python3 scripts/join-metadata-and-clades.py \
+            {input.metadata} \
+            {input.nextclade_qc} \
+            -o {output.metadata} 2>&1 | tee {log}
         """
 
 rule diagnostic:
     message: "Scanning metadata {input.metadata} for problematic sequences. Removing sequences with >{params.clock_filter} deviation from the clock and with more than {params.snp_clusters}."
     input:
-        metadata = "results/{build_name}/{build_name}_subsampled_metadata.tsv.xz",
+        metadata = "results/{build_name}/metadata_with_nextclade_qc.tsv",
     output:
         to_exclude = "results/{build_name}/excluded_by_diagnostics.txt"
     params:
@@ -596,7 +631,7 @@ rule index:
 
 rule annotate_metadata_with_index:
     input:
-        metadata="results/{build_name}/{build_name}_subsampled_metadata.tsv.xz",
+        metadata="results/{build_name}/metadata_with_nextclade_qc.tsv",
         sequence_index = "results/{build_name}/sequence_index.tsv",
     output:
         metadata="results/{build_name}/metadata_with_index.tsv",
@@ -716,7 +751,7 @@ rule adjust_metadata_regions:
         Adjusting metadata for build '{wildcards.build_name}'
         """
     input:
-        metadata="results/{build_name}/{build_name}_subsampled_metadata.tsv.xz",
+        metadata="results/{build_name}/metadata_with_index.tsv",
     output:
         metadata = "results/{build_name}/metadata_adjusted.tsv.xz"
     params:

From 4edbb0721a0762edbf80b6fd4ac088e12dffefce Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Fri, 28 Jan 2022 16:36:50 -0800
Subject: [PATCH 2/6] Skip Nextclade annotations if they exist already

Checks for existing Nextclade QC columns in the given metadata and skips
merge of the metadata and given Nextclade QC file when these existing
annotations are complete. This check prevents overwriting of the
Nextstrain team's Nextclade annotations from ncov-ingest. The ingest
workflow always downloads and runs the latest Nextclade release, while
this workflow depends on the version in the Docker image or conda
environment that could be slightly outdated.

The check for existing annotations was originally complicated by the
renaming of columns by the sanitize metadata step where an original
column like `aaSubstitution` would be renamed to `aa_substitution`. We
could update the `column_map` in the join script to map `aaSubstitution`
to `aa_substitution`, but the production Nextstrain builds _skip_
sanitizing metadata, so they would retain the `aaSubstitution` column
and end up getting reannotated Nextclade QC metrics here. To simplify
this check a bit, this commit changes the sanitized column name for
amino acid substitutions to `aaSubstitution`.
---
 defaults/parameters.yaml            |  3 +--
 scripts/join-metadata-and-clades.py | 19 ++++++++++++++++++-
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/defaults/parameters.yaml b/defaults/parameters.yaml
index 2369f9f50..7f699feea 100644
--- a/defaults/parameters.yaml
+++ b/defaults/parameters.yaml
@@ -37,8 +37,7 @@ sanitize_metadata:
     - "Lineage=pango_lineage"
     - "Pangolin version=pangolin_version"
     - "Variant=variant"
-    - "AA Substitutions=aa_substitutions"
-    - "aaSubstitutions=aa_substitutions"
+    - "AA Substitutions=aaSubstitutions"
     - "Submission date=date_submitted"
     - "Is reference?=is_reference"
     - "Is complete?=is_complete"
diff --git a/scripts/join-metadata-and-clades.py b/scripts/join-metadata-and-clades.py
index 93bf47301..12dfc2ee1 100644
--- a/scripts/join-metadata-and-clades.py
+++ b/scripts/join-metadata-and-clades.py
@@ -82,7 +82,24 @@ def main():
     args = parse_args()
 
     metadata = pd.read_csv(args.first_file, index_col=METADATA_JOIN_COLUMN_NAME,
-                           sep='\t', low_memory=False, na_filter = False)
+                           sep='\t', low_memory=False)
+
+    # Check for existing annotations in the given metadata. Skip join with
+    # Nextclade QC file, if those annotations already exist and none of the
+    # columns have empty values. In the case where metadata were combined from
+    # different sources with and without annotations, the "clock_deviation"
+    # column will exist but some values will be missing. We handle this case as
+    # if the annotations do not exist at all and reannotate all columns. We
+    # cannot look for missing values across all expected columns as evidence of
+    # incomplete annotations, since a complete annotation by Nextclade will
+    # include missing values for some columns by design.
+    expected_columns = list(column_map.values()) + ["clock_deviation"]
+    existing_annotation_columns = metadata.columns.intersection(expected_columns)
+    if len(existing_annotation_columns) == len(expected_columns):
+        if metadata["clock_deviation"].isnull().sum() == 0:
+            print(f"Metadata file '{args.first_file}' has already been annotated with Nextclade QC columns. Skipping re-annotation.")
+            metadata.to_csv(args.o, sep="\t")
+            return
 
     # Read and rename clade column to be more descriptive
     clades = pd.read_csv(args.second_file, index_col=NEXTCLADE_JOIN_COLUMN_NAME,

From 7e00b750354093ca959d2514a90690f9bd41b8d0 Mon Sep 17 00:00:00 2001
From: Sidney Bell <sidneymbell@gmail.com>
Date: Thu, 27 Jan 2022 17:11:16 -0800
Subject: [PATCH 3/6] Add note to changelog

---
 docs/src/reference/change_log.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/src/reference/change_log.md b/docs/src/reference/change_log.md
index cce47574d..1be9cfc20 100644
--- a/docs/src/reference/change_log.md
+++ b/docs/src/reference/change_log.md
@@ -5,6 +5,8 @@ We also use this change log to document new features that maintain backward comp
 
 ## New features since last version update
 
+- 31 January 2022: Run Nextclade QC and filtering on the final sample set before building a tree. Nextclade also runs `nextalign` under the hood. Importantly, this enables filtering the final sample set to omit strains with many reversions and/or possible contaminants, significantly improving the quality of omicron trees.
+
 - 29 January 2022: Update "mutational fitness" coloring based on latest results from [Obermeyer et al model](https://www.medrxiv.org/content/10.1101/2021.09.07.21263228v1) via [github.com/broadinstitute/pyro-cov/](https://github.com/broadinstitute/pyro-cov/blob/master/paper/mutations.tsv).
 
 ## v10 (5 January 2022)

From 71a12e18d1f61c5e7ece463c7a352b4527b0e82a Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Wed, 2 Feb 2022 15:41:57 -0800
Subject: [PATCH 4/6] Document the `skip_diagnostics` filter parameter

---
 docs/src/reference/configuration.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/src/reference/configuration.md b/docs/src/reference/configuration.md
index f637ba962..805d986a6 100644
--- a/docs/src/reference/configuration.md
+++ b/docs/src/reference/configuration.md
@@ -209,6 +209,11 @@ Builds support any named attributes that can be referenced by subsampling scheme
 * description: Minimum collection date for strains to include in the analysis used by `augur filter --min-date`. Dates can be numeric floating point values (e.g., `2019.74`) or ISO 8601-style strings (e.g., `2019-10-01`).
 * default: `2019.74`
 
+### skip_diagnostics
+* type: boolean
+* description: Skip filtering by Nextclade quality control metrics like clock rate deviation, number of SNP clusters, possible contaminations, etc.
+* default: `false`
+
 ## frequencies
 ### min_date
 * type: float or string

From 574411e84f92cdfb3762de258cde8fcee244f068 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Wed, 2 Feb 2022 16:01:28 -0800
Subject: [PATCH 5/6] Support older versions of Nextclade

Although we'd prefer to use the latest version of Nextclade, users may
have older versions installed, so we cannot assume their QC output will
match all of the columns found in the latest release. This commit uses
the intersection of QC columns and expected columns to support older
Nextclade versions.
---
 scripts/join-metadata-and-clades.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scripts/join-metadata-and-clades.py b/scripts/join-metadata-and-clades.py
index 12dfc2ee1..87cce026f 100644
--- a/scripts/join-metadata-and-clades.py
+++ b/scripts/join-metadata-and-clades.py
@@ -106,7 +106,8 @@ def main():
                          sep='\t', low_memory=False, na_filter = False) \
             .rename(columns=column_map)
 
-    clades = clades[list(column_map.values())]
+    clade_columns = clades.columns.intersection(list(column_map.values()))
+    clades = clades[clade_columns]
 
     # Concatenate on columns
     result = pd.merge(

From 46fa71eab905ab7206ddfc46cc6f05de71b4e394 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Wed, 2 Feb 2022 16:06:14 -0800
Subject: [PATCH 6/6] Update change log to reflect new release

---
 docs/src/reference/change_log.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/src/reference/change_log.md b/docs/src/reference/change_log.md
index 1be9cfc20..e363512ea 100644
--- a/docs/src/reference/change_log.md
+++ b/docs/src/reference/change_log.md
@@ -3,9 +3,11 @@
 As of April 2021, we use major version numbers (e.g. v2) to reflect backward incompatible changes to the workflow that likely require you to update your Nextstrain installation.
 We also use this change log to document new features that maintain backward compatibility, indicating these features by the date they were added.
 
-## New features since last version update
+## v11 (3 February 2022)
+
+- Run Nextclade QC and filtering on the final sample set before building a tree. Nextclade also runs `nextalign` under the hood. Importantly, this enables filtering the final sample set to omit strains with many reversions and/or possible contaminants, significantly improving the quality of Omicron trees. [See the original pull request for more details](https://github.com/nextstrain/ncov/pull/842). To disable this filtering by Nextclade quality control metrics, set `skip_diagnostics: true` in [the `filter` section of your build configuration file](https://docs.nextstrain.org/projects/ncov/en/latest/reference/configuration.html#filter).
 
-- 31 January 2022: Run Nextclade QC and filtering on the final sample set before building a tree. Nextclade also runs `nextalign` under the hood. Importantly, this enables filtering the final sample set to omit strains with many reversions and/or possible contaminants, significantly improving the quality of omicron trees.
+## New features since last version update
 
 - 29 January 2022: Update "mutational fitness" coloring based on latest results from [Obermeyer et al model](https://www.medrxiv.org/content/10.1101/2021.09.07.21263228v1) via [github.com/broadinstitute/pyro-cov/](https://github.com/broadinstitute/pyro-cov/blob/master/paper/mutations.tsv).