Skip to content

Commit

Permalink
Update assign colors to account for clade recency
Browse files Browse the repository at this point in the history
When assigning colors, gray out clades that do not appear in recent samples. Use the clade_recency parameter to set how many months back counts as "recent".
  • Loading branch information
trvrb committed Sep 25, 2024
1 parent 41e2e8f commit 593e9ca
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 8 deletions.
4 changes: 4 additions & 0 deletions defaults/parameters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,10 @@ refine:
ancestral:
inference: "joint"

# Gray out clades with no samples from the last 18 months when constructing a color ramp
colors:
clade_recency: 18

# Frequencies settings
frequencies:

Expand Down
42 changes: 34 additions & 8 deletions scripts/assign-colors.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,20 @@
import argparse
import pandas as pd
from datetime import datetime, timedelta

# Forced colours MUST NOT appear in the ordering TSV
forced_colors = {
}

def date_within_last_n_months(date_str, cutoff_date):
    """Return True if ``date_str`` is an exact date on or after ``cutoff_date``.

    Args:
        date_str: Date in ``YYYY-MM-DD`` form. Ambiguous dates containing
            ``XX`` placeholders, malformed strings, and non-string values
            (such as the NaN that pandas produces for an empty cell, or
            ``None``) are all treated as not recent.
        cutoff_date: ``datetime`` marking the start of the "recent" window.

    Returns:
        bool: True only when the date parses and is not earlier than the
        cutoff; False otherwise.
    """
    # A missing date is not a string; the substring test below would raise
    # TypeError on it, so reject it up front instead of crashing.
    if not isinstance(date_str, str):
        return False
    if 'XX' in date_str:
        return False  # Ignore uncertain dates
    try:
        date = datetime.strptime(date_str, "%Y-%m-%d")
    except ValueError:
        # Not a full YYYY-MM-DD date (e.g. year-only or free text)
        return False
    return date >= cutoff_date

if __name__ == '__main__':
parser = argparse.ArgumentParser(
description="Assign colors based on ordering",
Expand All @@ -15,6 +25,7 @@
parser.add_argument('--color-schemes', type=str, required=True, help="input color schemes file")
parser.add_argument('--metadata', type=str, help="if provided, restrict colors to only those found in metadata")
parser.add_argument('--clade-node-data', type=str, help="if provided, restrict to only those clades found in tree")
parser.add_argument('--clade-recency', type=int, help="if provided, restrict to clades found in tree within X months of present")
parser.add_argument('--output', type=str, required=True, help="output colors tsv")
args = parser.parse_args()

Expand All @@ -36,26 +47,41 @@
if args.metadata:
metadata = pd.read_csv(args.metadata, delimiter='\t')
for name, trait in assignment.items():
if name in metadata:
if name in metadata['strain'].values:
subset_present = [x for x in assignment[name] if x in metadata[name].unique()]
assignment[name] = subset_present
if name in metadata and 'focal' in metadata:
if name in metadata['strain'].values and 'focal' in metadata.columns:
focal_list = metadata.loc[metadata['focal'] == True, name].unique()
subset_focal = [x for x in assignment[name] if x in focal_list]
assignment[name] = subset_focal

# if node json is supplied, restrict to clades names in the tree
# if node json is supplied, restrict to clades names in the tree within the specified recency
if args.clade_node_data and "clade_membership" in assignment:
with open(args.clade_node_data) as fh:
import json
clades = json.load(fh)['nodes']

# generate a set of present values
subset_present = set([x["clade_membership"] for x in clades.values()])
# restrict to only those present while maintaining order
assignment["clade_membership"] = [x for x in assignment["clade_membership"]
if x in subset_present]
if args.clade_recency and args.metadata:
# Calculate the cutoff date based on clade_recency (number of months ago from today)
cutoff_date = datetime.today() - timedelta(days=args.clade_recency * 30) # approximate months as 30 days

# Generate a set of present values within the specified recency
subset_present = set()
metadata = pd.read_csv(args.metadata, delimiter='\t')
for strain, info in clades.items():
if strain in metadata['strain'].values:
date_str = metadata.loc[metadata['strain'] == strain, 'date'].values[0]
if date_within_last_n_months(date_str, cutoff_date):
subset_present.add(info["clade_membership"])

# Restrict to only those present while maintaining order
assignment["clade_membership"] = [x for x in assignment["clade_membership"]
if x in subset_present]
else:
# If no clade_recency is provided, look for all clades present in the tree
subset_present = set([x["clade_membership"] for x in clades.values()])
assignment["clade_membership"] = [x for x in assignment["clade_membership"]
if x in subset_present]

schemes = {}
counter = 0
Expand Down
3 changes: 3 additions & 0 deletions workflow/snakemake_rules/main_workflow.smk
Original file line number Diff line number Diff line change
Expand Up @@ -1105,6 +1105,8 @@ rule colors:
color_schemes = config["files"]["color_schemes"],
metadata="results/{build_name}/metadata_adjusted.tsv.xz",
clades = rules.clades.output.clade_data
params:
clade_recency = config["colors"]["clade_recency"]
output:
colors = "results/{build_name}/colors.tsv"
log:
Expand All @@ -1124,6 +1126,7 @@ rule colors:
--color-schemes {input.color_schemes} \
--output {output.colors} \
--clade-node-data {input.clades} \
--clade-recency {params.clade_recency} \
--metadata {input.metadata} 2>&1 | tee {log}
"""

Expand Down

0 comments on commit 593e9ca

Please sign in to comment.