Skip to content

Commit

Permalink
feat: make genome taxa the same as organism taxa and add strain field (
Browse files Browse the repository at this point in the history
  • Loading branch information
hunterckx committed Dec 12, 2024
1 parent 021f70a commit fb0ccf1
Show file tree
Hide file tree
Showing 7 changed files with 150 additions and 81 deletions.
1 change: 1 addition & 0 deletions app/apis/catalog/brc-analytics-catalog/common/entities.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ export interface BRCDataCatalogGenome {
scaffoldCount: number;
scaffoldL50: number;
scaffoldN50: number;
strain: string | null;
tags: string[];
taxon: string;
ucscBrowserUrl: string | null;
Expand Down
1 change: 1 addition & 0 deletions files/build-catalog.ts
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ async function buildGenomes(
scaffoldCount: parseNumber(row.scaffoldCount),
scaffoldL50: parseNumber(row.scaffoldL50),
scaffoldN50: parseNumber(row.scaffoldN50),
strain: parseStringOrNull(row.strain),
tags: organismsByTaxon.get(row.taxon)?.tags ?? [],
taxon: row.taxon,
ucscBrowserUrl: parseStringOrNull(row.ucscBrowser),
Expand Down
26 changes: 16 additions & 10 deletions files/build-files-from-ncbi.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
def build_taxonomy_request_body(taxa):
return {"taxons": taxa, "children": False, "ranks": ["genus"]}

def get_organism_row(organism_info):
def get_organism_row(organism_info, accession):
if len(organism_info.get("errors", [])) > 0:
raise Exception(organism_info)

Expand All @@ -25,22 +25,24 @@ def get_organism_row(organism_info):
"taxon": organism_taxonomy["current_scientific_name"]["name"],
"taxonomyId": str(organism_taxonomy["tax_id"]),
"assemblyCount": next(count["count"] for count in organism_taxonomy["counts"] if count["type"] == "COUNT_TYPE_ASSEMBLY"),
"accession": accession,
}

def get_organisms_df(taxa):
organisms_info = requests.post(TAXONOMY_URL, json=build_taxonomy_request_body(taxa)).json()["reports"]
return pd.DataFrame([get_organism_row(organism_info) for organism_info in organisms_info])
def get_organisms_df(taxa_with_accessions):
organisms_info_with_accessions = [(organism_info, accession) for taxon, accession in taxa_with_accessions for organism_info in requests.post(TAXONOMY_URL, json=build_taxonomy_request_body([taxon])).json()["reports"]]
return pd.DataFrame([get_organism_row(organism_info, accession) for organism_info, accession in organisms_info_with_accessions])

def get_tax_ids(organisms_df):
return list(organisms_df["taxonomyId"])

def build_genomes_url(tax_ids):
return f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/taxon/{urllib.parse.quote(",".join([str(id) for id in tax_ids]))}/dataset_report?filters.assembly_source=refseq&filters.has_annotation=true&filters.exclude_paired_reports=true&filters.exclude_atypical=true&filters.assembly_level=scaffold&filters.assembly_level=chromosome&filters.assembly_level=complete_genome"

def get_genome_row(genome_info):
def get_genome_row(genome_info, taxon):
refseq_category = genome_info["assembly_info"].get("refseq_category")
return {
"taxon": genome_info["organism"]["organism_name"],
"taxon": taxon,
"strain": genome_info["organism"].get("infraspecific_names", {}).get("strain", ""),
"taxonomyId": genome_info["organism"]["tax_id"],
"accession": genome_info["accession"],
"isRef": refseq_category == "reference genome",
Expand All @@ -56,8 +58,12 @@ def get_genome_row(genome_info):
"pairedAccession": genome_info["paired_accession"],
}

def get_genomes_df(tax_ids):
return pd.DataFrame(data=[get_genome_row(genome_info) for genome_info in requests.get(build_genomes_url(tax_ids)).json()["reports"]])
def get_organism_genomes(tax_id, accession):
return [genome_info for genome_info in requests.get(build_genomes_url([tax_id])).json()["reports"] if genome_info["accession"] == accession]

def get_genomes_df(organism_ids):
genomes_info_with_organisms = [(genome_info, taxon) for tax_id, taxon, accession in organism_ids for genome_info in get_organism_genomes(tax_id, accession)]
return pd.DataFrame(data=[get_genome_row(*info) for info in genomes_info_with_organisms])

def _id_to_gene_model_url(asm_id):
hubs_url = "https://hgdownload.soe.ucsc.edu/hubs/"
Expand Down Expand Up @@ -95,15 +101,15 @@ def build_files():

taxa_df = pd.read_csv(TAXA_URL, keep_default_na=False)

organisms_source_df = get_organisms_df([taxon.strip() for taxon in taxa_df["Name"] if taxon])
organisms_source_df = get_organisms_df([(taxon.strip(), accession.strip()) for taxon, accession in zip(taxa_df["Name"], taxa_df["RefSeq Accession"]) if taxon])

organisms_df = organisms_source_df.merge(taxa_df[["TaxId", "CustomTags"]], how="left", left_on="taxonomyId", right_on="TaxId").drop(columns=["TaxId"])

organisms_df.to_csv(ORGANISMS_OUTPUT_PATH, index=False, sep="\t")

print(f"Wrote to {ORGANISMS_OUTPUT_PATH}")

genomes_source_df = get_genomes_df(get_tax_ids(organisms_df))
genomes_source_df = get_genomes_df(zip(organisms_df["taxonomyId"], organisms_df["taxon"], organisms_df["accession"]))
assemblies_df = pd.DataFrame(requests.get(ASSEMBLIES_URL).json()["data"])[["ucscBrowser", "genBank", "refSeq"]]

gen_bank_merge_df = genomes_source_df.merge(assemblies_df, how="left", left_on="accession", right_on="genBank")
Expand Down
1 change: 1 addition & 0 deletions files/entities.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ export interface SourceGenome {
scaffoldCount: string;
scaffoldL50: string;
scaffoldN50: string;
strain: string;
taxon: string;
taxonomyId: string;
ucscBrowser: string;
Expand Down
115 changes: 87 additions & 28 deletions files/out/genomes.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
"scaffoldCount": 2747,
"scaffoldL50": 6,
"scaffoldN50": 1678596,
"strain": "Salvador I",
"tags": [
"VEuPathDB"
],
Expand All @@ -33,8 +34,11 @@
"scaffoldCount": 12,
"scaffoldL50": 4,
"scaffoldN50": 2481190,
"tags": [],
"taxon": "Trypanosoma brucei brucei TREU927",
"strain": null,
"tags": [
"VEuPathDB"
],
"taxon": "Trypanosoma brucei",
"ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_000002445.2"
},
{
Expand All @@ -51,8 +55,9 @@
"scaffoldCount": 8,
"scaffoldL50": 4,
"scaffoldN50": 3948441,
"strain": "Af293",
"tags": [],
"taxon": "Aspergillus fumigatus Af293",
"taxon": "Aspergillus fumigatus",
"ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_000002655.1"
},
{
Expand All @@ -69,8 +74,11 @@
"scaffoldCount": 36,
"scaffoldL50": 11,
"scaffoldN50": 1091540,
"tags": [],
"taxon": "Leishmania major strain Friedlin",
"strain": "Friedlin",
"tags": [
"VEuPathDB"
],
"taxon": "Leishmania major",
"ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_000002725.2"
},
{
Expand All @@ -87,8 +95,11 @@
"scaffoldCount": 14,
"scaffoldL50": 5,
"scaffoldN50": 1687656,
"tags": [],
"taxon": "Plasmodium falciparum 3D7",
"strain": null,
"tags": [
"VEuPathDB"
],
"taxon": "Plasmodium falciparum",
"ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_000002765.6"
},
{
Expand All @@ -105,8 +116,11 @@
"scaffoldCount": 138,
"scaffoldL50": 11,
"scaffoldN50": 992961,
"tags": [],
"taxon": "Leishmania braziliensis MHOM/BR/75/M2904",
"strain": "MHOM/BR/75/M2904",
"tags": [
"VEuPathDB"
],
"taxon": "Leishmania braziliensis",
"ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_000002845.2"
},
{
Expand All @@ -123,8 +137,11 @@
"scaffoldCount": 2276,
"scaffoldL50": 6,
"scaffoldN50": 4973582,
"tags": [],
"taxon": "Toxoplasma gondii ME49",
"strain": "ME49",
"tags": [
"VEuPathDB"
],
"taxon": "Toxoplasma gondii",
"ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_000006565.2"
},
{
Expand All @@ -141,8 +158,9 @@
"scaffoldCount": 14,
"scaffoldL50": 6,
"scaffoldN50": 1438950,
"strain": "JEC21",
"tags": [],
"taxon": "Cryptococcus neoformans var. neoformans JEC21",
"taxon": "Cryptococcus neoformans",
"ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_000091045.1"
},
{
Expand All @@ -159,8 +177,9 @@
"scaffoldCount": 6,
"scaffoldL50": 3,
"scaffoldN50": 4323945,
"strain": "RS",
"tags": [],
"taxon": "Coccidioides immitis RS",
"taxon": "Coccidioides immitis",
"ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_000149335.2"
},
{
Expand All @@ -177,8 +196,9 @@
"scaffoldCount": 8,
"scaffoldL50": 3,
"scaffoldN50": 2231883,
"strain": "SC5314",
"tags": [],
"taxon": "Candida albicans SC5314",
"taxon": "Candida albicans",
"ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_000182965.3"
},
{
Expand All @@ -195,8 +215,11 @@
"scaffoldCount": 1,
"scaffoldL50": 1,
"scaffoldN50": 4411532,
"tags": [],
"taxon": "Mycobacterium tuberculosis H37Rv",
"strain": "H37Rv",
"tags": [
"Bact"
],
"taxon": "Mycobacterium tuberculosis",
"ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_000195955.2"
},
{
Expand All @@ -213,6 +236,7 @@
"scaffoldCount": 29495,
"scaffoldL50": 212,
"scaffoldN50": 88624,
"strain": "CL Brener",
"tags": [
"VEuPathDB"
],
Expand All @@ -233,29 +257,54 @@
"scaffoldCount": 36,
"scaffoldL50": 11,
"scaffoldN50": 1024085,
"strain": "BPK282A1",
"tags": [
"VEuPathDB"
],
"taxon": "Leishmania donovani",
"ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_000227135.1"
},
{
"accession": "GCF_000277735.2",
"accession": "GCF_000857045.1",
"annotationStatus": null,
"chromosomes": 1,
"coverage": null,
"gcPercent": 65.5,
"gcPercent": 33,
"geneModelUrl": null,
"isRef": "No",
"length": 4411709,
"length": 196858,
"level": "Complete Genome",
"ncbiTaxonomyId": "83332",
"ncbiTaxonomyId": "10244",
"scaffoldCount": 1,
"scaffoldL50": 1,
"scaffoldN50": 4411709,
"tags": [],
"taxon": "Mycobacterium tuberculosis H37Rv",
"ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_000277735.2"
"scaffoldN50": 196858,
"strain": "Zaire-96-I-16",
"tags": [
"Virus"
],
"taxon": "Monkeypox virus",
"ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_000857045.1"
},
{
"accession": "GCF_009858895.2",
"annotationStatus": null,
"chromosomes": 1,
"coverage": null,
"gcPercent": 38,
"geneModelUrl": null,
"isRef": "No",
"length": 29903,
"level": "Complete Genome",
"ncbiTaxonomyId": "2697049",
"scaffoldCount": 1,
"scaffoldL50": 1,
"scaffoldN50": 29903,
"strain": null,
"tags": [
"Virus"
],
"taxon": "Severe acute respiratory syndrome coronavirus 2",
"ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_009858895.2"
},
{
"accession": "GCF_015732765.1",
Expand All @@ -271,6 +320,7 @@
"scaffoldCount": 56,
"scaffoldL50": 2,
"scaffoldN50": 201550677,
"strain": "JHB",
"tags": [
"VEuPathDB"
],
Expand All @@ -291,6 +341,7 @@
"scaffoldCount": 289,
"scaffoldL50": 2,
"scaffoldN50": 186194774,
"strain": null,
"tags": [],
"taxon": "Culex pipiens pallens",
"ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_016801865.2"
Expand All @@ -309,8 +360,11 @@
"scaffoldCount": 9,
"scaffoldL50": 2,
"scaffoldN50": 8079863,
"tags": [],
"taxon": "Coccidioides posadasii str. Silveira",
"strain": "Silveira",
"tags": [
"VEuPathDB"
],
"taxon": "Coccidioides posadasii",
"ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_018416015.2"
},
{
Expand All @@ -327,6 +381,7 @@
"scaffoldCount": 14,
"scaffoldL50": 5,
"scaffoldN50": 2046250,
"strain": "17X",
"tags": [
"VEuPathDB"
],
Expand All @@ -347,8 +402,11 @@
"scaffoldCount": 14,
"scaffoldL50": 5,
"scaffoldN50": 1692345,
"tags": [],
"taxon": "Plasmodium vinckei vinckei",
"strain": null,
"tags": [
"VEuPathDB"
],
"taxon": "Plasmodium vinckei",
"ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_900681995.1"
},
{
Expand All @@ -365,6 +423,7 @@
"scaffoldCount": 190,
"scaffoldL50": 2,
"scaffoldN50": 99149756,
"strain": null,
"tags": [
"VEuPathDB"
],
Expand Down
Loading

0 comments on commit fb0ccf1

Please sign in to comment.