diff --git a/app/apis/catalog/brc-analytics-catalog/common/entities.ts b/app/apis/catalog/brc-analytics-catalog/common/entities.ts index 1998a66..fb529c2 100644 --- a/app/apis/catalog/brc-analytics-catalog/common/entities.ts +++ b/app/apis/catalog/brc-analytics-catalog/common/entities.ts @@ -23,6 +23,7 @@ export interface BRCDataCatalogGenome { scaffoldCount: number; scaffoldL50: number; scaffoldN50: number; + strain: string | null; tags: string[]; taxon: string; ucscBrowserUrl: string | null; diff --git a/files/build-catalog.ts b/files/build-catalog.ts index ee6029e..e3f7b88 100644 --- a/files/build-catalog.ts +++ b/files/build-catalog.ts @@ -65,6 +65,7 @@ async function buildGenomes( scaffoldCount: parseNumber(row.scaffoldCount), scaffoldL50: parseNumber(row.scaffoldL50), scaffoldN50: parseNumber(row.scaffoldN50), + strain: parseStringOrNull(row.strain), tags: organismsByTaxon.get(row.taxon)?.tags ?? [], taxon: row.taxon, ucscBrowserUrl: parseStringOrNull(row.ucscBrowser), diff --git a/files/build-files-from-ncbi.py b/files/build-files-from-ncbi.py index 3cb9562..190f188 100644 --- a/files/build-files-from-ncbi.py +++ b/files/build-files-from-ncbi.py @@ -15,7 +15,7 @@ def build_taxonomy_request_body(taxa): return {"taxons": taxa, "children": False, "ranks": ["genus"]} -def get_organism_row(organism_info): +def get_organism_row(organism_info, accession): if len(organism_info.get("errors", [])) > 0: raise Exception(organism_info) @@ -25,11 +25,12 @@ def get_organism_row(organism_info): "taxon": organism_taxonomy["current_scientific_name"]["name"], "taxonomyId": str(organism_taxonomy["tax_id"]), "assemblyCount": next(count["count"] for count in organism_taxonomy["counts"] if count["type"] == "COUNT_TYPE_ASSEMBLY"), + "accession": accession, } -def get_organisms_df(taxa): - organisms_info = requests.post(TAXONOMY_URL, json=build_taxonomy_request_body(taxa)).json()["reports"] - return pd.DataFrame([get_organism_row(organism_info) for organism_info in organisms_info]) +def get_organisms_df(taxa_with_accessions): + organisms_info_with_accessions = [(organism_info, accession) for taxon, accession in taxa_with_accessions for organism_info in requests.post(TAXONOMY_URL, json=build_taxonomy_request_body([taxon])).json()["reports"]] + return pd.DataFrame([get_organism_row(organism_info, accession) for organism_info, accession in organisms_info_with_accessions]) def get_tax_ids(organisms_df): return list(organisms_df["taxonomyId"]) @@ -37,10 +38,11 @@ def get_tax_ids(organisms_df): def build_genomes_url(tax_ids): return f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/taxon/{urllib.parse.quote(",".join([str(id) for id in tax_ids]))}/dataset_report?filters.assembly_source=refseq&filters.has_annotation=true&filters.exclude_paired_reports=true&filters.exclude_atypical=true&filters.assembly_level=scaffold&filters.assembly_level=chromosome&filters.assembly_level=complete_genome" -def get_genome_row(genome_info): +def get_genome_row(genome_info, taxon): refseq_category = genome_info["assembly_info"].get("refseq_category") return { - "taxon": genome_info["organism"]["organism_name"], + "taxon": taxon, + "strain": genome_info["organism"].get("infraspecific_names", {}).get("strain", ""), "taxonomyId": genome_info["organism"]["tax_id"], "accession": genome_info["accession"], "isRef": refseq_category == "reference genome", @@ -56,8 +58,12 @@ def get_genome_row(genome_info): "pairedAccession": genome_info["paired_accession"], } -def get_genomes_df(tax_ids): - return pd.DataFrame(data=[get_genome_row(genome_info) for genome_info in requests.get(build_genomes_url(tax_ids)).json()["reports"]]) +def get_organism_genomes(tax_id, accession): + return [genome_info for genome_info in requests.get(build_genomes_url([tax_id])).json()["reports"] if genome_info["accession"] == accession] + +def get_genomes_df(organism_ids): + genomes_info_with_organisms = [(genome_info, taxon) for tax_id, taxon, accession in organism_ids for genome_info in get_organism_genomes(tax_id, accession)] + return pd.DataFrame(data=[get_genome_row(*info) for info in genomes_info_with_organisms]) def _id_to_gene_model_url(asm_id): hubs_url = "https://hgdownload.soe.ucsc.edu/hubs/" @@ -95,7 +101,7 @@ def build_files(): taxa_df = pd.read_csv(TAXA_URL, keep_default_na=False) - organisms_source_df = get_organisms_df([taxon.strip() for taxon in taxa_df["Name"] if taxon]) + organisms_source_df = get_organisms_df([(taxon.strip(), accession.strip()) for taxon, accession in zip(taxa_df["Name"], taxa_df["RefSeq Accession"]) if taxon]) organisms_df = organisms_source_df.merge(taxa_df[["TaxId", "CustomTags"]], how="left", left_on="taxonomyId", right_on="TaxId").drop(columns=["TaxId"]) @@ -103,7 +109,7 @@ def build_files(): print(f"Wrote to {ORGANISMS_OUTPUT_PATH}") - genomes_source_df = get_genomes_df(get_tax_ids(organisms_df)) + genomes_source_df = get_genomes_df(zip(organisms_df["taxonomyId"], organisms_df["taxon"], organisms_df["accession"])) assemblies_df = pd.DataFrame(requests.get(ASSEMBLIES_URL).json()["data"])[["ucscBrowser", "genBank", "refSeq"]] gen_bank_merge_df = genomes_source_df.merge(assemblies_df, how="left", left_on="accession", right_on="genBank") diff --git a/files/entities.ts b/files/entities.ts index 40c1774..a96f44f 100644 --- a/files/entities.ts +++ b/files/entities.ts @@ -11,6 +11,7 @@ export interface SourceGenome { scaffoldCount: string; scaffoldL50: string; scaffoldN50: string; + strain: string; taxon: string; taxonomyId: string; ucscBrowser: string; diff --git a/files/out/genomes.json b/files/out/genomes.json index e8ab3b0..7fe173d 100644 --- a/files/out/genomes.json +++ b/files/out/genomes.json @@ -13,6 +13,7 @@ "scaffoldCount": 2747, "scaffoldL50": 6, "scaffoldN50": 1678596, + "strain": "Salvador I", "tags": [ "VEuPathDB" ], @@ -33,8 +34,11 @@ "scaffoldCount": 12, "scaffoldL50": 4, "scaffoldN50": 2481190, - "tags": [], - "taxon": "Trypanosoma brucei brucei TREU927", + "strain": null, + "tags": [ + "VEuPathDB" + ], + "taxon": "Trypanosoma brucei", "ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_000002445.2" }, { @@ -51,8 +55,9 @@ "scaffoldCount": 8, "scaffoldL50": 4, "scaffoldN50": 3948441, + "strain": "Af293", "tags": [], - "taxon": "Aspergillus fumigatus Af293", + "taxon": "Aspergillus fumigatus", "ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_000002655.1" }, { @@ -69,8 +74,11 @@ "scaffoldCount": 36, "scaffoldL50": 11, "scaffoldN50": 1091540, - "tags": [], - "taxon": "Leishmania major strain Friedlin", + "strain": "Friedlin", + "tags": [ + "VEuPathDB" + ], + "taxon": "Leishmania major", "ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_000002725.2" }, { @@ -87,8 +95,11 @@ "scaffoldCount": 14, "scaffoldL50": 5, "scaffoldN50": 1687656, - "tags": [], - "taxon": "Plasmodium falciparum 3D7", + "strain": null, + "tags": [ + "VEuPathDB" + ], + "taxon": "Plasmodium falciparum", "ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_000002765.6" }, { @@ -105,8 +116,11 @@ "scaffoldCount": 138, "scaffoldL50": 11, "scaffoldN50": 992961, - "tags": [], - "taxon": "Leishmania braziliensis MHOM/BR/75/M2904", + "strain": "MHOM/BR/75/M2904", + "tags": [ + "VEuPathDB" + ], + "taxon": "Leishmania braziliensis", "ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_000002845.2" }, { @@ -123,8 +137,11 @@ "scaffoldCount": 2276, "scaffoldL50": 6, "scaffoldN50": 4973582, - "tags": [], - "taxon": "Toxoplasma gondii ME49", + "strain": "ME49", + "tags": [ + "VEuPathDB" + ], + "taxon": "Toxoplasma gondii", "ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_000006565.2" }, { @@ -141,8 +158,9 @@ "scaffoldCount": 14, "scaffoldL50": 6, "scaffoldN50": 1438950, + "strain": "JEC21", "tags": [], - "taxon": "Cryptococcus neoformans var. neoformans JEC21", + "taxon": "Cryptococcus neoformans", "ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_000091045.1" }, { @@ -159,8 +177,9 @@ "scaffoldCount": 6, "scaffoldL50": 3, "scaffoldN50": 4323945, + "strain": "RS", "tags": [], - "taxon": "Coccidioides immitis RS", + "taxon": "Coccidioides immitis", "ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_000149335.2" }, { @@ -177,8 +196,9 @@ "scaffoldCount": 8, "scaffoldL50": 3, "scaffoldN50": 2231883, + "strain": "SC5314", "tags": [], - "taxon": "Candida albicans SC5314", + "taxon": "Candida albicans", "ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_000182965.3" }, { @@ -195,8 +215,11 @@ "scaffoldCount": 1, "scaffoldL50": 1, "scaffoldN50": 4411532, - "tags": [], - "taxon": "Mycobacterium tuberculosis H37Rv", + "strain": "H37Rv", + "tags": [ + "Bact" + ], + "taxon": "Mycobacterium tuberculosis", "ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_000195955.2" }, { @@ -213,6 +236,7 @@ "scaffoldCount": 29495, "scaffoldL50": 212, "scaffoldN50": 88624, + "strain": "CL Brener", "tags": [ "VEuPathDB" ], @@ -233,6 +257,7 @@ "scaffoldCount": 36, "scaffoldL50": 11, "scaffoldN50": 1024085, + "strain": "BPK282A1", "tags": [ "VEuPathDB" ], @@ -240,22 +265,46 @@ "ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_000227135.1" }, { - "accession": "GCF_000277735.2", + "accession": "GCF_000857045.1", "annotationStatus": null, "chromosomes": 1, "coverage": null, - "gcPercent": 65.5, + "gcPercent": 33, "geneModelUrl": null, "isRef": "No", - "length": 4411709, + "length": 196858, "level": "Complete Genome", - "ncbiTaxonomyId": "83332", + "ncbiTaxonomyId": "10244", "scaffoldCount": 1, "scaffoldL50": 1, - "scaffoldN50": 4411709, - "tags": [], - "taxon": "Mycobacterium tuberculosis H37Rv", - "ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_000277735.2" + "scaffoldN50": 196858, + "strain": "Zaire-96-I-16", + "tags": [ + "Virus" + ], + "taxon": "Monkeypox virus", + "ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_000857045.1" + }, + { + "accession": "GCF_009858895.2", + "annotationStatus": null, + "chromosomes": 1, + "coverage": null, + "gcPercent": 38, + "geneModelUrl": null, + "isRef": "No", + "length": 29903, + "level": "Complete Genome", + "ncbiTaxonomyId": "2697049", + "scaffoldCount": 1, + "scaffoldL50": 1, + "scaffoldN50": 29903, + "strain": null, + "tags": [ + "Virus" + ], + "taxon": "Severe acute respiratory syndrome coronavirus 2", + "ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_009858895.2" }, { "accession": "GCF_015732765.1", @@ -271,6 +320,7 @@ "scaffoldCount": 56, "scaffoldL50": 2, "scaffoldN50": 201550677, + "strain": "JHB", "tags": [ "VEuPathDB" ], @@ -291,6 +341,7 @@ "scaffoldCount": 289, "scaffoldL50": 2, "scaffoldN50": 186194774, + "strain": null, "tags": [], "taxon": "Culex pipiens pallens", "ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_016801865.2" @@ -309,8 +360,11 @@ "scaffoldCount": 9, "scaffoldL50": 2, "scaffoldN50": 8079863, - "tags": [], - "taxon": "Coccidioides posadasii str. Silveira", + "strain": "Silveira", + "tags": [ + "VEuPathDB" + ], + "taxon": "Coccidioides posadasii", "ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_018416015.2" }, { @@ -327,6 +381,7 @@ "scaffoldCount": 14, "scaffoldL50": 5, "scaffoldN50": 2046250, + "strain": "17X", "tags": [ "VEuPathDB" ], @@ -347,8 +402,11 @@ "scaffoldCount": 14, "scaffoldL50": 5, "scaffoldN50": 1692345, - "tags": [], - "taxon": "Plasmodium vinckei vinckei", + "strain": null, + "tags": [ + "VEuPathDB" + ], + "taxon": "Plasmodium vinckei", "ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_900681995.1" }, { @@ -365,6 +423,7 @@ "scaffoldCount": 190, "scaffoldL50": 2, "scaffoldN50": 99149756, + "strain": null, "tags": [ "VEuPathDB" ], diff --git a/files/source/genomes-from-ncbi.tsv b/files/source/genomes-from-ncbi.tsv index 468f878..5b137dc 100644 --- a/files/source/genomes-from-ncbi.tsv +++ b/files/source/genomes-from-ncbi.tsv @@ -1,21 +1,22 @@ -taxon taxonomyId accession isRef level chromosomeCount length scaffoldCount scaffoldN50 scaffoldL50 coverage gcPercent annotationStatus pairedAccession ucscBrowser genBank refSeq geneModelUrl -Mycobacterium tuberculosis H37Rv 83332 GCF_000195955.2 True Complete Genome 1.0 4411532 1 4411532 1 65.5 GCA_000195955.2 https://genome.ucsc.edu/h/GCF_000195955.2 GCA_000195955.2 GCF_000195955.2 -Plasmodium falciparum 3D7 36329 GCF_000002765.6 True Complete Genome 14.0 23292622 14 1687656 5 100.0x 19.5 Full annotation GCA_000002765.3 https://genome.ucsc.edu/h/GCF_000002765.6 GCA_000002765.3 GCF_000002765.6 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/002/765/GCF_000002765.6/genes/GCF_000002765.6_GCA_000002765.ncbiRefSeq.gtf.gz -Leishmania major strain Friedlin 347515 GCF_000002725.2 True Complete Genome 36.0 32855089 36 1091540 11 59.5 Full annotation GCA_000002725.2 https://genome.ucsc.edu/h/GCF_000002725.2 GCA_000002725.2 GCF_000002725.2 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/002/725/GCF_000002725.2/genes/GCF_000002725.2_ASM272v2.ncbiRefSeq.gtf.gz -Plasmodium yoelii 5861 GCF_900002385.2 True Complete Genome 14.0 23043114 14 2046250 5 100.0x 21.5 Full annotation GCA_900002385.2 https://genome.ucsc.edu/h/GCF_900002385.2 GCA_900002385.2 GCF_900002385.2 https://hgdownload.soe.ucsc.edu/hubs/GCF/900/002/385/GCF_900002385.2/genes/GCF_900002385.2_GCA_900002385.ncbiRefSeq.gtf.gz -Coccidioides posadasii str. Silveira 443226 GCF_018416015.2 True Complete Genome 9.0 28193268 9 8079863 2 475.0x 46.5 Full annotation GCA_018416015.2 https://genome.ucsc.edu/h/GCF_018416015.2 GCA_018416015.2 GCF_018416015.2 https://hgdownload.soe.ucsc.edu/hubs/GCF/018/416/015/GCF_018416015.2/genes/GCF_018416015.2_ASM1841601v2.ncbiRefSeq.gtf.gz -Plasmodium vinckei vinckei 54757 GCF_900681995.1 True Chromosome 14.0 18338688 14 1692345 5 155.0x 23.0 Full annotation GCA_900681995.1 https://genome.ucsc.edu/h/GCF_900681995.1 GCA_900681995.1 GCF_900681995.1 https://hgdownload.soe.ucsc.edu/hubs/GCF/900/681/995/GCF_900681995.1/genes/GCF_900681995.1_PVVCY_v1.ncbiRefSeq.gtf.gz -Candida albicans SC5314 237561 GCF_000182965.3 True Chromosome 8.0 14282666 8 2231883 3 700.0x 33.5 Full annotation GCA_000182965.3 https://genome.ucsc.edu/h/GCF_000182965.3 GCA_000182965.3 GCF_000182965.3 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/182/965/GCF_000182965.3/genes/GCF_000182965.3_ASM18296v3.ncbiRefSeq.gtf.gz -Cryptococcus neoformans var. neoformans JEC21 214684 GCF_000091045.1 True Chromosome 14.0 19051922 14 1438950 6 48.5 Full annotation GCA_000091045.1 https://genome.ucsc.edu/h/GCF_000091045.1 GCA_000091045.1 GCF_000091045.1 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/091/045/GCF_000091045.1/genes/GCF_000091045.1_ASM9104v1.ncbiRefSeq.gtf.gz -Leishmania donovani 5661 GCF_000227135.1 True Chromosome 36.0 32444968 36 1024085 11 59.5 Full annotation GCA_000227135.2 https://genome.ucsc.edu/h/GCF_000227135.1 GCA_000227135.2 GCF_000227135.1 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/227/135/GCF_000227135.1/genes/GCF_000227135.1_ASM22713v2.ncbiRefSeq.gtf.gz -Aspergillus fumigatus Af293 330879 GCF_000002655.1 True Chromosome 8.0 29384958 8 3948441 4 50.0 Full annotation GCA_000002655.1 https://genome.ucsc.edu/h/GCF_000002655.1 GCA_000002655.1 GCF_000002655.1 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/002/655/GCF_000002655.1/genes/GCF_000002655.1_ASM265v1.ncbiRefSeq.gtf.gz -Toxoplasma gondii ME49 508771 GCF_000006565.2 True Chromosome 14.0 65633124 2276 4973582 6 26.5x 52.5 GCA_000006565.2 https://genome.ucsc.edu/h/GCF_000006565.2 GCA_000006565.2 GCF_000006565.2 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/006/565/GCF_000006565.2/genes/GCF_000006565.2_TGA4.ncbiRefSeq.gtf.gz -Trypanosoma brucei brucei TREU927 185431 GCF_000002445.2 True Chromosome 11.0 26075494 12 2481190 4 46.5 Full annotation GCA_000002445.1 https://genome.ucsc.edu/h/GCF_000002445.2 GCA_000002445.1 GCF_000002445.2 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/002/445/GCF_000002445.2/genes/GCF_000002445.2_ASM244v1.ncbiRefSeq.gtf.gz -Anopheles gambiae 7165 GCF_943734735.2 True Chromosome 3.0 264451381 190 99149756 2 54.0x 44.5 Full annotation GCA_943734735.2 https://genome.ucsc.edu/h/GCF_943734735.2 GCA_943734735.2 GCF_943734735.2 https://hgdownload.soe.ucsc.edu/hubs/GCF/943/734/735/GCF_943734735.2/genes/GCF_943734735.2_idAnoGambNW_F1_1.ncbiRefSeq.gtf.gz -Plasmodium vivax 5855 GCF_000002415.2 True Chromosome 14.0 27007701 2747 1678596 6 42.5 Full annotation GCA_000002415.2 https://genome.ucsc.edu/h/GCF_000002415.2 GCA_000002415.2 GCF_000002415.2 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/002/415/GCF_000002415.2/genes/GCF_000002415.2_ASM241v2.ncbiRefSeq.gtf.gz -Culex quinquefasciatus 7176 GCF_015732765.1 True Chromosome 3.0 573214445 56 201550677 2 76.0x 37.0 Full annotation GCA_015732765.1 https://genome.ucsc.edu/h/GCF_015732765.1 GCA_015732765.1 GCF_015732765.1 https://hgdownload.soe.ucsc.edu/hubs/GCF/015/732/765/GCF_015732765.1/genes/GCF_015732765.1_VPISU_Cqui_1.0_pri_paternal.ncbiRefSeq.gtf.gz -Culex pipiens pallens 42434 GCF_016801865.2 True Chromosome 3.0 566339288 289 186194774 2 250.0x 37.0 Full annotation GCA_016801865.2 https://genome.ucsc.edu/h/GCF_016801865.2 GCA_016801865.2 GCF_016801865.2 https://hgdownload.soe.ucsc.edu/hubs/GCF/016/801/865/GCF_016801865.2/genes/GCF_016801865.2_TS_CPP_V2.ncbiRefSeq.gtf.gz -Leishmania braziliensis MHOM/BR/75/M2904 420245 GCF_000002845.2 True Chromosome 35.0 32068771 138 992961 11 58.0 GCA_000002845.2 https://genome.ucsc.edu/h/GCF_000002845.2 GCA_000002845.2 GCF_000002845.2 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/002/845/GCF_000002845.2/genes/GCF_000002845.2_ASM284v2.ncbiRefSeq.gtf.gz -Trypanosoma cruzi 5693 GCF_000209065.1 True Scaffold 89937456 29495 88624 212 51.5 Full annotation GCA_000209065.1 https://genome.ucsc.edu/h/GCF_000209065.1 GCA_000209065.1 GCF_000209065.1 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/209/065/GCF_000209065.1/genes/GCF_000209065.1_ASM20906v1.ncbiRefSeq.gtf.gz -Coccidioides immitis RS 246410 GCF_000149335.2 True Scaffold 28947925 6 4323945 3 46.0 Full annotation GCA_000149335.2 https://genome.ucsc.edu/h/GCF_000149335.2 GCA_000149335.2 GCF_000149335.2 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/149/335/GCF_000149335.2/genes/GCF_000149335.2_ASM14933v2.ncbiRefSeq.gtf.gz -Mycobacterium tuberculosis H37Rv 83332 GCF_000277735.2 False Complete Genome 1.0 4411709 1 4411709 1 65.5 GCA_000277735.2 https://genome.ucsc.edu/h/GCF_000277735.2 GCA_000277735.2 GCF_000277735.2 +taxon strain taxonomyId accession isRef level chromosomeCount length scaffoldCount scaffoldN50 scaffoldL50 coverage gcPercent annotationStatus pairedAccession ucscBrowser genBank refSeq geneModelUrl +Plasmodium falciparum 36329 GCF_000002765.6 True Complete Genome 14.0 23292622 14 1687656 5 100.0x 19.5 Full annotation GCA_000002765.3 https://genome.ucsc.edu/h/GCF_000002765.6 GCA_000002765.3 GCF_000002765.6 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/002/765/GCF_000002765.6/genes/GCF_000002765.6_GCA_000002765.ncbiRefSeq.gtf.gz +Plasmodium vivax Salvador I 5855 GCF_000002415.2 True Chromosome 14.0 27007701 2747 1678596 6 42.5 Full annotation GCA_000002415.2 https://genome.ucsc.edu/h/GCF_000002415.2 GCA_000002415.2 GCF_000002415.2 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/002/415/GCF_000002415.2/genes/GCF_000002415.2_ASM241v2.ncbiRefSeq.gtf.gz +Plasmodium yoelii 17X 5861 GCF_900002385.2 True Complete Genome 14.0 23043114 14 2046250 5 100.0x 21.5 Full annotation GCA_900002385.2 https://genome.ucsc.edu/h/GCF_900002385.2 GCA_900002385.2 GCF_900002385.2 https://hgdownload.soe.ucsc.edu/hubs/GCF/900/002/385/GCF_900002385.2/genes/GCF_900002385.2_GCA_900002385.ncbiRefSeq.gtf.gz +Plasmodium vinckei 54757 GCF_900681995.1 True Chromosome 14.0 18338688 14 1692345 5 155.0x 23.0 Full annotation GCA_900681995.1 https://genome.ucsc.edu/h/GCF_900681995.1 GCA_900681995.1 GCF_900681995.1 https://hgdownload.soe.ucsc.edu/hubs/GCF/900/681/995/GCF_900681995.1/genes/GCF_900681995.1_PVVCY_v1.ncbiRefSeq.gtf.gz +Culex pipiens pallens 42434 GCF_016801865.2 True Chromosome 3.0 566339288 289 186194774 2 250.0x 37.0 Full annotation GCA_016801865.2 https://genome.ucsc.edu/h/GCF_016801865.2 GCA_016801865.2 GCF_016801865.2 https://hgdownload.soe.ucsc.edu/hubs/GCF/016/801/865/GCF_016801865.2/genes/GCF_016801865.2_TS_CPP_V2.ncbiRefSeq.gtf.gz +Culex quinquefasciatus JHB 7176 GCF_015732765.1 True Chromosome 3.0 573214445 56 201550677 2 76.0x 37.0 Full annotation GCA_015732765.1 https://genome.ucsc.edu/h/GCF_015732765.1 GCA_015732765.1 GCF_015732765.1 https://hgdownload.soe.ucsc.edu/hubs/GCF/015/732/765/GCF_015732765.1/genes/GCF_015732765.1_VPISU_Cqui_1.0_pri_paternal.ncbiRefSeq.gtf.gz +Anopheles gambiae 7165 GCF_943734735.2 True Chromosome 3.0 264451381 190 99149756 2 54.0x 44.5 Full annotation GCA_943734735.2 https://genome.ucsc.edu/h/GCF_943734735.2 GCA_943734735.2 GCF_943734735.2 https://hgdownload.soe.ucsc.edu/hubs/GCF/943/734/735/GCF_943734735.2/genes/GCF_943734735.2_idAnoGambNW_F1_1.ncbiRefSeq.gtf.gz +Toxoplasma gondii ME49 508771 GCF_000006565.2 True Chromosome 14.0 65633124 2276 4973582 6 26.5x 52.5 GCA_000006565.2 https://genome.ucsc.edu/h/GCF_000006565.2 GCA_000006565.2 GCF_000006565.2 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/006/565/GCF_000006565.2/genes/GCF_000006565.2_TGA4.ncbiRefSeq.gtf.gz +Mycobacterium tuberculosis H37Rv 83332 GCF_000195955.2 True Complete Genome 1.0 4411532 1 4411532 1 65.5 GCA_000195955.2 https://genome.ucsc.edu/h/GCF_000195955.2 GCA_000195955.2 GCF_000195955.2 +Coccidioides posadasii Silveira 443226 GCF_018416015.2 True Complete Genome 9.0 28193268 9 8079863 2 475.0x 46.5 Full annotation GCA_018416015.2 https://genome.ucsc.edu/h/GCF_018416015.2 GCA_018416015.2 GCF_018416015.2 https://hgdownload.soe.ucsc.edu/hubs/GCF/018/416/015/GCF_018416015.2/genes/GCF_018416015.2_ASM1841601v2.ncbiRefSeq.gtf.gz +Coccidioides immitis RS 246410 GCF_000149335.2 True Scaffold 28947925 6 4323945 3 46.0 Full annotation GCA_000149335.2 https://genome.ucsc.edu/h/GCF_000149335.2 GCA_000149335.2 GCF_000149335.2 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/149/335/GCF_000149335.2/genes/GCF_000149335.2_ASM14933v2.ncbiRefSeq.gtf.gz +Trypanosoma cruzi CL Brener 5693 GCF_000209065.1 True Scaffold 89937456 29495 88624 212 51.5 Full annotation GCA_000209065.1 https://genome.ucsc.edu/h/GCF_000209065.1 GCA_000209065.1 GCF_000209065.1 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/209/065/GCF_000209065.1/genes/GCF_000209065.1_ASM20906v1.ncbiRefSeq.gtf.gz +Trypanosoma brucei 185431 GCF_000002445.2 True Chromosome 11.0 26075494 12 2481190 4 46.5 Full annotation GCA_000002445.1 https://genome.ucsc.edu/h/GCF_000002445.2 GCA_000002445.1 GCF_000002445.2 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/002/445/GCF_000002445.2/genes/GCF_000002445.2_ASM244v1.ncbiRefSeq.gtf.gz +Leishmania major Friedlin 347515 GCF_000002725.2 True Complete Genome 36.0 32855089 36 1091540 11 59.5 Full annotation GCA_000002725.2 https://genome.ucsc.edu/h/GCF_000002725.2 GCA_000002725.2 GCF_000002725.2 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/002/725/GCF_000002725.2/genes/GCF_000002725.2_ASM272v2.ncbiRefSeq.gtf.gz +Leishmania donovani BPK282A1 5661 GCF_000227135.1 True Chromosome 36.0 32444968 36 1024085 11 59.5 Full annotation GCA_000227135.2 https://genome.ucsc.edu/h/GCF_000227135.1 GCA_000227135.2 GCF_000227135.1 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/227/135/GCF_000227135.1/genes/GCF_000227135.1_ASM22713v2.ncbiRefSeq.gtf.gz +Leishmania braziliensis MHOM/BR/75/M2904 420245 GCF_000002845.2 True Chromosome 35.0 32068771 138 992961 11 58.0 GCA_000002845.2 https://genome.ucsc.edu/h/GCF_000002845.2 GCA_000002845.2 GCF_000002845.2 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/002/845/GCF_000002845.2/genes/GCF_000002845.2_ASM284v2.ncbiRefSeq.gtf.gz +Severe acute respiratory syndrome coronavirus 2 2697049 GCF_009858895.2 False Complete Genome 1.0 29903 1 29903 1 38.0 GCA_009858895.3 https://genome.ucsc.edu/h/GCF_009858895.2 GCA_009858895.3 GCF_009858895.2 +Monkeypox virus Zaire-96-I-16 10244 GCF_000857045.1 False Complete Genome 1.0 196858 1 196858 1 33.0 GCA_000857045.1 https://genome.ucsc.edu/h/GCF_000857045.1 GCA_000857045.1 GCF_000857045.1 +Aspergillus fumigatus Af293 330879 GCF_000002655.1 True Chromosome 8.0 29384958 8 3948441 4 50.0 Full annotation GCA_000002655.1 https://genome.ucsc.edu/h/GCF_000002655.1 GCA_000002655.1 GCF_000002655.1 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/002/655/GCF_000002655.1/genes/GCF_000002655.1_ASM265v1.ncbiRefSeq.gtf.gz +Candida albicans SC5314 237561 GCF_000182965.3 True Chromosome 8.0 14282666 8 2231883 3 700.0x 33.5 Full annotation GCA_000182965.3 https://genome.ucsc.edu/h/GCF_000182965.3 GCA_000182965.3 GCF_000182965.3 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/182/965/GCF_000182965.3/genes/GCF_000182965.3_ASM18296v3.ncbiRefSeq.gtf.gz +Cryptococcus neoformans JEC21 214684 GCF_000091045.1 True Chromosome 14.0 19051922 14 1438950 6 48.5 Full annotation GCA_000091045.1 https://genome.ucsc.edu/h/GCF_000091045.1 GCA_000091045.1 GCF_000091045.1 https://hgdownload.soe.ucsc.edu/hubs/GCF/000/091/045/GCF_000091045.1/genes/GCF_000091045.1_ASM9104v1.ncbiRefSeq.gtf.gz diff --git a/files/source/organisms-from-ncbi.tsv b/files/source/organisms-from-ncbi.tsv index eb42060..a7a0d3a 100644 --- a/files/source/organisms-from-ncbi.tsv +++ b/files/source/organisms-from-ncbi.tsv @@ -1,22 +1,22 @@ -taxon taxonomyId assemblyCount CustomTags -Anopheles gambiae 7165 7 VEuPathDB -Aspergillus fumigatus 746128 352 -Candida albicans 5476 117 -Coccidioides immitis 5501 5 -Coccidioides posadasii 199306 13 VEuPathDB -Cryptococcus neoformans 5207 183 -Culex pipiens pallens 42434 1 -Culex quinquefasciatus 7176 3 VEuPathDB -Leishmania braziliensis 5660 11 VEuPathDB -Leishmania donovani 5661 12 VEuPathDB -Leishmania major 5664 7 VEuPathDB -Monkeypox virus 10244 6911 Virus -Mycobacterium tuberculosis 1773 7829 Bact -Plasmodium falciparum 5833 67 VEuPathDB -Plasmodium vinckei 5860 10 VEuPathDB -Plasmodium vivax 5855 19 VEuPathDB -Plasmodium yoelii 5861 15 VEuPathDB -Severe acute respiratory syndrome coronavirus 2 2697049 12408 Virus -Toxoplasma gondii 5811 29 VEuPathDB -Trypanosoma brucei 5691 6 VEuPathDB -Trypanosoma cruzi 5693 44 VEuPathDB +taxon taxonomyId assemblyCount accession CustomTags +Plasmodium falciparum 5833 67 GCF_000002765.6 VEuPathDB +Plasmodium vivax 5855 19 GCF_000002415.2 VEuPathDB +Plasmodium yoelii 5861 15 GCF_900002385.2 VEuPathDB +Plasmodium vinckei 5860 10 GCF_900681995.1 VEuPathDB +Culex pipiens pallens 42434 1 GCF_016801865.2 +Culex quinquefasciatus 7176 3 GCF_015732765.1 VEuPathDB +Anopheles gambiae 7165 7 GCF_943734735.2 VEuPathDB +Toxoplasma gondii 5811 29 GCF_000006565.2 VEuPathDB +Mycobacterium tuberculosis 1773 7829 GCF_000195955.2 Bact +Coccidioides posadasii 199306 13 GCF_018416015.2 VEuPathDB +Coccidioides immitis 5501 5 GCF_000149335.2 +Trypanosoma cruzi 5693 44 GCF_000209065.1 VEuPathDB +Trypanosoma brucei 5691 6 GCF_000002445.2 VEuPathDB +Leishmania major 5664 7 GCF_000002725.2 VEuPathDB +Leishmania donovani 5661 12 GCF_000227135.1 VEuPathDB +Leishmania braziliensis 5660 11 GCF_000002845.2 VEuPathDB +Severe acute respiratory syndrome coronavirus 2 2697049 12408 GCF_009858895.2 Virus +Monkeypox virus 10244 6911 GCF_000857045.1 Virus +Aspergillus fumigatus 746128 352 GCF_000002655.1 +Candida albicans 5476 117 GCF_000182965.3 +Cryptococcus neoformans 5207 183 GCF_000091045.1