From 991881b5df5f5228ecf4445ee2cc1431b9602ea8 Mon Sep 17 00:00:00 2001 From: AndreaFurlani <93392092+AndreaFurlani@users.noreply.github.com> Date: Wed, 11 Oct 2023 16:29:58 +0200 Subject: [PATCH] Added the new tool query_impc (#5503) * Fixed issues with the first commit * Fixed issues with the first commit * Fixed errors and issues after commit * Fixed error E131 in .py file * Added macros to xml file * Fix Planemo test issues * Fix Planemo test issues * Fix Planemo test issues * Fix Planemo test issues * Fix Planemo test issues * Fix Planemo test issues * Fix output test issues * Fix inputs test issues * Performed small changes to improve code readability * Added conditionals to tests * Added conditionals to tests * Fixing merging issues * Fixing merging issues * Fixing merging issues * Fixing merging issues * Fixing merging issues * Fixing merging issues * Fixing merging issues * Fixing merging issues * Fixing merging issues * Fixing merging issues * Fixing merging issues * Remove conditional from tests 3/4/5 * Fixed conditionals for all tests * Removed unused files and adjusted the text for input selection --- tools/query_impc/.shed.yml | 16 + tools/query_impc/impc_tool.py | 759 ++++++++++++++++++ tools/query_impc/impc_tool.xml | 351 ++++++++ .../test-data/test_output_1_1.tabular | 10 + .../test-data/test_output_1_2.tabular | 5 + .../test-data/test_output_2.tabular | 21 + .../test-data/test_output_3.tabular | 79 ++ .../test-data/test_output_9.tabular | 4 + tools/query_impc/test-data/test_query_1.txt | 1 + tools/query_impc/test-data/test_query_2.txt | 1 + tools/query_impc/test-data/test_query_3.txt | 1 + 11 files changed, 1248 insertions(+) create mode 100644 tools/query_impc/.shed.yml create mode 100644 tools/query_impc/impc_tool.py create mode 100644 tools/query_impc/impc_tool.xml create mode 100644 tools/query_impc/test-data/test_output_1_1.tabular create mode 100644 tools/query_impc/test-data/test_output_1_2.tabular create mode 100644 
tools/query_impc/test-data/test_output_2.tabular create mode 100644 tools/query_impc/test-data/test_output_3.tabular create mode 100644 tools/query_impc/test-data/test_output_9.tabular create mode 100644 tools/query_impc/test-data/test_query_1.txt create mode 100644 tools/query_impc/test-data/test_query_2.txt create mode 100644 tools/query_impc/test-data/test_query_3.txt diff --git a/tools/query_impc/.shed.yml b/tools/query_impc/.shed.yml new file mode 100644 index 00000000000..80aa97dd800 --- /dev/null +++ b/tools/query_impc/.shed.yml @@ -0,0 +1,16 @@ +name: query_impc +owner: iuc +description: Contains a tool to query the IMPC database. +homepage_url: https://github.com/INFRAFRONTIERDIB/tools-iuc/tree/query_impc/tools/query_impc +remote_repository_url: https://github.com/INFRAFRONTIERDIB/tools-iuc/tree/query_impc/tools/query_impc +long_description: | + With this tool, it is possible to submit various types of queries to the IMPC database. Select the desired query from the drop-down menu. + Both MGI ids and gene symbols are allowed as input (even mixed). If you want to input more than one id, separate them with a comma without spaces (e.g. MGI:104636,MGI:104637). + If a mixed input is given, the order after the mapping will not be maintained. Note that if the mapping between the two types of ids doesn't retrieve a result, + that id will not be included in the query input, resulting in an error if none of the ids can be mapped. The output will be a table containing the data. + For the phenotypes, it is possible to give as input both MP term ids and HP term ids, since HP terms will be mapped to MP terms (here too, the order of the input will not be maintained). + For both gene and phenotype mapping, see the "View details" section of the job to check whether some of them were not mapped (typos, or ids not present in the database). 
+type: unrestricted +categories: +- Convert Formats +- Web Services \ No newline at end of file diff --git a/tools/query_impc/impc_tool.py b/tools/query_impc/impc_tool.py new file mode 100644 index 00000000000..ae67ed6c419 --- /dev/null +++ b/tools/query_impc/impc_tool.py @@ -0,0 +1,759 @@ +import sys + +import mygene +import pandas as pd +import requests + + +impc_api_url = "https://www.ebi.ac.uk/mi/impc/bulkdata-api" +impc_api_search_url = f"{impc_api_url}/genes" +impc_api_gene_bundle_url = f"{impc_api_url}/geneBundles" + + +def stop_err(msg): + sys.exit(msg) + + +def main(): + inp = str(sys.argv[1]) + query = str(sys.argv[3]) + + try: + if query == "7": + g_out = str(sys.argv[5]) + full_gene_table(g_out) + sys.exit(0) + + if str(sys.argv[5]) == "txt": + s = str(sys.argv[6]) + if s == "t": + sep = "\t" + elif s == "s": + sep = " " + elif s in ",;.": + sep = s + else: + sys.exit("Separator not valid, please change it.") + inp = pd.read_csv(inp, header=None, delimiter=sep) + if len(inp.columns) == 1: + inp = inp.to_csv(header=None, + index=False).strip("\n").split("\n") + inp = ",".join(inp) + else: + inp = inp.to_csv(header=None, + index=False).strip(sep).split(sep) + inp = ",".join(inp) + + if query == "8": + if str(sys.argv[5]) == "txt": + g_out = str(sys.argv[7]) + else: + g_out = str(sys.argv[6]) + genes_in_pipeline(inp, g_out) + sys.exit(0) + elif query == "9": + if str(sys.argv[5]) == "txt": + g_out = str(sys.argv[7]) + else: + g_out = str(sys.argv[6]) + sign_mp(inp, g_out) + sys.exit(0) + elif query == "10": + par_pip_ma(inp) + sys.exit(0) + elif query == "11": + par_gen(inp) + sys.exit(0) + elif query == "2" or query == "4": + final_list = pheno_mapping(inp) + else: + final_list = gene_mapping(inp) + inp = ",".join(final_list) + + if query == "1": + get_pheno(inp) + sys.exit(0) + elif query == "2": + if str(sys.argv[5]) == "txt": + g_out = str(sys.argv[7]) + else: + g_out = str(sys.argv[6]) + get_genes(inp, g_out) + sys.exit(0) + elif query == "3": + 
gene_set(inp) + sys.exit(0) + elif query == "4": + extr_img(inp) + sys.exit(0) + elif query == "5": + parameters(inp) + sys.exit(0) + elif query == "6": + sign_par(inp) + sys.exit(0) + else: + stop_err("Error, non-implemented query selected: " + query) + except Exception as ex: + stop_err("Error running impc_tool.py:\n" + str(ex)) + + +# 1-Given a gene id, retrieve all the phenotypes related to it (id and name) +def get_pheno(inp): + head = sys.argv[4] + mgi_accession_id = inp + + gene_url = f"{impc_api_search_url}/{mgi_accession_id}" + gene_data = requests.get(gene_url).json() + + p_list = [] + id_list = [] + + if gene_data["significantMpTerms"] is None: + stop_err("No significant MP terms found for this gene") + else: + for x in gene_data["significantMpTerms"]: + p_list.append(x["mpTermId"]) + id_list.append(x["mpTermName"]) + + df = pd.DataFrame() + df["MP term name"] = p_list + df["MP term id"] = id_list + + if head == "True": + df.to_csv(sys.argv[2], header=True, index=False, + sep="\t", index_label=False) + else: + df.to_csv(sys.argv[2], header=False, index=False, + sep="\t", index_label=False) + + +# 3-Extract all genes having a particular phenotype or a set of phenotypes +# (e.g. 
relevant to a disease) +def get_genes(inp, g_out): + head = sys.argv[4] + target_mp_terms = inp + +# All the data is paginated using the page and size parameters, +# by default the endpoint returns the first 20 hits + gene_by_phenotypes_query = f"{impc_api_search_url}" \ + f"/search/findAllBySignificantMpTermIdsContains" \ + f"?mpTermIds={target_mp_terms}&page=0&size=20" + genes_with_clinical_chemistry_phen = \ + requests.get(gene_by_phenotypes_query).json() + print(f"Genes with {target_mp_terms}: " + f"{genes_with_clinical_chemistry_phen['page']['totalElements']}") + acc = [] + name = [] + url = [] + + for gene in genes_with_clinical_chemistry_phen["_embedded"]["genes"]: + acc.append(gene["mgiAccessionId"]) + name.append(gene["markerName"]) + url.append(gene["_links"]["geneBundle"]["href"]) + + if g_out == "sym": + list_of_genes = pd.DataFrame(columns=["Gene symbol id", "Gene name", + "Gene bundle url"]) + list_of_genes["Gene symbol id"] = mgi_sym_map(acc) + else: + list_of_genes = pd.DataFrame(columns=["Gene accession id", + "Gene name", "Gene bundle url"]) + list_of_genes["Gene accession id"] = acc + list_of_genes["Gene name"] = name + list_of_genes["Gene bundle url"] = url + + if head == "True": + list_of_genes.to_csv(sys.argv[2], header=True, index=False, + sep="\t", index_label=False) + else: + list_of_genes.to_csv(sys.argv[2], header=False, index=False, + sep="\t", index_label=False) + + +# 4. Extract all phenotypes which are present in a particular gene set +# (e.g. genes together in a pathway) +def gene_set(inp): + head = sys.argv[4] + target_genes = inp + + genes_in_gene_list_query = f"{impc_api_search_url}/search/" \ + f"findAllByMgiAccessionIdIn?" 
\ + f"mgiAccessionIds={target_genes}" + + genes_in_gene_list = requests.get(genes_in_gene_list_query).json() + mp_terms_vs_gene_idx = {} + + for gene in genes_in_gene_list["_embedded"]["genes"]: + mp_terms = gene["significantMpTerms"] + gene_acc_id = gene["mgiAccessionId"] + if mp_terms is None: + continue + for mp_term_name in mp_terms: + if mp_term_name["mpTermId"] not in mp_terms_vs_gene_idx: + mp_terms_vs_gene_idx[mp_term_name["mpTermId"]] = \ + {"mp_term": mp_term_name["mpTermId"], + "mp_name": mp_term_name["mpTermName"], "genes": []} + mp_terms_vs_gene_idx[mp_term_name["mpTermId"]]["genes"].\ + append(gene_acc_id) + genes_by_mp_term = list(mp_terms_vs_gene_idx.values()) + + df = pd.DataFrame() + terms = [] + names = [] + genes = [] + for i in genes_by_mp_term: + terms.append(i["mp_term"]) + names.append(i["mp_name"]) + genes.append(",".join(i["genes"])) + + df["mp_term"] = terms + df["mp_name"] = names + df["genes"] = genes + + if head == "True": + df.to_csv(sys.argv[2], header=True, index=False, + sep="\t", index_label=False) + else: + df.to_csv(sys.argv[2], header=False, index=False, + sep="\t", index_label=False) + + +# 7. Extract images with a particular phenotype or a set of phenotypes +def extr_img(inp): + head = sys.argv[4] + target_mp_terms = inp # ["MP:0002110", "MP:0000559"] + +# All the data is paginated using the page and size parameters, +# by default the endpoint returns the first 20 hits + gene_by_phenotypes_query = f"{impc_api_search_url}/search/" \ + f"findAllBySignificantMpTermIdsContains?" 
\ + f"mpTermIds={target_mp_terms}&page=0&size=20" + genes_with_morph_mps = requests.get(gene_by_phenotypes_query).json() + list_of_gene_bundle_urls = [ + gene["_links"]["geneBundle"]["href"] for gene in + genes_with_morph_mps["_embedded"]["genes"] + ] + + gene_bundles = [] + for gene_bundle_url in list_of_gene_bundle_urls: + gene_bundle = requests.get(gene_bundle_url).json() + gene_bundles.append(gene_bundle) + + images_with_morphology_mps = [] + + # Doing just the first 20 and filtering out fields on the images + display_fields = ["geneSymbol", "parameterName", "biologicalSampleGroup", + "colonyId", "zygosity", "sex", "downloadUrl", + "externalSampleId", "thumbnailUrl"] + + for gene_bundle in gene_bundles[:20]: + if len(gene_bundle) == 4: + continue + if gene_bundle["geneImages"] is not None: + images = gene_bundle["geneImages"] + for image in images: + display_image = {k: v for k, v in image.items() + if k in display_fields} + images_with_morphology_mps.append(display_image) + + images_table = [] + print(f"Images related to phenotype {target_mp_terms}: " + f"{len(images_with_morphology_mps)}") + # Displaying just the first 20 images + for i in images_with_morphology_mps[:20]: + row = [f""] + list(i.values()) + images_table.append(row) + + df = pd.DataFrame() + externalSampleId = [] + geneSymbol = [] + biologicalSampleGroup = [] + sex = [] + colonyId = [] + zygosity = [] + parameterName = [] + downloadUrl = [] + thumbnailUrl = [] + + for i in images_table: + externalSampleId.append(i[1]) + geneSymbol.append(i[2]) + biologicalSampleGroup.append(i[3]) + sex.append(i[4]) + colonyId.append(i[5]) + zygosity.append(i[6]) + parameterName.append(i[7]) + downloadUrl.append(i[8]) + thumbnailUrl.append(i[9]) + + df["externalSampleId"] = externalSampleId + df["geneSymbol"] = geneSymbol + df["biologicalSampleGroup"] = biologicalSampleGroup + df["sex"] = sex + df["colonyId"] = colonyId + df["zygosity"] = zygosity + df["parameterName"] = parameterName + df["downloadUrl"] = 
downloadUrl + df["thumbnailUrl"] = thumbnailUrl + + if head == "True": + df.to_csv(sys.argv[2], header=True, index=False, + sep="\t", index_label=False) + else: + df.to_csv(sys.argv[2], header=False, index=False, + sep="\t", index_label=False) + + +# 11- Which parameters have been measured for a particular knockout +def parameters(inp): + head = sys.argv[4] + knockout = inp # "MGI:104636" + gene_info = requests.get(impc_api_search_url + "/" + knockout).json() + + if gene_info["phenotypingDataAvailable"]: + geneBundle = requests.get(gene_info["_links"]["geneBundle"]["href"])\ + .json() + gen_imgs = geneBundle["geneImages"] + par_list = [] + lis = {} + for i in gen_imgs: + lis = {"Parameter Name": i["parameterName"]} + if lis not in par_list: + par_list.append(lis) + df = pd.DataFrame() + li = [] + + for i in par_list: + li.append(i["Parameter Name"]) + + df["Parameter"] = li + if head == "True": + df.to_csv(sys.argv[2], header=True, index=False, + sep="\t", index_label=False) + else: + df.to_csv(sys.argv[2], header=False, index=False, + sep="\t", index_label=False) + + else: + stop_err("No parameters available for this knockout gene") + + +# 12- Which parameters identified a significant finding for a particular +# knockout line (colony) +def sign_par(inp): + head = sys.argv[4] + knockout = inp # "MGI:104636" + + gene_info = requests.get(f"{impc_api_url}statisticalResults/search/" + f"findAllByMarkerAccessionIdIsAndSignificantTrue?" 
+ f"mgiAccessionId=" + knockout).json() + gene_stats = gene_info["_embedded"]["statisticalResults"] + + if len(gene_stats) == 0: + stop_err("No statistically relevant parameters found " + "for this knockout gene") + else: + df = pd.DataFrame() + n = [] + p = [] + for g in gene_stats: + n.append(g["parameterName"]) + p.append(g["pvalue"]) + + df["Parameter name"] = n + df["p-value"] = p + if head == "True": + df.to_csv(sys.argv[2], header=True, index=False, + sep="\t", index_label=False) + else: + df.to_csv(sys.argv[2], header=False, index=False, + sep="\t", index_label=False) + + +# 13- List of genes names and ID measured in a pipeline +def genes_in_pipeline(inp, g_out): + head = sys.argv[4] + pip = inp + + g_in_p_query = f"{impc_api_search_url}/search/" \ + f"findAllByTestedPipelineId?pipelineId={pip}&" \ + f"page=0&size=1000" + genes_in_pip = requests.get(g_in_p_query).json() + pages = genes_in_pip["page"]["totalPages"] + max_elem = genes_in_pip["page"]["totalElements"] + + print(f"Genes with {pip}: {genes_in_pip['page']['totalElements']}") + list_d = [] + acc = [] + name = [] + + if max_elem > 1000: + g_in_p_query = genes_in_pip["_embedded"]["genes"] + for i in range(1, pages): + gl = requests.get(f"{impc_api_search_url}/search/" + f"findAllByTestedPipelineId?pipelineId={pip}&" + f"page={i}&" + f"size=1000").json()["_embedded"]["genes"] + g_in_p_query += gl + else: + g_in_p_query = genes_in_pip["_embedded"]["genes"] + + for g in g_in_p_query: + d = {"Gene Accession ID": g["mgiAccessionId"], + "Gene Name": g["markerName"]} + list_d.append(d) + + for i in list_d: + acc.append(i["Gene Accession ID"]) + name.append(i["Gene Name"]) + if g_out == "sym": + list_of_genes = pd.DataFrame(columns=["Gene symbol", "Gene name"]) + list_of_genes["Gene symbol"] = mgi_sym_map(acc) + else: + list_of_genes = pd.DataFrame(columns=["Gene accession id", + "Gene name"]) + list_of_genes["Gene accession id"] = acc + list_of_genes["Gene name"] = name + + if head == "True": + 
list_of_genes.to_csv(sys.argv[2], header=True, index=False, + sep="\t", index_label=False) + else: + list_of_genes.to_csv(sys.argv[2], header=False, index=False, + sep="\t", index_label=False) + + +# 14- Extract all genes and corresponding phenotypes related to a +# particular organ system (eg: significatMPTerm) +def sign_mp(inp, g_out): + head = sys.argv[4] + mp_term = inp # ["MP:0005391"] + + gene_by_mpterm_query = f"{impc_api_search_url}/search/" \ + f"findAllBySignificantMpTermIdsContains?" \ + f"mpTermIds={mp_term}&size=1000" + genes_with_mpterm = requests.get(gene_by_mpterm_query).json() + + pages = genes_with_mpterm["page"]["totalPages"] + genes_info = genes_with_mpterm["_embedded"]["genes"] + + for pn in range(1, pages): + pq = f"{impc_api_search_url}/search/" \ + f"findAllBySignificantMpTermIdsContains?" \ + f"mpTermIds={mp_term}&page={pn}&size=1000" + g = requests.get(pq).json()["_embedded"]["genes"] + genes_info += g + + list_d = [] + d = {} + for g in genes_info: + names = [] + ids = [] + for s in g["significantMpTerms"]: + names.append(s["mpTermName"]) + ids.append(s["mpTermId"]) + d = {"Gene": g["mgiAccessionId"], "mpTermId": ids, "mpTermName": names} + list_d.append(d) + + g = [] + ids = [] + names = [] + for i in list_d: + g.append(i["Gene"]) + ids.append(i["mpTermId"]) + names.append(i["mpTermName"]) + + df = pd.DataFrame() + if g_out == "sym": + df["Gene symbol"] = mgi_sym_map(g) + else: + df["Gene Id"] = g + df["Significant MP terms Ids"] = ids + df["Significant MP terms Names"] = names + + if head == "True": + df.to_csv(sys.argv[2], header=True, index=False, + sep="\t", index_label=False) + else: + df.to_csv(sys.argv[2], header=False, index=False, + sep="\t", index_label=False) + + +# 16- Full table of genes and all identified phenotypes +def full_gene_table(g_out): + head = sys.argv[4] + gene_list = requests.get(impc_api_search_url + "?page=0&size=1000").json() + pages = gene_list["page"]["totalPages"] + genes_info = 
gene_list["_embedded"]["genes"] + + for pn in range(1, pages): + gp = requests.get(impc_api_search_url + + f"?page={pn}&" + f"size=1000").json()["_embedded"]["genes"] + genes_info += gp + + d = {} + list_d = [] + + for i in genes_info: + if i["significantMpTerms"] is None: + d = {"Gene": i["mgiAccessionId"], "Identified phenotypes": "None"} + else: + d = {"Gene": i["mgiAccessionId"], + "Identified phenotypes": [ + sub["mpTermId"] for sub in i["significantMpTerms"] + ]} + list_d.append(d) + + df = pd.DataFrame() + g = [] + p = [] + for i in list_d: + g.append(i["Gene"]) + p.append(i["Identified phenotypes"]) + + if g_out == "sym": + df["Gene symbol"] = mgi_sym_map(g) + else: + df["MGI id"] = g + df["MP term list"] = p + + for i in range(0, len(df)): + if df["MP term list"][i] != "None": + df["MP term list"][i] = str( + df["MP term list"][i] + )[1:-1].replace("'", "") + + if str(sys.argv[1]) == "True": + if head == "True": + df.to_csv(sys.argv[2], header=True, index=False, + sep="\t", index_label=False) + else: + df.to_csv(sys.argv[2], header=False, index=False, + sep="\t", index_label=False) + else: + df = df[df["MP term list"] != "None"] + df.reset_index(drop=True, inplace=True) + if head == "True": + df.to_csv(sys.argv[2], header=True, index=False, + sep="\t", index_label=False) + else: + df.to_csv(sys.argv[2], header=False, index=False, + sep="\t", index_label=False) + + +# 18- Extract measurements and analysis for a parameter or pipeline +def par_pip_ma(inp): + head = sys.argv[4] + id = inp + + if id[0:4] == "IMPC": + par = True + ma_query = f"{impc_api_search_url}/search/" \ + f"findAllByTestedParameterId?" \ + f"parameterId={id}&page=0&size=1000" + else: + ma_query = f"{impc_api_search_url}/search/" \ + f"findAllByTestedPipelineId?" 
\ + f"pipelineId={id}&page=0&size=1000" + par = False + + ma_in_pip = requests.get(ma_query).json() + pages = ma_in_pip["page"]["totalPages"] + max_elem = ma_in_pip["page"]["totalElements"] + + print(f"Genes with {id}: {ma_in_pip['page']['totalElements']}") + list_d = [] + list_of_genes = pd.DataFrame(columns=["Measurements", "Analysis"]) + mes = [] + an = [] + + if max_elem > 1000: + + ma_in_pip = ma_in_pip["_embedded"]["genes"] + for pn in range(1, pages): + if par: + pip = requests.get(f"{impc_api_search_url}/search/" + f"findAllByTestedParameterId?" + f"parameterId={id}&page={pn}&" + f"size=1000").json()["_embedded"]["genes"] + else: + pip = requests.get(f"{impc_api_search_url}/search/" + f"findAllByTestedPipelineId?" + f"pipelineId={id}&page={pn}&" + f"size=1000").json()["_embedded"]["genes"] + ma_in_pip += pip + + else: + ma_in_pip = ma_in_pip["_embedded"]["genes"] + + for g in ma_in_pip: + d = {"Measurements": g[""], "Analysis": g[""]} + list_d.append(d) + + for i in list_d: + mes.append(i[""]) + an.append(i[""]) + + list_of_genes["Analysis"] = an + list_of_genes["Measurements"] = mes + + if head == "True": + list_of_genes.to_csv(sys.argv[2], header=True, index=False, + sep="\t", index_label=False) + else: + list_of_genes.to_csv(sys.argv[2], header=False, index=False, + sep="\t", index_label=False) + + +# 19- Get all genes and measured values for a particular parameter +def par_gen(inp, g_out): + head = sys.argv[4] + id = inp + + pa_query = f"{impc_api_search_url}/search/" \ + f"findAllByTestedParameterId?parameterId={id}&page=0&size=1000" + + gm_par = requests.get(pa_query).json() + pages = gm_par["page"]["totalPages"] + max_elem = gm_par["page"]["totalElements"] + + print(f"Genes with {id}: {gm_par['page']['totalElements']}") + list_d = [] + gen = [] + mes = [] + + if max_elem > 1000: + + gm_par = gm_par["_embedded"]["genes"] + + for pn in range(1, pages): + pip = requests.get(f"{impc_api_search_url}/search/" + f"findAllByTestedParameterId?" 
+ f"parameterId={id}&page={pn}&" + f"size=1000").json()["_embedded"]["genes"] + gm_par += pip + + else: + gm_par = gm_par["_embedded"]["genes"] + + for g in gm_par: + d = {"Genes": g["mgiAccessionId"], "Measured Values": g[""]} + list_d.append(d) + + for i in list_d: + gen.append(i["Genes"]) + mes.append(i["Measured Values"]) + + if g_out == "sym": + list_of_genes = pd.DataFrame(columns=["Gene symbol", + "Measured Values"]) + list_of_genes["Gene symbol"] = mgi_sym_map(gen) + else: + list_of_genes = pd.DataFrame(columns=["Gene accession id", + "Measured Values"]) + list_of_genes["Gene accession id"] = gen + list_of_genes["Measured Values"] = mes + + if head == "True": + list_of_genes.to_csv(sys.argv[2], header=True, index=False, + sep="\t", index_label=False) + else: + list_of_genes.to_csv(sys.argv[2], header=False, index=False, + sep="\t", index_label=False) + + +# Function to map gene symbol to MGI ids +def gene_mapping(inp): + tmp = inp.split(",") + final_list = [] + sym_list = [] + for i in tmp: + if "MGI:" in i: + final_list.append(i) + else: + sym_list.append(i) + del i + + # symbol for symbols, mgi for MGI : + # https://docs.mygene.info/en/latest/doc/query_service.html#available-fields + if len(sym_list) != 0: + mg = mygene.MyGeneInfo() + ginfo = mg.querymany(sym_list, scopes="symbol", fields="symbol,MGI", + species="mouse") + empty = True + discarded = [] + for i in ginfo: + try: + final_list.append(i["MGI"]) + empty = False + except KeyError: + discarded.append(i["query"]) + if empty and len(final_list) == 0: + stop_err("Error: it was not possible to map the input.") + elif empty: + print("Warning: it was not possible to map any of the symbol ids. 
" + "Only MGI ids will be used.") + elif len(discarded) != 0: + print("Warning: it was not possible to map these elements: " + "" + ",".join(discarded) + "\n") + + return final_list + + +# Function to map phenotypes ids to names +def pheno_mapping(inp): + tmp = inp.split(",") + final_list = [] + sym_list = [] + for i in tmp: + if "MP:" in i: + final_list.append(i) + else: + sym_list.append(i) + del i + if len(sym_list) != 0: + url = "https://raw.githubusercontent.com/AndreaFurlani/" \ + "hp_mp_mapping_test/main/hp_mp_mapping.csv" + mapper = pd.read_csv(url, header=0, index_col=2) + empty = True + discarded = [] + for i in sym_list: + try: + final_list.append(mapper.loc[i]["mpId"]) + empty = False + except KeyError: + discarded.append(i) + continue + if empty and len(final_list) == 0: + stop_err("Error: it was not possible to map the input.") + elif empty: + print("Warning: it was not possible to map any of the " + "HP term entries. Only MP entries will be used.") + elif len(discarded) != 0: + print("Warning: it was not possible to " + "map these elements: " + ",".join(discarded) + "\n") + return final_list + + +# Function to map MGI ids to Gene Symbols +def mgi_sym_map(mgi_list): + sym_list = [] + mg = mygene.MyGeneInfo() + ginfo = mg.querymany(mgi_list, scopes="MGI", fields="symbol,MGI", + species="mouse") + discarded = [] + for i in ginfo: + try: + sym_list.append(i["symbol"]) + except KeyError: + sym_list.append(i["query"]) + discarded.append(i["query"]) + if len(discarded) != 0: + print("It was not possible to map these genes: " + ",".join(discarded)) + return sym_list + + +if __name__ == "__main__": + main() diff --git a/tools/query_impc/impc_tool.xml b/tools/query_impc/impc_tool.xml new file mode 100644 index 00000000000..9d825334e0a --- /dev/null +++ b/tools/query_impc/impc_tool.xml @@ -0,0 +1,351 @@ + + query tool + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + requests + pandas + lxml + mygene + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + https://doi.org/10.1093/nar/gku1193 + https://doi.org/10.12688/f1000research.25369.1 + https://doi.org/10.1038/nature19356 + + \ No newline at end of file diff --git a/tools/query_impc/test-data/test_output_1_1.tabular b/tools/query_impc/test-data/test_output_1_1.tabular new file mode 100644 index 00000000000..b82250a17e8 --- /dev/null +++ b/tools/query_impc/test-data/test_output_1_1.tabular @@ -0,0 +1,10 @@ +MP term name MP term id +MP:0002135 abnormal kidney morphology +MP:0000194 increased circulating calcium level +MP:0002574 increased vertical activity +MP:0005633 increased circulating sodium level +MP:0001303 abnormal lens morphology +MP:0002965 increased circulating serum albumin level +MP:0001304 cataract +MP:0010052 increased grip strength +MP:0001402 decreased locomotor activity diff --git a/tools/query_impc/test-data/test_output_1_2.tabular b/tools/query_impc/test-data/test_output_1_2.tabular new file mode 100644 index 00000000000..4a3aadca5b4 --- /dev/null +++ b/tools/query_impc/test-data/test_output_1_2.tabular @@ -0,0 +1,5 @@ +MP term name MP term id +MP:0000194 increased circulating calcium level +MP:0011110 preweaning lethality, incomplete penetrance +MP:0001303 abnormal lens morphology +MP:0010053 decreased grip strength diff --git a/tools/query_impc/test-data/test_output_2.tabular b/tools/query_impc/test-data/test_output_2.tabular new file mode 100644 index 00000000000..861646a48bf --- /dev/null +++ b/tools/query_impc/test-data/test_output_2.tabular @@ -0,0 +1,21 @@ +Gene accession id Gene name Gene bundle url +MGI:1345144 sprouty RTK signaling antagonist 4 
https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:1345144 +MGI:2670964 terminal nucleotidyltransferase 5A https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:2670964 +MGI:95490 fibrillin 2 https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:95490 +MGI:95689 growth differentiation factor 6 https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:95689 +MGI:1341886 ajuba LIM protein https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:1341886 +MGI:1347352 hormonally upregulated Neu-associated kinase https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:1347352 +MGI:109331 nucleoredoxin https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:109331 +MGI:1914061 dual oxidase maturation factor 2 https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:1914061 +MGI:1915958 RAB, member RAS oncogene family-like 2 https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:1915958 +MGI:1917363 ciliary microtubule associated protein 1B https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:1917363 +MGI:1920858 MARVEL (membrane-associating) domain containing 3 https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:1920858 +MGI:106576 chondroitin polymerizing factor https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:106576 +MGI:107185 chaperonin containing Tcp1, subunit 5 (epsilon) https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:107185 +MGI:1931881 DnaJ heat shock protein family (Hsp40) member B12 https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:1931881 +MGI:109327 BCL2/adenovirus E1B interacting protein 2 https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:109327 +MGI:1913955 deoxyribonuclease 1-like 2 https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:1913955 +MGI:107374 paired-like homeodomain transcription factor 1 https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:107374 +MGI:1335088 proline-serine-threonine phosphatase-interacting protein 2 
https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:1335088 +MGI:95688 growth differentiation factor 5 https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:95688 +MGI:107474 CD38 antigen https://www.ebi.ac.uk/mi/impc/bulkdata-api/geneBundles/MGI:107474 diff --git a/tools/query_impc/test-data/test_output_3.tabular b/tools/query_impc/test-data/test_output_3.tabular new file mode 100644 index 00000000000..0db14cd3fdc --- /dev/null +++ b/tools/query_impc/test-data/test_output_3.tabular @@ -0,0 +1,79 @@ +MP:0002764 short tibia MGI:99960,MGI:108071 +MP:0001785 edema MGI:99960 +MP:0002968 increased circulating alkaline phosphatase level MGI:99960 +MPATH:590 fibro-osseous lesion MGI:99960 +MP:0001399 hyperactivity MGI:99960,MGI:1354170 +MP:0011100 preweaning lethality, complete penetrance MGI:99960,MGI:1344380,MGI:1917473 +MP:0010052 increased grip strength MGI:99960,MGI:96709 +MPATH:134 hyperplasia MGI:99960 +MP:0000218 increased leukocyte cell number MGI:99960,MGI:96709 +MP:0005013 increased lymphocyte cell number MGI:99960 +MP:0001363 increased anxiety-related response MGI:1354170 +MP:0001258 decreased body length MGI:1354170,MGI:108071,MGI:1915775,MGI:2443026 +MP:0003795 abnormal bone structure MGI:1354170 +MP:0001417 decreased exploration in new environment MGI:1354170,MGI:96709 +MP:0002797 increased thigmotaxis MGI:1354170 +MP:0002757 decreased vertical activity MGI:1354170 +MP:0011960 abnormal eye anterior chamber depth MGI:1354170 +MP:0010124 decreased bone mineral content MGI:1354170 +MP:0001402 decreased locomotor activity MGI:1354170 +MP:0004924 abnormal behavior MGI:1354170,MGI:96709 +MP:0013279 increased fasting circulating glucose level MGI:99502,MGI:1860418,MGI:103225 +MP:0005333 decreased heart rate MGI:3616082 +MP:0001406 abnormal gait MGI:96709 +MP:0010053 decreased grip strength MGI:96709,MGI:1924093,MGI:1915775 +MP:0001523 impaired righting response MGI:96709 +MP:0005559 increased circulating glucose level MGI:96709 +MP:0000745 tremors 
MGI:96709 +MPATH:52 lipid depletion MGI:1913564 +MPATH:42 lipid deposition MGI:1913564 +MP:0005419 decreased circulating serum albumin level MGI:1860418 +MP:0000219 increased neutrophil cell number MGI:1860418 +MP:0005567 decreased circulating total protein level MGI:1860418,MGI:1915775 +MP:0008810 increased circulating iron level MGI:1914361 +MP:0002875 decreased erythrocyte cell number MGI:1914361 +MP:0000208 decreased hematocrit MGI:1914361 +MP:0002874 decreased hemoglobin content MGI:1914361 +MP:0005566 decreased blood urea nitrogen level MGI:103225,MGI:1915775 +MP:0005343 increased circulating aspartate transaminase level MGI:103225 +MP:0011954 shortened PQ interval MGI:103225 +MP:0005344 increased circulating bilirubin level MGI:103225,MGI:95479 +MP:0002644 decreased circulating triglyceride level MGI:103225 +MP:0001415 increased exploration in new environment MGI:103225 +MP:0010511 shortened PR interval MGI:103225 +MP:0002574 increased vertical activity MGI:1915291 +MP:0003917 increased kidney weight MGI:1915291 +MP:0013292 embryonic lethality prior to organogenesis MGI:1344380 +MP:0000221 decreased leukocyte cell number MGI:95479 +MP:0005016 decreased lymphocyte cell number MGI:95479 +MP:0012361 decreased large unstained cell number MGI:95479 +MP:0001146 abnormal testis morphology MGI:2443598 +MP:0002152 abnormal brain morphology MGI:2443598 +MPATH:127 atrophy MGI:2443598 +MPATH:639 hydrocephalus MGI:2443598 +MP:0001925 male infertility MGI:2443598 +MP:0002092 abnormal eye morphology MGI:2443598 +MP:0005238 increased brain size MGI:2443598 +MP:0001147 small testis MGI:2443598 +MP:0000598 abnormal liver morphology MGI:2441730 +MP:0002833 increased heart weight MGI:2441730 +MP:0011110 preweaning lethality, incomplete penetrance MGI:2441730,MGI:1915775,MGI:2443026 +MP:0004738 abnormal auditory brainstem response MGI:2441730 +MP:0000599 enlarged liver MGI:2441730 +MP:0009476 enlarged cecum MGI:2441730 +MP:0005565 increased blood urea nitrogen level MGI:2441730 
+MP:0001284 absent vibrissae MGI:2441730 +MP:0004832 enlarged ovary MGI:2441730 +MP:0005084 abnormal gallbladder morphology MGI:1915775 +MP:0000274 enlarged heart MGI:1915775 +MP:0009142 decreased prepulse inhibition MGI:1915775 +MP:0000692 small spleen MGI:1915775 +MP:0030610 absent teeth MGI:1915775 +MP:0001325 abnormal retina morphology MGI:1915775 +MP:0000266 abnormal heart morphology MGI:1915775 +MPATH:64 developmental dysplasia MGI:1915775 +MP:0000494 abnormal cecum morphology MGI:1915775 +MP:0001120 abnormal uterus morphology MGI:1915775 +MP:0000689 abnormal spleen morphology MGI:1915775 +MP:0009709 hydrometra MGI:1915775 +MP:0002060 abnormal skin morphology MGI:1915775 diff --git a/tools/query_impc/test-data/test_output_9.tabular b/tools/query_impc/test-data/test_output_9.tabular new file mode 100644 index 00000000000..a5024bb6f9b --- /dev/null +++ b/tools/query_impc/test-data/test_output_9.tabular @@ -0,0 +1,4 @@ +Gene symbol Significant MP terms Ids Significant MP terms Names +Cacna1s ['MP:0001697', 'MP:0001785', 'MP:0003231', 'MP:0005388', 'MP:0001491', 'MP:0001575', 'MP:0003743', 'MP:0001914', 'MP:0011100', 'MP:0005560'] ['abnormal embryo size', 'edema', 'abnormal placenta vasculature', 'respiratory system phenotype', 'unresponsive to tactile stimuli', 'cyanosis', 'abnormal facial morphology', 'hemorrhage', 'preweaning lethality, complete penetrance', 'decreased circulating glucose level'] +Ndel1 ['MP:0001697', 'MP:0003984', 'MP:0002111', 'MP:0005388', 'MP:0011100'] ['abnormal embryo size', 'embryonic growth retardation', 'abnormal tail morphology', 'respiratory system phenotype', 'preweaning lethality, complete penetrance'] +Zfp536 ['MP:0003019', 'MP:0005564', 'MP:0005388', 'MP:0001575', 'MP:0001399', 'MP:0011100', 'MP:0005641'] ['increased circulating chloride level', 'increased hemoglobin content', 'respiratory system phenotype', 'cyanosis', 'hyperactivity', 'preweaning lethality, complete penetrance', 'increased mean corpuscular hemoglobin 
concentration'] diff --git a/tools/query_impc/test-data/test_query_1.txt b/tools/query_impc/test-data/test_query_1.txt new file mode 100644 index 00000000000..30dbc4efa3d --- /dev/null +++ b/tools/query_impc/test-data/test_query_1.txt @@ -0,0 +1 @@ +MGI:1923523 \ No newline at end of file diff --git a/tools/query_impc/test-data/test_query_2.txt b/tools/query_impc/test-data/test_query_2.txt new file mode 100644 index 00000000000..fd3d667ae3b --- /dev/null +++ b/tools/query_impc/test-data/test_query_2.txt @@ -0,0 +1 @@ +MP:0002110 MP:0000559 \ No newline at end of file diff --git a/tools/query_impc/test-data/test_query_3.txt b/tools/query_impc/test-data/test_query_3.txt new file mode 100644 index 00000000000..04e762f8bb6 --- /dev/null +++ b/tools/query_impc/test-data/test_query_3.txt @@ -0,0 +1 @@ +MGI:1913564 MGI:1915291 MGI:1914361 MGI:1915775 MGI:1354170 MGI:103225 MGI:2441730 MGI:108071 MGI:2443598 MGI:106643 MGI:1917473 MGI:1338073 MGI:1924093 MGI:99960 MGI:99502 MGI:95479 MGI:1344380 MGI:1860418 MGI:1354721 MGI:3616082 MGI:96709 MGI:2443026 \ No newline at end of file