diff --git a/nbs/99_manuscript/supplementary_files/01-supplementary_file3.ipynb b/nbs/99_manuscript/supplementary_files/01-supplementary_file3.ipynb new file mode 100644 index 00000000..8bdf72dc --- /dev/null +++ b/nbs/99_manuscript/supplementary_files/01-supplementary_file3.ipynb @@ -0,0 +1,3646 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "87e0ce1b-7ce6-4499-9342-5ded05307598", + "metadata": { + "papermill": { + "duration": 0.012027, + "end_time": "2024-01-05T18:03:25.401599", + "exception": false, + "start_time": "2024-01-05T18:03:25.389572", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Description" + ] + }, + { + "cell_type": "markdown", + "id": "c9e288bf-415c-44b7-84a4-7999c21af777", + "metadata": { + "papermill": { + "duration": 0.010328, + "end_time": "2024-01-05T18:03:25.422531", + "exception": false, + "start_time": "2024-01-05T18:03:25.412203", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Creates **Supplementary File 3**.\n", + "\n", + "*Description*: Correlations and p-values of a subset of gene pairs across all tissues in GTEx v8." 
+ ] + }, + { + "cell_type": "markdown", + "id": "e020c781-238b-43c2-8cad-2722b8a240e2", + "metadata": { + "papermill": { + "duration": 0.010284, + "end_time": "2024-01-05T18:03:25.443447", + "exception": false, + "start_time": "2024-01-05T18:03:25.433163", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Modules" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "a819bfbc-5009-4c68-ba8d-37d0979d368f", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:25.465283Z", + "iopub.status.busy": "2024-01-05T18:03:25.465089Z", + "iopub.status.idle": "2024-01-05T18:03:25.861250Z", + "shell.execute_reply": "2024-01-05T18:03:25.860761Z" + }, + "papermill": { + "duration": 0.409123, + "end_time": "2024-01-05T18:03:25.862997", + "exception": false, + "start_time": "2024-01-05T18:03:25.453874", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import rpy2.robjects as ro\n", + "from rpy2.robjects import pandas2ri\n", + "from rpy2.robjects.conversion import localconverter\n", + "\n", + "from ccc import conf" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "860ff6ef-dba1-4647-b77d-ca289cffa36b", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:25.885393Z", + "iopub.status.busy": "2024-01-05T18:03:25.885293Z", + "iopub.status.idle": "2024-01-05T18:03:25.888526Z", + "shell.execute_reply": "2024-01-05T18:03:25.888117Z" + }, + "papermill": { + "duration": 0.015781, + "end_time": "2024-01-05T18:03:25.889707", + "exception": false, + "start_time": "2024-01-05T18:03:25.873926", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "readRDS = ro.r[\"readRDS\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "8b82561e-ea69-4e57-ac22-8c026b34b022", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:25.911306Z", + "iopub.status.busy": 
"2024-01-05T18:03:25.911204Z", + "iopub.status.idle": "2024-01-05T18:03:25.913877Z", + "shell.execute_reply": "2024-01-05T18:03:25.913523Z" + }, + "papermill": { + "duration": 0.014723, + "end_time": "2024-01-05T18:03:25.915052", + "exception": false, + "start_time": "2024-01-05T18:03:25.900329", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "saveRDS = ro.r[\"saveRDS\"]" + ] + }, + { + "cell_type": "markdown", + "id": "b4834387-58ff-468c-b326-85c408bc5feb", + "metadata": { + "papermill": { + "duration": 0.010406, + "end_time": "2024-01-05T18:03:25.936020", + "exception": false, + "start_time": "2024-01-05T18:03:25.925614", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Settings" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "765e820a-4518-4bbc-a00f-14c9cea03821", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:25.955038Z", + "iopub.status.busy": "2024-01-05T18:03:25.954953Z", + "iopub.status.idle": "2024-01-05T18:03:25.957323Z", + "shell.execute_reply": "2024-01-05T18:03:25.956876Z" + }, + "papermill": { + "duration": 0.011454, + "end_time": "2024-01-05T18:03:25.958066", + "exception": false, + "start_time": "2024-01-05T18:03:25.946612", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "DATASET_CONFIG = conf.GTEX" + ] + }, + { + "cell_type": "markdown", + "id": "a1159982-5dd1-4494-97d4-0674eeead1c3", + "metadata": { + "papermill": { + "duration": 0.005497, + "end_time": "2024-01-05T18:03:25.969129", + "exception": false, + "start_time": "2024-01-05T18:03:25.963632", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Paths" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "d50e29a6-0e18-4c3d-b933-f21a17abf831", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:25.981083Z", + "iopub.status.busy": "2024-01-05T18:03:25.980717Z", + "iopub.status.idle": 
"2024-01-05T18:03:25.983834Z", + "shell.execute_reply": "2024-01-05T18:03:25.983368Z" + }, + "papermill": { + "duration": 0.009927, + "end_time": "2024-01-05T18:03:25.984572", + "exception": false, + "start_time": "2024-01-05T18:03:25.974645", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "assert (\n", + " conf.MANUSCRIPT[\"BASE_DIR\"] is not None and conf.MANUSCRIPT[\"BASE_DIR\"].exists()\n", + "), \"Manuscript dir not set\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "30cce6f5-ca1b-438c-859d-31903a42d4c6", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:25.996739Z", + "iopub.status.busy": "2024-01-05T18:03:25.996223Z", + "iopub.status.idle": "2024-01-05T18:03:26.005998Z", + "shell.execute_reply": "2024-01-05T18:03:26.005538Z" + }, + "papermill": { + "duration": 0.016652, + "end_time": "2024-01-05T18:03:26.006837", + "exception": false, + "start_time": "2024-01-05T18:03:25.990185", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "PosixPath('/opt/data/results/gtex_v8/other_tissues')" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "INPUT_DIR = conf.GTEX[\"RESULTS_DIR\"] / \"other_tissues\"\n", + "display(INPUT_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "523c6a4e-87ec-44bf-bad3-e912b8aa0482", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.038997Z", + "iopub.status.busy": "2024-01-05T18:03:26.038842Z", + "iopub.status.idle": "2024-01-05T18:03:26.043285Z", + "shell.execute_reply": "2024-01-05T18:03:26.042771Z" + }, + "papermill": { + "duration": 0.011579, + "end_time": "2024-01-05T18:03:26.044125", + "exception": false, + "start_time": "2024-01-05T18:03:26.032546", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "PosixPath('/opt/data/supplementary_material')" + ] + }, + "metadata": {}, + 
"output_type": "display_data" + } + ], + "source": [ + "OUTPUT_DIR = conf.MANUSCRIPT[\"SUPPLEMENTARY_MATERIAL_DIR\"]\n", + "OUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n", + "display(OUTPUT_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "8fefcf66-4c68-4086-bf23-7f52c6fd2291", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.056526Z", + "iopub.status.busy": "2024-01-05T18:03:26.056140Z", + "iopub.status.idle": "2024-01-05T18:03:26.059042Z", + "shell.execute_reply": "2024-01-05T18:03:26.058533Z" + }, + "papermill": { + "duration": 0.010002, + "end_time": "2024-01-05T18:03:26.059885", + "exception": false, + "start_time": "2024-01-05T18:03:26.049883", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "OUTPUT_FILENAME = \"Supplementary_File_03-Gene_pairs_correlations_all_GTEx_tissues\"" + ] + }, + { + "cell_type": "markdown", + "id": "486ca2a1-d8ec-4be4-a208-1315c28d566c", + "metadata": { + "papermill": { + "duration": 0.005708, + "end_time": "2024-01-05T18:03:26.071424", + "exception": false, + "start_time": "2024-01-05T18:03:26.065716", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Data" + ] + }, + { + "cell_type": "markdown", + "id": "cecd38b0-6fe7-4681-af63-c9ba26747767", + "metadata": { + "papermill": { + "duration": 0.00568, + "end_time": "2024-01-05T18:03:26.082873", + "exception": false, + "start_time": "2024-01-05T18:03:26.077193", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Gene Ensembl ID -> Symbol mapping" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "29cbb445-2dfb-42f1-a56c-6595db019c7a", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.095274Z", + "iopub.status.busy": "2024-01-05T18:03:26.094858Z", + "iopub.status.idle": "2024-01-05T18:03:26.116318Z", + "shell.execute_reply": "2024-01-05T18:03:26.115754Z" + }, + "papermill": { + "duration": 0.028656, + "end_time": 
"2024-01-05T18:03:26.117267", + "exception": false, + "start_time": "2024-01-05T18:03:26.088611", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "gene_map = pd.read_pickle(\n", + " DATASET_CONFIG[\"DATA_DIR\"] / \"gtex_gene_id_symbol_mappings.pkl\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "47fdd6e9-3775-4bc3-b65c-e368793a6f33", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.129769Z", + "iopub.status.busy": "2024-01-05T18:03:26.129377Z", + "iopub.status.idle": "2024-01-05T18:03:26.184510Z", + "shell.execute_reply": "2024-01-05T18:03:26.183933Z" + }, + "papermill": { + "duration": 0.062423, + "end_time": "2024-01-05T18:03:26.185468", + "exception": false, + "start_time": "2024-01-05T18:03:26.123045", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "gene_map = gene_map.set_index(\"gene_ens_id\")[\"gene_symbol\"].to_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "b8962711-87b0-4664-9bfe-76e48d49b15d", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.197726Z", + "iopub.status.busy": "2024-01-05T18:03:26.197561Z", + "iopub.status.idle": "2024-01-05T18:03:26.200750Z", + "shell.execute_reply": "2024-01-05T18:03:26.200196Z" + }, + "papermill": { + "duration": 0.010361, + "end_time": "2024-01-05T18:03:26.201584", + "exception": false, + "start_time": "2024-01-05T18:03:26.191223", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "assert gene_map[\"ENSG00000145309.5\"] == \"CABS1\"" + ] + }, + { + "cell_type": "markdown", + "id": "3893f4f3-f0c5-464a-ab43-a5d0de0f2ea8", + "metadata": { + "papermill": { + "duration": 0.007364, + "end_time": "2024-01-05T18:03:26.215533", + "exception": false, + "start_time": "2024-01-05T18:03:26.208169", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# List of dataframes to combine" + ] + }, + { + 
"cell_type": "code", + "execution_count": 12, + "id": "dfcf7486-20bc-4144-9b19-b872a35fadaf", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.229029Z", + "iopub.status.busy": "2024-01-05T18:03:26.228502Z", + "iopub.status.idle": "2024-01-05T18:03:26.231189Z", + "shell.execute_reply": "2024-01-05T18:03:26.230787Z" + }, + "papermill": { + "duration": 0.010211, + "end_time": "2024-01-05T18:03:26.231974", + "exception": false, + "start_time": "2024-01-05T18:03:26.221763", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "df_list = []" + ] + }, + { + "cell_type": "markdown", + "id": "5b2f0f21-17c4-4046-8a71-74a97c3124ce", + "metadata": { + "papermill": { + "duration": 0.005796, + "end_time": "2024-01-05T18:03:26.243771", + "exception": false, + "start_time": "2024-01-05T18:03:26.237975", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# KDM6A - UTY" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "e7d8b188-ddbc-4bca-9b9a-89863a0e8c47", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.256336Z", + "iopub.status.busy": "2024-01-05T18:03:26.256055Z", + "iopub.status.idle": "2024-01-05T18:03:26.258893Z", + "shell.execute_reply": "2024-01-05T18:03:26.258496Z" + }, + "papermill": { + "duration": 0.009949, + "end_time": "2024-01-05T18:03:26.259663", + "exception": false, + "start_time": "2024-01-05T18:03:26.249714", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "gene0_id, gene1_id = \"ENSG00000147050.14\", \"ENSG00000183878.15\"\n", + "gene0_symbol, gene1_symbol = \"KDM6A\", \"UTY\"\n", + "\n", + "assert gene_map[gene0_id] == gene0_symbol\n", + "assert gene_map[gene1_id] == gene1_symbol" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "0ad4ef1b-7c1c-4720-ab36-456aa5d23b0a", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.272208Z", + "iopub.status.busy": 
"2024-01-05T18:03:26.271939Z", + "iopub.status.idle": "2024-01-05T18:03:26.275241Z", + "shell.execute_reply": "2024-01-05T18:03:26.274865Z" + }, + "papermill": { + "duration": 0.010394, + "end_time": "2024-01-05T18:03:26.276008", + "exception": false, + "start_time": "2024-01-05T18:03:26.265614", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "PosixPath('/opt/data/results/gtex_v8/other_tissues/kdm6a_vs_uty')" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "GENE_PAIR_INPUT_DIR = INPUT_DIR / f\"{gene0_symbol.lower()}_vs_{gene1_symbol.lower()}\"\n", + "display(GENE_PAIR_INPUT_DIR)" + ] + }, + { + "cell_type": "markdown", + "id": "2574db4a-dc94-4f8a-89b0-2229b76956d8", + "metadata": { + "papermill": { + "duration": 0.005871, + "end_time": "2024-01-05T18:03:26.287887", + "exception": false, + "start_time": "2024-01-05T18:03:26.282016", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Correlation values" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "a91a6c4e-4ecd-4bf7-bd25-e3e354d2276a", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.300630Z", + "iopub.status.busy": "2024-01-05T18:03:26.300289Z", + "iopub.status.idle": "2024-01-05T18:03:26.304076Z", + "shell.execute_reply": "2024-01-05T18:03:26.303673Z" + }, + "papermill": { + "duration": 0.010874, + "end_time": "2024-01-05T18:03:26.304806", + "exception": false, + "start_time": "2024-01-05T18:03:26.293932", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "res_all = pd.read_pickle(GENE_PAIR_INPUT_DIR / \"coef_values.pkl\").rename(\n", + " columns={\n", + " \"cm\": \"ccc_coef\",\n", + " \"pearson\": \"pearson_coef\",\n", + " \"spearman\": \"spearman_coef\",\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "1425c2ab-e665-429c-a9c8-50b585fc221e", + "metadata": { + "execution": { + 
"iopub.execute_input": "2024-01-05T18:03:26.317421Z", + "iopub.status.busy": "2024-01-05T18:03:26.317166Z", + "iopub.status.idle": "2024-01-05T18:03:26.320277Z", + "shell.execute_reply": "2024-01-05T18:03:26.319880Z" + }, + "papermill": { + "duration": 0.010262, + "end_time": "2024-01-05T18:03:26.321074", + "exception": false, + "start_time": "2024-01-05T18:03:26.310812", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(50, 3)" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "res_all.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "ee21a161-eaae-4242-904f-82f373616064", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.333956Z", + "iopub.status.busy": "2024-01-05T18:03:26.333608Z", + "iopub.status.idle": "2024-01-05T18:03:26.341071Z", + "shell.execute_reply": "2024-01-05T18:03:26.340670Z" + }, + "papermill": { + "duration": 0.014695, + "end_time": "2024-01-05T18:03:26.341853", + "exception": false, + "start_time": "2024-01-05T18:03:26.327158", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ccc_coefpearson_coefspearman_coef
colon_transverse0.336727-0.517899-0.408343
brain_amygdala0.2805240.0375410.147571
artery_coronary0.274554-0.413862-0.391764
artery_aorta0.429771-0.485788-0.363510
adrenal_gland0.260197-0.459190-0.354190
\n", + "
" + ], + "text/plain": [ + " ccc_coef pearson_coef spearman_coef\n", + "colon_transverse 0.336727 -0.517899 -0.408343\n", + "brain_amygdala 0.280524 0.037541 0.147571\n", + "artery_coronary 0.274554 -0.413862 -0.391764\n", + "artery_aorta 0.429771 -0.485788 -0.363510\n", + "adrenal_gland 0.260197 -0.459190 -0.354190" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "res_all.head()" + ] + }, + { + "cell_type": "markdown", + "id": "f55c1996-9eaa-48c6-a376-8b9255971af0", + "metadata": { + "papermill": { + "duration": 0.006081, + "end_time": "2024-01-05T18:03:26.354171", + "exception": false, + "start_time": "2024-01-05T18:03:26.348090", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## P-values" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "58ef1571-13a9-4406-a991-fc2703f74095", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.366973Z", + "iopub.status.busy": "2024-01-05T18:03:26.366848Z", + "iopub.status.idle": "2024-01-05T18:03:26.370381Z", + "shell.execute_reply": "2024-01-05T18:03:26.369964Z" + }, + "papermill": { + "duration": 0.010868, + "end_time": "2024-01-05T18:03:26.371159", + "exception": false, + "start_time": "2024-01-05T18:03:26.360291", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "res_pval_all = pd.read_pickle(GENE_PAIR_INPUT_DIR / \"coef_pvalues.pkl\").rename(\n", + " columns={\n", + " \"cm\": \"ccc_pvalue\",\n", + " \"pearson\": \"pearson_pvalue\",\n", + " \"spearman\": \"spearman_pvalue\",\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "aa2385b5-9914-424b-8d47-cc45a323e83c", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.384306Z", + "iopub.status.busy": "2024-01-05T18:03:26.383996Z", + "iopub.status.idle": "2024-01-05T18:03:26.387279Z", + "shell.execute_reply": "2024-01-05T18:03:26.386885Z" + }, + "papermill": { 
+ "duration": 0.010673, + "end_time": "2024-01-05T18:03:26.388046", + "exception": false, + "start_time": "2024-01-05T18:03:26.377373", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(50, 3)" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "res_pval_all.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "734fc9d9-17d4-422f-9b1d-45dbe1a76263", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.401353Z", + "iopub.status.busy": "2024-01-05T18:03:26.401042Z", + "iopub.status.idle": "2024-01-05T18:03:26.407260Z", + "shell.execute_reply": "2024-01-05T18:03:26.406902Z" + }, + "papermill": { + "duration": 0.013613, + "end_time": "2024-01-05T18:03:26.407996", + "exception": false, + "start_time": "2024-01-05T18:03:26.394383", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ccc_pvaluepearson_pvaluespearman_pvalue
colon_transverse9.999990e-073.063714e-299.539164e-18
brain_amygdala9.999990e-076.461089e-016.963023e-02
artery_coronary9.999990e-072.389970e-113.159321e-10
artery_aorta9.999990e-075.775754e-276.092383e-15
adrenal_gland9.999990e-077.334489e-154.847677e-09
\n", + "
" + ], + "text/plain": [ + " ccc_pvalue pearson_pvalue spearman_pvalue\n", + "colon_transverse 9.999990e-07 3.063714e-29 9.539164e-18\n", + "brain_amygdala 9.999990e-07 6.461089e-01 6.963023e-02\n", + "artery_coronary 9.999990e-07 2.389970e-11 3.159321e-10\n", + "artery_aorta 9.999990e-07 5.775754e-27 6.092383e-15\n", + "adrenal_gland 9.999990e-07 7.334489e-15 4.847677e-09" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "res_pval_all.head()" + ] + }, + { + "cell_type": "markdown", + "id": "47367871-fc3c-463e-b2b8-d6dce217bd23", + "metadata": { + "papermill": { + "duration": 0.006276, + "end_time": "2024-01-05T18:03:26.420645", + "exception": false, + "start_time": "2024-01-05T18:03:26.414369", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Combine" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "6faf9276-9683-413c-8393-4e689da262f0", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.433903Z", + "iopub.status.busy": "2024-01-05T18:03:26.433784Z", + "iopub.status.idle": "2024-01-05T18:03:26.439299Z", + "shell.execute_reply": "2024-01-05T18:03:26.438904Z" + }, + "papermill": { + "duration": 0.013049, + "end_time": "2024-01-05T18:03:26.440072", + "exception": false, + "start_time": "2024-01-05T18:03:26.427023", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "df = res_all.join(res_pval_all, how=\"inner\").rename_axis(\"tissue\").reset_index()\n", + "assert df.shape[0] == res_all.shape[0]\n", + "assert df.shape[0] == res_pval_all.shape[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "1cd14e69-5adb-48d6-98a1-9b7567a24ea3", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.453711Z", + "iopub.status.busy": "2024-01-05T18:03:26.453386Z", + "iopub.status.idle": "2024-01-05T18:03:26.458014Z", + "shell.execute_reply": "2024-01-05T18:03:26.457622Z" + }, + 
"papermill": { + "duration": 0.012287, + "end_time": "2024-01-05T18:03:26.458780", + "exception": false, + "start_time": "2024-01-05T18:03:26.446493", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "df.insert(0, \"gene0_id\", gene0_id)\n", + "df.insert(1, \"gene1_id\", gene1_id)\n", + "df.insert(2, \"gene0_symbol\", gene0_symbol)\n", + "df.insert(3, \"gene1_symbol\", gene1_symbol)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "598c8dfd-da7a-4d9d-9a8c-02244c9f8f1a", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.472413Z", + "iopub.status.busy": "2024-01-05T18:03:26.471994Z", + "iopub.status.idle": "2024-01-05T18:03:26.475346Z", + "shell.execute_reply": "2024-01-05T18:03:26.474976Z" + }, + "papermill": { + "duration": 0.01092, + "end_time": "2024-01-05T18:03:26.476114", + "exception": false, + "start_time": "2024-01-05T18:03:26.465194", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(50, 11)" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "4c4e4515-1f2a-4f15-ac0d-b292b17a9115", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.489694Z", + "iopub.status.busy": "2024-01-05T18:03:26.489548Z", + "iopub.status.idle": "2024-01-05T18:03:26.501596Z", + "shell.execute_reply": "2024-01-05T18:03:26.501100Z" + }, + "papermill": { + "duration": 0.019818, + "end_time": "2024-01-05T18:03:26.502420", + "exception": false, + "start_time": "2024-01-05T18:03:26.482602", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gene0_idgene1_idgene0_symbolgene1_symboltissueccc_coefpearson_coefspearman_coefccc_pvaluepearson_pvaluespearman_pvalue
0ENSG00000147050.14ENSG00000183878.15KDM6AUTYcolon_transverse0.336727-0.517899-0.4083439.999990e-073.063714e-299.539164e-18
1ENSG00000147050.14ENSG00000183878.15KDM6AUTYbrain_amygdala0.2805240.0375410.1475719.999990e-076.461089e-016.963023e-02
2ENSG00000147050.14ENSG00000183878.15KDM6AUTYartery_coronary0.274554-0.413862-0.3917649.999990e-072.389970e-113.159321e-10
3ENSG00000147050.14ENSG00000183878.15KDM6AUTYartery_aorta0.429771-0.485788-0.3635109.999990e-075.775754e-276.092383e-15
4ENSG00000147050.14ENSG00000183878.15KDM6AUTYadrenal_gland0.260197-0.459190-0.3541909.999990e-077.334489e-154.847677e-09
\n", + "
" + ], + "text/plain": [ + " gene0_id gene1_id gene0_symbol gene1_symbol \\\n", + "0 ENSG00000147050.14 ENSG00000183878.15 KDM6A UTY \n", + "1 ENSG00000147050.14 ENSG00000183878.15 KDM6A UTY \n", + "2 ENSG00000147050.14 ENSG00000183878.15 KDM6A UTY \n", + "3 ENSG00000147050.14 ENSG00000183878.15 KDM6A UTY \n", + "4 ENSG00000147050.14 ENSG00000183878.15 KDM6A UTY \n", + "\n", + " tissue ccc_coef pearson_coef spearman_coef ccc_pvalue \\\n", + "0 colon_transverse 0.336727 -0.517899 -0.408343 9.999990e-07 \n", + "1 brain_amygdala 0.280524 0.037541 0.147571 9.999990e-07 \n", + "2 artery_coronary 0.274554 -0.413862 -0.391764 9.999990e-07 \n", + "3 artery_aorta 0.429771 -0.485788 -0.363510 9.999990e-07 \n", + "4 adrenal_gland 0.260197 -0.459190 -0.354190 9.999990e-07 \n", + "\n", + " pearson_pvalue spearman_pvalue \n", + "0 3.063714e-29 9.539164e-18 \n", + "1 6.461089e-01 6.963023e-02 \n", + "2 2.389970e-11 3.159321e-10 \n", + "3 5.775754e-27 6.092383e-15 \n", + "4 7.334489e-15 4.847677e-09 " + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "4abcf3da-e636-4f0f-9f65-7ed5d1b96fff", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.516267Z", + "iopub.status.busy": "2024-01-05T18:03:26.516109Z", + "iopub.status.idle": "2024-01-05T18:03:26.518879Z", + "shell.execute_reply": "2024-01-05T18:03:26.518376Z" + }, + "papermill": { + "duration": 0.010665, + "end_time": "2024-01-05T18:03:26.519705", + "exception": false, + "start_time": "2024-01-05T18:03:26.509040", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "df_list.append(df)" + ] + }, + { + "cell_type": "markdown", + "id": "89e7bc77-29a4-462f-8621-438079de42a6", + "metadata": { + "papermill": { + "duration": 0.00651, + "end_time": "2024-01-05T18:03:26.532866", + "exception": false, + "start_time": "2024-01-05T18:03:26.526356", 
+ "status": "completed" + }, + "tags": [] + }, + "source": [ + "# KDM6A - DDX3Y" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "2d44475c-19cc-475f-86bd-0830b8ae2384", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.546966Z", + "iopub.status.busy": "2024-01-05T18:03:26.546566Z", + "iopub.status.idle": "2024-01-05T18:03:26.549884Z", + "shell.execute_reply": "2024-01-05T18:03:26.549382Z" + }, + "papermill": { + "duration": 0.011236, + "end_time": "2024-01-05T18:03:26.550691", + "exception": false, + "start_time": "2024-01-05T18:03:26.539455", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "gene0_id, gene1_id = \"ENSG00000147050.14\", \"ENSG00000067048.16\"\n", + "gene0_symbol, gene1_symbol = \"KDM6A\", \"DDX3Y\"\n", + "\n", + "assert gene_map[gene0_id] == gene0_symbol\n", + "assert gene_map[gene1_id] == gene1_symbol" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "8ec7ce72-8e84-4c3d-a3be-22fa4b9498fc", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.564774Z", + "iopub.status.busy": "2024-01-05T18:03:26.564397Z", + "iopub.status.idle": "2024-01-05T18:03:26.568343Z", + "shell.execute_reply": "2024-01-05T18:03:26.567846Z" + }, + "papermill": { + "duration": 0.011848, + "end_time": "2024-01-05T18:03:26.569175", + "exception": false, + "start_time": "2024-01-05T18:03:26.557327", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "PosixPath('/opt/data/results/gtex_v8/other_tissues/kdm6a_vs_ddx3y')" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "GENE_PAIR_INPUT_DIR = INPUT_DIR / f\"{gene0_symbol.lower()}_vs_{gene1_symbol.lower()}\"\n", + "display(GENE_PAIR_INPUT_DIR)" + ] + }, + { + "cell_type": "markdown", + "id": "80011d74-4e2e-488e-a77d-29f7458dedf1", + "metadata": { + "papermill": { + "duration": 0.006649, + "end_time": "2024-01-05T18:03:26.582553", 
+ "exception": false, + "start_time": "2024-01-05T18:03:26.575904", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Correlation values" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "9846f56e-1b62-43fa-9dfe-38638574a3d7", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.596940Z", + "iopub.status.busy": "2024-01-05T18:03:26.596547Z", + "iopub.status.idle": "2024-01-05T18:03:26.600610Z", + "shell.execute_reply": "2024-01-05T18:03:26.600105Z" + }, + "papermill": { + "duration": 0.012115, + "end_time": "2024-01-05T18:03:26.601450", + "exception": false, + "start_time": "2024-01-05T18:03:26.589335", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "res_all = pd.read_pickle(GENE_PAIR_INPUT_DIR / \"coef_values.pkl\").rename(\n", + " columns={\n", + " \"cm\": \"ccc_coef\",\n", + " \"pearson\": \"pearson_coef\",\n", + " \"spearman\": \"spearman_coef\",\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "0778f0f4-a292-4f48-811d-17813cbde3bf", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.615758Z", + "iopub.status.busy": "2024-01-05T18:03:26.615431Z", + "iopub.status.idle": "2024-01-05T18:03:26.619215Z", + "shell.execute_reply": "2024-01-05T18:03:26.618721Z" + }, + "papermill": { + "duration": 0.011815, + "end_time": "2024-01-05T18:03:26.620038", + "exception": false, + "start_time": "2024-01-05T18:03:26.608223", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(50, 3)" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "res_all.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "85b140ad-e4c8-41ba-8628-e7cd9d768228", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.634600Z", + "iopub.status.busy": "2024-01-05T18:03:26.634188Z", + 
"iopub.status.idle": "2024-01-05T18:03:26.641032Z", + "shell.execute_reply": "2024-01-05T18:03:26.640645Z" + }, + "papermill": { + "duration": 0.014937, + "end_time": "2024-01-05T18:03:26.641804", + "exception": false, + "start_time": "2024-01-05T18:03:26.626867", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ccc_coefpearson_coefspearman_coef
colon_transverse0.280814-0.393881-0.465898
brain_amygdala0.3056770.0387320.154331
artery_coronary0.242720-0.487610-0.426164
artery_aorta0.381970-0.579236-0.409761
adrenal_gland0.188929-0.489400-0.418784
\n", + "
" + ], + "text/plain": [ + " ccc_coef pearson_coef spearman_coef\n", + "colon_transverse 0.280814 -0.393881 -0.465898\n", + "brain_amygdala 0.305677 0.038732 0.154331\n", + "artery_coronary 0.242720 -0.487610 -0.426164\n", + "artery_aorta 0.381970 -0.579236 -0.409761\n", + "adrenal_gland 0.188929 -0.489400 -0.418784" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "res_all.head()" + ] + }, + { + "cell_type": "markdown", + "id": "2800d481-53ad-4706-9a5c-a28d5a1a0b70", + "metadata": { + "papermill": { + "duration": 0.006796, + "end_time": "2024-01-05T18:03:26.655533", + "exception": false, + "start_time": "2024-01-05T18:03:26.648737", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## P-values" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "6478e953-b946-4eac-9101-237a18f1d6c6", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.670427Z", + "iopub.status.busy": "2024-01-05T18:03:26.670068Z", + "iopub.status.idle": "2024-01-05T18:03:26.674128Z", + "shell.execute_reply": "2024-01-05T18:03:26.673730Z" + }, + "papermill": { + "duration": 0.01242, + "end_time": "2024-01-05T18:03:26.674897", + "exception": false, + "start_time": "2024-01-05T18:03:26.662477", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "res_pval_all = pd.read_pickle(GENE_PAIR_INPUT_DIR / \"coef_pvalues.pkl\").rename(\n", + " columns={\n", + " \"cm\": \"ccc_pvalue\",\n", + " \"pearson\": \"pearson_pvalue\",\n", + " \"spearman\": \"spearman_pvalue\",\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "16f10b8e-9233-4f68-ad90-76accc6dc760", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.689596Z", + "iopub.status.busy": "2024-01-05T18:03:26.689276Z", + "iopub.status.idle": "2024-01-05T18:03:26.692706Z", + "shell.execute_reply": "2024-01-05T18:03:26.692311Z" + }, + "papermill": { 
+ "duration": 0.011632, + "end_time": "2024-01-05T18:03:26.693485", + "exception": false, + "start_time": "2024-01-05T18:03:26.681853", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(50, 3)" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "res_pval_all.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "4226d1a6-94fa-463a-8d4a-539cdf667028", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.708163Z", + "iopub.status.busy": "2024-01-05T18:03:26.707850Z", + "iopub.status.idle": "2024-01-05T18:03:26.715232Z", + "shell.execute_reply": "2024-01-05T18:03:26.714850Z" + }, + "papermill": { + "duration": 0.015561, + "end_time": "2024-01-05T18:03:26.716006", + "exception": false, + "start_time": "2024-01-05T18:03:26.700445", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ccc_pvaluepearson_pvaluespearman_pvalue
colon_transverse9.999990e-071.613504e-162.880714e-23
brain_amygdala9.999990e-076.356755e-015.764275e-02
artery_coronary9.999990e-079.731623e-165.220895e-12
artery_aorta9.999990e-074.513966e-406.380372e-19
adrenal_gland9.999990e-076.058615e-172.230048e-12
\n", + "
" + ], + "text/plain": [ + " ccc_pvalue pearson_pvalue spearman_pvalue\n", + "colon_transverse 9.999990e-07 1.613504e-16 2.880714e-23\n", + "brain_amygdala 9.999990e-07 6.356755e-01 5.764275e-02\n", + "artery_coronary 9.999990e-07 9.731623e-16 5.220895e-12\n", + "artery_aorta 9.999990e-07 4.513966e-40 6.380372e-19\n", + "adrenal_gland 9.999990e-07 6.058615e-17 2.230048e-12" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "res_pval_all.head()" + ] + }, + { + "cell_type": "markdown", + "id": "696a2c33-09cc-4114-9457-56491a120d78", + "metadata": { + "papermill": { + "duration": 0.007042, + "end_time": "2024-01-05T18:03:26.730163", + "exception": false, + "start_time": "2024-01-05T18:03:26.723121", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Combine" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "082566e1-c1c7-41a4-b7ce-48c8023cfe91", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.744980Z", + "iopub.status.busy": "2024-01-05T18:03:26.744832Z", + "iopub.status.idle": "2024-01-05T18:03:26.749668Z", + "shell.execute_reply": "2024-01-05T18:03:26.749264Z" + }, + "papermill": { + "duration": 0.013207, + "end_time": "2024-01-05T18:03:26.750429", + "exception": false, + "start_time": "2024-01-05T18:03:26.737222", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "df = res_all.join(res_pval_all, how=\"inner\").rename_axis(\"tissue\").reset_index()\n", + "assert df.shape[0] == res_all.shape[0]\n", + "assert df.shape[0] == res_pval_all.shape[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "f375b8ab-e1c7-46d4-9ee8-eea4a4786af4", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.765567Z", + "iopub.status.busy": "2024-01-05T18:03:26.765246Z", + "iopub.status.idle": "2024-01-05T18:03:26.769822Z", + "shell.execute_reply": "2024-01-05T18:03:26.769416Z" + }, + 
"papermill": { + "duration": 0.012933, + "end_time": "2024-01-05T18:03:26.770586", + "exception": false, + "start_time": "2024-01-05T18:03:26.757653", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "df.insert(0, \"gene0_id\", gene0_id)\n", + "df.insert(1, \"gene1_id\", gene1_id)\n", + "df.insert(2, \"gene0_symbol\", gene0_symbol)\n", + "df.insert(3, \"gene1_symbol\", gene1_symbol)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "2cbc5f21-af66-4665-acb2-c7f18a484d01", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.785491Z", + "iopub.status.busy": "2024-01-05T18:03:26.785344Z", + "iopub.status.idle": "2024-01-05T18:03:26.788707Z", + "shell.execute_reply": "2024-01-05T18:03:26.788310Z" + }, + "papermill": { + "duration": 0.011692, + "end_time": "2024-01-05T18:03:26.789488", + "exception": false, + "start_time": "2024-01-05T18:03:26.777796", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(50, 11)" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "e478f0df-9fd6-469a-a4cf-1c379f708d27", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.804743Z", + "iopub.status.busy": "2024-01-05T18:03:26.804418Z", + "iopub.status.idle": "2024-01-05T18:03:26.816204Z", + "shell.execute_reply": "2024-01-05T18:03:26.815816Z" + }, + "papermill": { + "duration": 0.020198, + "end_time": "2024-01-05T18:03:26.816990", + "exception": false, + "start_time": "2024-01-05T18:03:26.796792", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gene0_idgene1_idgene0_symbolgene1_symboltissueccc_coefpearson_coefspearman_coefccc_pvaluepearson_pvaluespearman_pvalue
0ENSG00000147050.14ENSG00000067048.16KDM6ADDX3Ycolon_transverse0.280814-0.393881-0.4658989.999990e-071.613504e-162.880714e-23
1ENSG00000147050.14ENSG00000067048.16KDM6ADDX3Ybrain_amygdala0.3056770.0387320.1543319.999990e-076.356755e-015.764275e-02
2ENSG00000147050.14ENSG00000067048.16KDM6ADDX3Yartery_coronary0.242720-0.487610-0.4261649.999990e-079.731623e-165.220895e-12
3ENSG00000147050.14ENSG00000067048.16KDM6ADDX3Yartery_aorta0.381970-0.579236-0.4097619.999990e-074.513966e-406.380372e-19
4ENSG00000147050.14ENSG00000067048.16KDM6ADDX3Yadrenal_gland0.188929-0.489400-0.4187849.999990e-076.058615e-172.230048e-12
\n", + "
" + ], + "text/plain": [ + " gene0_id gene1_id gene0_symbol gene1_symbol \\\n", + "0 ENSG00000147050.14 ENSG00000067048.16 KDM6A DDX3Y \n", + "1 ENSG00000147050.14 ENSG00000067048.16 KDM6A DDX3Y \n", + "2 ENSG00000147050.14 ENSG00000067048.16 KDM6A DDX3Y \n", + "3 ENSG00000147050.14 ENSG00000067048.16 KDM6A DDX3Y \n", + "4 ENSG00000147050.14 ENSG00000067048.16 KDM6A DDX3Y \n", + "\n", + " tissue ccc_coef pearson_coef spearman_coef ccc_pvalue \\\n", + "0 colon_transverse 0.280814 -0.393881 -0.465898 9.999990e-07 \n", + "1 brain_amygdala 0.305677 0.038732 0.154331 9.999990e-07 \n", + "2 artery_coronary 0.242720 -0.487610 -0.426164 9.999990e-07 \n", + "3 artery_aorta 0.381970 -0.579236 -0.409761 9.999990e-07 \n", + "4 adrenal_gland 0.188929 -0.489400 -0.418784 9.999990e-07 \n", + "\n", + " pearson_pvalue spearman_pvalue \n", + "0 1.613504e-16 2.880714e-23 \n", + "1 6.356755e-01 5.764275e-02 \n", + "2 9.731623e-16 5.220895e-12 \n", + "3 4.513966e-40 6.380372e-19 \n", + "4 6.058615e-17 2.230048e-12 " + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "d22d7b22-c73e-47eb-84d9-408292753242", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.832452Z", + "iopub.status.busy": "2024-01-05T18:03:26.832158Z", + "iopub.status.idle": "2024-01-05T18:03:26.834760Z", + "shell.execute_reply": "2024-01-05T18:03:26.834328Z" + }, + "papermill": { + "duration": 0.011232, + "end_time": "2024-01-05T18:03:26.835539", + "exception": false, + "start_time": "2024-01-05T18:03:26.824307", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "df_list.append(df)" + ] + }, + { + "cell_type": "markdown", + "id": "fa148b71-a756-4521-b935-74527b4ad649", + "metadata": { + "papermill": { + "duration": 0.007239, + "end_time": "2024-01-05T18:03:26.850166", + "exception": false, + "start_time": 
"2024-01-05T18:03:26.842927", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Combine" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "48e3dd9d-0f45-4480-a1bc-b6f829048ce8", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.865719Z", + "iopub.status.busy": "2024-01-05T18:03:26.865387Z", + "iopub.status.idle": "2024-01-05T18:03:26.869247Z", + "shell.execute_reply": "2024-01-05T18:03:26.868799Z" + }, + "papermill": { + "duration": 0.012543, + "end_time": "2024-01-05T18:03:26.870041", + "exception": false, + "start_time": "2024-01-05T18:03:26.857498", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "df_final = pd.concat(df_list, ignore_index=True, axis=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "d1299505-603f-4f67-99b9-e77911878253", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.885614Z", + "iopub.status.busy": "2024-01-05T18:03:26.885287Z", + "iopub.status.idle": "2024-01-05T18:03:26.889540Z", + "shell.execute_reply": "2024-01-05T18:03:26.889128Z" + }, + "papermill": { + "duration": 0.012858, + "end_time": "2024-01-05T18:03:26.890318", + "exception": false, + "start_time": "2024-01-05T18:03:26.877460", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(100, 11)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "assert df_final.shape[0] == sum(d.shape[0] for d in df_list)\n", + "for d in df_list:\n", + " assert df_final.shape[1] == d.shape[1]\n", + "display(df_final.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "c257fb94-a76d-4739-b5a1-a92be639be14", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.906082Z", + "iopub.status.busy": "2024-01-05T18:03:26.905791Z", + "iopub.status.idle": "2024-01-05T18:03:26.921947Z", + "shell.execute_reply": 
"2024-01-05T18:03:26.921538Z" + }, + "papermill": { + "duration": 0.024883, + "end_time": "2024-01-05T18:03:26.922717", + "exception": false, + "start_time": "2024-01-05T18:03:26.897834", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gene0_idgene1_idgene0_symbolgene1_symboltissueccc_coefpearson_coefspearman_coefccc_pvaluepearson_pvaluespearman_pvalue
0ENSG00000147050.14ENSG00000183878.15KDM6AUTYcolon_transverse0.336727-0.517899-0.4083439.999990e-073.063714e-299.539164e-18
1ENSG00000147050.14ENSG00000183878.15KDM6AUTYbrain_amygdala0.2805240.0375410.1475719.999990e-076.461089e-016.963023e-02
2ENSG00000147050.14ENSG00000183878.15KDM6AUTYartery_coronary0.274554-0.413862-0.3917649.999990e-072.389970e-113.159321e-10
3ENSG00000147050.14ENSG00000183878.15KDM6AUTYartery_aorta0.429771-0.485788-0.3635109.999990e-075.775754e-276.092383e-15
4ENSG00000147050.14ENSG00000183878.15KDM6AUTYadrenal_gland0.260197-0.459190-0.3541909.999990e-077.334489e-154.847677e-09
....................................
95ENSG00000147050.14ENSG00000067048.16KDM6ADDX3Yartery_tibial0.298440-0.617718-0.3877659.999990e-075.248493e-713.246061e-25
96ENSG00000147050.14ENSG00000067048.16KDM6ADDX3Ybrain_hypothalamus0.2326320.1183910.2511499.999990e-079.332407e-023.117929e-04
97ENSG00000147050.14ENSG00000067048.16KDM6ADDX3Ylung0.289771-0.252442-0.2244709.999990e-077.462864e-104.905714e-08
98ENSG00000147050.14ENSG00000067048.16KDM6ADDX3Ybrain_cerebellum0.219113-0.1064690.0349029.999990e-079.916004e-025.897648e-01
99ENSG00000147050.14ENSG00000067048.16KDM6ADDX3Ybrain_hippocampus0.2185830.1929530.2970049.999990e-076.596327e-032.253662e-05
\n", + "

100 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " gene0_id gene1_id gene0_symbol gene1_symbol \\\n", + "0 ENSG00000147050.14 ENSG00000183878.15 KDM6A UTY \n", + "1 ENSG00000147050.14 ENSG00000183878.15 KDM6A UTY \n", + "2 ENSG00000147050.14 ENSG00000183878.15 KDM6A UTY \n", + "3 ENSG00000147050.14 ENSG00000183878.15 KDM6A UTY \n", + "4 ENSG00000147050.14 ENSG00000183878.15 KDM6A UTY \n", + ".. ... ... ... ... \n", + "95 ENSG00000147050.14 ENSG00000067048.16 KDM6A DDX3Y \n", + "96 ENSG00000147050.14 ENSG00000067048.16 KDM6A DDX3Y \n", + "97 ENSG00000147050.14 ENSG00000067048.16 KDM6A DDX3Y \n", + "98 ENSG00000147050.14 ENSG00000067048.16 KDM6A DDX3Y \n", + "99 ENSG00000147050.14 ENSG00000067048.16 KDM6A DDX3Y \n", + "\n", + " tissue ccc_coef pearson_coef spearman_coef ccc_pvalue \\\n", + "0 colon_transverse 0.336727 -0.517899 -0.408343 9.999990e-07 \n", + "1 brain_amygdala 0.280524 0.037541 0.147571 9.999990e-07 \n", + "2 artery_coronary 0.274554 -0.413862 -0.391764 9.999990e-07 \n", + "3 artery_aorta 0.429771 -0.485788 -0.363510 9.999990e-07 \n", + "4 adrenal_gland 0.260197 -0.459190 -0.354190 9.999990e-07 \n", + ".. ... ... ... ... ... \n", + "95 artery_tibial 0.298440 -0.617718 -0.387765 9.999990e-07 \n", + "96 brain_hypothalamus 0.232632 0.118391 0.251149 9.999990e-07 \n", + "97 lung 0.289771 -0.252442 -0.224470 9.999990e-07 \n", + "98 brain_cerebellum 0.219113 -0.106469 0.034902 9.999990e-07 \n", + "99 brain_hippocampus 0.218583 0.192953 0.297004 9.999990e-07 \n", + "\n", + " pearson_pvalue spearman_pvalue \n", + "0 3.063714e-29 9.539164e-18 \n", + "1 6.461089e-01 6.963023e-02 \n", + "2 2.389970e-11 3.159321e-10 \n", + "3 5.775754e-27 6.092383e-15 \n", + "4 7.334489e-15 4.847677e-09 \n", + ".. ... ... 
\n", + "95 5.248493e-71 3.246061e-25 \n", + "96 9.332407e-02 3.117929e-04 \n", + "97 7.462864e-10 4.905714e-08 \n", + "98 9.916004e-02 5.897648e-01 \n", + "99 6.596327e-03 2.253662e-05 \n", + "\n", + "[100 rows x 11 columns]" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_final" + ] + }, + { + "cell_type": "markdown", + "id": "74b046a0-52ed-4134-a572-8aa8dc540052", + "metadata": { + "papermill": { + "duration": 0.007531, + "end_time": "2024-01-05T18:03:26.937944", + "exception": false, + "start_time": "2024-01-05T18:03:26.930413", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Save" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "dc6d6873-cc91-4fb9-b4d6-9b9607f0100e", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.954259Z", + "iopub.status.busy": "2024-01-05T18:03:26.953833Z", + "iopub.status.idle": "2024-01-05T18:03:26.956359Z", + "shell.execute_reply": "2024-01-05T18:03:26.955974Z" + }, + "papermill": { + "duration": 0.011526, + "end_time": "2024-01-05T18:03:26.957179", + "exception": false, + "start_time": "2024-01-05T18:03:26.945653", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "data = df_final" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "0b24557a-e3e9-4527-8551-56b7dee86acc", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.973456Z", + "iopub.status.busy": "2024-01-05T18:03:26.973142Z", + "iopub.status.idle": "2024-01-05T18:03:26.977510Z", + "shell.execute_reply": "2024-01-05T18:03:26.977134Z" + }, + "papermill": { + "duration": 0.013278, + "end_time": "2024-01-05T18:03:26.978284", + "exception": false, + "start_time": "2024-01-05T18:03:26.965006", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "dtype('int64')" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + 
"data": { + "text/plain": [ + "RangeIndex(start=0, stop=100, step=1)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(data.index.dtype)\n", + "display(data.index)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "9773177e-a637-41cb-b66b-42a4ffd15a33", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:26.994753Z", + "iopub.status.busy": "2024-01-05T18:03:26.994434Z", + "iopub.status.idle": "2024-01-05T18:03:26.997355Z", + "shell.execute_reply": "2024-01-05T18:03:26.996935Z" + }, + "papermill": { + "duration": 0.012005, + "end_time": "2024-01-05T18:03:26.998150", + "exception": false, + "start_time": "2024-01-05T18:03:26.986145", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# reset index to avoid problems with MultiIndex in Pandas\n", + "if isinstance(data.index, pd.MultiIndex):\n", + " display(\"MultiIndex\")\n", + " data = data.reset_index()" + ] + }, + { + "cell_type": "markdown", + "id": "e5d04700-54cb-40c5-be15-39bc76eafd2f", + "metadata": { + "papermill": { + "duration": 0.007745, + "end_time": "2024-01-05T18:03:27.013744", + "exception": false, + "start_time": "2024-01-05T18:03:27.005999", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Pickle" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "cda6751b-a78d-4164-b678-cd329a895011", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:27.030036Z", + "iopub.status.busy": "2024-01-05T18:03:27.029886Z", + "iopub.status.idle": "2024-01-05T18:03:27.033470Z", + "shell.execute_reply": "2024-01-05T18:03:27.033074Z" + }, + "papermill": { + "duration": 0.012664, + "end_time": "2024-01-05T18:03:27.034245", + "exception": false, + "start_time": "2024-01-05T18:03:27.021581", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "data.to_pickle(OUTPUT_DIR / f\"{OUTPUT_FILENAME}.pkl.gz\")" + ] + }, + { + 
"cell_type": "markdown", + "id": "025e09a9-bd4f-4bc5-a69d-248d7ea8aacb", + "metadata": { + "papermill": { + "duration": 0.00784, + "end_time": "2024-01-05T18:03:27.050000", + "exception": false, + "start_time": "2024-01-05T18:03:27.042160", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## RDS" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "c850c34d-ccf3-49ef-b737-0cf040ca77fb", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:27.066436Z", + "iopub.status.busy": "2024-01-05T18:03:27.066146Z", + "iopub.status.idle": "2024-01-05T18:03:27.069649Z", + "shell.execute_reply": "2024-01-05T18:03:27.069273Z" + }, + "papermill": { + "duration": 0.012612, + "end_time": "2024-01-05T18:03:27.070415", + "exception": false, + "start_time": "2024-01-05T18:03:27.057803", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "PosixPath('/opt/data/supplementary_material/Supplementary_File_03-Gene_pairs_correlations_all_GTEx_tissues.rds')" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "output_file = OUTPUT_DIR / f\"{OUTPUT_FILENAME}.rds\"\n", + "display(output_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "33c565ae-2710-4816-91d1-4376ee36a8a3", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:27.086800Z", + "iopub.status.busy": "2024-01-05T18:03:27.086653Z", + "iopub.status.idle": "2024-01-05T18:03:27.100502Z", + "shell.execute_reply": "2024-01-05T18:03:27.100060Z" + }, + "papermill": { + "duration": 0.023, + "end_time": "2024-01-05T18:03:27.101293", + "exception": false, + "start_time": "2024-01-05T18:03:27.078293", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "with localconverter(ro.default_converter + pandas2ri.converter):\n", + " data_r = ro.conversion.py2rpy(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": 
"91cfc710-343a-410d-9782-91eb5ca4623e", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:27.117802Z", + "iopub.status.busy": "2024-01-05T18:03:27.117649Z", + "iopub.status.idle": "2024-01-05T18:03:27.127159Z", + "shell.execute_reply": "2024-01-05T18:03:27.126739Z" + }, + "papermill": { + "duration": 0.018722, + "end_time": "2024-01-05T18:03:27.127926", + "exception": false, + "start_time": "2024-01-05T18:03:27.109204", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " R/rpy2 DataFrame (100 x 11)\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gene0_idgene1_idgene0_symbol...ccc_pvaluepearson_pvaluespearman_pvalue
\n", + " 'ENSG0000...\n", + " \n", + " 'ENSG0000...\n", + " \n", + " 'KDM6A'\n", + " \n", + " ...\n", + " \n", + " 0.000001\n", + " \n", + " 0.000000\n", + " \n", + " 0.000000\n", + "
\n", + " 'ENSG0000...\n", + " \n", + " 'ENSG0000...\n", + " \n", + " 'KDM6A'\n", + " \n", + " \n", + " \n", + " 0.000001\n", + " \n", + " 0.646109\n", + " \n", + " 0.069630\n", + "
\n", + " 'ENSG0000...\n", + " \n", + " 'ENSG0000...\n", + " \n", + " 'KDM6A'\n", + " \n", + " \n", + " \n", + " 0.000001\n", + " \n", + " 0.000000\n", + " \n", + " 0.000000\n", + "
\n", + " 'ENSG0000...\n", + " \n", + " 'ENSG0000...\n", + " \n", + " 'KDM6A'\n", + " \n", + " \n", + " \n", + " 0.000001\n", + " \n", + " 0.000000\n", + " \n", + " 0.000000\n", + "
\n", + " ...\n", + " \n", + " ...\n", + " \n", + " ...\n", + " \n", + " \n", + " \n", + " ...\n", + " \n", + " ...\n", + " \n", + " ...\n", + "
\n", + " 'ENSG0000...\n", + " \n", + " 'ENSG0000...\n", + " \n", + " 'KDM6A'\n", + " \n", + " \n", + " \n", + " 0.000001\n", + " \n", + " 0.093324\n", + " \n", + " 0.000312\n", + "
\n", + " 'ENSG0000...\n", + " \n", + " 'ENSG0000...\n", + " \n", + " 'KDM6A'\n", + " \n", + " \n", + " \n", + " 0.000001\n", + " \n", + " 0.000000\n", + " \n", + " 0.000000\n", + "
\n", + " 'ENSG0000...\n", + " \n", + " 'ENSG0000...\n", + " \n", + " 'KDM6A'\n", + " \n", + " \n", + " \n", + " 0.000001\n", + " \n", + " 0.099160\n", + " \n", + " 0.589765\n", + "
\n", + " 'ENSG0000...\n", + " \n", + " 'ENSG0000...\n", + " \n", + " 'KDM6A'\n", + " \n", + " \n", + " \n", + " 0.000001\n", + " \n", + " 0.006596\n", + " \n", + " 0.000023\n", + "
\n", + " " + ], + "text/plain": [ + " [RTYPES.VECSXP]\n", + "R classes: ('data.frame',)\n", + "[StrSexpVe..., StrSexpVe..., StrSexpVe..., StrSexpVe..., ..., FloatSexp..., FloatSexp..., FloatSexp..., FloatSexp...]\n", + " gene0_id: \n", + " [RTYPES.STRSXP]\n", + "R classes: ('character',)\n", + "['ENSG0000..., 'ENSG0000..., 'ENSG0000..., 'ENSG0000..., ..., 'ENSG0000..., 'ENSG0000..., 'ENSG0000..., 'ENSG0000...]\n", + " gene1_id: \n", + " [RTYPES.STRSXP]\n", + "R classes: ('character',)\n", + "['ENSG0000..., 'ENSG0000..., 'ENSG0000..., 'ENSG0000..., ..., 'ENSG0000..., 'ENSG0000..., 'ENSG0000..., 'ENSG0000...]\n", + " gene0_symbol: \n", + " [RTYPES.STRSXP]\n", + "R classes: ('character',)\n", + "['KDM6A', 'KDM6A', 'KDM6A', 'KDM6A', ..., 'KDM6A', 'KDM6A', 'KDM6A', 'KDM6A']\n", + " gene1_symbol: \n", + " [RTYPES.STRSXP]\n", + "R classes: ('character',)\n", + "['UTY', 'UTY', 'UTY', 'UTY', ..., 'DDX3Y', 'DDX3Y', 'DDX3Y', 'DDX3Y']\n", + "...\n", + " ccc_coef: \n", + " [RTYPES.REALSXP]\n", + "R classes: ('numeric',)\n", + "[-0.408343, 0.147571, -0.391764, -0.363510, ..., 0.251149, -0.224470, 0.034902, 0.297004]\n", + " pearson_coef: \n", + " [RTYPES.REALSXP]\n", + "R classes: ('numeric',)\n", + "[0.000001, 0.000001, 0.000001, 0.000001, ..., 0.000001, 0.000001, 0.000001, 0.000001]\n", + " spearman_coef: \n", + " [RTYPES.REALSXP]\n", + "R classes: ('numeric',)\n", + "[0.000000, 0.646109, 0.000000, 0.000000, ..., 0.093324, 0.000000, 0.099160, 0.006596]\n", + " ccc_pvalue: \n", + " [RTYPES.REALSXP]\n", + "R classes: ('numeric',)\n", + "[0.000000, 0.069630, 0.000000, 0.000000, ..., 0.000312, 0.000000, 0.589765, 0.000023]" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_r" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "a41d10b1-8cc3-4043-a8e5-9aa8b2bdd8b0", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:27.145238Z", + "iopub.status.busy": 
"2024-01-05T18:03:27.144949Z", + "iopub.status.idle": "2024-01-05T18:03:27.149680Z", + "shell.execute_reply": "2024-01-05T18:03:27.149298Z" + }, + "papermill": { + "duration": 0.01432, + "end_time": "2024-01-05T18:03:27.150456", + "exception": false, + "start_time": "2024-01-05T18:03:27.136136", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + " [RTYPES.NILSXP]" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "saveRDS(data_r, str(output_file))" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "18a5dfc2-97dd-4460-9ff0-0b924556d804", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:27.168002Z", + "iopub.status.busy": "2024-01-05T18:03:27.167621Z", + "iopub.status.idle": "2024-01-05T18:03:27.170759Z", + "shell.execute_reply": "2024-01-05T18:03:27.170379Z" + }, + "papermill": { + "duration": 0.012739, + "end_time": "2024-01-05T18:03:27.171524", + "exception": false, + "start_time": "2024-01-05T18:03:27.158785", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# testing: load the rds file again\n", + "data_r = readRDS(str(output_file))" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "e70a4d0a-4439-47c6-b422-6ae58e0ea3da", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:27.189148Z", + "iopub.status.busy": "2024-01-05T18:03:27.188839Z", + "iopub.status.idle": "2024-01-05T18:03:27.197286Z", + "shell.execute_reply": "2024-01-05T18:03:27.196883Z" + }, + "papermill": { + "duration": 0.018104, + "end_time": "2024-01-05T18:03:27.198077", + "exception": false, + "start_time": "2024-01-05T18:03:27.179973", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "with localconverter(ro.default_converter + pandas2ri.converter):\n", + " data_again = ro.conversion.rpy2py(data_r)\n", + " data_again.index = 
data_again.index.astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "6e423577-e429-4ffb-b5d5-01968026c1f6", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:27.215628Z", + "iopub.status.busy": "2024-01-05T18:03:27.215479Z", + "iopub.status.idle": "2024-01-05T18:03:27.219038Z", + "shell.execute_reply": "2024-01-05T18:03:27.218658Z" + }, + "papermill": { + "duration": 0.013255, + "end_time": "2024-01-05T18:03:27.219800", + "exception": false, + "start_time": "2024-01-05T18:03:27.206545", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(100, 11)" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_again.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "2c7aa161-a499-4df3-aca0-2cbee5cbfa5d", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:27.237538Z", + "iopub.status.busy": "2024-01-05T18:03:27.237289Z", + "iopub.status.idle": "2024-01-05T18:03:27.248950Z", + "shell.execute_reply": "2024-01-05T18:03:27.248559Z" + }, + "papermill": { + "duration": 0.02144, + "end_time": "2024-01-05T18:03:27.249765", + "exception": false, + "start_time": "2024-01-05T18:03:27.228325", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gene0_idgene1_idgene0_symbolgene1_symboltissueccc_coefpearson_coefspearman_coefccc_pvaluepearson_pvaluespearman_pvalue
0ENSG00000147050.14ENSG00000183878.15KDM6AUTYcolon_transverse0.336727-0.517899-0.4083439.999990e-073.063714e-299.539164e-18
1ENSG00000147050.14ENSG00000183878.15KDM6AUTYbrain_amygdala0.2805240.0375410.1475719.999990e-076.461089e-016.963023e-02
2ENSG00000147050.14ENSG00000183878.15KDM6AUTYartery_coronary0.274554-0.413862-0.3917649.999990e-072.389970e-113.159321e-10
3ENSG00000147050.14ENSG00000183878.15KDM6AUTYartery_aorta0.429771-0.485788-0.3635109.999990e-075.775754e-276.092383e-15
4ENSG00000147050.14ENSG00000183878.15KDM6AUTYadrenal_gland0.260197-0.459190-0.3541909.999990e-077.334489e-154.847677e-09
\n", + "
" + ], + "text/plain": [ + " gene0_id gene1_id gene0_symbol gene1_symbol \\\n", + "0 ENSG00000147050.14 ENSG00000183878.15 KDM6A UTY \n", + "1 ENSG00000147050.14 ENSG00000183878.15 KDM6A UTY \n", + "2 ENSG00000147050.14 ENSG00000183878.15 KDM6A UTY \n", + "3 ENSG00000147050.14 ENSG00000183878.15 KDM6A UTY \n", + "4 ENSG00000147050.14 ENSG00000183878.15 KDM6A UTY \n", + "\n", + " tissue ccc_coef pearson_coef spearman_coef ccc_pvalue \\\n", + "0 colon_transverse 0.336727 -0.517899 -0.408343 9.999990e-07 \n", + "1 brain_amygdala 0.280524 0.037541 0.147571 9.999990e-07 \n", + "2 artery_coronary 0.274554 -0.413862 -0.391764 9.999990e-07 \n", + "3 artery_aorta 0.429771 -0.485788 -0.363510 9.999990e-07 \n", + "4 adrenal_gland 0.260197 -0.459190 -0.354190 9.999990e-07 \n", + "\n", + " pearson_pvalue spearman_pvalue \n", + "0 3.063714e-29 9.539164e-18 \n", + "1 6.461089e-01 6.963023e-02 \n", + "2 2.389970e-11 3.159321e-10 \n", + "3 5.775754e-27 6.092383e-15 \n", + "4 7.334489e-15 4.847677e-09 " + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_again.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "efa7326e-69d6-4fa4-b6eb-5fa0b502cbb2", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:27.267845Z", + "iopub.status.busy": "2024-01-05T18:03:27.267559Z", + "iopub.status.idle": "2024-01-05T18:03:27.275644Z", + "shell.execute_reply": "2024-01-05T18:03:27.275249Z" + }, + "papermill": { + "duration": 0.01797, + "end_time": "2024-01-05T18:03:27.276441", + "exception": false, + "start_time": "2024-01-05T18:03:27.258471", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# testing\n", + "pd.testing.assert_frame_equal(\n", + " data,\n", + " data_again,\n", + " check_dtype=False,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "50b97893-b49b-42a3-a798-3c94030ab0cf", + "metadata": { + "papermill": { + "duration": 0.008517, + 
"end_time": "2024-01-05T18:03:27.293584", + "exception": false, + "start_time": "2024-01-05T18:03:27.285067", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Text" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "d548aff3-b850-4050-a21d-6f00b78c280b", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:27.311739Z", + "iopub.status.busy": "2024-01-05T18:03:27.311376Z", + "iopub.status.idle": "2024-01-05T18:03:27.314954Z", + "shell.execute_reply": "2024-01-05T18:03:27.314583Z" + }, + "papermill": { + "duration": 0.0135, + "end_time": "2024-01-05T18:03:27.315737", + "exception": false, + "start_time": "2024-01-05T18:03:27.302237", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "PosixPath('/opt/data/supplementary_material/Supplementary_File_03-Gene_pairs_correlations_all_GTEx_tissues.tsv')" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# tsv format\n", + "output_file = OUTPUT_DIR / f\"{OUTPUT_FILENAME}.tsv\"\n", + "display(output_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "19a7cc7d-924b-49be-b11e-e782a601ba28", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:27.333879Z", + "iopub.status.busy": "2024-01-05T18:03:27.333557Z", + "iopub.status.idle": "2024-01-05T18:03:27.339776Z", + "shell.execute_reply": "2024-01-05T18:03:27.339388Z" + }, + "papermill": { + "duration": 0.016214, + "end_time": "2024-01-05T18:03:27.340636", + "exception": false, + "start_time": "2024-01-05T18:03:27.324422", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "data.to_csv(output_file, sep=\"\\t\", index=False, float_format=\"%.5e\")" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "84321ddb-8e26-4d5e-93e1-7e1fdfaefefb", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:27.359032Z", + "iopub.status.busy": 
"2024-01-05T18:03:27.358610Z", + "iopub.status.idle": "2024-01-05T18:03:27.363727Z", + "shell.execute_reply": "2024-01-05T18:03:27.363337Z" + }, + "papermill": { + "duration": 0.015122, + "end_time": "2024-01-05T18:03:27.364514", + "exception": false, + "start_time": "2024-01-05T18:03:27.349392", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# testing\n", + "data2 = data # .copy()\n", + "# data2.index = list(range(0, data2.shape[0]))\n", + "\n", + "data_again = pd.read_csv(output_file, sep=\"\\t\", index_col=None)\n", + "# data_again.index = data_again.index.map(lambda x: f\"{x:.2f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "388715f6-edd4-42a4-b1df-2aad3f580d0b", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:27.382743Z", + "iopub.status.busy": "2024-01-05T18:03:27.382452Z", + "iopub.status.idle": "2024-01-05T18:03:27.385813Z", + "shell.execute_reply": "2024-01-05T18:03:27.385438Z" + }, + "papermill": { + "duration": 0.013362, + "end_time": "2024-01-05T18:03:27.386608", + "exception": false, + "start_time": "2024-01-05T18:03:27.373246", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(100, 11)" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_again.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "42224f4a-523d-48f2-b78c-a131b35766bc", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:27.404941Z", + "iopub.status.busy": "2024-01-05T18:03:27.404797Z", + "iopub.status.idle": "2024-01-05T18:03:27.416217Z", + "shell.execute_reply": "2024-01-05T18:03:27.415827Z" + }, + "papermill": { + "duration": 0.021585, + "end_time": "2024-01-05T18:03:27.417034", + "exception": false, + "start_time": "2024-01-05T18:03:27.395449", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": 
[ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gene0_idgene1_idgene0_symbolgene1_symboltissueccc_coefpearson_coefspearman_coefccc_pvaluepearson_pvaluespearman_pvalue
0ENSG00000147050.14ENSG00000183878.15KDM6AUTYcolon_transverse0.336727-0.517899-0.4083439.999990e-073.063710e-299.539160e-18
1ENSG00000147050.14ENSG00000183878.15KDM6AUTYbrain_amygdala0.2805240.0375410.1475719.999990e-076.461090e-016.963020e-02
2ENSG00000147050.14ENSG00000183878.15KDM6AUTYartery_coronary0.274554-0.413862-0.3917649.999990e-072.389970e-113.159320e-10
3ENSG00000147050.14ENSG00000183878.15KDM6AUTYartery_aorta0.429771-0.485788-0.3635109.999990e-075.775750e-276.092380e-15
4ENSG00000147050.14ENSG00000183878.15KDM6AUTYadrenal_gland0.260197-0.459190-0.3541909.999990e-077.334490e-154.847680e-09
\n", + "
" + ], + "text/plain": [ + " gene0_id gene1_id gene0_symbol gene1_symbol \\\n", + "0 ENSG00000147050.14 ENSG00000183878.15 KDM6A UTY \n", + "1 ENSG00000147050.14 ENSG00000183878.15 KDM6A UTY \n", + "2 ENSG00000147050.14 ENSG00000183878.15 KDM6A UTY \n", + "3 ENSG00000147050.14 ENSG00000183878.15 KDM6A UTY \n", + "4 ENSG00000147050.14 ENSG00000183878.15 KDM6A UTY \n", + "\n", + " tissue ccc_coef pearson_coef spearman_coef ccc_pvalue \\\n", + "0 colon_transverse 0.336727 -0.517899 -0.408343 9.999990e-07 \n", + "1 brain_amygdala 0.280524 0.037541 0.147571 9.999990e-07 \n", + "2 artery_coronary 0.274554 -0.413862 -0.391764 9.999990e-07 \n", + "3 artery_aorta 0.429771 -0.485788 -0.363510 9.999990e-07 \n", + "4 adrenal_gland 0.260197 -0.459190 -0.354190 9.999990e-07 \n", + "\n", + " pearson_pvalue spearman_pvalue \n", + "0 3.063710e-29 9.539160e-18 \n", + "1 6.461090e-01 6.963020e-02 \n", + "2 2.389970e-11 3.159320e-10 \n", + "3 5.775750e-27 6.092380e-15 \n", + "4 7.334490e-15 4.847680e-09 " + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_again.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "d69f9e89-c489-441b-884c-7260165292db", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-05T18:03:27.435673Z", + "iopub.status.busy": "2024-01-05T18:03:27.435351Z", + "iopub.status.idle": "2024-01-05T18:03:27.455544Z", + "shell.execute_reply": "2024-01-05T18:03:27.455149Z" + }, + "papermill": { + "duration": 0.030234, + "end_time": "2024-01-05T18:03:27.456257", + "exception": false, + "start_time": "2024-01-05T18:03:27.426023", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# testing\n", + "pd.testing.assert_frame_equal(\n", + " data2,\n", + " data_again,\n", + " check_categorical=False,\n", + " check_dtype=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25663d89-6720-40d8-946b-3bbb809f00df", + 
"metadata": { + "papermill": { + "duration": 0.008735, + "end_time": "2024-01-05T18:03:27.473877", + "exception": false, + "start_time": "2024-01-05T18:03:27.465142", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "all,-execution,-papermill,-trusted", + "notebook_metadata_filter": "-jupytext.text_representation.jupytext_version" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + }, + "papermill": { + "default_parameters": {}, + "duration": 3.218195, + "end_time": "2024-01-05T18:03:27.697248", + "environment_variables": {}, + "exception": null, + "input_path": "nbs/99_manuscript/supplementary_files/01-supplementary_file3.ipynb", + "output_path": "nbs/99_manuscript/supplementary_files/01-supplementary_file3.run.ipynb", + "parameters": {}, + "start_time": "2024-01-05T18:03:24.479053", + "version": "2.3.4" + }, + "toc-autonumbering": true + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nbs/99_manuscript/supplementary_files/py/01-supplementary_file3.py b/nbs/99_manuscript/supplementary_files/py/01-supplementary_file3.py new file mode 100644 index 00000000..fb3a6812 --- /dev/null +++ b/nbs/99_manuscript/supplementary_files/py/01-supplementary_file3.py @@ -0,0 +1,348 @@ +# --- +# jupyter: +# jupytext: +# cell_metadata_filter: all,-execution,-papermill,-trusted +# notebook_metadata_filter: -jupytext.text_representation.jupytext_version +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] tags=[] +# # 
# Description

# %% [markdown] tags=[]
# Creates **Supplementary File 3**.
#
# *Description*: Correlations and p-values of a subset of gene pairs across all tissues in GTEx v8.

# %% [markdown] tags=[]
# # Modules

# %% tags=[]
import pandas as pd
import numpy as np
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

from ccc import conf

# %% tags=[]
# R functions used below to write the RDS copy of the file and read it back
readRDS = ro.r["readRDS"]

# %% tags=[]
saveRDS = ro.r["saveRDS"]

# %% [markdown] tags=[]
# # Settings

# %% tags=[]
DATASET_CONFIG = conf.GTEX

# %% tags=[]
# Gene pairs included in the supplementary file, as
# (gene0_id, gene1_id, gene0_symbol, gene1_symbol)
GENE_PAIRS = [
    ("ENSG00000147050.14", "ENSG00000183878.15", "KDM6A", "UTY"),
    ("ENSG00000147050.14", "ENSG00000067048.16", "KDM6A", "DDX3Y"),
]

# %% tags=[]
# Column names used in the input pickles -> column names used in the output
COEF_COLUMNS = {
    "cm": "ccc_coef",
    "pearson": "pearson_coef",
    "spearman": "spearman_coef",
}

PVALUE_COLUMNS = {
    "cm": "ccc_pvalue",
    "pearson": "pearson_pvalue",
    "spearman": "spearman_pvalue",
}

# %% [markdown] tags=[]
# # Paths

# %% tags=[]
assert (
    conf.MANUSCRIPT["BASE_DIR"] is not None and conf.MANUSCRIPT["BASE_DIR"].exists()
), "Manuscript dir not set"

# %% tags=[]
INPUT_DIR = conf.GTEX["RESULTS_DIR"] / "other_tissues"
display(INPUT_DIR)

# %% tags=[]
OUTPUT_DIR = conf.MANUSCRIPT["SUPPLEMENTARY_MATERIAL_DIR"]
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
display(OUTPUT_DIR)

# %% tags=[]
OUTPUT_FILENAME = "Supplementary_File_03-Gene_pairs_correlations_all_GTEx_tissues"

# %% [markdown] tags=[]
# # Data

# %% [markdown] tags=[]
# ## Gene Ensembl ID -> Symbol mapping

# %% tags=[]
gene_map = pd.read_pickle(
    DATASET_CONFIG["DATA_DIR"] / "gtex_gene_id_symbol_mappings.pkl"
)

# %% tags=[]
gene_map = gene_map.set_index("gene_ens_id")["gene_symbol"].to_dict()

# %% tags=[]
# sanity check on a known ID/symbol pair
assert gene_map["ENSG00000145309.5"] == "CABS1"

# %% [markdown] tags=[]
# # Per gene pair results


# %% tags=[]
def _load_gene_pair_results(gene0_id, gene1_id, gene0_symbol, gene1_symbol):
    """Load coefficients and p-values of one gene pair as a single dataframe.

    Reads `coef_values.pkl` and `coef_pvalues.pkl` from
    `INPUT_DIR / "<gene0_symbol>_vs_<gene1_symbol>"` (symbols in lowercase),
    renames their columns using `COEF_COLUMNS` and `PVALUE_COLUMNS`, and
    inner-joins both tables on their index. The index is then moved into a
    column named "tissue", and the gene IDs and symbols are inserted as the
    first four columns.

    Args:
        gene0_id: Ensembl ID of the first gene (a key of `gene_map`).
        gene1_id: Ensembl ID of the second gene (a key of `gene_map`).
        gene0_symbol: symbol of the first gene.
        gene1_symbol: symbol of the second gene.

    Returns:
        A dataframe with columns gene0_id, gene1_id, gene0_symbol,
        gene1_symbol, tissue, and the renamed coefficient and p-value columns.

    Raises:
        AssertionError: if the symbols do not match `gene_map`, or if the
            inner join dropped rows from either input table.
    """
    assert gene_map[gene0_id] == gene0_symbol
    assert gene_map[gene1_id] == gene1_symbol

    gene_pair_input_dir = (
        INPUT_DIR / f"{gene0_symbol.lower()}_vs_{gene1_symbol.lower()}"
    )
    display(gene_pair_input_dir)

    # correlation values
    res_all = pd.read_pickle(gene_pair_input_dir / "coef_values.pkl").rename(
        columns=COEF_COLUMNS
    )
    display(res_all.shape)
    display(res_all.head())

    # p-values
    res_pval_all = pd.read_pickle(gene_pair_input_dir / "coef_pvalues.pkl").rename(
        columns=PVALUE_COLUMNS
    )
    display(res_pval_all.shape)
    display(res_pval_all.head())

    # combine; both tables must have exactly the same rows, so the inner join
    # must not drop anything
    df = res_all.join(res_pval_all, how="inner").rename_axis("tissue").reset_index()
    assert df.shape[0] == res_all.shape[0]
    assert df.shape[0] == res_pval_all.shape[0]

    df.insert(0, "gene0_id", gene0_id)
    df.insert(1, "gene1_id", gene1_id)
    df.insert(2, "gene0_symbol", gene0_symbol)
    df.insert(3, "gene1_symbol", gene1_symbol)

    display(df.shape)
    display(df.head())

    return df


# %% [markdown] tags=[]
# # List of dataframes to combine

# %% tags=[]
df_list = [_load_gene_pair_results(*gene_pair) for gene_pair in GENE_PAIRS]

# %% [markdown] tags=[]
# # Combine

# %% tags=[]
df_final = pd.concat(df_list, ignore_index=True, axis=0)

# %% tags=[]
# all rows were kept and no column was added by the concatenation
assert df_final.shape[0] == sum(d.shape[0] for d in df_list)
for d in df_list:
    assert df_final.shape[1] == d.shape[1]
display(df_final.shape)

# %% tags=[]
df_final

# %% [markdown] tags=[]
# # Save

# %% tags=[]
data = df_final

# %% tags=[]
display(data.index.dtype)
display(data.index)

# %% tags=[]
# reset index to avoid problems with MultiIndex in Pandas
if isinstance(data.index, pd.MultiIndex):
    display("MultiIndex")
    data = data.reset_index()

# %% [markdown] tags=[]
# ## Pickle

# %% tags=[]
data.to_pickle(OUTPUT_DIR / f"{OUTPUT_FILENAME}.pkl.gz")

# %% [markdown] tags=[]
# ## RDS

# %% tags=[]
output_file = OUTPUT_DIR / f"{OUTPUT_FILENAME}.rds"
display(output_file)

# %% tags=[]
with localconverter(ro.default_converter + pandas2ri.converter):
    data_r = ro.conversion.py2rpy(data)

# %% tags=[]
data_r

# %% tags=[]
saveRDS(data_r, str(output_file))

# %% tags=[]
# testing: load the rds file again
data_r = readRDS(str(output_file))

# %% tags=[]
with localconverter(ro.default_converter + pandas2ri.converter):
    data_again = ro.conversion.rpy2py(data_r)
    # cast the index coming back from R to int so it can be compared with the
    # index of `data` (presumably R row names come back as strings -- confirm)
    data_again.index = data_again.index.astype(int)

# %% tags=[]
data_again.shape

# %% tags=[]
data_again.head()

# %% tags=[]
# testing
pd.testing.assert_frame_equal(
    data,
    data_again,
    check_dtype=False,
)

# %% [markdown] tags=[]
# ## Text

# %% tags=[]
# tsv format
output_file = OUTPUT_DIR / f"{OUTPUT_FILENAME}.tsv"
display(output_file)

# %% tags=[]
# floats are written with six significant digits
data.to_csv(output_file, sep="\t", index=False, float_format="%.5e")

# %% tags=[]
# testing: load the tsv file again
data_again = pd.read_csv(output_file, sep="\t", index_col=None)

# %% tags=[]
data_again.shape

# %% tags=[]
data_again.head()

# %% tags=[]
# testing
# The values read back are rounded by `float_format` above, so they are not
# bit-identical to `data`; this check relies on the approximate (non-exact)
# float comparison that assert_frame_equal performs by default.
pd.testing.assert_frame_equal(
    data,
    data_again,
    check_categorical=False,
    check_dtype=False,
)

# %% tags=[]