Skip to content

Commit

Permalink
Added VAE notebook.
Browse files Browse the repository at this point in the history
  • Loading branch information
percolator committed Nov 18, 2024
1 parent 182f615 commit e9ac9cf
Show file tree
Hide file tree
Showing 4 changed files with 1,106 additions and 0 deletions.
1 change: 1 addition & 0 deletions dsbook/_toc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ parts:
- file: unsupervised/pca.ipynb
- file: unsupervised/PCAofCarcinomas.ipynb
- file: unsupervised/autoenc.ipynb
- file: unsupervised/VAEofCarcinomas.ipynb
- caption: Network Analysis
numbered: True
chapters:
Expand Down
382 changes: 382 additions & 0 deletions dsbook/network/gsea.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,382 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"# Example of ORA and GSEA"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
"source": [
"We first run the same steps as in the previous notebook on multiple testing."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading ../common/../data/brca_tcga_pub2015.tar.gz...\n",
"Download complete.\n",
"File extracted to ../data/brca_tcga_pub2015\n",
"File extracted to ../data/brca_tcga_pub2015\n"
]
},
{
"ename": "KeyError",
"evalue": "'PR status by ihc'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"File \u001b[0;32m~/anaconda3/envs/jb/lib/python3.11/site-packages/pandas/core/indexes/base.py:3805\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3804\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3805\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine\u001b[38;5;241m.\u001b[39mget_loc(casted_key)\n\u001b[1;32m 3806\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n",
"File \u001b[0;32mindex.pyx:167\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
"File \u001b[0;32mindex.pyx:196\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
"File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7081\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
"File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7089\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mKeyError\u001b[0m: 'PR status by ihc'",
"\nThe above exception was the direct cause of the following exception:\n",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[1], line 20\u001b[0m\n\u001b[1;32m 18\u001b[0m brca \u001b[38;5;241m=\u001b[39m brca\u001b[38;5;241m.\u001b[39mloc[\u001b[38;5;241m~\u001b[39m(brca\u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.0\u001b[39m)\u001b[38;5;241m.\u001b[39many(axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)]\n\u001b[1;32m 19\u001b[0m brca \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(data\u001b[38;5;241m=\u001b[39mnp\u001b[38;5;241m.\u001b[39mlog2(brca),index\u001b[38;5;241m=\u001b[39mbrca\u001b[38;5;241m.\u001b[39mindex,columns\u001b[38;5;241m=\u001b[39mbrca\u001b[38;5;241m.\u001b[39mcolumns)\n\u001b[0;32m---> 20\u001b[0m brca_clin\u001b[38;5;241m.\u001b[39mloc[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m3N\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m=\u001b[39m (brca_clin\u001b[38;5;241m.\u001b[39mloc[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPR status by ihc\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m==\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNegative\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m&\u001b[39m (brca_clin\u001b[38;5;241m.\u001b[39mloc[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mER Status By IHC\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m==\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNegative\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m&\u001b[39m (brca_clin\u001b[38;5;241m.\u001b[39mloc[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIHC-HER2\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m==\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNegative\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 21\u001b[0m tripple_negative_bool \u001b[38;5;241m=\u001b[39m (brca_clin\u001b[38;5;241m.\u001b[39mloc[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m3N\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_significance_two_groups\u001b[39m(row):\n",
"File \u001b[0;32m~/anaconda3/envs/jb/lib/python3.11/site-packages/pandas/core/indexing.py:1191\u001b[0m, in \u001b[0;36m_LocationIndexer.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 1189\u001b[0m maybe_callable \u001b[38;5;241m=\u001b[39m com\u001b[38;5;241m.\u001b[39mapply_if_callable(key, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mobj)\n\u001b[1;32m 1190\u001b[0m maybe_callable \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_deprecated_callable_usage(key, maybe_callable)\n\u001b[0;32m-> 1191\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_axis(maybe_callable, axis\u001b[38;5;241m=\u001b[39maxis)\n",
"File \u001b[0;32m~/anaconda3/envs/jb/lib/python3.11/site-packages/pandas/core/indexing.py:1431\u001b[0m, in \u001b[0;36m_LocIndexer._getitem_axis\u001b[0;34m(self, key, axis)\u001b[0m\n\u001b[1;32m 1429\u001b[0m \u001b[38;5;66;03m# fall thru to straight lookup\u001b[39;00m\n\u001b[1;32m 1430\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_key(key, axis)\n\u001b[0;32m-> 1431\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_label(key, axis\u001b[38;5;241m=\u001b[39maxis)\n",
"File \u001b[0;32m~/anaconda3/envs/jb/lib/python3.11/site-packages/pandas/core/indexing.py:1381\u001b[0m, in \u001b[0;36m_LocIndexer._get_label\u001b[0;34m(self, label, axis)\u001b[0m\n\u001b[1;32m 1379\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_get_label\u001b[39m(\u001b[38;5;28mself\u001b[39m, label, axis: AxisInt):\n\u001b[1;32m 1380\u001b[0m \u001b[38;5;66;03m# GH#5567 this will fail if the label is not present in the axis.\u001b[39;00m\n\u001b[0;32m-> 1381\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mobj\u001b[38;5;241m.\u001b[39mxs(label, axis\u001b[38;5;241m=\u001b[39maxis)\n",
"File \u001b[0;32m~/anaconda3/envs/jb/lib/python3.11/site-packages/pandas/core/generic.py:4301\u001b[0m, in \u001b[0;36mNDFrame.xs\u001b[0;34m(self, key, axis, level, drop_level)\u001b[0m\n\u001b[1;32m 4299\u001b[0m new_index \u001b[38;5;241m=\u001b[39m index[loc]\n\u001b[1;32m 4300\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 4301\u001b[0m loc \u001b[38;5;241m=\u001b[39m index\u001b[38;5;241m.\u001b[39mget_loc(key)\n\u001b[1;32m 4303\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(loc, np\u001b[38;5;241m.\u001b[39mndarray):\n\u001b[1;32m 4304\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m loc\u001b[38;5;241m.\u001b[39mdtype \u001b[38;5;241m==\u001b[39m np\u001b[38;5;241m.\u001b[39mbool_:\n",
"File \u001b[0;32m~/anaconda3/envs/jb/lib/python3.11/site-packages/pandas/core/indexes/base.py:3812\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3807\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3808\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3809\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3810\u001b[0m ):\n\u001b[1;32m 3811\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3812\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3813\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3814\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3815\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3816\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3817\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n",
"\u001b[0;31mKeyError\u001b[0m: 'PR status by ihc'"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from scipy.stats import ttest_ind\n",
"import sys\n",
"IN_COLAB = 'google.colab' in sys.modules\n",
"if IN_COLAB:\n",
" ![ ! -f \"dsbook/README.md\" ] && git clone https://github.com/statisticalbiotechnology/dsbook.git\n",
" my_path = \"dsbook/dsbook/common/\"\n",
"else:\n",
" my_path = \"../common/\"\n",
"sys.path.append(my_path) # Read local modules for tcga access and qvalue calculations\n",
"import load_tcga as tcga\n",
"import qvalue \n",
"\n",
"brca = tcga.get_expression_data(my_path + \"../data/brca_tcga_pub2015.tar.gz\", 'https://cbioportal-datahub.s3.amazonaws.com/brca_tcga_pub2015.tar.gz',\"data_mrna_seq_v2_rsem.txt\")\n",
"brca_clin = tcga.get_clinical_data(my_path + \"../data/brca_tcga_pub2015.tar.gz\", 'https://cbioportal-datahub.s3.amazonaws.com/brca_tcga_pub2015.tar.gz',\"data_clinical_sample.txt\")\n",
"brca.dropna(axis=0, how='any', inplace=True)\n",
"brca = brca.loc[~(brca<=0.0).any(axis=1)]\n",
"brca = pd.DataFrame(data=np.log2(brca),index=brca.index,columns=brca.columns)\n",
"brca_clin.loc[\"3N\"]= (brca_clin.loc[\"PR_STATUS_BY_IHC\"]==\"Negative\") & (brca_clin.loc[\"ER_STATUS_BY_IHC\"]==\"Negative\") & (brca_clin.loc[\"IHC_HER2\"]==\"Negative\")\n",
"tripple_negative_bool = (brca_clin.loc[\"3N\"] == True)\n",
"\n",
"def get_significance_two_groups(row):\n",
" log_fold_change = row[tripple_negative_bool].mean() - row[~tripple_negative_bool].mean()\n",
" p = ttest_ind(row[tripple_negative_bool],row[~tripple_negative_bool],equal_var=False)[1]\n",
" return [p,-np.log10(p),log_fold_change]\n",
"\n",
"pvalues = brca.apply(get_significance_two_groups,axis=1,result_type=\"expand\")\n",
"pvalues.rename(columns = {list(pvalues)[0]: 'p', list(pvalues)[1]: '-log_p', list(pvalues)[2]: 'log_FC'}, inplace = True)\n",
"qvalues = qvalue.qvalues(pvalues)"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"If we investigate a Volcano plot of the tripple negative cancers vs. the other cancers, we see an large number of both up and down regulated genes. We will in this note book ecamine if there are common patterns in the up and down regulation. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"sns.relplot(data=qvalues,x=\"log_FC\",y=\"-log_p\")\n",
"plt.xlabel(\"$log_2(FC)$\")\n",
"plt.ylabel(\"$-log_{10}(p)$\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### Over-representation analysis\n",
"\n",
"We use the [gseapy](https://gseapy.readthedocs.io/) module to run an overrepresentation analysis. The module is unfortunately not implementing pathway analysis itself. It instead call a remote webserver[Enrichr](http://amp.pharm.mssm.edu/Enrichr/). \n",
"\n",
"In the analysis here we use the [KEGG](https://www.genome.jp/kegg/) database's definition of metabolomic pathways. This choice can easily be changed to other databases such as GO.\n",
"\n",
"Here we select to use the $q$ values below $10^{-15}$ as an input. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [],
"source": [
"import gseapy as gp\n",
"\n",
"pathway_db=['KEGG_2019_Human']\n",
"background=set(qvalues.index)\n",
"gene_list = list(qvalues.loc[qvalues[\"q\"]<1e-15,\"q\"].index)\n",
"\n",
"output_enrichr=pd.DataFrame()\n",
"enr=gp.enrichr(\n",
" gene_list=gene_list,\n",
" gene_sets=pathway_db,\n",
" background=background,\n",
" outdir = None\n",
" )\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"We clean up the results a bit by only keeping some of the resulting metics. We also multiple hypothesis correct our results, and list the terms with a FDR less than 5%."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [],
"source": [
"kegg_enr = enr.results[[\"P-value\",\"Term\"]].rename(columns={\"P-value\": \"p\"})\n",
"kegg_enr = qvalue.qvalues(kegg_enr)\n",
"kegg_enr.loc[kegg_enr[\"q\"]<0.20]"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
"source": [
"The analysis seem to find overrepresentation of relatively few pathways, particularly given the significance of the differences between case and controll on transcript level. "
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"## Geneset Enrichment analysis\n",
"\n",
"Subsequently we us pygsea to perform a geneset enricment analysis (GSEA).\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [],
"source": [
"classes = [\"TrippleNeg\" if tripple_negative_bool[sample_name] else \"Respond\" for sample_name in brca.columns]\n",
"gs_res = gp.gsea(data=brca, \n",
" gene_sets='KEGG_2016', \n",
" # gene_sets='Reactome_2013',\n",
" cls=classes, # cls=class_vector\n",
" # set permutation_type to phenotype if samples >=15\n",
" permutation_type='phenotype',\n",
" permutation_num=100, # reduce number to speed up test\n",
" outdir=None, # do not write output to disk\n",
" no_plot=True, # Skip plotting\n",
" method='signal_to_noise',\n",
" # method='t_test',\n",
" processes=4, # Number of allowed parallel processes\n",
" format='png',\n",
" ascending=True,\n",
" max_size=20000)\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
"source": [
"The pygsea module's fdr calculation seems to be broken, and we hence remake the significance calculations ourselves."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"gs_res.res2d"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [],
"source": [
"import qvalue\n",
"gs_res.res2d.sort_values(by=[\"NOM p-val\"],inplace=True)\n",
"out = qvalue.qvalues(gs_res.res2d,\"NOM p-val\").drop([\"FDR q-val\",\"FWER p-val\"], axis='columns')"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"We list the 2 topscoring pathways."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [],
"source": [
"out.loc[out[\"q\"]<0.20]"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"We display some detailed plots of the best scoring pathway using gseapy's plotting routines."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [],
"source": [
"from gseapy.plot import gseaplot, heatmap\n",
"terms = gs_res.res2d.Term\n",
"axs = gs_res.plot(terms=terms[11]) # v1.0.5"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [],
"source": [
"axs = gs_res.plot(terms=terms[9]) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"axs = gs_res.plot(terms=terms[6]) "
]
}
],
"metadata": {
"celltoolbar": "Slideshow",
"kernelspec": {
"display_name": "jb",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
1 change: 1 addition & 0 deletions dsbook/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ cptac
sphinxcontrib-mermaid
ipywidgets
jupytext
gseapy
Loading

0 comments on commit e9ac9cf

Please sign in to comment.