Added VAE notebook.

statisticalbiotechnology · Nov 18, 2024 · e9ac9cf · e9ac9cf
1 parent 182f615
commit e9ac9cf
Show file tree

Hide file tree

Showing 4 changed files with 1,106 additions and 0 deletions.
diff --git a/dsbook/_toc.yml b/dsbook/_toc.yml
@@ -32,6 +32,7 @@ parts:
     - file: unsupervised/pca.ipynb
     - file: unsupervised/PCAofCarcinomas.ipynb
     - file: unsupervised/autoenc.ipynb
+    - file: unsupervised/VAEofCarcinomas.ipynb
   - caption: Network Analysis
     numbered: True
     chapters:

diff --git a/dsbook/network/gsea.ipynb b/dsbook/network/gsea.ipynb
@@ -0,0 +1,382 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "slide"
+    }
+   },
+   "source": [
+    "# Example of ORA and GSEA"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "fragment"
+    }
+   },
+   "source": [
+    "We first run the same steps as in the previous notebook on multiple testing."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "slideshow": {
+     "slide_type": "fragment"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading ../common/../data/brca_tcga_pub2015.tar.gz...\n",
+      "Download complete.\n",
+      "File extracted to ../data/brca_tcga_pub2015\n",
+      "File extracted to ../data/brca_tcga_pub2015\n"
+     ]
+    },
+    {
+     "ename": "KeyError",
+     "evalue": "'PR status by ihc'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
+      "File \u001b[0;32m~/anaconda3/envs/jb/lib/python3.11/site-packages/pandas/core/indexes/base.py:3805\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m   3804\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3805\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine\u001b[38;5;241m.\u001b[39mget_loc(casted_key)\n\u001b[1;32m   3806\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n",
+      "File \u001b[0;32mindex.pyx:167\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
+      "File \u001b[0;32mindex.pyx:196\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
+      "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7081\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
+      "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7089\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
+      "\u001b[0;31mKeyError\u001b[0m: 'PR status by ihc'",
+      "\nThe above exception was the direct cause of the following exception:\n",
+      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[1], line 20\u001b[0m\n\u001b[1;32m     18\u001b[0m brca \u001b[38;5;241m=\u001b[39m brca\u001b[38;5;241m.\u001b[39mloc[\u001b[38;5;241m~\u001b[39m(brca\u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.0\u001b[39m)\u001b[38;5;241m.\u001b[39many(axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)]\n\u001b[1;32m     19\u001b[0m brca \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(data\u001b[38;5;241m=\u001b[39mnp\u001b[38;5;241m.\u001b[39mlog2(brca),index\u001b[38;5;241m=\u001b[39mbrca\u001b[38;5;241m.\u001b[39mindex,columns\u001b[38;5;241m=\u001b[39mbrca\u001b[38;5;241m.\u001b[39mcolumns)\n\u001b[0;32m---> 20\u001b[0m brca_clin\u001b[38;5;241m.\u001b[39mloc[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m3N\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m=\u001b[39m (brca_clin\u001b[38;5;241m.\u001b[39mloc[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPR status by ihc\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m==\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNegative\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m&\u001b[39m (brca_clin\u001b[38;5;241m.\u001b[39mloc[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mER Status By IHC\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m==\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNegative\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m&\u001b[39m (brca_clin\u001b[38;5;241m.\u001b[39mloc[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIHC-HER2\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m==\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNegative\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m     21\u001b[0m tripple_negative_bool \u001b[38;5;241m=\u001b[39m (brca_clin\u001b[38;5;241m.\u001b[39mloc[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m3N\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m     23\u001b[0m 
\u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_significance_two_groups\u001b[39m(row):\n",
+      "File \u001b[0;32m~/anaconda3/envs/jb/lib/python3.11/site-packages/pandas/core/indexing.py:1191\u001b[0m, in \u001b[0;36m_LocationIndexer.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m   1189\u001b[0m maybe_callable \u001b[38;5;241m=\u001b[39m com\u001b[38;5;241m.\u001b[39mapply_if_callable(key, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mobj)\n\u001b[1;32m   1190\u001b[0m maybe_callable \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_deprecated_callable_usage(key, maybe_callable)\n\u001b[0;32m-> 1191\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_axis(maybe_callable, axis\u001b[38;5;241m=\u001b[39maxis)\n",
+      "File \u001b[0;32m~/anaconda3/envs/jb/lib/python3.11/site-packages/pandas/core/indexing.py:1431\u001b[0m, in \u001b[0;36m_LocIndexer._getitem_axis\u001b[0;34m(self, key, axis)\u001b[0m\n\u001b[1;32m   1429\u001b[0m \u001b[38;5;66;03m# fall thru to straight lookup\u001b[39;00m\n\u001b[1;32m   1430\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_key(key, axis)\n\u001b[0;32m-> 1431\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_label(key, axis\u001b[38;5;241m=\u001b[39maxis)\n",
+      "File \u001b[0;32m~/anaconda3/envs/jb/lib/python3.11/site-packages/pandas/core/indexing.py:1381\u001b[0m, in \u001b[0;36m_LocIndexer._get_label\u001b[0;34m(self, label, axis)\u001b[0m\n\u001b[1;32m   1379\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_get_label\u001b[39m(\u001b[38;5;28mself\u001b[39m, label, axis: AxisInt):\n\u001b[1;32m   1380\u001b[0m     \u001b[38;5;66;03m# GH#5567 this will fail if the label is not present in the axis.\u001b[39;00m\n\u001b[0;32m-> 1381\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mobj\u001b[38;5;241m.\u001b[39mxs(label, axis\u001b[38;5;241m=\u001b[39maxis)\n",
+      "File \u001b[0;32m~/anaconda3/envs/jb/lib/python3.11/site-packages/pandas/core/generic.py:4301\u001b[0m, in \u001b[0;36mNDFrame.xs\u001b[0;34m(self, key, axis, level, drop_level)\u001b[0m\n\u001b[1;32m   4299\u001b[0m             new_index \u001b[38;5;241m=\u001b[39m index[loc]\n\u001b[1;32m   4300\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 4301\u001b[0m     loc \u001b[38;5;241m=\u001b[39m index\u001b[38;5;241m.\u001b[39mget_loc(key)\n\u001b[1;32m   4303\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(loc, np\u001b[38;5;241m.\u001b[39mndarray):\n\u001b[1;32m   4304\u001b[0m         \u001b[38;5;28;01mif\u001b[39;00m loc\u001b[38;5;241m.\u001b[39mdtype \u001b[38;5;241m==\u001b[39m np\u001b[38;5;241m.\u001b[39mbool_:\n",
+      "File \u001b[0;32m~/anaconda3/envs/jb/lib/python3.11/site-packages/pandas/core/indexes/base.py:3812\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m   3807\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m   3808\u001b[0m         \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m   3809\u001b[0m         \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m   3810\u001b[0m     ):\n\u001b[1;32m   3811\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3812\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m   3813\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m   3814\u001b[0m     \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m   3815\u001b[0m     \u001b[38;5;66;03m#  InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m   3816\u001b[0m     \u001b[38;5;66;03m#  the TypeError.\u001b[39;00m\n\u001b[1;32m   3817\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n",
+      "\u001b[0;31mKeyError\u001b[0m: 'PR status by ihc'"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from scipy.stats import ttest_ind\n",
+    "import sys\n",
+    "IN_COLAB = 'google.colab' in sys.modules\n",
+    "if IN_COLAB:\n",
+    "    ![ ! -f \"dsbook/README.md\" ] && git clone https://github.com/statisticalbiotechnology/dsbook.git\n",
+    "    my_path = \"dsbook/dsbook/common/\"\n",
+    "else:\n",
+    "    my_path = \"../common/\"\n",
+    "sys.path.append(my_path) # Read local modules for tcga access and qvalue calculations\n",
+    "import load_tcga as tcga\n",
+    "import qvalue \n",
+    "\n",
+    "brca = tcga.get_expression_data(my_path + \"../data/brca_tcga_pub2015.tar.gz\", 'https://cbioportal-datahub.s3.amazonaws.com/brca_tcga_pub2015.tar.gz',\"data_mrna_seq_v2_rsem.txt\")\n",
+    "brca_clin = tcga.get_clinical_data(my_path + \"../data/brca_tcga_pub2015.tar.gz\", 'https://cbioportal-datahub.s3.amazonaws.com/brca_tcga_pub2015.tar.gz',\"data_clinical_sample.txt\")\n",
+    "brca.dropna(axis=0, how='any', inplace=True)\n",
+    "brca = brca.loc[~(brca<=0.0).any(axis=1)]\n",
+    "brca = pd.DataFrame(data=np.log2(brca),index=brca.index,columns=brca.columns)\n",
+    "brca_clin.loc[\"3N\"]= (brca_clin.loc[\"PR_STATUS_BY_IHC\"]==\"Negative\") & (brca_clin.loc[\"ER_STATUS_BY_IHC\"]==\"Negative\") & (brca_clin.loc[\"IHC_HER2\"]==\"Negative\")\n",
+    "tripple_negative_bool = (brca_clin.loc[\"3N\"] == True)\n",
+    "\n",
+    "def get_significance_two_groups(row):\n",
+    "    log_fold_change = row[tripple_negative_bool].mean() - row[~tripple_negative_bool].mean()\n",
+    "    p = ttest_ind(row[tripple_negative_bool],row[~tripple_negative_bool],equal_var=False)[1]\n",
+    "    return [p,-np.log10(p),log_fold_change]\n",
+    "\n",
+    "pvalues = brca.apply(get_significance_two_groups,axis=1,result_type=\"expand\")\n",
+    "pvalues.rename(columns = {list(pvalues)[0]: 'p', list(pvalues)[1]: '-log_p', list(pvalues)[2]: 'log_FC'}, inplace = True)\n",
+    "qvalues = qvalue.qvalues(pvalues)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "slide"
+    }
+   },
+   "source": [
+    "If we investigate a volcano plot of the triple-negative cancers vs. the other cancers, we see a large number of both up- and down-regulated genes. In this notebook we will examine whether there are common patterns in the up- and down-regulation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "slideshow": {
+     "slide_type": "fragment"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "\n",
+    "sns.relplot(data=qvalues,x=\"log_FC\",y=\"-log_p\")\n",
+    "plt.xlabel(\"$log_2(FC)$\")\n",
+    "plt.ylabel(\"$-log_{10}(p)$\")\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "slide"
+    }
+   },
+   "source": [
+    "### Over-representation analysis\n",
+    "\n",
+    "We use the [gseapy](https://gseapy.readthedocs.io/) module to run an over-representation analysis. Unfortunately, the module does not implement the pathway analysis itself; instead it calls a remote web server, [Enrichr](http://amp.pharm.mssm.edu/Enrichr/).  \n",
+    "\n",
+    "In the analysis here we use the [KEGG](https://www.genome.jp/kegg/) database's definition of metabolic pathways. This choice can easily be changed to other databases such as GO.\n",
+    "\n",
+    "Here we use the genes with $q$ values below $10^{-15}$ as input."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "slideshow": {
+     "slide_type": "fragment"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import gseapy as gp\n",
+    "\n",
+    "pathway_db=['KEGG_2019_Human']\n",
+    "background=set(qvalues.index)\n",
+    "gene_list = list(qvalues.loc[qvalues[\"q\"]<1e-15,\"q\"].index)\n",
+    "\n",
+    "output_enrichr=pd.DataFrame()\n",
+    "enr=gp.enrichr(\n",
+    "                gene_list=gene_list,\n",
+    "                gene_sets=pathway_db,\n",
+    "                background=background,\n",
+    "                outdir = None\n",
+    "            )\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "slide"
+    }
+   },
+   "source": [
+    "We clean up the results a bit by keeping only some of the resulting metrics. We also correct our results for multiple hypothesis testing, and list the terms with an FDR below 20%."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "slideshow": {
+     "slide_type": "fragment"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "kegg_enr = enr.results[[\"P-value\",\"Term\"]].rename(columns={\"P-value\": \"p\"})\n",
+    "kegg_enr = qvalue.qvalues(kegg_enr)\n",
+    "kegg_enr.loc[kegg_enr[\"q\"]<0.20]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "fragment"
+    }
+   },
+   "source": [
+    "The analysis seems to find over-representation of relatively few pathways, particularly given the significance of the differences between cases and controls at the transcript level."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "slide"
+    }
+   },
+   "source": [
+    "## Gene set enrichment analysis\n",
+    "\n",
+    "Subsequently we use gseapy to perform a gene set enrichment analysis (GSEA).\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "slideshow": {
+     "slide_type": "fragment"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "classes = [\"TrippleNeg\" if tripple_negative_bool[sample_name] else \"Respond\" for sample_name in brca.columns]\n",
+    "gs_res = gp.gsea(data=brca, \n",
+    "                 gene_sets='KEGG_2016', \n",
+    "                 # gene_sets='Reactome_2013',\n",
+    "                 cls=classes, # cls=class_vector\n",
+    "                 # set permutation_type to phenotype if samples >=15\n",
+    "                 permutation_type='phenotype',\n",
+    "                 permutation_num=100, # reduce number to speed up test\n",
+    "                 outdir=None,  # do not write output to disk\n",
+    "                 no_plot=True, # Skip plotting\n",
+    "                 method='signal_to_noise',\n",
+    "                 # method='t_test',\n",
+    "                 processes=4, # Number of allowed parallel processes\n",
+    "                 format='png',\n",
+    "                 ascending=True,\n",
+    "                max_size=20000)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "fragment"
+    }
+   },
+   "source": [
+    "The gseapy module's FDR calculation seems to be broken, and we hence redo the significance calculations ourselves."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gs_res.res2d"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "slideshow": {
+     "slide_type": "fragment"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import qvalue\n",
+    "gs_res.res2d.sort_values(by=[\"NOM p-val\"],inplace=True)\n",
+    "out = qvalue.qvalues(gs_res.res2d,\"NOM p-val\").drop([\"FDR q-val\",\"FWER p-val\"], axis='columns')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "slide"
+    }
+   },
+   "source": [
+    "We list the two top-scoring pathways."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "slideshow": {
+     "slide_type": "fragment"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "out.loc[out[\"q\"]<0.20]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "slide"
+    }
+   },
+   "source": [
+    "We display some detailed plots of the best-scoring pathways using gseapy's plotting routines."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "slideshow": {
+     "slide_type": "fragment"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from gseapy.plot import gseaplot, heatmap\n",
+    "terms = gs_res.res2d.Term\n",
+    "axs = gs_res.plot(terms=terms[11]) # v1.0.5"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "slideshow": {
+     "slide_type": "slide"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "axs = gs_res.plot(terms=terms[9]) "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "axs = gs_res.plot(terms=terms[6]) "
+   ]
+  }
+ ],
+ "metadata": {
+  "celltoolbar": "Slideshow",
+  "kernelspec": {
+   "display_name": "jb",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/dsbook/requirements.txt b/dsbook/requirements.txt
@@ -8,3 +8,4 @@ cptac
 sphinxcontrib-mermaid
 ipywidgets
 jupytext
+gseapy