diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 349e8173c..60149e4b1 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -67,19 +67,27 @@ jobs: run: | cd project mkdir runs - papermill 04_1_train_DAE_VAE_wo_val_data.ipynb runs/04_1_train_DAE_VAE_wo_val_data.ipynb papermill 04_1_train_pimms_models.ipynb runs/04_1_train_pimms_models.ipynb - - name: Run demo workflow (integration test) + papermill 04_1_train_pimms_models.ipynb runs/04_1_train_pimms_models_no_val.ipynb -p sample_splits False + - name: Dry-Run demo workflow (integration test) continue-on-error: true run: | cd project snakemake -p -c1 --configfile config/single_dev_dataset/example/config.yaml -n + - name: Run demo workflow (integration test) + continue-on-error: true + run: | + cd project snakemake -p -c4 -k --configfile config/single_dev_dataset/example/config.yaml - name: Run demo workflow again (in case of installation issues) + continue-on-error: true run: | cd project - snakemake -p -c1 -n --configfile config/single_dev_dataset/example/config.yaml snakemake -p -c4 -k --configfile config/single_dev_dataset/example/config.yaml + - name: Run demo workflow again (in case of installation issues) - one thread + run: | + cd project + snakemake -p -c1 --configfile config/single_dev_dataset/example/config.yaml - name: Archive results # https://github.com/actions/upload-artifact uses: actions/upload-artifact@v4 diff --git a/.github/workflows/test_pkg_on_colab.yaml b/.github/workflows/test_pkg_on_colab.yaml index 546434bb3..3f0b6e757 100644 --- a/.github/workflows/test_pkg_on_colab.yaml +++ b/.github/workflows/test_pkg_on_colab.yaml @@ -1,10 +1,10 @@ name: Test that tutorial runs on latest colab image on: - push: - branches: [dev] + # push: + # branches: [main] pull_request: - branches: [main, dev] + branches: [main] schedule: - cron: '0 2 3 * *' @@ -24,3 +24,5 @@ jobs: run: | cd project papermill 04_1_train_pimms_models.ipynb 04_1_train_pimms_models_output.ipynb + papermill 04_1_train_pimms_models.ipynb 04_1_train_pimms_models_no_val.ipynb -p sample_splits False + diff --git a/.github/workflows/workflow_website.yaml b/.github/workflows/workflow_website.yaml index be97836db..d4f4e845a 100644 --- a/.github/workflows/workflow_website.yaml +++ b/.github/workflows/workflow_website.yaml @@ -39,9 +39,15 @@ jobs: cd project snakemake -s workflow/Snakefile_v2.smk --configfile config/alzheimer_study/config.yaml -p -c4 -k - name: Run demo workflow again (in case of installation issues) + continue-on-error: true run: | cd project snakemake -s workflow/Snakefile_v2.smk --configfile config/alzheimer_study/config.yaml -p -c4 -k + - name: Run demo workflow again (in case of installation issues) with one thread + continue-on-error: true + run: | + cd project + snakemake -s workflow/Snakefile_v2.smk --configfile config/alzheimer_study/config.yaml -p -c1 -k - name: Run differential analysis workflow run: | cd project diff --git a/README.md b/README.md index 4af30064e..b11a02b64 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ The publication is accepted in Nature Communications and the pre-print is available [on biorxiv](https://doi.org/10.1101/2023.01.12.523792). > `PIMMS` was called `vaep` during development. -> Before entire refactoring has to been completed the imported package will be `vaep`. +> Before entire refactoring has been completed the imported package will be `vaep`. We provide functionality as a python package, an excutable workflow or simply in notebooks. 
@@ -127,17 +127,24 @@ mamba env create -n pimms -f environment.yml # faster, less then 5mins If on Mac M1, M2 or having otherwise issue using your accelerator (e.g. GPUs): Install the pytorch dependencies first, then the rest of the environment: -### Install pytorch first (M-chips) +### Install pytorch first + +> :warning: We currently see issues with some installations on M1 chips. A dependency +> for one workflow is polars, which causes the issue. This should be [fixed now](https://github.com/RasmussenLab/njab/pull/13) +> for general use by delayed import +> of `mrmr-selection` in `njab`. If you encounter issues, please open an issue. Check how to install pytorch for your system [here](https://pytorch.org/get-started). - select the version compatible with your cuda version if you have an nvidia gpu or a Mac M-chip. ```bash -conda create -n vaep python=3.9 pip -conda activate vaep -# Follow instructions on https://pytorch.org/get-started -# conda env update -f environment.yml -n vaep # should not install the rest. +conda create -n pimms python=3.9 pip +conda activate pimms +# Follow instructions on https://pytorch.org/get-started: +# CUDA is not available on MacOS, please use default package +# pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu +conda install pytorch::pytorch torchvision torchaudio fastai -c pytorch -c fastai -y pip install pimms-learn pip install jupyterlab papermill # use run notebook interactively or as a script diff --git a/project/01_0_split_data.ipynb b/project/01_0_split_data.ipynb index 3b757bb1c..d73788af0 100644 --- a/project/01_0_split_data.ipynb +++ b/project/01_0_split_data.ipynb @@ -1370,48 +1370,7 @@ "# -> or raise error as feature completness treshold is so low that less than 3 samples\n", "# per feature are allowd.\n", "\n", - "diff = (splits\n", - " .val_y\n", - " .index\n", - " .levels[-1]\n", - " .difference(splits\n", - " .train_X\n", - " .index\n", - " .levels[-1]\n", - " ).to_list())\n", - "if diff:\n", - " to_remove = splits.val_y.loc[pd.IndexSlice[:, diff]]\n", - " display(to_remove)\n", - " splits.train_X = pd.concat([splits.train_X, to_remove])\n", - " splits.val_y = splits.val_y.drop(to_remove.index)\n", - "diff" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "hide-input" - ] - }, - "outputs": [], - "source": [ - "diff = (splits\n", - " .test_y\n", - " .index\n", - " .levels[-1]\n", - " .difference(splits\n", - " .train_X\n", - " .index\n", - " .levels[-1]\n", - " ).to_list())\n", - "if diff:\n", - " to_remove = splits.test_y.loc[pd.IndexSlice[:, diff]]\n", - " display(to_remove)\n", - " splits.train_X = pd.concat([splits.train_X, to_remove])\n", - " splits.test_y = splits.test_y.drop(to_remove.index)\n", - "diff" + "splits = vaep.sampling.check_split_integrity(splits)" ] }, { @@ -1812,17 +1771,6 @@ "writer.close()\n", "dumps" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "hide-input" - ] - }, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/project/01_0_split_data.py b/project/01_0_split_data.py index 9be123d10..e4b4c2450 100644 --- a/project/01_0_split_data.py +++ b/project/01_0_split_data.py @@ -724,38 +724,7 @@ def join_as_str(seq): # -> or raise error as feature completness treshold is so low that less than 3 samples # per feature are allowd. 
-diff = (splits - .val_y - .index - .levels[-1] - .difference(splits - .train_X - .index - .levels[-1] - ).to_list()) -if diff: - to_remove = splits.val_y.loc[pd.IndexSlice[:, diff]] - display(to_remove) - splits.train_X = pd.concat([splits.train_X, to_remove]) - splits.val_y = splits.val_y.drop(to_remove.index) -diff - -# %% tags=["hide-input"] -diff = (splits - .test_y - .index - .levels[-1] - .difference(splits - .train_X - .index - .levels[-1] - ).to_list()) -if diff: - to_remove = splits.test_y.loc[pd.IndexSlice[:, diff]] - display(to_remove) - splits.train_X = pd.concat([splits.train_X, to_remove]) - splits.test_y = splits.test_y.drop(to_remove.index) -diff +splits = vaep.sampling.check_split_integrity(splits) # %% [markdown] # Some tools require at least 4 observation in the training data, @@ -963,5 +932,3 @@ def join_as_str(seq): # %% tags=["hide-input"] writer.close() dumps - -# %% tags=["hide-input"] diff --git a/project/01_1_train_CF.ipynb b/project/01_1_train_CF.ipynb index b508cd19d..b1d3eac71 100644 --- a/project/01_1_train_CF.ipynb +++ b/project/01_1_train_CF.ipynb @@ -98,7 +98,6 @@ "# model\n", "# Dimensionality of encoding dimension (latent space of model)\n", "latent_dim: int = 10\n", - "# hidden_layers:str = '128_64' # Underscore separated string of layers, '128 64' for the encoder, reversed for decoder\n", "sample_idx_position: int = 0 # position of index which is sample ID\n", "model: str = 'CF' # model name\n", "model_key: str = 'CF' # potentially alternative key for model (grid search)\n", diff --git a/project/01_1_train_CF.py b/project/01_1_train_CF.py index 68acf2afa..847860649 100644 --- a/project/01_1_train_CF.py +++ b/project/01_1_train_CF.py @@ -69,7 +69,6 @@ # model # Dimensionality of encoding dimension (latent space of model) latent_dim: int = 10 -# hidden_layers:str = '128_64' # Underscore separated string of layers, '128 64' for the encoder, reversed for decoder sample_idx_position: int = 0 # position of index which is sample ID model: str = 'CF' # model name model_key: str = 'CF' # potentially alternative key for model (grid search) diff --git a/project/01_1_train_NAGuideR_methods.R b/project/01_1_train_NAGuideR_methods.R index 13997aa17..3918a663c 100644 --- a/project/01_1_train_NAGuideR_methods.R +++ b/project/01_1_train_NAGuideR_methods.R @@ -20,6 +20,8 @@ # - BiocManager could be moved to methods who are installed from BioConductor # + tags=["hide-input"] vscode={"languageId": "r"} +# options("install.lock"=FALSE) + packages_base_R <- c("BiocManager", "reshape2", "data.table", "readr", "tibble") @@ -130,6 +132,7 @@ nafunctions <- function(x, method = "zero") { else if (method == "qrilc") { install_bioconductor("impute") install_bioconductor("pcaMethods") + install_rpackage('gmm') install_rpackage('imputeLCMD') xxm <- t(df1) data_zero1 <- @@ -139,6 +142,7 @@ nafunctions <- function(x, method = "zero") { else if (method == "mindet") { install_bioconductor("impute") install_bioconductor("pcaMethods") + install_rpackage('gmm') install_rpackage('imputeLCMD') xxm <- as.matrix(df1) df <- imputeLCMD::impute.MinDet(xxm, q = 0.01) @@ -146,6 +150,7 @@ nafunctions <- function(x, method = "zero") { else if (method == "minprob") { install_bioconductor("impute") install_bioconductor("pcaMethods") + install_rpackage('gmm') install_rpackage('imputeLCMD') xxm <- as.matrix(df1) df <- @@ -278,6 +283,7 @@ nafunctions <- function(x, method = "zero") { install_bioconductor("impute") install_bioconductor("pcaMethods") + install_rpackage('gmm') install_rpackage('imputeLCMD') 
install_rpackage("magrittr") install_rpackage("glmnet") diff --git a/project/01_1_train_NAGuideR_methods.ipynb b/project/01_1_train_NAGuideR_methods.ipynb index 23fae4bd3..f6a7a6a1b 100644 --- a/project/01_1_train_NAGuideR_methods.ipynb +++ b/project/01_1_train_NAGuideR_methods.ipynb @@ -26,6 +26,8 @@ }, "outputs": [], "source": [ + "# options(\"install.lock\"=FALSE)\n", + "\n", "packages_base_R <-\n", " c(\"BiocManager\", \"reshape2\", \"data.table\", \"readr\", \"tibble\")\n", "\n", @@ -160,6 +162,7 @@ " else if (method == \"qrilc\") {\n", " install_bioconductor(\"impute\")\n", " install_bioconductor(\"pcaMethods\")\n", + " install_rpackage('gmm')\n", " install_rpackage('imputeLCMD')\n", " xxm <- t(df1)\n", " data_zero1 <-\n", @@ -169,6 +172,7 @@ " else if (method == \"mindet\") {\n", " install_bioconductor(\"impute\")\n", " install_bioconductor(\"pcaMethods\")\n", + " install_rpackage('gmm')\n", " install_rpackage('imputeLCMD')\n", " xxm <- as.matrix(df1)\n", " df <- imputeLCMD::impute.MinDet(xxm, q = 0.01)\n", @@ -176,6 +180,7 @@ " else if (method == \"minprob\") {\n", " install_bioconductor(\"impute\")\n", " install_bioconductor(\"pcaMethods\")\n", + " install_rpackage('gmm')\n", " install_rpackage('imputeLCMD')\n", " xxm <- as.matrix(df1)\n", " df <-\n", @@ -308,6 +313,7 @@ " \n", " install_bioconductor(\"impute\")\n", " install_bioconductor(\"pcaMethods\")\n", + " install_rpackage('gmm')\n", " install_rpackage('imputeLCMD')\n", " install_rpackage(\"magrittr\")\n", " install_rpackage(\"glmnet\")\n", diff --git a/project/04_1_train_DAE_VAE_wo_val_data.ipynb b/project/04_1_train_DAE_VAE_wo_val_data.ipynb deleted file mode 100644 index baaae316f..000000000 --- a/project/04_1_train_DAE_VAE_wo_val_data.ipynb +++ /dev/null @@ -1,348 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "2b842a1a", - "metadata": {}, - "source": [ - "# Scikit-learn styple transformers of the data\n", - "\n", - "1. Load data into pandas dataframe\n", - "2. Fit transformer on training data\n", - "3. Impute only missing values with predictions from model\n", - "\n", - "Autoencoders need wide training data, i.e. a sample with all its features' intensities, whereas\n", - "Collaborative Filtering needs long training data, i.e. sample identifier a feature identifier and the intensity.\n", - "Both data formats can be transformed into each other, but models using long data format do not need to\n", - "take care of missing values." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b6c7aec0", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "import vaep.plotting.data\n", - "from vaep.sklearn.ae_transformer import AETransformer\n", - "import vaep.sampling\n", - "\n", - "\n", - "IN_COLAB = 'COLAB_GPU' in os.environ\n", - "\n", - "fn_intensities = 'data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv'\n", - "if IN_COLAB:\n", - " fn_intensities = 'https://raw.githubusercontent.com/RasmussenLab/pimms/main/project/data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4f1ccbdd", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "\n", - "vaep.plotting.make_large_descriptors(8)" - ] - }, - { - "cell_type": "markdown", - "id": "9ff56309", - "metadata": {}, - "source": [ - "## Data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b921b86c", - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.read_csv(fn_intensities, index_col=0)\n", - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "43798bb3", - "metadata": {}, - "source": [ - "We will need the data in long format for Collaborative Filtering.\n", - "Naming both the row and column index assures\n", - "that the data can be transformed very easily into long format:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6d29c02d", - "metadata": {}, - "outputs": [], - "source": [ - "df.index.name = 'Sample ID' # already set\n", - "df.columns.name = 'protein group' # not set due to csv disk file format\n", - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "cb166253", - "metadata": {}, - "source": [] - }, - { - "cell_type": "markdown", - "id": "dd2148b8", - "metadata": {}, - "source": [ - "Transform the data using the logarithm, here using base 2:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b599efb8", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "df = np.log2(df + 1)\n", - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "8b264559", - "metadata": {}, - "source": [ - "two plots on data availability:\n", - "\n", - "1. proportion of missing values per feature median (N = protein groups)\n", - "2. CDF of available intensities per protein group" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "da3c8dba", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "ax = vaep.plotting.data.plot_feat_median_over_prop_missing(\n", - " data=df, type='boxplot')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a912a04a", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "df.notna().sum().sort_values().plot()" - ] - }, - { - "cell_type": "markdown", - "id": "29d451c0", - "metadata": {}, - "source": [ - "define a minimum feature and sample frequency for a feature to be included" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ed278540", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "SELECT_FEAT = True\n", - "\n", - "\n", - "def select_features(df, feat_prevalence=.2, axis=0):\n", - " # # ! 
vaep.filter.select_features\n", - " N = df.shape[axis]\n", - " minimum_freq = N * feat_prevalence\n", - " freq = df.notna().sum(axis=axis)\n", - " mask = freq >= minimum_freq\n", - " print(f\"Drop {(~mask).sum()} along axis {axis}.\")\n", - " freq = freq.loc[mask]\n", - " if axis == 0:\n", - " df = df.loc[:, mask]\n", - " else:\n", - " df = df.loc[mask]\n", - " return df\n", - "\n", - "\n", - "if SELECT_FEAT:\n", - " # potentially this can take a few iterations to stabilize.\n", - " df = select_features(df, feat_prevalence=.2)\n", - " df = select_features(df=df, feat_prevalence=.3, axis=1)\n", - "df.shape" - ] - }, - { - "cell_type": "markdown", - "id": "9f4bcf48", - "metadata": {}, - "source": [ - "## AutoEncoder architectures" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "487a4f7c", - "metadata": {}, - "outputs": [], - "source": [ - "# Reload data (for demonstration)\n", - "\n", - "df = pd.read_csv(fn_intensities, index_col=0)\n", - "df.index.name = 'Sample ID' # already set\n", - "df.columns.name = 'protein group' # not set due to csv disk file format\n", - "df = np.log2(df + 1) # log transform\n", - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "ceee9ced", - "metadata": {}, - "source": [ - "Test `DAE` or `VAE` model without validation data:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5c6b5ab9", - "metadata": {}, - "outputs": [], - "source": [ - "model_selected = 'VAE' # 'DAE'\n", - "model = AETransformer(\n", - " model=model_selected,\n", - " hidden_layers=[512,],\n", - " latent_dim=50,\n", - " out_folder='runs/scikit_interface',\n", - " batch_size=10,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f4ab5d10", - "metadata": {}, - "outputs": [], - "source": [ - "model.fit(df,\n", - " epochs_max=2,\n", - " cuda=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9ae733fc", - "metadata": {}, - "outputs": [], - "source": [ - "df_imputed = model.transform(df)\n", - "df_imputed" - ] - }, - { - "cell_type": "markdown", - "id": "3526ad09", - "metadata": {}, - "source": [ - "DAE" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e87d1eae", - "metadata": {}, - "outputs": [], - "source": [ - "model_selected = 'DAE'\n", - "model = AETransformer(\n", - " model=model_selected,\n", - " hidden_layers=[512,],\n", - " latent_dim=50,\n", - " out_folder='runs/scikit_interface',\n", - " batch_size=10,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c6c60295", - "metadata": {}, - "outputs": [], - "source": [ - "model.fit(df,\n", - " epochs_max=2,\n", - " cuda=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "27e80959", - "metadata": {}, - "outputs": [], - "source": [ - "df_imputed = model.transform(df)\n", - "df_imputed" - ] - } - ], - "metadata": { - "jupytext": { - "cell_metadata_filter": "-all", - "main_language": "python", - "notebook_metadata_filter": "-all" - }, - "kernelspec": { - "display_name": "Python", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.17" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/project/04_1_train_DAE_VAE_wo_val_data.py b/project/04_1_train_DAE_VAE_wo_val_data.py deleted file mode 100644 index 
2fbc3564d..000000000 --- a/project/04_1_train_DAE_VAE_wo_val_data.py +++ /dev/null @@ -1,160 +0,0 @@ -# %% [markdown] -# # Scikit-learn styple transformers of the data -# -# 1. Load data into pandas dataframe -# 2. Fit transformer on training data -# 3. Impute only missing values with predictions from model -# -# Autoencoders need wide training data, i.e. a sample with all its features' intensities, whereas -# Collaborative Filtering needs long training data, i.e. sample identifier a feature identifier and the intensity. -# Both data formats can be transformed into each other, but models using long data format do not need to -# take care of missing values. - -# %% -import os -import pandas as pd -import numpy as np - -import vaep.plotting.data -from vaep.sklearn.ae_transformer import AETransformer -import vaep.sampling - - -IN_COLAB = 'COLAB_GPU' in os.environ - -fn_intensities = 'data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv' -if IN_COLAB: - fn_intensities = 'https://raw.githubusercontent.com/RasmussenLab/pimms/main/project/data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv' - -# %% - - -vaep.plotting.make_large_descriptors(8) - -# %% [markdown] -# ## Data - -# %% -df = pd.read_csv(fn_intensities, index_col=0) -df.head() - -# %% [markdown] -# We will need the data in long format for Collaborative Filtering. -# Naming both the row and column index assures -# that the data can be transformed very easily into long format: - -# %% -df.index.name = 'Sample ID' # already set -df.columns.name = 'protein group' # not set due to csv disk file format -df.head() - -# %% [markdown] -# - -# %% [markdown] -# Transform the data using the logarithm, here using base 2: - -# %% -df = np.log2(df + 1) -df.head() - - -# %% [markdown] -# two plots on data availability: -# -# 1. proportion of missing values per feature median (N = protein groups) -# 2. CDF of available intensities per protein group - -# %% -ax = vaep.plotting.data.plot_feat_median_over_prop_missing( - data=df, type='boxplot') - - -# %% -df.notna().sum().sort_values().plot() - - -# %% [markdown] -# define a minimum feature and sample frequency for a feature to be included - -# %% -SELECT_FEAT = True - - -def select_features(df, feat_prevalence=.2, axis=0): - # # ! vaep.filter.select_features - N = df.shape[axis] - minimum_freq = N * feat_prevalence - freq = df.notna().sum(axis=axis) - mask = freq >= minimum_freq - print(f"Drop {(~mask).sum()} along axis {axis}.") - freq = freq.loc[mask] - if axis == 0: - df = df.loc[:, mask] - else: - df = df.loc[mask] - return df - - -if SELECT_FEAT: - # potentially this can take a few iterations to stabilize. 
- df = select_features(df, feat_prevalence=.2) - df = select_features(df=df, feat_prevalence=.3, axis=1) -df.shape - - -# %% [markdown] -# ## AutoEncoder architectures - -# %% -# Reload data (for demonstration) - -df = pd.read_csv(fn_intensities, index_col=0) -df.index.name = 'Sample ID' # already set -df.columns.name = 'protein group' # not set due to csv disk file format -df = np.log2(df + 1) # log transform -df.head() - -# %% [markdown] -# Test `DAE` or `VAE` model without validation data: - -# %% -model_selected = 'VAE' # 'DAE' -model = AETransformer( - model=model_selected, - hidden_layers=[512,], - latent_dim=50, - out_folder='runs/scikit_interface', - batch_size=10, -) - -# %% -model.fit(df, - epochs_max=2, - cuda=False) - -# %% -df_imputed = model.transform(df) -df_imputed - -# %% [markdown] -# DAE - -# %% -model_selected = 'DAE' -model = AETransformer( - model=model_selected, - hidden_layers=[512,], - latent_dim=50, - out_folder='runs/scikit_interface', - batch_size=10, -) - -# %% -model.fit(df, - epochs_max=2, - cuda=False) - -# %% -df_imputed = model.transform(df) -df_imputed diff --git a/project/04_1_train_pimms_models.ipynb b/project/04_1_train_pimms_models.ipynb index 5169e2d07..657874a23 100644 --- a/project/04_1_train_pimms_models.ipynb +++ b/project/04_1_train_pimms_models.ipynb @@ -26,6 +26,7 @@ "source": [ "import os\n", "from importlib import metadata\n", + "\n", "IN_COLAB = 'COLAB_GPU' in os.environ\n", "if IN_COLAB:\n", " try:\n", @@ -42,45 +43,99 @@ "id": "36b5a27d", "metadata": {}, "source": [ - "If on colab, please restart the environment and run everything from here on." + "If on colab, please restart the environment and run everything from here on.\n", + "\n", + "Specify example data:" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "65b5bdaf", "metadata": {}, "outputs": [], "source": [ "import os\n", + "\n", "IN_COLAB = 'COLAB_GPU' in os.environ\n", "\n", "fn_intensities = 'data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv'\n", "if IN_COLAB:\n", - " fn_intensities = 'https://raw.githubusercontent.com/RasmussenLab/pimms/main/project/data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv'" + " fn_intensities = ('https://raw.githubusercontent.com/RasmussenLab/pimms/main/'\n", + " 'project/data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv')" + ] + }, + { + "cell_type": "markdown", + "id": "2d628ce9", + "metadata": {}, + "source": [ + "Load package." 
] }, { "cell_type": "code", "execution_count": null, "id": "1c289d17", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ - "import numpy as np\n", - "import pandas as pd\n", + "import logging\n", + "import random\n", "\n", "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "from IPython.display import display\n", "\n", - "\n", - "from vaep.plotting.defaults import color_model_mapping\n", + "import vaep.filter\n", "import vaep.plotting.data\n", "import vaep.sampling\n", - "\n", - "from vaep.sklearn.cf_transformer import CollaborativeFilteringTransformer\n", + "from vaep.plotting.defaults import color_model_mapping\n", "from vaep.sklearn.ae_transformer import AETransformer\n", + "from vaep.sklearn.cf_transformer import CollaborativeFilteringTransformer\n", "\n", - "vaep.plotting.make_large_descriptors(8)" + "vaep.plotting.make_large_descriptors(8)\n", + "\n", + "\n", + "logger = logger = vaep.logging.setup_nb_logger()\n", + "logging.getLogger('fontTools').setLevel(logging.ERROR)" + ] + }, + { + "cell_type": "markdown", + "id": "59282a30", + "metadata": {}, + "source": [ + "## Parameters\n", + "Can be set by papermill on the command line or manually in the (colab) notebook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab9b68d6", + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "fn_intensities: str = fn_intensities # path or url to the data file in csv format\n", + "index_name: str = 'Sample ID' # name of the index column\n", + "column_name: str = 'protein group' # name of the column index\n", + "select_features: bool = True # Whether to select features based on prevalence\n", + "feat_prevalence: float = 0.2 # minimum prevalence of a feature to be included\n", + "sample_completeness: float = 0.3 # minimum completeness of a sample to be included\n", + "sample_splits: bool = True # Whether to sample validation and test data\n", + "frac_non_train: float = 0.1 # fraction of non training data (validation and test split)\n", + "frac_mnar: float = 0.0 # fraction of missing not at random data, rest: missing completely at random\n", + "random_state: int = 42 # random state for reproducibility" ] }, { @@ -95,7 +150,11 @@ "cell_type": "code", "execution_count": null, "id": "4a3edbdd", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "df = pd.read_csv(fn_intensities, index_col=0)\n", @@ -116,11 +175,15 @@ "cell_type": "code", "execution_count": null, "id": "4fde25e9", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ - "df.index.name = 'Sample ID' # already set\n", - "df.columns.name = 'protein group' # not set due to csv disk file format\n", + "df.index.name = index_name # already set\n", + "df.columns.name = column_name # not set due to csv disk file format\n", "df.head()" ] }, @@ -128,13 +191,8 @@ "cell_type": "markdown", "id": "c6c788f0", "metadata": {}, - "source": [] - }, - { - "cell_type": "markdown", - "id": "2ab8dc7f", - "metadata": {}, "source": [ + "### Data transformation: log2 transformation\n", "Transform the data using the logarithm, here using base 2:" ] }, @@ -143,7 +201,10 @@ "execution_count": null, "id": "554d4fa7", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -156,7 +217,7 @@ "id": "fbce73d1", "metadata": {}, "source": [ - "two plots on data 
availability:\n", + "### two plots inspecting data availability\n", "\n", "1. proportion of missing values per feature median (N = protein groups)\n", "2. CDF of available intensities per protein group" @@ -167,7 +228,10 @@ "execution_count": null, "id": "536793bb", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -180,7 +244,10 @@ "execution_count": null, "id": "313bc55e", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -192,6 +259,7 @@ "id": "cf54a3af", "metadata": {}, "source": [ + "### Data selection\n", "define a minimum feature and sample frequency for a feature to be included" ] }, @@ -200,32 +268,17 @@ "execution_count": null, "id": "28ad27de", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ - "SELECT_FEAT = True\n", - "\n", - "\n", - "def select_features(df, feat_prevalence=.2, axis=0):\n", - " # # ! vaep.filter.select_features\n", - " N = df.shape[axis]\n", - " minimum_freq = N * feat_prevalence\n", - " freq = df.notna().sum(axis=axis)\n", - " mask = freq >= minimum_freq\n", - " print(f\"Drop {(~mask).sum()} along axis {axis}.\")\n", - " freq = freq.loc[mask]\n", - " if axis == 0:\n", - " df = df.loc[:, mask]\n", - " else:\n", - " df = df.loc[mask]\n", - " return df\n", - "\n", - "\n", - "if SELECT_FEAT:\n", + "if select_features:\n", " # potentially this can take a few iterations to stabilize.\n", - " df = select_features(df, feat_prevalence=.2)\n", - " df = select_features(df=df, feat_prevalence=.3, axis=1)\n", + " df = vaep.filter.select_features(df, feat_prevalence=feat_prevalence)\n", + " df = vaep.filter.select_features(df=df, feat_prevalence=sample_completeness, axis=1)\n", "df.shape" ] }, @@ -242,12 +295,48 @@ "execution_count": null, "id": "646ea5bb", "metadata": { - "lines_to_next_cell": 2 + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ "df = df.stack().to_frame('intensity')\n", - "df.head()" + "df" + ] + }, + { + "cell_type": "markdown", + "id": "54d4b781", + "metadata": {}, + "source": [ + "## Optionally: Sample data\n", + "- models can be trained without subsetting the data\n", + "- allows evaluation of the models" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2586a1c", + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "if sample_splits:\n", + " splits, thresholds, fake_na_mcar, fake_na_mnar = vaep.sampling.sample_mnar_mcar(\n", + " df_long=df,\n", + " frac_non_train=frac_non_train,\n", + " frac_mnar=frac_mnar,\n", + " random_state=random_state,\n", + " )\n", + " splits = vaep.sampling.check_split_integrity(splits)\n", + "else:\n", + " splits = vaep.sampling.DataSplits(is_wide_format=False)\n", + " splits.train_X = df" ] }, { @@ -263,7 +352,9 @@ "id": "d2390859", "metadata": {}, "source": [ - "## Collaborative Filtering" + "## Collaborative Filtering\n", + "\n", + "Inspect annotations of the scikit-learn like Transformer:" ] }, { @@ -273,7 +364,7 @@ "metadata": {}, "outputs": [], "source": [ - "# # # # CollaborativeFilteringTransformer?" + "CollaborativeFilteringTransformer?" 
] }, { @@ -317,12 +408,21 @@ "metadata": {}, "outputs": [], "source": [ - "cf_model.fit(df,\n", + "cf_model.fit(splits.train_X,\n", + " splits.val_y,\n", " cuda=False,\n", " epochs_max=20,\n", " )" ] }, + { + "cell_type": "markdown", + "id": "7db2f4c4", + "metadata": {}, + "source": [ + "Impute missing values usin `transform` method:" + ] + }, { "cell_type": "code", "execution_count": null, @@ -347,7 +447,11 @@ "cell_type": "code", "execution_count": null, "id": "99ff7ecd", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "df_imputed = df_imputed.stack() # long-format\n", @@ -356,16 +460,8 @@ "df_imputed = df_imputed.unstack() # back to wide-format\n", "# some checks\n", "assert len(df) == len(observed)\n", - "assert df_imputed.shape[0] * df_imputed.shape[1] == len(imputed) + len(observed)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "addb7cbf", - "metadata": {}, - "outputs": [], - "source": [ + "assert df_imputed.shape[0] * df_imputed.shape[1] == len(imputed) + len(observed)\n", + "\n", "fig, axes = plt.subplots(2, figsize=(8, 4))\n", "\n", "min_max = vaep.plotting.data.get_min_max_iterable(\n", @@ -405,16 +501,16 @@ "cell_type": "code", "execution_count": null, "id": "b7184c2e", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ - "# Reload data (for demonstration)\n", - "\n", - "df = pd.read_csv(fn_intensities, index_col=0)\n", - "df.index.name = 'Sample ID' # already set\n", - "df.columns.name = 'protein group' # not set due to csv disk file format\n", - "df = np.log2(df + 1) # log transform\n", - "df.head()" + "# Use wide format of data\n", + "splits.to_wide_format()\n", + "splits.train_X" ] }, { @@ -422,21 +518,21 @@ "id": "ae52c6fe", "metadata": {}, "source": [ - "The AutoEncoder model currently need validation data for training.\n", - "We will use 10% of the training data for validation.\n", - "> Expect this limitation to be dropped in the next release. It will still be recommended\n", - "> to use validation data for early stopping." + "Validation data for early stopping (if specified)" ] }, { "cell_type": "code", "execution_count": null, "id": "8bbd0017", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ - "freq_feat = df.notna().sum()\n", - "freq_feat.head() # training data" + "splits.val_y" ] }, { @@ -444,25 +540,25 @@ "id": "6da6c4e2", "metadata": {}, "source": [ - "We will use the `sampling` module to sample the validation data from the training data.\n", - "Could be split differently by providing another `weights` vector." 
+ "Training and validation need the same shape:" ] }, { "cell_type": "code", "execution_count": null, "id": "99e690f9", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ - "val_X, train_X = vaep.sampling.sample_data(df.stack(),\n", - " sample_index_to_drop=0,\n", - " weights=freq_feat,\n", - " frac=0.1,\n", - " random_state=42,)\n", - "val_X, train_X = val_X.unstack(), train_X.unstack()\n", - "val_X = pd.DataFrame(pd.NA, index=train_X.index,\n", - " columns=train_X.columns).fillna(val_X)" + "if splits.val_y is not None:\n", + " splits.val_y = pd.DataFrame(pd.NA, index=splits.train_X.index,\n", + " columns=splits.train_X.columns).fillna(splits.val_y)\n", + "\n", + " print(splits.train_X.shape, splits.val_y.shape)" ] }, { @@ -470,7 +566,7 @@ "id": "45b9c22c", "metadata": {}, "source": [ - "Training data and validation data have the same shape:" + "Select either `DAE` or `VAE` model by chance:" ] }, { @@ -480,43 +576,8 @@ "metadata": {}, "outputs": [], "source": [ - "val_X.shape, train_X.shape" - ] - }, - { - "cell_type": "markdown", - "id": "f89fb41f", - "metadata": {}, - "source": [ - "... but different number of intensities (non-missing values):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "62cd0721", - "metadata": {}, - "outputs": [], - "source": [ - "train_X.notna().sum().sum(), val_X.notna().sum().sum()," - ] - }, - { - "cell_type": "markdown", - "id": "b5a0b973", - "metadata": {}, - "source": [ - "Select either `DAE` or `VAE` model:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "26a12a3e", - "metadata": {}, - "outputs": [], - "source": [ - "model_selected = 'VAE' # 'DAE'\n", + "model_selected = random.choice(['DAE', 'VAE'])\n", + "print(\"Selected model by chance:\", model_selected)\n", "model = AETransformer(\n", " model=model_selected,\n", " hidden_layers=[512,],\n", @@ -529,78 +590,68 @@ { "cell_type": "code", "execution_count": null, - "id": "4d3c7922", + "id": "aea5dc44", "metadata": {}, "outputs": [], "source": [ - "model.fit(train_X, val_X,\n", + "model.fit(splits.train_X, splits.val_y,\n", " epochs_max=50,\n", " cuda=False)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "24ca6c2c", - "metadata": {}, - "outputs": [], - "source": [ - "df_imputed = model.transform(train_X)\n", - "df_imputed" - ] - }, { "cell_type": "markdown", - "id": "17398941", + "id": "f89fb41f", "metadata": {}, "source": [ - "Evaluate the model using the validation data:" + "Impute missing values using `transform` method:" ] }, { "cell_type": "code", "execution_count": null, - "id": "2a664072", + "id": "62cd0721", "metadata": {}, "outputs": [], "source": [ - "pred_val = val_X.stack().to_frame('observed')\n", - "pred_val[model_selected] = df_imputed.stack()\n", - "pred_val" + "df_imputed = model.transform(splits.train_X).stack()\n", + "df_imputed" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "6eaa510a", + "cell_type": "markdown", + "id": "17398941", "metadata": {}, - "outputs": [], "source": [ - "val_metrics = vaep.models.calculte_metrics(pred_val, 'observed')\n", - "# val_metrics = metrics.add_metrics(\n", - "# pred_val, key='test data')\n", - "# val_metrics = pd.DataFrame(val_metrics)\n", - "# val_metrics\n", - "pd.DataFrame(val_metrics)" + "Evaluate the model using the validation data:" ] }, { "cell_type": "code", "execution_count": null, - "id": "f3013daf", - "metadata": {}, + "id": "2a664072", + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], 
"source": [ - "fig, ax = plt.subplots(figsize=(8, 2))\n", + "if splits.val_y is not None:\n", + " pred_val = splits.val_y.stack().to_frame('observed')\n", + " pred_val[model_selected] = df_imputed\n", + " val_metrics = vaep.models.calculte_metrics(pred_val, 'observed')\n", + " display(val_metrics)\n", "\n", - "ax, errors_binned = vaep.plotting.errors.plot_errors_by_median(\n", - " pred=pred_val,\n", - " target_col='observed',\n", - " feat_medians=train_X.median(),\n", - " ax=ax,\n", - " metric_name='MAE',\n", - " palette=color_model_mapping\n", - ")" + " fig, ax = plt.subplots(figsize=(8, 2))\n", + "\n", + " ax, errors_binned = vaep.plotting.errors.plot_errors_by_median(\n", + " pred=pred_val,\n", + " target_col='observed',\n", + " feat_medians=splits.train_X.median(),\n", + " ax=ax,\n", + " metric_name='MAE',\n", + " palette=color_model_mapping)" ] }, { @@ -615,38 +666,44 @@ "cell_type": "code", "execution_count": null, "id": "f89799e8", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ - "df_imputed = df_imputed.replace(val_X)" + "splits.to_long_format()\n", + "df_imputed = df_imputed.replace(splits.val_y).replace(splits.test_y)\n", + "df_imputed" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "235fdb66", + "cell_type": "markdown", + "id": "506de661", "metadata": {}, - "outputs": [], "source": [ - "df = df.stack() # long-format\n", - "df_imputed = df_imputed.stack() # long-format\n", - "observed = df_imputed.loc[df.index]\n", - "imputed = df_imputed.loc[df_imputed.index.difference(df.index)]" + "Plot the distribution of the imputed values vs the observed data:" ] }, { "cell_type": "code", "execution_count": null, - "id": "851ab631", + "id": "235fdb66", "metadata": { - "lines_to_next_cell": 2 + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ + "observed = df_imputed.loc[df.index].squeeze()\n", + "imputed = df_imputed.loc[df_imputed.index.difference(df.index)].squeeze()\n", + "\n", "fig, axes = plt.subplots(2, figsize=(8, 4))\n", "\n", - "min_max = vaep.plotting.data.get_min_max_iterable(\n", - " [observed, imputed])\n", + "min_max = vaep.plotting.data.get_min_max_iterable([observed, imputed])\n", + "\n", "label_template = '{method} (N={n:,d})'\n", "ax, _ = vaep.plotting.data.plot_histogram_intensities(\n", " observed,\n", @@ -669,14 +726,6 @@ " alpha=1)\n", "_ = ax.legend()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a235f133", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/project/04_1_train_pimms_models.py b/project/04_1_train_pimms_models.py index 0a11b509c..79db2634b 100644 --- a/project/04_1_train_pimms_models.py +++ b/project/04_1_train_pimms_models.py @@ -13,6 +13,7 @@ # %% import os from importlib import metadata + IN_COLAB = 'COLAB_GPU' in os.environ if IN_COLAB: try: @@ -25,31 +26,60 @@ # %% [markdown] # If on colab, please restart the environment and run everything from here on. +# +# Specify example data: # %% import os + IN_COLAB = 'COLAB_GPU' in os.environ fn_intensities = 'data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv' if IN_COLAB: - fn_intensities = 'https://raw.githubusercontent.com/RasmussenLab/pimms/main/project/data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv' + fn_intensities = ('https://raw.githubusercontent.com/RasmussenLab/pimms/main/' + 'project/data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv') + +# %% [markdown] +# Load package. 
# %% -import numpy as np -import pandas as pd +import logging +import random import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from IPython.display import display - -from vaep.plotting.defaults import color_model_mapping +import vaep.filter import vaep.plotting.data import vaep.sampling - -from vaep.sklearn.cf_transformer import CollaborativeFilteringTransformer +from vaep.plotting.defaults import color_model_mapping from vaep.sklearn.ae_transformer import AETransformer +from vaep.sklearn.cf_transformer import CollaborativeFilteringTransformer vaep.plotting.make_large_descriptors(8) + +logger = logger = vaep.logging.setup_nb_logger() +logging.getLogger('fontTools').setLevel(logging.ERROR) + +# %% [markdown] +# ## Parameters +# Can be set by papermill on the command line or manually in the (colab) notebook. + +# %% +fn_intensities: str = fn_intensities # path or url to the data file in csv format +index_name: str = 'Sample ID' # name of the index column +column_name: str = 'protein group' # name of the column index +select_features: bool = True # Whether to select features based on prevalence +feat_prevalence: float = 0.2 # minimum prevalence of a feature to be included +sample_completeness: float = 0.3 # minimum completeness of a sample to be included +sample_splits: bool = True # Whether to sample validation and test data +frac_non_train: float = 0.1 # fraction of non training data (validation and test split) +frac_mnar: float = 0.0 # fraction of missing not at random data, rest: missing completely at random +random_state: int = 42 # random state for reproducibility + # %% [markdown] # ## Data @@ -63,14 +93,12 @@ # that the data can be transformed very easily into long format: # %% -df.index.name = 'Sample ID' # already set -df.columns.name = 'protein group' # not set due to csv disk file format +df.index.name = index_name # already set +df.columns.name = column_name # not set due to csv disk file format df.head() # %% [markdown] -# - -# %% [markdown] +# ### Data transformation: log2 transformation # Transform the data using the logarithm, here using base 2: # %% @@ -79,7 +107,7 @@ # %% [markdown] -# two plots on data availability: +# ### two plots inspecting data availability # # 1. proportion of missing values per feature median (N = protein groups) # 2. CDF of available intensities per protein group @@ -94,31 +122,14 @@ # %% [markdown] +# ### Data selection # define a minimum feature and sample frequency for a feature to be included # %% -SELECT_FEAT = True - - -def select_features(df, feat_prevalence=.2, axis=0): - # # ! vaep.filter.select_features - N = df.shape[axis] - minimum_freq = N * feat_prevalence - freq = df.notna().sum(axis=axis) - mask = freq >= minimum_freq - print(f"Drop {(~mask).sum()} along axis {axis}.") - freq = freq.loc[mask] - if axis == 0: - df = df.loc[:, mask] - else: - df = df.loc[mask] - return df - - -if SELECT_FEAT: +if select_features: # potentially this can take a few iterations to stabilize. 
- df = select_features(df, feat_prevalence=.2) - df = select_features(df=df, feat_prevalence=.3, axis=1) + df = vaep.filter.select_features(df, feat_prevalence=feat_prevalence) + df = vaep.filter.select_features(df=df, feat_prevalence=sample_completeness, axis=1) df.shape @@ -127,17 +138,36 @@ def select_features(df, feat_prevalence=.2, axis=0): # %% df = df.stack().to_frame('intensity') -df.head() +df +# %% [markdown] +# ## Optionally: Sample data +# - models can be trained without subsetting the data +# - allows evaluation of the models + +# %% +if sample_splits: + splits, thresholds, fake_na_mcar, fake_na_mnar = vaep.sampling.sample_mnar_mcar( + df_long=df, + frac_non_train=frac_non_train, + frac_mnar=frac_mnar, + random_state=random_state, + ) + splits = vaep.sampling.check_split_integrity(splits) +else: + splits = vaep.sampling.DataSplits(is_wide_format=False) + splits.train_X = df # %% [markdown] # The resulting DataFrame with one column has an `MulitIndex` with the sample and feature identifier. # %% [markdown] # ## Collaborative Filtering +# +# Inspect annotations of the scikit-learn like Transformer: # %% -# # # # CollaborativeFilteringTransformer? +# # CollaborativeFilteringTransformer? # %% [markdown] # Let's set up collaborative filtering without a validation or test set, using @@ -157,11 +187,15 @@ def select_features(df, feat_prevalence=.2, axis=0): # > This will probably mean setting up a validation set within the model. # %% -cf_model.fit(df, +cf_model.fit(splits.train_X, + splits.val_y, cuda=False, epochs_max=20, ) +# %% [markdown] +# Impute missing values usin `transform` method: + # %% df_imputed = cf_model.transform(df).unstack() assert df_imputed.isna().sum().sum() == 0 @@ -179,7 +213,6 @@ def select_features(df, feat_prevalence=.2, axis=0): assert len(df) == len(observed) assert df_imputed.shape[0] * df_imputed.shape[1] == len(imputed) + len(observed) -# %% fig, axes = plt.subplots(2, figsize=(8, 4)) min_max = vaep.plotting.data.get_min_max_iterable( @@ -210,55 +243,32 @@ def select_features(df, feat_prevalence=.2, axis=0): # ## AutoEncoder architectures # %% -# Reload data (for demonstration) - -df = pd.read_csv(fn_intensities, index_col=0) -df.index.name = 'Sample ID' # already set -df.columns.name = 'protein group' # not set due to csv disk file format -df = np.log2(df + 1) # log transform -df.head() +# Use wide format of data +splits.to_wide_format() +splits.train_X # %% [markdown] -# The AutoEncoder model currently need validation data for training. -# We will use 10% of the training data for validation. -# > Expect this limitation to be dropped in the next release. It will still be recommended -# > to use validation data for early stopping. +# Validation data for early stopping (if specified) # %% -freq_feat = df.notna().sum() -freq_feat.head() # training data +splits.val_y # %% [markdown] -# We will use the `sampling` module to sample the validation data from the training data. -# Could be split differently by providing another `weights` vector. 
+# Training and validation need the same shape: # %% -val_X, train_X = vaep.sampling.sample_data(df.stack(), - sample_index_to_drop=0, - weights=freq_feat, - frac=0.1, - random_state=42,) -val_X, train_X = val_X.unstack(), train_X.unstack() -val_X = pd.DataFrame(pd.NA, index=train_X.index, - columns=train_X.columns).fillna(val_X) - -# %% [markdown] -# Training data and validation data have the same shape: +if splits.val_y is not None: + splits.val_y = pd.DataFrame(pd.NA, index=splits.train_X.index, + columns=splits.train_X.columns).fillna(splits.val_y) -# %% -val_X.shape, train_X.shape + print(splits.train_X.shape, splits.val_y.shape) # %% [markdown] -# ... but different number of intensities (non-missing values): +# Select either `DAE` or `VAE` model by chance: # %% -train_X.notna().sum().sum(), val_X.notna().sum().sum(), - -# %% [markdown] -# Select either `DAE` or `VAE` model: - -# %% -model_selected = 'VAE' # 'DAE' +model_selected = random.choice(['DAE', 'VAE']) +print("Selected model by chance:", model_selected) model = AETransformer( model=model_selected, hidden_layers=[512,], @@ -268,59 +278,56 @@ def select_features(df, feat_prevalence=.2, axis=0): ) # %% -model.fit(train_X, val_X, +model.fit(splits.train_X, splits.val_y, epochs_max=50, cuda=False) +# %% [markdown] +# Impute missing values using `transform` method: + # %% -df_imputed = model.transform(train_X) +df_imputed = model.transform(splits.train_X).stack() df_imputed # %% [markdown] # Evaluate the model using the validation data: # %% -pred_val = val_X.stack().to_frame('observed') -pred_val[model_selected] = df_imputed.stack() -pred_val - -# %% -val_metrics = vaep.models.calculte_metrics(pred_val, 'observed') -# val_metrics = metrics.add_metrics( -# pred_val, key='test data') -# val_metrics = pd.DataFrame(val_metrics) -# val_metrics -pd.DataFrame(val_metrics) - -# %% -fig, ax = plt.subplots(figsize=(8, 2)) - -ax, errors_binned = vaep.plotting.errors.plot_errors_by_median( - pred=pred_val, - target_col='observed', - feat_medians=train_X.median(), - ax=ax, - metric_name='MAE', - palette=color_model_mapping -) +if splits.val_y is not None: + pred_val = splits.val_y.stack().to_frame('observed') + pred_val[model_selected] = df_imputed + val_metrics = vaep.models.calculte_metrics(pred_val, 'observed') + display(val_metrics) + + fig, ax = plt.subplots(figsize=(8, 2)) + + ax, errors_binned = vaep.plotting.errors.plot_errors_by_median( + pred=pred_val, + target_col='observed', + feat_medians=splits.train_X.median(), + ax=ax, + metric_name='MAE', + palette=color_model_mapping) # %% [markdown] # replace predicted values with validation data values # %% -df_imputed = df_imputed.replace(val_X) +splits.to_long_format() +df_imputed = df_imputed.replace(splits.val_y).replace(splits.test_y) +df_imputed -# %% -df = df.stack() # long-format -df_imputed = df_imputed.stack() # long-format -observed = df_imputed.loc[df.index] -imputed = df_imputed.loc[df_imputed.index.difference(df.index)] +# %% [markdown] +# Plot the distribution of the imputed values vs the observed data: # %% +observed = df_imputed.loc[df.index].squeeze() +imputed = df_imputed.loc[df_imputed.index.difference(df.index)].squeeze() + fig, axes = plt.subplots(2, figsize=(8, 4)) -min_max = vaep.plotting.data.get_min_max_iterable( - [observed, imputed]) +min_max = vaep.plotting.data.get_min_max_iterable([observed, imputed]) + label_template = '{method} (N={n:,d})' ax, _ = vaep.plotting.data.plot_histogram_intensities( observed, @@ -342,6 +349,3 @@ def select_features(df, 
feat_prevalence=.2, axis=0): color=color_model_mapping[model_selected], alpha=1) _ = ax.legend() - - -# %% diff --git a/project/README.md b/project/README.md index eddd93109..ba3cc6b25 100644 --- a/project/README.md +++ b/project/README.md @@ -84,8 +84,7 @@ papermill 01_0_split_data.ipynb runs/experiment_03/%DATASET%/experiment_03_data tag | notebook | Description --- | --- | --- Tutorials | -tut | 04_1_train_pimms_models.ipynb | main tutorial showing scikit-learn interface partly with validatio data -tut | 04_1_train_DAE_VAE_wo_val_data.ipynb | +- | 04_1_train_pimms_models.ipynb | main tutorial showing scikit-learn (Transformer) interface partly with or without validation data Single experiment | run | 01_0_split_data.ipynb | Create train, validation and test data splits run | 01_0_transform_data_to_wide_format.ipynb | Transform train split to wide format for R models diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index adebc5050..000000000 --- a/requirements.txt +++ /dev/null @@ -1,37 +0,0 @@ -python >= 3.8, < 3.9 -numpy -pandas < 2.0 -scipy >= 1.6 -# plotting -matplotlib -python-kaleido -plotly -seaborn -pip -# ML -pytorch < 2.0 -# pytorch-cuda -scikit-learn -fastai -torchvision -# cudatoolkit=11.7 -tensorboard -umap-learn -# stats -pingouin -statsmodels -# other -tqdm # progress bars -xmltodict # configs -openpyxl # xml -omegaconf -# snakemake -snakemake-minimal < 7.26 -# jupyter -ipykernel -ipython -ipywidgets -jupyterlab # standalone jupyter installation -# jupyter_contrib_nbextensions # delete configuration file if you see an error: https://github.com/jupyter/nbconvert/issues/526#issuecomment-277552771 -jupyter-dash -papermill # execute ipynb's \ No newline at end of file diff --git a/requirements_R.txt b/requirements_R.txt deleted file mode 100644 index c52c1fb66..000000000 --- a/requirements_R.txt +++ /dev/null @@ -1,15 +0,0 @@ -# R3.6.3 is not available as binaries for M1 and M2 amd64 atm (2023-08-11) -r-base # >= 3.6.0, < 4.0 # would be good if it could be relaxed -r-irkernel -# r-biocmanager -r-reshape2 -r-stringi # + rmarkdown hack for reshape2 -r-stringr # reshape2 -r-tidyverse -r-gdata -r-glmnet -r-e1071 -r-norm -r-missforest -r-vim -r-mice \ No newline at end of file diff --git a/requirements_dev.txt b/requirements_dev.txt deleted file mode 100644 index 92626e74b..000000000 --- a/requirements_dev.txt +++ /dev/null @@ -1,13 +0,0 @@ -# dev -pytest -pytest-cov -jupytext -flake8 -flake8-bugbear -build -wheel -setuptools -pre-commit -pre-commit -jupyterlab_code_formatter -jupyterlab-git \ No newline at end of file diff --git a/vaep/__init__.py b/vaep/__init__.py index 98460455a..059ccf970 100644 --- a/vaep/__init__.py +++ b/vaep/__init__.py @@ -9,10 +9,7 @@ from importlib import metadata import njab -import pandas as pd -import pandas.io.formats.format as pf -# from . import logging, nb, pandas, plotting import vaep.logging import vaep.nb import vaep.pandas diff --git a/vaep/analyzers/__init__.py b/vaep/analyzers/__init__.py index 9856dafb9..6d16805a6 100644 --- a/vaep/analyzers/__init__.py +++ b/vaep/analyzers/__init__.py @@ -2,7 +2,7 @@ """ from types import SimpleNamespace -from . 
import compare_predictions, diff_analysis +from vaep.analyzers import compare_predictions, diff_analysis __all__ = ['diff_analysis', 'compare_predictions', 'Analysis'] diff --git a/vaep/data_handling.py b/vaep/data_handling.py index 1f0bc5404..41be078ac 100644 --- a/vaep/data_handling.py +++ b/vaep/data_handling.py @@ -4,8 +4,6 @@ import numpy as np import pandas as pd -# coverage - def coverage(X: pd.DataFrame, coverage_col: float, coverage_row: float): """Select proteins by column depending on their coverage. diff --git a/vaep/filter.py b/vaep/filter.py new file mode 100644 index 000000000..2d26c4806 --- /dev/null +++ b/vaep/filter.py @@ -0,0 +1,24 @@ +import logging + +import pandas as pd + +logger = logging.getLogger(__name__) + + +def select_features(df: pd.DataFrame, + feat_prevalence: float = .2, + axis: int = 0) -> pd.DataFrame: + """Select features or samples with a minimum prevalence. + """ + N = df.shape[axis] + minimum_freq = N * feat_prevalence + freq = df.notna().sum(axis=axis) + mask = freq >= minimum_freq + axis_synonym = "index" if axis == 0 else "columns" + logger.info(f"Drop {(~mask).sum()} along axis {axis} ({axis_synonym}).") + freq = freq.loc[mask] + if axis == 0: + df = df.loc[:, mask] + else: + df = df.loc[mask] + return df diff --git a/vaep/models/__init__.py b/vaep/models/__init__.py index 3be35408b..2ae111d44 100644 --- a/vaep/models/__init__.py +++ b/vaep/models/__init__.py @@ -16,8 +16,7 @@ from fastcore.foundation import L import vaep - -from . import ae, analysis, collab, vae +from vaep.models import ae, analysis, collab, vae logger = logging.getLogger(__name__) diff --git a/vaep/models/ae.py b/vaep/models/ae.py index fd5d081a1..8295c8560 100644 --- a/vaep/models/ae.py +++ b/vaep/models/ae.py @@ -21,7 +21,7 @@ import vaep.models import vaep.transform -from . import analysis +from vaep.models import analysis logger = logging.getLogger(__name__) diff --git a/vaep/models/collab.py b/vaep/models/collab.py index ec34a4ffb..f54ab6df2 100644 --- a/vaep/models/collab.py +++ b/vaep/models/collab.py @@ -5,14 +5,13 @@ import pandas as pd # import explicit objects for functional annotations from fastai.collab import * -from fastai.collab import (Categorify, CollabDataLoaders, IndexSplitter, - TabularCollab, TransformBlock) +from fastai.collab import (Categorify, IndexSplitter, TabularCollab, + TransformBlock) from fastai.tabular.all import * import vaep.io.dataloaders import vaep.io.datasplits - -from . 
import analysis +from vaep.models import analysis logger = logging.getLogger(__name__) @@ -49,37 +48,30 @@ def __init__(self, item_column: str = 'peptide', target_column: str = 'intensity', model_kwargs: dict = None, - batch_size: int = 64): + batch_size: int = 1_024): if datasplits.val_y is not None: - self.X, self.frac = combine_data(datasplits.train_X, - datasplits.val_y) + self.X, _ = combine_data(datasplits.train_X, + datasplits.val_y) else: - self.X, self.frac = datasplits.train_X.reset_index(), 0.0 + self.X, _ = datasplits.train_X.reset_index(), 0.0 self.batch_size = batch_size - self.dls = CollabDataLoaders.from_df(self.X, valid_pct=self.frac, - seed=42, - user_name=sample_column, - item_name=item_column, - rating_name=target_column, - bs=self.batch_size) user_name = sample_column item_name = item_column rating_name = target_column cat_names = [user_name, item_name] - ratings = self.X splits = None if datasplits.val_y is not None: idx_splitter = IndexSplitter( - list(range(len(datasplits.train_X), len(datasplits.train_X) + len(datasplits.val_y)))) + list(range(len(datasplits.train_X), len(self.X)))) splits = idx_splitter(self.X) - to = TabularCollab( - ratings, - [Categorify], - cat_names, + self.to = TabularCollab( + self.X, + procs=[Categorify], + cat_names=cat_names, y_names=[rating_name], y_block=TransformBlock(), splits=splits) - self.dls = to.dataloaders(path='.', bs=self.batch_size) + self.dls = self.to.dataloaders(path='.', bs=self.batch_size) self.params = {} if model_kwargs is None: model_kwargs = {} diff --git a/vaep/pandas/__init__.py b/vaep/pandas/__init__.py index 97520bb02..5f82204b1 100644 --- a/vaep/pandas/__init__.py +++ b/vaep/pandas/__init__.py @@ -7,7 +7,30 @@ import omegaconf import pandas as pd -from .calc_errors import calc_errors_per_feat, get_absolute_error +from vaep.pandas.calc_errors import calc_errors_per_feat, get_absolute_error + +__all__ = [ + 'calc_errors_per_feat', + 'get_absolute_error', + 'unique_cols', + 'get_unique_non_unique_columns', + 'prop_unique_index', + 'replace_with', + 'index_to_dict', + 'get_columns_accessor', + 'get_columns_accessor_from_iterable', + 'select_max_by', + 'get_columns_namedtuple', + 'highlight_min', + '_add_indices', + 'interpolate', + 'flatten_dict_of_dicts', + 'key_map', + 'parse_query_expression', + 'length', + 'get_last_index_matching_proportion', + 'get_lower_whiskers', + 'get_counts_per_bin'] def unique_cols(s: pd.Series) -> bool: @@ -285,16 +308,15 @@ def get_lower_whiskers(df: pd.DataFrame, factor: float = 1.5) -> pd.Series: return ret -def get_counts_per_bin(df: pd.DataFrame, bins: range, columns: Optional[List[str]] = None) -> pd.DataFrame: +def get_counts_per_bin(df: pd.DataFrame, + bins: range, + columns: Optional[List[str]] = None) -> pd.DataFrame: """Return counts per bin for selected columns in DataFrame.""" counts_per_bin = dict() if columns is None: columns = df.columns.to_list() for col in columns: - _series = (pd.cut(df[col], bins=bins) - .to_frame() - .groupby(col) - .size()) + _series = (pd.cut(df[col], bins=bins).to_frame().groupby(col).size()) _series.index.name = 'bin' counts_per_bin[col] = _series counts_per_bin = pd.DataFrame(counts_per_bin) diff --git a/vaep/plotting/__init__.py b/vaep/plotting/__init__.py index 17cc86ced..105c183f8 100644 --- a/vaep/plotting/__init__.py +++ b/vaep/plotting/__init__.py @@ -11,11 +11,8 @@ import seaborn import vaep.pandas - -from . import data, defaults, errors, plotly -from .errors import plot_rolling_error - -# from . 
diff --git a/vaep/pandas/__init__.py b/vaep/pandas/__init__.py
index 97520bb02..5f82204b1 100644
--- a/vaep/pandas/__init__.py
+++ b/vaep/pandas/__init__.py
@@ -7,7 +7,30 @@
 import omegaconf
 import pandas as pd
 
-from .calc_errors import calc_errors_per_feat, get_absolute_error
+from vaep.pandas.calc_errors import calc_errors_per_feat, get_absolute_error
+
+__all__ = [
+    'calc_errors_per_feat',
+    'get_absolute_error',
+    'unique_cols',
+    'get_unique_non_unique_columns',
+    'prop_unique_index',
+    'replace_with',
+    'index_to_dict',
+    'get_columns_accessor',
+    'get_columns_accessor_from_iterable',
+    'select_max_by',
+    'get_columns_namedtuple',
+    'highlight_min',
+    '_add_indices',
+    'interpolate',
+    'flatten_dict_of_dicts',
+    'key_map',
+    'parse_query_expression',
+    'length',
+    'get_last_index_matching_proportion',
+    'get_lower_whiskers',
+    'get_counts_per_bin']
 
 
 def unique_cols(s: pd.Series) -> bool:
@@ -285,16 +308,15 @@ def get_lower_whiskers(df: pd.DataFrame, factor: float = 1.5) -> pd.Series:
     return ret
 
 
-def get_counts_per_bin(df: pd.DataFrame, bins: range, columns: Optional[List[str]] = None) -> pd.DataFrame:
+def get_counts_per_bin(df: pd.DataFrame,
+                       bins: range,
+                       columns: Optional[List[str]] = None) -> pd.DataFrame:
     """Return counts per bin for selected columns in DataFrame."""
     counts_per_bin = dict()
     if columns is None:
         columns = df.columns.to_list()
     for col in columns:
-        _series = (pd.cut(df[col], bins=bins)
-                   .to_frame()
-                   .groupby(col)
-                   .size())
+        _series = (pd.cut(df[col], bins=bins).to_frame().groupby(col).size())
         _series.index.name = 'bin'
         counts_per_bin[col] = _series
     counts_per_bin = pd.DataFrame(counts_per_bin)
diff --git a/vaep/plotting/__init__.py b/vaep/plotting/__init__.py
index 17cc86ced..105c183f8 100644
--- a/vaep/plotting/__init__.py
+++ b/vaep/plotting/__init__.py
@@ -11,11 +11,8 @@
 import seaborn
 
 import vaep.pandas
-
-from . import data, defaults, errors, plotly
-from .errors import plot_rolling_error
-
-# from .defaults import order_categories, labels_dict, IDX_ORDER
+from vaep.plotting import data, defaults, errors, plotly
+from vaep.plotting.errors import plot_rolling_error
 
 seaborn.set_style("whitegrid")
 # seaborn.set_theme()
diff --git a/vaep/sampling.py b/vaep/sampling.py
index 78939c59b..ae1aaaa1a 100644
--- a/vaep/sampling.py
+++ b/vaep/sampling.py
@@ -1,13 +1,11 @@
 import logging
-from typing import Union, Tuple
-
+from typing import Tuple, Union
 
 import numpy as np
 import pandas as pd
 
 from vaep.io.datasplits import DataSplits
 
-
 logger = logging.getLogger(__name__)
 
 
@@ -184,8 +182,7 @@ def sample_mnar_mcar(df_long: pd.DataFrame,
     return splits, thresholds, fake_na_mcar, fake_na_mnar
 
 
-def get_thresholds(df_long: pd.DataFrame,
-                   frac_non_train: float,
+def get_thresholds(df_long: pd.DataFrame, frac_non_train: float,
                    random_state: int) -> pd.Series:
     """Get thresholds for MNAR/MCAR sampling.
     Thresholds are sampled from a normal distrubiton with a mean of the quantile of the simulated missing data.
@@ -207,10 +204,47 @@ def get_thresholds(df_long: pd.DataFrame,
     """
    quantile_frac = df_long.quantile(frac_non_train)
     rng = np.random.default_rng(random_state)
-    thresholds = pd.Series(rng.normal(loc=float(quantile_frac),
-                                      scale=float(0.3 * df_long.std()),
-                                      size=len(df_long),
-                                      ),
-                           index=df_long.index,
-                           )
+    thresholds = pd.Series(
+        rng.normal(
+            loc=float(quantile_frac),
+            scale=float(0.3 * df_long.std()),
+            size=len(df_long),
+        ),
+        index=df_long.index,
+    )
     return thresholds
+
+
+def check_split_integrity(splits: DataSplits) -> DataSplits:
+    """Check if feature IDs are only in the validation or test data (rare cases).
+    Returns the corrected splits."""
+    diff = (splits
+            .val_y
+            .index
+            .levels[-1]
+            .difference(splits
+                        .train_X
+                        .index
+                        .levels[-1]
+                        ).to_list())
+    if diff:
+        logger.warning(f"Remove from val: {diff}")
+        to_remove = splits.val_y.loc[pd.IndexSlice[:, diff]]
+        splits.train_X = pd.concat([splits.train_X, to_remove])
+        splits.val_y = splits.val_y.drop(to_remove.index)
+
+    diff = (splits
+            .test_y
+            .index
+            .levels[-1]
+            .difference(splits
+                        .train_X
+                        .index
+                        .levels[-1]
+                        ).to_list())
+    if diff:
+        logger.warning(f"Remove from test: {diff}")
+        to_remove = splits.test_y.loc[pd.IndexSlice[:, diff]]
+        splits.train_X = pd.concat([splits.train_X, to_remove])
+        splits.test_y = splits.test_y.drop(to_remove.index)
+    return splits
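The new `vaep.sampling.check_split_integrity` moves features that occur only in the validation or test split back into the training split. A minimal usage sketch (it assumes an already created `DataSplits` object named `splits` with a sample/feature MultiIndex; nothing is constructed here):

```python
import vaep.sampling

# `splits` is assumed to exist already: a vaep.io.datasplits.DataSplits with
# train_X, val_y and test_y indexed by a (sample, feature) MultiIndex
splits = vaep.sampling.check_split_integrity(splits)
```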
diff --git a/vaep/sklearn/cf_transformer.py b/vaep/sklearn/cf_transformer.py
index ac152a92b..022973e6e 100644
--- a/vaep/sklearn/cf_transformer.py
+++ b/vaep/sklearn/cf_transformer.py
@@ -8,12 +8,13 @@
 from fastai import learner
 from fastai.callback.tracker import EarlyStoppingCallback
 from fastai.collab import *
-from fastai.collab import CollabDataLoaders, EmbeddingDotBias, TabularCollab
+from fastai.collab import EmbeddingDotBias, TabularCollab
 from fastai.data.block import TransformBlock
 from fastai.data.transforms import IndexSplitter
 from fastai.learner import Learner
 from fastai.losses import MSELossFlat
 from fastai.tabular.all import *
+from fastai.tabular.all import TransformBlock
 from fastai.tabular.core import Categorify
 from fastai.torch_core import default_device
 from sklearn.base import BaseEstimator, TransformerMixin
@@ -89,21 +90,16 @@ def fit(self, X: pd.Series, y: pd.Series = None,
         if not cuda:
             default_device(use=False)  # set to cpu
         if y is not None:
-            X, frac = collab.combine_data(X, y)
+            # Concatenate train and validation observations into one dataframe
+            first_N_train = len(X)
+            X, _ = collab.combine_data(X, y)
         else:
-            X, frac = X.reset_index(), 0.0
-
-        self.dls = CollabDataLoaders.from_df(
-            X,
-            valid_pct=frac,
-            seed=42,
-            user_name=self.sample_column,
-            item_name=self.item_column,
-            rating_name=self.target_column,
-            bs=self.batch_size)
+            X, _ = X.reset_index(), 0.0
+
         splits = None
         if y is not None:
-            idx_splitter = IndexSplitter(list(range(len(X), len(X) + len(y))))
+            # specify positional indices of validation data
+            idx_splitter = IndexSplitter(list(range(first_N_train, len(X))))
             splits = idx_splitter(X)
 
         self.cat_names = [self.sample_column, self.item_column]