diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 349e8173c..60149e4b1 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -67,19 +67,27 @@ jobs: run: | cd project mkdir runs - papermill 04_1_train_DAE_VAE_wo_val_data.ipynb runs/04_1_train_DAE_VAE_wo_val_data.ipynb papermill 04_1_train_pimms_models.ipynb runs/04_1_train_pimms_models.ipynb - - name: Run demo workflow (integration test) + papermill 04_1_train_pimms_models.ipynb runs/04_1_train_pimms_models_no_val.ipynb -p sample_splits False + - name: Dry-Run demo workflow (integration test) continue-on-error: true run: | cd project snakemake -p -c1 --configfile config/single_dev_dataset/example/config.yaml -n + - name: Run demo workflow (integration test) + continue-on-error: true + run: | + cd project snakemake -p -c4 -k --configfile config/single_dev_dataset/example/config.yaml - name: Run demo workflow again (in case of installation issues) + continue-on-error: true run: | cd project - snakemake -p -c1 -n --configfile config/single_dev_dataset/example/config.yaml snakemake -p -c4 -k --configfile config/single_dev_dataset/example/config.yaml + - name: Run demo workflow again (in case of installation issues) - one thread + run: | + cd project + snakemake -p -c1 --configfile config/single_dev_dataset/example/config.yaml - name: Archive results # https://github.com/actions/upload-artifact uses: actions/upload-artifact@v4 diff --git a/.github/workflows/test_pkg_on_colab.yaml b/.github/workflows/test_pkg_on_colab.yaml index 546434bb3..3f0b6e757 100644 --- a/.github/workflows/test_pkg_on_colab.yaml +++ b/.github/workflows/test_pkg_on_colab.yaml @@ -1,10 +1,10 @@ name: Test that tutorial runs on latest colab image on: - push: - branches: [dev] + # push: + # branches: [main] pull_request: - branches: [main, dev] + branches: [main] schedule: - cron: '0 2 3 * *' @@ -24,3 +24,5 @@ jobs: run: | cd project papermill 04_1_train_pimms_models.ipynb 04_1_train_pimms_models_output.ipynb + papermill 04_1_train_pimms_models.ipynb 04_1_train_pimms_models_no_val.ipynb -p sample_splits False + diff --git a/.github/workflows/workflow_website.yaml b/.github/workflows/workflow_website.yaml index be97836db..d4f4e845a 100644 --- a/.github/workflows/workflow_website.yaml +++ b/.github/workflows/workflow_website.yaml @@ -39,9 +39,15 @@ jobs: cd project snakemake -s workflow/Snakefile_v2.smk --configfile config/alzheimer_study/config.yaml -p -c4 -k - name: Run demo workflow again (in case of installation issues) + continue-on-error: true run: | cd project snakemake -s workflow/Snakefile_v2.smk --configfile config/alzheimer_study/config.yaml -p -c4 -k + - name: Run demo workflow again (in case of installation issues) with one thread + continue-on-error: true + run: | + cd project + snakemake -s workflow/Snakefile_v2.smk --configfile config/alzheimer_study/config.yaml -p -c1 -k - name: Run differential analysis workflow run: | cd project diff --git a/README.md b/README.md index 4af30064e..b11a02b64 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ The publication is accepted in Nature Communications and the pre-print is available [on biorxiv](https://doi.org/10.1101/2023.01.12.523792). > `PIMMS` was called `vaep` during development. -> Before entire refactoring has to been completed the imported package will be `vaep`. +> Before entire refactoring has been completed the imported package will be `vaep`. We provide functionality as a python package, an excutable workflow or simply in notebooks. 
@@ -127,17 +127,24 @@ mamba env create -n pimms -f environment.yml # faster, less then 5mins If on Mac M1, M2 or having otherwise issue using your accelerator (e.g. GPUs): Install the pytorch dependencies first, then the rest of the environment: -### Install pytorch first (M-chips) +### Install pytorch first + +> :warning: We currently see issues with some installations on M1 chips. A dependency +> for one workflow is polars, which causes the issue. This should be [fixed now](https://github.com/RasmussenLab/njab/pull/13) +> for general use by delayed import +> of `mrmr-selection` in `njab`. If you encounter issues, please open an issue. Check how to install pytorch for your system [here](https://pytorch.org/get-started). - select the version compatible with your cuda version if you have an nvidia gpu or a Mac M-chip. ```bash -conda create -n vaep python=3.9 pip -conda activate vaep -# Follow instructions on https://pytorch.org/get-started -# conda env update -f environment.yml -n vaep # should not install the rest. +conda create -n pimms python=3.9 pip +conda activate pimms +# Follow instructions on https://pytorch.org/get-started: +# CUDA is not available on MacOS, please use default package +# pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu +conda install pytorch::pytorch torchvision torchaudio fastai -c pytorch -c fastai -y pip install pimms-learn pip install jupyterlab papermill # use run notebook interactively or as a script diff --git a/project/01_0_split_data.ipynb b/project/01_0_split_data.ipynb index 3b757bb1c..d73788af0 100644 --- a/project/01_0_split_data.ipynb +++ b/project/01_0_split_data.ipynb @@ -1370,48 +1370,7 @@ "# -> or raise error as feature completness treshold is so low that less than 3 samples\n", "# per feature are allowd.\n", "\n", - "diff = (splits\n", - " .val_y\n", - " .index\n", - " .levels[-1]\n", - " .difference(splits\n", - " .train_X\n", - " .index\n", - " .levels[-1]\n", - " ).to_list())\n", - "if diff:\n", - " to_remove = splits.val_y.loc[pd.IndexSlice[:, diff]]\n", - " display(to_remove)\n", - " splits.train_X = pd.concat([splits.train_X, to_remove])\n", - " splits.val_y = splits.val_y.drop(to_remove.index)\n", - "diff" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "hide-input" - ] - }, - "outputs": [], - "source": [ - "diff = (splits\n", - " .test_y\n", - " .index\n", - " .levels[-1]\n", - " .difference(splits\n", - " .train_X\n", - " .index\n", - " .levels[-1]\n", - " ).to_list())\n", - "if diff:\n", - " to_remove = splits.test_y.loc[pd.IndexSlice[:, diff]]\n", - " display(to_remove)\n", - " splits.train_X = pd.concat([splits.train_X, to_remove])\n", - " splits.test_y = splits.test_y.drop(to_remove.index)\n", - "diff" + "splits = vaep.sampling.check_split_integrity(splits)" ] }, { @@ -1812,17 +1771,6 @@ "writer.close()\n", "dumps" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "hide-input" - ] - }, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/project/01_0_split_data.py b/project/01_0_split_data.py index 9be123d10..e4b4c2450 100644 --- a/project/01_0_split_data.py +++ b/project/01_0_split_data.py @@ -724,38 +724,7 @@ def join_as_str(seq): # -> or raise error as feature completness treshold is so low that less than 3 samples # per feature are allowd. 
-diff = (splits - .val_y - .index - .levels[-1] - .difference(splits - .train_X - .index - .levels[-1] - ).to_list()) -if diff: - to_remove = splits.val_y.loc[pd.IndexSlice[:, diff]] - display(to_remove) - splits.train_X = pd.concat([splits.train_X, to_remove]) - splits.val_y = splits.val_y.drop(to_remove.index) -diff - -# %% tags=["hide-input"] -diff = (splits - .test_y - .index - .levels[-1] - .difference(splits - .train_X - .index - .levels[-1] - ).to_list()) -if diff: - to_remove = splits.test_y.loc[pd.IndexSlice[:, diff]] - display(to_remove) - splits.train_X = pd.concat([splits.train_X, to_remove]) - splits.test_y = splits.test_y.drop(to_remove.index) -diff +splits = vaep.sampling.check_split_integrity(splits) # %% [markdown] # Some tools require at least 4 observation in the training data, @@ -963,5 +932,3 @@ def join_as_str(seq): # %% tags=["hide-input"] writer.close() dumps - -# %% tags=["hide-input"] diff --git a/project/01_1_train_CF.ipynb b/project/01_1_train_CF.ipynb index b508cd19d..b1d3eac71 100644 --- a/project/01_1_train_CF.ipynb +++ b/project/01_1_train_CF.ipynb @@ -98,7 +98,6 @@ "# model\n", "# Dimensionality of encoding dimension (latent space of model)\n", "latent_dim: int = 10\n", - "# hidden_layers:str = '128_64' # Underscore separated string of layers, '128 64' for the encoder, reversed for decoder\n", "sample_idx_position: int = 0 # position of index which is sample ID\n", "model: str = 'CF' # model name\n", "model_key: str = 'CF' # potentially alternative key for model (grid search)\n", diff --git a/project/01_1_train_CF.py b/project/01_1_train_CF.py index 68acf2afa..847860649 100644 --- a/project/01_1_train_CF.py +++ b/project/01_1_train_CF.py @@ -69,7 +69,6 @@ # model # Dimensionality of encoding dimension (latent space of model) latent_dim: int = 10 -# hidden_layers:str = '128_64' # Underscore separated string of layers, '128 64' for the encoder, reversed for decoder sample_idx_position: int = 0 # position of index which is sample ID model: str = 'CF' # model name model_key: str = 'CF' # potentially alternative key for model (grid search) diff --git a/project/01_1_train_NAGuideR_methods.R b/project/01_1_train_NAGuideR_methods.R index 13997aa17..3918a663c 100644 --- a/project/01_1_train_NAGuideR_methods.R +++ b/project/01_1_train_NAGuideR_methods.R @@ -20,6 +20,8 @@ # - BiocManager could be moved to methods who are installed from BioConductor # + tags=["hide-input"] vscode={"languageId": "r"} +# options("install.lock"=FALSE) + packages_base_R <- c("BiocManager", "reshape2", "data.table", "readr", "tibble") @@ -130,6 +132,7 @@ nafunctions <- function(x, method = "zero") { else if (method == "qrilc") { install_bioconductor("impute") install_bioconductor("pcaMethods") + install_rpackage('gmm') install_rpackage('imputeLCMD') xxm <- t(df1) data_zero1 <- @@ -139,6 +142,7 @@ nafunctions <- function(x, method = "zero") { else if (method == "mindet") { install_bioconductor("impute") install_bioconductor("pcaMethods") + install_rpackage('gmm') install_rpackage('imputeLCMD') xxm <- as.matrix(df1) df <- imputeLCMD::impute.MinDet(xxm, q = 0.01) @@ -146,6 +150,7 @@ nafunctions <- function(x, method = "zero") { else if (method == "minprob") { install_bioconductor("impute") install_bioconductor("pcaMethods") + install_rpackage('gmm') install_rpackage('imputeLCMD') xxm <- as.matrix(df1) df <- @@ -278,6 +283,7 @@ nafunctions <- function(x, method = "zero") { install_bioconductor("impute") install_bioconductor("pcaMethods") + install_rpackage('gmm') install_rpackage('imputeLCMD') 
install_rpackage("magrittr") install_rpackage("glmnet") diff --git a/project/01_1_train_NAGuideR_methods.ipynb b/project/01_1_train_NAGuideR_methods.ipynb index 23fae4bd3..f6a7a6a1b 100644 --- a/project/01_1_train_NAGuideR_methods.ipynb +++ b/project/01_1_train_NAGuideR_methods.ipynb @@ -26,6 +26,8 @@ }, "outputs": [], "source": [ + "# options(\"install.lock\"=FALSE)\n", + "\n", "packages_base_R <-\n", " c(\"BiocManager\", \"reshape2\", \"data.table\", \"readr\", \"tibble\")\n", "\n", @@ -160,6 +162,7 @@ " else if (method == \"qrilc\") {\n", " install_bioconductor(\"impute\")\n", " install_bioconductor(\"pcaMethods\")\n", + " install_rpackage('gmm')\n", " install_rpackage('imputeLCMD')\n", " xxm <- t(df1)\n", " data_zero1 <-\n", @@ -169,6 +172,7 @@ " else if (method == \"mindet\") {\n", " install_bioconductor(\"impute\")\n", " install_bioconductor(\"pcaMethods\")\n", + " install_rpackage('gmm')\n", " install_rpackage('imputeLCMD')\n", " xxm <- as.matrix(df1)\n", " df <- imputeLCMD::impute.MinDet(xxm, q = 0.01)\n", @@ -176,6 +180,7 @@ " else if (method == \"minprob\") {\n", " install_bioconductor(\"impute\")\n", " install_bioconductor(\"pcaMethods\")\n", + " install_rpackage('gmm')\n", " install_rpackage('imputeLCMD')\n", " xxm <- as.matrix(df1)\n", " df <-\n", @@ -308,6 +313,7 @@ " \n", " install_bioconductor(\"impute\")\n", " install_bioconductor(\"pcaMethods\")\n", + " install_rpackage('gmm')\n", " install_rpackage('imputeLCMD')\n", " install_rpackage(\"magrittr\")\n", " install_rpackage(\"glmnet\")\n", diff --git a/project/04_1_train_DAE_VAE_wo_val_data.ipynb b/project/04_1_train_DAE_VAE_wo_val_data.ipynb deleted file mode 100644 index baaae316f..000000000 --- a/project/04_1_train_DAE_VAE_wo_val_data.ipynb +++ /dev/null @@ -1,348 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "2b842a1a", - "metadata": {}, - "source": [ - "# Scikit-learn styple transformers of the data\n", - "\n", - "1. Load data into pandas dataframe\n", - "2. Fit transformer on training data\n", - "3. Impute only missing values with predictions from model\n", - "\n", - "Autoencoders need wide training data, i.e. a sample with all its features' intensities, whereas\n", - "Collaborative Filtering needs long training data, i.e. sample identifier a feature identifier and the intensity.\n", - "Both data formats can be transformed into each other, but models using long data format do not need to\n", - "take care of missing values." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b6c7aec0", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "import vaep.plotting.data\n", - "from vaep.sklearn.ae_transformer import AETransformer\n", - "import vaep.sampling\n", - "\n", - "\n", - "IN_COLAB = 'COLAB_GPU' in os.environ\n", - "\n", - "fn_intensities = 'data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv'\n", - "if IN_COLAB:\n", - " fn_intensities = 'https://raw.githubusercontent.com/RasmussenLab/pimms/main/project/data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4f1ccbdd", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "\n", - "vaep.plotting.make_large_descriptors(8)" - ] - }, - { - "cell_type": "markdown", - "id": "9ff56309", - "metadata": {}, - "source": [ - "## Data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b921b86c", - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.read_csv(fn_intensities, index_col=0)\n", - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "43798bb3", - "metadata": {}, - "source": [ - "We will need the data in long format for Collaborative Filtering.\n", - "Naming both the row and column index assures\n", - "that the data can be transformed very easily into long format:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6d29c02d", - "metadata": {}, - "outputs": [], - "source": [ - "df.index.name = 'Sample ID' # already set\n", - "df.columns.name = 'protein group' # not set due to csv disk file format\n", - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "cb166253", - "metadata": {}, - "source": [] - }, - { - "cell_type": "markdown", - "id": "dd2148b8", - "metadata": {}, - "source": [ - "Transform the data using the logarithm, here using base 2:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b599efb8", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "df = np.log2(df + 1)\n", - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "8b264559", - "metadata": {}, - "source": [ - "two plots on data availability:\n", - "\n", - "1. proportion of missing values per feature median (N = protein groups)\n", - "2. CDF of available intensities per protein group" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "da3c8dba", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "ax = vaep.plotting.data.plot_feat_median_over_prop_missing(\n", - " data=df, type='boxplot')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a912a04a", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "df.notna().sum().sort_values().plot()" - ] - }, - { - "cell_type": "markdown", - "id": "29d451c0", - "metadata": {}, - "source": [ - "define a minimum feature and sample frequency for a feature to be included" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ed278540", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "SELECT_FEAT = True\n", - "\n", - "\n", - "def select_features(df, feat_prevalence=.2, axis=0):\n", - " # # ! 
vaep.filter.select_features\n", - " N = df.shape[axis]\n", - " minimum_freq = N * feat_prevalence\n", - " freq = df.notna().sum(axis=axis)\n", - " mask = freq >= minimum_freq\n", - " print(f\"Drop {(~mask).sum()} along axis {axis}.\")\n", - " freq = freq.loc[mask]\n", - " if axis == 0:\n", - " df = df.loc[:, mask]\n", - " else:\n", - " df = df.loc[mask]\n", - " return df\n", - "\n", - "\n", - "if SELECT_FEAT:\n", - " # potentially this can take a few iterations to stabilize.\n", - " df = select_features(df, feat_prevalence=.2)\n", - " df = select_features(df=df, feat_prevalence=.3, axis=1)\n", - "df.shape" - ] - }, - { - "cell_type": "markdown", - "id": "9f4bcf48", - "metadata": {}, - "source": [ - "## AutoEncoder architectures" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "487a4f7c", - "metadata": {}, - "outputs": [], - "source": [ - "# Reload data (for demonstration)\n", - "\n", - "df = pd.read_csv(fn_intensities, index_col=0)\n", - "df.index.name = 'Sample ID' # already set\n", - "df.columns.name = 'protein group' # not set due to csv disk file format\n", - "df = np.log2(df + 1) # log transform\n", - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "ceee9ced", - "metadata": {}, - "source": [ - "Test `DAE` or `VAE` model without validation data:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5c6b5ab9", - "metadata": {}, - "outputs": [], - "source": [ - "model_selected = 'VAE' # 'DAE'\n", - "model = AETransformer(\n", - " model=model_selected,\n", - " hidden_layers=[512,],\n", - " latent_dim=50,\n", - " out_folder='runs/scikit_interface',\n", - " batch_size=10,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f4ab5d10", - "metadata": {}, - "outputs": [], - "source": [ - "model.fit(df,\n", - " epochs_max=2,\n", - " cuda=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9ae733fc", - "metadata": {}, - "outputs": [], - "source": [ - "df_imputed = model.transform(df)\n", - "df_imputed" - ] - }, - { - "cell_type": "markdown", - "id": "3526ad09", - "metadata": {}, - "source": [ - "DAE" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e87d1eae", - "metadata": {}, - "outputs": [], - "source": [ - "model_selected = 'DAE'\n", - "model = AETransformer(\n", - " model=model_selected,\n", - " hidden_layers=[512,],\n", - " latent_dim=50,\n", - " out_folder='runs/scikit_interface',\n", - " batch_size=10,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c6c60295", - "metadata": {}, - "outputs": [], - "source": [ - "model.fit(df,\n", - " epochs_max=2,\n", - " cuda=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "27e80959", - "metadata": {}, - "outputs": [], - "source": [ - "df_imputed = model.transform(df)\n", - "df_imputed" - ] - } - ], - "metadata": { - "jupytext": { - "cell_metadata_filter": "-all", - "main_language": "python", - "notebook_metadata_filter": "-all" - }, - "kernelspec": { - "display_name": "Python", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.17" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/project/04_1_train_DAE_VAE_wo_val_data.py b/project/04_1_train_DAE_VAE_wo_val_data.py deleted file mode 100644 index 
2fbc3564d..000000000 --- a/project/04_1_train_DAE_VAE_wo_val_data.py +++ /dev/null @@ -1,160 +0,0 @@ -# %% [markdown] -# # Scikit-learn styple transformers of the data -# -# 1. Load data into pandas dataframe -# 2. Fit transformer on training data -# 3. Impute only missing values with predictions from model -# -# Autoencoders need wide training data, i.e. a sample with all its features' intensities, whereas -# Collaborative Filtering needs long training data, i.e. sample identifier a feature identifier and the intensity. -# Both data formats can be transformed into each other, but models using long data format do not need to -# take care of missing values. - -# %% -import os -import pandas as pd -import numpy as np - -import vaep.plotting.data -from vaep.sklearn.ae_transformer import AETransformer -import vaep.sampling - - -IN_COLAB = 'COLAB_GPU' in os.environ - -fn_intensities = 'data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv' -if IN_COLAB: - fn_intensities = 'https://raw.githubusercontent.com/RasmussenLab/pimms/main/project/data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv' - -# %% - - -vaep.plotting.make_large_descriptors(8) - -# %% [markdown] -# ## Data - -# %% -df = pd.read_csv(fn_intensities, index_col=0) -df.head() - -# %% [markdown] -# We will need the data in long format for Collaborative Filtering. -# Naming both the row and column index assures -# that the data can be transformed very easily into long format: - -# %% -df.index.name = 'Sample ID' # already set -df.columns.name = 'protein group' # not set due to csv disk file format -df.head() - -# %% [markdown] -# - -# %% [markdown] -# Transform the data using the logarithm, here using base 2: - -# %% -df = np.log2(df + 1) -df.head() - - -# %% [markdown] -# two plots on data availability: -# -# 1. proportion of missing values per feature median (N = protein groups) -# 2. CDF of available intensities per protein group - -# %% -ax = vaep.plotting.data.plot_feat_median_over_prop_missing( - data=df, type='boxplot') - - -# %% -df.notna().sum().sort_values().plot() - - -# %% [markdown] -# define a minimum feature and sample frequency for a feature to be included - -# %% -SELECT_FEAT = True - - -def select_features(df, feat_prevalence=.2, axis=0): - # # ! vaep.filter.select_features - N = df.shape[axis] - minimum_freq = N * feat_prevalence - freq = df.notna().sum(axis=axis) - mask = freq >= minimum_freq - print(f"Drop {(~mask).sum()} along axis {axis}.") - freq = freq.loc[mask] - if axis == 0: - df = df.loc[:, mask] - else: - df = df.loc[mask] - return df - - -if SELECT_FEAT: - # potentially this can take a few iterations to stabilize. 
- df = select_features(df, feat_prevalence=.2) - df = select_features(df=df, feat_prevalence=.3, axis=1) -df.shape - - -# %% [markdown] -# ## AutoEncoder architectures - -# %% -# Reload data (for demonstration) - -df = pd.read_csv(fn_intensities, index_col=0) -df.index.name = 'Sample ID' # already set -df.columns.name = 'protein group' # not set due to csv disk file format -df = np.log2(df + 1) # log transform -df.head() - -# %% [markdown] -# Test `DAE` or `VAE` model without validation data: - -# %% -model_selected = 'VAE' # 'DAE' -model = AETransformer( - model=model_selected, - hidden_layers=[512,], - latent_dim=50, - out_folder='runs/scikit_interface', - batch_size=10, -) - -# %% -model.fit(df, - epochs_max=2, - cuda=False) - -# %% -df_imputed = model.transform(df) -df_imputed - -# %% [markdown] -# DAE - -# %% -model_selected = 'DAE' -model = AETransformer( - model=model_selected, - hidden_layers=[512,], - latent_dim=50, - out_folder='runs/scikit_interface', - batch_size=10, -) - -# %% -model.fit(df, - epochs_max=2, - cuda=False) - -# %% -df_imputed = model.transform(df) -df_imputed diff --git a/project/04_1_train_pimms_models.ipynb b/project/04_1_train_pimms_models.ipynb index 5169e2d07..657874a23 100644 --- a/project/04_1_train_pimms_models.ipynb +++ b/project/04_1_train_pimms_models.ipynb @@ -26,6 +26,7 @@ "source": [ "import os\n", "from importlib import metadata\n", + "\n", "IN_COLAB = 'COLAB_GPU' in os.environ\n", "if IN_COLAB:\n", " try:\n", @@ -42,45 +43,99 @@ "id": "36b5a27d", "metadata": {}, "source": [ - "If on colab, please restart the environment and run everything from here on." + "If on colab, please restart the environment and run everything from here on.\n", + "\n", + "Specify example data:" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "65b5bdaf", "metadata": {}, "outputs": [], "source": [ "import os\n", + "\n", "IN_COLAB = 'COLAB_GPU' in os.environ\n", "\n", "fn_intensities = 'data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv'\n", "if IN_COLAB:\n", - " fn_intensities = 'https://raw.githubusercontent.com/RasmussenLab/pimms/main/project/data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv'" + " fn_intensities = ('https://raw.githubusercontent.com/RasmussenLab/pimms/main/'\n", + " 'project/data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv')" + ] + }, + { + "cell_type": "markdown", + "id": "2d628ce9", + "metadata": {}, + "source": [ + "Load package." 
] }, { "cell_type": "code", "execution_count": null, "id": "1c289d17", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ - "import numpy as np\n", - "import pandas as pd\n", + "import logging\n", + "import random\n", "\n", "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "from IPython.display import display\n", "\n", - "\n", - "from vaep.plotting.defaults import color_model_mapping\n", + "import vaep.filter\n", "import vaep.plotting.data\n", "import vaep.sampling\n", - "\n", - "from vaep.sklearn.cf_transformer import CollaborativeFilteringTransformer\n", + "from vaep.plotting.defaults import color_model_mapping\n", "from vaep.sklearn.ae_transformer import AETransformer\n", + "from vaep.sklearn.cf_transformer import CollaborativeFilteringTransformer\n", "\n", - "vaep.plotting.make_large_descriptors(8)" + "vaep.plotting.make_large_descriptors(8)\n", + "\n", + "\n", + "logger = logger = vaep.logging.setup_nb_logger()\n", + "logging.getLogger('fontTools').setLevel(logging.ERROR)" + ] + }, + { + "cell_type": "markdown", + "id": "59282a30", + "metadata": {}, + "source": [ + "## Parameters\n", + "Can be set by papermill on the command line or manually in the (colab) notebook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab9b68d6", + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "fn_intensities: str = fn_intensities # path or url to the data file in csv format\n", + "index_name: str = 'Sample ID' # name of the index column\n", + "column_name: str = 'protein group' # name of the column index\n", + "select_features: bool = True # Whether to select features based on prevalence\n", + "feat_prevalence: float = 0.2 # minimum prevalence of a feature to be included\n", + "sample_completeness: float = 0.3 # minimum completeness of a sample to be included\n", + "sample_splits: bool = True # Whether to sample validation and test data\n", + "frac_non_train: float = 0.1 # fraction of non training data (validation and test split)\n", + "frac_mnar: float = 0.0 # fraction of missing not at random data, rest: missing completely at random\n", + "random_state: int = 42 # random state for reproducibility" ] }, { @@ -95,7 +150,11 @@ "cell_type": "code", "execution_count": null, "id": "4a3edbdd", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "df = pd.read_csv(fn_intensities, index_col=0)\n", @@ -116,11 +175,15 @@ "cell_type": "code", "execution_count": null, "id": "4fde25e9", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ - "df.index.name = 'Sample ID' # already set\n", - "df.columns.name = 'protein group' # not set due to csv disk file format\n", + "df.index.name = index_name # already set\n", + "df.columns.name = column_name # not set due to csv disk file format\n", "df.head()" ] }, @@ -128,13 +191,8 @@ "cell_type": "markdown", "id": "c6c788f0", "metadata": {}, - "source": [] - }, - { - "cell_type": "markdown", - "id": "2ab8dc7f", - "metadata": {}, "source": [ + "### Data transformation: log2 transformation\n", "Transform the data using the logarithm, here using base 2:" ] }, @@ -143,7 +201,10 @@ "execution_count": null, "id": "554d4fa7", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -156,7 +217,7 @@ "id": "fbce73d1", "metadata": {}, "source": [ - "two plots on data 
availability:\n", + "### two plots inspecting data availability\n", "\n", "1. proportion of missing values per feature median (N = protein groups)\n", "2. CDF of available intensities per protein group" @@ -167,7 +228,10 @@ "execution_count": null, "id": "536793bb", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -180,7 +244,10 @@ "execution_count": null, "id": "313bc55e", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -192,6 +259,7 @@ "id": "cf54a3af", "metadata": {}, "source": [ + "### Data selection\n", "define a minimum feature and sample frequency for a feature to be included" ] }, @@ -200,32 +268,17 @@ "execution_count": null, "id": "28ad27de", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ - "SELECT_FEAT = True\n", - "\n", - "\n", - "def select_features(df, feat_prevalence=.2, axis=0):\n", - " # # ! vaep.filter.select_features\n", - " N = df.shape[axis]\n", - " minimum_freq = N * feat_prevalence\n", - " freq = df.notna().sum(axis=axis)\n", - " mask = freq >= minimum_freq\n", - " print(f\"Drop {(~mask).sum()} along axis {axis}.\")\n", - " freq = freq.loc[mask]\n", - " if axis == 0:\n", - " df = df.loc[:, mask]\n", - " else:\n", - " df = df.loc[mask]\n", - " return df\n", - "\n", - "\n", - "if SELECT_FEAT:\n", + "if select_features:\n", " # potentially this can take a few iterations to stabilize.\n", - " df = select_features(df, feat_prevalence=.2)\n", - " df = select_features(df=df, feat_prevalence=.3, axis=1)\n", + " df = vaep.filter.select_features(df, feat_prevalence=feat_prevalence)\n", + " df = vaep.filter.select_features(df=df, feat_prevalence=sample_completeness, axis=1)\n", "df.shape" ] }, @@ -242,12 +295,48 @@ "execution_count": null, "id": "646ea5bb", "metadata": { - "lines_to_next_cell": 2 + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ "df = df.stack().to_frame('intensity')\n", - "df.head()" + "df" + ] + }, + { + "cell_type": "markdown", + "id": "54d4b781", + "metadata": {}, + "source": [ + "## Optionally: Sample data\n", + "- models can be trained without subsetting the data\n", + "- allows evaluation of the models" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2586a1c", + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "if sample_splits:\n", + " splits, thresholds, fake_na_mcar, fake_na_mnar = vaep.sampling.sample_mnar_mcar(\n", + " df_long=df,\n", + " frac_non_train=frac_non_train,\n", + " frac_mnar=frac_mnar,\n", + " random_state=random_state,\n", + " )\n", + " splits = vaep.sampling.check_split_integrity(splits)\n", + "else:\n", + " splits = vaep.sampling.DataSplits(is_wide_format=False)\n", + " splits.train_X = df" ] }, { @@ -263,7 +352,9 @@ "id": "d2390859", "metadata": {}, "source": [ - "## Collaborative Filtering" + "## Collaborative Filtering\n", + "\n", + "Inspect annotations of the scikit-learn like Transformer:" ] }, { @@ -273,7 +364,7 @@ "metadata": {}, "outputs": [], "source": [ - "# # # # CollaborativeFilteringTransformer?" + "CollaborativeFilteringTransformer?" 
] }, { @@ -317,12 +408,21 @@ "metadata": {}, "outputs": [], "source": [ - "cf_model.fit(df,\n", + "cf_model.fit(splits.train_X,\n", + " splits.val_y,\n", " cuda=False,\n", " epochs_max=20,\n", " )" ] }, + { + "cell_type": "markdown", + "id": "7db2f4c4", + "metadata": {}, + "source": [ + "Impute missing values usin `transform` method:" + ] + }, { "cell_type": "code", "execution_count": null, @@ -347,7 +447,11 @@ "cell_type": "code", "execution_count": null, "id": "99ff7ecd", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "df_imputed = df_imputed.stack() # long-format\n", @@ -356,16 +460,8 @@ "df_imputed = df_imputed.unstack() # back to wide-format\n", "# some checks\n", "assert len(df) == len(observed)\n", - "assert df_imputed.shape[0] * df_imputed.shape[1] == len(imputed) + len(observed)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "addb7cbf", - "metadata": {}, - "outputs": [], - "source": [ + "assert df_imputed.shape[0] * df_imputed.shape[1] == len(imputed) + len(observed)\n", + "\n", "fig, axes = plt.subplots(2, figsize=(8, 4))\n", "\n", "min_max = vaep.plotting.data.get_min_max_iterable(\n", @@ -405,16 +501,16 @@ "cell_type": "code", "execution_count": null, "id": "b7184c2e", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ - "# Reload data (for demonstration)\n", - "\n", - "df = pd.read_csv(fn_intensities, index_col=0)\n", - "df.index.name = 'Sample ID' # already set\n", - "df.columns.name = 'protein group' # not set due to csv disk file format\n", - "df = np.log2(df + 1) # log transform\n", - "df.head()" + "# Use wide format of data\n", + "splits.to_wide_format()\n", + "splits.train_X" ] }, { @@ -422,21 +518,21 @@ "id": "ae52c6fe", "metadata": {}, "source": [ - "The AutoEncoder model currently need validation data for training.\n", - "We will use 10% of the training data for validation.\n", - "> Expect this limitation to be dropped in the next release. It will still be recommended\n", - "> to use validation data for early stopping." + "Validation data for early stopping (if specified)" ] }, { "cell_type": "code", "execution_count": null, "id": "8bbd0017", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ - "freq_feat = df.notna().sum()\n", - "freq_feat.head() # training data" + "splits.val_y" ] }, { @@ -444,25 +540,25 @@ "id": "6da6c4e2", "metadata": {}, "source": [ - "We will use the `sampling` module to sample the validation data from the training data.\n", - "Could be split differently by providing another `weights` vector." 
+ "Training and validation need the same shape:" ] }, { "cell_type": "code", "execution_count": null, "id": "99e690f9", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ - "val_X, train_X = vaep.sampling.sample_data(df.stack(),\n", - " sample_index_to_drop=0,\n", - " weights=freq_feat,\n", - " frac=0.1,\n", - " random_state=42,)\n", - "val_X, train_X = val_X.unstack(), train_X.unstack()\n", - "val_X = pd.DataFrame(pd.NA, index=train_X.index,\n", - " columns=train_X.columns).fillna(val_X)" + "if splits.val_y is not None:\n", + " splits.val_y = pd.DataFrame(pd.NA, index=splits.train_X.index,\n", + " columns=splits.train_X.columns).fillna(splits.val_y)\n", + "\n", + " print(splits.train_X.shape, splits.val_y.shape)" ] }, { @@ -470,7 +566,7 @@ "id": "45b9c22c", "metadata": {}, "source": [ - "Training data and validation data have the same shape:" + "Select either `DAE` or `VAE` model by chance:" ] }, { @@ -480,43 +576,8 @@ "metadata": {}, "outputs": [], "source": [ - "val_X.shape, train_X.shape" - ] - }, - { - "cell_type": "markdown", - "id": "f89fb41f", - "metadata": {}, - "source": [ - "... but different number of intensities (non-missing values):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "62cd0721", - "metadata": {}, - "outputs": [], - "source": [ - "train_X.notna().sum().sum(), val_X.notna().sum().sum()," - ] - }, - { - "cell_type": "markdown", - "id": "b5a0b973", - "metadata": {}, - "source": [ - "Select either `DAE` or `VAE` model:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "26a12a3e", - "metadata": {}, - "outputs": [], - "source": [ - "model_selected = 'VAE' # 'DAE'\n", + "model_selected = random.choice(['DAE', 'VAE'])\n", + "print(\"Selected model by chance:\", model_selected)\n", "model = AETransformer(\n", " model=model_selected,\n", " hidden_layers=[512,],\n", @@ -529,78 +590,68 @@ { "cell_type": "code", "execution_count": null, - "id": "4d3c7922", + "id": "aea5dc44", "metadata": {}, "outputs": [], "source": [ - "model.fit(train_X, val_X,\n", + "model.fit(splits.train_X, splits.val_y,\n", " epochs_max=50,\n", " cuda=False)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "24ca6c2c", - "metadata": {}, - "outputs": [], - "source": [ - "df_imputed = model.transform(train_X)\n", - "df_imputed" - ] - }, { "cell_type": "markdown", - "id": "17398941", + "id": "f89fb41f", "metadata": {}, "source": [ - "Evaluate the model using the validation data:" + "Impute missing values using `transform` method:" ] }, { "cell_type": "code", "execution_count": null, - "id": "2a664072", + "id": "62cd0721", "metadata": {}, "outputs": [], "source": [ - "pred_val = val_X.stack().to_frame('observed')\n", - "pred_val[model_selected] = df_imputed.stack()\n", - "pred_val" + "df_imputed = model.transform(splits.train_X).stack()\n", + "df_imputed" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "6eaa510a", + "cell_type": "markdown", + "id": "17398941", "metadata": {}, - "outputs": [], "source": [ - "val_metrics = vaep.models.calculte_metrics(pred_val, 'observed')\n", - "# val_metrics = metrics.add_metrics(\n", - "# pred_val, key='test data')\n", - "# val_metrics = pd.DataFrame(val_metrics)\n", - "# val_metrics\n", - "pd.DataFrame(val_metrics)" + "Evaluate the model using the validation data:" ] }, { "cell_type": "code", "execution_count": null, - "id": "f3013daf", - "metadata": {}, + "id": "2a664072", + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], 
"source": [ - "fig, ax = plt.subplots(figsize=(8, 2))\n", + "if splits.val_y is not None:\n", + " pred_val = splits.val_y.stack().to_frame('observed')\n", + " pred_val[model_selected] = df_imputed\n", + " val_metrics = vaep.models.calculte_metrics(pred_val, 'observed')\n", + " display(val_metrics)\n", "\n", - "ax, errors_binned = vaep.plotting.errors.plot_errors_by_median(\n", - " pred=pred_val,\n", - " target_col='observed',\n", - " feat_medians=train_X.median(),\n", - " ax=ax,\n", - " metric_name='MAE',\n", - " palette=color_model_mapping\n", - ")" + " fig, ax = plt.subplots(figsize=(8, 2))\n", + "\n", + " ax, errors_binned = vaep.plotting.errors.plot_errors_by_median(\n", + " pred=pred_val,\n", + " target_col='observed',\n", + " feat_medians=splits.train_X.median(),\n", + " ax=ax,\n", + " metric_name='MAE',\n", + " palette=color_model_mapping)" ] }, { @@ -615,38 +666,44 @@ "cell_type": "code", "execution_count": null, "id": "f89799e8", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ - "df_imputed = df_imputed.replace(val_X)" + "splits.to_long_format()\n", + "df_imputed = df_imputed.replace(splits.val_y).replace(splits.test_y)\n", + "df_imputed" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "235fdb66", + "cell_type": "markdown", + "id": "506de661", "metadata": {}, - "outputs": [], "source": [ - "df = df.stack() # long-format\n", - "df_imputed = df_imputed.stack() # long-format\n", - "observed = df_imputed.loc[df.index]\n", - "imputed = df_imputed.loc[df_imputed.index.difference(df.index)]" + "Plot the distribution of the imputed values vs the observed data:" ] }, { "cell_type": "code", "execution_count": null, - "id": "851ab631", + "id": "235fdb66", "metadata": { - "lines_to_next_cell": 2 + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ + "observed = df_imputed.loc[df.index].squeeze()\n", + "imputed = df_imputed.loc[df_imputed.index.difference(df.index)].squeeze()\n", + "\n", "fig, axes = plt.subplots(2, figsize=(8, 4))\n", "\n", - "min_max = vaep.plotting.data.get_min_max_iterable(\n", - " [observed, imputed])\n", + "min_max = vaep.plotting.data.get_min_max_iterable([observed, imputed])\n", + "\n", "label_template = '{method} (N={n:,d})'\n", "ax, _ = vaep.plotting.data.plot_histogram_intensities(\n", " observed,\n", @@ -669,14 +726,6 @@ " alpha=1)\n", "_ = ax.legend()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a235f133", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/project/04_1_train_pimms_models.py b/project/04_1_train_pimms_models.py index 0a11b509c..79db2634b 100644 --- a/project/04_1_train_pimms_models.py +++ b/project/04_1_train_pimms_models.py @@ -13,6 +13,7 @@ # %% import os from importlib import metadata + IN_COLAB = 'COLAB_GPU' in os.environ if IN_COLAB: try: @@ -25,31 +26,60 @@ # %% [markdown] # If on colab, please restart the environment and run everything from here on. +# +# Specify example data: # %% import os + IN_COLAB = 'COLAB_GPU' in os.environ fn_intensities = 'data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv' if IN_COLAB: - fn_intensities = 'https://raw.githubusercontent.com/RasmussenLab/pimms/main/project/data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv' + fn_intensities = ('https://raw.githubusercontent.com/RasmussenLab/pimms/main/' + 'project/data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv') + +# %% [markdown] +# Load package. 
# %% -import numpy as np -import pandas as pd +import logging +import random import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from IPython.display import display - -from vaep.plotting.defaults import color_model_mapping +import vaep.filter import vaep.plotting.data import vaep.sampling - -from vaep.sklearn.cf_transformer import CollaborativeFilteringTransformer +from vaep.plotting.defaults import color_model_mapping from vaep.sklearn.ae_transformer import AETransformer +from vaep.sklearn.cf_transformer import CollaborativeFilteringTransformer vaep.plotting.make_large_descriptors(8) + +logger = logger = vaep.logging.setup_nb_logger() +logging.getLogger('fontTools').setLevel(logging.ERROR) + +# %% [markdown] +# ## Parameters +# Can be set by papermill on the command line or manually in the (colab) notebook. + +# %% +fn_intensities: str = fn_intensities # path or url to the data file in csv format +index_name: str = 'Sample ID' # name of the index column +column_name: str = 'protein group' # name of the column index +select_features: bool = True # Whether to select features based on prevalence +feat_prevalence: float = 0.2 # minimum prevalence of a feature to be included +sample_completeness: float = 0.3 # minimum completeness of a sample to be included +sample_splits: bool = True # Whether to sample validation and test data +frac_non_train: float = 0.1 # fraction of non training data (validation and test split) +frac_mnar: float = 0.0 # fraction of missing not at random data, rest: missing completely at random +random_state: int = 42 # random state for reproducibility + # %% [markdown] # ## Data @@ -63,14 +93,12 @@ # that the data can be transformed very easily into long format: # %% -df.index.name = 'Sample ID' # already set -df.columns.name = 'protein group' # not set due to csv disk file format +df.index.name = index_name # already set +df.columns.name = column_name # not set due to csv disk file format df.head() # %% [markdown] -# - -# %% [markdown] +# ### Data transformation: log2 transformation # Transform the data using the logarithm, here using base 2: # %% @@ -79,7 +107,7 @@ # %% [markdown] -# two plots on data availability: +# ### two plots inspecting data availability # # 1. proportion of missing values per feature median (N = protein groups) # 2. CDF of available intensities per protein group @@ -94,31 +122,14 @@ # %% [markdown] +# ### Data selection # define a minimum feature and sample frequency for a feature to be included # %% -SELECT_FEAT = True - - -def select_features(df, feat_prevalence=.2, axis=0): - # # ! vaep.filter.select_features - N = df.shape[axis] - minimum_freq = N * feat_prevalence - freq = df.notna().sum(axis=axis) - mask = freq >= minimum_freq - print(f"Drop {(~mask).sum()} along axis {axis}.") - freq = freq.loc[mask] - if axis == 0: - df = df.loc[:, mask] - else: - df = df.loc[mask] - return df - - -if SELECT_FEAT: +if select_features: # potentially this can take a few iterations to stabilize. 
- df = select_features(df, feat_prevalence=.2) - df = select_features(df=df, feat_prevalence=.3, axis=1) + df = vaep.filter.select_features(df, feat_prevalence=feat_prevalence) + df = vaep.filter.select_features(df=df, feat_prevalence=sample_completeness, axis=1) df.shape @@ -127,17 +138,36 @@ def select_features(df, feat_prevalence=.2, axis=0): # %% df = df.stack().to_frame('intensity') -df.head() +df +# %% [markdown] +# ## Optionally: Sample data +# - models can be trained without subsetting the data +# - allows evaluation of the models + +# %% +if sample_splits: + splits, thresholds, fake_na_mcar, fake_na_mnar = vaep.sampling.sample_mnar_mcar( + df_long=df, + frac_non_train=frac_non_train, + frac_mnar=frac_mnar, + random_state=random_state, + ) + splits = vaep.sampling.check_split_integrity(splits) +else: + splits = vaep.sampling.DataSplits(is_wide_format=False) + splits.train_X = df # %% [markdown] # The resulting DataFrame with one column has an `MulitIndex` with the sample and feature identifier. # %% [markdown] # ## Collaborative Filtering +# +# Inspect annotations of the scikit-learn like Transformer: # %% -# # # # CollaborativeFilteringTransformer? +# # CollaborativeFilteringTransformer? # %% [markdown] # Let's set up collaborative filtering without a validation or test set, using @@ -157,11 +187,15 @@ def select_features(df, feat_prevalence=.2, axis=0): # > This will probably mean setting up a validation set within the model. # %% -cf_model.fit(df, +cf_model.fit(splits.train_X, + splits.val_y, cuda=False, epochs_max=20, ) +# %% [markdown] +# Impute missing values usin `transform` method: + # %% df_imputed = cf_model.transform(df).unstack() assert df_imputed.isna().sum().sum() == 0 @@ -179,7 +213,6 @@ def select_features(df, feat_prevalence=.2, axis=0): assert len(df) == len(observed) assert df_imputed.shape[0] * df_imputed.shape[1] == len(imputed) + len(observed) -# %% fig, axes = plt.subplots(2, figsize=(8, 4)) min_max = vaep.plotting.data.get_min_max_iterable( @@ -210,55 +243,32 @@ def select_features(df, feat_prevalence=.2, axis=0): # ## AutoEncoder architectures # %% -# Reload data (for demonstration) - -df = pd.read_csv(fn_intensities, index_col=0) -df.index.name = 'Sample ID' # already set -df.columns.name = 'protein group' # not set due to csv disk file format -df = np.log2(df + 1) # log transform -df.head() +# Use wide format of data +splits.to_wide_format() +splits.train_X # %% [markdown] -# The AutoEncoder model currently need validation data for training. -# We will use 10% of the training data for validation. -# > Expect this limitation to be dropped in the next release. It will still be recommended -# > to use validation data for early stopping. +# Validation data for early stopping (if specified) # %% -freq_feat = df.notna().sum() -freq_feat.head() # training data +splits.val_y # %% [markdown] -# We will use the `sampling` module to sample the validation data from the training data. -# Could be split differently by providing another `weights` vector. 
+# Training and validation need the same shape: # %% -val_X, train_X = vaep.sampling.sample_data(df.stack(), - sample_index_to_drop=0, - weights=freq_feat, - frac=0.1, - random_state=42,) -val_X, train_X = val_X.unstack(), train_X.unstack() -val_X = pd.DataFrame(pd.NA, index=train_X.index, - columns=train_X.columns).fillna(val_X) - -# %% [markdown] -# Training data and validation data have the same shape: +if splits.val_y is not None: + splits.val_y = pd.DataFrame(pd.NA, index=splits.train_X.index, + columns=splits.train_X.columns).fillna(splits.val_y) -# %% -val_X.shape, train_X.shape + print(splits.train_X.shape, splits.val_y.shape) # %% [markdown] -# ... but different number of intensities (non-missing values): +# Select either `DAE` or `VAE` model by chance: # %% -train_X.notna().sum().sum(), val_X.notna().sum().sum(), - -# %% [markdown] -# Select either `DAE` or `VAE` model: - -# %% -model_selected = 'VAE' # 'DAE' +model_selected = random.choice(['DAE', 'VAE']) +print("Selected model by chance:", model_selected) model = AETransformer( model=model_selected, hidden_layers=[512,], @@ -268,59 +278,56 @@ def select_features(df, feat_prevalence=.2, axis=0): ) # %% -model.fit(train_X, val_X, +model.fit(splits.train_X, splits.val_y, epochs_max=50, cuda=False) +# %% [markdown] +# Impute missing values using `transform` method: + # %% -df_imputed = model.transform(train_X) +df_imputed = model.transform(splits.train_X).stack() df_imputed # %% [markdown] # Evaluate the model using the validation data: # %% -pred_val = val_X.stack().to_frame('observed') -pred_val[model_selected] = df_imputed.stack() -pred_val - -# %% -val_metrics = vaep.models.calculte_metrics(pred_val, 'observed') -# val_metrics = metrics.add_metrics( -# pred_val, key='test data') -# val_metrics = pd.DataFrame(val_metrics) -# val_metrics -pd.DataFrame(val_metrics) - -# %% -fig, ax = plt.subplots(figsize=(8, 2)) - -ax, errors_binned = vaep.plotting.errors.plot_errors_by_median( - pred=pred_val, - target_col='observed', - feat_medians=train_X.median(), - ax=ax, - metric_name='MAE', - palette=color_model_mapping -) +if splits.val_y is not None: + pred_val = splits.val_y.stack().to_frame('observed') + pred_val[model_selected] = df_imputed + val_metrics = vaep.models.calculte_metrics(pred_val, 'observed') + display(val_metrics) + + fig, ax = plt.subplots(figsize=(8, 2)) + + ax, errors_binned = vaep.plotting.errors.plot_errors_by_median( + pred=pred_val, + target_col='observed', + feat_medians=splits.train_X.median(), + ax=ax, + metric_name='MAE', + palette=color_model_mapping) # %% [markdown] # replace predicted values with validation data values # %% -df_imputed = df_imputed.replace(val_X) +splits.to_long_format() +df_imputed = df_imputed.replace(splits.val_y).replace(splits.test_y) +df_imputed -# %% -df = df.stack() # long-format -df_imputed = df_imputed.stack() # long-format -observed = df_imputed.loc[df.index] -imputed = df_imputed.loc[df_imputed.index.difference(df.index)] +# %% [markdown] +# Plot the distribution of the imputed values vs the observed data: # %% +observed = df_imputed.loc[df.index].squeeze() +imputed = df_imputed.loc[df_imputed.index.difference(df.index)].squeeze() + fig, axes = plt.subplots(2, figsize=(8, 4)) -min_max = vaep.plotting.data.get_min_max_iterable( - [observed, imputed]) +min_max = vaep.plotting.data.get_min_max_iterable([observed, imputed]) + label_template = '{method} (N={n:,d})' ax, _ = vaep.plotting.data.plot_histogram_intensities( observed, @@ -342,6 +349,3 @@ def select_features(df, 
feat_prevalence=.2, axis=0): color=color_model_mapping[model_selected], alpha=1) _ = ax.legend() - - -# %% diff --git a/project/README.md b/project/README.md index eddd93109..ba3cc6b25 100644 --- a/project/README.md +++ b/project/README.md @@ -84,8 +84,7 @@ papermill 01_0_split_data.ipynb runs/experiment_03/%DATASET%/experiment_03_data tag | notebook | Description --- | --- | --- Tutorials | -tut | 04_1_train_pimms_models.ipynb | main tutorial showing scikit-learn interface partly with validatio data -tut | 04_1_train_DAE_VAE_wo_val_data.ipynb | +- | 04_1_train_pimms_models.ipynb | main tutorial showing scikit-learn (Transformer) interface partly with or without validation data Single experiment | run | 01_0_split_data.ipynb | Create train, validation and test data splits run | 01_0_transform_data_to_wide_format.ipynb | Transform train split to wide format for R models diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index adebc5050..000000000 --- a/requirements.txt +++ /dev/null @@ -1,37 +0,0 @@ -python >= 3.8, < 3.9 -numpy -pandas < 2.0 -scipy >= 1.6 -# plotting -matplotlib -python-kaleido -plotly -seaborn -pip -# ML -pytorch < 2.0 -# pytorch-cuda -scikit-learn -fastai -torchvision -# cudatoolkit=11.7 -tensorboard -umap-learn -# stats -pingouin -statsmodels -# other -tqdm # progress bars -xmltodict # configs -openpyxl # xml -omegaconf -# snakemake -snakemake-minimal < 7.26 -# jupyter -ipykernel -ipython -ipywidgets -jupyterlab # standalone jupyter installation -# jupyter_contrib_nbextensions # delete configuration file if you see an error: https://github.com/jupyter/nbconvert/issues/526#issuecomment-277552771 -jupyter-dash -papermill # execute ipynb's \ No newline at end of file diff --git a/requirements_R.txt b/requirements_R.txt deleted file mode 100644 index c52c1fb66..000000000 --- a/requirements_R.txt +++ /dev/null @@ -1,15 +0,0 @@ -# R3.6.3 is not available as binaries for M1 and M2 amd64 atm (2023-08-11) -r-base # >= 3.6.0, < 4.0 # would be good if it could be relaxed -r-irkernel -# r-biocmanager -r-reshape2 -r-stringi # + rmarkdown hack for reshape2 -r-stringr # reshape2 -r-tidyverse -r-gdata -r-glmnet -r-e1071 -r-norm -r-missforest -r-vim -r-mice \ No newline at end of file diff --git a/requirements_dev.txt b/requirements_dev.txt deleted file mode 100644 index 92626e74b..000000000 --- a/requirements_dev.txt +++ /dev/null @@ -1,13 +0,0 @@ -# dev -pytest -pytest-cov -jupytext -flake8 -flake8-bugbear -build -wheel -setuptools -pre-commit -pre-commit -jupyterlab_code_formatter -jupyterlab-git \ No newline at end of file diff --git a/vaep/__init__.py b/vaep/__init__.py index 98460455a..059ccf970 100644 --- a/vaep/__init__.py +++ b/vaep/__init__.py @@ -9,10 +9,7 @@ from importlib import metadata import njab -import pandas as pd -import pandas.io.formats.format as pf -# from . import logging, nb, pandas, plotting import vaep.logging import vaep.nb import vaep.pandas diff --git a/vaep/analyzers/__init__.py b/vaep/analyzers/__init__.py index 9856dafb9..6d16805a6 100644 --- a/vaep/analyzers/__init__.py +++ b/vaep/analyzers/__init__.py @@ -2,7 +2,7 @@ """ from types import SimpleNamespace -from . 
import compare_predictions, diff_analysis +from vaep.analyzers import compare_predictions, diff_analysis __all__ = ['diff_analysis', 'compare_predictions', 'Analysis'] diff --git a/vaep/data_handling.py b/vaep/data_handling.py index 1f0bc5404..41be078ac 100644 --- a/vaep/data_handling.py +++ b/vaep/data_handling.py @@ -4,8 +4,6 @@ import numpy as np import pandas as pd -# coverage - def coverage(X: pd.DataFrame, coverage_col: float, coverage_row: float): """Select proteins by column depending on their coverage. diff --git a/vaep/filter.py b/vaep/filter.py new file mode 100644 index 000000000..2d26c4806 --- /dev/null +++ b/vaep/filter.py @@ -0,0 +1,24 @@ +import logging + +import pandas as pd + +logger = logging.getLogger(__name__) + + +def select_features(df: pd.DataFrame, + feat_prevalence: float = .2, + axis: int = 0) -> pd.DataFrame: + """Select features or samples with a minimum prevalence. + """ + N = df.shape[axis] + minimum_freq = N * feat_prevalence + freq = df.notna().sum(axis=axis) + mask = freq >= minimum_freq + axis_synonym = "index" if axis == 0 else "columns" + logger.info(f"Drop {(~mask).sum()} along axis {axis} ({axis_synonym}).") + freq = freq.loc[mask] + if axis == 0: + df = df.loc[:, mask] + else: + df = df.loc[mask] + return df diff --git a/vaep/models/__init__.py b/vaep/models/__init__.py index 3be35408b..2ae111d44 100644 --- a/vaep/models/__init__.py +++ b/vaep/models/__init__.py @@ -16,8 +16,7 @@ from fastcore.foundation import L import vaep - -from . import ae, analysis, collab, vae +from vaep.models import ae, analysis, collab, vae logger = logging.getLogger(__name__) diff --git a/vaep/models/ae.py b/vaep/models/ae.py index fd5d081a1..8295c8560 100644 --- a/vaep/models/ae.py +++ b/vaep/models/ae.py @@ -21,7 +21,7 @@ import vaep.models import vaep.transform -from . import analysis +from vaep.models import analysis logger = logging.getLogger(__name__) diff --git a/vaep/models/collab.py b/vaep/models/collab.py index ec34a4ffb..f54ab6df2 100644 --- a/vaep/models/collab.py +++ b/vaep/models/collab.py @@ -5,14 +5,13 @@ import pandas as pd # import explicit objects for functional annotations from fastai.collab import * -from fastai.collab import (Categorify, CollabDataLoaders, IndexSplitter, - TabularCollab, TransformBlock) +from fastai.collab import (Categorify, IndexSplitter, TabularCollab, + TransformBlock) from fastai.tabular.all import * import vaep.io.dataloaders import vaep.io.datasplits - -from . 
import analysis +from vaep.models import analysis logger = logging.getLogger(__name__) @@ -49,37 +48,30 @@ def __init__(self, item_column: str = 'peptide', target_column: str = 'intensity', model_kwargs: dict = None, - batch_size: int = 64): + batch_size: int = 1_024): if datasplits.val_y is not None: - self.X, self.frac = combine_data(datasplits.train_X, - datasplits.val_y) + self.X, _ = combine_data(datasplits.train_X, + datasplits.val_y) else: - self.X, self.frac = datasplits.train_X.reset_index(), 0.0 + self.X, _ = datasplits.train_X.reset_index(), 0.0 self.batch_size = batch_size - self.dls = CollabDataLoaders.from_df(self.X, valid_pct=self.frac, - seed=42, - user_name=sample_column, - item_name=item_column, - rating_name=target_column, - bs=self.batch_size) user_name = sample_column item_name = item_column rating_name = target_column cat_names = [user_name, item_name] - ratings = self.X splits = None if datasplits.val_y is not None: idx_splitter = IndexSplitter( - list(range(len(datasplits.train_X), len(datasplits.train_X) + len(datasplits.val_y)))) + list(range(len(datasplits.train_X), len(self.X)))) splits = idx_splitter(self.X) - to = TabularCollab( - ratings, - [Categorify], - cat_names, + self.to = TabularCollab( + self.X, + procs=[Categorify], + cat_names=cat_names, y_names=[rating_name], y_block=TransformBlock(), splits=splits) - self.dls = to.dataloaders(path='.', bs=self.batch_size) + self.dls = self.to.dataloaders(path='.', bs=self.batch_size) self.params = {} if model_kwargs is None: model_kwargs = {} diff --git a/vaep/pandas/__init__.py b/vaep/pandas/__init__.py index 97520bb02..5f82204b1 100644 --- a/vaep/pandas/__init__.py +++ b/vaep/pandas/__init__.py @@ -7,7 +7,30 @@ import omegaconf import pandas as pd -from .calc_errors import calc_errors_per_feat, get_absolute_error +from vaep.pandas.calc_errors import calc_errors_per_feat, get_absolute_error + +__all__ = [ + 'calc_errors_per_feat', + 'get_absolute_error', + 'unique_cols', + 'get_unique_non_unique_columns', + 'prop_unique_index', + 'replace_with', + 'index_to_dict', + 'get_columns_accessor', + 'get_columns_accessor_from_iterable', + 'select_max_by', + 'get_columns_namedtuple', + 'highlight_min', + '_add_indices', + 'interpolate', + 'flatten_dict_of_dicts', + 'key_map', + 'parse_query_expression', + 'length', + 'get_last_index_matching_proportion', + 'get_lower_whiskers', + 'get_counts_per_bin'] def unique_cols(s: pd.Series) -> bool: @@ -285,16 +308,15 @@ def get_lower_whiskers(df: pd.DataFrame, factor: float = 1.5) -> pd.Series: return ret -def get_counts_per_bin(df: pd.DataFrame, bins: range, columns: Optional[List[str]] = None) -> pd.DataFrame: +def get_counts_per_bin(df: pd.DataFrame, + bins: range, + columns: Optional[List[str]] = None) -> pd.DataFrame: """Return counts per bin for selected columns in DataFrame.""" counts_per_bin = dict() if columns is None: columns = df.columns.to_list() for col in columns: - _series = (pd.cut(df[col], bins=bins) - .to_frame() - .groupby(col) - .size()) + _series = (pd.cut(df[col], bins=bins).to_frame().groupby(col).size()) _series.index.name = 'bin' counts_per_bin[col] = _series counts_per_bin = pd.DataFrame(counts_per_bin) diff --git a/vaep/plotting/__init__.py b/vaep/plotting/__init__.py index 17cc86ced..105c183f8 100644 --- a/vaep/plotting/__init__.py +++ b/vaep/plotting/__init__.py @@ -11,11 +11,8 @@ import seaborn import vaep.pandas - -from . import data, defaults, errors, plotly -from .errors import plot_rolling_error - -# from . 
diff --git a/vaep/pandas/__init__.py b/vaep/pandas/__init__.py
index 97520bb02..5f82204b1 100644
--- a/vaep/pandas/__init__.py
+++ b/vaep/pandas/__init__.py
@@ -7,7 +7,30 @@
 import omegaconf
 import pandas as pd
 
-from .calc_errors import calc_errors_per_feat, get_absolute_error
+from vaep.pandas.calc_errors import calc_errors_per_feat, get_absolute_error
+
+__all__ = [
+    'calc_errors_per_feat',
+    'get_absolute_error',
+    'unique_cols',
+    'get_unique_non_unique_columns',
+    'prop_unique_index',
+    'replace_with',
+    'index_to_dict',
+    'get_columns_accessor',
+    'get_columns_accessor_from_iterable',
+    'select_max_by',
+    'get_columns_namedtuple',
+    'highlight_min',
+    '_add_indices',
+    'interpolate',
+    'flatten_dict_of_dicts',
+    'key_map',
+    'parse_query_expression',
+    'length',
+    'get_last_index_matching_proportion',
+    'get_lower_whiskers',
+    'get_counts_per_bin']
 
 
 def unique_cols(s: pd.Series) -> bool:
@@ -285,16 +308,15 @@ def get_lower_whiskers(df: pd.DataFrame, factor: float = 1.5) -> pd.Series:
     return ret
 
 
-def get_counts_per_bin(df: pd.DataFrame, bins: range, columns: Optional[List[str]] = None) -> pd.DataFrame:
+def get_counts_per_bin(df: pd.DataFrame,
+                       bins: range,
+                       columns: Optional[List[str]] = None) -> pd.DataFrame:
     """Return counts per bin for selected columns in DataFrame."""
     counts_per_bin = dict()
     if columns is None:
         columns = df.columns.to_list()
     for col in columns:
-        _series = (pd.cut(df[col], bins=bins)
-                   .to_frame()
-                   .groupby(col)
-                   .size())
+        _series = (pd.cut(df[col], bins=bins).to_frame().groupby(col).size())
         _series.index.name = 'bin'
         counts_per_bin[col] = _series
     counts_per_bin = pd.DataFrame(counts_per_bin)
diff --git a/vaep/plotting/__init__.py b/vaep/plotting/__init__.py
index 17cc86ced..105c183f8 100644
--- a/vaep/plotting/__init__.py
+++ b/vaep/plotting/__init__.py
@@ -11,11 +11,8 @@
 import seaborn
 
 import vaep.pandas
-
-from . import data, defaults, errors, plotly
-from .errors import plot_rolling_error
-
-# from .defaults import order_categories, labels_dict, IDX_ORDER
+from vaep.plotting import data, defaults, errors, plotly
+from vaep.plotting.errors import plot_rolling_error
 
 seaborn.set_style("whitegrid")
 # seaborn.set_theme()
diff --git a/vaep/sampling.py b/vaep/sampling.py
index 78939c59b..ae1aaaa1a 100644
--- a/vaep/sampling.py
+++ b/vaep/sampling.py
@@ -1,13 +1,11 @@
 import logging
-from typing import Union, Tuple
-
+from typing import Tuple, Union
 
 import numpy as np
 import pandas as pd
 
 from vaep.io.datasplits import DataSplits
 
-
 logger = logging.getLogger(__name__)
 
 
@@ -184,8 +182,7 @@ def sample_mnar_mcar(df_long: pd.DataFrame,
     return splits, thresholds, fake_na_mcar, fake_na_mnar
 
 
-def get_thresholds(df_long: pd.DataFrame,
-                   frac_non_train: float,
+def get_thresholds(df_long: pd.DataFrame, frac_non_train: float,
                    random_state: int) -> pd.Series:
     """Get thresholds for MNAR/MCAR sampling.
     Thresholds are sampled from a normal distrubiton with a mean of the quantile of the simulated missing data.
@@ -207,10 +204,47 @@ def get_thresholds(df_long: pd.DataFrame,
     """
    quantile_frac = df_long.quantile(frac_non_train)
     rng = np.random.default_rng(random_state)
-    thresholds = pd.Series(rng.normal(loc=float(quantile_frac),
-                                      scale=float(0.3 * df_long.std()),
-                                      size=len(df_long),
-                                      ),
-                           index=df_long.index,
-                           )
+    thresholds = pd.Series(
+        rng.normal(
+            loc=float(quantile_frac),
+            scale=float(0.3 * df_long.std()),
+            size=len(df_long),
+        ),
+        index=df_long.index,
+    )
     return thresholds
+
+
+def check_split_integrity(splits: DataSplits) -> DataSplits:
+    """Check if feature IDs are only in the validation or test data (rare cases).
+    Returns the corrected splits."""
+    diff = (splits
+            .val_y
+            .index
+            .levels[-1]
+            .difference(splits
+                        .train_X
+                        .index
+                        .levels[-1]
+                        ).to_list())
+    if diff:
+        logger.warning(f"Remove from val: {diff}")
+        to_remove = splits.val_y.loc[pd.IndexSlice[:, diff]]
+        splits.train_X = pd.concat([splits.train_X, to_remove])
+        splits.val_y = splits.val_y.drop(to_remove.index)
+
+    diff = (splits
+            .test_y
+            .index
+            .levels[-1]
+            .difference(splits
+                        .train_X
+                        .index
+                        .levels[-1]
+                        ).to_list())
+    if diff:
+        logger.warning(f"Remove from test: {diff}")
+        to_remove = splits.test_y.loc[pd.IndexSlice[:, diff]]
+        splits.train_X = pd.concat([splits.train_X, to_remove])
+        splits.test_y = splits.test_y.drop(to_remove.index)
+    return splits
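The new `vaep.sampling.check_split_integrity` moves features that occur only in the validation or test split back into the training split. A minimal usage sketch (it assumes an already created `DataSplits` object named `splits` with a sample/feature MultiIndex; nothing is constructed here):

```python
import vaep.sampling

# `splits` is assumed to exist already: a vaep.io.datasplits.DataSplits with
# train_X, val_y and test_y indexed by a (sample, feature) MultiIndex
splits = vaep.sampling.check_split_integrity(splits)
```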
diff --git a/vaep/sklearn/cf_transformer.py b/vaep/sklearn/cf_transformer.py
index ac152a92b..022973e6e 100644
--- a/vaep/sklearn/cf_transformer.py
+++ b/vaep/sklearn/cf_transformer.py
@@ -8,12 +8,13 @@
 from fastai import learner
 from fastai.callback.tracker import EarlyStoppingCallback
 from fastai.collab import *
-from fastai.collab import CollabDataLoaders, EmbeddingDotBias, TabularCollab
+from fastai.collab import EmbeddingDotBias, TabularCollab
 from fastai.data.block import TransformBlock
 from fastai.data.transforms import IndexSplitter
 from fastai.learner import Learner
 from fastai.losses import MSELossFlat
 from fastai.tabular.all import *
+from fastai.tabular.all import TransformBlock
 from fastai.tabular.core import Categorify
 from fastai.torch_core import default_device
 from sklearn.base import BaseEstimator, TransformerMixin
@@ -89,21 +90,16 @@ def fit(self, X: pd.Series, y: pd.Series = None,
         if not cuda:
             default_device(use=False)  # set to cpu
         if y is not None:
-            X, frac = collab.combine_data(X, y)
+            # Concatenate train and validation observations into one dataframe
+            first_N_train = len(X)
+            X, _ = collab.combine_data(X, y)
         else:
-            X, frac = X.reset_index(), 0.0
-
-        self.dls = CollabDataLoaders.from_df(
-            X,
-            valid_pct=frac,
-            seed=42,
-            user_name=self.sample_column,
-            item_name=self.item_column,
-            rating_name=self.target_column,
-            bs=self.batch_size)
+            X, _ = X.reset_index(), 0.0
+
         splits = None
         if y is not None:
-            idx_splitter = IndexSplitter(list(range(len(X), len(X) + len(y))))
+            # specify positional indices of validation data
+            idx_splitter = IndexSplitter(list(range(first_N_train, len(X))))
             splits = idx_splitter(X)
 
         self.cat_names = [self.sample_column, self.item_column]