
Commit

Dev (#69) Fix bug in CollaborativeFilteringTransformer (allow validation data)

- update workflows (robustness)
- add MNAR-MCAR sampling to the tutorial; move the sanity check into the package
- fix a bug in CollaborativeFilteringTransformer which did not allow a validation split during training
- add filtering of the initial data to the data package
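The MNAR-MCAR sampling mentioned above distinguishes values missing completely at random (MCAR, removal independent of intensity) from values missing not at random (MNAR, e.g. low intensities falling below a detection limit). A minimal, self-contained sketch of the two masking schemes; the function names and the quantile-threshold rule are illustrative, not the package API:

```python
import numpy as np

rng = np.random.default_rng(42)


def mask_mcar(x: np.ndarray, frac: float) -> np.ndarray:
    """MCAR: mask a fraction of entries uniformly at random."""
    mask = rng.random(x.shape) < frac
    out = x.copy()
    out[mask] = np.nan
    return out


def mask_mnar(x: np.ndarray, frac: float) -> np.ndarray:
    """MNAR (sketch): mask entries below the `frac` quantile,
    mimicking a lower detection limit."""
    threshold = np.quantile(x, frac)
    out = x.copy()
    out[x < threshold] = np.nan
    return out


# simulated log-intensities: 100 samples x 50 features
intensities = rng.normal(loc=25, scale=2, size=(100, 50))
mcar = mask_mcar(intensities, 0.1)
mnar = mask_mnar(intensities, 0.1)
# MNAR removes preferentially low values, so the mean of the
# remaining values shifts upward; MCAR leaves it roughly unchanged.
```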
Henry Webel authored Jun 13, 2024
2 parents 0874f5f + ea902a6 commit f22b859
Showing 29 changed files with 526 additions and 1,040 deletions.
14 changes: 11 additions & 3 deletions .github/workflows/ci.yaml
@@ -67,19 +67,27 @@ jobs:
run: |
cd project
mkdir runs
papermill 04_1_train_DAE_VAE_wo_val_data.ipynb runs/04_1_train_DAE_VAE_wo_val_data.ipynb
papermill 04_1_train_pimms_models.ipynb runs/04_1_train_pimms_models.ipynb
- name: Run demo workflow (integration test)
papermill 04_1_train_pimms_models.ipynb runs/04_1_train_pimms_models_no_val.ipynb -p sample_splits False
- name: Dry-Run demo workflow (integration test)
continue-on-error: true
run: |
cd project
snakemake -p -c1 --configfile config/single_dev_dataset/example/config.yaml -n
- name: Run demo workflow (integration test)
continue-on-error: true
run: |
cd project
snakemake -p -c4 -k --configfile config/single_dev_dataset/example/config.yaml
- name: Run demo workflow again (in case of installation issues)
continue-on-error: true
run: |
cd project
snakemake -p -c1 -n --configfile config/single_dev_dataset/example/config.yaml
snakemake -p -c4 -k --configfile config/single_dev_dataset/example/config.yaml
- name: Run demo workflow again (in case of installation issues) - one thread
run: |
cd project
snakemake -p -c1 --configfile config/single_dev_dataset/example/config.yaml
- name: Archive results
# https://github.com/actions/upload-artifact
uses: actions/upload-artifact@v4
8 changes: 5 additions & 3 deletions .github/workflows/test_pkg_on_colab.yaml
@@ -1,10 +1,10 @@
name: Test that tutorial runs on latest colab image

on:
push:
branches: [dev]
# push:
# branches: [main]
pull_request:
branches: [main, dev]
branches: [main]
schedule:
- cron: '0 2 3 * *'

@@ -24,3 +24,5 @@ jobs:
run: |
cd project
papermill 04_1_train_pimms_models.ipynb 04_1_train_pimms_models_output.ipynb
papermill 04_1_train_pimms_models.ipynb 04_1_train_pimms_models_no_val.ipynb -p sample_splits False
6 changes: 6 additions & 0 deletions .github/workflows/workflow_website.yaml
@@ -39,9 +39,15 @@ jobs:
cd project
snakemake -s workflow/Snakefile_v2.smk --configfile config/alzheimer_study/config.yaml -p -c4 -k
- name: Run demo workflow again (in case of installation issues)
continue-on-error: true
run: |
cd project
snakemake -s workflow/Snakefile_v2.smk --configfile config/alzheimer_study/config.yaml -p -c4 -k
- name: Run demo workflow again (in case of installation issues) with one thread
continue-on-error: true
run: |
cd project
snakemake -s workflow/Snakefile_v2.smk --configfile config/alzheimer_study/config.yaml -p -c1 -k
- name: Run differential analysis workflow
run: |
cd project
19 changes: 13 additions & 6 deletions README.md
@@ -11,7 +11,7 @@ The publication is accepted in Nature Communications
and the pre-print is available [on biorxiv](https://doi.org/10.1101/2023.01.12.523792).

> `PIMMS` was called `vaep` during development.
> Before entire refactoring has to been completed the imported package will be `vaep`.
> Before entire refactoring has been completed the imported package will be `vaep`.
We provide functionality as a python package, an excutable workflow or simply in notebooks.

@@ -127,17 +127,24 @@ mamba env create -n pimms -f environment.yml # faster, less then 5mins

If on Mac M1, M2 or having otherwise issue using your accelerator (e.g. GPUs): Install the pytorch dependencies first, then the rest of the environment:

### Install pytorch first (M-chips)
### Install pytorch first

> :warning: We currently see issues with some installations on M1 chips. A dependency
> for one workflow is polars, which causes the issue. This should be [fixed now](https://github.com/RasmussenLab/njab/pull/13)
> for general use by delayed import
> of `mrmr-selection` in `njab`. If you encounter issues, please open an issue.
Check how to install pytorch for your system [here](https://pytorch.org/get-started).

- select the version compatible with your cuda version if you have an nvidia gpu or a Mac M-chip.

```bash
conda create -n vaep python=3.9 pip
conda activate vaep
# Follow instructions on https://pytorch.org/get-started
# conda env update -f environment.yml -n vaep # should not install the rest.
conda create -n pimms python=3.9 pip
conda activate pimms
# Follow instructions on https://pytorch.org/get-started:
# CUDA is not available on MacOS, please use default package
# pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
conda install pytorch::pytorch torchvision torchaudio fastai -c pytorch -c fastai -y
pip install pimms-learn
pip install jupyterlab papermill # use run notebook interactively or as a script

54 changes: 1 addition & 53 deletions project/01_0_split_data.ipynb
@@ -1370,48 +1370,7 @@
"# -> or raise error as feature completness treshold is so low that less than 3 samples\n",
"# per feature are allowd.\n",
"\n",
"diff = (splits\n",
" .val_y\n",
" .index\n",
" .levels[-1]\n",
" .difference(splits\n",
" .train_X\n",
" .index\n",
" .levels[-1]\n",
" ).to_list())\n",
"if diff:\n",
" to_remove = splits.val_y.loc[pd.IndexSlice[:, diff]]\n",
" display(to_remove)\n",
" splits.train_X = pd.concat([splits.train_X, to_remove])\n",
" splits.val_y = splits.val_y.drop(to_remove.index)\n",
"diff"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"hide-input"
]
},
"outputs": [],
"source": [
"diff = (splits\n",
" .test_y\n",
" .index\n",
" .levels[-1]\n",
" .difference(splits\n",
" .train_X\n",
" .index\n",
" .levels[-1]\n",
" ).to_list())\n",
"if diff:\n",
" to_remove = splits.test_y.loc[pd.IndexSlice[:, diff]]\n",
" display(to_remove)\n",
" splits.train_X = pd.concat([splits.train_X, to_remove])\n",
" splits.test_y = splits.test_y.drop(to_remove.index)\n",
"diff"
"splits = vaep.sampling.check_split_integrity(splits)"
]
},
{
@@ -1812,17 +1771,6 @@
"writer.close()\n",
"dumps"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"hide-input"
]
},
"outputs": [],
"source": []
}
],
"metadata": {
35 changes: 1 addition & 34 deletions project/01_0_split_data.py
@@ -724,38 +724,7 @@ def join_as_str(seq):
# -> or raise error as feature completness treshold is so low that less than 3 samples
# per feature are allowd.

diff = (splits
.val_y
.index
.levels[-1]
.difference(splits
.train_X
.index
.levels[-1]
).to_list())
if diff:
to_remove = splits.val_y.loc[pd.IndexSlice[:, diff]]
display(to_remove)
splits.train_X = pd.concat([splits.train_X, to_remove])
splits.val_y = splits.val_y.drop(to_remove.index)
diff

# %% tags=["hide-input"]
diff = (splits
.test_y
.index
.levels[-1]
.difference(splits
.train_X
.index
.levels[-1]
).to_list())
if diff:
to_remove = splits.test_y.loc[pd.IndexSlice[:, diff]]
display(to_remove)
splits.train_X = pd.concat([splits.train_X, to_remove])
splits.test_y = splits.test_y.drop(to_remove.index)
diff
splits = vaep.sampling.check_split_integrity(splits)

# %% [markdown]
# Some tools require at least 4 observation in the training data,
@@ -963,5 +932,3 @@ def join_as_str(seq):
# %% tags=["hide-input"]
writer.close()
dumps

# %% tags=["hide-input"]
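The two inlined cells removed above were consolidated into `vaep.sampling.check_split_integrity`. The logic visible in the removed code: any feature (last index level) that appears in the validation or test split but never in the training split is moved back into training, so the model is not asked to predict features it has never seen. A standalone sketch of that step; the helper name and the plain `Series` splits are simplified stand-ins, not the package API:

```python
import pandas as pd


def move_unseen_features_to_train(train: "pd.Series",
                                  other: "pd.Series"):
    """Move rows of `other` whose feature (last index level) does not
    occur in `train` back into `train`; return both updated splits."""
    diff = (other.index.get_level_values(-1).unique()
            .difference(train.index.get_level_values(-1).unique())
            .to_list())
    if diff:
        to_move = other.loc[pd.IndexSlice[:, diff]]
        train = pd.concat([train, to_move])
        other = other.drop(to_move.index)
    return train, other


# toy long-format splits with a (sample, feature) MultiIndex
train_idx = pd.MultiIndex.from_tuples(
    [("s1", "A"), ("s1", "B"), ("s2", "A")], names=["sample", "feature"])
train_X = pd.Series([1.0, 2.0, 3.0], index=train_idx)
val_idx = pd.MultiIndex.from_tuples(
    [("s1", "C"), ("s2", "B")], names=["sample", "feature"])
val_y = pd.Series([4.0, 5.0], index=val_idx)

train_X, val_y = move_unseen_features_to_train(train_X, val_y)
# feature "C" occurred only in the validation split,
# so its single observation is moved into training
```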
1 change: 0 additions & 1 deletion project/01_1_train_CF.ipynb
@@ -98,7 +98,6 @@
"# model\n",
"# Dimensionality of encoding dimension (latent space of model)\n",
"latent_dim: int = 10\n",
"# hidden_layers:str = '128_64' # Underscore separated string of layers, '128 64' for the encoder, reversed for decoder\n",
"sample_idx_position: int = 0 # position of index which is sample ID\n",
"model: str = 'CF' # model name\n",
"model_key: str = 'CF' # potentially alternative key for model (grid search)\n",
1 change: 0 additions & 1 deletion project/01_1_train_CF.py
@@ -69,7 +69,6 @@
# model
# Dimensionality of encoding dimension (latent space of model)
latent_dim: int = 10
# hidden_layers:str = '128_64' # Underscore separated string of layers, '128 64' for the encoder, reversed for decoder
sample_idx_position: int = 0 # position of index which is sample ID
model: str = 'CF' # model name
model_key: str = 'CF' # potentially alternative key for model (grid search)
6 changes: 6 additions & 0 deletions project/01_1_train_NAGuideR_methods.R
@@ -20,6 +20,8 @@
# - BiocManager could be moved to methods who are installed from BioConductor

# + tags=["hide-input"] vscode={"languageId": "r"}
# options("install.lock"=FALSE)

packages_base_R <-
c("BiocManager", "reshape2", "data.table", "readr", "tibble")

@@ -130,6 +132,7 @@ nafunctions <- function(x, method = "zero") {
else if (method == "qrilc") {
install_bioconductor("impute")
install_bioconductor("pcaMethods")
install_rpackage('gmm')
install_rpackage('imputeLCMD')
xxm <- t(df1)
data_zero1 <-
@@ -139,13 +142,15 @@
else if (method == "mindet") {
install_bioconductor("impute")
install_bioconductor("pcaMethods")
install_rpackage('gmm')
install_rpackage('imputeLCMD')
xxm <- as.matrix(df1)
df <- imputeLCMD::impute.MinDet(xxm, q = 0.01)
}
else if (method == "minprob") {
install_bioconductor("impute")
install_bioconductor("pcaMethods")
install_rpackage('gmm')
install_rpackage('imputeLCMD')
xxm <- as.matrix(df1)
df <-
@@ -278,6 +283,7 @@

install_bioconductor("impute")
install_bioconductor("pcaMethods")
install_rpackage('gmm')
install_rpackage('imputeLCMD')
install_rpackage("magrittr")
install_rpackage("glmnet")
6 changes: 6 additions & 0 deletions project/01_1_train_NAGuideR_methods.ipynb
@@ -26,6 +26,8 @@
},
"outputs": [],
"source": [
"# options(\"install.lock\"=FALSE)\n",
"\n",
"packages_base_R <-\n",
" c(\"BiocManager\", \"reshape2\", \"data.table\", \"readr\", \"tibble\")\n",
"\n",
@@ -160,6 +162,7 @@
" else if (method == \"qrilc\") {\n",
" install_bioconductor(\"impute\")\n",
" install_bioconductor(\"pcaMethods\")\n",
" install_rpackage('gmm')\n",
" install_rpackage('imputeLCMD')\n",
" xxm <- t(df1)\n",
" data_zero1 <-\n",
@@ -169,13 +172,15 @@
" else if (method == \"mindet\") {\n",
" install_bioconductor(\"impute\")\n",
" install_bioconductor(\"pcaMethods\")\n",
" install_rpackage('gmm')\n",
" install_rpackage('imputeLCMD')\n",
" xxm <- as.matrix(df1)\n",
" df <- imputeLCMD::impute.MinDet(xxm, q = 0.01)\n",
" }\n",
" else if (method == \"minprob\") {\n",
" install_bioconductor(\"impute\")\n",
" install_bioconductor(\"pcaMethods\")\n",
" install_rpackage('gmm')\n",
" install_rpackage('imputeLCMD')\n",
" xxm <- as.matrix(df1)\n",
" df <-\n",
@@ -308,6 +313,7 @@
" \n",
" install_bioconductor(\"impute\")\n",
" install_bioconductor(\"pcaMethods\")\n",
" install_rpackage('gmm')\n",
" install_rpackage('imputeLCMD')\n",
" install_rpackage(\"magrittr\")\n",
" install_rpackage(\"glmnet\")\n",
