Merge pull request #48 from RasmussenLab/dev
Scikit-Learn interface
Henry Webel authored Sep 6, 2023
2 parents 95b6e9e + 8f42d8a commit 275d933
Showing 104 changed files with 3,419 additions and 38,153 deletions.
77 changes: 65 additions & 12 deletions .github/workflows/ci.yaml
@@ -4,30 +4,33 @@ on:
branches: [main, dev]
pull_request:
branches: [main, dev]
schedule:
- cron: '0 2 * * 3,6'
jobs:
run-tests:
runs-on: ubuntu-20.04
runs-on: ${{ matrix.os }}
defaults:
run:
shell: bash -el {0}
strategy:
fail-fast: false
matrix:
os: ["ubuntu-latest", "macos-latest", "windows-latest"]
os: ["ubuntu-latest",
"macos-13",
# "windows-latest" # rrcovNA cannot be built from source on windows-server
]
python-version: ["3.8"]
steps:
- name: Checkout
uses: actions/checkout@v2
- name: check files
run: |
ls ${{ github.workspace }}
- name: Set up Miniconda
# ! change action https://github.com/mamba-org/setup-micromamba
uses: conda-incubator/setup-miniconda@v2
with:
miniforge-variant: Mambaforge
# miniforge-version: latest
use-mamba: true
channel-priority: strict
channel-priority: disabled
python-version: ${{ matrix.python-version }}
environment-file: environment.yml
activate-environment: vaep
@@ -46,23 +49,73 @@ jobs:
# # currently part of environment
# - name: Install package and install library
# run: |
# pip install pytest
- name: Run Tests on installed package
run: pytest .
# pip install pytest pytest-cov
- name: Run Unit tests on installed package
run: |
pytest .
- name: View papermill help message for notebooks (as scripts)
run: |
cd project
papermill 01_0_split_data.ipynb --help-notebook
papermill 01_1_train_VAE.ipynb --help-notebook
papermill 01_1_train_DAE.ipynb --help-notebook
papermill 01_1_train_CF.ipynb --help-notebook
- name: Run demo workflow
- name: Run demo workflow (integration test)
run: |
cd project
snakemake -p -c1 --configfile config/single_dev_dataset/example/config.yaml -n
snakemake -p -c1 --configfile config/single_dev_dataset/example/config.yaml
snakemake -p -c2 -k --configfile config/single_dev_dataset/example/config.yaml
- name: Archive results
uses: actions/upload-artifact@v3
with:
name: example-workflow-results-${{ matrix.os }}
path: project/runs/example/01_2_performance_plots.html
path: project/runs/example/01_2_performance_plots.html

test_pip_pkg_install:
runs-on: ${{ matrix.os }}
name: test-pip-installation
strategy:
fail-fast: false
matrix:
os: ["ubuntu-latest", "macos-latest", "windows-latest"]
python-version: ["3.8"]
steps:
- uses: actions/checkout@v3

- uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}

- name: install pimms
run: python -m pip install .

- name: Install pytest
run: python -m pip install pytest pytest-cov

- name: Run pytest
run: pytest .


publish:
name: Publish package
if: startsWith(github.event.ref, 'refs/tags/v')
needs:
- run-tests
- test_pip_pkg_install
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3

- uses: actions/setup-python@v4
with:
python-version: "3.8"
- name: Install twine and build
run: python -m pip install --upgrade twine build
- name: Build
run: python -m build

- uses: pypa/gh-action-pypi-publish@release/v1
with:
user: __token__
password: ${{ secrets.PYPI_API_TOKEN }}
3 changes: 2 additions & 1 deletion .gitignore
Expand Up @@ -44,4 +44,5 @@ workflows/maxquant/out/

# builds
docs/_*
docs/source
docs/reference
build
35 changes: 35 additions & 0 deletions .readthedocs.yaml
@@ -0,0 +1,35 @@
# .readthedocs.yaml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

# Required
version: 2

# Set the OS, Python version and other tools you might need
build:
os: ubuntu-22.04
tools:
python: "3.8"
# You can also specify other tool versions:
# nodejs: "19"
# rust: "1.64"
# golang: "1.19"

# Build documentation in the "docs/" directory with Sphinx
sphinx:
configuration: docs/conf.py

# Optionally build your docs in additional formats such as PDF and ePub
# formats:
# - pdf
# - epub

# Optional but recommended, declare the Python requirements required
# to build your documentation
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
python:
install:
- method: pip
path: .
extra_requirements:
- docs
113 changes: 72 additions & 41 deletions README.md
Expand Up @@ -2,22 +2,21 @@

PIMMS stands for Proteomics Imputation Modeling Mass Spectrometry
and is a homage to our dear British friends
who are missing as part of the EU for far too long already.
(Pimms is also a british summer drink)
who are missing as part of the EU for far too long already
(Pimms is also a British summer drink).

The pre-print is available [on biorxiv](https://www.biorxiv.org/content/10.1101/2023.01.12.523792v1).
The pre-print is available [on biorxiv](https://doi.org/10.1101/2023.01.12.523792).


> `PIMMS`was called `vaep` during development.
> `PIMMS` was called `vaep` during development.
> Until the entire refactoring is completed, the imported package will be
`vaep`.

We provide functionality as a python package and executable workflows and notebooks
under the [`project`](project) folder, including an example.
We provide functionality as a python package, an executable workflow and notebooks.

The models can be used with the scikit-learn interface in the spirit of other scikit-learn imputers. You can try this in colab. [![open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/RasmussenLab/pimms/blob/dev/project/04_1_train_pimms_models.ipynb)


The [`workflows`](workflows) folder contains snakemake workflows used for rawfile data processing,
both for [running MaxQuant](workflows/maxquant) over a large set of HeLa raw files
and ThermoRawFileParser on a list of raw files to [extract their meta data](workflows/metadata).

## Notebooks as scripts using papermill

@@ -33,13 +32,25 @@ papermill 01_1_train_vae.ipynb --help-notebook

> Mistyped argument names won't throw an error when using papermill
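Why a mistyped name passes silently can be sketched with a simplified model (an assumption about papermill's observed behaviour, not its actual implementation, and the parameter names below are made up for illustration): papermill injects every parameter you pass into the notebook, so an unknown name simply becomes an unused variable instead of raising an error.

```python
# Simplified sketch of papermill-style parameter injection (assumption:
# this models the observed behaviour, not papermill's real code).
defaults = {"fn_rawfile_metadata": "data/metadata.csv", "epochs_max": 10}


def inject_parameters(defaults: dict, passed: dict) -> dict:
    """Merge passed parameters over the notebook's default parameters."""
    merged = dict(defaults)
    merged.update(passed)  # unknown keys are added silently, no error raised
    return merged


# overrides the default as intended
print(inject_parameters(defaults, {"epochs_max": 5}))

# typo: 'epochs_max' keeps its default value, 'epochs_mxa' is silently added
print(inject_parameters(defaults, {"epochs_mxa": 5}))
```

Hence it pays to double-check parameter names against the `--help-notebook` output shown above.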
### Outlook
### Python package

For interactive use of the models provided in PIMMS, you can use our
[python package `pimms-learn`](https://pypi.org/project/pimms-learn/).
The interface is similar to scikit-learn.


```
pip install pimms-learn
```


We also plan to provide functionality and examples to interactive use of the
models developed in PIMMS.
Then you can use the models on a pandas DataFrame with missing values. Try this in the tutorial on Colab:
[![open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/RasmussenLab/pimms/blob/dev/project/04_1_train_pimms_models.ipynb)
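The fit/transform pattern the package follows can be sketched as below (a minimal sketch using scikit-learn's own `SimpleImputer` as a stand-in so it runs without PIMMS installed; the actual PIMMS estimator classes and their arguments are shown in the Colab tutorial):

```python
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer  # stand-in for a PIMMS imputer

# A small intensity table with missing values (NaNs)
df = pd.DataFrame(
    {"prot_A": [10.2, np.nan, 9.8], "prot_B": [np.nan, 12.1, 11.9]},
    index=["sample_1", "sample_2", "sample_3"],
)

# scikit-learn imputer pattern: fit on observed values, transform fills NaNs
imputer = SimpleImputer(strategy="mean")
imputed = pd.DataFrame(
    imputer.fit_transform(df), index=df.index, columns=df.columns
)
assert imputed.isna().sum().sum() == 0  # all missing values are now filled
```

Since the interface is similar to scikit-learn, a PIMMS model should slot into the same two calls, with only the estimator line changing.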

## Setup
The package is not yet available as a standalone software on pypi. Currently we use

## Setup for PIMMS comparison workflow

The package is available as standalone software on PyPI. However, running the entire snakemake workflow is enabled using
conda (or mamba) and pip to set up the environment. For a detailed description of setting up
conda (or mamba), see [instructions on setting up a virtual environment](docs/venv_setup.md).

@@ -58,32 +69,56 @@ conda env create -n pimms -f environment.yml # slower
mamba env create -n pimms -f environment.yml # faster, less than 5 mins
```

If on Mac M1: use `environment_m1.yaml` where cudatoolkit is removed.
If on Mac M1, M2 or otherwise having issues using your accelerator (e.g. GPUs): install the pytorch dependencies first, then the rest of the environment.

### Install development dependencies

Check how to install pytorch for your system [here](https://pytorch.org/get-started/previous-versions/#v1131).

- select the version compatible with your cuda version if you have an nvidia gpu

```bash
conda create -n vaep_manuel python=3.8 pip
conda activate vaep_manuel
conda update pytorch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 pytorch-cuda=11.7 -c pytorch -c nvidia # might be different
pip install . # pimms-learn
pip install papermill jupyterlab # use run notebook interactive or as a script
cd project
papermill 04_1_train_pimms_models.ipynb 04_1_train_pimms_models_test.ipynb # second notebook is output
jupyter nbconvert --execute --to notebook 04_1_train_pimms_models.ipynb # just execute the code
# jupyter lab # open 04_1_train_pimms_models.ipynb
```
conda env create -n pimms -f environment_m1.yml # slower
mamba env create -n pimms -f environment_m1.yml # faster, less than 5 mins

### Entire development installation


```bash
conda create -n pimms_dev -c pytorch -c nvidia -c fastai -c bioconda -c plotly -c conda-forge --file requirements.txt --file requirements_R.txt --file requirements_dev.txt
pip install -e . # other pip dependencies missing
snakemake --configfile config/single_dev_dataset/example/config.yaml -F -n
```

If on Windows: use `environment_win.yaml` where ~~two R-Bioconductor~~ R-packages (see note below) are removed as
no binaries are available for Windows. You will need to install these manually afterwards if you want to use methods implemented in R.
or if you want to update an existing environment

> Note: Turns out that installing dependencies partly by conda and partly manually
using `BiocManager` does not work.

```
conda env create -n pimms -f environment_win.yml # slower
mamba env create -n pimms -f environment_win.yml # faster, less than 5 mins
# Then if R packages are needed, they are installed on the fly for Windows.
# Could be used as well for MacOS or Linux.
conda update -c defaults -c conda-forge -c fastai -c bioconda -c plotly --file requirements.txt --file requirements_R.txt --file requirements_dev.txt
```

or using the environment.yml file (can fail on certain systems)

```
conda env create -f environment.yml
```


### Troubleshooting

Troubleshoot your R installation by opening jupyter lab

```
# in projects folder
jupyter lab # open 01_1_train_NAGuideR.ipynb
```
## Run Demo
@@ -191,25 +226,21 @@ From the brief description in the table the exact procedure is not always clear.



<!-- ### Setup using pip
## Workflows

> Dependencies are currently provided through `environment.yml`, see above
The workflows folder in the repository contains snakemake workflows used for rawfile data processing,
both for running MaxQuant over a large set of HeLa raw files
and ThermoRawFileParser on a list of raw files to extract their meta data. For details see:

From GitHub
```
pip install git+https://github.com/RasmussenLab/pimms.git
```
> Webel, Henry, Yasset Perez-Riverol, Annelaura Bach Nielson, and Simon Rasmussen. 2023. “Mass Spectrometry-Based Proteomics Data from Thousands of HeLa Control Samples.” Research Square. https://doi.org/10.21203/rs.3.rs-3083547/v1.
Using the clone repository
```
pip install /path/to/cloned/folder
```
### MaxQuant

And using the cloned repository for an editable installation
```
pip install -e /path/to/cloned/folder
```
Process single raw files using MaxQuant. See [README](workflows/maxquant/README.md) for details.

## Overview vaep package -->
### Metadata

Read metadata from single raw files using MaxQuant. See [README](workflows/metadata/README.md) for details.

## Build status
[![Documentation Status](https://readthedocs.org/projects/pimms/badge/?version=latest)](https://pimms.readthedocs.io/en/latest/?badge=latest)
20 changes: 18 additions & 2 deletions docs/README.md
@@ -6,11 +6,27 @@ In order to build the docs you need to
2. build the package reference files
3. run sphinx to create a local html version

Command to be run from `path/to/vaep/docs`, i.e. from within the `docs` package folder:
Command to be run from `path/to/pimms/docs`, i.e. from within the `docs` package folder:

```cmd
# pip install pimms[docs]
# pwd: ./vaep/docs
conda env update -f environment.yml
sphinx-apidoc -o source ../vaep
sphinx-apidoc -o reference ../vaep
make html
```

## Build docs

Using Sphinx command line tools.

Options:
- `--separate` to build separate pages for each (sub-)module

```cmd
# pwd: ./pimms/docs
# apidoc
sphinx-apidoc --force --implicit-namespaces --module-first -o reference ../vaep
# build docs
sphinx-build -n -W --keep-going -b html ./ ./_build/
```