diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index b1b755ec3..d0505dc5d 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -4,30 +4,33 @@ on: branches: [main, dev] pull_request: branches: [main, dev] + schedule: + - cron: '0 2 * * 3,6' jobs: run-tests: - runs-on: ubuntu-20.04 + runs-on: ${{ matrix.os }} defaults: run: shell: bash -el {0} strategy: fail-fast: false matrix: - os: ["ubuntu-latest", "macos-latest", "windows-latest"] + os: ["ubuntu-latest", + "macos-13", + # "windows-latest" # rrcovNA cannot be build from source on windows-server + ] python-version: ["3.8"] steps: - name: Checkout uses: actions/checkout@v2 - - name: check files - run: | - ls ${{ github.workspace }} - name: Set up Miniconda + # ! change action https://github.com/mamba-org/setup-micromamba uses: conda-incubator/setup-miniconda@v2 with: miniforge-variant: Mambaforge # miniforge-version: latest use-mamba: true - channel-priority: strict + channel-priority: disabled python-version: ${{ matrix.python-version }} environment-file: environment.yml activate-environment: vaep @@ -46,9 +49,10 @@ jobs: # # currently part of environment # - name: Install package and install library # run: | - # pip install pytest - - name: Run Tests on installed package - run: pytest . + # pip install pytest pytest-cov + - name: Run Unit tests on installed package + run: | + pytest . 
- name: View papermill help message for notebooks (as scripts) run: | cd project @@ -56,13 +60,62 @@ jobs: papermill 01_1_train_VAE.ipynb --help-notebook papermill 01_1_train_DAE.ipynb --help-notebook papermill 01_1_train_CF.ipynb --help-notebook - - name: Run demo workflow + - name: Run demo workflow (integration test) run: | cd project snakemake -p -c1 --configfile config/single_dev_dataset/example/config.yaml -n - snakemake -p -c1 --configfile config/single_dev_dataset/example/config.yaml + snakemake -p -c2 -k --configfile config/single_dev_dataset/example/config.yaml - name: Archive results uses: actions/upload-artifact@v3 with: name: example-workflow-results-${{ matrix.os }} - path: project/runs/example/01_2_performance_plots.html \ No newline at end of file + path: project/runs/example/01_2_performance_plots.html + + test_pip_pkg_install: + runs-on: ${{ matrix.os }} + name: test-pip-installation + strategy: + fail-fast: false + matrix: + os: ["ubuntu-latest", "macos-latest", "windows-latest"] + python-version: ["3.8"] + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4.0.0 + with: + python-version: ${{ matrix.python-version }} + + - name: install pimms + run: python -m pip install . + + - name: Install pytest + run: python -m pip install pytest pytest-cov + + - name: Run pytest + run: pytest . 
+ + + publish: + name: Publish package + if: startsWith(github.event.ref, 'refs/tags/v') + needs: + - run-tests + - test_pip_pkg_install + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4.0.0 + with: + python-version: "3.8" + - name: Install twine and build + run: python -m pip install --upgrade twine build + - name: Build + run: python -m build + + - uses: pypa/gh-action-pypi-publish@release/v1 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} \ No newline at end of file diff --git a/.gitignore b/.gitignore index eac40d1d7..1eef316c3 100644 --- a/.gitignore +++ b/.gitignore @@ -44,4 +44,5 @@ workflows/maxquant/out/ # builds docs/_* -docs/source \ No newline at end of file +docs/reference +build \ No newline at end of file diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 000000000..6e817d6be --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,35 @@ +# .readthedocs.yaml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the OS, Python version and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.8" + # You can also specify other tool versions: + # nodejs: "19" + # rust: "1.64" + # golang: "1.19" + +# Build documentation in the "docs/" directory with Sphinx +sphinx: + configuration: docs/conf.py + +# Optionally build your docs in additional formats such as PDF and ePub +# formats: +# - pdf +# - epub + +# Optional but recommended, declare the Python requirements required +# to build your documentation +# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html +python: + install: + - method: pip + path: . 
+ extra_requirements: + - docs \ No newline at end of file diff --git a/README.md b/README.md index 3d9676d50..f01304638 100644 --- a/README.md +++ b/README.md @@ -2,22 +2,21 @@ PIMMS stands for Proteomics Imputation Modeling Mass Spectrometry and is a hommage to our dear British friends -who are missing as part of the EU for far too long already. -(Pimms is also a british summer drink) +who are missing as part of the EU for far too long already +(Pimms is also a British summer drink). -The pre-print is available [on biorxiv](https://www.biorxiv.org/content/10.1101/2023.01.12.523792v1). +The pre-print is available [on biorxiv](https://doi.org/10.1101/2023.01.12.523792). -> `PIMMS`was called `vaep` during development. +> `PIMMS` was called `vaep` during development. > Before entire refactoring has to been completed the imported package will be `vaep`. -We provide functionality as a python package and excutable workflows and notebooks -under the [`project`](project) folder, inclduing an example. +We provide functionality as a python package, an executable workflow and notebooks. + +The models can be used with the scikit-learn interface in the spirit of other scikit-learn imputers. You can try this in colab. [![open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/RasmussenLab/pimms/blob/dev/project/04_1_train_pimms_models.ipynb) + -The [`workflows`](workflows) folder contains snakemake workflows used for rawfile data processing, -both for [running MaxQuant](workflows\maxquant) over a large set of HeLa raw files -and ThermoRawFileParser on a list of raw files to [extract their meta data](workflows\metadata). 
## Notebooks as scripts using papermill @@ -33,13 +32,25 @@ papermill 01_1_train_vae.ipynb --help-notebook > Misstyped argument names won't throw an error when using papermill -### Outlook +### Python package + +For interactive use of the models provided in PIMMS, you can use our +[python package `pimms-learn`](https://pypi.org/project/pimms-learn/). +The interface is similar to scikit-learn. + + +``` +pip install pimms-learn +``` + -We also plan to provide functionality and examples to interactive use of the -models developed in PIMMS. +Then you can use the models on a pandas DataFrame with missing values. Try this in the tutorial on Colab: +[![open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/RasmussenLab/pimms/blob/dev/project/04_1_train_pimms_models.ipynb) -## Setup -The package is not yet available as a standalone software on pypi. Currently we use + +## Setup for PIMMS comparison workflow + +The package is available as standalone software on PyPI. However, running the entire snakemake workflow is enabled using conda (or mamba) and pip to setup the environment. For a detailed description of setting up conda (or mamba), see [instructions on setting up a virtual environment](docs/venv_setup.md). @@ -58,32 +69,56 @@ conda env create -n pimms -f environment.yml # slower mamba env create -n pimms -f environment.yml # faster, less then 5mins ``` -If on Mac M1: use `environment_m1.yaml` where cudatoolkit is removed. +If on Mac M1, M2 or otherwise having issues using your accelerator (e.g. GPUs): install the pytorch dependencies first, then the rest of the environment. + +### Install development dependencies + +Check how to install pytorch for your system [here](https://pytorch.org/get-started/previous-versions/#v1131). 
+- select the version compatible with your cuda version if you have an nvidia gpu + +```bash +conda create -n vaep_manuel python=3.8 pip +conda activate vaep_manuel +conda install pytorch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 pytorch-cuda=11.7 -c pytorch -c nvidia # might be different +pip install . # pimms-learn +pip install papermill jupyterlab # to run the notebook interactively or as a script +cd project +papermill 04_1_train_pimms_models.ipynb 04_1_train_pimms_models_test.ipynb # second notebook is output +python 04_1_train_pimms_models.py # just execute the code +# jupyter lab # open 04_1_train_pimms_models.ipynb ``` -conda env create -n pimms -f environment_m1.yml # slower -mamba env create -n pimms -f environment_m1.yml # faster, less then 5mins + +### Entire development installation + + +```bash +conda create -n pimms_dev -c pytorch -c nvidia -c fastai -c bioconda -c plotly -c conda-forge --file requirements.txt --file requirements_R.txt --file requirements_dev.txt +pip install -e . # other pip dependencies missing +snakemake --configfile config/single_dev_dataset/example/config.yaml -F -n ``` -If on Windows: use `environment_win.yaml` where ~~two R-Bioconductor~~ R-packages (see note bolow) are removed as -no binaries are available for Windows. You will need to install these manually afterwards if you want to use methods implemented in R. +or if you want to update an existing environment -> Note: Turns out that installing dependencies partly by conda and partly manuaelly -using `BiocManager` is not working. ``` -conda env create -n pimms -f environment_win.yml # slower -mamba env create -n pimms -f environment_win.yml # faster, less then 5mins -# Then if R packages are needed, they are installed on the fly for Windows. -# Could be used as well for MacOS or Linux. 
+conda update -c defaults -c conda-forge -c fastai -c bioconda -c plotly --file requirements.txt --file requirements_R.txt --file requirements_dev.txt +``` + +or using the environment.yml file (can fail on certain systems) + ``` +conda env create -f environment.yml +``` + + +### Troubleshooting Trouble shoot your R installation by opening jupyter lab ``` # in projects folder jupyter lab # open 01_1_train_NAGuideR.ipynb -``` ## Run Demo @@ -191,25 +226,21 @@ From the brief description in the table the exact procedure is not always clear. - +### Metadata +Read metadata from single raw files using MaxQuant. See [README](workflows/metadata/README.md) for details. +## Build status +[![Documentation Status](https://readthedocs.org/projects/pimms/badge/?version=latest)](https://pimms.readthedocs.io/en/latest/?badge=latest) \ No newline at end of file diff --git a/docs/README.md b/docs/README.md index ee9f54463..290bd4cb7 100644 --- a/docs/README.md +++ b/docs/README.md @@ -6,11 +6,27 @@ In order to build the docs you need to 2. build the package reference files 3. run sphinx to create a local html version -Command to be run from `path/to/vaep/docs`, i.e. from within the `docs` package folder: +Command to be run from `path/to/pimms/docs`, i.e. from within the `docs` package folder: ```cmd +# pip install pimms[docs] # pwd: ./vaep/docs conda env update -f environment.yml -sphinx-apidoc -o source ../vaep +sphinx-apidoc -o reference ../vaep make html +``` + +## Build docs + +Using Sphinx command line tools. 
+ +Options: + - `--separate` to build separate pages for each (sub-)module + +```cmd +# pwd: ./pimms/docs +# apidoc +sphinx-apidoc --force --implicit-namespaces --module-first -o reference ../vaep +# build docs +sphinx-build -n -W --keep-going -b html ./ ./_build/ ``` \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index 17fbe769e..87cba0f5e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -11,18 +11,20 @@ # documentation root, use os.path.abspath to make it absolute, like shown here. # import os -import sys -sys.path.insert(0, os.path.abspath('../.')) +from importlib import metadata +# import sys +# sys.path.insert(0, os.path.abspath('../.')) # -- Project information ----------------------------------------------------- -project = 'vaep' -copyright = '2021, Henry Webel' +project = 'pimms' +copyright = '2023, Henry Webel' author = 'Henry Webel' -# The full version, including alpha/beta/rc tags -release = '0.1' +PACKAGE_VERSION = metadata.version("pimms-learn") +version = PACKAGE_VERSION +release = PACKAGE_VERSION # -- General configuration --------------------------------------------------- @@ -31,10 +33,16 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = ['myst_parser', + # 'sphinx_mdinclude', 'sphinx.ext.napoleon', - # 'sphinx_markdown_tables' + 'sphinx.ext.autodoc', + 'sphinx.ext.autodoc.typehints', ] +myst_enable_extensions = [ + "strikethrough", +] + # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] @@ -48,10 +56,42 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. 
-# -html_theme = 'alabaster' + +html_theme = 'sphinx_book_theme' # pip install sphinx-book-theme + +# check https://github.com/executablebooks/sphinx-book-theme/blob/master/docs/conf.py +html_title = u'Proteomics imputation modelling mass spectrometry (PIMMS)' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +# html_static_path = ['_static'] + +# -- Setup for sphinx-apidoc ------------------------------------------------- + +# Read the Docs doesn't support running arbitrary commands like tox. +# sphinx-apidoc needs to be called manually if Sphinx is running there. +# https://github.com/readthedocs/readthedocs.org/issues/1139 + +if os.environ.get("READTHEDOCS") == "True": + from pathlib import Path + + PROJECT_ROOT = Path(__file__).parent.parent + PACKAGE_ROOT = PROJECT_ROOT / "vaep" + + def run_apidoc(_): + from sphinx.ext import apidoc + apidoc.main([ + "--force", + "--implicit-namespaces", + "--module-first", + # "--separate", + "-o", + str(PROJECT_ROOT / "docs" / "reference"), + str(PACKAGE_ROOT), + str(PACKAGE_ROOT / "*.c"), + str(PACKAGE_ROOT / "*.so"), + ]) + + def setup(app): + app.connect('builder-inited', run_apidoc) diff --git a/docs/index.rst b/docs/index.rst index 57520c095..6513fc1eb 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,29 +1,29 @@ -Welcome to vaep's documentation! +Welcome to PIMMS documentation! ================================ +.. .. mdinclude:: ../README.md +.. https://github.com/sphinx-doc/sphinx/issues/7000#issuecomment-1006645012 +.. https://stackoverflow.com/a/54519037 +.. include:: ../README.md + :parser: myst_parser.sphinx_ + :start-line: 1 .. toctree:: :maxdepth: 1 - :caption: Contents: + :caption: Topic and note pages: - - MaxQuant proteomics_intro + MaxQuant lab_book vae_notes venv_setup - .. 
toctree:: - :maxdepth: 1 + :maxdepth: 3 :caption: Library: - source/modules - - -.. .. automodule:: vaep -.. :members: + reference/modules Indices and tables diff --git a/docs/proteomics_intro.md b/docs/proteomics_intro.md index 39cbf94e6..48dcb062b 100644 --- a/docs/proteomics_intro.md +++ b/docs/proteomics_intro.md @@ -1,6 +1,6 @@ # Proteomics Introductions -### Lennart Martens - Introduction to Proteomics +## Lennart Martens - Introduction to Proteomics > [lecture videos on youtube](https://www.youtube.com/watch?v=Wy1SwrMzhYk&list=PLXxp6nsBenSX_W8DiOocKJ0laNauYNdYl) - Digitalizer: Transform an analog signal into a digital signal @@ -23,7 +23,7 @@ - electron-capture dissociation (ECD): c and z ions - preserves PTMs as the fracturing is non- ergodic (vibration induced) -### Intro (OpenMS from Tübingen) +## Intro (OpenMS from Tübingen) > [Recording on youtube](https://www.youtube.com/watch?v=tnS-w8ggEAA&list=PL_6fafgzU1nHd13qqjm3uNvyHh62JpPcB&index=2&t=0s) - *Ion mobility* as a way to identify number of charges (ions), adding another dimension to the data - predict MS2 peptide itensities in order to better identify peptides ([MS2PIP](https://iomics.ugent.be/ms2pip/), [DeepMass: Prism](https://github.com/verilylifesciences/deepmass), Prosit)- @@ -40,13 +40,6 @@ -# Introduction to proteomics -Given by Jeppe Madsen and Martin Rzkær - -> Support-Request: SUND-CPR-Mssupport -> E-Mail Subject: CC-QE1-ISSUE - - ## Mass Spectrometry - Unbiased analysis that does not require prior knowledge of the sample composition - Analytical technique which identifies molecules based on their mass and charge (m/z) diff --git a/environment.yml b/environment.yml index 148f8610d..ce3760fba 100644 --- a/environment.yml +++ b/environment.yml @@ -1,31 +1,32 @@ # Dev Environment name: vaep channels: - # - defaults - conda-forge - - bioconda - pytorch - - fastai + - nvidia + - fastai # fastchan + - bioconda - plotly + # - defaults dependencies: - - biopython # Aligner - python=3.8 - - 
numpy=1.20 + - numpy - pandas=1 - scipy>=1.6 # plotting - - matplotlib=3.3 + - matplotlib - python-kaleido - plotly - seaborn - pip # ML - - pytorch=1.10 - - scikit-learn=1.0 + - pytorch=1 #=1.13.1=py3.8_cuda11.7_cudnn8_0 + # - pytorch-cuda + - scikit-learn - fastai - torchvision - - cudatoolkit=10.2 - - tensorboard + # - cudatoolkit #=11.7 + # - tensorboard - umap-learn # stats - pingouin @@ -41,36 +42,46 @@ dependencies: - ipykernel - ipython - ipywidgets - - jupytext - jupyterlab # standalone jupyter installation - # - jupyterlab_code_formatter - # - jupyterlab-git + # - jupyter_contrib_nbextensions # delete configuration file if you see an error: https://github.com/jupyter/nbconvert/issues/526#issuecomment-277552771 - jupyter-dash - papermill # execute ipynb's # R packages (listed in NAGuideR) - - r-base=3.6 - - r-irkernel + - r-base #=3.6 + - r-devtools # is it needed for source installs on windows server? + - r-irkernel - r-reshape2 - r-stringi # + rmarkdown hack for reshape2 - r-stringr # reshape2 - # - r-gdata - # - r-glmnet + - r-tidyverse + - r-gdata + - r-glmnet - r-e1071 - r-norm - r-missforest - r-vim - r-mice - - r-tidyverse + - r-cluster + - r-mvtnorm + - r-rrcov + - r-gmm + - r-tmvtnorm # - bioconductor-biocinstaller # - r-imputelcmd # bioconda - - bioconductor-impute - - bioconductor-pcamethods + # - bioconductor-impute + # - bioconductor-pcamethods # - rrcovNA, GMSimpute # SeqKnn, pcaMethods, DreamAI # bioconductor # dev - pytest - pytest-cov - - jupytext + # - jupytext + # - flake8 + # - flake8-bugbear + # - build + # - pre-commit + # - jupyterlab_code_formatter + # - jupyterlab-git - pip: - -e . 
- mrmr-selection diff --git a/environment_m1.yml b/environment_m1.yml deleted file mode 100644 index 6e2d95a6c..000000000 --- a/environment_m1.yml +++ /dev/null @@ -1,58 +0,0 @@ -# Dev Environment -name: vaep -channels: - - defaults - - pytorch - - fastai - - plotly - - conda-forge - - bioconda -dependencies: - - biopython # Aligner - - python=3.8 - - numpy - - pandas>=1.4 - - pandas-profiling - - scipy>=1.6 - # plotting - - matplotlib=3.3 - - python-kaleido - - plotly - - seaborn - - pip - # ML - - pytorch=1.10.* - - scikit-learn=1.0.* - - fastai - - torchvision - # - cudatoolkit=10.2 - - tensorboard - - umap-learn - # stats - - pingouin - - statsmodels - # other - - tqdm # progress bars - - xmltodict # configs - - openpyxl # xml - - omegaconf - # snakemake - - snakemake-minimal - # jupyter - - ipykernel=5.3 - - ipython=7.22 - - ipywidgets=7.6 - - jupytext - - jupyterlab # standalone jupyter installation - - jupyterlab_code_formatter - - jupyterlab-git - - jupyter-dash - - papermill # execute ipynb's - # dev - - pytest - - pytest-cov - - jupytext - - pip: - - -e . 
- - mrmr-selection - # - git+https://github.com/smazzanti/mrmr.git \ No newline at end of file diff --git a/environment_win.yml b/environment_win.yml deleted file mode 100644 index 9c3c6e515..000000000 --- a/environment_win.yml +++ /dev/null @@ -1,74 +0,0 @@ -# Dev Environment -name: vaep -channels: - # - defaults - - conda-forge - - bioconda - - pytorch - - fastai - - plotly -dependencies: - - python=3.8 - - numpy - - pandas=1 - - scipy - # plotting - - matplotlib - - python-kaleido - - plotly - - seaborn - - pip - # ML - - pytorch=1 - - scikit-learn=1 - - fastai - # - torchvision - - cudatoolkit - # - tensorboard - - umap-learn - # stats - - pingouin - - statsmodels - # other - - tqdm # progress bars - - xmltodict # configs - - openpyxl # xml - - omegaconf - # snakemake - - snakemake-minimal<7.26 - # jupyter - - jupyterlab # standalone jupyter installation - - jupyter_contrib_nbextensions # needed to avoid error when exporting to html - - jupyter-dash - - papermill # execute ipynb's - # R packages (listed in NAGuideR) - - r-base=3.6 - - r-irkernel - - r-reshape2 - - r-stringi # + rmarkdown hack for reshape2 - - r-stringr # reshape2 - # - r-gdata - # - r-glmnet - - r-e1071 - - r-norm - - r-missforest - - r-vim - - r-mice - # - r-tidyverse - - r-readr - - r-tibble - # - r-imputelcmd # bioconda - # - bioconductor-impute # not available for Windows - # - bioconductor-pcamethods # not available for Windows - # - rrcovNA, GMSimpute - # SeqKnn, pcaMethods, DreamAI # bioconductor - # dev - # - pytest - # - pytest-cov - # - jupytext - # - jupyterlab_code_formatter - # - jupyterlab-git - - pip: - - -e . 
- - mrmr-selection - \ No newline at end of file diff --git a/project/00_0_0_lftp_upload_commands.ipynb b/project/00_0_0_lftp_upload_commands.ipynb index 9598c68eb..81dd8c796 100644 --- a/project/00_0_0_lftp_upload_commands.ipynb +++ b/project/00_0_0_lftp_upload_commands.ipynb @@ -91,6 +91,7 @@ "execution_count": null, "id": "9869ac5e-fab3-4c66-a32c-48ae4fadc0a3", "metadata": { + "lines_to_next_cell": 2, "tags": [] }, "outputs": [], @@ -100,8 +101,9 @@ "df_meta[date_col] = pd.to_datetime(\n", " df_meta[date_col])\n", "df_meta.sort_values(date_col, inplace=True)\n", - "df_meta\n", - "msg = f\"A total of {len(df_meta)} raw files could be read using the ThermoFisherRawFileParser.\" " + "msg = f\"A total of {len(df_meta)} raw files could be read using the ThermoFisherRawFileParser.\"\n", + "print(msg)\n", + "df_meta" ] }, { diff --git a/project/00_0_0_lftp_upload_commands.py b/project/00_0_0_lftp_upload_commands.py index d0c7400bd..921a5733f 100644 --- a/project/00_0_0_lftp_upload_commands.py +++ b/project/00_0_0_lftp_upload_commands.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -65,8 +65,10 @@ def rename(fname, new_sample_id, new_folder=None, ext=None): df_meta[date_col] = pd.to_datetime( df_meta[date_col]) df_meta.sort_values(date_col, inplace=True) +msg = f"A total of {len(df_meta)} raw files could be read using the ThermoFisherRawFileParser." +print(msg) df_meta -msg = f"A total of {len(df_meta)} raw files could be read using the ThermoFisherRawFileParser." 
+ # %% meta_stats = df_meta.describe(include='all', datetime_is_numeric=True) diff --git a/project/00_0_hela_metadata_rawfiles.ipynb b/project/00_0_hela_metadata_rawfiles.ipynb index be5f642be..820e3bfec 100644 --- a/project/00_0_hela_metadata_rawfiles.ipynb +++ b/project/00_0_hela_metadata_rawfiles.ipynb @@ -47,11 +47,11 @@ }, "outputs": [], "source": [ - "fn_rawfile_metadata: str = 'data/rawfile_metadata.csv' # Machine parsed metadata from rawfile workflow\n", + "fn_rawfile_metadata: str = 'data/rawfile_metadata.csv' # Machine parsed metadata from rawfile workflow\n", "# outputs\n", - "fn_files_per_instrument: str = 'data/files_per_instrument.yaml' # All parsed raw files nested by instrument (model, attribute, serial number)\n", - "fn_files_selected: str = 'data/samples_selected.yaml' # selected files based on threshold of identified peptides\n", - "fn_files_per_instrument_selected: str = 'data/files_selected_per_instrument.yaml' # Selected parsed raw files nested by instrument (model, attribute, serial number)" + "fn_files_per_instrument: str = 'data/files_per_instrument.yaml' # All parsed raw files nested by instrument (model, attribute, serial number)\n", + "fn_files_selected: str = 'data/samples_selected.yaml' # selected files based on threshold of identified peptides\n", + "fn_files_per_instrument_selected: str = 'data/files_selected_per_instrument.yaml' # Selected parsed raw files nested by instrument (model, attribute, serial number)" ] }, { @@ -78,8 +78,18 @@ "df_meta_rawfiles[date_col] = pd.to_datetime(\n", " df_meta_rawfiles[date_col])\n", "df_meta_rawfiles.sort_values(date_col, inplace=True)\n", - "df_meta_rawfiles\n", - "msg = f\"A total of {len(df_meta_rawfiles)} raw files could be read using the ThermoFisherRawFileParser.\" " + "df_meta_rawfiles" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32b42511", + "metadata": {}, + "outputs": [], + "source": [ + "msg = f\"A total of {len(df_meta_rawfiles)} raw files could be read 
using the ThermoFisherRawFileParser.\"\n", + "print(msg)" ] }, { @@ -108,7 +118,7 @@ "metadata": {}, "outputs": [], "source": [ - "meta_stats.loc[:, (meta_stats.loc['unique'] > 1) | (meta_stats.loc['std'] > 0.1)].T" + "meta_stats.loc[:, (meta_stats.loc['unique'] > 1) | (meta_stats.loc['std'] > 0.1)].T" ] }, { @@ -118,7 +128,8 @@ "metadata": {}, "outputs": [], "source": [ - "df_meta_rawfiles_columns = df_meta_rawfiles.columns # needs to go to Config which is not overwriteable by attribute selection\n", + "# needs to go to Config which is not overwriteable by attribute selection\n", + "df_meta_rawfiles_columns = df_meta_rawfiles.columns\n", "meta_raw_names = df_meta_rawfiles.columns.droplevel()\n", "assert meta_raw_names.is_unique\n", "df_meta_rawfiles.columns = meta_raw_names\n", @@ -133,24 +144,24 @@ "outputs": [], "source": [ "meta_raw_selected = [\n", - " 'Content Creation Date', \n", - " 'Thermo Scientific instrument model',\n", - " 'instrument serial number',\n", - " 'Software Version', \n", - " 'Number of MS1 spectra',\n", - " 'Number of MS2 spectra', \n", - " 'Number of scans',\n", - " 'MS max charge',\n", - " 'MS max RT',\n", - " 'MS min MZ',\n", - " 'MS max MZ',\n", - " 'MS scan range', \n", - " 'mass resolution',\n", - " 'Retention time range',\n", - " 'Mz range',\n", - " 'beam-type collision-induced dissociation', \n", - " 'injection volume setting',\n", - " 'dilution factor',\n", + " 'Content Creation Date',\n", + " 'Thermo Scientific instrument model',\n", + " 'instrument serial number',\n", + " 'Software Version',\n", + " 'Number of MS1 spectra',\n", + " 'Number of MS2 spectra',\n", + " 'Number of scans',\n", + " 'MS max charge',\n", + " 'MS max RT',\n", + " 'MS min MZ',\n", + " 'MS max MZ',\n", + " 'MS scan range',\n", + " 'mass resolution',\n", + " 'Retention time range',\n", + " 'Mz range',\n", + " 'beam-type collision-induced dissociation',\n", + " 'injection volume setting',\n", + " 'dilution factor',\n", "]\n", 
"df_meta_rawfiles[meta_raw_selected].describe(percentiles=np.linspace(0.05, 0.95, 10))" ] @@ -183,7 +194,9 @@ "metadata": {}, "outputs": [], "source": [ - "MetaRawSettings = namedtuple('MetaRawSettings', 'ms_model ms_attr ms_sn ms_firmware max_charge mass_res cid_type inject_volume dill_factor') \n", + "MetaRawSettings = namedtuple(\n", + " 'MetaRawSettings',\n", + " 'ms_model ms_attr ms_sn ms_firmware max_charge mass_res cid_type inject_volume dill_factor')\n", "meta_raw_settings = [\n", " 'Thermo Scientific instrument model',\n", " 'instrument attribute',\n", @@ -219,8 +232,7 @@ "view without `MS max charge`:\n", " - software can be updated\n", " - variation by `injection volume setting` and instrument over time\n", - " - missing `dilution factor`\n", - " " + " - missing `dilution factor`\n" ] }, { @@ -231,7 +243,9 @@ "outputs": [], "source": [ "to_drop = ['MS max charge']\n", - "# df_meta_rawfiles[list(meta_raw_settings)].drop(to_drop, axis=1).drop_duplicates(ignore_index=False) # index gives first example with this combination\n", + "# df_meta_rawfiles[list(meta_raw_settings)].drop(to_drop,\n", + "# axis=1).drop_duplicates(ignore_index=False) # index gives first example\n", + "# with this combination\n", "df_meta_rawfiles[list(meta_raw_settings)].drop(to_drop, axis=1).drop_duplicates(ignore_index=True)" ] }, @@ -250,7 +264,8 @@ "metadata": {}, "outputs": [], "source": [ - "df_meta_rawfiles.groupby([meta_raw_settings.ms_model, meta_raw_settings.ms_firmware])[meta_raw_settings.ms_model].count().sort_values().tail(10)" + "df_meta_rawfiles.groupby([meta_raw_settings.ms_model, meta_raw_settings.ms_firmware])[\n", + " meta_raw_settings.ms_model].count().sort_values().tail(10)" ] }, { @@ -273,8 +288,7 @@ "grouping = df_meta_rawfiles.groupby(list(meta_raw_settings[:3]))\n", "instrument_counts = grouping[meta_raw_settings.ms_model].count().sort_values()\n", "msg += (f\" There are a total of {len(instrument_counts)} unique instruments in the entire dataset (based on the 
instrument name, attributs and serial number)\"\n", - " f\", of which at least {(instrument_counts >= 1000).sum()} have 1,000 raw files assigned to them. Note that the entire dataset contains fractionated measurements.\"\n", - " )\n", + " f\", of which at least {(instrument_counts >= 1000).sum()} have 1,000 raw files assigned to them. Note that the entire dataset contains fractionated measurements.\")\n", "instrument_counts" ] }, @@ -375,9 +389,9 @@ "grouping = df_meta_rawfiles.loc[files_selected['files']].groupby(list(meta_raw_settings[:3]))\n", "instrument_counts = grouping[meta_raw_settings.ms_model].count().sort_values()\n", "N = 500\n", - "msg += (f\" Among the {len(files_selected['files'])} raw files with a minimum of {files_selected['threshold']:,d} identified peptides there are a total of {len(instrument_counts)} unique instruments with quantified runs\"\n", - " f\", of which {(instrument_counts >= N).sum()} have at least {N:,d} rawfiles assigned to them.\"\n", - " )\n", + "msg += (\n", + " f\" Among the {len(files_selected['files'])} raw files with a minimum of {files_selected['threshold']:,d} identified peptides there are a total of {len(instrument_counts)} unique instruments with quantified runs\"\n", + " f\", of which {(instrument_counts >= N).sum()} have at least {N:,d} rawfiles assigned to them.\")\n", "instrument_counts.to_csv('data/files_selected_per_instrument_counts.csv')\n", "instrument_counts.to_frame('No. 
samples')" ] diff --git a/project/00_0_hela_metadata_rawfiles.py b/project/00_0_hela_metadata_rawfiles.py index 73f2fcb16..b8f1248df 100644 --- a/project/00_0_hela_metadata_rawfiles.py +++ b/project/00_0_hela_metadata_rawfiles.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -32,11 +32,11 @@ # ## Arguments # %% tags=["parameters"] -fn_rawfile_metadata: str = 'data/rawfile_metadata.csv' # Machine parsed metadata from rawfile workflow +fn_rawfile_metadata: str = 'data/rawfile_metadata.csv' # Machine parsed metadata from rawfile workflow # outputs -fn_files_per_instrument: str = 'data/files_per_instrument.yaml' # All parsed raw files nested by instrument (model, attribute, serial number) -fn_files_selected: str = 'data/samples_selected.yaml' # selected files based on threshold of identified peptides -fn_files_per_instrument_selected: str = 'data/files_selected_per_instrument.yaml' # Selected parsed raw files nested by instrument (model, attribute, serial number) +fn_files_per_instrument: str = 'data/files_per_instrument.yaml' # All parsed raw files nested by instrument (model, attribute, serial number) +fn_files_selected: str = 'data/samples_selected.yaml' # selected files based on threshold of identified peptides +fn_files_per_instrument_selected: str = 'data/files_selected_per_instrument.yaml' # Selected parsed raw files nested by instrument (model, attribute, serial number) # %% [markdown] # ### Machine metadata @@ -50,7 +50,10 @@ df_meta_rawfiles[date_col]) df_meta_rawfiles.sort_values(date_col, inplace=True) df_meta_rawfiles -msg = f"A total of {len(df_meta_rawfiles)} raw files could be read using the ThermoFisherRawFileParser." + +# %% +msg = f"A total of {len(df_meta_rawfiles)} raw files could be read using the ThermoFisherRawFileParser." 
+print(msg) # %% meta_stats = df_meta_rawfiles.describe(include='all', datetime_is_numeric=True) @@ -60,10 +63,11 @@ # subset with variation # %% -meta_stats.loc[:, (meta_stats.loc['unique'] > 1) | (meta_stats.loc['std'] > 0.1)].T +meta_stats.loc[:, (meta_stats.loc['unique'] > 1) | (meta_stats.loc['std'] > 0.1)].T # %% -df_meta_rawfiles_columns = df_meta_rawfiles.columns # needs to go to Config which is not overwriteable by attribute selection +# needs to go to Config which is not overwriteable by attribute selection +df_meta_rawfiles_columns = df_meta_rawfiles.columns meta_raw_names = df_meta_rawfiles.columns.droplevel() assert meta_raw_names.is_unique df_meta_rawfiles.columns = meta_raw_names @@ -71,24 +75,24 @@ # %% meta_raw_selected = [ - 'Content Creation Date', - 'Thermo Scientific instrument model', - 'instrument serial number', - 'Software Version', - 'Number of MS1 spectra', - 'Number of MS2 spectra', - 'Number of scans', - 'MS max charge', - 'MS max RT', - 'MS min MZ', - 'MS max MZ', - 'MS scan range', - 'mass resolution', - 'Retention time range', - 'Mz range', - 'beam-type collision-induced dissociation', - 'injection volume setting', - 'dilution factor', + 'Content Creation Date', + 'Thermo Scientific instrument model', + 'instrument serial number', + 'Software Version', + 'Number of MS1 spectra', + 'Number of MS2 spectra', + 'Number of scans', + 'MS max charge', + 'MS max RT', + 'MS min MZ', + 'MS max MZ', + 'MS scan range', + 'mass resolution', + 'Retention time range', + 'Mz range', + 'beam-type collision-induced dissociation', + 'injection volume setting', + 'dilution factor', ] df_meta_rawfiles[meta_raw_selected].describe(percentiles=np.linspace(0.05, 0.95, 10)) @@ -104,7 +108,9 @@ # - quite some variation due to `MS max charge`: Is it a parameter? 
# %% -MetaRawSettings = namedtuple('MetaRawSettings', 'ms_model ms_attr ms_sn ms_firmware max_charge mass_res cid_type inject_volume dill_factor') +MetaRawSettings = namedtuple( + 'MetaRawSettings', + 'ms_model ms_attr ms_sn ms_firmware max_charge mass_res cid_type inject_volume dill_factor') meta_raw_settings = [ 'Thermo Scientific instrument model', 'instrument attribute', @@ -129,18 +135,21 @@ # - software can be updated # - variation by `injection volume setting` and instrument over time # - missing `dilution factor` -# +# # %% to_drop = ['MS max charge'] -# df_meta_rawfiles[list(meta_raw_settings)].drop(to_drop, axis=1).drop_duplicates(ignore_index=False) # index gives first example with this combination +# df_meta_rawfiles[list(meta_raw_settings)].drop(to_drop, +# axis=1).drop_duplicates(ignore_index=False) # index gives first example +# with this combination df_meta_rawfiles[list(meta_raw_settings)].drop(to_drop, axis=1).drop_duplicates(ignore_index=True) # %% [markdown] # Relatively big samples for different machines of the same kind running with the same firmware: # %% -df_meta_rawfiles.groupby([meta_raw_settings.ms_model, meta_raw_settings.ms_firmware])[meta_raw_settings.ms_model].count().sort_values().tail(10) +df_meta_rawfiles.groupby([meta_raw_settings.ms_model, meta_raw_settings.ms_firmware])[ + meta_raw_settings.ms_model].count().sort_values().tail(10) # %% [markdown] # Ignoring instrument software @@ -149,8 +158,7 @@ grouping = df_meta_rawfiles.groupby(list(meta_raw_settings[:3])) instrument_counts = grouping[meta_raw_settings.ms_model].count().sort_values() msg += (f" There are a total of {len(instrument_counts)} unique instruments in the entire dataset (based on the instrument name, attributs and serial number)" - f", of which at least {(instrument_counts >= 1000).sum()} have 1,000 raw files assigned to them. Note that the entire dataset contains fractionated measurements." 
- ) + f", of which at least {(instrument_counts >= 1000).sum()} have 1,000 raw files assigned to them. Note that the entire dataset contains fractionated measurements.") instrument_counts # %% @@ -194,9 +202,9 @@ grouping = df_meta_rawfiles.loc[files_selected['files']].groupby(list(meta_raw_settings[:3])) instrument_counts = grouping[meta_raw_settings.ms_model].count().sort_values() N = 500 -msg += (f" Among the {len(files_selected['files'])} raw files with a minimum of {files_selected['threshold']:,d} identified peptides there are a total of {len(instrument_counts)} unique instruments with quantified runs" - f", of which {(instrument_counts >= N).sum()} have at least {N:,d} rawfiles assigned to them." - ) +msg += ( + f" Among the {len(files_selected['files'])} raw files with a minimum of {files_selected['threshold']:,d} identified peptides there are a total of {len(instrument_counts)} unique instruments with quantified runs" + f", of which {(instrument_counts >= N).sum()} have at least {N:,d} rawfiles assigned to them.") instrument_counts.to_csv('data/files_selected_per_instrument_counts.csv') instrument_counts.to_frame('No. 
samples') diff --git a/project/01_0_split_data.ipynb b/project/01_0_split_data.ipynb index f76e6d16b..f6df4bd16 100644 --- a/project/01_0_split_data.ipynb +++ b/project/01_0_split_data.ipynb @@ -673,7 +673,7 @@ "outputs": [], "source": [ "min_max = vaep.plotting.data.min_max(analysis.df.stack())\n", - "ax, bins = vaep.plotting.data.plot_histogram_intensites(\n", + "ax, bins = vaep.plotting.data.plot_histogram_intensities(\n", " analysis.df.stack(), min_max=min_max)\n", "\n", "fname = params.out_figures / 'intensity_distribution_overall'\n", @@ -1150,7 +1150,6 @@ { "cell_type": "code", "execution_count": null, - "id": "2fc31ea0", "metadata": { "lines_to_next_cell": 2 }, @@ -1324,7 +1323,6 @@ }, { "cell_type": "markdown", - "id": "e6d3a66e", "metadata": {}, "source": [ "plot training data missing plots" @@ -1333,7 +1331,6 @@ { "cell_type": "code", "execution_count": null, - "id": "61154b9f", "metadata": {}, "outputs": [], "source": [ @@ -1343,7 +1340,6 @@ { "cell_type": "code", "execution_count": null, - "id": "441c602d", "metadata": {}, "outputs": [], "source": [ @@ -1357,7 +1353,6 @@ { "cell_type": "code", "execution_count": null, - "id": "85aa3b19", "metadata": {}, "outputs": [], "source": [ diff --git a/project/01_0_split_data.py b/project/01_0_split_data.py index 4edd5a21b..48b1fa31d 100644 --- a/project/01_0_split_data.py +++ b/project/01_0_split_data.py @@ -432,7 +432,7 @@ def join_as_str(seq): # %% min_max = vaep.plotting.data.min_max(analysis.df.stack()) -ax, bins = vaep.plotting.data.plot_histogram_intensites( +ax, bins = vaep.plotting.data.plot_histogram_intensities( analysis.df.stack(), min_max=min_max) fname = params.out_figures / 'intensity_distribution_overall' diff --git a/project/01_1_train_CF.ipynb b/project/01_1_train_CF.ipynb index 5f9d78ce6..f67d3e710 100644 --- a/project/01_1_train_CF.ipynb +++ b/project/01_1_train_CF.ipynb @@ -153,10 +153,15 @@ "cell_type": "code", "execution_count": null, "id": "a19fe098-a029-4f71-b7fb-e652a9c16ac7", - 
"metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [], "source": [ - "TEMPLATE_MODEL_PARAMS = 'model_params_{}.json'" + "TEMPLATE_MODEL_PARAMS = 'model_params_{}.json'\n", + "\n", + "if not args.cuda:\n", + " default_device(use=False) # set to cpu" ] }, { @@ -375,6 +380,11 @@ " model_dir=args.out_models)\n", "if args.cuda:\n", " ana_collab.learn.model = ana_collab.learn.model.cuda()\n", + "else:\n", + " # try to set explicitly cpu in case not cuda\n", + " # MPS logic might not work properly in fastai yet https://github.com/fastai/fastai/pull/3858\n", + " ana_collab.learn.model = ana_collab.learn.model.cpu()\n", + "\n", "# learn.summary() # see comment at DAE" ] }, diff --git a/project/01_1_train_CF.py b/project/01_1_train_CF.py index acca8e4ec..642bcb02a 100644 --- a/project/01_1_train_CF.py +++ b/project/01_1_train_CF.py @@ -6,7 +6,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -105,6 +105,10 @@ # %% TEMPLATE_MODEL_PARAMS = 'model_params_{}.json' +if not args.cuda: + default_device(use=False) # set to cpu + + # %% [markdown] # ## Load data in long format @@ -209,6 +213,11 @@ model_dir=args.out_models) if args.cuda: ana_collab.learn.model = ana_collab.learn.model.cuda() +else: + # try to set explicitly cpu in case not cuda + # MPS logic might not work properly in fastai yet https://github.com/fastai/fastai/pull/3858 + ana_collab.learn.model = ana_collab.learn.model.cpu() + # learn.summary() # see comment at DAE # %% [markdown] diff --git a/project/01_1_train_NAGuideR_methods.R b/project/01_1_train_NAGuideR_methods.R index 65296b755..0ef65fdc2 100644 --- a/project/01_1_train_NAGuideR_methods.R +++ b/project/01_1_train_NAGuideR_methods.R @@ -6,7 +6,7 @@ # extension: .R # format_name: light # format_version: '1.5' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: R # 
language: R @@ -111,17 +111,23 @@ nafunctions <- function(x,method="zero"){ df <- norm::imp.norm(ss, thx, xxm) } else if(method=="qrilc"){ + install_bioconductor("impute") + install_bioconductor("pcaMethods") install_rpackage('imputeLCMD') xxm<-t(df1) data_zero1 <- imputeLCMD::impute.QRILC(xxm, tune.sigma = 1)[[1]] df<-t(data_zero1) } else if(method=="mindet"){ + install_bioconductor("impute") + install_bioconductor("pcaMethods") install_rpackage('imputeLCMD') xxm<-as.matrix(df1) df <- imputeLCMD::impute.MinDet(xxm, q = 0.01) } else if(method=="minprob"){ + install_bioconductor("impute") + install_bioconductor("pcaMethods") install_rpackage('imputeLCMD') xxm<-as.matrix(df1) df <- imputeLCMD::impute.MinProb(xxm, q = 0.01, tune.sigma = 1) @@ -172,7 +178,7 @@ nafunctions <- function(x,method="zero"){ df<-as.data.frame(t(df1x)) } else if(method=="rf"){ - install_rpackage('missForest') + install_rpackage("missForest") data_zero1 <- missForest(t(df1), maxiter =10, ntree = 20 # input$rfntrees ,mtry=floor(nrow(df1)^(1/3)),verbose = TRUE) @@ -269,7 +275,7 @@ feat_name <- original_header[1] # Uncomment to test certain methods (only for debugging, as at least one method per package is tested using Github Actions) -# + +# + vscode={"languageId": "r"} # to_test <- c( # 'ZERO', # 'MINIMUM', diff --git a/project/01_1_train_NAGuideR_methods.ipynb b/project/01_1_train_NAGuideR_methods.ipynb index 623e5cc5c..968614c11 100644 --- a/project/01_1_train_NAGuideR_methods.ipynb +++ b/project/01_1_train_NAGuideR_methods.ipynb @@ -139,17 +139,23 @@ " df <- norm::imp.norm(ss, thx, xxm)\n", " }\n", " else if(method==\"qrilc\"){\n", + " install_bioconductor(\"impute\")\n", + " install_bioconductor(\"pcaMethods\")\n", " install_rpackage('imputeLCMD')\n", " xxm<-t(df1)\n", " data_zero1 <- imputeLCMD::impute.QRILC(xxm, tune.sigma = 1)[[1]]\n", " df<-t(data_zero1)\n", " }\n", " else if(method==\"mindet\"){\n", + " install_bioconductor(\"impute\")\n", + " install_bioconductor(\"pcaMethods\")\n", " 
install_rpackage('imputeLCMD')\n", " xxm<-as.matrix(df1)\n", " df <- imputeLCMD::impute.MinDet(xxm, q = 0.01)\n", " }\n", " else if(method==\"minprob\"){\n", + " install_bioconductor(\"impute\")\n", + " install_bioconductor(\"pcaMethods\")\n", " install_rpackage('imputeLCMD')\n", " xxm<-as.matrix(df1)\n", " df <- imputeLCMD::impute.MinProb(xxm, q = 0.01, tune.sigma = 1)\n", @@ -200,7 +206,7 @@ " df<-as.data.frame(t(df1x))\n", " }\n", " else if(method==\"rf\"){\n", - " install_rpackage('missForest')\n", + " install_rpackage(\"missForest\")\n", " data_zero1 <- missForest(t(df1), maxiter =10,\n", " ntree = 20 # input$rfntrees\n", " ,mtry=floor(nrow(df1)^(1/3)),verbose = TRUE)\n", @@ -364,7 +370,11 @@ "cell_type": "code", "execution_count": null, "id": "162c5f7f-08f0-44ef-abf5-f0805ab58bb4", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ "# to_test <- c(\n", diff --git a/project/01_1_train_VAE.ipynb b/project/01_1_train_VAE.ipynb index b5d48ba77..a100b8b74 100644 --- a/project/01_1_train_VAE.ipynb +++ b/project/01_1_train_VAE.ipynb @@ -445,7 +445,7 @@ " val_df=data.val_y,\n", " model=models.vae.VAE,\n", " model_kwargs=dict(n_features=data.train_X.shape[-1],\n", - " h_layers=args.hidden_layers,\n", + " n_neurons=args.hidden_layers,\n", " # last_encoder_activation=None,\n", " last_decoder_activation=None,\n", " dim_latent=args.latent_dim),\n", diff --git a/project/01_1_train_VAE.py b/project/01_1_train_VAE.py index e5f91ff59..4943c9d0f 100644 --- a/project/01_1_train_VAE.py +++ b/project/01_1_train_VAE.py @@ -6,7 +6,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -233,7 +233,7 @@ val_df=data.val_y, model=models.vae.VAE, model_kwargs=dict(n_features=data.train_X.shape[-1], - h_layers=args.hidden_layers, + n_neurons=args.hidden_layers, # last_encoder_activation=None, 
last_decoder_activation=None, dim_latent=args.latent_dim), diff --git a/project/01_2_performance_plots.ipynb b/project/01_2_performance_plots.ipynb index 7ca294696..76320a9d2 100644 --- a/project/01_2_performance_plots.ipynb +++ b/project/01_2_performance_plots.ipynb @@ -799,13 +799,13 @@ " COLORS_TO_USE[:4],\n", " axes):\n", "\n", - " ax, _ = vaep.plotting.data.plot_histogram_intensites(\n", + " ax, _ = vaep.plotting.data.plot_histogram_intensities(\n", " pred_test[TARGET_COL],\n", " color='grey',\n", " min_max=min_max,\n", " ax=ax\n", " )\n", - " ax, _ = vaep.plotting.data.plot_histogram_intensites(\n", + " ax, _ = vaep.plotting.data.plot_histogram_intensities(\n", " pred_test[model],\n", " color=color,\n", " min_max=min_max,\n", diff --git a/project/01_2_performance_plots.py b/project/01_2_performance_plots.py index 90a737272..2060bba1d 100644 --- a/project/01_2_performance_plots.py +++ b/project/01_2_performance_plots.py @@ -420,13 +420,13 @@ def build_text(s): COLORS_TO_USE[:4], axes): - ax, _ = vaep.plotting.data.plot_histogram_intensites( + ax, _ = vaep.plotting.data.plot_histogram_intensities( pred_test[TARGET_COL], color='grey', min_max=min_max, ax=ax ) - ax, _ = vaep.plotting.data.plot_histogram_intensites( + ax, _ = vaep.plotting.data.plot_histogram_intensities( pred_test[model], color=color, min_max=min_max, diff --git a/project/04_1_train_pimms_models.ipynb b/project/04_1_train_pimms_models.ipynb new file mode 100644 index 000000000..f363baacd --- /dev/null +++ b/project/04_1_train_pimms_models.ipynb @@ -0,0 +1,587 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "eae0a078", + "metadata": {}, + "source": [ + "# Scikit-learn style transformers of the data\n", + "\n", + "1. Load data into pandas dataframe\n", + "2. Fit transformer on training data\n", + "3. Impute only missing values with predictions from model\n", + "\n", + "Autoencoders need wide training data, i.e.
a sample with all its features' intensities, whereas\n", + "Collaborative Filtering needs long training data, i.e. a sample identifier, a feature identifier and the intensity.\n", + "Both data formats can be transformed into each other, but models using long data format do not need to\n", + "take care of missing values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0650846", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "IN_COLAB = 'COLAB_GPU' in os.environ\n", + "if IN_COLAB:\n", + " print(\"Install PIMMS...\")\n", + " !pip install git+https://github.com/RasmussenLab/pimms.git@dev\n", + " # !pip install pimms-learn\n", + " fn_intensities = 'https://raw.githubusercontent.com/RasmussenLab/pimms/main/project/data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv'\n", + "else:\n", + " fn_intensities = 'data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c289d17", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "import matplotlib.pyplot as plt\n", + "\n", + "\n", + "from vaep.plotting.defaults import color_model_mapping\n", + "import vaep.plotting.data \n", + "import vaep.sampling\n", + "\n", + "from vaep.sklearn.cf_transformer import CollaborativeFilteringTransformer\n", + "from vaep.sklearn.ae_transformer import AETransformer\n", + "\n", + "vaep.plotting.make_large_descriptors(8)" + ] + }, + { + "cell_type": "markdown", + "id": "231b6650", + "metadata": {}, + "source": [ + "## Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a3edbdd", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(fn_intensities, index_col=0)\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "727b3ace", + "metadata": {}, + "source": [ + "We will need the data in long format for Collaborative Filtering.\n", + "Naming both the row and column index
assures\n", + "that the data can be transformed very easily into long format:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4fde25e9", + "metadata": {}, + "outputs": [], + "source": [ + "df.index.name = 'Sample ID' # already set\n", + "df.columns.name = 'protein group' # not set due to csv disk file format\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "693b1ee5", + "metadata": {}, + "source": [ + "Transform to long-data format:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "646ea5bb", + "metadata": {}, + "outputs": [], + "source": [ + "df = df.stack().to_frame('intensity')\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "2ab8dc7f", + "metadata": {}, + "source": [ + "Transform the data using the logarithm, here using base 2:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "554d4fa7", + "metadata": {}, + "outputs": [], + "source": [ + "df = np.log2(df)\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "7792ce6e", + "metadata": {}, + "source": [ + "The resulting DataFrame with one column has a `MultiIndex` with the sample and feature identifier." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "567854c0", + "metadata": {}, + "outputs": [], + "source": [ + "CollaborativeFilteringTransformer?" + ] + }, + { + "cell_type": "markdown", + "id": "a76ba4ce", + "metadata": {}, + "source": [ + "Let's set up collaborative filtering without a validation or test set, using \n", + "all the data there is." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b547a77", + "metadata": {}, + "outputs": [], + "source": [ + "cf_model = CollaborativeFilteringTransformer(\n", + " target_column='intensity',\n", + " sample_column='Sample ID',\n", + " item_column='protein group',\n", + " out_folder='runs/scikit_interface')" + ] + }, + { + "cell_type": "markdown", + "id": "f86364d4", + "metadata": {}, + "source": [ + "We use `fit` and `transform` to train the model and impute the missing values.\n", + "> Scikit-learn's interface requires `X` and `y`. `y` is the validation data in our context.\n", + "> We might have to change the interface to allow usage within pipelines (-> `y` is not needed).\n", + "> This will probably mean setting up a validation set within the model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb5ac432", + "metadata": {}, + "outputs": [], + "source": [ + "cf_model.fit(df,\n", + " cuda=True,\n", + " epochs_max=20,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b3dac537", + "metadata": {}, + "outputs": [], + "source": [ + "df_imputed = cf_model.transform(df).unstack()\n", + "assert df_imputed.isna().sum().sum() == 0\n", + "df_imputed.head()" + ] + }, + { + "cell_type": "markdown", + "id": "b94ba21b", + "metadata": {}, + "source": [ + "Let's plot the distribution of the imputed values vs the ones used for training:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99ff7ecd", + "metadata": {}, + "outputs": [], + "source": [ + "df_imputed = df_imputed.stack() # long-format\n", + "observed = df_imputed.loc[df.index]\n", + "imputed = df_imputed.loc[df_imputed.index.difference(df.index)]\n", + "df_imputed = df_imputed.unstack() # back to wide-format\n", + "# some checks\n", + "assert len(df) == len(observed)\n", + "assert df_imputed.shape[0] * df_imputed.shape[1] == len(imputed) + len(observed)" + ] + }, + { + "cell_type": "code", + "execution_count": null, +
"id": "addb7cbf", + "metadata": {}, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(2, figsize=(8,4))\n", + "\n", + "min_max = vaep.plotting.data.get_min_max_iterable(\n", + " [observed, imputed])\n", + "label_template = '{method} (N={n:,d})'\n", + "ax, _ = vaep.plotting.data.plot_histogram_intensities(\n", + " observed,\n", + " ax=axes[0],\n", + " min_max=min_max,\n", + " label=label_template.format(method='measured',\n", + " n=len(observed),\n", + " ),\n", + " color='grey',\n", + " alpha=1)\n", + "_ = ax.legend()\n", + "ax, _ = vaep.plotting.data.plot_histogram_intensities(\n", + " imputed,\n", + " ax=axes[1],\n", + " min_max=min_max,\n", + " label=label_template.format(method='CF imputed',\n", + " n=len(imputed),\n", + " ),\n", + " color=color_model_mapping['CF'],\n", + " alpha=1)\n", + "_ = ax.legend()" + ] + }, + { + "cell_type": "markdown", + "id": "a6d6552c", + "metadata": {}, + "source": [ + "## AutoEncoder architectures" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7184c2e", + "metadata": {}, + "outputs": [], + "source": [ + "# Reload data (for demonstration)\n", + "\n", + "df = pd.read_csv(fn_intensities, index_col=0)\n", + "df.index.name = 'Sample ID' # already set\n", + "df.columns.name = 'protein group' # not set due to csv disk file format\n", + "df = np.log2(df) # log transform\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "ae52c6fe", + "metadata": {}, + "source": [ + "The AutoEncoder model currently need validation data for training.\n", + "We will use 10% of the training data for validation. \n", + "> Expect this limitation to be dropped in the next release. It will still be recommended \n", + "> to use validation data for early stopping." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8bbd0017", + "metadata": {}, + "outputs": [], + "source": [ + "freq_feat = df.notna().sum()\n", + "freq_feat.head() # training data" + ] + }, + { + "cell_type": "markdown", + "id": "6da6c4e2", + "metadata": {}, + "source": [ + "We will use the `sampling` module to sample the validation data from the training data.\n", + "Could be split differently by providing another `weights` vector. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99e690f9", + "metadata": {}, + "outputs": [], + "source": [ + "val_X, train_X = vaep.sampling.sample_data(df.stack(),\n", + " sample_index_to_drop=0,\n", + " weights=freq_feat,\n", + " frac=0.1,\n", + " random_state=42,)\n", + "val_X, train_X = val_X.unstack(), train_X.unstack()\n", + "val_X = pd.DataFrame(pd.NA, index=train_X.index,\n", + " columns=train_X.columns).fillna(val_X)" + ] + }, + { + "cell_type": "markdown", + "id": "45b9c22c", + "metadata": {}, + "source": [ + "Training data and validation data have the same shape:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "873f3668", + "metadata": {}, + "outputs": [], + "source": [ + "val_X.shape, train_X.shape" + ] + }, + { + "cell_type": "markdown", + "id": "f89fb41f", + "metadata": {}, + "source": [ + "... 
but different number of intensities (non-missing values):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62cd0721", + "metadata": {}, + "outputs": [], + "source": [ + "train_X.notna().sum().sum(), val_X.notna().sum().sum()," + ] + }, + { + "cell_type": "markdown", + "id": "b5a0b973", + "metadata": {}, + "source": [ + "Select either `DAE` or `VAE` model:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26a12a3e", + "metadata": {}, + "outputs": [], + "source": [ + "model_selected = 'VAE' # 'DAE'\n", + "model = AETransformer(\n", + " model=model_selected,\n", + " hidden_layers=[512,],\n", + " latent_dim=50,\n", + " out_folder='runs/scikit_interface',\n", + " batch_size=10,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d3c7922", + "metadata": {}, + "outputs": [], + "source": [ + "model.fit(train_X, val_X,\n", + " epochs_max=50,\n", + " cuda=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24ca6c2c", + "metadata": {}, + "outputs": [], + "source": [ + "df_imputed = model.transform(train_X)\n", + "df_imputed" + ] + }, + { + "cell_type": "markdown", + "id": "17398941", + "metadata": {}, + "source": [ + "Evaluate the model using the validation data:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a664072", + "metadata": {}, + "outputs": [], + "source": [ + "pred_val = val_X.stack().to_frame('observed')\n", + "pred_val[model_selected] = df_imputed.stack()\n", + "pred_val" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6eaa510a", + "metadata": {}, + "outputs": [], + "source": [ + "val_metrics = vaep.models.calculte_metrics(pred_val, 'observed')\n", + "# val_metrics = metrics.add_metrics(\n", + "# pred_val, key='test data')\n", + "# val_metrics = pd.DataFrame(val_metrics)\n", + "# val_metrics\n", + "pd.DataFrame(val_metrics)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3013daf", + 
"metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots(figsize=(8, 2))\n", + "\n", + "ax, errors_binned = vaep.plotting.errors.plot_errors_by_median(\n", + " pred=pred_val,\n", + " target_col='observed',\n", + " feat_medians=train_X.median(),\n", + " ax=ax,\n", + " metric_name='MAE',\n", + " palette=color_model_mapping\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "e215dba2", + "metadata": {}, + "source": [ + "replace predicted values with validation data values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f89799e8", + "metadata": {}, + "outputs": [], + "source": [ + "df_imputed = df_imputed.replace(val_X)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "235fdb66", + "metadata": {}, + "outputs": [], + "source": [ + "df = df.stack() # long-format\n", + "df_imputed = df_imputed.stack() # long-format\n", + "observed = df_imputed.loc[df.index]\n", + "imputed = df_imputed.loc[df_imputed.index.difference(df.index)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "851ab631", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(2, figsize=(8,4))\n", + "\n", + "min_max = vaep.plotting.data.get_min_max_iterable(\n", + " [observed, imputed])\n", + "label_template = '{method} (N={n:,d})'\n", + "ax, _ = vaep.plotting.data.plot_histogram_intensities(\n", + " observed,\n", + " ax=axes[0],\n", + " min_max=min_max,\n", + " label=label_template.format(method='measured',\n", + " n=len(observed),\n", + " ),\n", + " color='grey',\n", + " alpha=1)\n", + "_ = ax.legend()\n", + "ax, _ = vaep.plotting.data.plot_histogram_intensities(\n", + " imputed,\n", + " ax=axes[1],\n", + " min_max=min_max,\n", + " label=label_template.format(method=f'{model_selected} imputed',\n", + " n=len(imputed),\n", + " ),\n", + " color=color_model_mapping[model_selected],\n", + " alpha=1)\n", + "_ = ax.legend()" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "a235f133", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "-all", + "main_language": "python", + "notebook_metadata_filter": "-all" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/project/04_1_train_pimms_models.py b/project/04_1_train_pimms_models.py new file mode 100644 index 000000000..9c24fd783 --- /dev/null +++ b/project/04_1_train_pimms_models.py @@ -0,0 +1,282 @@ +# %% [markdown] +# # Scikit-learn style transformers of the data +# +# 1. Load data into pandas dataframe +# 2. Fit transformer on training data +# 3. Impute only missing values with predictions from model +# +# Autoencoders need wide training data, i.e. a sample with all its features' intensities, whereas +# Collaborative Filtering needs long training data, i.e. a sample identifier, a feature identifier and the intensity. +# Both data formats can be transformed into each other, but models using long data format do not need to +# take care of missing values.
+ +# %% +import os +IN_COLAB = 'COLAB_GPU' in os.environ +if IN_COLAB: + print("Install PIMMS...") + !pip install git+https://github.com/RasmussenLab/pimms.git@dev + # !pip install pimms-learn + fn_intensities = 'https://raw.githubusercontent.com/RasmussenLab/pimms/main/project/data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv' +else: + fn_intensities = 'data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv' + +# %% +import numpy as np +import pandas as pd + +import matplotlib.pyplot as plt + + +from vaep.plotting.defaults import color_model_mapping +import vaep.plotting.data +import vaep.sampling + +from vaep.sklearn.cf_transformer import CollaborativeFilteringTransformer +from vaep.sklearn.ae_transformer import AETransformer + +vaep.plotting.make_large_descriptors(8) + +# %% [markdown] +# ## Data + +# %% +df = pd.read_csv(fn_intensities, index_col=0) +df.head() + +# %% [markdown] +# We will need the data in long format for Collaborative Filtering. +# Naming both the row and column index assures +# that the data can be transformed very easily into long format: + +# %% +df.index.name = 'Sample ID' # already set +df.columns.name = 'protein group' # not set due to csv disk file format +df.head() + +# %% [markdown] +# Transform to long-data format: + +# %% +df = df.stack().to_frame('intensity') +df.head() + +# %% [markdown] +# Transform the data using the logarithm, here using base 2: + +# %% +df = np.log2(df) +df.head() + +# %% [markdown] +# The resulting DataFrame with one column has a `MultiIndex` with the sample and feature identifier. + +# %% +CollaborativeFilteringTransformer? + +# %% [markdown] +# Let's set up collaborative filtering without a validation or test set, using +# all the data there is.
+ +# %% +cf_model = CollaborativeFilteringTransformer( + target_column='intensity', + sample_column='Sample ID', + item_column='protein group', + out_folder='runs/scikit_interface') + +# %% [markdown] +# We use `fit` and `transform` to train the model and impute the missing values. +# > Scikit-learn's interface requires `X` and `y`. `y` is the validation data in our context. +# > We might have to change the interface to allow usage within pipelines (-> `y` is not needed). +# > This will probably mean setting up a validation set within the model. + +# %% +cf_model.fit(df, + cuda=True, + epochs_max=20, + ) + +# %% +df_imputed = cf_model.transform(df).unstack() +assert df_imputed.isna().sum().sum() == 0 +df_imputed.head() + +# %% [markdown] +# Let's plot the distribution of the imputed values vs the ones used for training: + +# %% +df_imputed = df_imputed.stack() # long-format +observed = df_imputed.loc[df.index] +imputed = df_imputed.loc[df_imputed.index.difference(df.index)] +df_imputed = df_imputed.unstack() # back to wide-format +# some checks +assert len(df) == len(observed) +assert df_imputed.shape[0] * df_imputed.shape[1] == len(imputed) + len(observed) + +# %% +fig, axes = plt.subplots(2, figsize=(8,4)) + +min_max = vaep.plotting.data.get_min_max_iterable( + [observed, imputed]) +label_template = '{method} (N={n:,d})' +ax, _ = vaep.plotting.data.plot_histogram_intensities( + observed, + ax=axes[0], + min_max=min_max, + label=label_template.format(method='measured', + n=len(observed), + ), + color='grey', + alpha=1) +_ = ax.legend() +ax, _ = vaep.plotting.data.plot_histogram_intensities( + imputed, + ax=axes[1], + min_max=min_max, + label=label_template.format(method='CF imputed', + n=len(imputed), + ), + color=color_model_mapping['CF'], + alpha=1) +_ = ax.legend() + +# %% [markdown] +# ## AutoEncoder architectures + +# %% +# Reload data (for demonstration) + +df = pd.read_csv(fn_intensities, index_col=0) +df.index.name = 'Sample ID' # already set
+df.columns.name = 'protein group' # not set due to csv disk file format +df = np.log2(df) # log transform +df.head() + +# %% [markdown] +# The AutoEncoder model currently needs validation data for training. +# We will use 10% of the training data for validation. +# > Expect this limitation to be dropped in the next release. It will still be recommended +# > to use validation data for early stopping. + +# %% +freq_feat = df.notna().sum() +freq_feat.head() # training data + +# %% [markdown] +# We will use the `sampling` module to sample the validation data from the training data. +# Could be split differently by providing another `weights` vector. + +# %% +val_X, train_X = vaep.sampling.sample_data(df.stack(), + sample_index_to_drop=0, + weights=freq_feat, + frac=0.1, + random_state=42,) +val_X, train_X = val_X.unstack(), train_X.unstack() +val_X = pd.DataFrame(pd.NA, index=train_X.index, + columns=train_X.columns).fillna(val_X) + +# %% [markdown] +# Training data and validation data have the same shape: + +# %% +val_X.shape, train_X.shape + +# %% [markdown] +# ...
but different number of intensities (non-missing values): + +# %% +train_X.notna().sum().sum(), val_X.notna().sum().sum(), + +# %% [markdown] +# Select either `DAE` or `VAE` model: + +# %% +model_selected = 'VAE' # 'DAE' +model = AETransformer( + model=model_selected, + hidden_layers=[512,], + latent_dim=50, + out_folder='runs/scikit_interface', + batch_size=10, +) + +# %% +model.fit(train_X, val_X, + epochs_max=50, + cuda=True) + +# %% +df_imputed = model.transform(train_X) +df_imputed + +# %% [markdown] +# Evaluate the model using the validation data: + +# %% +pred_val = val_X.stack().to_frame('observed') +pred_val[model_selected] = df_imputed.stack() +pred_val + +# %% +val_metrics = vaep.models.calculte_metrics(pred_val, 'observed') +# val_metrics = metrics.add_metrics( +# pred_val, key='test data') +# val_metrics = pd.DataFrame(val_metrics) +# val_metrics +pd.DataFrame(val_metrics) + +# %% +fig, ax = plt.subplots(figsize=(8, 2)) + +ax, errors_binned = vaep.plotting.errors.plot_errors_by_median( + pred=pred_val, + target_col='observed', + feat_medians=train_X.median(), + ax=ax, + metric_name='MAE', + palette=color_model_mapping +) + +# %% [markdown] +# replace predicted values with validation data values + +# %% +df_imputed = df_imputed.replace(val_X) + +# %% +df = df.stack() # long-format +df_imputed = df_imputed.stack() # long-format +observed = df_imputed.loc[df.index] +imputed = df_imputed.loc[df_imputed.index.difference(df.index)] + +# %% +fig, axes = plt.subplots(2, figsize=(8,4)) + +min_max = vaep.plotting.data.get_min_max_iterable( + [observed, imputed]) +label_template = '{method} (N={n:,d})' +ax, _ = vaep.plotting.data.plot_histogram_intensities( + observed, + ax=axes[0], + min_max=min_max, + label=label_template.format(method='measured', + n=len(observed), + ), + color='grey', + alpha=1) +_ = ax.legend() +ax, _ = vaep.plotting.data.plot_histogram_intensities( + imputed, + ax=axes[1], + min_max=min_max, + 
label=label_template.format(method=f'{model_selected} imputed', + n=len(imputed), + ), + color=color_model_mapping[model_selected], + alpha=1) +_ = ax.legend() + + +# %% diff --git a/project/10_4_ald_compare_single_pg.ipynb b/project/10_4_ald_compare_single_pg.ipynb index c38e174cf..a5e0612f1 100644 --- a/project/10_4_ald_compare_single_pg.ipynb +++ b/project/10_4_ald_compare_single_pg.ipynb @@ -88,9 +88,7 @@ "cell_type": "code", "execution_count": null, "id": "b85c6c2a-146c-48bd-9d7b-1fe4eec8a6ae", - "metadata": { - "tags": [] - }, + "metadata": {}, "outputs": [], "source": [ "params = vaep.nb.get_params(args, globals=globals())\n", @@ -175,16 +173,6 @@ " logger.info(f'Added reference method scores from {args.ref_method_score}')" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "d514e013", - "metadata": { - "lines_to_next_cell": 0 - }, - "outputs": [], - "source": [] - }, { "cell_type": "markdown", "id": "79746f59", @@ -387,9 +375,7 @@ "cell_type": "code", "execution_count": null, "id": "624d3301", - "metadata": { - "tags": [] - }, + "metadata": {}, "outputs": [], "source": [ "target = pd.read_csv(args.fn_clinical_data,\n", @@ -620,7 +606,7 @@ "\n", " label_template = '{method} (N={n:,d}, q={q:.3f})'\n", " # observed data\n", - " vaep.plotting.data.plot_histogram_intensites(\n", + " vaep.plotting.data.plot_histogram_intensities(\n", " feat_observed,\n", " ax=ax,\n", " min_max=min_max,\n", @@ -645,7 +631,7 @@ " n=len(pred),\n", " q=float(qvalues.loc[idx, (method, 'qvalue')]\n", " ))\n", - " ax, bins = vaep.plotting.data.plot_histogram_intensites(\n", + " ax, bins = vaep.plotting.data.plot_histogram_intensities(\n", " pred,\n", " ax=ax,\n", " min_max=min_max,\n", diff --git a/project/10_4_ald_compare_single_pg.py b/project/10_4_ald_compare_single_pg.py index c381c86a2..7b3c9279c 100644 --- a/project/10_4_ald_compare_single_pg.py +++ b/project/10_4_ald_compare_single_pg.py @@ -107,7 +107,6 @@ scores = scores.join(scores_reference) 
logger.info(f'Added reference method scores from {args.ref_method_score}') -# %% # %% [markdown] # ## Load frequencies of observed features @@ -322,7 +321,7 @@ label_template = '{method} (N={n:,d}, q={q:.3f})' # observed data - vaep.plotting.data.plot_histogram_intensites( + vaep.plotting.data.plot_histogram_intensities( feat_observed, ax=ax, min_max=min_max, @@ -347,7 +346,7 @@ n=len(pred), q=float(qvalues.loc[idx, (method, 'qvalue')] )) - ax, bins = vaep.plotting.data.plot_histogram_intensites( + ax, bins = vaep.plotting.data.plot_histogram_intensities( pred, ax=ax, min_max=min_max, diff --git a/project/data/mqpar_example.xml b/project/data/mqpar_example.xml new file mode 100644 index 000000000..671e1e655 --- /dev/null +++ b/project/data/mqpar_example.xml @@ -0,0 +1,473 @@ + + + + + /home/fasta/UP000005640_9606.fasta + >.*\|(.*)\| + >.*\|(.*)\| + + + + 9606 + + + /home/fasta/UP000005640_9606_additional.fasta + >.*\|(.*)\| + >.*\|(.*)\| + + + + 9606 + + + + + + + + 350000 + True + 0.005 + False + False + True + True + revert + all + True + 4600 + True + True + True + 0 + 6 + 0 + 40 + True + False + False + False + False + 0 + 0 + False + False + False + False + 0 + False + False + False + False + False + False + Species + False + 3 + False + True + False + True + False + False + + + + 7 + 0.01 + 0.01 + 0.01 + 0.01 + 8 + 25 + True + 1 + 1 + 0 + False + True + False + + 2 + True + + Oxidation (M) + Acetyl (Protein N-term) + + 0 + 0 + 0 + 0 + 15 + 0 + 1 + + + 200 + False + True + True + True + True + True + True + False + False + False + True + False + 0 + 20 + + none + False + session1 + 1.6.12.0 + + + 4 + 1 + + + + + -1.79589544172745E+308 + 1.79589544172745E+308 + False + False + False + False + False + False + False + + /home/mq_out/example.raw + + + + + + 32767 + + + False + + + 0 + + + + + False + False + + + 0 + 7 + 2 + 1 + False + False + True + 1 + NaN + NaN + MatchFromAndTo + 0 + 8 + True + 35 + True + 1.4 + 1.2 + False + 0 + + + + Standard + False + 0 + 3 
+ 6 + 100000 + 0 + 0 + False + False + True + False + 2 + 0 + 5 + 2 + 1 + 0 + 0 + 0 + 0 + + Carbamidomethyl (C) + + + Trypsin/P + + + + 0 + False + False + + Oxidation (M) + Acetyl (Protein N-term) + + False + + + + + + + + + False + + + + + True + 20 + 4.5 + True + 2 + True + 0.6 + 0.6 + True + True + False + 70 + False + + 0 + 0 + 0 + NaN + NaN + False + NaN + NaN + 0 + 0 + 0 + 0 + True + False + True + False + + 0 + 6 + False + 0 + 0 + 0 + 0 + + + + + + + + + + False + True + 0.75 + 0 + + + + + + + + 20 + 20 + 0.85 + 2 + 2 + 7 + 1.99 + 0.4 + 0.65 + 0 + 6 + 1 + 3 + 0 + 0.8 + 0 + 1 + 0.5 + 0 + 3 + 25 + 260 + True + + + + + FTMS + 20 + True + 7 + True + 10 + True + True + 12 + 100 + True + True + True + True + False + + + ITMS + 0.5 + False + 0.15 + False + 0.25 + False + False + 8 + 100 + True + True + True + True + False + + + TOF + 40 + True + 0.01 + False + 0.02 + False + True + 10 + 100 + True + True + True + True + False + + + Unknown + 20 + True + 7 + True + 10 + True + True + 12 + 100 + True + True + True + True + False + + + + + CID + False + 1 + 1 + 1 + False + 1 + KRH + + + HCD + False + 1 + 1 + 1 + False + 1 + KRH + + + ETD + False + 1 + 1 + 1 + False + 1 + KRH + + + PQD + False + 1 + 1 + 1 + False + 1 + KRH + + + ETHCD + False + 1 + 1 + 1 + False + 1 + KRH + + + ETCID + False + 1 + 1 + 1 + False + 1 + KRH + + + UVPD + False + 1 + 1 + 1 + False + 1 + KRH + + + Unknown + False + 1 + 1 + 1 + False + 1 + KRH + + + diff --git a/project/doc/Figures/Molecular_structures_of_the_21_proteinogenic_amino_acids.svg b/project/doc/Figures/Molecular_structures_of_the_21_proteinogenic_amino_acids.svg deleted file mode 100644 index 87541196b..000000000 --- a/project/doc/Figures/Molecular_structures_of_the_21_proteinogenic_amino_acids.svg +++ /dev/null @@ -1,5886 +0,0 @@ -image/svg+xml \ No newline at end of file diff --git a/project/doc/Figures/amino_acids_iupac_iub_1983.png b/project/doc/Figures/amino_acids_iupac_iub_1983.png deleted file mode 100644 index 
f831fa60a..000000000 Binary files a/project/doc/Figures/amino_acids_iupac_iub_1983.png and /dev/null differ diff --git a/project/doc/Figures/fig_proteomics_pipeline.png b/project/doc/Figures/fig_proteomics_pipeline.png deleted file mode 100644 index 9f811ad9c..000000000 Binary files a/project/doc/Figures/fig_proteomics_pipeline.png and /dev/null differ diff --git a/project/doc/Figures/schema_orbitrap_instrument.jpg b/project/doc/Figures/schema_orbitrap_instrument.jpg deleted file mode 100644 index d8c0a5823..000000000 Binary files a/project/doc/Figures/schema_orbitrap_instrument.jpg and /dev/null differ diff --git a/project/doc/MaxQuant.md b/project/doc/MaxQuant.md deleted file mode 100644 index 70f78b280..000000000 --- a/project/doc/MaxQuant.md +++ /dev/null @@ -1,11 +0,0 @@ -# MaxQuant - -MaxQuant `txt` output folder is used for further analysis. - -##### Retention Time (RT) -Files containing retention time (RT) information: -- `evidence.txt` (probably all you ever need) -- `allPeptides.txt` -- `modificationSpecificPeptides.txt` -- `msms.txt` -- `msmsScans.txt` \ No newline at end of file diff --git a/project/doc/ipynbs/01_FASTA_data_agg_by_gene.md b/project/doc/ipynbs/01_FASTA_data_agg_by_gene.md deleted file mode 100644 index 725a4f331..000000000 --- a/project/doc/ipynbs/01_FASTA_data_agg_by_gene.md +++ /dev/null @@ -1,669 +0,0 @@ -# Protein sequence aggregation by gene - - -```python -from collections import defaultdict -import json -from tqdm.notebook import tqdm - -import numpy as np -import pandas as pd - -from src.src.config import FN_FASTA_DB -from src.src.config import fasta_entry as fasta_keys -``` - - -```python -with open(FN_FASTA_DB) as f: - data_fasta = json.load(f)#, indent=4, sort_keys=False) -len(data_fasta) -``` - - - - - 96418 - - - - -```python -gene_isotopes = defaultdict(list) -protein_wo_gene = [] -for key, fasta_entry in tqdm(data_fasta.items()): - gene = fasta_entry[fasta_keys.gene] - if gene: - gene_isotopes[gene].append(key) - else: - 
protein_wo_gene.append(key) - -print(f"#{len(protein_wo_gene)} proteins have not gene associated: {', '.join(protein_wo_gene[:10])}, ...") -``` - - - HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=96418.0), HTML(value=''))) - - - - #851 proteins have not gene associated: A0A075B6T3, A0A075B732, A0A087WT57, A0A087WUV0, A0A087WVE0, A0A087WW49, A0A087WWC5, A0A087WWL8, A0A087WWU0, A0A087WX66... - - - -```python -gene = 'ACTG1' # Actin as a contaminant protein -gene_isotopes[gene] -``` - - - - - ['P63261', 'I3L1U9', 'I3L3I0', 'I3L3R2', 'I3L4N8', 'J3KT65', 'K7EM38'] - - - - -```python -from pprint import pprint -for isotope in gene_isotopes[gene]: - pprint(data_fasta[isotope]) -``` - - {'gene': 'ACTG1', - 'meta': '>sp|P63261|ACTG_HUMAN Actin, cytoplasmic 2 OS=Homo sapiens OX=9606 ' - 'GN=ACTG1 PE=1 SV=1', - 'peptides': [['MEEEIAALVIDNGSGMCK', - 'AGFAGDDAPR', - 'AVFPSIVGR', - 'HQGVMVGMGQK', - 'DSYVGDEAQSK', - 'YPIEHGIVTNWDDMEK', - 'IWHHTFYNELR', - 'VAPEEHPVLLTEAPLNPK', - 'MTQIMFETFNTPAMYVAIQAVLSLYASGR', - 'TTGIVMDSGDGVTHTVPIYEGYALPHAILR', - 'DLTDYLMK', - 'GYSFTTTAER', - 'LCYVALDFEQEMATAASSSSLEK', - 'SYELPDGQVITIGNER', - 'CPEALFQPSFLGMESCGIHETTFNSIMK', - 'DLYANTVLSGGTTMYPGIADR', - 'EITALAPSTMK', - 'IIAPPER', - 'YSVWIGGSILASLSTFQQMWISK', - 'QEYDESGPSIVHR'], - ['MEEEIAALVIDNGSGMCKAGFAGDDAPR', - 'AGFAGDDAPRAVFPSIVGR', - 'AVFPSIVGRPR', - 'PRHQGVMVGMGQK', - 'HQGVMVGMGQKDSYVGDEAQSK', - 'DSYVGDEAQSKR', - 'RGILTLK', - 'GILTLKYPIEHGIVTNWDDMEK', - 'YPIEHGIVTNWDDMEKIWHHTFYNELR', - 'IWHHTFYNELRVAPEEHPVLLTEAPLNPK', - 'VAPEEHPVLLTEAPLNPKANR', - 'EKMTQIMFETFNTPAMYVAIQAVLSLYASGR', - 'MTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILR', - 'TTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGR', - 'LDLAGRDLTDYLMK', - 'DLTDYLMKILTER', - 'ILTERGYSFTTTAER', - 'GYSFTTTAEREIVR', - 'EIVRDIK', - 'EKLCYVALDFEQEMATAASSSSLEK', - 'LCYVALDFEQEMATAASSSSLEKSYELPDGQVITIGNER', - 'SYELPDGQVITIGNERFR', - 'FRCPEALFQPSFLGMESCGIHETTFNSIMK', - 'CPEALFQPSFLGMESCGIHETTFNSIMKCDVDIR', - 'CDVDIRK', - 
'KDLYANTVLSGGTTMYPGIADR', - 'DLYANTVLSGGTTMYPGIADRMQK', - 'MQKEITALAPSTMK', - 'EITALAPSTMKIK', - 'IKIIAPPER', - 'IIAPPERK', - 'KYSVWIGGSILASLSTFQQMWISK', - 'YSVWIGGSILASLSTFQQMWISKQEYDESGPSIVHR', - 'QEYDESGPSIVHRK'], - ['MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGR', - 'AGFAGDDAPRAVFPSIVGRPR', - 'AVFPSIVGRPRHQGVMVGMGQK', - 'PRHQGVMVGMGQKDSYVGDEAQSK', - 'HQGVMVGMGQKDSYVGDEAQSKR', - 'DSYVGDEAQSKRGILTLK', - 'RGILTLKYPIEHGIVTNWDDMEK', - 'GILTLKYPIEHGIVTNWDDMEKIWHHTFYNELR', - 'YPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPK', - 'IWHHTFYNELRVAPEEHPVLLTEAPLNPKANR', - 'VAPEEHPVLLTEAPLNPKANREK', - 'ANREKMTQIMFETFNTPAMYVAIQAVLSLYASGR', - 'EKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILR', - 'MTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGR', - 'TTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGRDLTDYLMK', - 'LDLAGRDLTDYLMKILTER', - 'DLTDYLMKILTERGYSFTTTAER', - 'ILTERGYSFTTTAEREIVR', - 'GYSFTTTAEREIVRDIK', - 'EIVRDIKEK', - 'DIKEKLCYVALDFEQEMATAASSSSLEK', - 'EKLCYVALDFEQEMATAASSSSLEKSYELPDGQVITIGNER', - 'LCYVALDFEQEMATAASSSSLEKSYELPDGQVITIGNERFR', - 'SYELPDGQVITIGNERFRCPEALFQPSFLGMESCGIHETTFNSIMK', - 'FRCPEALFQPSFLGMESCGIHETTFNSIMKCDVDIR', - 'CPEALFQPSFLGMESCGIHETTFNSIMKCDVDIRK', - 'CDVDIRKDLYANTVLSGGTTMYPGIADR', - 'KDLYANTVLSGGTTMYPGIADRMQK', - 'DLYANTVLSGGTTMYPGIADRMQKEITALAPSTMK', - 'MQKEITALAPSTMKIK', - 'EITALAPSTMKIKIIAPPER', - 'IKIIAPPERK', - 'IIAPPERKYSVWIGGSILASLSTFQQMWISK', - 'KYSVWIGGSILASLSTFQQMWISKQEYDESGPSIVHR', - 'YSVWIGGSILASLSTFQQMWISKQEYDESGPSIVHRK', - 'QEYDESGPSIVHRKCF']], - 'seq': 'MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGRDLTDYLMKILTERGYSFTTTAEREIVRDIKEKLCYVALDFEQEMATAASSSSLEKSYELPDGQVITIGNERFRCPEALFQPSFLGMESCGIHETTFNSIMKCDVDIRKDLYANTVLSGGTTMYPGIADRMQKEITALAPSTMKIKIIAPPERKYSVWIGGSILASLSTFQQMWISKQEYDESGPSIVHRKCF'} - {'gene': 'ACTG1', - 'meta': '>tr|I3L1U9|I3L1U9_HUMAN Isoform of P63261, Actin, cytoplasmic 2 ' - 
'(Fragment) OS=Homo sapiens OX=9606 GN=ACTG1 PE=1 SV=1', - 'peptides': [['MEEEIAALVIDNGSGMCK', - 'AGFAGDDAPR', - 'AVFPSIVGR', - 'HQGVMVGMGQK', - 'DSYVGDEAQSK', - 'YPIEHGIVTNWDDMEK', - 'IWHHTFYNELR', - 'VAPEEHPVLLTEAPLNPK', - 'MTQIMFETFNTPAMYVAIQAVLSLYASGR', - 'TTGIVMDSGDGVTHTVPIYEGYALPHAILR', - 'DLTDYLMK', - 'GYSFTTTAER'], - ['MEEEIAALVIDNGSGMCKAGFAGDDAPR', - 'AGFAGDDAPRAVFPSIVGR', - 'AVFPSIVGRPR', - 'PRHQGVMVGMGQK', - 'HQGVMVGMGQKDSYVGDEAQSK', - 'DSYVGDEAQSKR', - 'RGILTLK', - 'GILTLKYPIEHGIVTNWDDMEK', - 'YPIEHGIVTNWDDMEKIWHHTFYNELR', - 'IWHHTFYNELRVAPEEHPVLLTEAPLNPK', - 'VAPEEHPVLLTEAPLNPKANR', - 'EKMTQIMFETFNTPAMYVAIQAVLSLYASGR', - 'MTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILR', - 'TTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGR', - 'LDLAGRDLTDYLMK', - 'DLTDYLMKILTER', - 'ILTERGYSFTTTAER', - 'GYSFTTTAEREIVR', - 'EIVRDIK'], - ['MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGR', - 'AGFAGDDAPRAVFPSIVGRPR', - 'AVFPSIVGRPRHQGVMVGMGQK', - 'PRHQGVMVGMGQKDSYVGDEAQSK', - 'HQGVMVGMGQKDSYVGDEAQSKR', - 'DSYVGDEAQSKRGILTLK', - 'RGILTLKYPIEHGIVTNWDDMEK', - 'GILTLKYPIEHGIVTNWDDMEKIWHHTFYNELR', - 'YPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPK', - 'IWHHTFYNELRVAPEEHPVLLTEAPLNPKANR', - 'VAPEEHPVLLTEAPLNPKANREK', - 'ANREKMTQIMFETFNTPAMYVAIQAVLSLYASGR', - 'EKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILR', - 'MTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGR', - 'TTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGRDLTDYLMK', - 'LDLAGRDLTDYLMKILTER', - 'DLTDYLMKILTERGYSFTTTAER', - 'ILTERGYSFTTTAEREIVR', - 'GYSFTTTAEREIVRDIK', - 'EIVRDIKE']], - 'seq': 'MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGRDLTDYLMKILTERGYSFTTTAEREIVRDIKE'} - {'gene': 'ACTG1', - 'meta': '>tr|I3L3I0|I3L3I0_HUMAN Isoform of P63261, Actin, cytoplasmic 2 ' - '(Fragment) OS=Homo sapiens OX=9606 GN=ACTG1 PE=1 SV=1', - 'peptides': [['MEEEIAALVIDNGSGMCK', - 'AGFAGDDAPR', - 
'AVFPSIVGR', - 'HQGVMVGMGQK', - 'DSYVGDEAQSK', - 'YPIEHGIVTNWDDMEK', - 'IWHHTFYNELR', - 'VAPEEHPVLLTEAPLNPK', - 'MTQIMFETFNTPAMYVAIQAVLSLYASGR', - 'TTGIVMDSGDGVTHTVPIYEGYALPHAILR', - 'DLTDYLMK', - 'GYSFTTTAER'], - ['MEEEIAALVIDNGSGMCKAGFAGDDAPR', - 'AGFAGDDAPRAVFPSIVGR', - 'AVFPSIVGRPR', - 'PRHQGVMVGMGQK', - 'HQGVMVGMGQKDSYVGDEAQSK', - 'DSYVGDEAQSKR', - 'RGILTLK', - 'GILTLKYPIEHGIVTNWDDMEK', - 'YPIEHGIVTNWDDMEKIWHHTFYNELR', - 'IWHHTFYNELRVAPEEHPVLLTEAPLNPK', - 'VAPEEHPVLLTEAPLNPKANR', - 'EKMTQIMFETFNTPAMYVAIQAVLSLYASGR', - 'MTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILR', - 'TTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGR', - 'LDLAGRDLTDYLMK', - 'DLTDYLMKILTER', - 'ILTERGYSFTTTAER', - 'GYSFTTTAEREIVR', - 'EIVRDIK'], - ['MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGR', - 'AGFAGDDAPRAVFPSIVGRPR', - 'AVFPSIVGRPRHQGVMVGMGQK', - 'PRHQGVMVGMGQKDSYVGDEAQSK', - 'HQGVMVGMGQKDSYVGDEAQSKR', - 'DSYVGDEAQSKRGILTLK', - 'RGILTLKYPIEHGIVTNWDDMEK', - 'GILTLKYPIEHGIVTNWDDMEKIWHHTFYNELR', - 'YPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPK', - 'IWHHTFYNELRVAPEEHPVLLTEAPLNPKANR', - 'VAPEEHPVLLTEAPLNPKANREK', - 'ANREKMTQIMFETFNTPAMYVAIQAVLSLYASGR', - 'EKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILR', - 'MTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGR', - 'TTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGRDLTDYLMK', - 'LDLAGRDLTDYLMKILTER', - 'DLTDYLMKILTERGYSFTTTAER', - 'ILTERGYSFTTTAEREIVR', - 'GYSFTTTAEREIVRDIK', - 'EIVRDIKE']], - 'seq': 'MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGRDLTDYLMKILTERGYSFTTTAEREIVRDIKE'} - {'gene': 'ACTG1', - 'meta': '>tr|I3L3R2|I3L3R2_HUMAN Isoform of P63261, Actin, cytoplasmic 2 ' - '(Fragment) OS=Homo sapiens OX=9606 GN=ACTG1 PE=1 SV=1', - 'peptides': [['MEEEIAALVIDNGSGMCK', - 'AGFAGDDAPR', - 'AVFPSIVGR', - 'HQGVMVGMGQK', - 'DSYVGDEAQSK', - 'YPIEHGIVTNWDDMEK', - 'IWHHTFYNELR', - 'VAPEEHPVLLTEAPLNPK', - 
'MTQIMFETFNTPAMYVAIQAVLSLYASGR', - 'TTGIVMDSGDGVTHTVP'], - ['MEEEIAALVIDNGSGMCKAGFAGDDAPR', - 'AGFAGDDAPRAVFPSIVGR', - 'AVFPSIVGRPR', - 'PRHQGVMVGMGQK', - 'HQGVMVGMGQKDSYVGDEAQSK', - 'DSYVGDEAQSKR', - 'RGILTLK', - 'GILTLKYPIEHGIVTNWDDMEK', - 'YPIEHGIVTNWDDMEKIWHHTFYNELR', - 'IWHHTFYNELRVAPEEHPVLLTEAPLNPK', - 'VAPEEHPVLLTEAPLNPKANR', - 'EKMTQIMFETFNTPAMYVAIQAVLSLYASGR', - 'MTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVP'], - ['MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGR', - 'AGFAGDDAPRAVFPSIVGRPR', - 'AVFPSIVGRPRHQGVMVGMGQK', - 'PRHQGVMVGMGQKDSYVGDEAQSK', - 'HQGVMVGMGQKDSYVGDEAQSKR', - 'DSYVGDEAQSKRGILTLK', - 'RGILTLKYPIEHGIVTNWDDMEK', - 'GILTLKYPIEHGIVTNWDDMEKIWHHTFYNELR', - 'YPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPK', - 'IWHHTFYNELRVAPEEHPVLLTEAPLNPKANR', - 'VAPEEHPVLLTEAPLNPKANREK', - 'ANREKMTQIMFETFNTPAMYVAIQAVLSLYASGR', - 'EKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVP']], - 'seq': 'MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVP'} - {'gene': 'ACTG1', - 'meta': '>tr|I3L4N8|I3L4N8_HUMAN Isoform of P63261, Actin, cytoplasmic 2 ' - '(Fragment) OS=Homo sapiens OX=9606 GN=ACTG1 PE=1 SV=8', - 'peptides': [['MEEEIAALVIDNGSGMCK', - 'AGFAGDDAPR', - 'AVFPSIVGR', - 'HQDSYVGDEAQSK', - 'YPIEHGIVTNWDDMEK', - 'IWHHTFYNELR', - 'VAPEEHPVLLTEAPLNPK', - 'MTQIMFETFNTPAMYVAIQAVLSLYASGR', - 'TTGIVMDSGDGVTHTVPIYEGYALPHAILR', - 'DLTDYLMK', - 'GYSFTTTAER', - 'LCYVALDFEQEMATAASSSSLEK', - 'SYELPDGQVITI'], - ['MEEEIAALVIDNGSGMCKAGFAGDDAPR', - 'AGFAGDDAPRAVFPSIVGR', - 'AVFPSIVGRPR', - 'PRHQDSYVGDEAQSK', - 'HQDSYVGDEAQSKR', - 'RGILTLK', - 'GILTLKYPIEHGIVTNWDDMEK', - 'YPIEHGIVTNWDDMEKIWHHTFYNELR', - 'IWHHTFYNELRVAPEEHPVLLTEAPLNPK', - 'VAPEEHPVLLTEAPLNPKANR', - 'EKMTQIMFETFNTPAMYVAIQAVLSLYASGR', - 'MTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILR', - 'TTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGR', - 'LDLAGRDLTDYLMK', - 'DLTDYLMKILTER', - 'ILTERGYSFTTTAER', - 'GYSFTTTAEREIVR', 
- 'EIVRDIK', - 'EKLCYVALDFEQEMATAASSSSLEK', - 'LCYVALDFEQEMATAASSSSLEKSYELPDGQVITI'], - ['MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGR', - 'AGFAGDDAPRAVFPSIVGRPR', - 'AVFPSIVGRPRHQDSYVGDEAQSK', - 'PRHQDSYVGDEAQSKR', - 'HQDSYVGDEAQSKRGILTLK', - 'RGILTLKYPIEHGIVTNWDDMEK', - 'GILTLKYPIEHGIVTNWDDMEKIWHHTFYNELR', - 'YPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPK', - 'IWHHTFYNELRVAPEEHPVLLTEAPLNPKANR', - 'VAPEEHPVLLTEAPLNPKANREK', - 'ANREKMTQIMFETFNTPAMYVAIQAVLSLYASGR', - 'EKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILR', - 'MTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGR', - 'TTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGRDLTDYLMK', - 'LDLAGRDLTDYLMKILTER', - 'DLTDYLMKILTERGYSFTTTAER', - 'ILTERGYSFTTTAEREIVR', - 'GYSFTTTAEREIVRDIK', - 'EIVRDIKEK', - 'DIKEKLCYVALDFEQEMATAASSSSLEK', - 'EKLCYVALDFEQEMATAASSSSLEKSYELPDGQVITI']], - 'seq': 'MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGRDLTDYLMKILTERGYSFTTTAEREIVRDIKEKLCYVALDFEQEMATAASSSSLEKSYELPDGQVITI'} - {'gene': 'ACTG1', - 'meta': '>tr|J3KT65|J3KT65_HUMAN Isoform of P63261, Actin, cytoplasmic 2 ' - 'OS=Homo sapiens OX=9606 GN=ACTG1 PE=1 SV=1', - 'peptides': [['MEEEIAALVIDNGSGMCK', - 'AGFAGDDAPR', - 'AVFPSIVGR', - 'HQGVMVGMGQK', - 'DSYVGDEAQSK', - 'YPIEHGIVTNWDDMEK', - 'IWHHTFYNELR', - 'VAPEEHPVLLTEAPLNPK', - 'MTQALFLSR', - 'HFLPEASR', - 'PCTWPSR', - 'PCCPSTPLGAPLALSWTLETGSPTR', - 'ATPSPTPSCVWTWLAGT'], - ['MEEEIAALVIDNGSGMCKAGFAGDDAPR', - 'AGFAGDDAPRAVFPSIVGR', - 'AVFPSIVGRPR', - 'PRHQGVMVGMGQK', - 'HQGVMVGMGQKDSYVGDEAQSK', - 'DSYVGDEAQSKR', - 'RGILTLK', - 'GILTLKYPIEHGIVTNWDDMEK', - 'YPIEHGIVTNWDDMEKIWHHTFYNELR', - 'IWHHTFYNELRVAPEEHPVLLTEAPLNPK', - 'VAPEEHPVLLTEAPLNPKANR', - 'EKMTQALFLSR', - 'MTQALFLSRHFLPEASR', - 'HFLPEASRLCLR', - 'LCLRPSTPR', - 'PSTPRPCTWPSR', - 'PCTWPSRPCCPSTPLGAPLALSWTLETGSPTR', - 'PCCPSTPLGAPLALSWTLETGSPTRCPSTR', - 'CPSTRATPSPTPSCVWTWLAGT'], - 
['MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGR', - 'AGFAGDDAPRAVFPSIVGRPR', - 'AVFPSIVGRPRHQGVMVGMGQK', - 'PRHQGVMVGMGQKDSYVGDEAQSK', - 'HQGVMVGMGQKDSYVGDEAQSKR', - 'DSYVGDEAQSKRGILTLK', - 'RGILTLKYPIEHGIVTNWDDMEK', - 'GILTLKYPIEHGIVTNWDDMEKIWHHTFYNELR', - 'YPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPK', - 'IWHHTFYNELRVAPEEHPVLLTEAPLNPKANR', - 'VAPEEHPVLLTEAPLNPKANREK', - 'ANREKMTQALFLSR', - 'EKMTQALFLSRHFLPEASR', - 'MTQALFLSRHFLPEASRLCLR', - 'HFLPEASRLCLRPSTPR', - 'LCLRPSTPRPCTWPSR', - 'PSTPRPCTWPSRPCCPSTPLGAPLALSWTLETGSPTR', - 'PCTWPSRPCCPSTPLGAPLALSWTLETGSPTRCPSTR', - 'PCCPSTPLGAPLALSWTLETGSPTRCPSTRATPSPTPSCVWTWLAGT']], - 'seq': 'MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKMTQALFLSRHFLPEASRLCLRPSTPRPCTWPSRPCCPSTPLGAPLALSWTLETGSPTRCPSTRATPSPTPSCVWTWLAGT'} - {'gene': 'ACTG1', - 'meta': '>tr|K7EM38|K7EM38_HUMAN Isoform of P63261, Actin, cytoplasmic 2 ' - '(Fragment) OS=Homo sapiens OX=9606 GN=ACTG1 PE=1 SV=1', - 'peptides': [['MEEEIAALVIDNGSGMCK', - 'AGFAGDDAPR', - 'AVFPSIVGR', - 'HQGVMVGMGQK', - 'DSYVGDEAQSK', - 'YPIEHGIVTNWDDMEK', - 'IWHHTFYNELR', - 'VAPEEHPVLLTEAPLNPK', - 'AVLSLYASGR', - 'TTGIVMDSGD'], - ['MEEEIAALVIDNGSGMCKAGFAGDDAPR', - 'AGFAGDDAPRAVFPSIVGR', - 'AVFPSIVGRPR', - 'PRHQGVMVGMGQK', - 'HQGVMVGMGQKDSYVGDEAQSK', - 'DSYVGDEAQSKR', - 'RGILTLK', - 'GILTLKYPIEHGIVTNWDDMEK', - 'YPIEHGIVTNWDDMEKIWHHTFYNELR', - 'IWHHTFYNELRVAPEEHPVLLTEAPLNPK', - 'VAPEEHPVLLTEAPLNPKAVLSLYASGR', - 'AVLSLYASGRTTGIVMDSGD'], - ['MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGR', - 'AGFAGDDAPRAVFPSIVGRPR', - 'AVFPSIVGRPRHQGVMVGMGQK', - 'PRHQGVMVGMGQKDSYVGDEAQSK', - 'HQGVMVGMGQKDSYVGDEAQSKR', - 'DSYVGDEAQSKRGILTLK', - 'RGILTLKYPIEHGIVTNWDDMEK', - 'GILTLKYPIEHGIVTNWDDMEKIWHHTFYNELR', - 'YPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPK', - 'IWHHTFYNELRVAPEEHPVLLTEAPLNPKAVLSLYASGR', - 'VAPEEHPVLLTEAPLNPKAVLSLYASGRTTGIVMDSGD']], - 'seq': 
'MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKAVLSLYASGRTTGIVMDSGD'} - - -## Sequences - - -```python -sequences = {} -for isotope in gene_isotopes[gene]: - sequences[isotope] = data_fasta[isotope][fasta_keys.seq] -sequences -``` - - - - - {'P63261': 'MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGRDLTDYLMKILTERGYSFTTTAEREIVRDIKEKLCYVALDFEQEMATAASSSSLEKSYELPDGQVITIGNERFRCPEALFQPSFLGMESCGIHETTFNSIMKCDVDIRKDLYANTVLSGGTTMYPGIADRMQKEITALAPSTMKIKIIAPPERKYSVWIGGSILASLSTFQQMWISKQEYDESGPSIVHRKCF', - 'I3L1U9': 'MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGRDLTDYLMKILTERGYSFTTTAEREIVRDIKE', - 'I3L3I0': 'MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGRDLTDYLMKILTERGYSFTTTAEREIVRDIKE', - 'I3L3R2': 'MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVP', - 'I3L4N8': 'MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGRDLTDYLMKILTERGYSFTTTAEREIVRDIKEKLCYVALDFEQEMATAASSSSLEKSYELPDGQVITI', - 'J3KT65': 'MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKMTQALFLSRHFLPEASRLCLRPSTPRPCTWPSRPCCPSTPLGAPLALSWTLETGSPTRCPSTRATPSPTPSCVWTWLAGT', - 'K7EM38': 'MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKAVLSLYASGRTTGIVMDSGD'} - - - - -```python 
-sorted(sequences.values(), key=len) -``` - - - - - ['MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKAVLSLYASGRTTGIVMDSGD', - 'MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVP', - 'MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKMTQALFLSRHFLPEASRLCLRPSTPRPCTWPSRPCCPSTPLGAPLALSWTLETGSPTRCPSTRATPSPTPSCVWTWLAGT', - 'MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGRDLTDYLMKILTERGYSFTTTAEREIVRDIKE', - 'MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGRDLTDYLMKILTERGYSFTTTAEREIVRDIKE', - 'MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGRDLTDYLMKILTERGYSFTTTAEREIVRDIKEKLCYVALDFEQEMATAASSSSLEKSYELPDGQVITI', - 'MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGRDLTDYLMKILTERGYSFTTTAEREIVRDIKEKLCYVALDFEQEMATAASSSSLEKSYELPDGQVITIGNERFRCPEALFQPSFLGMESCGIHETTFNSIMKCDVDIRKDLYANTVLSGGTTMYPGIADRMQKEITALAPSTMKIKIIAPPERKYSVWIGGSILASLSTFQQMWISKQEYDESGPSIVHRKCF'] - - - - -```python -sequences = pd.Series(sequences) -sequences.str.len() -``` - - - - - P63261 375 - I3L1U9 214 - I3L3I0 214 - I3L3R2 164 - I3L4N8 241 - J3KT65 198 - K7EM38 133 - dtype: int64 - - - - -```python - -``` - - - - - ['MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKAVLSLYASGRTTGIVMDSGD', 
- 'MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVP', - 'MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKMTQALFLSRHFLPEASRLCLRPSTPRPCTWPSRPCCPSTPLGAPLALSWTLETGSPTRCPSTRATPSPTPSCVWTWLAGT', - 'MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGRDLTDYLMKILTERGYSFTTTAEREIVRDIKE', - 'MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGRDLTDYLMKILTERGYSFTTTAEREIVRDIKE', - 'MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGRDLTDYLMKILTERGYSFTTTAEREIVRDIKEKLCYVALDFEQEMATAASSSSLEKSYELPDGQVITI', - 'MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGRDLTDYLMKILTERGYSFTTTAEREIVRDIKEKLCYVALDFEQEMATAASSSSLEKSYELPDGQVITIGNERFRCPEALFQPSFLGMESCGIHETTFNSIMKCDVDIRKDLYANTVLSGGTTMYPGIADRMQKEITALAPSTMKIKIIAPPERKYSVWIGGSILASLSTFQQMWISKQEYDESGPSIVHRKCF'] - - - - -```python -from Bio import Align -aligner = Align.PairwiseAligner() -``` - - -```python -alignments = aligner.align(sequences.loc['I3L1U9'], sequences.loc['I3L3I0']) # Identical? Maybe check if this is more than once the case? 
-for alignment in alignments: - print(alignment) -``` - - MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGRDLTDYLMKILTERGYSFTTTAEREIVRDIKE - |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| - MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGRDLTDYLMKILTERGYSFTTTAEREIVRDIKE - - - - -```python -data_fasta['I3L1U9'][fasta_keys.seq] == data_fasta['I3L3I0'][fasta_keys.seq] -``` - - - - - True - - - - -```python -alignments = aligner.align(sequences.loc['I3L1U9'], sequences.loc['I3L3R2']) # Identical? -for alignment in alignments: - print(alignment) - break -``` - - MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGRDLTDYLMKILTERGYSFTTTAEREIVRDIKE - ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||-------------------------------------------------- - MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVP-------------------------------------------------- - - - - -```python -alignments = aligner.align(sequences.loc['P63261'], sequences.loc['K7EM38']) # Identical? 
-for alignment in alignments: - print(alignment) - break -``` - - MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGRDLTDYLMKILTERGYSFTTTAEREIVRDIKEKLCYVALDFEQEMATAASSSSLEKSYELPDGQVITIGNERFRCPEALFQPSFLGMESCGIHETTFNSIMKCDVDIRKDLYANTVLSGGTTMYPGIADRMQKEITALAPSTMKIKIIAPPERKYSVWIGGSILASLSTFQQMWISKQEYDESGPSIVHRKCF - ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||-------------------|-----||||||||||||||||||-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKA-------------------V-----LSLYASGRTTGIVMDSGD-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - - - -## Unique Peptides - - -```python -import itertools -peptides = {} -for isotope in gene_isotopes[gene]: - sequences[isotope] = data_fasta[isotope][fasta_keys.peptides][0] - -for peptides in itertools.zip_longest(*sequences.values(), fillvalue=''): - if len(set(peptides)) == 1: - print(f'all identical: {peptides[0]}') - else: - print('\t'.join(peptides)) -``` - - all identical: MEEEIAALVIDNGSGMCK - all identical: AGFAGDDAPR - all identical: AVFPSIVGR - HQGVMVGMGQK HQGVMVGMGQK HQGVMVGMGQK HQGVMVGMGQK HQDSYVGDEAQSK HQGVMVGMGQK HQGVMVGMGQK - DSYVGDEAQSK DSYVGDEAQSK DSYVGDEAQSK DSYVGDEAQSK YPIEHGIVTNWDDMEK DSYVGDEAQSK DSYVGDEAQSK - YPIEHGIVTNWDDMEK YPIEHGIVTNWDDMEK YPIEHGIVTNWDDMEK YPIEHGIVTNWDDMEK IWHHTFYNELR YPIEHGIVTNWDDMEK YPIEHGIVTNWDDMEK - IWHHTFYNELR IWHHTFYNELR IWHHTFYNELR 
IWHHTFYNELR VAPEEHPVLLTEAPLNPK IWHHTFYNELR IWHHTFYNELR - VAPEEHPVLLTEAPLNPK VAPEEHPVLLTEAPLNPK VAPEEHPVLLTEAPLNPK VAPEEHPVLLTEAPLNPK MTQIMFETFNTPAMYVAIQAVLSLYASGR VAPEEHPVLLTEAPLNPK VAPEEHPVLLTEAPLNPK - MTQIMFETFNTPAMYVAIQAVLSLYASGR MTQIMFETFNTPAMYVAIQAVLSLYASGR MTQIMFETFNTPAMYVAIQAVLSLYASGR MTQIMFETFNTPAMYVAIQAVLSLYASGR TTGIVMDSGDGVTHTVPIYEGYALPHAILR MTQALFLSR AVLSLYASGR - TTGIVMDSGDGVTHTVPIYEGYALPHAILR TTGIVMDSGDGVTHTVPIYEGYALPHAILR TTGIVMDSGDGVTHTVPIYEGYALPHAILR TTGIVMDSGDGVTHTVP DLTDYLMK HFLPEASR TTGIVMDSGD - DLTDYLMK DLTDYLMK DLTDYLMK GYSFTTTAER PCTWPSR - GYSFTTTAER GYSFTTTAER GYSFTTTAER LCYVALDFEQEMATAASSSSLEK PCCPSTPLGAPLALSWTLETGSPTR - LCYVALDFEQEMATAASSSSLEK SYELPDGQVITI ATPSPTPSCVWTWLAGT - SYELPDGQVITIGNER - CPEALFQPSFLGMESCGIHETTFNSIMK - DLYANTVLSGGTTMYPGIADR - EITALAPSTMK - IIAPPER - YSVWIGGSILASLSTFQQMWISK - QEYDESGPSIVHR - - - -```python -for j, peptides in enumerate(sequences.values()): - if j==0: - set_overlap = set(peptides) - else: - set_overlap = set_overlap.intersection(peptides) -set_overlap -``` - - - - - {'AGFAGDDAPR', - 'AVFPSIVGR', - 'IWHHTFYNELR', - 'MEEEIAALVIDNGSGMCK', - 'VAPEEHPVLLTEAPLNPK', - 'YPIEHGIVTNWDDMEK'} - - - - -```python -s -``` diff --git a/project/doc/ipynbs/01_explore_raw_MQ_data.md b/project/doc/ipynbs/01_explore_raw_MQ_data.md deleted file mode 100644 index bd73c74e6..000000000 --- a/project/doc/ipynbs/01_explore_raw_MQ_data.md +++ /dev/null @@ -1,10590 +0,0 @@ -# Explore MaxQuant (MQ) output files of single runs - -The `project/10_training_data.ipynb` notebook does extract information to be used as training data. File specific one could also use the retention time analysis to identify _valid_ co-occurring peptides to be use during training. Potentially this preprocessing step can be used at inference time. 
- -This notebook contains some relevant analysis for a specific `txt` output-folder in the current project - -##### Analysis overview - -> Report for example data - -- relation between `peptides.txt` and `evidence.txt` - - -```python -import logging -import os -from pathlib import Path -import random - -import ipywidgets as widgets -import pandas as pd -# pd.options.display.float_format = '{:,.1f}'.format - -from vaep.io.mq import FASTA_KEYS, MaxQuantOutput, MaxQuantOutputDynamic -from vaep.io import search_files, search_subfolders - -################## -##### CONFIG ##### -################## - -from src.src.config import FIGUREFOLDER -from src.src.config import FOLDER_RAW_DATA -from src.src.config import FOLDER_KEY # defines how filenames are parsed for use as indices -from src.src.config import FOLDER_DATA # project folder for storing the data - -print(f"Search Raw-Files on path: {FOLDER_RAW_DATA}") -``` - - Search Raw-Files on path: data\mq_out - - - -```python -from datetime import datetime - -#Delete Jupyter notebook root logger handler -logger = logging.getLogger() -logger.handlers = [] - -# logger = logging.getLogger(mq_output.folder.stem) -logger = logging.getLogger('vaep') -logger.setLevel(logging.INFO) - -c_handler = logging.StreamHandler() -c_handler.setLevel(logging.INFO) - - -date_log_file = "{:%y%m%d_%H%M}".format(datetime.now()) -f_handler = logging.FileHandler(f"log_01_explore_raw_MQ_{date_log_file}.txt") -f_handler.setLevel(logging.INFO) - -c_format = logging.Formatter( - f'%(name)s - %(levelname)-8s %(message)s ') - -c_handler.setFormatter(c_format) -f_handler.setFormatter(c_format) - -logger.handlers = [] #remove any handler in case you reexecute the cell -logger.addHandler(c_handler) -logger.addHandler(f_handler) -``` - - -```python -logger.handlers -``` - - - - - [, - ] - - - - -```python -folders = search_subfolders(path=FOLDER_RAW_DATA, depth=1, exclude_root=True) -w_folder = widgets.Dropdown(options=folders, description='Select a folder') 
-w_folder -``` - - - Dropdown(description='Select a folder', options=(WindowsPath('data/mq_out/20190611_QX3_LiSc_MA_Hela_500ng_LC15… - - - -```python -mq_output = MaxQuantOutput(folder=w_folder.value) -``` - -## Some important columns - -Grouped by a namedtuple allowing attribute access - - -```python -from vaep.io.mq import mq_col -mq_col -``` - - - - - MqColumns(GENE_NAMES='Gene names', INTENSITY='Intensity', RETENTION_TIME='Retention time', CALIBRATED_RETENTION_TIME='Calibrated retention time', SEQUENCE='Sequence', LEADING_RAZOR_PROTEIN='Leading razor protein', PROTEINS='Proteins') - - - -## `peptides.txt` - -> For reference on final "result" - - -```python -pd.options.display.max_columns = len(mq_output.peptides.columns) -mq_output.peptides -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
N-term cleavage windowC-term cleavage windowAmino acid beforeFirst amino acidSecond amino acidSecond last amino acidLast amino acidAmino acid afterA CountR CountN CountD CountC CountQ CountE CountG CountH CountI CountL CountK CountM CountF CountP CountS CountT CountW CountY CountV CountU CountO CountLengthMissed cleavagesMassProteinsLeading razor proteinStart positionEnd positionGene namesProtein namesUnique (Groups)Unique (Proteins)ChargesPEPScoreIntensityReversePotential contaminantidProtein group IDsMod. peptide IDsEvidence IDsMS/MS IDsBest MS/MSOxidation (M) site IDsTaxonomy IDsMS/MS Count
Sequence
AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPGAVGER______________________________LSGPAEVGPGAVGERTPRKKEPPRASPPGGKAAERT1911001511001000433003005204,401.1R4GMQ1;O60341;O60341-2R4GMQ17.058.0KDM1ALysine-specific histone demethylase 1Ayesno40.0102.757,739,000NaNNaN017740000.0NaN96061
AAAAAAAAAVSRTTSSRVLRGGRDRGRAAAAAAAAAVSRRRKRGRAAAAAAAAAVSRRRKAEYPRRRRSSPSRAASRR9100000000000001000100120999.5A0A0A6YYC7;Q96JP5-2;Q96JP5A0A0A6YYC759.070.0ZFP91-CNTF;ZFP91E3 ubiquitin-protein ligase ZFP91yesno20.068.687,575,000NaNNaN12311111.0NaN96061
AAAAAAAGDSDSWDADAFSVEDPVRK______________________________SWDADAFSVEDPVRKVGGGGTAGGDRWEGEMAARKV91050011000101130102002612,592.2O75822;O75822-3;O75822-2O758222.027.0EIF3JEukaryotic translation initiation factor 3 sub...yesno30.0157.9442,780,000NaNNaN218772222.0NaN96061
AAAAAAALQAKTILRQARNHKLRVDKAAAAAAALQAKSDEKRVDKAAAAAAALQAKSDEKAAVAGKKPVVGKAAAKS8000010000110000000000110955.5P36578;H3BM89;H3BU31P36578354.0364.0RPL460S ribosomal protein L4yesno20.0144.43,166,700,000NaNNaN32461333;44.0NaN96062
AAAAAAGAASGLPGPVAQGLK______________________________GAASGLPGPVAQGLKEALVDTLTGILSPVQMAALKE90000104002100210001002101,748.0Q96P70Q96P702.022.0IPO9Importin-9yesyes2;30.0119.793,181,000NaNNaN4416244;55;66.0NaN96062
...........................................................................................................................................................................
YYTSASGDEMVSLKHEDSQNRKKLSELLRYYTSASGDEMVSLKDRYYTSASGDEMVSLKDYCTRMKENQKHIYYRYYLKD10010011001110031021001401,549.7P07900;P07900-2P07900465.0478.0HSP90AA1Heat shock protein HSP 90-alphayesno20.0192.32,132,200,000NaNNaN38,783207740966;4096749202;49203;49204;49205;49206;49207;49208;4920954670;54671;54672;54673;54674;54675;54676;5467...54,679.01311960610
YYTVFDRDNNRPSGPLWILGDVFIGRYYTVFDRDNNRVGFAFIGRYYTVFDRDNNRVGFAEAARL______RYYNRV02220000000001001021001111,461.7A0A1B0GVD5;A0A1B0GWE8;P07339;A0A1B0GW44;A0A1B0...A0A1B0GVD5390.0400.0CTSDCathepsin D;Cathepsin D light chain;Cathepsin ...yesno30.073.8139,230,000NaNNaN38,78437940968492105468054,680.0NaN96061
YYVLNALKGQPVKVRVSYQKLLKYYVLNALKHRPPKAQSYQKLLKYYVLNALKHRPPKAQKKRYLFRSKYYLKH101000000021000000210080982.5Q6P2Q9;I3L0J9Q6P2Q9453.0460.0PRPF8Pre-mRNA-processing-splicing factor 8yesno20.058.7147,430,000NaNNaN38,785352140969492115468154,681.0NaN96061
YYVTIIDAPGHRGITIDISLWKFETSKYYVTIIDAPGHRDFITSKYYVTIIDAPGHRDFIKNMITGTSQADCKYYHRD11010001120000101021001201,403.7P68104;Q5VTE0;A0A087WVQ9;P68104-2;A0A087WV01;Q...P6810485.096.0EEF1A1;EEF1A1P5Elongation factor 1-alpha 1;Putative elongatio...yesno2;30.0131.811,192,000,000NaNNaN38,78628734097049212;49213;4921454682;54683;54684;54685;5468654,683.0NaN96065
YYYIPQYKREVKEHVGTDQFGNKYYYIPQYKNWRGQTITDQFGNKYYYIPQYKNWRGQTIREKRIVEAKYYYKN0000010001010010004000801,136.6Q8N183;D6RA56;H0YA50Q8N18332.039.0NDUFAF2Mimitin, mitochondrialyesno20.083.8147,680,000NaNNaN38,787374540971492155468754,687.0NaN96061
-

38788 rows × 56 columns

-
- - - -`peptides.txt` contains aggregated peptides - - -```python -intensities = mq_output.peptides.Intensity -intensities -``` - - - - - Sequence - AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPGAVGER 57,739,000 - AAAAAAAAAVSR 87,575,000 - AAAAAAAGDSDSWDADAFSVEDPVRK 442,780,000 - AAAAAAALQAK 3,166,700,000 - AAAAAAGAASGLPGPVAQGLK 93,181,000 - ... - YYTSASGDEMVSLK 2,132,200,000 - YYTVFDRDNNR 139,230,000 - YYVLNALK 147,430,000 - YYVTIIDAPGHR 11,192,000,000 - YYYIPQYK 147,680,000 - Name: Intensity, Length: 38788, dtype: int64 - - - -Not all peptides are associated with a Protein or Gene by MQ, although there is evidence for the peptide. This is due to potential `CON_`taminants in the medium which is encouded by default by MQ. - - -```python -mq_output.peptides[FASTA_KEYS].isna().sum() -``` - - - - - Proteins 85 - Gene names 337 - dtype: int64 - - - -## `evidence.txt` - -contains -- retention time for peptides -- has repeated measures of the same sequences, which are all aggregated in `peptides.txt` - - - -```python -pd.options.display.max_columns = len(mq_output.evidence.columns) -mq_output.evidence -``` - - c:\users\kzl465\onedrive - university of copenhagen\vaep\vaep\io\mq.py:87: DtypeWarning: Columns (50,53,58) have mixed types.Specify dtype option on import or set low_memory=False. - return cls.find_attribute(f'_{filename}') - - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
LengthModificationsModified sequenceOxidation (M) ProbabilitiesOxidation (M) Score DiffsAcetyl (Protein N-term)Oxidation (M)Missed cleavagesProteinsLeading proteinsLeading razor proteinGene namesProtein namesTypeRaw fileMS/MS m/zChargem/zMassUncalibrated - Calibrated m/z [ppm]Uncalibrated - Calibrated m/z [Da]Mass error [ppm]Mass error [Da]Uncalibrated mass error [ppm]Uncalibrated mass error [Da]Max intensity m/z 0Retention timeRetention lengthCalibrated retention timeCalibrated retention time startCalibrated retention time finishRetention time calibrationMatch time differenceMatch m/z differenceMatch q-valueMatch scoreNumber of data pointsNumber of scansNumber of isotopic peaksPIFFraction of total spectrumBase peak fractionPEPMS/MS countMS/MS scan numberScoreDelta scoreCombinatoricsIntensityReversePotential contaminantidProtein group IDsPeptide IDMod. peptide IDMS/MS IDsBest MS/MSOxidation (M) site IDsTaxonomy IDs
Sequence
AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPGAVGER52Unmodified_AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVG...NaNNaN000R4GMQ1;O60341;O60341-2R4GMQ1R4GMQ1KDM1ALysine-specific histone demethylase 1AMULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC151,101.841,101.34,401.12.40.00.20.02.60.01,101.885.60.385.685.585.80.0nannannannan84.022.05.00000.01106,834102.791.3157,739,000.0NaNNaN017740000NaN9606
AAAAAAAAAVSR12Unmodified_AAAAAAAAAVSR_NaNNaN000A0A0A6YYC7;Q96JP5-2;Q96JP5A0A0A6YYC7A0A0A6YYC7ZFP91-CNTF;ZFP91E3 ubiquitin-protein ligase ZFP91MULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15500.82500.8999.51.40.00.50.02.00.0500.825.60.225.625.425.60.0nannannannan37.016.03.00000.0130,18468.646.9187,575,000.0NaNNaN12311111NaN9606
AAAAAAAGDSDSWDADAFSVEDPVRK26Acetyl (Protein N-term)_(Acetyl (Protein N-term))AAAAAAAGDSDSWDADAFSV...NaNNaN101O75822;O75822-3;O75822-2O75822O75822EIF3JEukaryotic translation initiation factor 3 sub...MULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15879.43879.12,634.22.20.0-0.8-0.01.40.0879.495.00.695.094.795.30.0nannannannan157.047.05.00000.01118,493157.9144.31442,780,000.0NaNNaN218772222NaN9606
AAAAAAALQAK11Unmodified_AAAAAAALQAK_NaNNaN000P36578;H3BM89;H3BU31P36578P36578RPL460S ribosomal protein L4MULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15479.32478.8955.51.90.0-0.3-0.01.60.0478.826.70.626.726.527.10.0nannannannan163.046.05.00000.0231,655144.4106.813,166,700,000.0NaNNaN32461333;44NaN9606
AAAAAAGAASGLPGPVAQGLK21Acetyl (Protein N-term)_(Acetyl (Protein N-term))AAAAAAGAASGLPGPVAQGLK_NaNNaN100Q96P70Q96P70Q96P70IPO9Importin-9MULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15598.03597.71,790.02.40.0-0.7-0.01.80.0597.796.90.596.996.697.10.0nannannannan85.034.03.00000.01120,70646.636.8140,166,000.0NaNNaN441624455NaN9606
....................................................................................................................................................................................
YYVLNALK8Unmodified_YYVLNALK_NaNNaN000Q6P2Q9;I3L0J9Q6P2Q9Q6P2Q9PRPF8Pre-mRNA-processing-splicing factor 8MULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15492.32492.3982.51.70.0-0.1-0.01.60.0492.370.50.470.570.370.70.0nannannannan87.030.04.00000.0187,76458.737.01147,430,000.0NaNNaN49,211352138,78540,9695468154,681NaN9606
YYVTIIDAPGHR12Unmodified_YYVTIIDAPGHR_NaNNaN000P68104;Q5VTE0;A0A087WVQ9;P68104-2;A0A087WV01;Q...P68104P68104EEF1A1;EEF1A1P5Elongation factor 1-alpha 1;Putative elongatio...MULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15468.93468.91,403.70.80.00.10.01.00.0468.966.40.766.466.266.90.0nannannannan267.057.06.00000.0283,132131.8101.318,630,000,000.0NaNNaN49,212287338,78640,97054682;5468354,683NaN9606
YYVTIIDAPGHR12Unmodified_YYVTIIDAPGHR_NaNNaN000P68104;Q5VTE0;A0A087WVQ9;P68104-2;A0A087WV01;Q...P68104P68104EEF1A1;EEF1A1P5Elongation factor 1-alpha 1;Putative elongatio...MULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15702.92702.91,403.71.40.0-0.3-0.01.10.0702.966.40.666.466.266.80.0nannannannan183.046.05.00000.0183,14770.045.112,458,400,000.0NaNNaN49,213287338,78640,9705468454,684NaN9606
YYVTIIDAPGHR12Unmodified_YYVTIIDAPGHR_NaNNaN000P68104;Q5VTE0;A0A087WVQ9;P68104-2;A0A087WV01;Q...P68104P68104EEF1A1;EEF1A1P5Elongation factor 1-alpha 1;Putative elongatio...MULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15468.93468.91,403.71.20.02.20.03.40.0468.967.11.167.166.968.00.0nannannannan172.089.03.00000.0283,77294.881.41103,490,000.0NaNNaN49,214287338,78640,97054685;5468654,685NaN9606
YYYIPQYK8Unmodified_YYYIPQYK_NaNNaN000Q8N183;D6RA56;H0YA50Q8N183Q8N183NDUFAF2Mimitin, mitochondrialMULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15569.32569.31,136.61.60.00.60.02.20.0569.362.70.862.762.263.00.0nannannannan151.064.04.00000.0178,04383.856.71147,680,000.0NaNNaN49,215374538,78740,9715468754,687NaN9606
-

49216 rows × 59 columns

-
- - - - -```python -mq_output.evidence.Charge.value_counts().sort_index() -``` - - - - - 1 39 - 2 30,290 - 3 16,193 - 4 2,429 - 5 255 - 6 9 - 7 1 - Name: Charge, dtype: int64 - - - - -```python -mask = mq_output.evidence[mq_col.RETENTION_TIME] != mq_output.evidence[mq_col.CALIBRATED_RETENTION_TIME] -print("Number of non-matching retention times between calibrated and non-calibrated column:", mask.sum()) - -# try: -# assert mask.sum() == 0, "More than one replica?" -# except AssertionError as e: -# logger.warning(e) -assert mask.sum() == 0, "More than one replica?" -``` - - Number of non-matching retention times between calibrated and non-calibrated column: 0 - - -Using only one quality control sample, leaves the initial retention time as is. - - -```python -rt = mq_output.evidence[mq_col.CALIBRATED_RETENTION_TIME] -``` - - -```python -pep_measured_freq_in_evidence = rt.index.value_counts() -pep_measured_freq_in_evidence.iloc[:10] # top10 repeatedly measured peptides -``` - - - - - EGMNIVEAMER 40 - VVDLMAHMASK 34 - HQGVMVGMGQK 29 - LMIEMDGTENK 26 - DNSTMGYMMAK 26 - GEMMDLQHGSLFLR 24 - EVDEQMLNVQNK 21 - NMMAACDPR 20 - TLNDELEIIEGMK 18 - VVDLMAHMASKE 18 - Name: Sequence, dtype: int64 - - - - -```python -max_observed_pep_evidence = pep_measured_freq_in_evidence.index[0] -mq_output.evidence.loc[ - max_observed_pep_evidence, - : -] -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
LengthModificationsModified sequenceOxidation (M) ProbabilitiesOxidation (M) Score DiffsAcetyl (Protein N-term)Oxidation (M)Missed cleavagesProteinsLeading proteinsLeading razor proteinGene namesProtein namesTypeRaw fileMS/MS m/zChargem/zMassUncalibrated - Calibrated m/z [ppm]Uncalibrated - Calibrated m/z [Da]Mass error [ppm]Mass error [Da]Uncalibrated mass error [ppm]Uncalibrated mass error [Da]Max intensity m/z 0Retention timeRetention lengthCalibrated retention timeCalibrated retention time startCalibrated retention time finishRetention time calibrationMatch time differenceMatch m/z differenceMatch q-valueMatch scoreNumber of data pointsNumber of scansNumber of isotopic peaksPIFFraction of total spectrumBase peak fractionPEPMS/MS countMS/MS scan numberScoreDelta scoreCombinatoricsIntensityReversePotential contaminantidProtein group IDsPeptide IDMod. peptide IDMS/MS IDsBest MS/MSOxidation (M) site IDsTaxonomy IDs
Sequence
EGMNIVEAMER112 Oxidation (M)_EGM(Oxidation (M))NIVEAM(Oxidation (M))ER_EGM(1)NIVEAM(1)EREGM(75)NIVEAM(75)ER020P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15655.82655.81,309.61.50.00.40.01.90.0655.838.70.838.738.439.10.0nannannannan83.061.02.00000.0147,21575.255.4173,297,000.0NaNNaN9,4952846;28477,4877,8781042110,4212623;26249606
EGMNIVEAMER112 Oxidation (M)_EGM(Oxidation (M))NIVEAM(Oxidation (M))ER_EGM(1)NIVEAM(1)EREGM(75)NIVEAM(75)ER020P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15655.82655.81,309.61.80.0-1.2-0.00.60.0655.840.21.340.239.240.60.0nannannannan187.0106.02.00000.0248,75175.456.9164,737,000.0NaNNaN9,4962846;28477,4877,87810422;1042310,4222623;26249606
EGMNIVEAMER112 Oxidation (M)_EGM(Oxidation (M))NIVEAM(Oxidation (M))ER_EGM(1)NIVEAM(1)EREGM(80)NIVEAM(80)ER020P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15655.82655.81,309.61.80.0-0.4-0.01.40.0655.843.20.943.242.543.40.0nannannannan103.069.02.00000.0152,65680.058.6177,980,000.0NaNNaN9,4972846;28477,4877,8781042410,4242623;26249606
EGMNIVEAMER112 Oxidation (M)_EGM(Oxidation (M))NIVEAM(Oxidation (M))ER_EGM(1)NIVEAM(1)EREGM(70)NIVEAM(70)ER020P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15655.82655.81,309.61.70.01.00.02.70.0655.847.31.047.346.947.90.0nannannannan114.079.02.00000.0259,00770.050.2141,367,000.0NaNNaN9,4982846;28477,4877,87810425;1042610,4262623;26249606
EGMNIVEAMER112 Oxidation (M)_EGM(Oxidation (M))NIVEAM(Oxidation (M))ER_EGM(1)NIVEAM(1)EREGM(70)NIVEAM(70)ER020P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15655.82655.81,309.61.80.00.60.02.40.0655.849.31.249.349.050.20.0nannannannan184.095.03.00000.0160,97570.143.7148,211,000.0NaNNaN9,4992846;28477,4877,8781042710,4272623;26249606
EGMNIVEAMER112 Oxidation (M)_EGM(Oxidation (M))NIVEAM(Oxidation (M))ER_EGM(1)NIVEAM(1)EREGM(51)NIVEAM(51)ER020P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15655.82655.81,309.62.00.00.00.02.00.0655.850.81.150.850.251.30.0nannannannan196.087.03.00000.0262,28451.136.7138,053,000.0NaNNaN9,5002846;28477,4877,87810428;1042910,4282623;26249606
EGMNIVEAMER112 Oxidation (M)_EGM(Oxidation (M))NIVEAM(Oxidation (M))ER_EGM(1)NIVEAM(1)EREGM(76)NIVEAM(76)ER020P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15655.82655.81,309.61.60.0-0.7-0.00.90.0655.853.21.253.252.253.40.0nannannannan186.095.02.00000.0365,43676.257.0137,506,000.0NaNNaN9,5012846;28477,4877,87810430;10431;1043210,4312623;26249606
EGMNIVEAMER112 Oxidation (M)_EGM(Oxidation (M))NIVEAM(Oxidation (M))ER_EGM(1)NIVEAM(1)EREGM(81)NIVEAM(81)ER020P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15655.82655.81,309.6nannannannannannannan29.41.029.428.929.90.0nannannannannannannan0000.0135,27780.958.31nanNaNNaN9,5022846;28477,4877,8781043310,4332623;26249606
EGMNIVEAMER112 Oxidation (M)_EGM(Oxidation (M))NIVEAM(Oxidation (M))ER_EGM(1)NIVEAM(1)EREGM(82)NIVEAM(82)ER020P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15655.82655.81,309.6nannannannannannannan30.21.030.229.730.70.0nannannannannannannan0000.0136,30282.362.51nanNaNNaN9,5032846;28477,4877,8781043410,4342623;26249606
EGMNIVEAMER112 Oxidation (M)_EGM(Oxidation (M))NIVEAM(Oxidation (M))ER_EGM(1)NIVEAM(1)EREGM(82)NIVEAM(82)ER020P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15655.82655.81,309.6nannannannannannannan30.91.030.930.431.40.0nannannannannannannan0000.0137,19882.066.91nanNaNNaN9,5042846;28477,4877,8781043510,4352623;26249606
EGMNIVEAMER112 Oxidation (M)_EGM(Oxidation (M))NIVEAM(Oxidation (M))ER_EGM(1)NIVEAM(1)EREGM(83)NIVEAM(83)ER020P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15655.82655.81,309.6nannannannannannannan31.71.031.731.232.20.0nannannannannannannan0000.0138,22982.964.11nanNaNNaN9,5052846;28477,4877,8781043610,4362623;26249606
EGMNIVEAMER112 Oxidation (M)_EGM(Oxidation (M))NIVEAM(Oxidation (M))ER_EGM(1)NIVEAM(1)EREGM(64)NIVEAM(64)ER020P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15655.82655.81,309.6nannannannannannannan32.21.032.231.732.70.0nannannannannannannan0000.0138,90464.150.51nanNaNNaN9,5062846;28477,4877,8781043710,4372623;26249606
EGMNIVEAMER112 Oxidation (M)_EGM(Oxidation (M))NIVEAM(Oxidation (M))ER_EGM(1)NIVEAM(1)EREGM(83)NIVEAM(83)ER020P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15655.82655.81,309.6nannannannannannannan32.71.032.732.233.20.0nannannannannannannan0000.0139,64682.863.01nanNaNNaN9,5072846;28477,4877,8781043810,4382623;26249606
EGMNIVEAMER112 Oxidation (M)_EGM(Oxidation (M))NIVEAM(Oxidation (M))ER_EGM(1)NIVEAM(1)EREGM(70)NIVEAM(70)ER020P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15655.82655.81,309.6nannannannannannannan35.01.035.034.535.50.0nannannannannannannan0000.0142,52570.452.61nanNaNNaN9,5082846;28477,4877,8781043910,4392623;26249606
EGMNIVEAMER112 Oxidation (M)_EGM(Oxidation (M))NIVEAM(Oxidation (M))ER_EGM(1)NIVEAM(1)EREGM(84)NIVEAM(84)ER020P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15655.82655.81,309.6nannannannannannannan41.11.041.140.641.60.0nannannannannannannan0000.0150,38284.169.41nanNaNNaN9,5092846;28477,4877,8781044010,4402623;26249606
EGMNIVEAMER112 Oxidation (M)_EGM(Oxidation (M))NIVEAM(Oxidation (M))ER_EGM(1)NIVEAM(1)EREGM(72)NIVEAM(72)ER020P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15655.82655.81,309.6nannannannannannannan41.81.041.841.242.20.0nannannannannannannan0000.0151,26172.052.21nanNaNNaN9,5102846;28477,4877,8781044110,4412623;26249606
EGMNIVEAMER112 Oxidation (M)_EGM(Oxidation (M))NIVEAM(Oxidation (M))ER_EGM(1)NIVEAM(1)EREGM(88)NIVEAM(88)ER020P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15655.82655.81,309.6nannannannannannannan42.31.042.341.842.80.0nannannannannannannan0000.0151,91688.169.71nanNaNNaN9,5112846;28477,4877,8781044210,4422623;26249606
EGMNIVEAMER112 Oxidation (M)_EGM(Oxidation (M))NIVEAM(Oxidation (M))ER_EGM(1)NIVEAM(1)EREGM(59)NIVEAM(59)ER020P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15655.82655.81,309.6nannannannannannannan43.91.043.943.444.40.0nannannannannannannan0000.0154,01658.634.61nanNaNNaN9,5122846;28477,4877,8781044310,4432623;26249606
EGMNIVEAMER112 Oxidation (M)_EGM(Oxidation (M))NIVEAM(Oxidation (M))ER_EGM(1)NIVEAM(1)EREGM(84)NIVEAM(84)ER020P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15655.82655.81,309.6nannannannannannannan44.51.044.544.045.00.0nannannannannannannan0000.0154,79984.164.31nanNaNNaN9,5132846;28477,4877,8781044410,4442623;26249606
EGMNIVEAMER112 Oxidation (M)_EGM(Oxidation (M))NIVEAM(Oxidation (M))ER_EGM(1)NIVEAM(1)EREGM(82)NIVEAM(82)ER020P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15655.82655.81,309.6nannannannannannannan45.11.045.144.645.60.0nannannannannannannan0000.0155,58282.363.01nanNaNNaN9,5142846;28477,4877,8781044510,4452623;26249606
EGMNIVEAMER112 Oxidation (M)_EGM(Oxidation (M))NIVEAM(Oxidation (M))ER_EGM(1)NIVEAM(1)EREGM(64)NIVEAM(64)ER020P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15655.82655.81,309.6nannannannannannannan46.21.046.245.746.70.0nannannannannannannan0000.0156,93964.150.11nanNaNNaN9,5152846;28477,4877,8781044610,4462623;26249606
EGMNIVEAMER112 Oxidation (M)_EGM(Oxidation (M))NIVEAM(Oxidation (M))ER_EGM(1)NIVEAM(1)EREGM(50)NIVEAM(50)ER020P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15655.82655.81,309.6nannannannannannannan46.71.046.746.247.20.0nannannannannannannan0000.0157,62950.125.21nanNaNNaN9,5162846;28477,4877,8781044710,4472623;26249606
EGMNIVEAMER112 Oxidation (M)_EGM(Oxidation (M))NIVEAM(Oxidation (M))ER_EGM(1)NIVEAM(1)EREGM(48)NIVEAM(48)ER020P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15655.82655.81,309.6nannannannannannannan48.81.048.848.349.30.0nannannannannannannan0000.0160,30447.628.31nanNaNNaN9,5172846;28477,4877,8781044810,4482623;26249606
EGMNIVEAMER112 Oxidation (M)_EGM(Oxidation (M))NIVEAM(Oxidation (M))ER_EGM(1)NIVEAM(1)EREGM(53)NIVEAM(53)ER020P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15655.82655.81,309.6nannannannannannannan53.81.053.853.354.30.0nannannannannannannan0000.0166,75153.038.91nanNaNNaN9,5182846;28477,4877,8781044910,4492623;26249606
EGMNIVEAMER112 Oxidation (M)_EGM(Oxidation (M))NIVEAM(Oxidation (M))ER_EGM(1)NIVEAM(1)EREGM(55)NIVEAM(55)ER020P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MULTI-SECPEP20190611_QX3_LiSc_MA_Hela_500ng_LC15656.32655.81,309.61.50.0-1.4-0.00.10.0655.834.80.334.834.735.00.0nannannannan44.026.02.00000.0142,14255.440.7177,147,000.0NaNNaN9,5192846;28477,4877,8781045010,4502623;26249606
EGMNIVEAMER112 Oxidation (M)_EGM(Oxidation (M))NIVEAM(Oxidation (M))ER_EGM(1)NIVEAM(1)EREGM(79)NIVEAM(79)ER020P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MULTI-SECPEP20190611_QX3_LiSc_MA_Hela_500ng_LC15656.32655.81,309.61.70.00.50.02.20.0655.854.81.354.854.055.30.0nannannannan233.0101.03.00000.0167,27378.863.6140,065,000.0NaNNaN9,5202846;28477,4877,8781045110,4512623;26249606
EGMNIVEAMER11Oxidation (M)_EGMNIVEAM(Oxidation (M))ER_EGMNIVEAM(1)EREGM(-76)NIVEAM(76)ER010P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15647.82647.81,293.61.90.00.40.02.30.0647.847.10.647.146.947.40.0nannannannan168.044.06.00000.0158,09184.172.32320,290,000.0NaNNaN9,5212846;28477,4877,8791045210,4522623;26249606
EGMNIVEAMER11Oxidation (M)_EGMNIVEAM(Oxidation (M))ER_EGMNIVEAM(1)EREGM(-100)NIVEAM(100)ER010P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15647.82647.81,293.62.10.0-0.8-0.01.20.0647.847.61.547.647.348.70.0nannannannan235.0116.05.00000.0358,731112.7101.02507,440,000.0NaNNaN9,5222846;28477,4877,87910453;10454;1045510,4532623;26249606
EGMNIVEAMER11Oxidation (M)_EGMNIVEAM(Oxidation (M))ER_EGMNIVEAM(1)EREGM(-53)NIVEAM(53)ER010P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15648.32647.81,293.61.70.00.70.02.40.0647.851.60.351.651.351.70.0nannannannan41.025.02.00000.0163,82757.148.9235,035,000.0NaNNaN9,5232846;28477,4877,8791045610,4562623;26249606
EGMNIVEAMER11Oxidation (M)_EGM(Oxidation (M))NIVEAMER_EGM(1)NIVEAMEREGM(120)NIVEAM(-120)ER010P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15647.82647.81,293.61.60.0-0.7-0.00.90.0647.856.44.256.455.759.90.0nannannannan856.0332.05.00000.0669,899137.9104.621,263,300,000.0NaNNaN9,5242846;28477,4877,87910457;10458;10459;10460;10461;1046210,4572623;26249606
EGMNIVEAMER11Oxidation (M)_EGM(Oxidation (M))NIVEAMER_EGM(0.999)NIVEAM(0.001)EREGM(29)NIVEAM(-29)ER010P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15647.82647.81,293.62.40.0-2.7-0.0-0.3-0.0647.864.20.364.264.164.40.0nannannannan38.025.02.00000.0180,10851.137.1215,466,000.0NaNNaN9,5252846;28477,4877,8791046310,4632623;26249606
EGMNIVEAMER11Oxidation (M)_EGMNIVEAM(Oxidation (M))ER_EGMNIVEAM(1)EREGM(-48)NIVEAM(48)ER010P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15647.82647.81,293.6nannannannannannannan53.61.053.653.154.10.0nannannannannannannan0000.0166,43751.837.12nanNaNNaN9,5262846;28477,4877,8791046410,4642623;26249606
EGMNIVEAMER11Oxidation (M)_EGM(Oxidation (M))NIVEAMER_EGM(0.999)NIVEAM(0.001)EREGM(31)NIVEAM(-31)ER010P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15647.82647.81,293.6nannannannannannannan63.11.063.162.663.60.0nannannannannannannan0000.0178,58341.530.72nanNaNNaN9,5272846;28477,4877,8791046510,4652623;26249606
EGMNIVEAMER11Oxidation (M)_EGM(Oxidation (M))NIVEAMER_EGM(0.999)NIVEAM(0.001)EREGM(28)NIVEAM(-28)ER010P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15647.82647.81,293.6nannannannannannannan69.31.069.368.869.80.0nannannannannannannan0000.0186,45249.336.22nanNaNNaN9,5282846;28477,4877,8791046610,4662623;26249606
EGMNIVEAMER11Oxidation (M)_EGM(Oxidation (M))NIVEAMER_EGM(1)NIVEAMEREGM(37)NIVEAM(-37)ER010P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15647.82647.81,293.6nannannannannannannan72.31.072.371.872.80.0nannannannannannannan0000.0190,14447.627.32nanNaNNaN9,5292846;28477,4877,8791046710,4672623;26249606
EGMNIVEAMER11Oxidation (M)_EGMNIVEAM(Oxidation (M))ER_EGMNIVEAM(1)EREGM(-94)NIVEAM(94)ER010P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MULTI-SECPEP20190611_QX3_LiSc_MA_Hela_500ng_LC15647.72647.81,293.62.00.0-0.3-0.01.60.0647.849.12.649.148.751.3-0.0nannannannan275.0209.02.00000.0161,683103.389.02126,220,000.0NaNNaN9,5302846;28477,4877,8791046810,4682623;26249606
EGMNIVEAMER11Oxidation (M)_EGMNIVEAM(Oxidation (M))ER_EGMNIVEAM(1)EREGM(-53)NIVEAM(53)ER010P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MULTI-SECPEP20190611_QX3_LiSc_MA_Hela_500ng_LC15648.32647.81,293.61.70.00.10.01.80.0647.851.70.251.751.751.90.0nannannannan29.018.02.00000.0163,98357.346.2231,675,000.0NaNNaN9,5312846;28477,4877,8791046910,4692623;26249606
EGMNIVEAMER11Oxidation (M)_EGM(Oxidation (M))NIVEAMER_EGM(0.999)NIVEAM(0.001)EREGM(30)NIVEAM(-30)ER010P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MULTI-SECPEP20190611_QX3_LiSc_MA_Hela_500ng_LC15647.42647.81,293.62.20.0-1.5-0.00.80.0647.865.50.865.564.965.70.0nannannannan127.059.03.00000.0181,30051.140.3223,725,000.0NaNNaN9,5322846;28477,4877,8791047010,4702623;26249606
EGMNIVEAMER11Oxidation (M)_EGM(Oxidation (M))NIVEAMER_EGM(1)NIVEAMEREGM(34)NIVEAM(-34)ER010P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MULTI-SECPEP20190611_QX3_LiSc_MA_Hela_500ng_LC15647.62647.81,293.62.30.0-2.7-0.0-0.4-0.0648.371.61.071.670.971.90.0nannannannan150.080.03.00000.0189,22560.446.2243,865,000.0NaNNaN9,5332846;28477,4877,8791047110,4712623;26249606
EGMNIVEAMER11Unmodified_EGMNIVEAMER_NaNNaN000P62937;P62937-2P62937;P62937-2P62937PPIAPeptidyl-prolyl cis-trans isomerase A;Peptidyl...MULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15640.32639.81,277.62.20.0-0.3-0.02.00.0639.875.70.975.775.276.10.0nannannannan283.069.07.00000.0394,309156.2129.81782,440,000.0NaNNaN9,5342846;28477,4877,88010472;10473;1047410,473NaN9606
-
- - - -The retention time index is non-unique. - - -```python -print('The retention time index is unique: {}'.format(rt.index.is_unique)) -``` - - The retention time index is unique: False - - -Peptides observed more than once at different times. - - -```python -mask_duplicated = rt.index.duplicated(keep=False) -rt_duplicates = rt.loc[mask_duplicated] -rt_duplicates -``` - - - - - Sequence - AAAAAAGAASGLPGPVAQGLK 96.9 - AAAAAAGAASGLPGPVAQGLK 96.9 - AAAAMAK 9.0 - AAAAMAK 9.0 - AAAASAAEAGIATTGTEDSDDALLK 72.0 - ... - YYTSASGDEMVSLK 57.6 - YYTSASGDEMVSLK 58.6 - YYVTIIDAPGHR 66.4 - YYVTIIDAPGHR 66.4 - YYVTIIDAPGHR 67.1 - Name: Calibrated retention time, Length: 17960, dtype: float64 - - - - -```python -mq_output.evidence.loc[mask_duplicated, [ - 'Charge', 'Calibrated retention time', 'Intensity']] -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ChargeCalibrated retention timeIntensity
Sequence
AAAAAAGAASGLPGPVAQGLK396.940,166,000.0
AAAAAAGAASGLPGPVAQGLK296.953,015,000.0
AAAAMAK29.0961,050,000.0
AAAAMAK29.0961,050,000.0
AAAASAAEAGIATTGTEDSDDALLK272.049,613,000.0
............
YYTSASGDEMVSLK257.6nan
YYTSASGDEMVSLK258.6790,360,000.0
YYVTIIDAPGHR366.48,630,000,000.0
YYVTIIDAPGHR266.42,458,400,000.0
YYVTIIDAPGHR367.1103,490,000.0
-

17960 rows × 3 columns

-
- - - -Calculate the median and the standard deviation of the calibrated retention time for each peptide sequence - - -```python -_agg_functions = ['median', 'std'] - -rt_summary = rt.groupby(level=0).agg(_agg_functions) -rt_summary -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
medianstd
Sequence
AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPGAVGER85.6nan
AAAAAAAAAVSR25.6nan
AAAAAAAGDSDSWDADAFSVEDPVRK95.0nan
AAAAAAALQAK26.7nan
AAAAAAGAASGLPGPVAQGLK96.90.0
.........
YYTSASGDEMVSLK51.24.6
YYTVFDRDNNR43.1nan
YYVLNALK70.5nan
YYVTIIDAPGHR66.40.4
YYYIPQYK62.7nan
-

38787 rows × 2 columns

-
- - - -Let's see several percentiles for both median and standard deviation (the columns are independent of each other) for the retention time - - -```python -rt_summary.describe(percentiles=[0.8, 0.9, 0.95, 0.96, 0.97, 0.98, 0.99]) -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
medianstd
count38,787.07,531.0
mean61.32.2
std28.84.0
min0.00.0
50%62.10.0
80%90.75.4
90%101.28.3
95%105.510.5
96%105.911.1
97%106.411.8
98%106.812.7
99%107.414.5
max109.867.2
-
- - - - -```python -rt_summary['median'] -``` - - - - - Sequence - AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPGAVGER 85.6 - AAAAAAAAAVSR 25.6 - AAAAAAAGDSDSWDADAFSVEDPVRK 95.0 - AAAAAAALQAK 26.7 - AAAAAAGAASGLPGPVAQGLK 96.9 - ... - YYTSASGDEMVSLK 51.2 - YYTVFDRDNNR 43.1 - YYVLNALK 70.5 - YYVTIIDAPGHR 66.4 - YYYIPQYK 62.7 - Name: median, Length: 38787, dtype: float64 - - - -A large standard deviation indicates that the intensity values originate from time points (in min) widely spread. - -### Peptides observed several times at different points of the experimental run - - -```python -mask = rt_summary['std'] > 40.0 -mask_indices = mask[mask].index -rt.loc[mask_indices] -``` - - - - - Sequence - HFELGGDKK 16.3 - HFELGGDKK 106.5 - SYTLNAVSFHFLGEQK 97.3 - SYTLNAVSFHFLGEQK 2.2 - TVQGSGHQEHINIHK 14.3 - TVQGSGHQEHINIHK 14.3 - TVQGSGHQEHINIHK 88.8 - Name: Calibrated retention time, dtype: float64 - - - -Peptides with different RT have different charges. - - -```python -mq_output.evidence.loc[mask_indices] -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
LengthModificationsModified sequenceOxidation (M) ProbabilitiesOxidation (M) Score DiffsAcetyl (Protein N-term)Oxidation (M)Missed cleavagesProteinsLeading proteinsLeading razor proteinGene namesProtein namesTypeRaw fileMS/MS m/zChargem/zMassUncalibrated - Calibrated m/z [ppm]Uncalibrated - Calibrated m/z [Da]Mass error [ppm]Mass error [Da]Uncalibrated mass error [ppm]Uncalibrated mass error [Da]Max intensity m/z 0Retention timeRetention lengthCalibrated retention timeCalibrated retention time startCalibrated retention time finishRetention time calibrationMatch time differenceMatch m/z differenceMatch q-valueMatch scoreNumber of data pointsNumber of scansNumber of isotopic peaksPIFFraction of total spectrumBase peak fractionPEPMS/MS countMS/MS scan numberScoreDelta scoreCombinatoricsIntensityReversePotential contaminantidProtein group IDsPeptide IDMod. peptide IDMS/MS IDsBest MS/MSOxidation (M) site IDsTaxonomy IDs
Sequence
HFELGGDKK9Unmodified_HFELGGDKK_NaNNaN001H0Y5B4;H7BZ11;J3KQN4;P83881;R4GN19;Q969Q0H0Y5B4;Q969Q0Q969Q0RPL36A;RPL36A-HNRNPH2;RPL36AL60S ribosomal protein L36a;60S ribosomal prote...MULTI-SECPEP20190611_QX3_LiSc_MA_Hela_500ng_LC15515.72515.81,029.51.70.0-2.9-0.0-1.2-0.0515.816.30.316.316.116.40.0nannannannan56.022.03.00000.0118,124110.480.2174,456,000.0NaNNaN17,8811271;400414,16114,8781978319,783NaN9606
HFELGGDKK9Unmodified_HFELGGDKK_NaNNaN001H0Y5B4;H7BZ11;J3KQN4;P83881;R4GN19;Q969Q0H0Y5B4;Q969Q0Q969Q0RPL36A;RPL36A-HNRNPH2;RPL36AL60S ribosomal protein L36a;60S ribosomal prote...MULTI-SECPEP20190611_QX3_LiSc_MA_Hela_500ng_LC151,030.611,030.51,029.51.40.02.20.03.60.01,030.5106.50.1106.5106.5106.50.0nannannannan9.04.03.00000.01132,8677.91.8126,935,000.0NaNNaN17,8821271;400414,16114,8781978419,784NaN9606
SYTLNAVSFHFLGEQK16Unmodified_SYTLNAVSFHFLGEQK_NaNNaN000M0R2B7;P28340;M0QZR8;A0A2R8Y7K6;A0A2R8Y705M0R2B7M0R2B7POLD1DNA polymerase;DNA polymerase delta catalytic ...MULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15614.33614.31,839.91.90.0-2.3-0.0-0.3-0.0614.397.30.597.397.097.50.0nannannannan143.037.06.00000.02121,497127.9110.4161,218,000.0NaNNaN39,963156931,46433,25944367;4436844,368NaN9606
SYTLNAVSFHFLGEQK16Unmodified_SYTLNAVSFHFLGEQK_NaNNaN000M0R2B7;P28340;M0QZR8;A0A2R8Y7K6;A0A2R8Y705M0R2B7M0R2B7POLD1DNA polymerase;DNA polymerase delta catalytic ...MULTI-SECPEP20190611_QX3_LiSc_MA_Hela_500ng_LC15308.26307.71,839.9-0.0-0.01.70.01.70.0307.72.20.12.22.12.30.0nannannannan32.023.02.00000.012,10216.45.4113,276,000.0NaNNaN39,964156931,46433,2594436944,369NaN9606
TVQGSGHQEHINIHK15Unmodified_TVQGSGHQEHINIHK_NaNNaN000Q14247;Q14247-3;Q14247-2Q14247Q14247CTTNSrc substrate cortactinMULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15562.63562.31,683.81.30.0-0.0-0.01.20.0562.314.30.214.314.314.5-0.0nannannannan56.017.05.00000.0115,51697.277.111,575,100,000.0NaNNaN42,769315533,65335,5654748447,484NaN9606
TVQGSGHQEHINIHK15Unmodified_TVQGSGHQEHINIHK_NaNNaN000Q14247;Q14247-3;Q14247-2Q14247Q14247CTTNSrc substrate cortactinMULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15422.04422.01,683.80.90.0-0.3-0.00.70.0422.014.30.314.314.314.60.0nannannannan80.024.06.00000.0115,61124.111.312,937,200,000.0NaNNaN42,770315533,65335,5654748547,485NaN9606
TVQGSGHQEHINIHK15Unmodified_TVQGSGHQEHINIHK_NaNNaN000Q14247;Q14247-3;Q14247-2Q14247Q14247CTTNSrc substrate cortactinMULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15842.92842.91,683.82.00.02.20.04.20.0842.988.80.388.888.789.00.0nannannannan62.025.03.00000.01110,88962.54.1129,400,000.0NaNNaN42,771315533,65335,5654748647,486NaN9606
-
- - - -Model evaluation possibility: Discard samples with several measurements from an experiment and predict value. See which intensity measurement corresponds more closely. - - -```python -from numpy import random -_peptide = random.choice(mask_indices) -``` - - -```python -mq_output.evidence.loc[_peptide] -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
LengthModificationsModified sequenceOxidation (M) ProbabilitiesOxidation (M) Score DiffsAcetyl (Protein N-term)Oxidation (M)Missed cleavagesProteinsLeading proteinsLeading razor proteinGene namesProtein namesTypeRaw fileMS/MS m/zChargem/zMassUncalibrated - Calibrated m/z [ppm]Uncalibrated - Calibrated m/z [Da]Mass error [ppm]Mass error [Da]Uncalibrated mass error [ppm]Uncalibrated mass error [Da]Max intensity m/z 0Retention timeRetention lengthCalibrated retention timeCalibrated retention time startCalibrated retention time finishRetention time calibrationMatch time differenceMatch m/z differenceMatch q-valueMatch scoreNumber of data pointsNumber of scansNumber of isotopic peaksPIFFraction of total spectrumBase peak fractionPEPMS/MS countMS/MS scan numberScoreDelta scoreCombinatoricsIntensityReversePotential contaminantidProtein group IDsPeptide IDMod. peptide IDMS/MS IDsBest MS/MSOxidation (M) site IDsTaxonomy IDs
Sequence
HFELGGDKK9Unmodified_HFELGGDKK_NaNNaN001H0Y5B4;H7BZ11;J3KQN4;P83881;R4GN19;Q969Q0H0Y5B4;Q969Q0Q969Q0RPL36A;RPL36A-HNRNPH2;RPL36AL60S ribosomal protein L36a;60S ribosomal prote...MULTI-SECPEP20190611_QX3_LiSc_MA_Hela_500ng_LC15515.72515.81,029.51.70.0-2.9-0.0-1.2-0.0515.816.30.316.316.116.40.0nannannannan56.022.03.00000.0118,124110.480.2174,456,000.0NaNNaN17,8811271;400414,16114,8781978319,783NaN9606
HFELGGDKK9Unmodified_HFELGGDKK_NaNNaN001H0Y5B4;H7BZ11;J3KQN4;P83881;R4GN19;Q969Q0H0Y5B4;Q969Q0Q969Q0RPL36A;RPL36A-HNRNPH2;RPL36AL60S ribosomal protein L36a;60S ribosomal prote...MULTI-SECPEP20190611_QX3_LiSc_MA_Hela_500ng_LC151,030.611,030.51,029.51.40.02.20.03.60.01,030.5106.50.1106.5106.5106.50.0nannannannan9.04.03.00000.01132,8677.91.8126,935,000.0NaNNaN17,8821271;400414,16114,8781978419,784NaN9606
-
- - - -`Type` column indicates if peptide is based on one or more MS-MS spectra. - - -```python -mq_output.peptides.loc[_peptide].to_frame().T -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
N-term cleavage windowC-term cleavage windowAmino acid beforeFirst amino acidSecond amino acidSecond last amino acidLast amino acidAmino acid afterA CountR CountN CountD CountC CountQ CountE CountG CountH CountI CountL CountK CountM CountF CountP CountS CountT CountW CountY CountV CountU CountO CountLengthMissed cleavagesMassProteinsLeading razor proteinStart positionEnd positionGene namesProtein namesUnique (Groups)Unique (Proteins)ChargesPEPScoreIntensityReversePotential contaminantidProtein group IDsMod. peptide IDsEvidence IDsMS/MS IDsBest MS/MSOxidation (M) site IDsTaxonomy IDsMS/MS Count
HFELGGDKKPNCRSKRMLAIKRCKHFELGGDKKRKGQVIAIKRCKHFELGGDKKRKGQVIQF_______KHFKKR0001001210120100000000911,029.5H0Y5B4;H7BZ11;J3KQN4;P83881;R4GN19;Q969Q0Q969Q090.098.0RPL36A;RPL36A-HNRNPH2;RPL36AL60S ribosomal protein L36a;60S ribosomal prote...nono1;20.0110.4101390000NaNNaN141611271;40041487817881;1788219783;1978419,783.0NaN96060
-
- - - -## Differences in intensities b/w peptides.txt and evidence.txt - - -The intensity reported in `peptides.txt` corresponds roughly to the sum of the intensities found in different scans: - - -```python -from numpy.testing import assert_almost_equal - -col_intensity = mq_col.INTENSITY -try: - - assert_almost_equal( - _pep_int_evidence := mq_output.evidence.loc[_peptide, col_intensity].sum(), - _pep_int_peptides := mq_output.peptides.loc[_peptide, col_intensity], - err_msg='Mismatch between evidence.txt summed peptide intensities to reported peptide intensities in peptides.txt') -except AssertionError as e: - logging.error( - f"{e}\n Difference: {_pep_int_evidence - _pep_int_peptides:,.2f}") -``` - - ERROR:root: - Arrays are not almost equal to 7 decimals - Mismatch between evidence.txt summed peptide intensities to reported peptide intensities in peptides.txt - ACTUAL: 101391000.0 - DESIRED: 101390000 - Difference: 1,000.00 - - - -```python -mq_output.evidence.loc[_peptide, col_intensity] -``` - - - - - Sequence - HFELGGDKK 74,456,000.0 - HFELGGDKK 26,935,000.0 - Name: Intensity, dtype: float64 - - - - -```python -mq_output.peptides.loc[_peptide, col_intensity] -``` - - - - - 101390000 - - - -Make this comparison for all peptides - - -```python -_pep_int_evidence = mq_output.evidence.groupby( - level=0).agg({col_intensity: [sum, len]}) -_pep_int_evidence.columns = [col_intensity, 'count'] -_pep_int_evidence -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Intensitycount
Sequence
AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPGAVGER57,739,000.01.0
AAAAAAAAAVSR87,575,000.01.0
AAAAAAAGDSDSWDADAFSVEDPVRK442,780,000.01.0
AAAAAAALQAK3,166,700,000.01.0
AAAAAAGAASGLPGPVAQGLK93,181,000.02.0
.........
YYTSASGDEMVSLK2,132,209,000.08.0
YYTVFDRDNNR139,230,000.01.0
YYVLNALK147,430,000.01.0
YYVTIIDAPGHR11,191,890,000.03.0
YYYIPQYK147,680,000.01.0
-

38787 rows × 2 columns

-
- - - - -```python -_diff = _pep_int_evidence[col_intensity] - \ - mq_output.peptides[col_intensity].astype(float) -mask_diff = _diff != 0.0 -_pep_int_evidence.loc[mask_diff].describe() -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Intensitycount
count4,910.04,910.0
mean1,107,382,829.32.5
std2,536,359,037.31.7
min12,305,100.02.0
25%167,462,500.02.0
50%324,814,500.02.0
75%1,026,113,000.02.0
max57,964,490,000.040.0
-
- - - - -```python -_diff.loc[mask_diff] -``` - - - - - Sequence - AAAASAAEAGIATTGTEDSDDALLK 3,000.0 - AAAIGIDLGTTYSCVGVFQHGK 40,000.0 - AAALEFLNRFEEAK -90,000.0 - AAAPAPEEEMDECEQALAAEPK 21,000.0 - AAASLAAVSGTAAASLGSAQPTDLGAHK -8,000.0 - ... - YYLHDDREGEGSDK 1,000.0 - YYPTEDVPRK 60,000.0 - YYQTIGNHASYYK -1,000.0 - YYTSASGDEMVSLK 9,000.0 - YYVTIIDAPGHR -110,000.0 - Name: Intensity, Length: 4911, dtype: float64 - - - - -```python -_diff[mask_diff].describe() -``` - - - - - count 4,910.0 - mean -38.9 - std 49,225.3 - min -800,000.0 - 25% -4,000.0 - 50% 1,000.0 - 75% 4,000.0 - max 620,000.0 - Name: Intensity, dtype: float64 - - - -Several smaller and larger differences in an intensity range way below the detection limit arise for some sequences. - -### Ideas on source of difference - - Are all peptides (sequences) which are based on single observations in `evidence.txt` represented as is in `peptides.txt`? - - How many peptides with more than one PTM have non-zero differences between the sum of intensity values in `evidence.txt` and the respective value in `peptides.txt`? - - Maybe some peptides are filtered based on assignment as contaminant (`CON__`)? - - -```python -# ToDo see above -``` - - -```python -_diff_indices = _diff[mask_diff].index -# some pep-seq in peptides.txt not in evidence.txt -_diff_indices = _diff_indices.intersection(mq_output.evidence.index.unique()) -``` - - -```python -from numpy import random -sample_index = random.choice(_diff_indices) -``` - - -```python -mq_output.evidence.loc[sample_index] -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
LengthModificationsModified sequenceOxidation (M) ProbabilitiesOxidation (M) Score DiffsAcetyl (Protein N-term)Oxidation (M)Missed cleavagesProteinsLeading proteinsLeading razor proteinGene namesProtein namesTypeRaw fileMS/MS m/zChargem/zMassUncalibrated - Calibrated m/z [ppm]Uncalibrated - Calibrated m/z [Da]Mass error [ppm]Mass error [Da]Uncalibrated mass error [ppm]Uncalibrated mass error [Da]Max intensity m/z 0Retention timeRetention lengthCalibrated retention timeCalibrated retention time startCalibrated retention time finishRetention time calibrationMatch time differenceMatch m/z differenceMatch q-valueMatch scoreNumber of data pointsNumber of scansNumber of isotopic peaksPIFFraction of total spectrumBase peak fractionPEPMS/MS countMS/MS scan numberScoreDelta scoreCombinatoricsIntensityReversePotential contaminantidProtein group IDsPeptide IDMod. peptide IDMS/MS IDsBest MS/MSOxidation (M) site IDsTaxonomy IDs
Sequence
DTSFEQHVLWHTGGK15Unmodified_DTSFEQHVLWHTGGK_NaNNaN000P49327;A0A0U1RQF0P49327P49327FASNFatty acid synthase;[Acyl-carrier-protein] S-a...MULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15581.33581.31,740.81.20.0-0.3-0.00.90.0581.361.20.661.261.061.60.0nannannannan172.046.05.00000.0276,227100.581.71853,650,000.0NaNNaN7,68425896,0116,3388473;84748,474NaN9606
DTSFEQHVLWHTGGK15Unmodified_DTSFEQHVLWHTGGK_NaNNaN000P49327;A0A0U1RQF0P49327P49327FASNFatty acid synthase;[Acyl-carrier-protein] S-a...MULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15436.24436.21,740.81.00.0-0.1-0.00.90.0436.261.20.461.261.161.50.0nannannannan82.034.03.00000.0176,02435.330.21109,010,000.0NaNNaN7,68525896,0116,33884758,475NaN9606
DTSFEQHVLWHTGGK15Unmodified_DTSFEQHVLWHTGGK_NaNNaN000P49327;A0A0U1RQF0P49327P49327FASNFatty acid synthase;[Acyl-carrier-protein] S-a...MULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15871.92871.41,740.81.70.0-0.1-0.01.60.0871.461.20.361.261.161.30.0nannannannan45.020.03.00000.0176,100232.5208.7130,095,000.0NaNNaN7,68625896,0116,33884768,476NaN9606
-
- - - - -```python -mq_output.peptides.loc[sample_index].to_frame().T -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
N-term cleavage windowC-term cleavage windowAmino acid beforeFirst amino acidSecond amino acidSecond last amino acidLast amino acidAmino acid afterA CountR CountN CountD CountC CountQ CountE CountG CountH CountI CountL CountK CountM CountF CountP CountS CountT CountW CountY CountV CountU CountO CountLengthMissed cleavagesMassProteinsLeading razor proteinStart positionEnd positionGene namesProtein namesUnique (Groups)Unique (Proteins)ChargesPEPScoreIntensityReversePotential contaminantidProtein group IDsMod. peptide IDsEvidence IDsMS/MS IDsBest MS/MSOxidation (M) site IDsTaxonomy IDsMS/MS Count
DTSFEQHVLWHTGGKARFPQLDSTSFANSRDTSFEQHVLWHTGGKDTSFEQHVLWHTGGKGVDLVLNSLAEEKLQRDTGKG00010112201101012101001501,740.8P49327;A0A0U1RQF0P493271,725.01,739.0FASNFatty acid synthase;[Acyl-carrier-protein] S-a...yesno2;3;40.0232.5992760000NaNNaN6011258963387684;7685;76868473;8474;8475;84768,476.0NaN96064
-
- - - -### Modifications - - -```python -mq_output.evidence.Modifications.value_counts() -``` - - - - - Unmodified 42,587 - Oxidation (M) 5,740 - 2 Oxidation (M) 405 - Acetyl (Protein N-term) 367 - Acetyl (Protein N-term),Oxidation (M) 96 - 3 Oxidation (M) 12 - Acetyl (Protein N-term),2 Oxidation (M) 9 - Name: Modifications, dtype: int64 - - - -### Potential contaminant peptides - -The `CON__` entries are possible contaminations resulting from sample preparation using, e.g., a serum: - -```python -data_fasta['ENSEMBL:ENSBTAP00000024146'] -data_fasta['P12763'] # bovine serum protein -> present in cell cultures and in list of default contaminants in MQ -data_fasta['P00735'] # also bovine serum protein -``` - - -```python -mask = mq_output.peptides['Potential contaminant'].notna() -mq_output.peptides.loc[mask] -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
N-term cleavage windowC-term cleavage windowAmino acid beforeFirst amino acidSecond amino acidSecond last amino acidLast amino acidAmino acid afterA CountR CountN CountD CountC CountQ CountE CountG CountH CountI CountL CountK CountM CountF CountP CountS CountT CountW CountY CountV CountU CountO CountLengthMissed cleavagesMassProteinsLeading razor proteinStart positionEnd positionGene namesProtein namesUnique (Groups)Unique (Proteins)ChargesPEPScoreIntensityReversePotential contaminantidProtein group IDsMod. peptide IDsEvidence IDsMS/MS IDsBest MS/MSOxidation (M) site IDsTaxonomy IDsMS/MS Count
Sequence
AAANFFSASCVPCADQSSFPKKLYKELPDPQESIQRAAANFFSASCVPCADSASCVPCADQSSFPKLCQLCAGKGTDKCACRAAPKL50112100000103240001002102,261.0CON__Q29443;CON__Q0IIK2CON__Q29443148.0168.0NaNNaNyesno20.0186.442,902,000NaN+5085653596363.0NaN-1;96061
AATLSTLAGQPLLERIRERFGPLVEQGQSRAATLSTLAGQPLLERAATLSTLAGQPLLERAEAWRQKLHGRLEEVRAAERA31000111004000112000001501,539.9CON__Q03247CON__Q03247209.0223.0NaNNaNyesyes20.047.0108,820,000NaN+352854372442475475.0NaN-11
AAVTSLFAKLATTQTDTMLSAEEKAAVTSLFAKVKVDEVLSAEEKAAVTSLFAKVKVDEVGGEALGRLLKAAAKV300000000011010110010090906.5CON__Q3SX09CON__Q3SX0964.072.0NaNNaNyesyes1;20.0152.51,661,000,000NaN+390862413492;493535;536535.0NaN-1;96061
ADLEMQIENLKINGLRRVLDELTLARADLEMQIENLKEELATLARADLEMQIENLKEELAYLKKNHEEEMNRADLKE10110120012110000000001101,302.6Q04695;CON__Q04695;F5GWP8;CON__Q9QWL7;K7EPJ9Q04695202.0212.0KRT17Keratin, type I cytoskeletal 17yesno20.0147.2403,820,000NaN+545855581701768768.0NaN9606;-10
AEAEAWYQTKAEVKAQYEEMAKCSRAEAEAWYQTKFETLQAKCSRAEAEAWYQTKFETLQAQAGKHGDDLRAETKF30000120000100001110001001,195.6CON__P08729;A0A1W2PRP1;CON__Q9DCV7;CON__Q3KNV1...CON__P08729277.0286.0KRT7Keratin, type II cytoskeletal 7nono20.0128.4612,390,000NaN+631848;861670802880880.0NaN-1;96061
...........................................................................................................................................................................
YGFYTHVFRMGIVSWGEGCDRDGKYGFYTHVFRLKKWIQCDRDGKYGFYTHVFRLKKWIQKVIDRLGS_KYGFRL0100000110000200102100901,188.6CON__P00735;E9PIT3;P00734CON__P00735603.0611.0F2Prothrombin;Activation peptide fragment 1;Acti...yesno2;30.055.495,689,000NaN+37,9028394004148124;4812553466;5346753,467.0NaN-1;96062
YICDNQDTISSKGDLLECADDRADLAKYICDNQDTISSKLKELAKYICDNQDTISSKLKECCDKPLLEKSHCKYISKL00121100020100021010001201,442.6CON__P02769CON__P02769286.0297.0NaNNaNyesyes20.0133.231,554,000NaN+38,02784440177482995366053,660.0NaN-1;96061
YLDSTFTKYVPEDTVYKKVVFRKYLDSTFTKLDPQGEYKKVVFRKYLDSTFTKLDPQGEYEEHLGILGKYLTKL000100000011010120100080973.5CON__Q28107;A0A0A0MRJ7;P12259CON__Q281071,610.01,617.0F5Coagulation factor V;Coagulation factor V heav...yesno20.154.836,050,000NaN+38,13085940283484245380353,803.0NaN-1;96061
YVYNYEAESSSGVPGTADSRPSCPKDAVRFKHLRKYVYNYEAESSSGVPGEAESSSGVPGTADSRSATKINCKVELEVPQKYVSRS21110022000000141032002002,150.9CON__ENSEMBL:ENSBTAP00000032840CON__ENSEMBL:ENSBTAP0000003284049.068.0NaNNaNyesyes2;30.0209.1302,140,000NaN+38,7278374090749134;4913554598;5459954,598.0NaN-1;96062
YYGYTGAFREKGTGKECVPNSNERYYGYTGAFRCLVEKGPNSNERYYGYTGAFRCLVEKGDVAFVKDQTRYYFRC1100000200000100103000901,096.5CON__Q29443;CON__Q0IIK2;E7EQB2;E7ER44;P02788-2...CON__Q29443521.0529.0LTFLactotransferrin;Lactoferricin-H;Kaliocin-1;La...yesno20.075.1177,510,000NaN+38,75585640937491685463454,634.0NaN-1;96061
-

348 rows × 56 columns

-
- - - - -```python - -``` - -### Aggregate identifiers in evidence.txt - - -```python -fasta_keys = ["Proteins", "Leading proteins", - "Leading razor protein", "Gene names"] -mq_output.evidence[fasta_keys] -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ProteinsLeading proteinsLeading razor proteinGene names
Sequence
AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPGAVGERR4GMQ1;O60341;O60341-2R4GMQ1R4GMQ1KDM1A
AAAAAAAAAVSRA0A0A6YYC7;Q96JP5-2;Q96JP5A0A0A6YYC7A0A0A6YYC7ZFP91-CNTF;ZFP91
AAAAAAAGDSDSWDADAFSVEDPVRKO75822;O75822-3;O75822-2O75822O75822EIF3J
AAAAAAALQAKP36578;H3BM89;H3BU31P36578P36578RPL4
AAAAAAGAASGLPGPVAQGLKQ96P70Q96P70Q96P70IPO9
...............
YYVLNALKQ6P2Q9;I3L0J9Q6P2Q9Q6P2Q9PRPF8
YYVTIIDAPGHRP68104;Q5VTE0;A0A087WVQ9;P68104-2;A0A087WV01;Q...P68104P68104EEF1A1;EEF1A1P5
YYVTIIDAPGHRP68104;Q5VTE0;A0A087WVQ9;P68104-2;A0A087WV01;Q...P68104P68104EEF1A1;EEF1A1P5
YYVTIIDAPGHRP68104;Q5VTE0;A0A087WVQ9;P68104-2;A0A087WV01;Q...P68104P68104EEF1A1;EEF1A1P5
YYYIPQYKQ8N183;D6RA56;H0YA50Q8N183Q8N183NDUFAF2
-

49216 rows × 4 columns

-
- - - -The protein assignment information is not entirely unique for each group of peptides. - -## align intensities and retention time (RT) for peptides - -- intensities are values reported in `peptides.txt` -- some (few) peptides in `peptides.txt` are not in `evidence.txt`, but then probably zero - - -```python -intensities.index -``` - - - - - Index(['AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPGAVGER', 'AAAAAAAAAVSR', - 'AAAAAAAGDSDSWDADAFSVEDPVRK', 'AAAAAAALQAK', 'AAAAAAGAASGLPGPVAQGLK', - 'AAAAAATAAAAASIR', 'AAAAADLANR', 'AAAAAEQQQFYLLLGNLLSPDNVVR', - 'AAAAASAAGPGGLVAGK', 'AAAAASHLNLDALR', - ... - 'YYTEFPTVLDITAEDPSK', 'YYTGNYDQYVK', 'YYTLFGR', 'YYTPTISR', - 'YYTPVPCESATAK', 'YYTSASGDEMVSLK', 'YYTVFDRDNNR', 'YYVLNALK', - 'YYVTIIDAPGHR', 'YYYIPQYK'], - dtype='object', name='Sequence', length=38788) - - - - -```python -seq_w_summed_intensities = intensities.to_frame().merge( - rt_summary, left_index=True, right_index=True, how='left') -``` - - -```python -seq_w_summed_intensities -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Intensitymedianstd
Sequence
AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPGAVGER57,739,00085.6nan
AAAAAAAAAVSR87,575,00025.6nan
AAAAAAAGDSDSWDADAFSVEDPVRK442,780,00095.0nan
AAAAAAALQAK3,166,700,00026.7nan
AAAAAAGAASGLPGPVAQGLK93,181,00096.90.0
............
YYTSASGDEMVSLK2,132,200,00051.24.6
YYTVFDRDNNR139,230,00043.1nan
YYVLNALK147,430,00070.5nan
YYVTIIDAPGHR11,192,000,00066.40.4
YYYIPQYK147,680,00062.7nan
-

38788 rows × 3 columns

-
- - - - -```python -mask = ~mq_output.evidence.reset_index( -)[["Sequence", "Proteins", "Gene names"]].duplicated() -mask.index = mq_output.evidence.index -``` - - -```python -diff_ = seq_w_summed_intensities.index.unique().difference(mask.index.unique()) -diff_.to_list() -``` - - - - - ['GIPNMLLSEEETES'] - - - - -```python -# mq_output.msms.set_index('Sequence').loc['GIPNMLLSEEETES'] -``` - - -```python -# There is no evidence, but then it is reported in peptides?! -# Is this the case for more than one MQ-RUN (last or first not written to file?) -try: - if len(diff_) > 0: - mq_output.evidence.loc[diff_] -except KeyError as e: - logging.error(e) -``` - - ERROR:root:"None of [Index(['GIPNMLLSEEETES'], dtype='object', name='Sequence')] are in the [index]" - - - -```python -mq_output.peptides.loc[diff_] -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
N-term cleavage windowC-term cleavage windowAmino acid beforeFirst amino acidSecond amino acidSecond last amino acidLast amino acidAmino acid afterA CountR CountN CountD CountC CountQ CountE CountG CountH CountI CountL CountK CountM CountF CountP CountS CountT CountW CountY CountV CountU CountO CountLengthMissed cleavagesMassProteinsLeading razor proteinStart positionEnd positionGene namesProtein namesUnique (Groups)Unique (Proteins)ChargesPEPScoreIntensityReversePotential contaminantidProtein group IDsMod. peptide IDsEvidence IDsMS/MS IDsBest MS/MSOxidation (M) site IDsTaxonomy IDsMS/MS Count
Sequence
GIPNMLLSEEETESTLQCPESGRMFPISRGIPNMLLSEEETES_RGIPNMLLSEEETES_______________RGIES-00100041012010121000001401,547.7Q9UI30-2;Q9UI30;F5GYQ2Q9UI30-2107.0120.0TRMT112Multifunctional methyltransferase subunit TRM1...nonoNaN1.0nan0NaNNaN12,4314858NaNNaNNaNnanNaN96060
-
- - - -### Option: Peptide scan with highest score for repeatedly measured peptides - -- only select one of repeated peptide scans, namely the one with the highest score -- discards information, no summation of peptide intensities -- yields unique retention time per peptide, by discarding additional information - - -```python -COL_SCORE = 'Score' -mq_output.evidence.groupby(level=0)[COL_SCORE].max() -``` - - - - - Sequence - AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPGAVGER 102.7 - AAAAAAAAAVSR 68.6 - AAAAAAAGDSDSWDADAFSVEDPVRK 157.9 - AAAAAAALQAK 144.4 - AAAAAAGAASGLPGPVAQGLK 119.7 - ... - YYTSASGDEMVSLK 192.3 - YYTVFDRDNNR 73.8 - YYVLNALK 58.7 - YYVTIIDAPGHR 131.8 - YYYIPQYK 83.8 - Name: Score, Length: 38787, dtype: float64 - - - - -```python -mask_max_per_seq = mq_output.evidence.groupby( - level=0)[COL_SCORE].transform("max").eq(mq_output.evidence[COL_SCORE]) -mask_intensity_not_na = mq_output.evidence.Intensity.notna() -mask = mask_max_per_seq & mask_intensity_not_na -``` - -This leads to a non-unique mapping, as some scores are exactly the same for two peptides. - - -```python -mask_duplicates = mq_output.evidence.loc[mask].sort_values( - mq_col.INTENSITY).index.duplicated() -sequences_duplicated = mq_output.evidence.loc[mask].index[mask_duplicates] -mq_output.evidence.loc[mask].loc[sequences_duplicated, [ - COL_SCORE, mq_col.INTENSITY, mq_col.RETENTION_TIME]] # .groupby(level=0).agg({mq_col.INTENSITY : max}) -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ScoreIntensityRetention time
Sequence
EFTNVYIK78.61,466,400,000.056.1
EYFGGFGEVESIELPMDNK286.0207,640,000.0105.5
GPALELVATVAK45.924,976,000.086.8
KVEELEGEITTLNHK110.3339,210,000.055.6
LANCHLDLVR29.328,709,000.048.1
LDDDSERK56.445,506,000.02.1
LGASPLHVDLATLR93.5115,420,000.088.1
LQIPEEER94.162,826,000.043.5
LRDNGLLAK41.015,599,000.028.3
LVGPEEALSPGEAR45.9259,180,000.060.0
LYTAWAYQHELEETTVPEIQR109.936,863,000.095.4
NFGEDMDDER81.338,630,000.017.7
NNPDIPELHRPVVK31.027,234,000.053.5
PLSAVPK77.718,672,000.061.9
QLAVAEGKPPEAPK88.221,723,000.040.9
QVFGTHTTQK31.887,260,000.014.3
RDFAPPGQQK99.9108,060,000.020.5
SSGGSEHSTEGSVSLGDGQLNR90.7218,500,000.040.1
SVMAAAQVAGLNCLR181.951,002,000.083.2
THFDYQFGYR29.566,199,000.058.0
TKPQDMISAGGESVAGITAISGKPGDK107.239,795,000.078.3
YEIDLDTSDHAHLEHITR150.6589,160,000.062.1
YSPTSPTYSPTSPK118.937,969,000.040.4
-
- - - - -```python -mask = mq_output.evidence.reset_index().sort_values( - by=["Sequence", "Score", mq_col.INTENSITY]).duplicated(subset=["Sequence", "Score"], keep='last') -_sequences = mq_output.evidence.index[mask] -mq_output.evidence.loc[_sequences, [ - "Score", "Retention time", mq_col.INTENSITY, "Proteins"]] -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ScoreRetention timeIntensityProteins
Sequence
ADLEEYMSKPMK57.654.9nanNaN
ADLEEYMSKPMK57.654.946,217,000.0NaN
AGEGTYALDSESCMEK165.735.337,690,000.0B5MCF9;O00541-2;O00541;B3KXD6
AGEGTYALDSESCMEK165.746.227,125,000.0B5MCF9;O00541-2;O00541;B3KXD6
ASINMLR75.325.7318,790,000.0P18124;A8MUD9;C9JIJ5
...............
YYTSASGDEMVSLK109.652.024,332,000.0P07900;P07900-2
YYTSASGDEMVSLK61.452.816,513,000.0P07900;P07900-2
YYTSASGDEMVSLK85.348.9nanP07900;P07900-2
YYTSASGDEMVSLK45.357.6nanP07900;P07900-2
YYTSASGDEMVSLK192.358.6790,360,000.0P07900;P07900-2
-

1050 rows × 4 columns

-
- - - -- random, non missing intensity? - - -```python -aggregators = ["Sequence", "Score", mq_col.INTENSITY] -mask_intensity_not_na = mq_output.evidence.Intensity.notna() -seq_max_score_max_intensity = mq_output.evidence.loc[mask_intensity_not_na].reset_index( -)[aggregators+["Proteins", "Gene names"]].sort_values(by=aggregators).set_index("Sequence").groupby(level=0).last() -seq_max_score_max_intensity -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ScoreIntensityProteinsGene names
Sequence
AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPGAVGER102.757,739,000.0R4GMQ1;O60341;O60341-2KDM1A
AAAAAAAAAVSR68.687,575,000.0A0A0A6YYC7;Q96JP5-2;Q96JP5ZFP91-CNTF;ZFP91
AAAAAAAGDSDSWDADAFSVEDPVRK157.9442,780,000.0O75822;O75822-3;O75822-2EIF3J
AAAAAAALQAK144.43,166,700,000.0P36578;H3BM89;H3BU31RPL4
AAAAAAGAASGLPGPVAQGLK119.753,015,000.0Q96P70IPO9
...............
YYTSASGDEMVSLK192.3790,360,000.0P07900;P07900-2HSP90AA1
YYTVFDRDNNR73.8139,230,000.0A0A1B0GVD5;A0A1B0GWE8;P07339;A0A1B0GW44;A0A1B0...CTSD
YYVLNALK58.7147,430,000.0Q6P2Q9;I3L0J9PRPF8
YYVTIIDAPGHR131.88,630,000,000.0P68104;Q5VTE0;A0A087WVQ9;P68104-2;A0A087WV01;Q...EEF1A1;EEF1A1P5
YYYIPQYK83.8147,680,000.0Q8N183;D6RA56;H0YA50NDUFAF2
-

36899 rows × 4 columns

-
- - - - -```python -# drop NA intensities first. -assert seq_max_score_max_intensity.Intensity.isna().sum() == 0 -``` - -Certain peptides have no Protein or gene assigned. - - -```python -seq_max_score_max_intensity.isna().sum() -``` - - - - - Score 0 - Intensity 0 - Proteins 74 - Gene names 310 - dtype: int64 - - - - -```python -mask_seq_selected_not_assigned = seq_max_score_max_intensity.Proteins.isna( -) | seq_max_score_max_intensity["Gene names"].isna() -seq_max_score_max_intensity.loc[mask_seq_selected_not_assigned] -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ScoreIntensityProteinsGene names
Sequence
AAAAMAK67.4961,050,000.0NaNNaN
AAANFFSASCVPCADQSSFPK186.442,902,000.0CON__Q29443;CON__Q0IIK2NaN
AALLDQALSNAR59.021,232,000.0Q9UPN3-3;Q9UPN3-2NaN
AATLSTLAGQPLLER47.0108,820,000.0CON__Q03247NaN
AAVTSLFAK152.51,616,000,000.0CON__Q3SX09NaN
...............
YEDEINRR123.7632,990,000.0CON__P08729NaN
YICDNQDTISSK133.231,554,000.0CON__P02769NaN
YLDISMTPDVSNK68.070,850,000.0NaNNaN
YLQSVFVQVAAATTYR54.823,021,000.0NaNNaN
YVYNYEAESSSGVPGTADSR209.1195,420,000.0CON__ENSEMBL:ENSBTAP00000032840NaN
-

310 rows × 4 columns

-
- - - -These might be candidates for evaluating predictions, as the information is measured, but unknown. If they cannot be assigned, the closest fit on different genes with model predictions could be a criterion for selection. - -## Create dumps of intensities in `peptides.txt` - - -```python -# mq_output.evidence.loc["AAAGGGGGGAAAAGR"] -``` - - -```python -# ToDo: dump this? -mq_output.dump_intensity(folder='data/peptides_txt_intensities/') -``` - - vaep.io.mq - INFO Dumped intensities in peptides.txt: data\peptides_txt_intensities\20190611_QX3_LiSc_MA_Hela_500ng_LC15.json. - INFO:vaep.io.mq:Dumped intensities in peptides.txt: data\peptides_txt_intensities\20190611_QX3_LiSc_MA_Hela_500ng_LC15.json. - - -## Create dumps per gene - -Some hundred peptides map to more than two genes - - -```python -def length_(x): - """Len function which return 0 if object (probably np.nan) has no length. - Otherwise return length of list, pandas.Series, numpy.array, dict, etc.""" - try: - return len(x) - except: - return 0 - - -seq_max_score_max_intensity[mq_col.GENE_NAMES].str.split(";" - ).apply(lambda x: length_(x) - ).value_counts( -).sort_index() -``` - - - - - 0 310 - 1 34,310 - 2 1,735 - 3 322 - 4 91 - 5 42 - 6 21 - 7 30 - 8 7 - 9 10 - 10 4 - 11 2 - 12 2 - 13 1 - 14 4 - 15 4 - 16 1 - 17 2 - 67 1 - Name: Gene names, dtype: int64 - - - -Mostly unique genes associated with a peptide. - -### Select sensible training data per gene -- sequence coverage information? -- minimal number or minimal sequence coverage, otherwise discarded -- multiple genes: - - select first and add reference in others - - split and dump repeatedly - -Load fasta-file information - - -```python -import json - -import src.config - -with open(src.config.FN_FASTA_DB) as f: - data_fasta = json.load(f) -print(f'Number of proteins in fasta file DB: {len(data_fasta)}') -``` - - Number of proteins in fasta file DB: 96418 - - - -```python -# schema validation? Load class with schema? 
-# -> Fasta-File creation should save schema with it -``` - -### Fasta Entries considered as contaminants by MQ - - -```python -mask_potential_contaminant = mq_output.peptides['Potential contaminant'] == '+' -contaminants = mq_output.peptides.loc[mask_potential_contaminant, [mq_col.PROTEINS, mq_col.LEADING_RAZOR_PROTEIN]] -contaminants.head() -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ProteinsLeading razor protein
Sequence
AAANFFSASCVPCADQSSFPKCON__Q29443;CON__Q0IIK2CON__Q29443
AATLSTLAGQPLLERCON__Q03247CON__Q03247
AAVTSLFAKCON__Q3SX09CON__Q3SX09
ADLEMQIENLKQ04695;CON__Q04695;F5GWP8;CON__Q9QWL7;K7EPJ9Q04695
AEAEAWYQTKCON__P08729;A0A1W2PRP1;CON__Q9DCV7;CON__Q3KNV1...CON__P08729
-
- - - - -```python -unique_cont = contaminants[mq_col.PROTEINS].str.split(';').to_list() -set_all = set().union(*unique_cont) -set_cont = {x.split('CON__')[-1] for x in set_all if 'CON__' in x} -set_proteins_to_remove = set_all.intersection(set_cont) -set_proteins_to_remove -``` - - - - - {'O43790', - 'O76013', - 'O76014', - 'O76015', - 'O95678', - 'P02533', - 'P02538', - 'P04259', - 'P04264', - 'P05787', - 'P08729', - 'P08779', - 'P12035', - 'P13645', - 'P13647', - 'P19012', - 'P19013', - 'P35900', - 'P35908', - 'P48668', - 'P78385', - 'P78386', - 'Q01546', - 'Q04695', - 'Q14525', - 'Q14532', - 'Q14533', - 'Q15323', - 'Q2M2I5', - 'Q3SY84', - 'Q7RTS7', - 'Q7Z3Y7', - 'Q7Z794', - 'Q8N1A0', - 'Q8N1N4-2', - 'Q92764', - 'Q9C075', - 'Q9NSB2'} - - - -List of proteins which are both in the fasta file and potential contaminants - - -```python -mask = mq_output.peptides[mq_col.LEADING_RAZOR_PROTEIN].isin(set_proteins_to_remove) -mq_output.peptides.loc[mask, 'Potential contaminant'].value_counts() # ToDo: Remove potential contaminants, check evidence.txt -``` - - - - - + 63 - Name: Potential contaminant, dtype: int64 - - - -### `id_map`: Find genes based on fasta file - -Using `ID_MAP`, all protein entries for that gene are queried and combined. - - -```python -# # slow! discarded for now - -# from src.src.config import FN_ID_MAP - -# with open(FN_ID_MAP) as f: -# id_map = json.load(f) -# id_map = pd.read_json(FN_ID_MAP, orient="split") - -# protein_groups_per_gene = id_map.groupby(by="gene") -# gene_found = [] -# for name, gene_data in protein_groups_per_gene: - -# _peptides = set() -# for protein_id in gene_data.index: -# _peptides = _peptides.union(p for p_list in data_fasta[protein_id]['peptides'] -# for p in p_list) - -# # select intersection of theoretical peptides for gene with observed peptides -# _matched = mq_output.peptides.index.intersection(_peptides) -# # add completness? 
-# if not _matched.empty and len(_matched) > 3: -# gene_found.append(name) -# # -# if not len(gene_found) % 500 : -# print(f"Found {len(gene_found):6}") -# print(f"Total: {len(gene_found):5}") -``` - -Compare this with the entries in the `Gene names` column of `peptides.txt` - -> Mapping is non-unique. MQ has no threshold on number of identified peptides. (How many (unique) peptides does MQ need?) - -### `peptides.txt`: Multiple genes per peptide - -- can gene name be collapsed meaningfully? -- some gene groups share common stem -> can this be used? - - -```python -mq_output.peptides[mq_col.GENE_NAMES].head(10) -``` - - - - - Sequence - AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPGAVGER KDM1A - AAAAAAAAAVSR ZFP91-CNTF;ZFP91 - AAAAAAAGDSDSWDADAFSVEDPVRK EIF3J - AAAAAAALQAK RPL4 - AAAAAAGAASGLPGPVAQGLK IPO9 - AAAAAATAAAAASIR SCFD1 - AAAAADLANR CLPX - AAAAAEQQQFYLLLGNLLSPDNVVR IPO5 - AAAAASAAGPGGLVAGK UNC119B - AAAAASHLNLDALR TRIM32 - Name: Gene names, dtype: object - - - - -```python -import vaep.io.mq as mq - -gene_sets_unique = mq_output.peptides["Gene names"].unique() - -N_GENE_SETS = len(gene_sets_unique) -print(f'There are {N_GENE_SETS} unique sets of genes.') -assert N_GENE_SETS != 0, 'No genes?' - -genes_single_unique = mq.get_set_of_genes(gene_sets_unique) -N_GENE_SINGLE_UNIQUE = len(genes_single_unique) - -mq.validate_gene_set(N_GENE_SINGLE_UNIQUE, N_GENE_SETS) -``` - - There are 5953 unique sets of genes. - There are however less unique-single genes 5876 than sets. - - -How often do gene names appear in unique sets? 
- - -```python -genes_counted_each_in_unique_sets = pd.Series(mq.count_genes_in_sets( - gene_sets=gene_sets_unique)) - -title_ = 'Frequency of counts for each gene in unique set of genes' - -ax = genes_counted_each_in_unique_sets.value_counts().sort_index().plot( - kind='bar', - title=title_, - xlabel='Count of a gene', - ylabel='Frequency of counts', - ax=None, -) -fig = ax.get_figure() - -fig_folder = FIGUREFOLDER / mq_output.folder.stem -fig_folder.mkdir(exist_ok=True) -fig.savefig(fig_folder / f'{title_}.pdf') -``` - - -![png](01_explore_raw_MQ_data_files/01_explore_raw_MQ_data_111_0.png) - - -Unique gene sets with more than one gene: - - -```python -gene_sets_unique = pd.Series(gene_sets_unique).dropna() - -mask_more_than_one_gene = gene_sets_unique.str.contains(';') -gene_sets_unique.loc[mask_more_than_one_gene] -``` - - - - - 1 ZFP91-CNTF;ZFP91 - 12 EEF1E1;EEF1E1-BLOC1S5 - 42 HSPA1B;HSPA1A - 98 HNRNPUL2;HNRNPUL2-BSCL2 - 111 GALNT2;POC1B-GALNT4;GALNT4 - ... - 5,944 RIC8A;RIC8B - 5,945 RFX1;RFX3;RFX2 - 5,947 SNX1;SNX2 - 5,949 STAG2;STAG1 - 5,951 FERMT2;FERMT1 - Length: 1019, dtype: object - - - -### Long format for genes - `peptides_with_single_gene` - -Expand the rows for sets of genes using [`pandas.DataFrame.explode`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.explode.html). - -Is each group of peptides assigned to only one unique set of genes? Genes can have more than one protein. - - first build groups - - then see matches (see further below) - - - -```python -peptides_with_single_gene = mq.get_peptides_with_single_gene( - peptides=mq_output.peptides) -peptides_with_single_gene -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
IntensityLeading razor proteinProteinsGene names
Sequence
AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPGAVGER57,739,000R4GMQ1R4GMQ1;O60341;O60341-2KDM1A
AAAAAAAAAVSR87,575,000A0A0A6YYC7A0A0A6YYC7;Q96JP5-2;Q96JP5ZFP91-CNTF
AAAAAAAAAVSR87,575,000A0A0A6YYC7A0A0A6YYC7;Q96JP5-2;Q96JP5ZFP91
AAAAAAAGDSDSWDADAFSVEDPVRK442,780,000O75822O75822;O75822-3;O75822-2EIF3J
AAAAAAALQAK3,166,700,000P36578P36578;H3BM89;H3BU31RPL4
...............
YYTVFDRDNNR139,230,000A0A1B0GVD5A0A1B0GVD5;A0A1B0GWE8;P07339;A0A1B0GW44;A0A1B0...CTSD
YYVLNALK147,430,000Q6P2Q9Q6P2Q9;I3L0J9PRPF8
YYVTIIDAPGHR11,192,000,000P68104P68104;Q5VTE0;A0A087WVQ9;P68104-2;A0A087WV01;Q...EEF1A1
YYVTIIDAPGHR11,192,000,000P68104P68104;Q5VTE0;A0A087WVQ9;P68104-2;A0A087WV01;Q...EEF1A1P5
YYYIPQYK147,680,000Q8N183Q8N183;D6RA56;H0YA50NDUFAF2
-

42131 rows × 4 columns

-
- - - - -```python -peptides_with_single_gene.dtypes -``` - - - - - Intensity int64 - Leading razor protein object - Proteins object - Gene names object - dtype: object - - - - -```python -print( - f"DataFrame has due to unfolding now {len(peptides_with_single_gene)} instead of {len(mq_output.peptides)} rows") -``` - - DataFrame has due to unfolding now 42131 instead of 38788 rows - - -Should peptides from potential contaminants be considered? - - -```python -mask = peptides_with_single_gene['Proteins'].str.contains('CON__') -peptides_with_single_gene.loc[mask] -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
IntensityLeading razor proteinProteinsGene names
Sequence
ADLEMQIENLK403,820,000Q04695Q04695;CON__Q04695;F5GWP8;CON__Q9QWL7;K7EPJ9KRT17
AEAEAWYQTK612,390,000CON__P08729CON__P08729;A0A1W2PRP1;CON__Q9DCV7;CON__Q3KNV1...KRT7
AEAESMYQIK624,330,000P05787P05787;CON__P05787;P05787-2;CON__H-INV:HIT0002...KRT8
AGWNAYIDNLMADGTCQDAAIVGYK1,810,900,000P07737P07737;CON__P02584PFN1
AKQEELEAALQR562,380,000CON__P08729CON__P08729;CON__Q3KNV1;P08729KRT7
...............
YETELAMR1,442,600,000P05783P05783;F8VZY9;CON__P05784KRT18
YGFYTHVFR95,689,000CON__P00735CON__P00735;E9PIT3;P00734F2
YLDSTFTK36,050,000CON__Q28107CON__Q28107;A0A0A0MRJ7;P12259F5
YPNCPTDVR0O43929O43929;O43929-2;O43929-3;CON__Q2YDI2ORC4
YYGYTGAFR177,510,000CON__Q29443CON__Q29443;CON__Q0IIK2;E7EQB2;E7ER44;P02788-2...LTF
-

351 rows × 4 columns

-
- - - - -```python -_mask_con = peptides_with_single_gene.loc[mask, mq_col.PROTEINS].str.split(";" - ).apply(lambda x: [True if "CON_" in item else False for item in x] - ).apply(all) - -assert _mask_con.sum() == 0, "There are peptides resulting only from possible confounders: {}".format( - ", ".join(str(x) for x in peptides_with_single_gene.loc[mask, mq_col.PROTEINS].loc[_mask_con].index)) -``` - - -```python -peptides_per_gene = peptides_with_single_gene.value_counts(mq_col.GENE_NAMES) -peptides_per_gene -``` - - - - - Gene names - PLEC 195 - AHNAK 164 - DYNC1H1 140 - PRKDC 121 - FLNB 106 - .. - COMMD3 1 - NDFIP2 1 - COMMD6 1 - COMMD8 1 - A1CF 1 - Length: 5876, dtype: int64 - - - - -#### Find genes based on `Gene names` column in elonged data-set - -More efficient as it does not query unnecessary data or data twice. - - -```python -protein_groups_per_gene = peptides_with_single_gene.groupby( - by=mq_col.GENE_NAMES, dropna=True) - -gene_data = protein_groups_per_gene.get_group(peptides_per_gene.index[3]) -gene_data -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
IntensityLeading razor proteinProteinsGene names
Sequence
AALSALESFLK781,140,000P78527P78527;P78527-2PRKDC
AFLGELK356,860,000P78527P78527;P78527-2PRKDC
AGLLHNILPSQSTDLHHSVGTELLSLVYK1,136,800,000P78527P78527;P78527-2PRKDC
AIRPQIDLK350,810,000P78527P78527;P78527-2PRKDC
AMHGELQK115,600,000P78527P78527;P78527-2PRKDC
...............
VVQMLGSLGGQINK92,400,000P78527P78527;P78527-2PRKDC
WCAHTNVELK30,859,000P78527P78527;P78527-2PRKDC
YFEGVSPK377,530,000P78527P78527;P78527-2PRKDC
YNFPVEVEVPMER229,970,000P78527P78527;P78527-2PRKDC
YPEETLSLMTK0P78527P78527;P78527-2PRKDC
-

121 rows × 4 columns

-
- - - - -```python -list_of_proteins = gene_data[mq_col.PROTEINS].str.split(';').to_list() -set_of_proteins = set().union(*list_of_proteins) -set_of_proteins = {x for x in set_of_proteins if 'CON__' not in x} -set_of_proteins -``` - - - - - {'F5GX40', 'H0YG84', 'P78527', 'P78527-2'} - - - - -```python -gene_data[mq_col.PROTEINS].value_counts() # combine? select first in case of a CON_ as leading razor protein? -``` - - - - - P78527;P78527-2 114 - P78527;P78527-2;F5GX40 4 - P78527 2 - P78527;P78527-2;H0YG84 1 - Name: Proteins, dtype: int64 - - - - -```python -protein_id = set_of_proteins.pop() -print(protein_id) -data_fasta[protein_id]['seq'] -``` - - F5GX40 - - - - - - 'MGQKIAPYSVEIKNTCTSVYTKDRAAKCKIPALDLLIKLLQTFRSSRLMDEFKIGELFSKFYGELALKKK' - - - - -```python -data_fasta[protein_id] -``` - - - - - {'meta': '>tr|F5GX40|F5GX40_HUMAN Isoform of P78527, DNA-dependent protein kinase catalytic subunit (Fragment) OS=Homo sapiens OX=9606 GN=PRKDC PE=1 SV=1', - 'gene': 'PRKDC', - 'seq': 'MGQKIAPYSVEIKNTCTSVYTKDRAAKCKIPALDLLIKLLQTFRSSRLMDEFKIGELFSKFYGELALKKK', - 'peptides': [['IAPYSVEIK', 'NTCTSVYTK', 'IPALDLLIK', 'IGELFSK', 'FYGELALK'], - ['MGQKIAPYSVEIK', - 'IAPYSVEIKNTCTSVYTK', - 'NTCTSVYTKDR', - 'CKIPALDLLIK', - 'IPALDLLIKLLQTFR', - 'LLQTFRSSR', - 'SSRLMDEFK', - 'LMDEFKIGELFSK', - 'IGELFSKFYGELALK', - 'FYGELALKK'], - ['MGQKIAPYSVEIKNTCTSVYTK', - 'IAPYSVEIKNTCTSVYTKDR', - 'NTCTSVYTKDRAAK', - 'DRAAKCK', - 'AAKCKIPALDLLIK', - 'CKIPALDLLIKLLQTFR', - 'IPALDLLIKLLQTFRSSR', - 'LLQTFRSSRLMDEFK', - 'SSRLMDEFKIGELFSK', - 'LMDEFKIGELFSKFYGELALK', - 'IGELFSKFYGELALKK', - 'FYGELALKKK']]} - - - -### Sample completeness -Find a sample with a certain completeness level: - - -```python -peps_exact_cleaved = mq.find_exact_cleaved_peptides_for_razor_protein( - gene_data, fasta_db=data_fasta) -peps_exact_cleaved[:10] -``` - - - - - ['MAGSGAGVR', - 'LQETLSAADR', - 'CGAALAGHQLIR', - 'GLGQECVLSSSPAVLALQTSLVFSR', - 'DFGLLVFVR', - 'SLNSIEFR', - 'FLCIFLEK', - 'IAPYSVEIK', - 'NTCTSVYTK', - 
'IPALDLLIK'] - - - -Then search the list of possible peptides originating from the fasta files assuming no miscleavages to the set of found peptides. - -- How many unique exact-cleaved peptides can be mapped to any peptide found in the sample (**completness**)? - - -```python -peps_in_data = gene_data.index - -mq.calculate_completness_for_sample( - peps_exact_cleaved=peps_exact_cleaved, - peps_in_data=peps_in_data) -``` - - - - - 0.4978723404255319 - - - -The number of peptides found can be then used to calculate the completeness - -Select candidates by completeness of training data in single samples and save by experiment name - - -```python -mq_output.folder.stem # needs to go to root? -``` - - - - - '20190611_QX3_LiSc_MA_Hela_500ng_LC15' - - - -### GeneData accessor? - -- [Registering custom accessors tutorial](https://pandas.pydata.org/pandas-docs/stable/development/extending.html#registering-custom-accessors) - - -```python -# @pd.api.extensions.register_dataframe_accessor('gene') -# class GeneDataAccessor: - -# COL_INTENSITY = mq_col.INTENSITY -# COL_RAZOR_PROT = 'Leading razor protein' -# COL_PROTEINS = 'Proteins' -# COL_GENE_NAME = 'Gene names' - -# COLS_EXPECTED = {COL_INTENSITY, COL_RAZOR_PROT, COL_PROTEINS, COL_GENE_NAME} - -# def __init__(self, pandas_df): -# self._validate(df=pandas_df) - -# @classmethod -# def _validate(cls, df): -# """Verify if expected columns and layout apply to panda.DataFrame (view)""" -# _found_columns = cls.COLS_EXPECTED.intersection(df.columns) -# if not _found_columns == cls.COLS_EXPECTED: -# raise AttributeError("Expected columns not in DataFrame: {}".format( -# list(cls.COLS_EXPECTED - _found_columns))) -# if not len(df[COL_RAZOR_PROT].unique()) != 1: - - -# # GeneDataAccessor(gene_data.drop(mq_col.INTENSITY, axis=1)) -# # GeneDataAccessor(gene_data) -# # gene_data.drop(mq_col.INTENSITY, axis=1).gene -# gene_data.gene -``` - -### Gene Data Mapper? 
- - -```python -class GeneDataMapper: - - COL_INTENSITY = mq_col.INTENSITY - COL_RAZOR_PROT = mq_col.LEADING_RAZOR_PROTEIN - COL_PROTEINS = mq_col.PROTEINS - COL_GENE_NAME = mq_col.GENE_NAMES - - COLS_EXPECTED = {COL_INTENSITY, COL_RAZOR_PROT, - COL_PROTEINS, COL_GENE_NAME} - - def __init__(self, pandas_df, fasta_dict): - self._validate(df=pandas_df) - self._df = pandas_df - self._fasta_dict = fasta_dict - - # self.log? - - @classmethod - def _validate(cls, df): - """Verify if expected columns and layout apply to panda.DataFrame (view)""" - _found_columns = cls.COLS_EXPECTED.intersection(df.columns) - if not _found_columns == cls.COLS_EXPECTED: - raise AttributeError("Expected columns not in DataFrame: {}".format( - list(cls.COLS_EXPECTED - _found_columns))) - if len(df[cls.COL_RAZOR_PROT].unique()) != 1: - raise ValueError( - "Non-unique razor-protein in DataFrame: ", df[cls.COL_RAZOR_PROT].unique()) - - def __repr__(self): - return f"{self.__class__.__name__} at {id(self)}" - - -GeneDataMapper(gene_data, data_fasta) -``` - - - - - GeneDataMapper at 2431469223024 - - - -### Dump samples as json - -- select unique gene-names in set (have to be shared by all peptides) -- dump peptide intensities as json from `peptides.txt` - - -```python -peptides_with_single_gene # long-format with repeated peptide information by gene -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
IntensityLeading razor proteinProteinsGene names
Sequence
AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPGAVGER57,739,000R4GMQ1R4GMQ1;O60341;O60341-2KDM1A
AAAAAAAAAVSR87,575,000A0A0A6YYC7A0A0A6YYC7;Q96JP5-2;Q96JP5ZFP91-CNTF
AAAAAAAAAVSR87,575,000A0A0A6YYC7A0A0A6YYC7;Q96JP5-2;Q96JP5ZFP91
AAAAAAAGDSDSWDADAFSVEDPVRK442,780,000O75822O75822;O75822-3;O75822-2EIF3J
AAAAAAALQAK3,166,700,000P36578P36578;H3BM89;H3BU31RPL4
...............
YYTVFDRDNNR139,230,000A0A1B0GVD5A0A1B0GVD5;A0A1B0GWE8;P07339;A0A1B0GW44;A0A1B0...CTSD
YYVLNALK147,430,000Q6P2Q9Q6P2Q9;I3L0J9PRPF8
YYVTIIDAPGHR11,192,000,000P68104P68104;Q5VTE0;A0A087WVQ9;P68104-2;A0A087WV01;Q...EEF1A1
YYVTIIDAPGHR11,192,000,000P68104P68104;Q5VTE0;A0A087WVQ9;P68104-2;A0A087WV01;Q...EEF1A1P5
YYYIPQYK147,680,000Q8N183Q8N183;D6RA56;H0YA50NDUFAF2
-

42131 rows × 4 columns

-
- - - - -```python -root_logger = logging.getLogger() -root_logger.handlers = [] -root_logger.handlers -``` - - - - - [] - - - - -```python -genes_counted_each_in_unique_sets = pd.Series(mq.count_genes_in_sets( - gene_sets=gene_sets_unique)) - -# # ToDo: Develop -# class MaxQuantTrainingDataExtractor(): -# """Class to extract training data from `MaxQuantOutput`.""" - -# def __init__(self, out_folder): -# self.out_folder = Path(out_folder) -# self.out_folder.mkdir(exist_ok=True) -# self.fname_template = '{gene}.json' - -completeness_per_gene = mq.ExtractFromPeptidesTxt( - out_folder='train', mq_output_object=mq_output, fasta_db=data_fasta)() -``` - - vaep.io.mq - INFO - Gene: A2M : Potential contaminent protein is leading razor protein: CON__ENSEMBL:ENSBTAP00000024146 (Gene: A2M) - vaep.io.mq - WARNING - Gene: ABR : More than one razor protein (try first): B7Z683, P11274 (Gene: ABR) - vaep.io.mq - WARNING - Gene: ACSL4 : More than one razor protein (try first): O60488-2, O95573 (Gene: ACSL4) - vaep.io.mq - WARNING - Gene: ACTA1 : More than one razor protein (try first): P63261, P68133 (Gene: ACTA1) - vaep.io.mq - WARNING - Gene: ACTA2 : More than one razor protein (try first): P63261, P68133 (Gene: ACTA2) - vaep.io.mq - WARNING - Gene: ACTB : More than one razor protein (try first): P63261, P60709 (Gene: ACTB) - vaep.io.mq - WARNING - Gene: ACTC1 : More than one razor protein (try first): P63261, P68133 (Gene: ACTC1) - vaep.io.mq - WARNING - Gene: ACTG2 : More than one razor protein (try first): P63261, P68133 (Gene: ACTG2) - vaep.io.mq - WARNING - Gene: ACTN1 : More than one razor protein (try first): P12814, O43707 (Gene: ACTN1) - vaep.io.mq - WARNING - Gene: ACTN2 : More than one razor protein (try first): P12814, O43707 (Gene: ACTN2) - vaep.io.mq - WARNING - Gene: ACTN3 : More than one razor protein (try first): O43707, P12814 (Gene: ACTN3) - vaep.io.mq - WARNING - Gene: ACTR1B : More than one razor protein (try first): P61163, P42025 (Gene: ACTR1B) - vaep.io.mq 
- WARNING - Gene: AHCYL2 : More than one razor protein (try first): O43865, Q96HN2-2 (Gene: AHCYL2) - vaep.io.mq - WARNING - Gene: AHNAK2 : More than one razor protein (try first): Q09666, Q8IVF2-3 (Gene: AHNAK2) - vaep.io.mq - INFO - Gene: AHSG : Potential contaminent protein is leading razor protein: CON__P12763 (Gene: AHSG) - vaep.io.mq - WARNING More than one set of genes: ['CON__P12763;P02765' 'CON__P12763;P02765;C9JV77'] - vaep.io.mq - INFO - Gene: ALB : Potential contaminent protein is leading razor protein: CON__P02769 (Gene: ALB) - vaep.io.mq - WARNING More than one set of genes: ['CON__P02769;C9JKR2;H0YA55;D6RHD5;B7WNR0;A0A0C4DGB6;CON__P02768-1;P02768-2;P02768;A0A087WWT3;P02768-3' - 'CON__P02769;C9JKR2;H0YA55;D6RHD5;B7WNR0;A0A0C4DGB6;CON__P02768-1;P02768-2;P02768'] - vaep.io.mq - WARNING - Gene: ALB : Non-unique other protein set found (select first): D6RHD5, P02768, B7WNR0, P02768-2, A0A0C4DGB6, H0YA55, C9JKR2 - vaep.io.mq - WARNING - Gene: ALDOC : More than one razor protein (try first): P09972, P04075 (Gene: ALDOC) - vaep.io.mq - WARNING - Gene: ANKRD17 : More than one razor protein (try first): Q8IWZ3-6, H0YM23 (Gene: ANKRD17) - vaep.io.mq - WARNING - Gene: ANP32B : More than one razor protein (try first): Q92688-2, P39687 (Gene: ANP32B) - vaep.io.mq - WARNING - Gene: AP1B1 : More than one razor protein (try first): Q10567-3, P63010 (Gene: AP1B1) - vaep.io.mq - WARNING - Gene: AP2A2 : More than one razor protein (try first): O95782-2, O94973 (Gene: AP2A2) - vaep.io.mq - WARNING - Gene: APOB : More than one razor protein (try first): P04114, CON__ENSEMBL:ENSBTAP00000032840 (Gene: APOB) - vaep.io.mq - WARNING - Gene: APOE : More than one razor protein (try first): P02649, CON__Q03247 (Gene: APOE) - vaep.io.mq - WARNING - Gene: ARF4 : More than one razor protein (try first): P84077, P18085 (Gene: ARF4) - vaep.io.mq - WARNING - Gene: ARF5 : More than one razor protein (try first): P84077, P84085 (Gene: ARF5) - vaep.io.mq - WARNING - Gene: ARFGEF1 : More 
than one razor protein (try first): Q9Y6D5, Q9Y6D6 (Gene: ARFGEF1) - vaep.io.mq - WARNING - Gene: ARPC1A : More than one razor protein (try first): A0A1W2PNV4, Q92747 (Gene: ARPC1A) - vaep.io.mq - WARNING - Gene: ARPC1B : More than one razor protein (try first): A0A1W2PNV4, C9JEY1 (Gene: ARPC1B) - vaep.io.mq - WARNING - Gene: ATAD2B : More than one razor protein (try first): Q6PL18, Q9ULI0 (Gene: ATAD2B) - vaep.io.mq - WARNING - Gene: ATAD3B : More than one razor protein (try first): Q9NVI7-2, Q5T9A4 (Gene: ATAD3B) - vaep.io.mq - WARNING - Gene: ATAD3C : More than one razor protein (try first): Q5T9A4, Q9NVI7-2 (Gene: ATAD3C) - vaep.io.mq - WARNING - Gene: ATF1 : More than one razor protein (try first): P18846-2, P16220-3 (Gene: ATF1) - vaep.io.mq - WARNING - Gene: ATP12A : More than one razor protein (try first): P54707, P05023 (Gene: ATP12A) - vaep.io.mq - WARNING - Gene: ATP2B4 : More than one razor protein (try first): P23634-5, P20020-1 (Gene: ATP2B4) - vaep.io.mq - WARNING - Gene: ATP6V0A2: More than one razor protein (try first): Q9Y487, Q93050-1 (Gene: ATP6V0A2) - vaep.io.mq - WARNING - Gene: AURKA : More than one razor protein (try first): Q96GD4, O14965 (Gene: AURKA) - vaep.io.mq - WARNING - Gene: BCLAF1 : More than one razor protein (try first): Q9NYF8, Q9Y2W1, E9PK91 (Gene: BCLAF1) - vaep.io.mq - WARNING - Gene: BRAF : More than one razor protein (try first): Q96II5, H7C560 (Gene: BRAF) - vaep.io.mq - WARNING - Gene: BRD2 : More than one razor protein (try first): P25440-2, O60885 (Gene: BRD2) - vaep.io.mq - WARNING - Gene: BRD3 : More than one razor protein (try first): Q15059, O60885 (Gene: BRD3) - vaep.io.mq - WARNING - Gene: BZW1 : More than one razor protein (try first): Q7L1Q6, E7ETZ4 (Gene: BZW1) - vaep.io.mq - WARNING - Gene: C3 : More than one razor protein (try first): P01024, CON__Q2UVX4 (Gene: C3) - vaep.io.mq - WARNING - Gene: CAD : More than one razor protein (try first): F8VPD4, P31327 (Gene: CAD) - vaep.io.mq - WARNING - Gene: CAPZA2 : 
More than one razor protein (try first): P47755, P52907 (Gene: CAPZA2) - vaep.io.mq - WARNING - Gene: CBR3 : More than one razor protein (try first): O75828, P16152 (Gene: CBR3) - vaep.io.mq - WARNING - Gene: CBX1 : More than one razor protein (try first): P83916, Q13185 (Gene: CBX1) - vaep.io.mq - WARNING - Gene: CBX5 : More than one razor protein (try first): Q13185, P45973 (Gene: CBX5) - vaep.io.mq - WARNING - Gene: CCT6B : More than one razor protein (try first): P40227, Q92526 (Gene: CCT6B) - vaep.io.mq - WARNING - Gene: CDC34 : More than one razor protein (try first): P49427, Q712K3 (Gene: CDC34) - vaep.io.mq - WARNING - Gene: CDK12 : More than one razor protein (try first): Q9NYV4-2, P06493 (Gene: CDK12) - vaep.io.mq - WARNING - Gene: CDK13 : More than one razor protein (try first): A0A2R8Y7W5, P06493 (Gene: CDK13) - vaep.io.mq - WARNING - Gene: CDK16 : More than one razor protein (try first): E5RGN0, P06493 (Gene: CDK16) - vaep.io.mq - WARNING - Gene: CDK2 : More than one razor protein (try first): G3V5T9, P06493 (Gene: CDK2) - vaep.io.mq - WARNING - Gene: CDK4 : More than one razor protein (try first): P11802, P06493 (Gene: CDK4) - vaep.io.mq - WARNING - Gene: CDK5 : More than one razor protein (try first): Q00535, P06493 (Gene: CDK5) - vaep.io.mq - WARNING - Gene: CDK6 : More than one razor protein (try first): Q00534, P06493 (Gene: CDK6) - vaep.io.mq - WARNING - Gene: CDK9 : More than one razor protein (try first): P50750-2, P06493 (Gene: CDK9) - vaep.io.mq - WARNING - Gene: CDKN2A : More than one razor protein (try first): J3QRG6, Q8N726 (Gene: CDKN2A) - vaep.io.mq - WARNING - Gene: CFL2 : More than one razor protein (try first): Q9Y281, G3V1A4 (Gene: CFL2) - vaep.io.mq - WARNING - Gene: CHD2 : More than one razor protein (try first): O14647-2, O14646-2 (Gene: CHD2) - vaep.io.mq - WARNING - Gene: CHD3 : More than one razor protein (try first): A0A2R8Y425, Q12873-2 (Gene: CHD3) - vaep.io.mq - WARNING - Gene: CHD6 : More than one razor protein (try 
first): Q9HCK8, O14646-2 (Gene: CHD6) - vaep.io.mq - WARNING - Gene: CHD7 : More than one razor protein (try first): Q9P2D1, Q9HCK8, O14646-2 (Gene: CHD7) - vaep.io.mq - WARNING - Gene: CHD8 : More than one razor protein (try first): Q9HCK8, O14646-2 (Gene: CHD8) - vaep.io.mq - WARNING - Gene: CHD9 : More than one razor protein (try first): Q9HCK8, O14646-2 (Gene: CHD9) - vaep.io.mq - WARNING - Gene: CHMP2B : More than one razor protein (try first): C9J0A7, Q92828 (Gene: CHMP2B) - vaep.io.mq - WARNING - Gene: CHMP4A : More than one razor protein (try first): E9PSI1, Q9BY43-2 (Gene: CHMP4A) - vaep.io.mq - WARNING - Gene: CHTF18 : More than one razor protein (try first): E7EXA6, Q15813-2 (Gene: CHTF18) - vaep.io.mq - WARNING - Gene: CLASP1 : More than one razor protein (try first): F8WA11, E3W994 (Gene: CLASP1) - vaep.io.mq - WARNING - Gene: CLTCL1 : More than one razor protein (try first): A0A087WVQ6, A0A087WX41 (Gene: CLTCL1) - vaep.io.mq - WARNING - Gene: CLUH : More than one razor protein (try first): A0A494C0R8, O75153 (Gene: CLUH) - vaep.io.mq - WARNING - Gene: CNBP : More than one razor protein (try first): P62633-7, P62633-8 (Gene: CNBP) - vaep.io.mq - WARNING - Gene: CNNM4 : More than one razor protein (try first): Q6P4Q7, Q8NE01 (Gene: CNNM4) - vaep.io.mq - WARNING - Gene: CNPY2 : More than one razor protein (try first): F8W031, Q9Y2B0 (Gene: CNPY2) - vaep.io.mq - WARNING - Gene: COPG2 : More than one razor protein (try first): Q9UBF2, Q9Y678 (Gene: COPG2) - vaep.io.mq - WARNING - Gene: CPNE3 : More than one razor protein (try first): Q86YQ8, O75131 (Gene: CPNE3) - vaep.io.mq - WARNING - Gene: CREM : More than one razor protein (try first): E9PAR2, P16220-3 (Gene: CREM) - vaep.io.mq - WARNING - Gene: CSNK1E : More than one razor protein (try first): P49674, H7BYT1 (Gene: CSNK1E) - vaep.io.mq - WARNING - Gene: CSTF2T : More than one razor protein (try first): E7EWR4, Q9H0L4 (Gene: CSTF2T) - vaep.io.mq - WARNING - Gene: CTBP2 : More than one razor protein 
(try first): Q13363-2, Q5SQP8 (Gene: CTBP2) - vaep.io.mq - WARNING - Gene: CTPS2 : More than one razor protein (try first): P17812, Q9NRF8 (Gene: CTPS2) - vaep.io.mq - WARNING - Gene: CUL4B : More than one razor protein (try first): K4DI93, Q13619 (Gene: CUL4B) - vaep.io.mq - WARNING - Gene: CYFIP2 : More than one razor protein (try first): Q7L576, H7C229 (Gene: CYFIP2) - vaep.io.mq - WARNING - Gene: DDX17 : More than one razor protein (try first): J3KTA4, A0A1X7SBZ2, A0A2R8YF78 (Gene: DDX17) - vaep.io.mq - WARNING - Gene: DDX39A : More than one razor protein (try first): O00148, Q13838-2 (Gene: DDX39A) - vaep.io.mq - WARNING - Gene: DDX5 : More than one razor protein (try first): J3KTA4, A0A2R8YF78 (Gene: DDX5) - vaep.io.mq - WARNING - Gene: DDX50 : More than one razor protein (try first): Q9NR30, Q9BQ39 (Gene: DDX50) - vaep.io.mq - WARNING - Gene: DHX16 : More than one razor protein (try first): O60231, O43143 (Gene: DHX16) - vaep.io.mq - WARNING - Gene: DHX57 : More than one razor protein (try first): Q6P158, Q9H2U1-2, Q7Z478 (Gene: DHX57) - vaep.io.mq - WARNING - Gene: DHX8 : More than one razor protein (try first): F5H658, O43143 (Gene: DHX8) - vaep.io.mq - WARNING - Gene: DNAJB4 : More than one razor protein (try first): Q9UDY4, P25685 (Gene: DNAJB4) - vaep.io.mq - WARNING - Gene: EEF1A2 : More than one razor protein (try first): P68104, A0A2U3TZH3 (Gene: EEF1A2) - vaep.io.mq - WARNING - Gene: EEF1B2 : More than one razor protein (try first): P29692, P24534 (Gene: EEF1B2) - vaep.io.mq - WARNING - Gene: EEF1D : More than one razor protein (try first): P29692, E9PRY8, P29692-4 (Gene: EEF1D) - vaep.io.mq - WARNING - Gene: EFTUD2 : More than one razor protein (try first): Q15029-2, P13639 (Gene: EFTUD2) - vaep.io.mq - WARNING - Gene: EHD2 : More than one razor protein (try first): Q9NZN4, A0A024R571, Q9H223 (Gene: EHD2) - vaep.io.mq - WARNING - Gene: EHD4 : More than one razor protein (try first): Q9H223, A0A024R571 (Gene: EHD4) - vaep.io.mq - WARNING - Gene: 
EHMT2 : More than one razor protein (try first): A0A1B0GV09, A0A0G2JRR0 (Gene: EHMT2) - vaep.io.mq - WARNING - Gene: EIF4A2 : More than one razor protein (try first): Q14240, P60842 (Gene: EIF4A2) - vaep.io.mq - WARNING - Gene: EIF4A3 : More than one razor protein (try first): P38919, P60842 (Gene: EIF4A3) - vaep.io.mq - WARNING - Gene: EIF4G1 : More than one razor protein (try first): E7EX73, Q04637-5 (Gene: EIF4G1) - vaep.io.mq - WARNING - Gene: EIF4G3 : More than one razor protein (try first): A0A0A0MSA7, E7EX73, P48506 (Gene: EIF4G3) - vaep.io.mq - WARNING - Gene: EMC3 : More than one razor protein (try first): Q9P0I2, C9JLM9 (Gene: EMC3) - vaep.io.mq - WARNING - Gene: ENO2 : More than one razor protein (try first): P06733, P09104-2 (Gene: ENO2) - vaep.io.mq - WARNING - Gene: ENO3 : More than one razor protein (try first): P06733, P09104-2, P13929-3 (Gene: ENO3) - vaep.io.mq - WARNING - Gene: EPB41L2 : More than one razor protein (try first): E9PHY5, O43491 (Gene: EPB41L2) - vaep.io.mq - WARNING - Gene: EPHA2 : More than one razor protein (try first): Q96L35, P29317 (Gene: EPHA2) - vaep.io.mq - WARNING - Gene: EPPK1 : More than one razor protein (try first): A0A075B730, Q15149 (Gene: EPPK1) - vaep.io.mq - WARNING - Gene: EPS15L1 : More than one razor protein (try first): M0R165, M0QY01 (Gene: EPS15L1) - vaep.io.mq - WARNING - Gene: ERLIN1 : More than one razor protein (try first): O75477, E5RHW4 (Gene: ERLIN1) - vaep.io.mq - WARNING - Gene: EXOC5 : More than one razor protein (try first): A0A0A0MSI8, Q14980-2 (Gene: EXOC5) - vaep.io.mq - WARNING - Gene: EZR : More than one razor protein (try first): E7EQR4, P26038, A0A2R8Y5S7 (Gene: EZR) - vaep.io.mq - INFO - Gene: F2 : Potential contaminent protein is leading razor protein: CON__P00735 (Gene: F2) - vaep.io.mq - WARNING - Gene: F2 : Non-unique other protein set found (select first): P00734, E9PIT3 - vaep.io.mq - INFO - Gene: F5 : Potential contaminent protein is leading razor protein: CON__Q28107 (Gene: F5) - 
vaep.io.mq - WARNING - Gene: F5 : Non-unique other protein set found (select first): A0A0A0MRJ7, P12259 - vaep.io.mq - WARNING - Gene: FAM120C : More than one razor protein (try first): Q9NZB2, F8W881 (Gene: FAM120C) - vaep.io.mq - WARNING - Gene: FAM98B : More than one razor protein (try first): Q8NCA5-2, Q52LJ0 (Gene: FAM98B) - vaep.io.mq - WARNING - Gene: FBLL1 : More than one razor protein (try first): P22087, A6NHQ2 (Gene: FBLL1) - vaep.io.mq - WARNING - Gene: FIGNL1 : More than one razor protein (try first): O75351, Q6PIW4-2 (Gene: FIGNL1) - vaep.io.mq - WARNING - Gene: FLII : More than one razor protein (try first): Q13045, Q13045-2 (Gene: FLII) - vaep.io.mq - WARNING - Gene: FLNA : More than one razor protein (try first): Q60FE5, O75369-8 (Gene: FLNA) - vaep.io.mq - WARNING - Gene: FLYWCH1 : More than one razor protein (try first): Q4VC44-2, Q96CP2 (Gene: FLYWCH1) - vaep.io.mq - WARNING - Gene: FMNL2 : More than one razor protein (try first): Q96PY5-3, O95466 (Gene: FMNL2) - vaep.io.mq - WARNING - Gene: FMR1 : More than one razor protein (try first): A8MQB8, B4DXZ6 (Gene: FMR1) - vaep.io.mq - WARNING - Gene: FOCAD : More than one razor protein (try first): Q5VW36, S4R400 (Gene: FOCAD) - vaep.io.mq - WARNING - Gene: FUBP1 : More than one razor protein (try first): Q96AE4, Q96AE4-2 (Gene: FUBP1) - vaep.io.mq - WARNING - Gene: FUBP3 : More than one razor protein (try first): Q96I24, Q96AE4 (Gene: FUBP3) - vaep.io.mq - WARNING - Gene: FUS : More than one razor protein (try first): Q92804-2, P35637 (Gene: FUS) - vaep.io.mq - WARNING - Gene: FXR2 : More than one razor protein (try first): P51116, A8MQB8, B4DXZ6 (Gene: FXR2) - vaep.io.mq - WARNING - Gene: FYN : More than one razor protein (try first): J3QRU1, P06241-3 (Gene: FYN) - vaep.io.mq - WARNING - Gene: G3BP2 : More than one razor protein (try first): Q9UN86-2, Q13283 (Gene: G3BP2) - vaep.io.mq - WARNING - Gene: GABPB1 : More than one razor protein (try first): H9ZYI9, Q06547 (Gene: GABPB1) - vaep.io.mq - 
WARNING - Gene: GANAB : More than one razor protein (try first): Q14697, Q14697-2 (Gene: GANAB) - vaep.io.mq - WARNING - Gene: GATAD2A : More than one razor protein (try first): Q86YP4-2, A0A0U1RRM1 (Gene: GATAD2A) - vaep.io.mq - WARNING - Gene: GDI1 : More than one razor protein (try first): P50395, P31150 (Gene: GDI1) - vaep.io.mq - WARNING - Gene: GFAP : More than one razor protein (try first): P08670, P05787 (Gene: GFAP) - vaep.io.mq - WARNING - Gene: GIT2 : More than one razor protein (try first): F8VXI9, J3QRU8 (Gene: GIT2) - vaep.io.mq - WARNING - Gene: GNAI1 : More than one razor protein (try first): P08754, P63096 (Gene: GNAI1) - vaep.io.mq - WARNING - Gene: GNAI2 : More than one razor protein (try first): P08754, P04899-4 (Gene: GNAI2) - vaep.io.mq - WARNING - Gene: GNAL : More than one razor protein (try first): P63092-3, P08754 (Gene: GNAL) - vaep.io.mq - WARNING - Gene: GNAS : More than one razor protein (try first): P63092-3, P08754 (Gene: GNAS) - vaep.io.mq - WARNING - Gene: GNB2 : More than one razor protein (try first): P62879, P62873-2 (Gene: GNB2) - vaep.io.mq - WARNING - Gene: GNB4 : More than one razor protein (try first): P62873-2, P62879, Q9HAV0 (Gene: GNB4) - vaep.io.mq - WARNING - Gene: GOLGA4 : More than one razor protein (try first): Q13439-3, E7EVX2 (Gene: GOLGA4) - vaep.io.mq - WARNING - Gene: GOLPH3L : More than one razor protein (try first): Q9H4A6, Q5T5I6 (Gene: GOLPH3L) - vaep.io.mq - WARNING - Gene: GSK3A : More than one razor protein (try first): A0A3B3ITW1, P49840 (Gene: GSK3A) - vaep.io.mq - WARNING - Gene: H2AFJ : More than one razor protein (try first): Q96QV6, Q96KK5 (Gene: H2AFJ) - vaep.io.mq - WARNING - Gene: H2AFV : More than one razor protein (try first): Q96QV6, Q71UI9 (Gene: H2AFV) - vaep.io.mq - WARNING - Gene: H2AFZ : More than one razor protein (try first): Q96QV6, Q71UI9 (Gene: H2AFZ) - vaep.io.mq - WARNING - Gene: H3F3A : More than one razor protein (try first): Q71DI3, K7EK07 (Gene: H3F3A) - vaep.io.mq - WARNING - 
Gene: H3F3B : More than one razor protein (try first): Q71DI3, K7EK07 (Gene: H3F3B) - vaep.io.mq - WARNING - Gene: HARS2 : More than one razor protein (try first): P12081-4, A0A2R8Y6I1 (Gene: HARS2) - vaep.io.mq - WARNING - Gene: HBA1 : More than one razor protein (try first): G3V1N2, CON__P01966 (Gene: HBA1) - vaep.io.mq - WARNING - Gene: HBA2 : More than one razor protein (try first): G3V1N2, CON__P01966 (Gene: HBA2) - vaep.io.mq - INFO - Gene: HBB : Potential contaminent protein is leading razor protein: CON__Q3SX09 (Gene: HBB) - vaep.io.mq - WARNING More than one set of genes: ['CON__Q3SX09;A0A2R8Y7R2;E9PFT6;P68871;P02042' - 'CON__P02070;A8MUF7;F8W6P5;E9PEW8;A0A2R8Y7X9;P69892;P69891;P02100;CON__Q3SX09;A0A2R8Y7R2;E9PFT6;P68871;P02042'] - vaep.io.mq - WARNING - Gene: HBB : Non-unique other protein set found (select first): A0A2R8Y7R2, P68871, E9PFT6, P02042 - vaep.io.mq - INFO - Gene: HBD : Potential contaminent protein is leading razor protein: CON__Q3SX09 (Gene: HBD) - vaep.io.mq - WARNING More than one set of genes: ['CON__Q3SX09;A0A2R8Y7R2;E9PFT6;P68871;P02042' - 'CON__P02070;A8MUF7;F8W6P5;E9PEW8;A0A2R8Y7X9;P69892;P69891;P02100;CON__Q3SX09;A0A2R8Y7R2;E9PFT6;P68871;P02042'] - vaep.io.mq - WARNING - Gene: HBD : Non-unique other protein set found (select first): A0A2R8Y7R2, P68871, E9PFT6, P02042 - vaep.io.mq - INFO - Gene: HBE1 : Potential contaminent protein is leading razor protein: CON__Q3SX09 (Gene: HBE1) - vaep.io.mq - WARNING - Gene: HBE1 : Non-unique other protein set found (select first): A0A2R8Y7R2, P68871, A0A2R8Y7X9, E9PEW8, P69892, F8W6P5, P69891, E9PFT6, A8MUF7, P02100, P02042 - vaep.io.mq - INFO - Gene: HBG1 : Potential contaminent protein is leading razor protein: CON__Q3SX09 (Gene: HBG1) - vaep.io.mq - WARNING - Gene: HBG1 : Non-unique other protein set found (select first): A0A2R8Y7R2, P68871, A0A2R8Y7X9, E9PEW8, P69892, F8W6P5, P69891, E9PFT6, A8MUF7, P02100, P02042 - vaep.io.mq - INFO - Gene: HBG2 : Potential contaminent protein is leading 
razor protein: CON__Q3SX09 (Gene: HBG2) - vaep.io.mq - WARNING - Gene: HBG2 : Non-unique other protein set found (select first): A0A2R8Y7R2, P68871, A0A2R8Y7X9, E9PEW8, P69892, F8W6P5, P69891, E9PFT6, A8MUF7, P02100, P02042 - vaep.io.mq - INFO - Gene: HBZ : Potential contaminent protein is leading razor protein: CON__P01966 (Gene: HBZ) - vaep.io.mq - WARNING - Gene: HBZ : Non-unique other protein set found (select first): P69905, P02008, A0A2R8Y7C0, G3V1N2 - vaep.io.mq - WARNING - Gene: HDAC1 : More than one razor protein (try first): Q13547, Q92769 (Gene: HDAC1) - vaep.io.mq - WARNING - Gene: HDGFRP2 : More than one razor protein (try first): Q7Z4V5-2, P51858 (Gene: HDGFRP2) - vaep.io.mq - WARNING - Gene: HIP1 : More than one razor protein (try first): O75146, O00291-3 (Gene: HIP1) - vaep.io.mq - WARNING - Gene: HIST1H1E: More than one razor protein (try first): P16403, P10412 (Gene: HIST1H1E) - vaep.io.mq - WARNING - Gene: HIST1H2AB: More than one razor protein (try first): Q96QV6, Q96KK5 (Gene: HIST1H2AB) - vaep.io.mq - WARNING - Gene: HIST1H2AC: More than one razor protein (try first): Q96QV6, Q96KK5 (Gene: HIST1H2AC) - vaep.io.mq - WARNING - Gene: HIST1H2AD: More than one razor protein (try first): Q96QV6, Q96KK5 (Gene: HIST1H2AD) - vaep.io.mq - WARNING - Gene: HIST1H2AG: More than one razor protein (try first): Q96QV6, Q96KK5 (Gene: HIST1H2AG) - vaep.io.mq - WARNING - Gene: HIST1H2AH: More than one razor protein (try first): Q96QV6, Q96KK5 (Gene: HIST1H2AH) - vaep.io.mq - WARNING - Gene: HIST1H2AJ: More than one razor protein (try first): Q96QV6, Q96KK5 (Gene: HIST1H2AJ) - vaep.io.mq - WARNING - Gene: HIST1H2BB: More than one razor protein (try first): U3KQK0, Q16778 (Gene: HIST1H2BB) - vaep.io.mq - WARNING - Gene: HIST1H2BJ: More than one razor protein (try first): U3KQK0, Q16778 (Gene: HIST1H2BJ) - vaep.io.mq - WARNING - Gene: HIST1H2BO: More than one razor protein (try first): U3KQK0, Q16778 (Gene: HIST1H2BO) - vaep.io.mq - WARNING - Gene: HIST1H3A: More 
than one razor protein (try first): Q71DI3, P68431 (Gene: HIST1H3A) - vaep.io.mq - WARNING - Gene: HIST2H2AA3: More than one razor protein (try first): Q96QV6, Q96KK5 (Gene: HIST2H2AA3) - vaep.io.mq - WARNING - Gene: HIST2H2AB: More than one razor protein (try first): Q96QV6, Q8IUE6 (Gene: HIST2H2AB) - vaep.io.mq - WARNING - Gene: HIST2H2AC: More than one razor protein (try first): Q96QV6, Q96KK5 (Gene: HIST2H2AC) - vaep.io.mq - WARNING - Gene: HIST2H2BC: More than one razor protein (try first): Q16778, U3KQK0 (Gene: HIST2H2BC) - vaep.io.mq - WARNING - Gene: HIST2H2BD: More than one razor protein (try first): Q16778, U3KQK0 (Gene: HIST2H2BD) - vaep.io.mq - WARNING - Gene: HIST2H2BE: More than one razor protein (try first): U3KQK0, Q16778 (Gene: HIST2H2BE) - vaep.io.mq - WARNING - Gene: HIST2H3PS2: More than one razor protein (try first): Q5TEC6, Q71DI3 (Gene: HIST2H3PS2) - vaep.io.mq - WARNING - Gene: HIST3H2A: More than one razor protein (try first): Q96QV6, Q96KK5 (Gene: HIST3H2A) - vaep.io.mq - WARNING - Gene: HIST3H2BB: More than one razor protein (try first): U3KQK0, Q16778 (Gene: HIST3H2BB) - vaep.io.mq - WARNING - Gene: HK1 : More than one razor protein (try first): P19367-4, E9PB90 (Gene: HK1) - vaep.io.mq - WARNING - Gene: HLA-A : More than one razor protein (try first): P01891, P10316 (Gene: HLA-A) - vaep.io.mq - WARNING - Gene: HLA-B : More than one razor protein (try first): P01891, P30508, A0A140T9M0, P30484 (Gene: HLA-B) - vaep.io.mq - WARNING - Gene: HLA-C : More than one razor protein (try first): P30508, P01891, A0A140T9M0 (Gene: HLA-C) - vaep.io.mq - WARNING - Gene: HLA-G : More than one razor protein (try first): Q8MH48, P30508 (Gene: HLA-G) - vaep.io.mq - WARNING - Gene: HLA-H : More than one razor protein (try first): P01891, P30508 (Gene: HLA-H) - vaep.io.mq - WARNING - Gene: HMGA1 : More than one razor protein (try first): P17096, P17096-2 (Gene: HMGA1) - vaep.io.mq - WARNING - Gene: HMGB2 : More than one razor protein (try first): P26583, 
Q5T7C4 (Gene: HMGB2) - vaep.io.mq - WARNING - Gene: HNRNPA1 : More than one razor protein (try first): P09651-2, P22626 (Gene: HNRNPA1) - vaep.io.mq - WARNING - Gene: HNRNPA1L2: More than one razor protein (try first): P09651-2, P22626 (Gene: HNRNPA1L2) - vaep.io.mq - WARNING - Gene: HNRNPA3 : More than one razor protein (try first): P51991, P09651-2 (Gene: HNRNPA3) - vaep.io.mq - WARNING - Gene: HNRNPAB : More than one razor protein (try first): Q14103-3, Q99729-3 (Gene: HNRNPAB) - vaep.io.mq - WARNING - Gene: HNRNPDL : More than one razor protein (try first): A0A087WUK2, Q14103-3 (Gene: HNRNPDL) - vaep.io.mq - WARNING - Gene: HNRNPF : More than one razor protein (try first): P31943, P52597 (Gene: HNRNPF) - vaep.io.mq - WARNING - Gene: HNRNPH2 : More than one razor protein (try first): P31943, P55795 (Gene: HNRNPH2) - vaep.io.mq - WARNING - Gene: HNRNPK : More than one razor protein (try first): P61978, P61978-3 (Gene: HNRNPK) - vaep.io.mq - WARNING - Gene: HNRNPM : More than one razor protein (try first): A0A087X0X3, M0QZM1 (Gene: HNRNPM) - vaep.io.mq - WARNING - Gene: HNRNPR : More than one razor protein (try first): O60506-2, O43390 (Gene: HNRNPR) - vaep.io.mq - WARNING - Gene: HNRNPU : More than one razor protein (try first): Q00839, Q5RI18 (Gene: HNRNPU) - vaep.io.mq - WARNING - Gene: HSP90AA1: More than one razor protein (try first): P08238, P07900 (Gene: HSP90AA1) - vaep.io.mq - WARNING - Gene: HSP90AA2P: More than one razor protein (try first): P08238, P07900 (Gene: HSP90AA2P) - vaep.io.mq - WARNING - Gene: HSP90AA5P: More than one razor protein (try first): P08238, P07900 (Gene: HSP90AA5P) - vaep.io.mq - WARNING - Gene: HSP90AB2P: More than one razor protein (try first): P08238, Q58FF8 (Gene: HSP90AB2P) - vaep.io.mq - WARNING - Gene: HSP90AB4P: More than one razor protein (try first): P08238, Q58FF6 (Gene: HSP90AB4P) - vaep.io.mq - WARNING - Gene: HSP90B1 : More than one razor protein (try first): P14625, P08238 (Gene: HSP90B1) - vaep.io.mq - WARNING - 
Gene: HSPA1A : More than one razor protein (try first): A0A0G2JIW1, P11142 (Gene: HSPA1A) - vaep.io.mq - WARNING - Gene: HSPA1B : More than one razor protein (try first): A0A0G2JIW1, P11142 (Gene: HSPA1B) - vaep.io.mq - WARNING - Gene: HSPA4L : More than one razor protein (try first): O95757, Q92598, P34932 (Gene: HSPA4L) - vaep.io.mq - WARNING - Gene: HSPA5 : More than one razor protein (try first): P11021, P11142 (Gene: HSPA5) - vaep.io.mq - WARNING - Gene: HSPA6 : More than one razor protein (try first): A0A0G2JIW1, P11142, P17066 (Gene: HSPA6) - vaep.io.mq - WARNING - Gene: HSPA7 : More than one razor protein (try first): A0A0G2JIW1, P11142, P17066 (Gene: HSPA7) - vaep.io.mq - WARNING - Gene: HSPA9 : More than one razor protein (try first): P38646, P11142 (Gene: HSPA9) - vaep.io.mq - WARNING - Gene: HSPE1-MOB4: More than one razor protein (try first): P61604, S4R3N1 (Gene: HSPE1-MOB4) - vaep.io.mq - WARNING - Gene: HSPH1 : More than one razor protein (try first): Q92598, P34932 (Gene: HSPH1) - vaep.io.mq - WARNING - Gene: HUWE1 : More than one razor protein (try first): Q7Z6Z7-2, A0A087X1S3 (Gene: HUWE1) - vaep.io.mq - WARNING - Gene: IDH2 : More than one razor protein (try first): P48735, O75874 (Gene: IDH2) - vaep.io.mq - WARNING - Gene: IFITM2 : More than one razor protein (try first): Q01629, E9PS44 (Gene: IFITM2) - vaep.io.mq - WARNING - Gene: IGF2BP1 : More than one razor protein (try first): Q9NZI8, O00425 (Gene: IGF2BP1) - vaep.io.mq - WARNING - Gene: IKBIP : More than one razor protein (try first): Q70UQ0-4, Q70UQ0 (Gene: IKBIP) - vaep.io.mq - WARNING - Gene: IMMT : More than one razor protein (try first): C9J406, Q16891-2 (Gene: IMMT) - vaep.io.mq - WARNING - Gene: IMPDH1 : More than one razor protein (try first): C9J381, P12268 (Gene: IMPDH1) - vaep.io.mq - WARNING - Gene: IPO8 : More than one razor protein (try first): O15397, O95373 (Gene: IPO8) - vaep.io.mq - WARNING - Gene: IQGAP2 : More than one razor protein (try first): P46940, E7EWC2 (Gene: 
IQGAP2) - vaep.io.mq - INFO - Gene: ITIH2 : Potential contaminent protein is leading razor protein: CON__Q9TRI1 (Gene: ITIH2) - vaep.io.mq - WARNING - Gene: ITIH2 : Non-unique other protein set found (select first): P19823, Q5T987, Q5T985 - vaep.io.mq - WARNING - Gene: ITPR1 : More than one razor protein (try first): Q14643, Q14573 (Gene: ITPR1) - vaep.io.mq - WARNING - Gene: ITPR2 : More than one razor protein (try first): Q14573, Q14571 (Gene: ITPR2) - vaep.io.mq - WARNING - Gene: ITSN2 : More than one razor protein (try first): A0A087WVF7, Q15811 (Gene: ITSN2) - vaep.io.mq - WARNING - Gene: KHSRP : More than one razor protein (try first): Q96AE4, Q92945 (Gene: KHSRP) - vaep.io.mq - WARNING - Gene: KIF1B : More than one razor protein (try first): A0A087WWA3, O43896 (Gene: KIF1B) - vaep.io.mq - WARNING - Gene: KIF1Bbeta: More than one razor protein (try first): A0A087WWA3, O43896 (Gene: KIF1Bbeta) - vaep.io.mq - WARNING - Gene: KLC1 : More than one razor protein (try first): Q07866-4, Q9H0B6 (Gene: KLC1) - vaep.io.mq - WARNING - Gene: KLC4 : More than one razor protein (try first): C9J8T5, Q9H0B6 (Gene: KLC4) - vaep.io.mq - WARNING - Gene: KPNA1 : More than one razor protein (try first): P52294, O60684 (Gene: KPNA1) - vaep.io.mq - WARNING - Gene: KPNA3 : More than one razor protein (try first): O00505, O00629 (Gene: KPNA3) - vaep.io.mq - WARNING - Gene: KRAS : More than one razor protein (try first): P01116-2, P01111 (Gene: KRAS) - vaep.io.mq - WARNING - Gene: KRT1 : More than one razor protein (try first): CON__P04264, P05787 (Gene: KRT1) - vaep.io.mq - INFO - Gene: KRT1 : Potential contaminent protein is leading razor protein: CON__P04264 (Gene: KRT1) - vaep.io.mq - WARNING More than one set of genes: ['CON__P04264;P04264' - 
'P05787;CON__P05787;P05787-2;F8VUG2;F8W1U3;CON__H-INV:HIT000292931;F8VP67;CON__Q9H552;P04259;CON__P02538;CON__P48668;CON__P04259;P48668;P02538;CON__Q922U2;CON__Q5XQN5;CON__H-INV:HIT000016045;F8W0C6;CON__Q8VED5;CON__P13647;P13647;CON__Q6IFZ6;CON__P04264;P04264' - 'P05787;CON__P05787;P05787-2;F8VUG2;F8W1U3;F8VP67;P04259;CON__P02538;CON__P48668;CON__P04259;P48668;P02538;CON__Q922U2;CON__Q5XQN5;CON__H-INV:HIT000016045;F8W0C6;CON__Q8VED5;CON__P13647;P13647;CON__Q6IFZ6;CON__P04264;P04264' - 'P05787;CON__P05787;P05787-2;F8VUG2;F8W1U3;F8VP67;P04259;CON__P02538;CON__P48668;CON__P04259;P48668;P02538;CON__Q922U2;CON__Q5XQN5;CON__H-INV:HIT000016045;F8W0C6;CON__Q8VED5;CON__P13647;P13647;CON__Q8BGZ7;CON__P50446;CON__O95678;CON__Q6IFZ6;O95678;Q01546;CON__P12035;CON__Q01546;P12035;CON__Q3TTY5;CON__Q7Z794;P35908;CON__P35908v2;CON__P35908;Q7Z794;H0YIN9;CON__ENSEMBL:ENSBTAP00000038253;CON__P04264;P04264'] - vaep.io.mq - WARNING - Gene: KRT13 : More than one razor protein (try first): P05783, Q04695 (Gene: KRT13) - vaep.io.mq - WARNING - Gene: KRT14 : More than one razor protein (try first): Q04695, P05783 (Gene: KRT14) - vaep.io.mq - WARNING - Gene: KRT15 : More than one razor protein (try first): P05783, Q04695 (Gene: KRT15) - vaep.io.mq - WARNING - Gene: KRT16 : More than one razor protein (try first): Q04695, P05783 (Gene: KRT16) - vaep.io.mq - WARNING - Gene: KRT17 : More than one razor protein (try first): Q04695, P05783 (Gene: KRT17) - vaep.io.mq - WARNING - Gene: KRT3 : More than one razor protein (try first): P05787, P08670 (Gene: KRT3) - vaep.io.mq - WARNING - Gene: KRT4 : More than one razor protein (try first): CON__P08729, P05787 (Gene: KRT4) - vaep.io.mq - INFO - Gene: KRT4 : Potential contaminent protein is leading razor protein: CON__P08729 (Gene: KRT4) - vaep.io.mq - WARNING More than one set of genes: ['CON__P08729;CON__P19013;P19013;CON__Q3KNV1;P08729' - 
'P05787;CON__P05787;P05787-2;F8VUG2;F8W1U3;CON__Q9H552;P04259;CON__P02538;CON__P48668;CON__P04259;P48668;P02538;CON__Q922U2;CON__Q5XQN5;F8W0C6;CON__Q8VED5;CON__P13647;P13647;CON__Q8BGZ7;CON__P50446;CON__O95678;CON__Q6IFZ6;O95678;CON__Q6ISB0;CON__Q9NSB2;Q01546;Q9NSB2;CON__P12035;CON__Q01546;P12035;CON__Q3TTY5;CON__Q7Z794;P35908;CON__P35908v2;CON__P35908;CON__P07744;Q7Z794;CON__Q3SY84;CON__Q7RTS7;F8W1S1;Q7RTS7;Q86Y46-2;Q3SY84;CON__Q9R0H5;H0YHD9;CON__Q32MB2;Q86Y46;CON__P08729;CON__Q9DCV7;CON__Q14CN4-1;CON__Q6IME9;CON__Q6NXH9;CON__P19013;Q14CN4-3;Q14CN4-2;Q14CN4;P19013;CON__Q3KNV1;P08729' - 'P05787;CON__P05787;P05787-2;CON__H-INV:HIT000292931;A0A1X7SCE1;K7EJU1;K7EPI4;A0A1W2PS58;A0A1W2PQU7;CON__P08729;CON__Q9DCV7;CON__Q6NXH9;CON__P19013;P19013;CON__Q3KNV1;P08729;CON__Q6KB66-1;Q6KB66-2;Q6KB66;Q6KB66-3'] - vaep.io.mq - WARNING - Gene: KRT4 : Non-unique other protein set found (select first): P19013, P08729 - vaep.io.mq - WARNING - Gene: KRT5 : More than one razor protein (try first): P05787, P08670 (Gene: KRT5) - vaep.io.mq - WARNING - Gene: KRT6A : More than one razor protein (try first): P05787, P08670 (Gene: KRT6A) - vaep.io.mq - WARNING - Gene: KRT6B : More than one razor protein (try first): P05787, P08670 (Gene: KRT6B) - vaep.io.mq - WARNING - Gene: KRT6C : More than one razor protein (try first): P05787, P08670 (Gene: KRT6C) - vaep.io.mq - WARNING - Gene: KRT7 : More than one razor protein (try first): CON__P08729, P05787, P08670, CON__Q3KNV1 (Gene: KRT7) - vaep.io.mq - INFO - Gene: KRT7 : Potential contaminent protein is leading razor protein: CON__P08729 (Gene: KRT7) - vaep.io.mq - WARNING More than one set of genes: ['CON__P08729;A0A1W2PRP1;CON__Q9DCV7;CON__Q3KNV1;P08729' - 'CON__P08729;CON__Q3KNV1;P08729' - 'CON__P08729;A0A1W2PRP1;CON__Q3KNV1;P08729' - 'CON__P08729;CON__Q6NXH9;CON__Q3KNV1;P08729' - 'CON__P08729;CON__P19013;P19013;CON__Q3KNV1;P08729' - 
'P05787;CON__P05787;P05787-2;F8VUG2;F8W1U3;CON__Q9H552;P04259;CON__P02538;CON__P48668;CON__P04259;P48668;P02538;CON__Q922U2;CON__Q5XQN5;F8W0C6;CON__Q8VED5;CON__P13647;P13647;CON__Q8BGZ7;CON__P50446;CON__O95678;CON__Q6IFZ6;O95678;CON__Q6ISB0;CON__Q9NSB2;Q01546;Q9NSB2;CON__P12035;CON__Q01546;P12035;CON__Q3TTY5;CON__Q7Z794;P35908;CON__P35908v2;CON__P35908;CON__P07744;Q7Z794;CON__Q3SY84;CON__Q7RTS7;F8W1S1;Q7RTS7;Q86Y46-2;Q3SY84;CON__Q9R0H5;H0YHD9;CON__Q32MB2;Q86Y46;CON__P08729;CON__Q9DCV7;CON__Q14CN4-1;CON__Q6IME9;CON__Q6NXH9;CON__P19013;Q14CN4-3;Q14CN4-2;Q14CN4;P19013;CON__Q3KNV1;P08729' - 'P05787;CON__P05787;P05787-2;F8VUG2;F8W1U3;CON__H-INV:HIT000292931;P04259;CON__P02538;CON__P48668;CON__P04259;P48668;P02538;CON__Q922U2;CON__Q5XQN5;F8W0C6;CON__Q8VED5;CON__P13647;P13647;CON__Q8BGZ7;CON__P50446;CON__O95678;O95678;CON__Q6ISB0;CON__Q9NSB2;Q01546;Q9NSB2;CON__P12035;CON__Q01546;P12035;Q8IWY7;A0A1X7SCE1;CON__P07744;K7EJU1;CON__Q61726;CON__P78386;Q14533;K7EKH9;CON__Q6NT21;CON__O43790;CON__Q7RTT2;CON__Q8N1N4-2;O43790;Q8N1N4-2;P78385;Q8N1N4;B4DIR1;K7ELP4;P78386;CON__Q14533;U3KPR1;A0A087X106;CON__P78385;P08670;B0YJC4;A0A1B0GTT5;CON__P08729;CON__Q9DCV7;CON__Q3KNV1;P08729' - 'CON__Q3KNV1;P08729' - 'P05787;CON__P05787;P05787-2;CON__H-INV:HIT000292931;A0A1X7SCE1;K7EJU1;K7EPI4;A0A1W2PS58;A0A1W2PQU7;CON__P08729;CON__Q9DCV7;CON__Q6NXH9;CON__P19013;P19013;CON__Q3KNV1;P08729;CON__Q6KB66-1;Q6KB66-2;Q6KB66;Q6KB66-3' - 'P05787;CON__P05787;P05787-2;CON__Q9H552;CON__Q6ISB0;CON__Q9NSB2;Q9NSB2;P08670;B0YJC4;B0YJC5;P41219;P17661;P41219-2;H7C5W5;CON__P08729;CON__Q9DCV7;CON__Q3KNV1;P08729' - 'CON__P08729;CON__Q14CN4-1;CON__Q6IME9;Q14CN4-3;Q14CN4-2;Q14CN4;CON__Q3KNV1;P08729'] - vaep.io.mq - WARNING - Gene: KRT72 : More than one razor protein (try first): P05787, CON__P08729 (Gene: KRT72) - vaep.io.mq - WARNING - Gene: KRT75 : More than one razor protein (try first): P05787, P08670 (Gene: KRT75) - vaep.io.mq - WARNING - Gene: KRT76 : More than one razor protein (try first): P05787, P08670 (Gene: 
KRT76) - vaep.io.mq - WARNING - Gene: KRT8 : More than one razor protein (try first): P05787, P08670 (Gene: KRT8) - vaep.io.mq - WARNING - Gene: KRT80 : More than one razor protein (try first): CON__Q6KB66-1, P05787 (Gene: KRT80) - vaep.io.mq - INFO - Gene: KRT80 : Potential contaminent protein is leading razor protein: CON__Q6KB66-1 (Gene: KRT80) - vaep.io.mq - WARNING More than one set of genes: ['CON__Q6KB66-1;Q6KB66-2;Q6KB66' - 'CON__Q6KB66-1;Q6KB66-2;Q6KB66;CON__Q0VBK2' - 'CON__Q6KB66-1;Q6KB66-2;Q6KB66;Q6KB66-3' - 'P05787;CON__P05787;P05787-2;CON__H-INV:HIT000292931;A0A1X7SCE1;K7EJU1;K7EPI4;A0A1W2PS58;A0A1W2PQU7;CON__P08729;CON__Q9DCV7;CON__Q6NXH9;CON__P19013;P19013;CON__Q3KNV1;P08729;CON__Q6KB66-1;Q6KB66-2;Q6KB66;Q6KB66-3'] - vaep.io.mq - WARNING - Gene: KRT80 : Non-unique other protein set found (select first): Q6KB66-2, Q6KB66 - vaep.io.mq - WARNING - Gene: KRT84 : More than one razor protein (try first): P05787, P08670 (Gene: KRT84) - vaep.io.mq - WARNING - Gene: LARP1B : More than one razor protein (try first): Q6PKG0, A0A3B3ISF0 (Gene: LARP1B) - vaep.io.mq - WARNING - Gene: LDHB : More than one razor protein (try first): P07195, P00338-3 (Gene: LDHB) - vaep.io.mq - WARNING - Gene: LMNA : More than one razor protein (try first): P02545, P02545-2 (Gene: LMNA) - vaep.io.mq - WARNING - Gene: LMNB1 : More than one razor protein (try first): P20700, P02545 (Gene: LMNB1) - vaep.io.mq - WARNING - Gene: LMNB2 : More than one razor protein (try first): Q03252, P20700, P02545 (Gene: LMNB2) - vaep.io.mq - WARNING - Gene: LRRFIP2 : More than one razor protein (try first): Q32MZ4-3, Q9Y608-4 (Gene: LRRFIP2) - vaep.io.mq - INFO - Gene: LTF : Potential contaminent protein is leading razor protein: CON__Q29443 (Gene: LTF) - vaep.io.mq - WARNING - Gene: LTF : Non-unique other protein set found (select first): P02788-2, E7EQB2, E7ER44, P02788 - vaep.io.mq - WARNING - Gene: LUC7L : More than one razor protein (try first): Q9Y383, Q9NQ29-2 (Gene: LUC7L) - vaep.io.mq - 
WARNING - Gene: LUC7L2 : More than one razor protein (try first): Q9Y383, Q9Y383-3 (Gene: LUC7L2) - vaep.io.mq - WARNING - Gene: MAP2K2 : More than one razor protein (try first): P36507, Q02750 (Gene: MAP2K2) - vaep.io.mq - WARNING - Gene: MAP2K6 : More than one razor protein (try first): P52564, P46734-2 (Gene: MAP2K6) - vaep.io.mq - WARNING - Gene: MAPKAPK2: More than one razor protein (try first): Q16644, P49137-2 (Gene: MAPKAPK2) - vaep.io.mq - WARNING - Gene: MAPRE2 : More than one razor protein (try first): Q15555-4, Q15691 (Gene: MAPRE2) - vaep.io.mq - WARNING - Gene: MARC2 : More than one razor protein (try first): Q5VT66, F6V6Z1 (Gene: MARC2) - vaep.io.mq - WARNING - Gene: MARK1 : More than one razor protein (try first): B4DIB3, Q7KZI7 (Gene: MARK1) - vaep.io.mq - WARNING - Gene: MARK3 : More than one razor protein (try first): P27448-6, B4DIB3, Q7KZI7 (Gene: MARK3) - vaep.io.mq - WARNING - Gene: MASTL : More than one razor protein (try first): Q96GX5-2, E9PGT3 (Gene: MASTL) - vaep.io.mq - WARNING - Gene: MBNL3 : More than one razor protein (try first): Q9NR56, B1AKI6 (Gene: MBNL3) - vaep.io.mq - WARNING - Gene: MOCS2 : More than one razor protein (try first): E9PKT9, O96007 (Gene: MOCS2) - vaep.io.mq - WARNING - Gene: MORF4L2 : More than one razor protein (try first): Q15014, B3KTM8 (Gene: MORF4L2) - vaep.io.mq - WARNING - Gene: MRPL12 : More than one razor protein (try first): P52815, B4DLN1 (Gene: MRPL12) - vaep.io.mq - WARNING - Gene: MRPS28 : More than one razor protein (try first): Q9Y2Q9, H0YC42 (Gene: MRPS28) - vaep.io.mq - WARNING - Gene: MTA1 : More than one razor protein (try first): E7ESY4, O94776 (Gene: MTA1) - vaep.io.mq - WARNING - Gene: MTCH2 : More than one razor protein (try first): Q9Y6C9, E9PIE4 (Gene: MTCH2) - vaep.io.mq - WARNING - Gene: MTHFD1L : More than one razor protein (try first): B7ZM99, P11586 (Gene: MTHFD1L) - vaep.io.mq - WARNING - Gene: MYL9 : More than one razor protein (try first): J3QRS3, P24844 (Gene: MYL9) - 
vaep.io.mq - WARNING - Gene: NAP1L4 : More than one razor protein (try first): E9PNW0, P55209 (Gene: NAP1L4) - vaep.io.mq - WARNING - Gene: NARS : More than one razor protein (try first): O43776, K7ENF0 (Gene: NARS) - vaep.io.mq - WARNING - Gene: NEDD4L : More than one razor protein (try first): P46934-4, Q96PU5-3 (Gene: NEDD4L) - vaep.io.mq - WARNING - Gene: NIPSNAP1: More than one razor protein (try first): F8WCR5, F8WBI5 (Gene: NIPSNAP1) - vaep.io.mq - WARNING - Gene: NME1 : More than one razor protein (try first): Q32Q12, P15531 (Gene: NME1) - vaep.io.mq - WARNING - Gene: NOMO2 : More than one razor protein (try first): A0A0G2JN29, J3KN36 (Gene: NOMO2) - vaep.io.mq - WARNING - Gene: NOMO3 : More than one razor protein (try first): A0A0G2JN29, J3KN36 (Gene: NOMO3) - vaep.io.mq - WARNING - Gene: NONO : More than one razor protein (try first): Q15233, P23246 (Gene: NONO) - vaep.io.mq - WARNING - Gene: NPM1 : More than one razor protein (try first): P06748-2, P06748, E5RI98 (Gene: NPM1) - vaep.io.mq - WARNING - Gene: ORC3 : More than one razor protein (try first): Q9UBD5, O75330-2 (Gene: ORC3) - vaep.io.mq - WARNING - Gene: OSBPL11 : More than one razor protein (try first): Q9BXB4, Q9BXB5 (Gene: OSBPL11) - vaep.io.mq - WARNING - Gene: PABPC4 : More than one razor protein (try first): Q13310-3, P11940 (Gene: PABPC4) - vaep.io.mq - WARNING - Gene: PAK1 : More than one razor protein (try first): Q13177, Q13153 (Gene: PAK1) - vaep.io.mq - WARNING - Gene: PARP12 : More than one razor protein (try first): Q9H0J9, Q7Z2W4 (Gene: PARP12) - vaep.io.mq - WARNING - Gene: PCBP2 : More than one razor protein (try first): Q15365, Q15366-6, F8VZX2 (Gene: PCBP2) - vaep.io.mq - WARNING - Gene: PCBP3 : More than one razor protein (try first): Q15365, Q15366-6 (Gene: PCBP3) - vaep.io.mq - WARNING - Gene: PDE3B : More than one razor protein (try first): Q14432, Q13370-2 (Gene: PDE3B) - vaep.io.mq - WARNING - Gene: PDIA3 : More than one razor protein (try first): P30101, H7BZJ3 (Gene: 
PDIA3) - vaep.io.mq - WARNING - Gene: PDS5B : More than one razor protein (try first): Q29RF7, Q9NTI5-2 (Gene: PDS5B) - vaep.io.mq - WARNING - Gene: PEX19 : More than one razor protein (try first): Q5QNY5, G3V3G9 (Gene: PEX19) - vaep.io.mq - WARNING - Gene: PFKL : More than one razor protein (try first): Q01813, P17858, P08237-3 (Gene: PFKL) - vaep.io.mq - WARNING - Gene: PFKM : More than one razor protein (try first): P08237-3, Q01813 (Gene: PFKM) - vaep.io.mq - WARNING - Gene: PGRMC2 : More than one razor protein (try first): O15173-2, O00264 (Gene: PGRMC2) - vaep.io.mq - WARNING - Gene: PITPNA : More than one razor protein (try first): P48739, F5GWE5 (Gene: PITPNA) - vaep.io.mq - WARNING - Gene: PLEC : More than one razor protein (try first): Q15149, Q15149-8, Q15149-4 (Gene: PLEC) - vaep.io.mq - WARNING - Gene: PLS1 : More than one razor protein (try first): P13797, Q14651 (Gene: PLS1) - vaep.io.mq - WARNING - Gene: POLR3A : More than one razor protein (try first): O14802, B9ZVN9 (Gene: POLR3A) - vaep.io.mq - INFO - Gene: POSTN : Potential contaminent protein is leading razor protein: CON__Q2KJC7 (Gene: POSTN) - vaep.io.mq - WARNING - Gene: POSTN : Non-unique other protein set found (select first): Q15063-7, Q15063-5, Q15063-4, B1ALD9, Q15063-3, Q15063-6, Q15063, Q15063-2 - vaep.io.mq - WARNING - Gene: POTEE : More than one razor protein (try first): P63261, Q6S8J3 (Gene: POTEE) - vaep.io.mq - WARNING - Gene: POTEF : More than one razor protein (try first): P63261, Q6S8J3 (Gene: POTEF) - vaep.io.mq - WARNING - Gene: POTEI : More than one razor protein (try first): P63261, Q6S8J3 (Gene: POTEI) - vaep.io.mq - WARNING - Gene: POTEJ : More than one razor protein (try first): P63261, Q6S8J3 (Gene: POTEJ) - vaep.io.mq - WARNING - Gene: PPIA : More than one razor protein (try first): P62937, P62937-2 (Gene: PPIA) - vaep.io.mq - WARNING - Gene: PPP1CB : More than one razor protein (try first): P62140, P62136 (Gene: PPP1CB) - vaep.io.mq - WARNING - Gene: PPP1CC : More 
than one razor protein (try first): P62136, F8VYE8 (Gene: PPP1CC) - vaep.io.mq - WARNING - Gene: PPP2CB : More than one razor protein (try first): P67775, P62714 (Gene: PPP2CB) - vaep.io.mq - WARNING - Gene: PPP2R1B : More than one razor protein (try first): P30154, P30153 (Gene: PPP2R1B) - vaep.io.mq - WARNING - Gene: PPP2R5C : More than one razor protein (try first): H0YJ75, E9PFR3 (Gene: PPP2R5C) - vaep.io.mq - WARNING - Gene: PPP2R5E : More than one razor protein (try first): Q16537-3, Q15172 (Gene: PPP2R5E) - vaep.io.mq - WARNING - Gene: PPP4C : More than one razor protein (try first): H3BV22, P67775 (Gene: PPP4C) - vaep.io.mq - WARNING - Gene: PRDX2 : More than one razor protein (try first): P32119, Q06830 (Gene: PRDX2) - vaep.io.mq - WARNING - Gene: PRDX4 : More than one razor protein (try first): Q13162, Q06830 (Gene: PRDX4) - vaep.io.mq - WARNING - Gene: PRPS1 : More than one razor protein (try first): P11908, B1ALA9 (Gene: PRPS1) - vaep.io.mq - WARNING - Gene: PRPSAP2 : More than one razor protein (try first): O60256, Q14558 (Gene: PRPSAP2) - vaep.io.mq - WARNING - Gene: PRRC2A : More than one razor protein (try first): P48634, E7EPN9 (Gene: PRRC2A) - vaep.io.mq - WARNING - Gene: PRRC2B : More than one razor protein (try first): E7EPN9, Q5JSZ5, Q8NE71 (Gene: PRRC2B) - vaep.io.mq - WARNING - Gene: PSIP1 : More than one razor protein (try first): O75475, P51858 (Gene: PSIP1) - vaep.io.mq - WARNING - Gene: PSMD4 : More than one razor protein (try first): P55036, Q5VWC4 (Gene: PSMD4) - vaep.io.mq - WARNING - Gene: PSPC1 : More than one razor protein (try first): Q8WXF1, Q15233 (Gene: PSPC1) - vaep.io.mq - WARNING - Gene: PTBP3 : More than one razor protein (try first): O95758-1, A0A0U1RRM4 (Gene: PTBP3) - vaep.io.mq - WARNING - Gene: PUM2 : More than one razor protein (try first): A0A0C4DG68, H0YEH2 (Gene: PUM2) - vaep.io.mq - WARNING - Gene: PYCR1 : More than one razor protein (try first): P32322, Q96C36 (Gene: PYCR1) - vaep.io.mq - WARNING - Gene: PYGB : 
More than one razor protein (try first): P11216, P06737 (Gene: PYGB) - vaep.io.mq - WARNING - Gene: QARS : More than one razor protein (try first): P47897, P07814 (Gene: QARS) - vaep.io.mq - WARNING - Gene: RAB10 : More than one razor protein (try first): P61026, P62820 (Gene: RAB10) - vaep.io.mq - WARNING - Gene: RAB13 : More than one razor protein (try first): P62820, P51153 (Gene: RAB13) - vaep.io.mq - WARNING - Gene: RAB1B : More than one razor protein (try first): P62820, Q9H0U4 (Gene: RAB1B) - vaep.io.mq - WARNING - Gene: RAB1C : More than one razor protein (try first): P62820, Q9H0U4 (Gene: RAB1C) - vaep.io.mq - WARNING - Gene: RAB34 : More than one razor protein (try first): Q9BZG1, A0A1C7CYW6 (Gene: RAB34) - vaep.io.mq - WARNING - Gene: RAB5A : More than one razor protein (try first): P51148, P20339-2 (Gene: RAB5A) - vaep.io.mq - WARNING - Gene: RAB5B : More than one razor protein (try first): P51148, P61020 (Gene: RAB5B) - vaep.io.mq - WARNING - Gene: RAB8A : More than one razor protein (try first): P61006, P62820 (Gene: RAB8A) - vaep.io.mq - WARNING - Gene: RAB8B : More than one razor protein (try first): P62820, P61006 (Gene: RAB8B) - vaep.io.mq - WARNING - Gene: RABGAP1L: More than one razor protein (try first): A0A0C4DG54, Q9Y3P9 (Gene: RABGAP1L) - vaep.io.mq - WARNING - Gene: RAC1 : More than one razor protein (try first): P63000, P60953 (Gene: RAC1) - vaep.io.mq - WARNING - Gene: RAC2 : More than one razor protein (try first): P60953, P63000 (Gene: RAC2) - vaep.io.mq - WARNING - Gene: RAC3 : More than one razor protein (try first): P63000, P60953 (Gene: RAC3) - vaep.io.mq - WARNING - Gene: RAD23A : More than one razor protein (try first): P54727, P54725-3 (Gene: RAD23A) - vaep.io.mq - WARNING - Gene: RAF1 : More than one razor protein (try first): H7C155, Q96II5 (Gene: RAF1) - vaep.io.mq - WARNING - Gene: RALB : More than one razor protein (try first): P11234, P11233 (Gene: RALB) - vaep.io.mq - WARNING - Gene: RANBP6 : More than one razor protein 
(try first): O00410, O60518 (Gene: RANBP6) - vaep.io.mq - WARNING - Gene: RAP1A : More than one razor protein (try first): P61224, P62834 (Gene: RAP1A) - vaep.io.mq - WARNING - Gene: RAP2B : More than one razor protein (try first): P61225, Q9Y3L5 (Gene: RAP2B) - vaep.io.mq - WARNING - Gene: RBBP4 : More than one razor protein (try first): Q09028-3, Q16576 (Gene: RBBP4) - vaep.io.mq - WARNING - Gene: RBM27 : More than one razor protein (try first): Q5T8P6-2, Q9P2N5 (Gene: RBM27) - vaep.io.mq - WARNING - Gene: RBM4B : More than one razor protein (try first): E9PB51, Q9BQ04 (Gene: RBM4B) - vaep.io.mq - WARNING - Gene: RBM5 : More than one razor protein (try first): P52756, P98175-2 (Gene: RBM5) - vaep.io.mq - WARNING - Gene: RBMS2 : More than one razor protein (try first): E7ETU5, F8VV01 (Gene: RBMS2) - vaep.io.mq - WARNING - Gene: RBMXL1 : More than one razor protein (try first): P38159, Q96E39 (Gene: RBMXL1) - vaep.io.mq - WARNING - Gene: RCOR3 : More than one razor protein (try first): Q9P2K3-4, Q9UKL0 (Gene: RCOR3) - vaep.io.mq - WARNING - Gene: RDX : More than one razor protein (try first): A0A2R8Y5S7, P26038 (Gene: RDX) - vaep.io.mq - WARNING - Gene: RHOB : More than one razor protein (try first): P62745, P61586 (Gene: RHOB) - vaep.io.mq - WARNING - Gene: RHOC : More than one razor protein (try first): Q5JR08, P61586 (Gene: RHOC) - vaep.io.mq - WARNING - Gene: RHOG : More than one razor protein (try first): P60953, P84095 (Gene: RHOG) - vaep.io.mq - WARNING - Gene: RHOT2 : More than one razor protein (try first): H7BXZ6, I3L2C6 (Gene: RHOT2) - vaep.io.mq - WARNING - Gene: RNF20 : More than one razor protein (try first): Q5VTR2, O75150 (Gene: RNF20) - vaep.io.mq - WARNING - Gene: ROCK1 : More than one razor protein (try first): A0A0U1RQV4, O75116 (Gene: ROCK1) - vaep.io.mq - WARNING - Gene: RPL10 : More than one razor protein (try first): X1WI28, F8W7C6 (Gene: RPL10) - vaep.io.mq - WARNING - Gene: RPL26L1 : More than one razor protein (try first): P61254, Q9UNX3 
(Gene: RPL26L1) - vaep.io.mq - WARNING - Gene: RPL36A : More than one razor protein (try first): H0Y5B4, Q969Q0 (Gene: RPL36A) - vaep.io.mq - WARNING - Gene: RPL36A-HNRNPH2: More than one razor protein (try first): H0Y5B4, Q969Q0 (Gene: RPL36A-HNRNPH2) - vaep.io.mq - WARNING - Gene: RPS10 : More than one razor protein (try first): P46783, A0A1W2PQS6, Q15149 (Gene: RPS10) - vaep.io.mq - WARNING - Gene: RPS10P5 : More than one razor protein (try first): P46783, Q15149, A0A1W2PQS6 (Gene: RPS10P5) - vaep.io.mq - WARNING - Gene: RPS27L : More than one razor protein (try first): H0YMV8, P42677 (Gene: RPS27L) - vaep.io.mq - WARNING - Gene: RPS6KA3 : More than one razor protein (try first): P51812, E9PGT3 (Gene: RPS6KA3) - vaep.io.mq - WARNING - Gene: RPS6KA6 : More than one razor protein (try first): P51812, E9PGT3 (Gene: RPS6KA6) - vaep.io.mq - WARNING - Gene: RPS6KB1 : More than one razor protein (try first): Q9UBS0, P23443-4 (Gene: RPS6KB1) - vaep.io.mq - WARNING - Gene: RRAS : More than one razor protein (try first): P10301, E9PK85 (Gene: RRAS) - vaep.io.mq - WARNING - Gene: RUFY2 : More than one razor protein (try first): H0YD93, Q96T51 (Gene: RUFY2) - vaep.io.mq - WARNING - Gene: SAFB2 : More than one razor protein (try first): Q15424, Q14151 (Gene: SAFB2) - vaep.io.mq - WARNING - Gene: SAR1B : More than one razor protein (try first): D6RD69, Q9NR31 (Gene: SAR1B) - vaep.io.mq - WARNING - Gene: SART3 : More than one razor protein (try first): Q15020, F8VV04 (Gene: SART3) - vaep.io.mq - WARNING - Gene: SCAF4 : More than one razor protein (try first): O95104-3, A0A0A0MT33 (Gene: SCAF4) - vaep.io.mq - WARNING - Gene: SDCBP : More than one razor protein (try first): G5EA09, O00560-3 (Gene: SDCBP) - vaep.io.mq - WARNING - Gene: SEC23A : More than one razor protein (try first): F5H365, A0A2R8YFH5 (Gene: SEC23A) - vaep.io.mq - WARNING - Gene: SERBP1 : More than one razor protein (try first): Q8NC51-3, Q8NC51 (Gene: SERBP1) - vaep.io.mq - WARNING - Gene: SERPINB8: More than 
one razor protein (try first): P50452, P30740 (Gene: SERPINB8) - vaep.io.mq - WARNING - Gene: SF1 : More than one razor protein (try first): Q15637-4, Q15637-5, H7C561 (Gene: SF1) - vaep.io.mq - WARNING - Gene: SFN : More than one razor protein (try first): P62258, P31947-2 (Gene: SFN) - vaep.io.mq - WARNING - Gene: SLC25A12: More than one razor protein (try first): Q9UJS0-2, O75746 (Gene: SLC25A12) - vaep.io.mq - WARNING - Gene: SLC25A4 : More than one razor protein (try first): P12236, P05141 (Gene: SLC25A4) - vaep.io.mq - WARNING - Gene: SLC25A6 : More than one razor protein (try first): P12236, P05141 (Gene: SLC25A6) - vaep.io.mq - WARNING - Gene: SLC3A2 : More than one razor protein (try first): F5GZS6, F5H867 (Gene: SLC3A2) - vaep.io.mq - WARNING - Gene: SLC6A15 : More than one razor protein (try first): F8VSG1, F8VX16 (Gene: SLC6A15) - vaep.io.mq - WARNING - Gene: SLK : More than one razor protein (try first): Q9H2G2-2, Q9H2G2 (Gene: SLK) - vaep.io.mq - WARNING - Gene: SMARCA2 : More than one razor protein (try first): P51531-2, P51532 (Gene: SMARCA2) - vaep.io.mq - WARNING - Gene: SMARCC1 : More than one razor protein (try first): Q92922, F8VXC8 (Gene: SMARCC1) - vaep.io.mq - WARNING - Gene: SMEK2 : More than one razor protein (try first): G3V5Z3, Q5MIZ7-3 (Gene: SMEK2) - vaep.io.mq - WARNING - Gene: SMPD4 : More than one razor protein (try first): F8WF03, Q9NXE4-2 (Gene: SMPD4) - vaep.io.mq - WARNING - Gene: SNRPB2 : More than one razor protein (try first): P09012, P08579 (Gene: SNRPB2) - vaep.io.mq - WARNING - Gene: SNX12 : More than one razor protein (try first): A0A087X0R6, O60493 (Gene: SNX12) - vaep.io.mq - WARNING - Gene: SNX2 : More than one razor protein (try first): O60749, Q13596-2 (Gene: SNX2) - vaep.io.mq - WARNING - Gene: SNX6 : More than one razor protein (try first): A0A0A0MRI2, Q9Y5X3 (Gene: SNX6) - vaep.io.mq - WARNING - Gene: SP100 : More than one razor protein (try first): Q5T7C4, P23497 (Gene: SP100) - vaep.io.mq - WARNING - Gene: 
SPATA5 : More than one razor protein (try first): Q8NB90, P62195-2 (Gene: SPATA5) - vaep.io.mq - WARNING - Gene: SPECC1L : More than one razor protein (try first): A0A2R8Y5S7, A0A494C1J1 (Gene: SPECC1L) - vaep.io.mq - WARNING - Gene: SPECC1L-ADORA2A: More than one razor protein (try first): A0A2R8Y5S7, A0A494C1J1 (Gene: SPECC1L-ADORA2A) - vaep.io.mq - WARNING - Gene: SPTBN1 : More than one razor protein (try first): Q01082, Q01082-3 (Gene: SPTBN1) - vaep.io.mq - WARNING - Gene: SPTBN2 : More than one razor protein (try first): O15020, Q01082 (Gene: SPTBN2) - vaep.io.mq - WARNING - Gene: SRPK2 : More than one razor protein (try first): P78362-2, Q96SB4 (Gene: SRPK2) - vaep.io.mq - WARNING - Gene: SRSF4 : More than one razor protein (try first): Q08170, Q13247 (Gene: SRSF4) - vaep.io.mq - WARNING - Gene: SRSF5 : More than one razor protein (try first): Q13247, Q13243-3 (Gene: SRSF5) - vaep.io.mq - WARNING - Gene: SRSF7 : More than one razor protein (try first): A0A0B4J1Z1, P84103 (Gene: SRSF7) - vaep.io.mq - WARNING - Gene: STAM2 : More than one razor protein (try first): O75886, Q92783-2 (Gene: STAM2) - vaep.io.mq - WARNING - Gene: STAT5B : More than one razor protein (try first): K7EK35, P51692 (Gene: STAT5B) - vaep.io.mq - WARNING - Gene: STK10 : More than one razor protein (try first): O94804, Q9H2G2-2 (Gene: STK10) - vaep.io.mq - WARNING - Gene: STK24 : More than one razor protein (try first): B4DR80, Q8NBY1 (Gene: STK24) - vaep.io.mq - WARNING - Gene: STMN2 : More than one razor protein (try first): E5RGX5, P16949 (Gene: STMN2) - vaep.io.mq - WARNING - Gene: STRBP : More than one razor protein (try first): Q96SI9-2, Q12906-7 (Gene: STRBP) - vaep.io.mq - WARNING - Gene: STT3B : More than one razor protein (try first): P46977, Q8TCJ2 (Gene: STT3B) - vaep.io.mq - WARNING - Gene: SUGT1 : More than one razor protein (try first): Q9Y2Z0, Q9Y2Z0-2 (Gene: SUGT1) - vaep.io.mq - WARNING - Gene: SUMO3 : More than one razor protein (try first): A8MUA9, P61956-2 (Gene: 
SUMO3) - vaep.io.mq - WARNING - Gene: TACC1 : More than one razor protein (try first): O75410-2, Q4VXL4 (Gene: TACC1) - vaep.io.mq - WARNING - Gene: TAF9 : More than one razor protein (try first): A0A087WVD7, D6RGK3 (Gene: TAF9) - vaep.io.mq - WARNING - Gene: TAP2 : More than one razor protein (try first): A0A0G2JLV0, Q9NRK6 (Gene: TAP2) - vaep.io.mq - WARNING - Gene: TBL1X : More than one razor protein (try first): Q9BZK7, O60907-2 (Gene: TBL1X) - vaep.io.mq - WARNING - Gene: TCOF1 : More than one razor protein (try first): Q13428-8, A0A3B3IS06 (Gene: TCOF1) - vaep.io.mq - WARNING - Gene: TCP1 : More than one razor protein (try first): F5H136, P17987 (Gene: TCP1) - vaep.io.mq - WARNING - Gene: TIAL1 : More than one razor protein (try first): Q01085, F8W8I6 (Gene: TIAL1) - vaep.io.mq - WARNING - Gene: TLN2 : More than one razor protein (try first): Q9Y4G6, Q9Y490 (Gene: TLN2) - vaep.io.mq - WARNING - Gene: TMPO : More than one razor protein (try first): P42166, P42167 (Gene: TMPO) - vaep.io.mq - WARNING - Gene: TMSB4X : More than one razor protein (try first): P63313, P62328 (Gene: TMSB4X) - vaep.io.mq - WARNING - Gene: TNPO2 : More than one razor protein (try first): Q92973-2, O14787-2 (Gene: TNPO2) - vaep.io.mq - WARNING - Gene: TOP2B : More than one razor protein (try first): Q02880-2, P11388 (Gene: TOP2B) - vaep.io.mq - WARNING - Gene: TOR1AIP2: More than one razor protein (try first): Q9H496, Q8NFQ8, A0A0A0MSK5 (Gene: TOR1AIP2) - vaep.io.mq - WARNING - Gene: TPM1 : More than one razor protein (try first): F5H7S3, P07951-2, P09493, A0A087WWU8 (Gene: TPM1) - vaep.io.mq - WARNING - Gene: TPM2b : More than one razor protein (try first): P07951-2, A0A087WWU8, P67936 (Gene: TPM2b) - vaep.io.mq - WARNING - Gene: TPM4 : More than one razor protein (try first): P67936, A0A087WWU8 (Gene: TPM4) - vaep.io.mq - WARNING - Gene: TRA2A : More than one razor protein (try first): P62995-3, Q13595-2 (Gene: TRA2A) - vaep.io.mq - WARNING - Gene: TRIM24 : More than one razor 
protein (try first): O15164-2, Q9UPN9 (Gene: TRIM24) - vaep.io.mq - WARNING - Gene: TSC22D1 : More than one razor protein (try first): Q9Y3Q8, Q15714-2 (Gene: TSC22D1) - vaep.io.mq - WARNING - Gene: TSC22D2 : More than one razor protein (try first): O75157-2, Q9Y3Q8 (Gene: TSC22D2) - vaep.io.mq - WARNING - Gene: TUBA1C : More than one razor protein (try first): P68363, F5H5D3 (Gene: TUBA1C) - vaep.io.mq - WARNING - Gene: TUBA4A : More than one razor protein (try first): P68363, P68366-2 (Gene: TUBA4A) - vaep.io.mq - WARNING - Gene: TUBA8 : More than one razor protein (try first): P68363, Q9NY65-2, F5H5D3 (Gene: TUBA8) - vaep.io.mq - WARNING - Gene: TUBAL3 : More than one razor protein (try first): P68363, A6NHL2-2 (Gene: TUBAL3) - vaep.io.mq - WARNING - Gene: TUBB1 : More than one razor protein (try first): Q5JP53, Q13509 (Gene: TUBB1) - vaep.io.mq - WARNING - Gene: TUBB2A : More than one razor protein (try first): Q5JP53, Q9BVA1 (Gene: TUBB2A) - vaep.io.mq - WARNING - Gene: TUBB2B : More than one razor protein (try first): Q5JP53, Q9BVA1 (Gene: TUBB2B) - vaep.io.mq - WARNING - Gene: TUBB3 : More than one razor protein (try first): Q5JP53, P68371, Q13509 (Gene: TUBB3) - vaep.io.mq - WARNING - Gene: TUBB4A : More than one razor protein (try first): P68371, Q5JP53 (Gene: TUBB4A) - vaep.io.mq - WARNING - Gene: TUBB4B : More than one razor protein (try first): P68371, Q5JP53 (Gene: TUBB4B) - vaep.io.mq - WARNING - Gene: TUBB6 : More than one razor protein (try first): Q9BUF5, Q13509, Q5JP53, Q9BVA1 (Gene: TUBB6) - vaep.io.mq - WARNING - Gene: TUBB8 : More than one razor protein (try first): P68371, Q3ZCM7, Q5JP53 (Gene: TUBB8) - vaep.io.mq - WARNING - Gene: TWF2 : More than one razor protein (try first): Q6IBS0, Q12792 (Gene: TWF2) - vaep.io.mq - WARNING - Gene: TXNRD2 : More than one razor protein (try first): A0A182DWF2, Q16881 (Gene: TXNRD2) - vaep.io.mq - WARNING - Gene: UBE2B : More than one razor protein (try first): H0YA80, A0A0D9SG71 (Gene: UBE2B) - vaep.io.mq 
- WARNING - Gene: UBE2E2 : More than one razor protein (try first): R4GND1, C9J2P0 (Gene: UBE2E2) - vaep.io.mq - WARNING - Gene: UBE2E3 : More than one razor protein (try first): R4GND1, C9J2P0 (Gene: UBE2E3) - vaep.io.mq - WARNING - Gene: UBE2V2 : More than one razor protein (try first): I3L0A0, Q15819 (Gene: UBE2V2) - vaep.io.mq - WARNING - Gene: UBP1 : More than one razor protein (try first): Q9NZI7, Q12800-4 (Gene: UBP1) - vaep.io.mq - WARNING - Gene: UBQLN2 : More than one razor protein (try first): Q9UHD9, Q9UMX0-2 (Gene: UBQLN2) - vaep.io.mq - WARNING - Gene: UBQLN4 : More than one razor protein (try first): Q9NRR5, Q9UMX0-2 (Gene: UBQLN4) - vaep.io.mq - WARNING - Gene: UBR4 : More than one razor protein (try first): Q5T4S7-3, X6R960 (Gene: UBR4) - vaep.io.mq - WARNING - Gene: USP13 : More than one razor protein (try first): Q92995, P45974 (Gene: USP13) - vaep.io.mq - WARNING - Gene: USP4 : More than one razor protein (try first): Q13107-2, Q9Y4E8 (Gene: USP4) - vaep.io.mq - WARNING - Gene: VAPB : More than one razor protein (try first): O95292, Q9P0L0 (Gene: VAPB) - vaep.io.mq - WARNING - Gene: VEZF1 : More than one razor protein (try first): I3L2Z5, Q14119 (Gene: VEZF1) - vaep.io.mq - WARNING - Gene: VPS4A : More than one razor protein (try first): O75351, Q9UN37 (Gene: VPS4A) - vaep.io.mq - INFO - Gene: VTN : Potential contaminent protein is leading razor protein: CON__Q3ZBS7 (Gene: VTN) - vaep.io.mq - WARNING - Gene: YBX3 : More than one razor protein (try first): P16989-2, P67809, P16989 (Gene: YBX3) - vaep.io.mq - WARNING - Gene: YTHDF1 : More than one razor protein (try first): Q9Y5A9, Q9BYJ9, Q7Z739 (Gene: YTHDF1) - vaep.io.mq - WARNING - Gene: YTHDF3 : More than one razor protein (try first): Q7Z739, Q9Y5A9 (Gene: YTHDF3) - vaep.io.mq - WARNING - Gene: YWHAB : More than one razor protein (try first): P31946-2, P62258, P63104, P61981 (Gene: YWHAB) - vaep.io.mq - WARNING - Gene: YWHAG : More than one razor protein (try first): P61981, P62258 (Gene: 
YWHAG) - vaep.io.mq - WARNING - Gene: YWHAH : More than one razor protein (try first): Q04917, P62258 (Gene: YWHAH) - vaep.io.mq - WARNING - Gene: YWHAQ : More than one razor protein (try first): P27348, P62258, P63104 (Gene: YWHAQ) - vaep.io.mq - WARNING - Gene: YWHAZ : More than one razor protein (try first): P63104, P62258 (Gene: YWHAZ) - vaep.io.mq - WARNING - Gene: ZC3H7B : More than one razor protein (try first): Q8IWR0, Q9UGR2 (Gene: ZC3H7B) - vaep.io.mq - INFO Dumped 646 genes from 20190611_QX3_LiSc_MA_Hela_500ng_LC15 - - - -```python -# same code fails in `vaep.io.mq`, ABC needed? -isinstance(mq_output, MaxQuantOutput), type(mq_output) -``` - - - - - (True, vaep.io.mq.MaxQuantOutput) - - - -#### Descriptics - - -```python -s_completeness = pd.Series(completeness_per_gene, name='completenes_by_gene') -s_completeness.describe() -``` - - - - - count 5,876.0 - mean 0.3 - std 0.2 - min 0.0 - 25% 0.1 - 50% 0.2 - 75% 0.4 - max 1.0 - Name: completenes_by_gene, dtype: float64 - - - - -```python -N_BINS = 20 -ax = s_completeness.plot(kind='hist', - bins=N_BINS, - xticks=[x/100 for x in range(0, 101, 5)], - figsize=(10, 5), - rot=90, - title=f"Frequency of proportion of observed exact peptides (completness) per razor protein from 0 to 1 in {N_BINS} bins" - f"\nin sample {mq_output.folder.stem}") - -_ = ax.set_xlabel( - "Proportion of exactly observed peptides (including up to 2 mis-cleavages)") - -fig = ax.get_figure() -fig.tight_layout() -fig.savefig(FIGUREFOLDER / mq_output.folder.stem / 'freq_completeness.png') -``` - - -![png](01_explore_raw_MQ_data_files/01_explore_raw_MQ_data_146_0.png) - - -based on completeness, select valid training data - - -```python -# continously decrease this number in the scope of the project -mask = s_completeness > .6 -s_completeness.loc[mask] -``` - - - - - ACOT13 0.8 - ACTB 0.8 - ACTG1 0.8 - ACTN1 0.6 - ACTN4 0.7 - .. 
- YWHAE 0.9 - YWHAG 0.8 - YWHAQ 0.8 - YWHAZ 0.9 - ZYX 0.6 - Name: completenes_by_gene, Length: 590, dtype: float64 - - diff --git a/project/doc/ipynbs/01_explore_raw_MQ_data_files/01_explore_raw_MQ_data_111_0.png b/project/doc/ipynbs/01_explore_raw_MQ_data_files/01_explore_raw_MQ_data_111_0.png deleted file mode 100644 index 559b597ab..000000000 Binary files a/project/doc/ipynbs/01_explore_raw_MQ_data_files/01_explore_raw_MQ_data_111_0.png and /dev/null differ diff --git a/project/doc/ipynbs/01_explore_raw_MQ_data_files/01_explore_raw_MQ_data_146_0.png b/project/doc/ipynbs/01_explore_raw_MQ_data_files/01_explore_raw_MQ_data_146_0.png deleted file mode 100644 index 991323ef2..000000000 Binary files a/project/doc/ipynbs/01_explore_raw_MQ_data_files/01_explore_raw_MQ_data_146_0.png and /dev/null differ diff --git a/project/doc/ipynbs/11_training_data_exploration_peptides.md b/project/doc/ipynbs/11_training_data_exploration_peptides.md deleted file mode 100644 index a91cbc558..000000000 --- a/project/doc/ipynbs/11_training_data_exploration_peptides.md +++ /dev/null @@ -1,6747 +0,0 @@ -# Peptides - -Load peptides selected for training - - -```python -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt - -from src import config -from vaep.analyzers import analyzers - -# from sklearn import preprocessing -# from sklearn.decomposition import PCA -import seaborn as sns - -import vaep - -pd.options.display.max_columns = 100 -pd.options.display.min_rows = 30 -``` - - FOLDER_MQ_TXT_DATA = data\mq_out - - -## Descriptive Statistics (Linear case) - -- spread of peptide quantifications between samples -- spread of quantifications within samples -- correlation analysis: can linear correlation be picked up? 
- - -### Peptides - - -```python -FNAME = 'df_intensities_N07813_M01000' -FN_PEPTIDE_INTENSITIES = config.FOLDER_DATA / FNAME -FIGUREFOLDER = config.FIGUREFOLDER / FNAME -FIGUREFOLDER.mkdir(exist_ok=True) -``` - - -```python -N_FIRST_ROWS = None # possibility to select N first rows -analysis = analyzers.AnalyzePeptides.from_csv(fname=FN_PEPTIDE_INTENSITIES, nrows=N_FIRST_ROWS) -analysis.describe_peptides(sample_n=30) -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
TALIHDGLARFNEEHIPDSPFVVPVASPSGDARSGAQASSTPLSPTREQISDIDDAVRSPYTVTVGQACNPSACRSLTNDWEDHLAVKGGMGSGGLATGIAGGLAGMGGIQNEKYGDGGSTFQSTTGHCVHMRLVLVGDGGTGKWSGPLSLQEVDEQPQHPLHVTYAGAAVDELGKDLLHPSPEEEKTLQIFNIEMKAGAIAPCEVTVPAQNTGLGPEKTAVVVGTITDDVRYTVQDESHSEWVSCVRGVDEVTIVNILTNRSAEFLLHMLKTAFDEAIAELDTLNEDSYKGAGTDDHTLIRLSVLGAITSVQQRNQVALNPQNTVFDAKLAPITSDPTEATAVGAVEASFKLSPPYSSPQEFAQDVGRLQLWDTAGQERSTAGDTHLGGEDFDNRSHTILLVQPTKMDATANDVPSPYEVRSYCAEIAHNVSSKVLLPEYGGTKVLAVNQENEQLMEDYEK
count7,600.0007,506.0007,411.0007,391.0007,430.0007,728.0007,478.0007,414.0007,555.0007,474.0007,519.0007,403.0007,556.0007,429.0007,385.0007,616.0007,687.0007,424.0007,395.0007,500.0007,627.0007,698.0007,451.0007,496.0007,677.0007,521.0007,622.0007,545.0007,528.0007,434.000
mean1,421,973,967.249786,460,387.144505,834,649.143966,069,353.741372,199,024.4413,689,931,536.982672,170,734.020918,210,755.1522,199,479,881.6021,408,685,203.532976,539,830.164317,222,216.431981,323,585.521920,763,307.484518,797,230.9683,320,534,026.9681,113,957,436.6331,166,961,953.043953,373,943.442236,175,349.0931,266,474,449.1413,668,408,443.336143,961,101.745552,575,572.4793,075,420,507.646644,848,827.539469,675,591.0261,049,845,729.1722,385,589,155.393191,500,056.255
std1,723,677,276.987693,990,474.250432,014,085.657982,542,709.391399,614,425.3433,778,115,570.3411,227,907,038.0391,334,548,721.8201,979,222,398.8801,730,125,115.5771,250,215,495.514293,766,401.1061,051,915,582.6601,258,672,916.699702,924,312.6992,835,176,216.3461,050,930,838.0051,062,125,567.075897,715,467.357242,849,890.2841,777,775,210.9762,885,008,198.807192,839,191.009523,967,080.5633,603,302,812.275771,401,925.316564,039,333.325976,058,017.7372,179,108,146.628211,872,794.823
min708,840.0001,506,400.0001,739,600.0001,010,300.0001,838,900.0002,733,500.0001,713,700.0001,250,900.0001,448,500.0001,626,800.0001,931,200.000796,680.0002,620,400.0001,468,800.0001,500,800.000401,110.0001,022,400.000708,790.000664,550.0001,185,000.0002,048,400.0001,154,900.0001,815,200.0002,167,500.0001,437,000.000677,200.0002,277,200.0002,231,700.0001,492,300.0001,217,500.000
25%473,762,500.000323,900,000.000236,125,000.000387,005,000.000114,452,500.0001,389,025,000.000139,342,500.000164,297,500.000973,775,000.000477,245,000.000306,175,000.000131,065,000.000333,887,500.000249,730,000.000141,610,000.0001,288,525,000.000431,820,000.000367,077,500.000332,780,000.00090,517,500.000424,120,000.0001,617,975,000.00046,024,500.000210,735,000.0001,163,200,000.000234,880,000.000178,510,000.000389,480,000.0001,049,950,000.00067,820,500.000
50%892,095,000.000609,095,000.000374,590,000.000704,060,000.000251,575,000.0002,551,200,000.000350,125,000.000444,175,000.0001,670,600,000.000878,415,000.000600,110,000.000237,600,000.000661,100,000.000536,980,000.000292,240,000.0002,487,100,000.000816,800,000.000893,925,000.000705,530,000.000165,010,000.000709,240,000.0002,911,750,000.00080,197,000.000384,255,000.0001,997,900,000.000425,810,000.000295,410,000.000765,100,000.0001,737,650,000.000125,005,000.000
75%1,606,900,000.0001,049,450,000.000623,395,000.0001,194,450,000.000486,015,000.0004,511,925,000.000708,510,000.0001,058,650,000.0002,711,550,000.0001,557,225,000.0001,056,500,000.000427,025,000.0001,240,000,000.000988,290,000.000593,650,000.0004,660,825,000.0001,468,050,000.0001,658,950,000.0001,264,350,000.000281,177,500.0001,252,950,000.0004,933,250,000.000154,490,000.000706,912,500.0003,293,600,000.000725,330,000.000512,512,500.0001,389,700,000.0002,931,000,000.000246,677,500.000
max17,680,000,000.00010,892,000,000.0006,664,800,000.00020,812,000,000.0005,618,700,000.00045,946,000,000.00027,939,000,000.00012,833,000,000.00027,228,000,000.00016,413,000,000.00011,896,000,000.0006,808,600,000.00012,396,000,000.00014,025,000,000.0007,758,600,000.00025,866,000,000.00016,165,000,000.00013,087,000,000.00011,123,000,000.0003,813,700,000.00017,397,000,000.00030,067,000,000.0002,933,400,000.0007,092,500,000.00026,607,000,000.00013,662,000,000.0006,576,800,000.00012,680,000,000.00018,192,000,000.0005,457,200,000.000
CV1.2120.8820.8541.0171.0741.0241.8271.4530.9001.2281.2800.9261.0721.3671.3550.8540.9430.9100.9421.0281.4040.7861.3400.9481.1721.1961.2010.9300.9131.106
-
- - - - -```python -sample = analysis.df.sample(n=30, axis=1) -# ToDo func is assigned to df -corr_lower_triangle = analyzers.corr_lower_triangle(sample) -corr_lower_triangle -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
SYELPDGQVITIGNEREITALAPSTMKTINEVENQILTRHIADLAGNSEVILPVPAFNVINGGSHAGNKTPLHEIALSIKVFLENVIRTANDMIHAENMREQIVPKPEEEVAQKLATQSNEITIPVTFESRYGDGGSTFQSTTGHCVHMRKVEEAEPEEFVVEKYNEQHVPGSPFTARHNQLPLVIEFTEQTAPKEHDPVGQMVNNPKDLSHIGDAVVISCAKVANVSLLALYKSYSPYDMLESIRKVLSGTIHAGQPVKNRPTSISWDGLDSGKSTTTGHLIYKMVSDINNGWQHLEQAEKAMGIMNSFVNDIFERVVVLMGSTSDLGHCEKGYSFTTTAERAQTAHIVLEDGTKSGDAAIVDMVPGKPMCVESFSDYPPLGRVEFMDDTSRYLTVAAVFRNIPGITLLNVSKARFEELNADLFR
SYELPDGQVITIGNERNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
EITALAPSTMK0.743NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
TINEVENQILTR0.6260.690NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
HIADLAGNSEVILPVPAFNVINGGSHAGNK0.6540.5890.414NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
TPLHEIALSIK0.5790.6010.3430.688NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
VFLENVIR0.7370.7640.5130.5940.665NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
TANDMIHAENMR0.4470.5480.5800.4100.3660.442NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
EQIVPKPEEEVAQK0.5840.6010.2790.6690.7910.6180.328NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
LATQSNEITIPVTFESR0.7330.6700.4160.7490.7670.6670.3520.798NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
YGDGGSTFQSTTGHCVHMR0.5620.6070.3960.6700.7830.6400.4740.7440.730NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
KVEEAEPEEFVVEK0.6360.6640.3480.6990.8560.6730.3730.8480.8450.801NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
YNEQHVPGSPFTAR0.3930.5540.7070.2020.0690.3120.5710.0460.1540.1320.086NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
HNQLPLVIEFTEQTAPK0.6270.5710.3640.6960.7710.6470.3550.7740.7730.6350.7980.146NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
EHDPVGQMVNNPK0.5430.5690.3160.6070.7290.5630.3230.7410.7340.7250.7850.0700.687NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
DLSHIGDAVVISCAK0.5810.5850.3330.6670.8620.6480.3720.7920.7580.8120.8610.0530.7500.750NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
VANVSLLALYK0.7020.6950.6010.5280.5380.6420.5310.5230.6110.5270.5630.4510.6120.4880.547NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
SYSPYDMLESIRK0.4800.5740.6910.1630.0690.3830.4520.0380.1500.0850.0780.7090.1600.0710.0470.548NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
VLSGTIHAGQPVK0.5710.6100.3300.6520.8000.6150.3650.7900.7690.7800.8390.0750.7270.7510.7940.5270.090NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
NRPTSISWDGLDSGK0.6140.6360.4080.6790.7490.6150.4690.7510.7690.7800.8480.1840.7430.7220.8050.6080.1420.767NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
STTTGHLIYK0.6760.7330.4750.6710.7260.7070.4850.7590.7530.7620.7920.2640.6640.6960.7330.6310.2420.7540.749NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
MVSDINNGWQHLEQAEK0.4870.5590.7910.3760.2600.3750.5580.2090.3390.3590.2570.6760.2400.2450.2590.4790.5800.2510.3620.374NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
AMGIMNSFVNDIFER0.5350.5710.4930.4160.2730.5480.3800.2390.3590.2770.2800.4550.3370.2180.2510.5560.5690.2680.2880.3830.379NaNNaNNaNNaNNaNNaNNaNNaNNaN
VVVLMGSTSDLGHCEK0.6140.7070.5630.5930.7010.6650.5150.6320.6530.7280.7030.3900.6400.5990.7250.6380.3610.6770.6700.6980.5150.421NaNNaNNaNNaNNaNNaNNaNNaN
GYSFTTTAER0.6820.7490.4910.6710.7280.7640.5140.7300.7430.8020.7700.2890.6690.6450.7840.5930.2280.6990.7380.8000.4180.3760.715NaNNaNNaNNaNNaNNaNNaN
AQTAHIVLEDGTK0.5590.5770.2720.6230.8160.6350.3750.8110.7440.8120.8550.0380.7560.7620.8400.4990.0220.8160.8080.7700.1730.2450.6660.749NaNNaNNaNNaNNaNNaN
SGDAAIVDMVPGKPMCVESFSDYPPLGR0.5960.6170.6600.4570.2910.4650.4030.2740.4210.3450.3120.5910.3620.2860.2750.5260.6240.2800.3320.4250.5900.5940.4890.4340.231NaNNaNNaNNaNNaN
VEFMDDTSR0.5740.6230.3650.6770.8310.6560.4830.7920.7780.8250.8350.1200.7630.7470.8470.5950.0360.8080.7630.7880.3040.2530.7290.8150.8060.288NaNNaNNaNNaN
YLTVAAVFR0.6650.6920.5600.6370.6850.6960.5790.6300.6860.7440.6940.3810.6790.5760.7320.6500.3270.6350.7270.7000.5010.4130.7150.8000.6820.4530.738NaNNaNNaN
NIPGITLLNVSK0.6610.5900.3950.6330.7150.7150.4190.7110.7710.6730.7530.1690.7570.6450.7490.6870.1890.6720.7200.7010.2970.3460.6610.7160.7090.3680.7410.686NaNNaN
ARFEELNADLFR0.6100.5980.4370.6930.7890.6830.4970.7040.7310.8310.7830.2080.7040.6330.8190.5550.1270.7230.7570.7180.4110.3020.7050.7910.7530.3350.7990.8070.695NaN
-
- - - - -```python -fig, axes = analyzers.plot_corr_histogram(corr_lower_triangle, bins=40) -``` - - -![png](11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_7_0.png) - - -### Samples - - -```python -analysis.df.sample(30, axis=0).T.describe() -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
20180424_QE1_nLC10_EM_QC_HELA_02_25020190619_QX7_IgPa_MA_HeLa_Br14_500ng20170117_QE1_nLC1_EM_QC_MNT_hela20180519_QE5_Evo2_DBJ_LFQprot_HeLa_500ng_44min_15000_0320190710_QE10_nLC0_LiNi_QC_45cm_HeLa_MUC_0120190614_QE4_nLC12_MM_QC_MNT_HELA_0220180412_QE9_nLC13_AnMu_QC_MNT_HELA_0220160923_QE1_nlc1_EM_QC_MNT_hela20181012_QE9_nLC13_FaCo_Hela_MNT_50cm_155min_1_2018101222061820190207_QE8_nLC0_ASD_QC_HeLa_43cm320160629_QE4_nLC4_SCL_QC_MNT_HeLa_0220151008_LUMOS1_LCE_ChKe_DEV_HELA_FTms2_RelaxedPrecursor_0320190709_QX6_MaTa_MA_HeLa_500ng_LC0920180924_QE1_nLC10_GP_QC_HELA_0220190408_QE7_nLC3_OOE_QC_MNT_HeLa_250ng_RO-00620160628_QE1_nlc2_BTW_qc_hela220191205_QE10_nLC13_NHS_45cm_HeLa_Muc_Test_520180419_QE1_nlc10_GP_QC_HELA_250_0120160704_QE4_LC4_IAH_QC_MNT_HeLa_0220190627_QX3_MaMu_MA_Hela_500ng_LC1520180315_QE6_nLC12_MR_QC_MNT_HELA_250ng_new_0220170406_QE4_LC12_MPL_QC_MNT_HeLa_0220151216_QE1_UPLC3_BTW_SA_hela_proteome_1-1_SCX_0420150901_QE4_UPLC5_KBS_QC_MNT_HeLa_0120180524_QE7_nLC7_MEM_QC_MNT_HeLa_250ng_0420180805_QE8_nLC13_BDA_QC_MNT_HeLa_50cm_0220150722_QE6_UPLC6_SCL_QC_MNT_HELA_0120160117_QE3_UPLC8_LVS_MNT_HELA_0120180718_QE6_nLC6_CSC_QC_HeLa_0220180219_QE6_LC12_AS_QC_MNT_HeLa_500ng_01
count996.000985.000993.000938.000996.000999.000924.000997.000908.000998.000995.000932.000763.000995.000998.000997.000990.000991.000993.000985.000993.000990.000607.000986.000999.000987.000989.000987.000999.000998.000
mean1,392,712,958.8355,373,476,844.670962,586,302.1152,541,550,101.2794,301,161,032.1291,816,145,523.5241,989,111,673.160937,092,972.116232,203,800.0001,992,806,884.770769,075,250.251320,385,632.2963,815,680,347.313516,493,794.6732,421,345,111.222593,688,419.5591,598,086,296.5661,641,802,313.824943,439,841.8933,105,161,114.2131,879,992,276.939916,228,116.970896,553,706.590685,768,706.7952,970,362,540.5411,719,667,728.470553,156,694.135549,030,982.4722,117,272,242.2423,709,578,405.812
std2,242,982,517.0636,745,097,875.3581,312,016,794.6194,420,039,110.6794,916,861,423.9403,209,273,798.5163,070,578,820.7141,452,733,147.020459,185,985.4943,679,890,769.7771,380,656,104.155714,312,784.8874,152,393,239.322892,398,431.5454,079,306,624.733942,540,467.3532,356,756,656.4412,784,782,476.7251,525,905,025.9824,195,656,179.8843,023,162,035.5401,421,681,261.2051,689,550,716.1001,210,309,371.7234,655,439,980.8053,044,974,253.469924,147,202.199950,693,505.3682,966,082,934.4365,052,111,439.322
min18,194,000.00017,275,000.00012,917,000.00010,593,000.00045,973,000.00019,634,000.00012,706,000.0008,361,200.0001,707,200.00020,438,000.0004,565,200.0006,370,400.00018,108,000.0006,500,700.00034,927,000.0004,499,700.0009,439,600.00023,242,000.00012,832,000.0002,236,800.00016,468,000.0009,433,800.0003,790,800.0008,075,900.00032,390,000.00010,755,000.0004,912,100.0004,497,100.00020,440,000.00078,939,000.000
25%325,887,500.0001,456,900,000.000309,410,000.000496,900,000.0001,274,600,000.000431,820,000.000410,887,500.000245,110,000.00044,293,000.000444,457,500.000202,990,000.00060,375,500.0001,035,100,000.000122,800,000.000610,137,500.000148,610,000.000341,875,000.000381,635,000.000250,680,000.000731,660,000.000452,680,000.000269,167,500.00063,576,000.000141,055,000.000788,835,000.000330,590,000.000112,510,000.000133,835,000.000627,670,000.0001,043,325,000.000
50%696,145,000.0003,209,800,000.000562,140,000.0001,154,750,000.0002,694,450,000.000893,340,000.000987,740,000.000492,880,000.00095,928,000.000920,540,000.000383,260,000.000128,540,000.0002,470,500,000.000243,820,000.0001,193,750,000.000301,090,000.000778,745,000.000769,930,000.000490,260,000.0001,724,600,000.000932,080,000.000490,675,000.000289,250,000.000314,150,000.0001,495,900,000.000762,520,000.000251,910,000.000265,830,000.0001,149,200,000.0002,069,950,000.000
75%1,444,200,000.0006,575,800,000.0001,096,000,000.0002,633,750,000.0005,434,400,000.0001,824,450,000.0002,064,475,000.0001,028,900,000.000210,355,000.0001,829,600,000.000789,355,000.000297,082,500.0005,001,500,000.000527,675,000.0002,443,775,000.000651,220,000.0001,758,800,000.0001,724,450,000.000991,460,000.0003,765,900,000.0001,965,400,000.000947,270,000.0001,031,000,000.000703,440,000.0003,162,400,000.0001,706,250,000.000565,400,000.000543,040,000.0002,357,600,000.0004,090,225,000.000
max22,482,000,000.00077,523,000,000.00016,331,000,000.00051,248,000,000.00044,903,000,000.00036,166,000,000.00025,499,000,000.00016,561,000,000.0005,309,300,000.00047,214,000,000.00018,560,000,000.00011,148,000,000.00025,813,000,000.00010,282,000,000.00056,563,000,000.00010,532,000,000.00020,075,000,000.00042,175,000,000.00018,265,000,000.00049,217,000,000.00038,549,000,000.00015,485,000,000.00020,139,000,000.00012,966,000,000.00046,644,000,000.00031,612,000,000.0008,905,000,000.00012,113,000,000.00031,098,000,000.00045,647,000,000.000
-
- - - -### Peptides (all) - - -```python -stats = analysis.describe_peptides() -``` - - -```python -_ = stats.loc['CV'].hist(figsize=(10, 4)) # biological coefficient of variation: standard deviation (variation) w.r.t mean -``` - - -![png](11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_12_0.png) - - - -```python -_ = stats.loc['count'].hist(figsize=(10,4)) -``` - - -![png](11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_13_0.png) - - - -```python -X = analysis.df -INDEX_NAME = 'Sample ID' -analysis.df.index.name = INDEX_NAME -``` - - -```python -analysis.df -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
AAAAAAALQAKAAFDDAIAELDTLSEESYKAAHSEGNTTAGLDMRAAVATFLQSVQVPEFTPKAAVEEGIVLGGGCALLRAAVPSGASTGIYEALELRAAVPSGASTGIYEALELRDNDKACANPAAGSVILLENLRACGLVASNLNLKPGECLRADLINNLGTIAKADLLLSTQPGREEGSPLELERADRDESSPYAAMLAAQDVAQRAEEYEFLTPVEEAPKAFDSGIIPMEFVNKAFGYYGPLRAFYPEEISSMVLTKAGAGSATLSMAYAGARAGFAGDDAPRAGGAAVVITEPEHTKAGGEAGVTLGQPHLSRAGKPVICATQMLESMIKAGLQFPVGRAGNLGGGVVTIERAGVNTVTTLVENKKAHGPGLEGGLVGKPAEFTIDTKAHQVVEDGYEFFAKAHSSMVGVNLPQKAIADTGANVVVTGGKAIAELGIYPAVDPLDSTSRAIGVLTSGGDAQGMNAAVRAIIIFVPVPQLKAILVDLEPGTMDSVRAIPQLQGYLRAITGASLADIMAKAIVAIENPADVSVISSRALAAAGYDVEKALANVNIGSLICNVGAGGPAPAAGAAPAGGPAPSTAAAPAEEKALDIAENEMPGLMRALDTMNFDVIKALDVMVSTFHKALEHFTDLYDIKALESPERPFLAILGGAKALIAAQYSGAQVRALIVLAHSERALLFIPRALLFVPRALLVTASQCQQPAENKALMLQGVDLLADAVAVTMGPKALPFWNEEIVPQIKALQSGQCAGAALDVFTEEPPRDR...SPFEVQVGPEAGMQKSPYQEFTDHLVKSPYTVTVGQACNPSACRSTGGAPTFNVTVTKSTVHEILCKSYGRPPPDVEGMTSLKTANDMIHAENMRTFSYAGFEMQPKTGAAPIIDVVRTGVAVNKPAEFTVDAKTINEVENQILTRTLQALQIPAAKTLSDYNIQKTTPSVVAFTADGERTVPEELVKPEELSKVACIGAWHPARVAHSFNCTPIEGMLSHQLKVALVYGQMNEPPGARVDFNVPMKVEFMDDTSRVELVPPTPAEIPRVEPGLGADNSVVRVFQFLNAKVGDAIPAVEVFEGEPGNKVGLQVVAVKVGQEIEVRPGIVSKVIDPATATSVDLRVIMVTGDHPITAKVITIMQNPRVIVVGNPANTNCLTASKVLALPEPSPAAPTLRVLAMSGDPNYLHRVLQALEGLKVNGRPLEMIEPRVNNSSLIGLGYTQTLKPGIKVNVPVIGGHAGKVPPAINQFTQALDRVSQEHPVVLTKVSVADHSLHLSKVTAQGPGLEPSGNIANKVVFVFGPDKVVFVFGPDKKVYALPEDLVEVKPKYADLTEDQLPSCESLKYDDMAAAMKYDDMAACMKYDDMATCMKYLAEVACGDDRKYLDEDTIYHLQPSGRYRVPDVLVADPPIAR
Sample ID
20150330_QE2_UPLC4_BTW_sa_hela_C18_35b368,350,000.0002,978,300,000.000447,870,000.000329,590,000.0002,052,900,000.0005,346,500,000.0005,538,600,000.000315,280,000.000225,940,000.00086,896,000.000172,400,000.00060,875,000.00055,623,000.000230,700,000.00028,325,000.000394,320,000.000974,330,000.00070,024,000.000128,060,000.00043,064,000.0002,207,600,000.000181,550,000.000483,150,000.000124,710,000.000108,700,000.00089,397,000.00053,419,000.000347,860,000.000444,000,000.00020,867,000.0002,909,600,000.0002,199,200,000.00018,084,000.000248,090,000.00089,239,000.0002,256,600,000.000413,850,000.00018,635,000.00035,506,000.0003,724,600,000.00092,645,000.0002,720,400,000.0001,972,400,000.0001,952,400,000.000287,410,000.00083,841,000.00051,588,000.0001,024,000,000.0001,397,700,000.000131,250,000.000...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
20190108_QE1_nLC2_MB_QC_MNT_HELA_new_012,120,200,000.0001,541,700,000.000225,130,000.000346,330,000.000836,740,000.000NaN2,130,300,000.000941,980,000.0001,306,000,000.0002,351,200,000.000430,070,000.000319,080,000.000NaN554,990,000.000290,570,000.000514,890,000.000294,180,000.0007,117,300,000.000329,630,000.00091,194,000.0002,666,700,000.0006,983,100,000.000715,940,000.000826,660,000.000324,460,000.000250,010,000.000753,870,000.000331,430,000.000237,370,000.000130,370,000.000953,210,000.0002,160,000,000.000469,400,000.000905,390,000.000482,890,000.0003,170,400,000.000536,330,000.000229,350,000.000374,100,000.0002,425,900,000.000500,900,000.0001,523,400,000.000691,710,000.000484,580,000.0001,458,100,000.000911,950,000.000426,180,000.0001,241,500,000.000893,960,000.00041,924,000.000...132,470,000.000561,510,000.000136,050,000.0002,574,200,000.0002,206,500,000.00041,269,000.000570,850,000.000354,160,000.000662,960,000.000822,760,000.000673,190,000.000116,680,000.0001,582,300,000.000205,630,000.000162,050,000.000958,260,000.000824,820,000.00098,044,000.000729,670,000.000181,150,000.00069,166,000.000432,140,000.000665,260,000.000150,890,000.0002,881,900,000.000369,370,000.00070,220,000.000266,810,000.000639,230,000.000279,710,000.000104,350,000.000304,800,000.000250,500,000.000394,930,000.000390,350,000.00072,787,000.000195,100,000.000469,560,000.000205,290,000.000779,340,000.000520,620,000.000546,910,000.000287,240,000.000428,720,000.000430,640,000.000446,460,000.000212,690,000.000453,100,000.000109,250,000.000281,660,000.000
20190422_QE4_LC12_JE-IAH_QC_MNT_HeLa_01b1,884,100,000.0003,770,700,000.000263,090,000.000591,560,000.0001,704,400,000.000561,130,000.0003,633,200,000.0002,042,400,000.0002,974,900,000.0004,060,300,000.0001,946,400,000.000591,070,000.000282,450,000.000857,540,000.000690,610,000.000839,550,000.000344,620,000.0008,610,700,000.000398,970,000.000123,500,000.0004,171,800,000.0009,473,200,000.0001,342,800,000.0001,474,700,000.000660,890,000.000464,310,000.0001,405,600,000.000422,570,000.0001,082,400,000.000246,560,000.0001,386,100,000.0003,962,500,000.000821,470,000.0001,117,300,000.000828,960,000.0006,477,200,000.0001,619,900,000.000390,380,000.000688,140,000.0001,874,500,000.000366,990,000.0002,600,400,000.0001,046,400,000.000586,540,000.0001,348,000,000.000967,890,000.000804,410,000.00011,279,000,000.0001,950,900,000.000156,140,000.000...417,330,000.0001,003,200,000.000670,310,000.0003,694,000,000.0005,340,600,000.000123,840,000.000570,440,000.000296,360,000.0001,250,800,000.0002,099,500,000.0001,138,900,000.000175,610,000.0002,196,200,000.000434,370,000.000220,370,000.000883,140,000.000726,710,000.000165,020,000.000834,220,000.000256,340,000.000126,530,000.000565,670,000.000671,060,000.000359,370,000.0001,871,700,000.000671,190,000.000122,440,000.000479,110,000.000913,080,000.000552,080,000.000161,420,000.000671,660,000.000380,520,000.000601,110,000.0001,417,100,000.000125,740,000.000301,600,000.000431,340,000.000346,100,000.0001,563,600,000.000596,660,000.000570,070,000.000488,160,000.000692,360,000.000493,690,000.000543,140,000.000223,520,000.000712,380,000.000270,370,000.000504,450,000.000
20191217_QE2_NLC0_GP_QC_MNT_HELA_013,843,200,000.0002,332,300,000.00044,942,000.000671,390,000.0001,579,900,000.000431,240,000.0008,998,200,000.0002,782,000,000.0002,632,500,000.0006,855,900,000.0001,216,100,000.000796,530,000.000262,810,000.0001,422,900,000.000576,010,000.0001,275,300,000.000451,550,000.00017,391,000,000.000718,170,000.00096,927,000.0004,826,300,000.00015,025,000,000.0001,678,400,000.0001,384,200,000.000615,190,000.000472,280,000.0001,375,300,000.000458,760,000.0001,313,600,000.000277,790,000.0002,122,400,000.0004,254,000,000.0001,176,900,000.0001,407,700,000.000904,550,000.0008,652,200,000.0002,552,100,000.000363,940,000.000771,890,000.0004,305,500,000.000660,830,000.0003,012,100,000.0001,435,000,000.000813,650,000.0002,505,800,000.0001,724,200,000.000767,680,000.0006,946,900,000.0002,430,700,000.000197,720,000.000...346,130,000.0001,343,600,000.000462,060,000.0007,615,700,000.0006,855,500,000.000226,490,000.000123,800,000.000482,020,000.0001,631,900,000.0002,513,300,000.0001,618,300,000.000210,160,000.0002,282,300,000.000388,330,000.000253,810,000.0001,563,600,000.0001,424,000,000.000206,870,000.0001,113,600,000.000293,320,000.000128,980,000.000975,910,000.0001,077,100,000.000422,580,000.0003,064,200,000.000866,180,000.00091,263,000.000504,810,000.000840,040,000.000533,820,000.000156,300,000.000466,450,000.000415,020,000.000662,800,000.0001,964,900,000.00089,326,000.000292,080,000.000534,910,000.000602,180,000.0001,741,500,000.000941,900,000.000829,820,000.000410,690,000.0001,062,800,000.000772,130,000.000656,540,000.000381,010,000.0001,200,700,000.000233,190,000.000662,210,000.000
20180307_QE1_nlc10_GP_QC_HeLa_2501,195,300,000.0001,925,300,000.000400,460,000.000404,550,000.0001,657,500,000.000NaN2,472,200,000.0001,094,200,000.000732,080,000.0002,715,900,000.000621,840,000.000567,820,000.000222,210,000.000625,840,000.000301,760,000.000475,050,000.000321,500,000.000NaN219,150,000.000198,160,000.0007,167,200,000.0008,823,000,000.000739,590,000.000847,290,000.000342,210,000.000216,400,000.0001,272,500,000.000399,140,000.000688,720,000.000184,300,000.0001,020,000,000.0003,846,000,000.000440,000,000.000590,060,000.000566,060,000.000NaN1,513,300,000.000244,940,000.000373,600,000.0001,688,500,000.000171,210,000.0001,545,600,000.000725,340,000.000182,160,000.0001,357,900,000.000570,870,000.000539,140,000.0003,863,500,000.000835,450,000.00055,858,000.000...242,130,000.000378,530,000.000825,710,000.0004,129,600,000.0005,201,600,000.000120,540,000.000533,460,000.000201,960,000.000856,300,000.0001,760,800,000.0001,664,000,000.000156,210,000.0003,589,700,000.000359,180,000.000101,060,000.000675,530,000.000548,740,000.000115,010,000.000641,680,000.000525,170,000.00061,357,000.000765,510,000.000NaN240,070,000.0001,597,200,000.000418,490,000.000195,360,000.000251,040,000.000853,200,000.000268,120,000.000205,370,000.000199,640,000.000217,750,000.000331,100,000.000771,670,000.000120,840,000.000714,590,000.00071,063,000.000108,690,000.0001,114,800,000.000265,310,000.000308,450,000.000223,450,000.000719,850,000.000646,430,000.000741,350,000.000306,250,000.000563,720,000.000134,650,000.000112,510,000.000
20180413_QE2_nLC1_MB_QC_HeLa_250_021,421,900,000.0003,293,300,000.000805,130,000.000782,910,000.0001,077,200,000.0001,237,900,000.0003,160,200,000.0001,920,400,000.0003,075,900,000.0004,426,300,000.0001,535,100,000.0001,285,700,000.000538,170,000.000977,760,000.000506,080,000.000256,860,000.000383,590,000.00016,276,000,000.000328,470,000.000367,310,000.0009,465,500,000.0008,751,200,000.0001,327,100,000.000908,240,000.0001,039,700,000.000378,180,000.0002,241,400,000.000579,510,000.0001,034,300,000.000272,610,000.0001,781,200,000.0003,405,600,000.000810,820,000.0001,410,200,000.0001,079,100,000.0004,404,000,000.0002,237,800,000.000311,630,000.000788,900,000.0004,712,400,000.000456,950,000.0002,603,000,000.0001,988,300,000.000588,530,000.0002,228,000,000.0001,106,800,000.000868,840,000.0002,572,300,000.0001,778,000,000.000104,720,000.000...329,460,000.000940,260,000.000989,330,000.0004,715,300,000.0002,282,900,000.000178,440,000.0001,047,600,000.000336,170,000.0001,167,900,000.0002,885,000,000.0001,788,600,000.000262,720,000.0005,587,900,000.000477,980,000.000241,500,000.0001,631,800,000.000940,950,000.00079,304,000.000843,730,000.000983,690,000.000106,420,000.000846,240,000.000932,960,000.000234,950,000.0002,288,200,000.000803,090,000.000296,970,000.000396,450,000.0001,200,300,000.000560,540,000.000391,520,000.000450,300,000.000378,890,000.000478,140,000.0001,336,000,000.000274,820,000.000501,910,000.0001,666,900,000.000572,540,000.0001,809,500,000.000530,300,000.000954,000,000.000812,200,000.000598,430,000.0001,164,300,000.0001,354,900,000.00062,762,000.0001,460,300,000.000281,560,000.000399,110,000.000
20190103_QE8_nLC0_LiNi_QC_MNT_15cm_Hela_023,149,100,000.0002,203,400,000.00071,513,000.0001,459,800,000.0001,930,000,000.000632,430,000.0005,167,000,000.0002,656,300,000.0002,748,100,000.0006,823,000,000.0001,008,400,000.000527,220,000.000393,800,000.000NaNNaN1,618,900,000.000NaN12,396,000,000.00076,376,000.00031,483,000.000710,910,000.00013,760,000,000.0001,842,000,000.000NaN110,950,000.000327,270,000.0001,243,500,000.000640,000,000.0001,740,800,000.000194,940,000.0004,127,300,000.0004,102,100,000.000NaN35,455,000.0001,481,000,000.0006,652,500,000.0003,723,300,000.000393,520,000.000867,450,000.000843,580,000.000166,760,000.0003,428,800,000.0001,462,000,000.000213,240,000.0002,838,100,000.0002,673,300,000.000978,540,000.0007,175,600,000.0003,016,300,000.000NaN...339,530,000.000166,360,000.000491,790,000.000NaNNaNNaN92,085,000.000448,260,000.0001,645,400,000.000174,670,000.0001,172,900,000.000317,180,000.000NaN503,120,000.000361,060,000.000119,750,000.000152,680,000.000521,140,000.000NaN349,610,000.000NaN1,205,400,000.000NaN660,570,000.0004,247,000,000.00070,908,000.000NaNNaNNaN764,590,000.000285,050,000.00057,243,000.000537,100,000.000NaN3,398,700,000.000NaN189,930,000.000NaNNaN1,908,300,000.000NaN193,710,000.000NaN1,075,400,000.000NaN658,260,000.000373,460,000.000857,530,000.000193,360,000.000NaN
20181016_QE10_nLC14_KS_QC_MNT_HeLa_15cm_011,238,300,000.0005,610,500,000.000316,930,000.000996,640,000.0003,351,200,000.0003,548,600,000.0005,249,200,000.0002,983,900,000.0001,737,200,000.0004,036,400,000.000974,900,000.000940,330,000.000531,460,000.0001,318,900,000.000606,430,000.000416,480,000.000731,480,000.00012,645,000,000.000472,070,000.000233,340,000.0005,204,600,000.00014,861,000,000.0001,574,400,000.000483,170,000.000256,110,000.000324,680,000.0001,896,100,000.000448,350,000.0001,815,600,000.000175,770,000.0001,871,700,000.0003,963,400,000.000604,720,000.000840,530,000.0001,067,500,000.0006,548,000,000.0001,385,200,000.000387,000,000.000666,550,000.0004,557,500,000.000560,900,000.000881,610,000.0001,184,400,000.000757,250,000.0001,761,700,000.0001,178,600,000.000226,820,000.0007,517,200,000.0001,670,300,000.000118,590,000.000...160,560,000.000857,450,000.000461,610,000.0005,412,300,000.0003,454,600,000.000141,290,000.000818,420,000.000367,580,000.0001,254,600,000.000990,720,000.000798,310,000.000183,350,000.0006,096,600,000.0001,024,500,000.000210,610,000.000736,950,000.000540,100,000.000286,810,000.0001,416,100,000.000772,540,000.000160,280,000.000878,430,000.000773,070,000.000522,690,000.0003,132,200,000.000500,800,000.000317,540,000.000266,060,000.0001,205,400,000.000414,500,000.000328,280,000.000231,710,000.000390,780,000.000577,290,000.0001,468,900,000.000316,630,000.0001,130,700,000.000978,800,000.000648,310,000.000929,050,000.0001,010,600,000.000252,610,000.000218,310,000.000678,100,000.000548,480,000.000989,260,000.000370,180,000.000729,110,000.000361,660,000.000410,430,000.000
20180711_QE10_nLC14_RS_QC_MNT_HeLa_02499,010,000.0002,253,300,000.000141,790,000.000508,900,000.000550,190,000.000499,670,000.0002,077,800,000.000697,760,000.000393,640,000.0002,106,900,000.000559,490,000.000315,310,000.000243,920,000.000193,190,000.000340,980,000.000248,790,000.000526,470,000.0003,501,600,000.000234,420,000.000179,450,000.0002,483,500,000.0008,510,400,000.000943,860,000.000313,530,000.000227,940,000.000181,800,000.000934,590,000.000228,710,000.000338,710,000.000110,600,000.0001,409,900,000.0001,098,900,000.000381,590,000.000497,700,000.000454,630,000.0001,994,600,000.000871,770,000.00081,480,000.000179,640,000.000728,070,000.000360,950,000.0001,166,100,000.000525,540,000.000396,230,000.0001,048,000,000.0001,162,800,000.000140,320,000.0003,385,100,000.0001,484,300,000.00085,883,000.000...91,176,000.000688,390,000.000216,750,000.0002,467,800,000.0001,523,200,000.000127,230,000.000281,920,000.000152,050,000.000724,370,000.000486,120,000.000357,010,000.000131,860,000.0002,531,700,000.000425,680,000.000237,380,000.000376,140,000.00059,747,000.000165,290,000.000845,160,000.000350,150,000.000163,970,000.000559,890,000.000428,150,000.000314,870,000.0002,072,100,000.000331,580,000.000169,860,000.000305,190,000.000371,780,000.000245,350,000.000172,980,000.000124,690,000.000171,230,000.000408,160,000.000896,540,000.000436,860,000.000396,020,000.000578,750,000.000277,300,000.000680,250,000.000682,000,000.000165,750,000.000145,320,000.000316,010,000.000258,690,000.000238,610,000.000133,660,000.000446,300,000.000142,190,000.000192,690,000.000
20190530_QE2_NLC1_GP_QC_MNT_HELA_01396,620,000.000322,060,000.00075,722,000.00073,915,000.000200,290,000.00071,269,000.000929,360,000.000326,530,000.000403,170,000.000713,800,000.000157,610,000.000106,020,000.00043,616,000.000119,040,000.000117,250,000.000144,650,000.00084,069,000.0001,951,500,000.00056,819,000.00028,166,000.000335,190,000.0001,485,800,000.000201,820,000.000164,070,000.000113,230,000.00057,270,000.000196,540,000.00095,254,000.000130,190,000.00029,423,000.000244,970,000.000727,250,000.000126,420,000.000193,760,000.000136,630,000.0001,110,700,000.000112,990,000.00037,144,000.000139,600,000.000589,850,000.00091,860,000.000305,940,000.000213,420,000.000177,410,000.000380,450,000.000365,920,000.00062,660,000.000555,660,000.000194,980,000.00027,477,000.000...57,186,000.00088,652,000.00061,249,000.0001,006,200,000.000352,180,000.00031,343,000.00067,251,000.00092,373,000.000186,540,000.000175,650,000.000245,530,000.00024,801,000.000325,350,000.00062,083,000.00041,239,000.000109,320,000.000147,600,000.00037,950,000.000219,350,000.00062,256,000.00032,070,000.000113,940,000.000173,640,000.00060,368,000.000277,740,000.000112,090,000.00026,231,000.00053,656,000.000142,940,000.00084,084,000.00027,560,000.00044,032,000.00056,535,000.00043,451,000.000265,490,000.00016,221,000.00042,583,000.000124,300,000.00020,395,000.000184,430,000.000113,620,000.000122,130,000.00044,283,000.000157,880,000.000130,350,000.00094,170,000.00070,228,000.000203,520,000.00055,411,000.00080,090,000.000
20160401_QE6_nLC6_ASD_QC_HELA_02_160414170220642,640,000.0001,640,900,000.000254,240,000.000402,560,000.000399,820,000.000706,400,000.0001,137,300,000.000747,360,000.000317,610,000.0001,820,900,000.000715,580,000.000752,430,000.000237,390,000.00047,627,000.000310,740,000.000567,150,000.000201,290,000.0005,786,200,000.00032,862,000.000104,990,000.0002,248,200,000.0005,504,500,000.0001,074,300,000.000104,250,000.000186,320,000.000187,940,000.0001,181,900,000.000201,470,000.000682,280,000.00023,892,000.000927,060,000.0001,752,100,000.000536,770,000.000499,050,000.000512,720,000.0001,564,900,000.000729,600,000.000172,170,000.000351,950,000.0001,052,400,000.000313,890,000.0002,148,900,000.000228,950,000.000186,320,000.0001,413,800,000.0001,208,700,000.000178,040,000.0001,420,900,000.0001,239,500,000.00041,001,000.000...78,029,000.000802,710,000.00083,655,000.0008,704,300.000768,360,000.00090,731,000.000454,890,000.000156,210,000.000637,690,000.000566,070,000.000482,230,000.000119,740,000.0003,292,400,000.000403,900,000.000160,170,000.000376,390,000.000193,900,000.00093,188,000.000665,110,000.000353,140,000.00088,372,000.000351,900,000.000471,740,000.00094,743,000.0002,128,200,000.000285,980,000.000159,250,000.000202,640,000.000756,730,000.000201,200,000.000159,710,000.000221,920,000.000140,860,000.000349,100,000.000471,940,000.000204,800,000.000386,680,000.000643,070,000.000250,670,000.000686,200,000.000650,860,000.00031,042,000.000170,940,000.000276,020,000.000389,080,000.000245,760,000.000110,700,000.000215,840,000.000137,840,000.000327,740,000.000
20170509_QE2_nlc10_BKH_QC_MNT_HeLa011,286,900,000.000792,040,000.000348,880,000.000255,970,000.000226,600,000.0001,141,900,000.0001,159,300,000.000668,740,000.000234,340,000.0001,582,400,000.000476,140,000.000425,820,000.000141,870,000.00018,350,000.000220,370,000.000457,960,000.000139,340,000.0005,661,800,000.000263,290,000.00084,332,000.0001,339,100,000.0006,712,300,000.000842,000,000.000160,330,000.000124,310,000.00048,907,000.0001,208,100,000.000221,410,000.000542,510,000.00023,878,000.000509,980,000.000843,980,000.000275,180,000.000310,300,000.000262,150,000.0001,596,100,000.000837,440,000.000163,100,000.000204,960,000.0001,109,600,000.000199,860,000.0001,561,000,000.000448,470,000.000203,890,000.000983,220,000.0001,143,300,000.00090,790,000.000335,440,000.000600,260,000.00033,043,000.000...59,991,000.000305,710,000.000108,420,000.0002,162,400,000.000809,980,000.00054,686,000.000481,380,000.00095,814,000.000398,060,000.000438,630,000.000288,610,000.00052,531,000.0002,309,600,000.000316,180,000.000119,270,000.000346,940,000.00010,416,000.00077,059,000.000588,100,000.000186,650,000.00046,661,000.000386,450,000.000318,970,000.00073,229,000.0001,616,300,000.000319,160,000.000160,160,000.000151,450,000.000489,050,000.000161,890,000.000124,520,000.000120,670,000.000248,000,000.000334,290,000.000390,400,000.000149,910,000.000231,630,000.000789,490,000.000293,440,000.000393,090,000.000388,750,000.00066,577,000.00093,201,000.000187,040,000.00055,440,000.000167,580,000.000100,310,000.000140,020,000.00084,219,000.000NaN
20180601_QE8_nLC5_JM_QC_MNT_HeLa_7506,150,000.0001,852,700,000.000272,890,000.000601,260,000.000619,760,000.000864,770,000.0002,081,900,000.0001,527,500,000.000439,170,000.0001,316,400,000.000480,990,000.000604,480,000.000279,380,000.000524,820,000.000191,540,000.00077,975,000.000359,930,000.0006,833,100,000.00070,627,000.000122,070,000.0001,845,900,000.0004,855,500,000.000NaN193,730,000.000139,990,000.00088,478,000.000835,280,000.000183,130,000.000779,950,000.00091,652,000.000391,760,000.0001,235,500,000.000363,950,000.000323,680,000.000483,990,000.0002,235,300,000.0001,080,800,000.000146,310,000.000236,660,000.000755,400,000.000133,940,000.0001,444,100,000.000NaN297,760,000.000503,830,000.000283,210,000.000141,870,000.000771,810,000.000665,910,000.00096,771,000.000...83,443,000.000327,320,000.000251,150,000.0002,503,800,000.0001,391,900,000.00098,016,000.000379,390,000.000187,740,000.000509,410,000.000452,460,000.000357,550,000.00080,368,000.0002,585,500,000.000516,910,000.000109,810,000.000247,510,000.000185,270,000.000189,510,000.000518,360,000.000326,130,000.00088,715,000.000489,770,000.000232,100,000.000226,770,000.0001,166,000,000.000149,240,000.000137,330,000.000139,910,000.000449,430,000.000190,240,000.000136,250,000.000100,590,000.00088,725,000.000142,750,000.000564,210,000.000236,610,000.000248,830,000.000181,540,000.000256,000,000.000426,320,000.000382,690,000.00091,954,000.00090,708,000.000327,510,000.000593,010,000.000595,470,000.000213,300,000.000380,440,000.000131,930,000.000113,270,000.000
20180813_QE4_LC6_MR_QC_MNT_2newHeLa_06928,030,000.0003,708,700,000.00051,508,000.000686,140,000.0001,061,800,000.0001,021,400,000.0003,704,100,000.0002,041,200,000.0001,300,100,000.0004,552,100,000.0001,096,300,000.000900,600,000.000462,560,000.000NaN662,230,000.000843,700,000.000380,920,000.00011,138,000,000.000315,870,000.000286,290,000.0002,891,300,000.00011,942,000,000.0001,083,400,000.000449,160,000.000449,270,000.000320,630,000.0002,338,200,000.000326,780,000.0001,107,100,000.000188,040,000.0001,315,700,000.0003,087,700,000.000731,300,000.000979,350,000.0001,054,900,000.0004,245,400,000.0001,363,500,000.000349,240,000.000535,980,000.0004,424,000,000.000595,210,000.0002,347,300,000.000888,470,000.000569,940,000.0001,987,600,000.0001,449,000,000.000276,630,000.0004,838,400,000.000958,190,000.000107,190,000.000...151,130,000.0001,041,000,000.000414,370,000.0003,796,600,000.0003,021,900,000.000104,690,000.000840,500,000.000339,250,000.0001,207,300,000.0001,090,200,000.000831,280,000.000162,990,000.0004,453,900,000.000983,090,000.000214,650,000.000858,400,000.000874,240,000.000143,040,000.000521,780,000.000743,610,000.000146,060,000.000685,540,000.0001,033,000,000.000351,580,000.0003,187,400,000.00047,692,000.000240,100,000.000365,010,000.000996,990,000.000349,230,000.000333,980,000.000286,780,000.000351,570,000.000493,880,000.0001,095,700,000.000343,800,000.000362,850,000.0001,201,500,000.000698,760,000.0001,008,100,000.000NaN182,020,000.000210,690,000.000571,530,000.000530,240,000.000901,750,000.000349,120,000.000846,250,000.000390,180,000.000485,390,000.000
20180518_QE4_nLC6_MR_QC_MNT_HeLa_34708,070,000.0001,986,000,000.000371,780,000.000567,700,000.000680,990,000.0001,172,000,000.0002,345,200,000.0001,754,400,000.0001,252,600,000.0003,266,100,000.000593,110,000.000480,830,000.000280,170,000.000692,440,000.000302,820,000.000251,240,000.000143,340,000.0009,832,900,000.000170,740,000.000129,740,000.0005,766,600,000.0008,582,300,000.000550,890,000.000569,480,000.000577,460,000.000229,540,000.000801,850,000.000261,230,000.000626,530,000.000126,480,000.0001,381,800,000.0001,660,400,000.000501,980,000.000446,390,000.000498,870,000.0002,716,700,000.0001,979,000,000.000154,490,000.000233,360,000.0002,011,600,000.000291,580,000.0002,645,900,000.000562,750,000.000265,090,000.0001,300,000,000.000761,850,000.000291,060,000.0003,951,900,000.000723,120,000.00055,051,000.000...31,402,000.000673,080,000.000409,320,000.0002,823,200,000.0005,932,400,000.00065,115,000.000430,950,000.000201,960,000.000840,190,000.0001,005,100,000.0001,068,100,000.000136,490,000.0002,819,900,000.000395,000,000.000130,500,000.000425,780,000.000320,520,000.000112,610,000.000566,790,000.000554,210,000.00050,409,000.000528,380,000.000571,520,000.000146,540,000.000NaN343,250,000.000162,690,000.000167,790,000.000854,640,000.000170,640,000.000152,300,000.000152,920,000.000219,380,000.000467,870,000.000587,300,000.000181,180,000.000326,840,000.000144,210,000.000259,680,000.000466,680,000.000596,720,000.000305,250,000.000173,440,000.000350,160,000.000593,140,000.000757,250,000.000227,890,000.000483,650,000.000182,040,000.000256,180,000.000
..................................................................................................................................................................................................................................................................................................................
20150619_QE1_UPLC_BTW_QC_MNT_HELA011,139,600,000.000495,650,000.000163,260,000.000139,090,000.000882,620,000.000NaN1,726,000,000.0001,204,400,000.0001,710,300,000.0001,749,100,000.000206,160,000.00097,986,000.000196,680,000.00024,819,000.000331,370,000.000574,400,000.000124,000,000.0005,360,200,000.00016,276,000.00061,690,000.0002,209,800,000.0005,669,400,000.000944,020,000.000306,850,000.000350,900,000.000314,510,000.000275,130,000.000194,310,000.000679,340,000.000131,510,000.000828,320,000.0001,941,000,000.000311,320,000.000462,210,000.000583,410,000.0001,417,200,000.000510,410,000.000232,330,000.000134,130,000.0001,346,500,000.000156,780,000.0002,520,100,000.000424,900,000.000496,500,000.0001,210,000,000.000754,650,000.000159,600,000.0002,397,700,000.000873,130,000.00084,552,000.000...164,760,000.000384,590,000.000282,140,000.0001,640,600,000.0003,540,000,000.000NaN395,260,000.000136,270,000.000508,750,000.000992,480,000.00050,196,000.00043,280,000.000955,520,000.000180,680,000.000106,070,000.000472,000,000.000187,560,000.000219,180,000.000527,200,000.000163,840,000.00071,624,000.00098,617,000.000286,210,000.00069,378,000.000770,640,000.000201,130,000.00065,194,000.00029,953,000.000439,280,000.000213,160,000.000102,390,000.000217,520,000.000223,530,000.000175,580,000.00084,358,000.000110,820,000.00020,459,000.000162,530,000.000163,140,000.000413,410,000.000351,800,000.000393,960,000.000307,040,000.000189,720,000.000270,540,000.000291,610,000.000117,450,000.000174,250,000.00026,459,000.000342,170,000.000
20180427_QE9_nLC14_FaCo_QC_MNT_Hela_50cm_2,5ul_2499,610,000.000586,330,000.00068,463,000.000344,140,000.000557,170,000.000664,800,000.0001,203,600,000.000516,600,000.000501,160,000.0002,170,900,000.000464,110,000.000211,530,000.000145,340,000.000189,130,000.000227,910,000.000157,780,000.000210,970,000.0005,246,000,000.000234,190,000.000150,620,000.000771,290,000.0008,674,200,000.000905,600,000.000493,190,000.000145,110,000.00098,770,000.000702,600,000.000259,810,000.000198,630,000.00051,305,000.000427,810,000.000644,520,000.000342,750,000.000377,610,000.000570,870,000.0003,144,300,000.000750,110,000.00060,400,000.000137,520,000.0001,252,100,000.000156,490,000.0001,658,500,000.000591,550,000.000463,440,000.0001,133,100,000.000785,470,000.000181,850,000.000944,600,000.000311,500,000.00088,557,000.000...49,880,000.000623,180,000.000194,410,000.0002,167,600,000.00038,689,000.00068,295,000.00071,226,000.00046,623,000.000612,680,000.000522,130,000.000486,660,000.00091,995,000.0002,364,100,000.000421,540,000.000184,490,000.000330,660,000.000139,170,000.000177,960,000.000793,020,000.000167,150,000.000196,270,000.000569,290,000.000474,600,000.000266,320,000.0001,583,900,000.000303,890,000.000151,110,000.000108,970,000.000454,410,000.000237,840,000.000162,690,000.00078,556,000.000NaN105,760,000.000695,760,000.000475,450,000.000435,100,000.000495,590,000.000448,910,000.000601,330,000.000399,930,000.000132,690,000.000145,760,000.000242,430,000.00043,591,000.00077,826,000.00027,784,000.000234,320,000.000NaN231,320,000.000
20161111_QE6_nLC4_SCL_QC_HeLa_061,161,600,000.0002,541,500,000.000439,210,000.000463,030,000.0001,023,000,000.0002,139,600,000.0001,827,400,000.0001,193,200,000.000676,140,000.00011,687,000.000759,970,000.000439,660,000.000276,580,000.000818,480,000.000259,160,000.000676,620,000.000246,850,000.0006,852,600,000.000197,400,000.000185,290,000.000562,990,000.0006,822,500,000.000358,270,000.000194,990,000.000254,520,000.000174,540,000.0001,244,200,000.000204,020,000.000965,160,000.00065,191,000.000930,320,000.0002,510,100,000.000311,350,000.000599,770,000.000652,830,000.0002,351,800,000.0001,345,300,000.000207,060,000.000439,740,000.0001,876,500,000.000242,090,000.0002,472,500,000.000454,780,000.000290,480,000.0001,368,800,000.000942,260,000.000196,290,000.000709,990,000.000884,260,000.00036,725,000.000...70,091,000.000246,930,000.000118,900,000.0002,506,100,000.000488,320,000.00085,128,000.000727,820,000.000201,560,000.000776,300,000.000665,010,000.000505,280,000.00092,481,000.0002,877,200,000.000402,970,000.000118,750,000.000380,750,000.000181,670,000.00093,525,000.000608,140,000.000424,820,000.00070,983,000.000611,390,000.000450,750,000.000201,910,000.0002,909,300,000.000313,590,000.000150,350,000.000263,500,000.000582,830,000.000140,390,000.000201,400,000.000146,620,000.000445,070,000.000840,960,000.000447,050,000.000101,330,000.0001,079,700,000.0001,176,400,000.000136,340,000.000837,170,000.000609,310,000.00052,019,000.00089,408,000.000309,690,000.000789,510,000.000279,640,000.000191,690,000.000178,400,000.000176,450,000.000239,390,000.000
20190607_QX4_JiYu_MA_HeLa_500ng3,195,800,000.0003,090,300,000.0001,679,000,000.000NaN4,487,000,000.0006,633,100,000.0002,635,500,000.0001,505,400,000.000138,080,000.00010,876,000,000.0003,256,600,000.0002,138,900,000.000704,530,000.000389,200,000.0001,448,700,000.0002,838,800,000.000418,730,000.00023,496,000,000.000869,930,000.000914,410,000.0003,375,800,000.00020,309,000,000.0001,695,600,000.000783,310,000.000169,150,000.000636,510,000.0003,253,300,000.0001,101,500,000.0001,635,800,000.000448,600,000.0002,092,900,000.0008,271,800,000.0002,729,600,000.0002,746,000,000.0001,759,000,000.0009,938,100,000.0003,662,300,000.000843,470,000.0001,595,200,000.0004,463,300,000.000415,750,000.0002,073,100,000.0002,067,300,000.000730,770,000.0006,174,600,000.0002,981,700,000.000593,710,000.0003,994,000,000.0003,962,800,000.000227,910,000.000...231,370,000.0003,426,100,000.000159,240,000.0003,103,800,000.0005,739,400,000.000677,560,000.000813,280,000.000636,160,000.0003,252,100,000.000103,830,000.000986,860,000.000421,870,000.0009,442,400,000.0002,200,100,000.0001,668,800,000.0002,132,100,000.0003,826,900,000.000741,710,000.0001,630,700,000.0001,820,700,000.000456,380,000.000387,070,000.0001,839,800,000.0001,314,800,000.0005,414,700,000.0001,531,900,000.0001,935,000,000.000605,400,000.0002,923,200,000.000856,220,000.000486,400,000.000907,560,000.0002,790,600,000.0001,251,800,000.00023,855,000.0001,951,100,000.0001,784,300,000.0004,081,200,000.000638,910,000.000276,770,000.0003,243,200,000.000535,960,000.0001,133,500,000.0001,757,500,000.0002,134,800,000.0001,190,900,000.000803,690,000.0001,644,400,000.0001,316,600,000.000141,390,000.000
20180812_QE9_nLC02_FaCo_QC_Hela_50cm_011,182,300,000.000390,770,000.000407,310,000.000681,450,000.0001,229,600,000.0001,839,000,000.0004,355,200,000.0001,952,200,000.0001,230,100,000.0003,578,100,000.000983,930,000.000529,650,000.000433,430,000.000166,760,000.000581,200,000.000786,680,000.000680,310,000.00010,860,000,000.000521,870,000.000259,090,000.0003,581,500,000.0009,811,300,000.0001,179,700,000.000495,810,000.000250,490,000.000284,070,000.0001,918,000,000.000483,090,000.000619,590,000.000140,650,000.0001,276,500,000.0001,902,000,000.0001,164,600,000.000692,170,000.000735,520,000.0006,268,700,000.000223,740,000.000215,990,000.000184,640,000.0002,909,700,000.000532,520,000.0002,994,300,000.0001,125,400,000.000655,510,000.0001,690,200,000.0001,072,500,000.000304,420,000.0002,138,300,000.000882,120,000.000190,360,000.000...123,030,000.0001,358,400,000.000286,610,000.0003,359,800,000.0002,807,000,000.000146,740,000.000433,940,000.000261,940,000.0001,100,200,000.0001,006,200,000.000548,800,000.000191,650,000.0004,287,800,000.000806,650,000.000273,980,000.0001,203,700,000.000465,940,000.000288,700,000.0001,205,000,000.000690,900,000.000265,810,000.000932,560,000.000743,140,000.000492,370,000.0003,701,300,000.000567,820,000.000298,110,000.000149,750,000.0001,049,900,000.000303,380,000.000241,680,000.00068,874,000.000356,030,000.000431,790,000.0001,189,700,000.000463,200,000.000777,070,000.000926,000,000.000342,860,000.000781,690,000.000618,680,000.000229,460,000.000175,250,000.000524,190,000.000755,530,000.000737,950,000.000244,100,000.000849,740,000.000342,250,000.00042,221,000.000
20170808_QE7_nLC11_MEM_QC_MNT_HeLa_02579,360,000.0001,690,200,000.000259,710,000.000458,000,000.0001,507,500,000.0001,059,300,000.0003,101,000,000.000834,030,000.0002,081,600,000.0002,232,000,000.000716,500,000.000626,350,000.000192,820,000.00056,879,000.000313,250,000.000535,070,000.000344,890,000.0008,800,100,000.00032,016,000.000152,330,000.0006,589,400,000.0006,605,600,000.000638,880,000.000468,060,000.000342,890,000.000203,330,000.000773,070,000.000243,230,000.000475,530,000.000157,010,000.0001,444,600,000.0002,166,100,000.000412,820,000.000491,930,000.000539,860,000.0002,854,000,000.0001,113,900,000.000118,150,000.000378,260,000.0001,912,900,000.000131,100,000.000979,980,000.000624,470,000.000282,170,000.0001,364,200,000.000654,660,000.000326,010,000.0002,863,600,000.0001,268,000,000.00096,547,000.000...223,840,000.000599,890,000.000688,880,000.0002,470,200,000.0001,897,800,000.00079,382,000.000378,400,000.000168,910,000.000545,050,000.0001,154,800,000.0001,144,000,000.000156,050,000.0002,882,300,000.000NaNNaN387,580,000.000348,990,000.00024,983,000.000545,320,000.000377,380,000.00068,044,000.000NaN459,460,000.000174,260,000.0001,074,000,000.000188,440,000.000209,120,000.000104,020,000.000591,890,000.000251,440,000.000194,720,000.000164,530,000.000157,140,000.00030,683,000.00021,617,000.000149,380,000.000336,790,000.000297,090,000.000169,710,000.000751,650,000.000261,600,000.000309,840,000.000346,980,000.000450,220,000.000511,480,000.000765,440,000.000328,010,000.000599,030,000.000174,020,000.000NaN
20180822_QE7_nLC7_AL_QC_HeLa_03_20180826033504848,800,000.0006,180,300,000.000NaN690,820,000.000775,950,000.000827,280,000.0003,516,900,000.0002,008,400,000.0001,575,300,000.0004,461,400,000.000695,110,000.000721,930,000.000360,190,000.0001,204,500,000.000360,220,000.000913,400,000.000172,280,000.0005,040,000,000.000133,100,000.00014,783,000.0003,188,600,000.0007,764,800,000.000819,340,000.00077,255,000.000158,940,000.000181,410,000.000319,400,000.00076,177,000.000914,730,000.00036,715,000.000707,820,000.0001,828,800,000.000921,060,000.0001,152,500,000.0001,017,200,000.0002,539,000,000.0001,657,900,000.000376,000,000.000377,670,000.0001,062,300,000.000471,380,000.0002,699,200,000.000222,920,000.000352,220,000.0002,409,800,000.000718,030,000.00064,164,000.0002,274,400,000.0001,060,500,000.00087,769,000.000...53,683,000.000426,460,000.000234,120,000.0001,867,500,000.0001,018,200,000.00085,858,000.000515,310,000.000291,910,000.0001,241,300,000.000301,000,000.000445,840,000.00082,363,000.0001,568,400,000.000661,080,000.000130,220,000.000210,230,000.000412,660,000.000209,470,000.000547,100,000.000252,100,000.000125,700,000.000NaNNaN382,990,000.000532,870,000.000213,220,000.000103,110,000.000140,700,000.000344,960,000.00036,488,000.000495,960,000.00082,557,000.000118,410,000.000181,860,000.0001,519,000,000.00091,685,000.000533,020,000.000509,410,000.000110,030,000.000475,380,000.000631,950,000.000234,200,000.000NaN517,120,000.000422,810,000.000532,170,000.000191,460,000.000295,150,000.000185,970,000.000493,090,000.000
20190604_QE10_nLC13_LiNi_QC_MNT_15cm_HeLa_02990,650,000.000991,840,000.000186,350,000.000407,450,000.000828,060,000.000422,350,000.0002,583,500,000.0001,483,000,000.0001,029,600,000.0002,294,900,000.000359,630,000.000298,450,000.000147,720,000.000512,480,000.000354,520,000.000766,050,000.000214,740,000.0009,036,200,000.000399,650,000.00082,758,000.0002,638,700,000.0008,799,700,000.000832,470,000.000702,050,000.000204,500,000.000150,910,000.000689,760,000.000312,820,000.000453,720,000.000124,690,000.0001,275,200,000.0001,478,200,000.000839,120,000.000539,480,000.000342,810,000.0003,028,600,000.000705,740,000.00092,918,000.000313,530,000.0002,223,000,000.000317,090,000.0002,008,700,000.000766,620,000.000513,590,000.0001,622,600,000.0001,539,900,000.000335,340,000.0002,035,300,000.0001,200,900,000.00066,880,000.000...172,530,000.000859,550,000.000322,280,000.0003,119,300,000.0002,672,000,000.000182,160,000.000391,210,000.000208,180,000.000902,970,000.0001,076,000,000.000543,290,000.000112,840,000.0001,080,100,000.000147,160,000.000304,810,000.000881,200,000.000397,460,000.000248,420,000.000819,200,000.000214,460,000.000114,220,000.000513,930,000.000623,310,000.000196,000,000.0002,080,000,000.00053,993,000.00062,614,000.000NaN480,100,000.000303,360,000.000100,090,000.000245,730,000.000236,980,000.000676,200,000.000914,770,000.000158,660,000.000202,870,000.000444,670,000.000192,650,000.000609,020,000.000585,050,000.000712,380,000.000359,550,000.000326,650,000.000459,340,000.000360,050,000.000199,070,000.000639,420,000.000133,710,000.000399,000,000.000
20180131_QE1_nlc10_TW_QC_HeLa_11,866,100,000.0002,386,300,000.000541,290,000.000464,790,000.0002,532,800,000.0002,176,900,000.0003,321,300,000.0001,175,600,000.0003,078,000,000.0004,674,000,000.0001,096,700,000.000920,180,000.000417,330,000.000743,380,000.000421,860,000.000486,970,000.000322,930,000.00017,063,000,000.000260,170,000.000397,620,000.0002,670,700,000.00010,580,000,000.00013,361,000.0001,796,900,000.000773,680,000.000494,030,000.0001,106,700,000.000479,940,000.000548,080,000.000316,770,000.0001,147,700,000.0002,913,300,000.000519,590,000.0001,540,200,000.000721,530,000.0005,530,000,000.0001,337,000,000.000271,060,000.000795,050,000.0003,768,100,000.000529,710,000.0001,751,800,000.0001,594,200,000.000723,330,000.0002,734,300,000.0001,374,300,000.000670,090,000.0002,453,300,000.000985,300,000.00059,624,000.000...230,050,000.000699,900,000.000581,890,000.0003,250,700,000.0002,448,200,000.000183,890,000.0001,138,000,000.000333,930,000.0001,229,000,000.0002,002,600,000.0001,782,300,000.000174,660,000.0004,611,700,000.000532,020,000.000171,380,000.0001,359,800,000.0001,099,400,000.000184,580,000.000931,800,000.000645,540,000.00083,631,000.000832,400,000.000938,820,000.000220,920,000.0003,425,300,000.000838,350,000.000226,280,000.000425,940,000.0001,428,900,000.000311,510,000.000271,340,000.000375,970,000.000506,940,000.000586,420,000.0001,414,900,000.000237,670,000.000751,710,000.000424,950,000.000709,600,000.0001,641,100,000.000581,270,000.000589,300,000.000415,380,000.000724,400,000.000887,530,000.0001,433,200,000.00067,046,000.0001,069,800,000.000205,290,000.000371,180,000.000
20180529_QE9_nLC13_QC_ASD_Hela4,140,300,000.0005,678,800,000.000888,310,000.0001,669,500,000.0002,690,500,000.0004,521,200,000.0008,971,600,000.0004,944,400,000.0002,524,100,000.0006,875,200,000.0002,217,800,000.0001,524,200,000.000899,930,000.0001,943,200,000.000959,920,000.0001,861,100,000.0001,414,900,000.00019,237,000,000.0001,779,800,000.000718,330,000.0009,007,300,000.00018,514,000,000.0003,075,600,000.000678,460,000.000773,210,000.000642,540,000.0002,586,100,000.000806,210,000.0001,955,000,000.000314,060,000.0002,880,200,000.0005,425,600,000.0001,260,500,000.0001,848,100,000.0001,649,300,000.0009,189,100,000.0002,416,700,000.000794,590,000.0001,205,300,000.0005,362,300,000.0001,255,200,000.0001,505,400,000.0002,435,700,000.0001,351,700,000.0003,249,500,000.0002,897,800,000.000506,430,000.0006,748,200,000.0003,617,300,000.000481,450,000.000...371,510,000.0002,111,800,000.000865,390,000.0007,881,500,000.0007,287,700,000.000374,780,000.0002,253,500,000.000887,860,000.0002,585,400,000.0001,714,600,000.0001,914,400,000.000380,490,000.0007,314,300,000.0001,784,800,000.000554,880,000.000286,680,000.000740,780,000.000464,570,000.0003,719,000,000.0001,754,900,000.000284,620,000.0002,535,700,000.0001,840,800,000.000828,410,000.0005,089,700,000.0001,376,300,000.0001,000,100,000.000815,610,000.0002,487,100,000.0001,257,700,000.000559,490,000.000431,470,000.000805,680,000.0001,784,400,000.0002,579,900,000.000567,270,000.0001,084,100,000.0001,952,800,000.0001,267,800,000.0001,574,900,000.0002,074,300,000.0001,145,100,000.000554,680,000.0001,366,900,000.0002,420,200,000.0002,068,300,000.000905,000,000.0001,355,300,000.000818,890,000.000825,010,000.000
20160809_QE1_nlc2_MB_QC_Hela1680,870,000.000746,850,000.000528,910,000.000323,580,000.000221,150,000.0001,027,100,000.0002,118,800,000.000680,960,000.000267,410,000.0001,623,200,000.000649,120,000.000463,180,000.000271,590,000.000732,530,000.000245,810,000.000468,720,000.000116,480,000.0004,281,100,000.000125,860,000.00060,419,000.0001,035,000,000.0003,042,900,000.000481,030,000.00096,983,000.00092,001,000.000144,480,000.000639,500,000.000169,120,000.000314,880,000.00035,910,000.000428,170,000.0001,521,900,000.000289,390,000.000367,670,000.000517,110,000.0002,937,100,000.000752,970,000.000118,650,000.000309,480,000.0001,148,000,000.000174,030,000.0001,184,300,000.000361,020,000.000119,630,000.000815,010,000.000653,960,000.000148,590,000.000324,420,000.00085,457,000.000116,030,000.000...62,307,000.000358,500,000.000126,740,000.0001,322,200,000.000469,570,000.000117,210,000.000696,530,000.000154,480,000.000375,610,000.000355,940,000.000649,690,000.00061,704,000.0003,047,900,000.000238,740,000.00089,794,000.000223,020,000.00061,645,000.00069,401,000.000483,380,000.000559,300,000.00067,966,000.000454,140,000.000367,690,000.000186,050,000.0001,100,900,000.000189,890,000.000140,780,000.000113,890,000.000503,320,000.000161,490,000.000173,030,000.000NaN226,590,000.000145,400,000.000570,380,000.000126,030,000.000289,200,000.000400,940,000.00069,549,000.000996,660,000.000269,650,000.00029,535,000.00067,927,000.000326,680,000.000455,560,000.000245,990,000.000104,550,000.000167,210,000.000167,020,000.000147,580,000.000
20180427_QE2_nLC1_GP_QC_MNT_HeLa_250_011,925,300,000.0001,313,200,000.000384,280,000.000456,150,000.000519,800,000.000887,210,000.0002,750,100,000.0001,428,400,000.0001,770,700,000.0002,643,300,000.000897,860,000.000474,390,000.000238,010,000.000540,680,000.000181,330,000.000550,790,000.000150,790,000.00010,364,000,000.000204,880,000.00079,986,000.0001,313,800,000.0004,937,600,000.000780,870,000.000411,240,000.000178,760,000.000124,020,000.000656,720,000.000140,010,000.000440,760,000.000116,750,000.000960,960,000.0002,174,100,000.000389,080,000.000552,360,000.000836,900,000.0002,626,000,000.0001,815,400,000.000126,830,000.000310,520,000.0001,684,400,000.000131,210,000.0001,728,400,000.000860,120,000.000221,470,000.0001,364,000,000.000494,420,000.000322,940,000.0003,767,100,000.000882,800,000.00053,662,000.000...159,340,000.000460,500,000.000444,070,000.0002,728,800,000.0002,376,400,000.00044,645,000.000462,380,000.000201,900,000.000431,880,000.000949,890,000.000782,010,000.00085,566,000.0003,151,900,000.000223,250,000.00085,386,000.000355,720,000.000450,820,000.00090,110,000.000533,350,000.00033,854,000.00072,593,000.000554,750,000.000385,940,000.000179,040,000.0001,093,400,000.000209,490,000.000157,920,000.000161,170,000.000555,570,000.000147,640,000.000196,300,000.000171,080,000.000143,800,000.000419,140,000.000725,540,000.000118,560,000.000319,960,000.000282,360,000.00071,862,000.000765,170,000.000193,810,000.000262,830,000.000301,830,000.000264,970,000.000421,620,000.000558,680,000.000263,760,000.000542,070,000.000101,690,000.000192,770,000.000
20190701_QE4_LC12_IAH_QC_MNT_HeLa_021,737,700,000.0004,570,900,000.000536,730,000.000369,860,000.0001,542,200,000.000423,840,000.0007,232,600,000.0002,820,200,000.0003,148,500,000.0006,539,100,000.0001,642,100,000.000563,240,000.000336,320,000.0001,723,400,000.000939,890,000.0001,628,100,000.000521,670,000.00017,776,000,000.000773,250,000.000148,400,000.0006,518,600,000.00016,288,000,000.0001,789,600,000.0001,844,200,000.000971,510,000.000424,160,000.0001,350,000,000.000489,910,000.0001,527,300,000.000314,660,000.0002,163,400,000.0005,990,400,000.0001,367,600,000.0001,743,300,000.0001,429,500,000.0008,352,500,000.0001,992,600,000.000465,080,000.00065,549,000.0006,696,400,000.000902,990,000.0004,770,100,000.0001,493,700,000.0001,023,800,000.0003,021,500,000.0002,531,200,000.000704,410,000.00016,697,000,000.0002,254,800,000.000105,870,000.000...390,450,000.0002,565,500,000.000486,850,000.0006,478,400,000.000NaN196,040,000.0001,018,700,000.000471,040,000.0001,780,500,000.0002,875,900,000.0001,484,500,000.000276,620,000.0003,429,300,000.000507,350,000.000381,600,000.0002,129,700,000.0001,221,000,000.000259,000,000.0001,823,000,000.000347,420,000.000157,280,000.000939,750,000.0002,271,100,000.000469,270,000.0004,077,400,000.00066,826,000.000129,830,000.000573,770,000.0001,540,700,000.000629,470,000.000183,840,000.000564,080,000.000427,860,000.00064,565,000.0002,160,100,000.000189,120,000.000387,970,000.000810,990,000.000976,980,000.0002,159,000,000.000935,780,000.0001,130,000,000.000600,610,000.0001,034,200,000.0001,010,700,000.000862,560,000.000413,650,000.0001,500,500,000.000354,690,000.000991,000,000.000
20190821_QE8_nLC14_ASD_QC_MNT_HeLa_43,997,000,000.0003,587,800,000.0001,908,400,000.0001,587,500,000.0007,903,400,000.00014,233,000,000.0003,056,500,000.0003,151,700,000.000148,430,000.00015,386,000,000.0002,675,200,000.0001,587,300,000.000746,200,000.000NaN1,631,000,000.0007,588,900,000.000624,070,000.00025,318,000,000.000965,780,000.0001,059,900,000.0006,741,900,000.00025,993,000,000.0003,647,400,000.000575,140,000.000292,450,000.000459,030,000.0002,421,200,000.0001,339,100,000.0003,424,700,000.000391,430,000.000997,020,000.00012,049,000,000.0002,362,100,000.0002,184,800,000.0003,274,900,000.00013,292,000,000.0006,374,700,000.000988,980,000.0002,005,300,000.0003,609,000,000.000603,290,000.0005,514,400,000.0002,988,500,000.0001,557,300,000.0007,565,800,000.0003,699,400,000.000570,160,000.0001,308,600,000.0005,131,700,000.000232,830,000.000...232,450,000.0003,401,900,000.000190,660,000.0005,460,800,000.0005,393,700,000.000550,600,000.000462,650,000.000573,850,000.0003,515,000,000.000118,150,000.000870,400,000.000678,810,000.0009,947,900,000.0002,385,200,000.0001,520,800,000.0002,408,800,000.0001,907,900,000.000652,040,000.0001,964,500,000.0002,092,100,000.00029,336,000.000435,130,000.0002,018,000,000.0001,683,800,000.0008,161,300,000.0001,617,700,000.0002,588,500,000.000129,320,000.0003,189,100,000.000948,350,000.000463,130,000.000674,680,000.0001,587,000,000.0001,136,000,000.000586,460,000.0001,715,100,000.0003,110,200,000.0004,427,500,000.0001,387,000,000.000292,830,000.0003,103,300,000.000366,750,000.0001,039,400,000.0001,747,800,000.0001,742,000,000.000104,780,000.000868,520,000.0002,327,200,000.0001,029,200,000.000831,320,000.000
20191028_QX3_LiSc_MA_Hela_500ng_LC15_13,885,000,000.0003,379,700,000.000292,270,000.0002,134,500,000.0007,589,700,000.00010,846,000,000.0009,238,500,000.000764,970,000.000239,100,000.00017,170,000,000.0004,077,300,000.0004,165,000,000.000925,600,000.000828,060,000.0002,661,400,000.00087,183,000.0001,253,200,000.00030,693,000,000.000306,340,000.0001,820,800,000.0006,113,200,000.00027,014,000,000.0004,056,800,000.00093,257,000.000269,340,000.000543,950,000.0003,111,700,000.000NaNNaN788,430,000.0001,635,600,000.0001,548,600,000.0004,639,400,000.0004,576,400,000.0003,098,500,000.00013,577,000,000.000771,330,000.0001,695,700,000.0002,405,400,000.0004,251,500,000.0002,368,700,000.0003,198,900,000.0003,651,300,000.0001,545,100,000.0007,252,500,000.0005,061,700,000.0001,036,300,000.00016,418,000,000.0005,053,500,000.000574,380,000.000...644,270,000.0004,442,000,000.000253,100,000.0007,092,000,000.0007,122,800,000.0001,046,900,000.000854,110,000.000766,060,000.0005,188,500,000.000124,530,000.0001,872,700,000.000940,030,000.00011,206,000,000.0003,701,700,000.0002,001,800,000.0003,273,900,000.0004,675,800,000.000904,450,000.0001,915,900,000.0002,534,900,000.000460,220,000.000512,860,000.0002,554,900,000.0001,515,900,000.0006,054,700,000.0002,276,500,000.0002,813,200,000.0001,079,200,000.0004,016,500,000.000903,660,000.000632,630,000.0001,316,200,000.0004,082,700,000.0002,819,900,000.0001,067,900,000.0002,298,100,000.0002,751,800,000.0005,743,700,000.0001,380,500,000.000NaN4,048,100,000.000517,170,000.000157,810,000.0002,928,800,000.0003,124,200,000.0002,215,300,000.0001,111,100,000.0002,893,900,000.0002,000,300,000.0001,279,100,000.000
-

7813 rows × 1000 columns

-
def get_sorted_not_missing(X: pd.DataFrame):
    """Build a 0/1 observation indicator of ``X`` with columns ordered by completeness.

    Every entry of the result is 1 where the corresponding entry of ``X`` is
    not NA and 0 where it is NA. Columns are arranged in ascending order of
    their share of observed rows, so the variables shared by the fewest
    observations come first and the most complete ones come last.

    Parameters
    ----------
    X : pd.DataFrame
        Data that may contain missing values. It is not modified.

    Returns
    -------
    pd.DataFrame
        Integer indicator frame with the same index as ``X`` and the columns
        of ``X`` reordered from least to most complete.
    """
    observed = X.notna().astype(int)
    # The column mean of a 0/1 indicator is the fraction of rows observed.
    completeness = observed.mean()
    column_order = completeness.sort_values().index
    return observed[column_order]
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
EHALLAYTLGVKTFIAIKPDGVQRDLYANTVLSGGTTMYPGIADRIIPGFMCQGGDFTRSYELPDGQVITIGNERVAPEEHPVLLTEAPLNPKEITALAPSTMKIISNASCTTNCLAPLAKHQGVMVGMGQKTVTAMDVVYALK
count7,813.0007,813.0007,813.0007,813.0007,813.0007,813.0007,813.0007,813.0007,813.0007,813.000
mean0.9920.9930.9930.9930.9930.9940.9940.9940.9950.996
std0.0880.0840.0830.0810.0810.0790.0770.0760.0700.062
min0.0000.0000.0000.0000.0000.0000.0000.0000.0000.000
25%1.0001.0001.0001.0001.0001.0001.0001.0001.0001.000
50%1.0001.0001.0001.0001.0001.0001.0001.0001.0001.000
75%1.0001.0001.0001.0001.0001.0001.0001.0001.0001.000
max1.0001.0001.0001.0001.0001.0001.0001.0001.0001.000
-
- - - - -```python -not_missing.iloc[:10, -10:] -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
EHALLAYTLGVKTFIAIKPDGVQRDLYANTVLSGGTTMYPGIADRIIPGFMCQGGDFTRSYELPDGQVITIGNERVAPEEHPVLLTEAPLNPKEITALAPSTMKIISNASCTTNCLAPLAKHQGVMVGMGQKTVTAMDVVYALK
Sample ID
20150330_QE2_UPLC4_BTW_sa_hela_C18_35b1111111111
20190108_QE1_nLC2_MB_QC_MNT_HELA_new_011111111111
20190422_QE4_LC12_JE-IAH_QC_MNT_HeLa_01b1111111111
20191217_QE2_NLC0_GP_QC_MNT_HELA_011111111111
20180307_QE1_nlc10_GP_QC_HeLa_2501111111111
20180413_QE2_nLC1_MB_QC_HeLa_250_021111111111
20190103_QE8_nLC0_LiNi_QC_MNT_15cm_Hela_021111111111
20181016_QE10_nLC14_KS_QC_MNT_HeLa_15cm_011111111111
20180711_QE10_nLC14_RS_QC_MNT_HeLa_021111111111
20190530_QE2_NLC1_GP_QC_MNT_HELA_011111111111
-
- - - - -```python -N_MOST_COMMON_PEPTIDES = 300 -data_to_visualize = not_missing.iloc[:, -N_MOST_COMMON_PEPTIDES:] -print(f"Look at missingness pattern of {N_MOST_COMMON_PEPTIDES} most common peptides across sample.\n" - f"Data matrix dimension used for printing: { data_to_visualize.shape}") - - -# grid_kws = {"width_ratios": (.9, .05), "hspace": 0.5} -# fig_heatmap_missing, (axes_heatmap_missing, cbar_ax) = plt.subplots( -# 1, 1, gridspec_kw=grid_kws, figsize=(12, 8)) -# USE_CBAR = True - -fig_heatmap_missing, axes_heatmap_missing = plt.subplots( - 1, 1, figsize=(12, 8)) -USE_CBAR = False - -axes_heatmap_missing = sns.heatmap(data_to_visualize, - ax=axes_heatmap_missing, - cbar = USE_CBAR, -# cbar_ax=cbar_ax, -# cbar_kws={"orientation": "vertical"}, - ) -``` - - Look at missingness pattern of 300 most common peptides across sample. - Data matrix dimension used for printing: (7813, 300) - - - -![png](11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_23_1.png) - - -White patches indicate that a peptide has been measured; black means it was not measured. Some samples (rows) have few of the most common peptides. This suggests setting a minimum number of total peptides per sample, which is common practice. - -> An algorithm should work with the most common peptides and base its inference capabilities after training on these. 
TYPE = 'peptides'
COL_NO_MISSING, COL_NO_IDENTIFIED = f'no_missing_{TYPE}', f'no_identified_{TYPE}'
COL_PROP_SAMPLES = 'prop_samples'


def compute_stats_missing(X):
    r"""Summarise, per sample, how many variables are observed and how many are missing.

    Parameters
    ----------
    X : pandas.DataFrame
        Indicator matrix with one row per (repeated) sample and one column per
        variable, where every entry is x \in \{0, 1\}: 1 if the variable was
        observed in that sample, 0 if it is missing. The index must be
        single-level, because exactly two column names are assigned to the
        frame built from it.

    Returns
    -------
    pandas.DataFrame
        One row per sample, indexed by the labels of ``X.index`` (the index is
        named ``'INDEX'``) and sorted by the number of identified variables in
        descending order. Columns:

        - ``'SampleID_int'``: row position of the sample in ``X``.
        - ``COL_NO_IDENTIFIED``: row sum of ``X``.
        - ``COL_NO_MISSING``: number of zeros in the row.
        - ``COL_PROP_SAMPLES``: rank in the sorted order divided by the
          number of samples, i.e. values in (0, 1].

    Raises
    ------
    AssertionError
        If, for any sample, identified and missing counts do not add up to the
        number of columns of ``X`` (``X`` is then not a pure 0/1 matrix).
    """
    # Keep each sample's original row position next to its label.
    sample_stats = X.index.to_frame(index=False).reset_index()
    sample_stats.columns = ['SampleID_int', 'INDEX']
    sample_stats = sample_stats.set_index('INDEX')

    # Assignments align on the sample labels shared with X.index.
    sample_stats[COL_NO_IDENTIFIED] = X.sum(axis=1)
    sample_stats[COL_NO_MISSING] = (X == 0).sum(axis=1)

    # Sanity check: every entry must be counted as either identified or missing.
    totals = sample_stats[[COL_NO_IDENTIFIED, COL_NO_MISSING]].sum(axis=1)
    assert (totals == X.shape[1]).all(), (
        "identified + missing counts do not match the number of columns")

    sample_stats = sample_stats.sort_values(
        by=COL_NO_IDENTIFIED, ascending=False)
    n_samples = len(sample_stats)
    # Cumulative share of samples once sorted by number of identifications.
    sample_stats[COL_PROP_SAMPLES] = np.arange(1, n_samples + 1) / n_samples
    return sample_stats
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
SampleID_intno_identified_peptidesno_missing_peptidesprop_samples
INDEX
20180911_QE6_LC12_SCL_MVM_QC_MNT_Hela_035,9771,00000.000
20180601_QE8_nLC1_JM_QC_MNT_HeLa_127,6551,00000.000
20180827_QE1_nLC10_EM_QC_HELA_new_026,7801,00000.000
20180831_QE3_nLC3_AL_UL_QC_MNT_HeLa_01_201810081301305,2241,00000.001
20190701_QE4_LC12_IAH_QC_MNT_HeLa_036,7691,00000.001
20181112_QE7_nLC11_MEM_QC_HeLa_023601,00000.001
20190708_QE6_nLC4_JE_QC_MNT_HeLa_013,2921,00000.001
20180219_QE6_LC12_AS_QC_MNT_HeLa_012,0401,00000.001
20180802_QE5_nLC11_AP_QC_MNT_HeLa_23,3081,00000.001
20160820_QE4_nLC4_SCL_QC_HeLa_045,2641,00000.001
20180813_QE7_nLC7_KBE_QC_MNT_HELA_027,2311,00000.001
20180926_QE7_nLC11_AL_QC_HeLa_087,2391,00000.002
20160812_QE6_nLC12_MM_QC_MNT_HELA_02_1608140126537,2611,00000.002
20180601_QE8_nLC1_JM_QC_MNT_HeLa_133,3561,00000.002
20181205_QE5_nLC7_RJC_QC_MNT_HeLa_21,8191,00000.002
...............
20150513_QE7_UPLC10_DBJ_QC_SA_HELAS3_Exp5D_19frac_167,4092197810.998
20150513_QE7_UPLC10_DBJ_QC_SA_HELAS3_Exp5D_14frac_64282177830.998
20150513_QE7_UPLC10_DBJ_QC_SA_HELAS3_Exp5D_14frac_101,7742157850.998
20160826_QE3_nLC5_DBJ_SA_HELA_12frac_concat_100ug_83,1022157850.999
20150513_QE7_UPLC10_DBJ_QC_SA_HELAS3_Exp5D_14frac_93,4292137870.999
20150513_QE7_UPLC10_DBJ_QC_SA_HELAS3_Exp5D_19frac_151,7162097910.999
20150513_QE7_UPLC10_DBJ_QC_SA_HELAS3_Exp5D_14frac_54,1472057950.999
20150513_QE7_UPLC10_DBJ_QC_SA_HELAS3_Exp5D_14frac_77,4712047960.999
20150513_QE7_UPLC10_DBJ_QC_SA_HELAS3_Exp5D_19frac_107,7361978030.999
20150513_QE7_UPLC10_DBJ_QC_SA_HELAS3_Exp5D_14frac_87,6601958050.999
20150513_QE7_UPLC10_DBJ_QC_SA_HELAS3_Exp5D_19frac_75,6491948060.999
20150513_QE7_UPLC10_DBJ_QC_SA_HELAS3_Exp5D_19frac_146,0561898111.000
20150513_QE7_UPLC10_DBJ_QC_SA_HELAS3_Exp5D_19frac_86,4151878131.000
20150513_QE7_UPLC10_DBJ_QC_SA_HELAS3_Exp5D_19frac_172,6061858151.000
20150513_QE7_UPLC10_DBJ_QC_SA_HELAS3_Exp5D_19frac_122,0861778231.000
-

7813 rows × 4 columns

-
class SequenceAnalyser:
    """Small helper for inspecting a collection of sequences held in a pandas Series."""

    def __init__(self, sequences: pd.Series):
        """Store ``sequences``.

        Raises
        ------
        ValueError
            If ``sequences`` is not a ``pandas.Series``.
        """
        if isinstance(sequences, pd.Series):
            self.sequences = sequences
            return
        message = "Please provide a pandas.Series, not {}"
        raise ValueError(message.format(type(sequences)))

    def calc_counts(self, n_characters):
        """Count how often each prefix of ``n_characters`` characters occurs.

        Returns the ``value_counts`` of the sequences truncated to their first
        ``n_characters`` characters.
        """
        prefixes = self.sequences.str.slice(stop=n_characters)
        return prefixes.value_counts()

    def length(self):
        """Return the length of every sequence, sorted in ascending order."""
        lengths = self.sequences.str.len()
        return lengths.sort_values()
max=max(sample_stats[COL_NO_IDENTIFIED])) -print(f'Minimum {TYPE} per sample observed:') -w_min_depth_sample -``` - - Minimum peptides per sample observed: - - - - IntSlider(value=1000, max=1000) - - - -```python -mask_samples = sample_stats[COL_NO_IDENTIFIED] >= w_min_depth_sample.value -print(f"Selected {mask_samples.sum()} samples") -``` - - Selected 90 samples - - - -```python -from vaep.data_handling import coverage -x_50 = coverage(X.loc[mask_samples], coverage_col=0.5, coverage_row=0.2) -# x_50_pca = log_z_zeroone_na(x_50) # there is a huge difference if NA is set to low value or mean!! -x_90 = coverage(X.loc[mask_samples], 0.9, 0.9) -``` - - -```python -x_50.shape, x_90.shape -``` - - - - - ((90, 1000), (90, 1000)) - - - - -```python -x_90.sample() -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
AAAAAAALQAKAAFDDAIAELDTLSEESYKAAHSEGNTTAGLDMRAAVATFLQSVQVPEFTPKAAVEEGIVLGGGCALLRAAVPSGASTGIYEALELRAAVPSGASTGIYEALELRDNDKACANPAAGSVILLENLRACGLVASNLNLKPGECLRADLINNLGTIAKADLLLSTQPGREEGSPLELERADRDESSPYAAMLAAQDVAQRAEEYEFLTPVEEAPKAFDSGIIPMEFVNKAFGYYGPLRAFYPEEISSMVLTKAGAGSATLSMAYAGARAGFAGDDAPRAGGAAVVITEPEHTKAGGEAGVTLGQPHLSRAGKPVICATQMLESMIKAGLQFPVGRAGNLGGGVVTIERAGVNTVTTLVENKKAHGPGLEGGLVGKPAEFTIDTKAHQVVEDGYEFFAKAHSSMVGVNLPQKAIADTGANVVVTGGKAIAELGIYPAVDPLDSTSRAIGVLTSGGDAQGMNAAVRAIIIFVPVPQLKAILVDLEPGTMDSVRAIPQLQGYLRAITGASLADIMAKAIVAIENPADVSVISSRALAAAGYDVEKALANVNIGSLICNVGAGGPAPAAGAAPAGGPAPSTAAAPAEEKALDIAENEMPGLMRALDTMNFDVIKALDVMVSTFHKALEHFTDLYDIKALESPERPFLAILGGAKALIAAQYSGAQVRALIVLAHSERALLFIPRALLFVPRALLVTASQCQQPAENKALMLQGVDLLADAVAVTMGPKALPFWNEEIVPQIKALQSGQCAGAALDVFTEEPPRDR...SPFEVQVGPEAGMQKSPYQEFTDHLVKSPYTVTVGQACNPSACRSTGGAPTFNVTVTKSTVHEILCKSYGRPPPDVEGMTSLKTANDMIHAENMRTFSYAGFEMQPKTGAAPIIDVVRTGVAVNKPAEFTVDAKTINEVENQILTRTLQALQIPAAKTLSDYNIQKTTPSVVAFTADGERTVPEELVKPEELSKVACIGAWHPARVAHSFNCTPIEGMLSHQLKVALVYGQMNEPPGARVDFNVPMKVEFMDDTSRVELVPPTPAEIPRVEPGLGADNSVVRVFQFLNAKVGDAIPAVEVFEGEPGNKVGLQVVAVKVGQEIEVRPGIVSKVIDPATATSVDLRVIMVTGDHPITAKVITIMQNPRVIVVGNPANTNCLTASKVLALPEPSPAAPTLRVLAMSGDPNYLHRVLQALEGLKVNGRPLEMIEPRVNNSSLIGLGYTQTLKPGIKVNVPVIGGHAGKVPPAINQFTQALDRVSQEHPVVLTKVSVADHSLHLSKVTAQGPGLEPSGNIANKVVFVFGPDKVVFVFGPDKKVYALPEDLVEVKPKYADLTEDQLPSCESLKYDDMAAAMKYDDMAACMKYDDMATCMKYLAEVACGDDRKYLDEDTIYHLQPSGRYRVPDVLVADPPIAR
Sample ID
20181019_QE4_nLC12_MR_QC_MNT_Hela_11,033,000,000.0002,622,700,000.000314,650,000.000565,470,000.0001,231,000,000.0001,435,300,000.0002,709,300,000.0001,432,700,000.000988,990,000.0002,825,000,000.000688,310,000.000569,430,000.000288,380,000.000137,850,000.000432,210,000.000158,030,000.000399,140,000.00010,036,000,000.000395,940,000.000201,120,000.0002,285,500,000.0009,550,400,000.000715,800,000.000411,970,000.000287,610,000.000237,830,000.0001,485,400,000.000262,930,000.000820,480,000.000153,500,000.0001,296,300,000.0002,452,200,000.000530,810,000.000764,720,000.000723,410,000.0004,848,700,000.000990,760,000.000266,930,000.000476,540,000.0003,643,200,000.000351,260,000.000287,170,000.000647,320,000.000550,750,000.0001,585,300,000.0001,376,100,000.000243,460,000.0003,986,000,000.000818,730,000.00064,242,000.000...115,440,000.000551,300,000.000296,100,000.0002,814,700,000.0002,659,600,000.00079,571,000.000736,280,000.000243,420,000.000987,870,000.000830,090,000.000630,080,000.000130,150,000.0003,691,600,000.000569,410,000.000176,000,000.000713,990,000.000582,350,000.000141,150,000.0001,193,500,000.000574,890,000.000117,830,000.000576,950,000.000709,160,000.000271,680,000.0002,414,600,000.000478,860,000.000204,810,000.000284,000,000.000908,970,000.000230,890,000.000221,390,000.000163,470,000.000280,060,000.000343,890,000.000688,220,000.000312,110,000.000359,970,000.000540,770,000.000576,730,000.000713,550,000.000698,020,000.000236,850,000.000147,940,000.000437,870,000.000405,720,000.000297,210,000.000290,220,000.000600,990,000.000240,050,000.000322,910,000.000
-

1 rows × 1000 columns

-
- - - -### Distribution of Intensity values -- comparing non-transformed to $\log_{10}$ transformed - - -```python -from vaep.transform import log -from random import sample -sample = x_50.sample().iloc[0] -sample_id = sample.name -print("Sample ID:", sample_id) -sns.set(style="darkgrid") - -fig, axes = plt.subplots(1, 2, figsize=(12, 5)) -sns.histplot(sample, bins=100, ax=axes[0]) -axes[0].set_title("Unnormalized distribution") - -# natural logarithm, could also be base_2, base_10 logarithm -sample_log = log(sample) -sns.histplot(sample_log, bins=100, ax=axes[1]) -axes[1].set_title('log (ln) normalized distribution') - -_ = fig.suptitle( - f"Dynamic Range of measured intensities in sample {sample_id}") -fig.tight_layout(rect=[0, 0.03, 1, 0.95]) -_savefig(fig, 'distribution_peptides_sample_' + str(sample_id)) -``` - - Sample ID: 20181219_QE1_nLC2_GP_QC_MNT_HELA_01 - - - -![png](11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_46_1.png) - - - -```python -from vaep.transform import log -from random import sample -sample = x_50.sample(axis=1) -sample_id = sample.columns[0] -print("Sample ID:", sample_id) -sns.set(style="darkgrid") -fig, axes = plt.subplots(1, 2, figsize=(12, 5)) -sns.histplot(sample, bins=100, ax=axes[0]) -axes[0].set_title("Unnormalized distribution") - -# natural logarithm, could also be base_2, base_10 logarithm -sample_log = log(sample) -sns.histplot(sample_log, bins=100, ax=axes[1]) -axes[1].set_title('log (ln) normalized distribution') - -fig.suptitle(f"Dynamic range of {sample_id} between samples") -fig.tight_layout(rect=[0, 0.03, 1, 0.95]) -_savefig(fig, 'distribution_peptides_sample_' + str(sample_id)) -``` - - Sample ID: GGEIQPVSVK - - - -![png](11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_47_1.png) - - -### Reference table intensities (natural logarithm) - -14 to 23 spans a dynamic range of 3 orders of base 10 - - -```python -pd.set_option('precision', 2) - -dynamic_range 
= pd.DataFrame(range(14, 24), columns=['x']) -dynamic_range['$e^x$'] = dynamic_range.x.apply(np.exp) -dynamic_range.set_index('x', inplace=True) -dynamic_range.index.name = '' -dynamic_range.T -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
14151617181920212223
$e^x$1,202,604.2843,269,017.3728,886,110.52124,154,952.75465,659,969.137178,482,300.963485,165,195.4101,318,815,734.4833,584,912,846.1329,744,803,446.249
-
- - - -## Next UP - -### Find Protein of Peptides -- check with some reference list of peptides: This is created in `project\FASTA_tryptic_digest.ipynb` - - -```python - -``` diff --git a/project/doc/ipynbs/11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_12_0.png b/project/doc/ipynbs/11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_12_0.png deleted file mode 100644 index e5d16b07c..000000000 Binary files a/project/doc/ipynbs/11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_12_0.png and /dev/null differ diff --git a/project/doc/ipynbs/11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_13_0.png b/project/doc/ipynbs/11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_13_0.png deleted file mode 100644 index 96e70c0db..000000000 Binary files a/project/doc/ipynbs/11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_13_0.png and /dev/null differ diff --git a/project/doc/ipynbs/11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_23_1.png b/project/doc/ipynbs/11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_23_1.png deleted file mode 100644 index 9bc742f29..000000000 Binary files a/project/doc/ipynbs/11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_23_1.png and /dev/null differ diff --git a/project/doc/ipynbs/11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_30_0.png b/project/doc/ipynbs/11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_30_0.png deleted file mode 100644 index 50938ecba..000000000 Binary files a/project/doc/ipynbs/11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_30_0.png and /dev/null differ diff --git 
a/project/doc/ipynbs/11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_30_1.png b/project/doc/ipynbs/11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_30_1.png deleted file mode 100644 index 82caf0402..000000000 Binary files a/project/doc/ipynbs/11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_30_1.png and /dev/null differ diff --git a/project/doc/ipynbs/11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_31_0.png b/project/doc/ipynbs/11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_31_0.png deleted file mode 100644 index 4c771d84d..000000000 Binary files a/project/doc/ipynbs/11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_31_0.png and /dev/null differ diff --git a/project/doc/ipynbs/11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_46_1.png b/project/doc/ipynbs/11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_46_1.png deleted file mode 100644 index 8b5d9fbe0..000000000 Binary files a/project/doc/ipynbs/11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_46_1.png and /dev/null differ diff --git a/project/doc/ipynbs/11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_47_1.png b/project/doc/ipynbs/11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_47_1.png deleted file mode 100644 index 659a59dc1..000000000 Binary files a/project/doc/ipynbs/11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_47_1.png and /dev/null differ diff --git a/project/doc/ipynbs/11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_7_0.png b/project/doc/ipynbs/11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_7_0.png deleted file mode 100644 index 
5d13250f1..000000000 Binary files a/project/doc/ipynbs/11_training_data_exploration_peptides_files/11_training_data_exploration_peptides_7_0.png and /dev/null differ diff --git a/project/doc/ipynbs/12_experiment_01_small_example.md b/project/doc/ipynbs/12_experiment_01_small_example.md deleted file mode 100644 index 43ab3f85b..000000000 --- a/project/doc/ipynbs/12_experiment_01_small_example.md +++ /dev/null @@ -1,7388 +0,0 @@ -# Experiment 1 - - -```python -ADD_TENSORBOARD = False -``` - - -```python -from src.nb_imports import * -``` - - FOLDER_MQ_TXT_DATA = data\mq_out - - - -```python -figures = {} # collection of ax or figures -``` - - -```python -import logging -from vaep.logging import setup_logger - -logger = setup_logger(logger=logging.getLogger('vaep')) -logger.info("Experiment 01") -``` - - vaep - INFO Experiment 01 - - -## Load data - -- 1000 features (most abundant peptides) -- later a subset of samples is selected - - -```python -N_SAMPLES_TO_LOAD = None -FN_PEPTIDE_INTENSITIES = config.FOLDER_DATA / 'df_intensities_N07813_M01000' -FN_PEPTIDE_INTENSITIES = config.FOLDER_DATA / 'df_intensities_N00090_M01000' - -analysis = AnalyzePeptides.from_csv( - fname=FN_PEPTIDE_INTENSITIES, nrows=N_SAMPLES_TO_LOAD) -analysis.df = analysis.df.sort_index() # sort by date -assert analysis.df.index.is_unique, "Non-unique training samples" -analysis -``` - - - - - AnalyzePeptides with attributes: df, stats - - - -### Select consecutives samples for training - - -```python -from numpy import log2 -import random -from vaep.utils import sample_iterable - -N_SAMPLES = min(len(analysis.df), 1000) -logger.info(f"Selected {N_SAMPLES}") -analysis.N_SAMPLES = N_SAMPLES - -M = 10 - -columns_selected = sorted(sample_iterable(list(analysis.df.columns), n=M)) -analysis.df = analysis.df.loc[:, columns_selected] - - -def get_consecutive_data_indices(index, n_samples=N_SAMPLES): - start_sample = len(index) - n_samples - start_sample = random.randint(0, start_sample) - return 
index[start_sample:start_sample+n_samples] - - -indices_selected = get_consecutive_data_indices(analysis.df.index) -analysis.samples = indices_selected -analysis.df = analysis.df.loc[indices_selected] - -LOG_TRANSFORM = log2 # None -if LOG_TRANSFORM: - analysis.df = LOG_TRANSFORM(analysis.df) - -FRACTION = 0.8 - - -class Indices(SimpleNamespace): - pass - - -indices = Indices() -indices.train, indices.valid = indices_selected[:int( - FRACTION*N_SAMPLES)], indices_selected[int(FRACTION*N_SAMPLES):] -analysis.indices = indices - -analysis.df_train = analysis.df.loc[indices.train] -analysis.df_valid = analysis.df.loc[indices.valid] - -#rebuild original with multi-index -analysis.df_by_split = pd.concat((analysis.df_train, analysis.df_valid), keys=['train', 'valid']) - -analysis.df_by_split.sample(n=10, axis=0).sample(n=5, axis=1).sort_index() -``` - - vaep - INFO Selected 90 - - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
LALVTGGEIASTFDHPELVKLLEVEHPAAKARFEELCSDLFRVNNSSLIGLGYTQTLKPGIKAVAEQIPLLVQGVR
Sample ID
train20160812_QE6_nLC12_MM_QC_MNT_HELA_02_16081401265328.48527.46128.11528.64425.932
20180126_QE7_nLC11_DBJ_QC_HELA_0230.81229.11029.46530.32727.387
20180721_QE7_nLC3_KBE_QC_HeLa_0131.10528.57030.35930.41926.386
20180807_QE3_nLC3_KBE_QC_MNT_HELA_0129.11328.51028.25829.28125.805
20180831_QE3_nLC3_AL_UL_QC_MNT_HeLa_01_2018100813013029.45128.67229.05829.20926.265
20181024_QE7_nLC11_PR_QC_HeLa_0230.43429.16030.00830.84827.211
valid20181102_QE2_NLC10_MR_QC_MNT_HELA_0130.56529.34429.89730.86727.728
20181215_QE2_NLC10_ANHO_QC_MNT_HELA_0429.49129.42229.47630.29726.756
20190708_QE6_nLC4_JE_QC_MNT_HeLa_0131.42627.00030.80131.72426.106
20191128_QE8_nLC9_ASD_QC_HeLa_1_2019112816531329.19530.50730.79329.42726.552
-
- - - -### Training and Validation datasets - - -```python -n_samples, n_features = analysis.df.shape -msg = "Total:\nN samples: {:10,d} - N Peptides: {:10,d}\n".format( - n_samples, n_features) -n_train, n_valid = len(analysis.df_train), len(analysis.df_valid) -msg += "N train set: {:8,d} - N valid set: {:9,d}".format(n_train, n_valid) -print(msg) -``` - - Total: - N samples: 90 - N Peptides: 10 - N train set: 72 - N valid set: 18 - - - -```python -detection_limit = analysis.df.min().min() if LOG_TRANSFORM else np.log10( - analysis.df).min().min() # all zeros become nan. -"Detection limit: {:6.3f}, corresponding to intensity value of {:,d}".format( - detection_limit, - int(10 ** detection_limit) -) -``` - - - - - 'Detection limit: 24.102, corresponding to intensity value of 1,265,595,277,049,347,616,800,768' - - - -### Create meta data from filename - - -```python -from vaep.analyzers import metadata - -data_meta = metadata.get_metadata_from_filenames(indices_selected) -analysis.df_meta = pd.DataFrame.from_dict( - data_meta, orient='index') -# analysis.df_meta['date'] = pd.to_datetime(analysis.df_meta['date']) -analysis.df_meta -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
datems_instrumentresearcherlc_instrumentrest
20151128_QE7_UPLC11_RJC_DEV_columnsTest_HeLa_0120151128QE7RJCLC11UPDEV_columnsTest_HeLa_01
20160105_QE6_nLC4_MM_QC_MNT_HELA_01_17010620180620160105QE6MMnLC4QC_MNT_HELA_01_170106201806
20160311_QE6_LC6_SCL_QC_MNT_HeLa_0120160311QE6SCLLC6QC_MNT_HeLa_01
20160401_QE6_nLC6_ASD_QC_HELA_0320160401QE6ASDnLC6QC_HELA_03
20160404_QE2_nlc1_QC_hela_16040421012520160404QE2QCnlc1hela_160404210125
..................
20190527_QE4_LC12_AS_QC_MNT_HeLa_0220190527QE4ASLC12QC_MNT_HeLa_02
20190701_QE4_LC12_IAH_QC_MNT_HeLa_0320190701QE4IAHLC12QC_MNT_HeLa_03
20190708_QE6_nLC4_JE_QC_MNT_HeLa_0120190708QE6JEnLC4QC_MNT_HeLa_01
20191128_QE8_nLC9_ASD_QC_HeLa_120191128QE8ASDnLC9QC_HeLa_1
20191128_QE8_nLC9_ASD_QC_HeLa_1_2019112816531320191128QE8ASDnLC9QC_HeLa_1_20191128165313
-

90 rows × 5 columns

-
- - - -- possibility to group data in time along `(machine, lc)` pairs - - -```python -analysis.df_meta.loc[indices.train].describe(datetime_is_numeric=False) -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
datems_instrumentresearcherlc_instrumentrest
count7272727272
unique5111271758
top20180601QE4MRnLC1QC_MNT_HeLa_01
freq7101595
-
- - - - -```python -# This becomes part of analysis -def compare_meta_data_for_splits(meta, indices): - - _indices = vars(indices) - logger.info('Found vars: {}'.format(', '.join(str(x) - for x in _indices.keys()))) - - for key_split, split in _indices.items(): - print(f"{key_split:8} - split description:") - display( - meta.loc[split].describe(datetime_is_numeric=True) - ) - - _meta_features = list(meta.columns) - - for _column in _meta_features: - display( - _=pd.DataFrame({ - key_split: meta.loc[split, _column].value_counts(normalize=True) for key_split, split in _indices.items() - }).sort_index().plot(kind='line', rot=90, figsize=(10, 5), title=f"{_column} value Counts for different splits") - ) - - -compare_meta_data_for_splits(analysis.df_meta.iloc[:, :2], indices) -``` - - vaep - INFO Found vars: train, valid - train - split description: - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
datems_instrument
count7272
unique5111
top20180601QE4
freq710
-
- - - valid - split description: - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
datems_instrument
count1818
unique168
top20190527QE5
freq24
-
- - - -![png](12_experiment_01_small_example_files/12_experiment_01_small_example_16_4.png) - - - -![png](12_experiment_01_small_example_files/12_experiment_01_small_example_16_5.png) - - -### PCA plot of original data - - -```python -import matplotlib.pyplot as plt -import matplotlib.dates as mdates - -from sklearn.decomposition import PCA - - -def _add_indices(array, original_df, index_only=False): - index = original_df.index - columns = None - if not index_only: - columns = original_df.columns - return pd.DataFrame(array, index=index, columns=columns) - - -def run_pca(df, n_components=2): - """Run PCA on DataFrame. - - Returns - ------- - pandas.DataFrame - with same indices as in original DataFrame - """ - pca = PCA(n_components=n_components).fit_transform(df) - cols = [f'pc{i}' for i in range(n_components)] - pca = pd.DataFrame(pca, index=df.index, columns=cols) - return pca - - -def scatter_plot_w_dates(ax, df, dates=None): - """plot first vs. second column in DataFrame. - Use dates to color data.""" - - cols = df.columns - - if isinstance(dates, str): - dates = df['dates'] - - ax = ax.scatter( - x=df[cols[0]], - y=df[cols[1]], - c=[mdates.date2num(t) for t in pd.to_datetime(dates) - ] if dates is not None else None - ) - return ax - - -scaler = StandardScaler().fit(analysis.df) -pca = run_pca(df=scaler.transform(analysis.df_by_split, copy=None)) -cols = list(pca.columns) - -fig, axes = plt.subplots(ncols=2, figsize=(15, 8)) - -# by split -ax = axes[0] -ax = pca.loc['train'].plot.scatter( - x=cols[0], y=cols[1], color='blue', label='train', ax=ax) -ax = pca.loc['valid'].plot.scatter( - x=cols[0], y=cols[1], color='orange', label='valid', ax=ax) - -# by dates -ax = axes[1] -ax = scatter_plot_w_dates(ax, pca, dates=analysis.df_meta.date) - -loc = mdates.AutoDateLocator() -_ = fig.colorbar(ax, ticks=loc, - format=mdates.AutoDateFormatter(loc)) - -figures[('pca', 'original')] = fig -``` - - 
-![png](12_experiment_01_small_example_files/12_experiment_01_small_example_18_0.png) - - -- [x] color sample by date (heatmap?) - -### Analysis state so far - - -```python -analysis -``` - - - - - AnalyzePeptides with attributes: N_SAMPLES, df, df_by_split, df_meta, df_train, df_valid, indices, samples, stats - - - -### Correlation - - -```python -analyzers.corr_lower_triangle(analysis.df) -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ACANPAAGSVILLENLRARFEELCSDLFRAVAEQIPLLVQGVREGPYDVVVLPGGNLGAQNLSESAAVKHGSLGFLPRIDIIPNPQERLALVTGGEIASTFDHPELVKLLEVEHPAAKSTESLQANVQRVNNSSLIGLGYTQTLKPGIK
ACANPAAGSVILLENLRNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
ARFEELCSDLFR0.705NaNNaNNaNNaNNaNNaNNaNNaNNaN
AVAEQIPLLVQGVR0.7360.532NaNNaNNaNNaNNaNNaNNaNNaN
EGPYDVVVLPGGNLGAQNLSESAAVK0.8470.6530.747NaNNaNNaNNaNNaNNaNNaN
HGSLGFLPR0.5120.4360.4560.421NaNNaNNaNNaNNaNNaN
IDIIPNPQER0.7350.6820.6870.7780.533NaNNaNNaNNaNNaN
LALVTGGEIASTFDHPELVK0.7580.5490.6580.6640.4460.507NaNNaNNaNNaN
LLEVEHPAAK0.6030.5640.4010.4770.4750.5370.441NaNNaNNaN
STESLQANVQR0.7370.7630.6890.7290.5030.7920.6130.563NaNNaN
VNNSSLIGLGYTQTLKPGIK0.7970.7320.7500.8050.4500.7340.6620.4540.805NaN
-
- - - -### Results - -Helper function and results dictionary - - -```python -analysis.results = {} - - -def describe_abs_diff(y_true: pd.DataFrame, y_pred: pd.DataFrame): - _abs_diff = y_true - y_pred - return _abs_diff.abs().describe().to_dict() -``` - -## Baseline supervised RF models - -- M RandomForest baseline models, each predicting one feature based on the M-1 other features -- get an idea of a possible baseline performance - - could be used together with imputation of inputs - - with some effort this could be scaled to predict only missing peptides - - - -```python -from sklearn.ensemble import RandomForestRegressor -from sklearn.metrics import mean_squared_error -metrics = {} - - -peptides = list(analysis.df_train.columns) -metrics = {} -pred_valid = {} - -for i in range(M): - train_columns = list(range(M)) - test_column = i - train_columns.remove(i) - train_columns = [peptides[i] for i in train_columns] - test_column = peptides[test_column] - logger.debug( - f"Train columns: {', '.join(train_columns)}\nTest column: {test_column}") - _df_train, _y_train = analysis.df_train[train_columns], analysis.df_train[test_column] - _df_valid, _y_valid = analysis.df_valid[train_columns], analysis.df_valid[test_column] - rf_reg = RandomForestRegressor() - rf_reg.fit(X=_df_train, y=_y_train) - # metrics - _metrics = {} - _metrics[('MSE', 'train')] = mean_squared_error( - y_true=_y_train, y_pred=rf_reg.predict(_df_train)) - y_pred_valid = rf_reg.predict(_df_valid) - _metrics[('MSE', 'valid')] = mean_squared_error( - y_true=_y_valid, y_pred=y_pred_valid) - metrics[test_column] = _metrics - # predictions - pred_valid[test_column] = y_pred_valid -pd.DataFrame(metrics) -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ACANPAAGSVILLENLRARFEELCSDLFRAVAEQIPLLVQGVREGPYDVVVLPGGNLGAQNLSESAAVKHGSLGFLPRIDIIPNPQERLALVTGGEIASTFDHPELVKLLEVEHPAAKSTESLQANVQRVNNSSLIGLGYTQTLKPGIK
MSEtrain0.0160.0450.0210.0340.1640.0160.0260.1410.0330.013
valid0.3210.5120.3950.3921.2060.9021.6031.2870.3290.323
-
- - - - -```python -analysis.pred_rf = pd.DataFrame(pred_valid, index=analysis.df_valid.index) -analysis.pred_rf -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ACANPAAGSVILLENLRARFEELCSDLFRAVAEQIPLLVQGVREGPYDVVVLPGGNLGAQNLSESAAVKHGSLGFLPRIDIIPNPQERLALVTGGEIASTFDHPELVKLLEVEHPAAKSTESLQANVQRVNNSSLIGLGYTQTLKPGIK
Sample ID
20181029_QE3_nLC3_KBE_QC_MNT_HELA_0230.30728.47125.65628.28926.67229.34728.95427.81728.78429.040
20181102_QE2_NLC10_MR_QC_MNT_HELA_0131.46030.21927.54430.10029.74731.15630.45529.17430.09530.684
20181107_QE6_nLC12_MR_QC_MNT_HELA_New_0131.10029.06526.96729.70629.37130.71529.98628.61529.88830.331
20181110_QE5_nLC5_OOE_QC_MNT_HELA_15cm_250ng_RO-00329.49628.29225.30127.95427.30129.11628.62026.96128.32128.702
20181112_QE7_nLC11_MEM_QC_HeLa_0231.38629.70627.48629.95729.69030.97030.43328.93629.84030.743
20181119_QE1_nLC2_TW_QC_HeLa_130.79229.12326.67229.10629.33830.09029.68828.29229.53229.789
20181120_QE5_nLC7_AP_HeLa_230.79629.40126.67629.45929.23830.54430.02028.49929.72130.017
20181126_QE2_NLC10_MN_QC_HELA_0230.63429.15126.24828.55028.04829.86429.49828.47929.43629.607
20181205_QE5_nLC7_RJC_QC_MNT_HeLa_229.87928.23625.49827.57627.19529.01828.68826.42628.44428.919
20181215_QE2_NLC10_ANHO_QC_MNT_HELA_0430.86029.64226.65628.91328.85230.25929.50428.06729.78229.767
20181219_QE1_nLC2_GP_QC_MNT_HELA_0129.97629.11526.15928.61828.01829.69529.01827.98629.15529.423
20190107_QE5_nLC5_DS_QC_MNT_HeLa_FlashPack_0229.64729.64927.14029.28328.71130.41030.31428.22629.74530.056
20190527_QE4_LC12_AS_QC_MNT_HeLa_0131.81130.10527.61929.62829.79730.63430.62729.06830.62130.624
20190527_QE4_LC12_AS_QC_MNT_HeLa_0231.83930.04327.66330.08729.86731.43130.60229.02830.68830.860
20190701_QE4_LC12_IAH_QC_MNT_HeLa_0330.99430.24927.43229.58329.73630.48630.46629.01530.57530.269
20190708_QE6_nLC4_JE_QC_MNT_HeLa_0131.90529.80027.67228.67029.13130.57330.77628.85930.71930.404
20191128_QE8_nLC9_ASD_QC_HeLa_131.73629.18727.24729.21329.52130.37030.41529.10429.74330.590
20191128_QE8_nLC9_ASD_QC_HeLa_1_2019112816531330.78929.25827.23529.02328.28830.43730.35729.10229.69030.562
-
- - - -Overfits to training data as it should. - - -```python -analysis.df_valid -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ACANPAAGSVILLENLRARFEELCSDLFRAVAEQIPLLVQGVREGPYDVVVLPGGNLGAQNLSESAAVKHGSLGFLPRIDIIPNPQERLALVTGGEIASTFDHPELVKLLEVEHPAAKSTESLQANVQRVNNSSLIGLGYTQTLKPGIK
Sample ID
20181029_QE3_nLC3_KBE_QC_MNT_HELA_0230.15728.67226.03827.95128.61029.35629.42727.96428.61528.887
20181102_QE2_NLC10_MR_QC_MNT_HELA_0131.99929.89727.72830.00129.96030.79130.56529.34429.92730.867
20181107_QE6_nLC12_MR_QC_MNT_HELA_New_0131.13829.90627.07129.64529.73830.69730.52628.87729.16329.929
20181110_QE5_nLC5_OOE_QC_MNT_HELA_15cm_250ng_RO-00329.47828.01225.67927.57028.11929.03728.65628.03028.08228.665
20181112_QE7_nLC11_MEM_QC_HeLa_0231.43029.94527.51130.04627.13830.91130.18129.21030.59030.611
20181119_QE1_nLC2_TW_QC_HeLa_131.04529.59826.50829.26029.00030.34130.12226.89629.19729.589
20181120_QE5_nLC7_AP_HeLa_230.79528.81627.43428.97229.69330.86029.91929.01729.13530.099
20181126_QE2_NLC10_MN_QC_HELA_0230.38629.24726.28128.60129.03529.99929.69128.79428.83329.895
20181205_QE5_nLC7_RJC_QC_MNT_HeLa_229.43227.39825.82328.07828.15629.23828.27528.90928.20828.280
20181215_QE2_NLC10_ANHO_QC_MNT_HELA_0430.43629.47626.75629.21329.19830.50529.49129.42229.37630.297
20181219_QE1_nLC2_GP_QC_MNT_HELA_0130.06228.98026.38827.93228.71029.78529.05329.07628.95129.246
20190107_QE5_nLC5_DS_QC_MNT_HeLa_FlashPack_0231.31530.20425.99529.47728.72030.65625.26329.30029.70630.483
20190527_QE4_LC12_AS_QC_MNT_HeLa_0132.03930.39926.77730.25529.09531.07930.90629.71630.75730.992
20190527_QE4_LC12_AS_QC_MNT_HeLa_0231.99630.48727.19930.29428.74231.08130.93229.74430.68431.114
20190701_QE4_LC12_IAH_QC_MNT_HeLa_0331.53530.13426.61429.43629.07530.95930.33129.94630.08631.134
20190708_QE6_nLC4_JE_QC_MNT_HeLa_0130.73630.80126.10630.55728.89031.37331.42627.00030.80031.724
20191128_QE8_nLC9_ASD_QC_HeLa_131.35530.90226.81530.09830.42332.99829.81730.49430.99629.837
20191128_QE8_nLC9_ASD_QC_HeLa_1_2019112816531331.31630.79326.55229.95030.45733.20029.19530.50731.03629.427
-
- - - - -```python -analysis.results['RF baseline'] = describe_abs_diff( - y_true=analysis.df_valid, y_pred=analysis.pred_rf) -pd.DataFrame(analysis.results['RF baseline']) -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ACANPAAGSVILLENLRARFEELCSDLFRAVAEQIPLLVQGVREGPYDVVVLPGGNLGAQNLSESAAVKHGSLGFLPRIDIIPNPQERLALVTGGEIASTFDHPELVKLLEVEHPAAKSTESLQANVQRVNNSSLIGLGYTQTLKPGIK
count18.00018.00018.00018.00018.00018.00018.00018.00018.00018.000
mean0.3840.5460.4790.4460.8600.5270.6000.9500.4320.441
std0.4280.4750.4180.4530.7030.8131.1470.6370.3890.368
min0.0010.0960.0250.0510.0080.0090.0120.1470.0050.038
25%0.1020.2110.1690.1480.3510.1010.1160.3660.1680.179
50%0.2500.3830.3800.3190.6970.2480.3051.0000.2870.328
75%0.5070.7750.7390.5960.9800.4250.5241.3810.5990.612
max1.6681.7151.5661.8882.5522.7645.0512.4841.3451.320
-
- - - -Could a model help in identifying extraordinar differences in samples? Something to focus on? - -## DL Setup - - -```python -import vaep.model as vaep_model -from vaep.cmd import get_args - -BATCH_SIZE, EPOCHS = 8, 30 -args = get_args(batch_size=BATCH_SIZE, epochs=EPOCHS, - no_cuda=True) # data transfer to GPU seems slow -kwargs = {'num_workers': 2, 'pin_memory': True} if args.cuda else {} - -# torch.manual_seed(args.seed) -device = torch.device("cuda" if args.cuda else "cpu") -device - -print(args, device) -``` - - Namespace(batch_size=8, cuda=False, epochs=30, log_interval=10, no_cuda=True, seed=43) cpu - - -## Simple AE -- should also heavily overfit the training data - - -```python -from vaep.transform import ShiftedStandardScaler - -args_ae = {} -args_ae['SCALER'] = StandardScaler -args_ae['SCALER'] = ShiftedStandardScaler - -# select initial data: transformed vs not log transformed -scaler = args_ae['SCALER'](scale_var=2).fit(analysis.df_train) -# five examples from validation dataset -scaler.transform(analysis.df_train).describe(percentiles=[0.025, 0.975]) -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ACANPAAGSVILLENLRARFEELCSDLFRAVAEQIPLLVQGVREGPYDVVVLPGGNLGAQNLSESAAVKHGSLGFLPRIDIIPNPQERLALVTGGEIASTFDHPELVKLLEVEHPAAKSTESLQANVQRVNNSSLIGLGYTQTLKPGIK
count72.00072.00072.00072.00072.00072.00072.00072.00072.00072.000
mean0.5000.5000.5000.5000.5000.5000.5000.5000.5000.500
std0.5040.5040.5040.5040.5040.5040.5040.5040.5040.504
min-0.873-0.879-0.827-0.691-1.060-0.875-0.672-1.245-0.962-0.731
2.5%-0.598-0.487-0.595-0.553-0.951-0.398-0.507-0.595-0.537-0.454
50%0.5690.5700.5010.4600.6390.4890.5910.6190.4890.483
97.5%1.3451.2261.4731.2611.0391.4661.3651.1851.5401.477
max1.4651.8081.5161.3761.1861.6081.5891.3171.7901.711
-
- - - - -```python -scaler.transform(analysis.df_valid).describe(percentiles=[0.025, 0.975]) -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ACANPAAGSVILLENLRARFEELCSDLFRAVAEQIPLLVQGVREGPYDVVVLPGGNLGAQNLSESAAVKHGSLGFLPRIDIIPNPQERLALVTGGEIASTFDHPELVKLLEVEHPAAKSTESLQANVQRVNNSSLIGLGYTQTLKPGIK
count18.00018.00018.00018.00018.00018.00018.00018.00018.00018.000
mean0.6800.7290.4470.6170.6090.7960.5390.8550.5860.626
std0.4640.6330.4220.5240.3140.7480.6750.4330.6390.672
min-0.183-0.726-0.225-0.361-0.118-0.343-1.629-0.050-0.462-0.649
2.5%-0.172-0.557-0.182-0.2740.041-0.285-0.997-0.031-0.427-0.532
50%0.7760.8930.4160.7080.6000.8160.6440.9110.4980.594
97.5%1.3141.5121.1641.2681.1442.4251.3101.4921.4711.639
max1.3241.5401.2291.3311.1502.4841.4131.4941.4821.818
-
- - - - -```python -from torchvision import transforms -from torch.utils.data import DataLoader -from vaep.io.datasets import PeptideDatasetInMemoryNoMissings - -# ToDo: replace with helper class (see below) -tf_norm = None # replace with Normalizer - -dataset_train = PeptideDatasetInMemoryNoMissings( - data=scaler.transform(analysis.df_train), transform=tf_norm) -dataset_valid = PeptideDatasetInMemoryNoMissings( - data=scaler.transform(analysis.df_valid), transform=tf_norm) -dl_train = DataLoader(dataset_train, batch_size=args.batch_size, shuffle=True) -dl_valid = DataLoader(dataset_valid, batch_size=args.batch_size, shuffle=False) -``` - -### Without Noise - - -```python -model = vaep_model.Autoencoder(n_features=M, n_neurons=int( - M/2), last_decoder_activation=None, dim_latent=3).double() -criterion = torch.nn.MSELoss(reduction='sum') - -learning_rate = 1e-2 - -optimizer = torch.optim.Adam( - model.parameters(), - lr=learning_rate, -) - -# Train standard autoencoder (AE) - -train_losses, valid_losses = [], [] - -# do = nn.Dropout() # for denoising AE -for epoch in range(args.epochs): - # ===================train========================== - for data in dl_train: - model.train() - data = data.to(device) - # noise = do(torch.ones(data.shape)).to(device) # for denoising AE - # data_corrupted = (data * noise).to(device) # for denoising AE - # ===================forward===================== - output = model(data) - loss = criterion(output, data) - # ===================backward==================== - optimizer.zero_grad() - loss.backward() - optimizer.step() - train_losses.append(loss.item()) - # ===================validate======================== - for data in dl_valid: - model.eval() - data = data.to(device) - output = model(data) - loss = criterion(output, data) - valid_losses.append(loss.item()) - - # ===================log============================= - print(f'epoch [{epoch + 1:03d}/{args.epochs}], ' - f'train-loss: {np.mean(train_losses[-len(dl_train):]):.4f},' 
- f'valid-loss: {np.mean(valid_losses[-len(dl_valid):]):.4f}') -``` - - epoch [001/30], train-loss: 38.1450,valid-loss: 32.5391 - epoch [002/30], train-loss: 24.1494,valid-loss: 20.5401 - epoch [003/30], train-loss: 16.4426,valid-loss: 15.0347 - epoch [004/30], train-loss: 13.7267,valid-loss: 13.3773 - epoch [005/30], train-loss: 12.3260,valid-loss: 13.4774 - epoch [006/30], train-loss: 10.4638,valid-loss: 12.3712 - epoch [007/30], train-loss: 9.1209,valid-loss: 10.6333 - epoch [008/30], train-loss: 8.0506,valid-loss: 10.0156 - epoch [009/30], train-loss: 7.1928,valid-loss: 9.4267 - epoch [010/30], train-loss: 6.7741,valid-loss: 9.2030 - epoch [011/30], train-loss: 6.4391,valid-loss: 9.0134 - epoch [012/30], train-loss: 6.3059,valid-loss: 8.8587 - epoch [013/30], train-loss: 6.2833,valid-loss: 8.8802 - epoch [014/30], train-loss: 6.1364,valid-loss: 8.6886 - epoch [015/30], train-loss: 5.9696,valid-loss: 8.5828 - epoch [016/30], train-loss: 5.9153,valid-loss: 8.5040 - epoch [017/30], train-loss: 5.7050,valid-loss: 8.4550 - epoch [018/30], train-loss: 5.6063,valid-loss: 8.4865 - epoch [019/30], train-loss: 5.3881,valid-loss: 8.2474 - epoch [020/30], train-loss: 5.1049,valid-loss: 8.1140 - epoch [021/30], train-loss: 4.8991,valid-loss: 8.0341 - epoch [022/30], train-loss: 4.8181,valid-loss: 7.9130 - epoch [023/30], train-loss: 4.7504,valid-loss: 8.1090 - epoch [024/30], train-loss: 4.8021,valid-loss: 8.0977 - epoch [025/30], train-loss: 4.8373,valid-loss: 8.0948 - epoch [026/30], train-loss: 4.7489,valid-loss: 7.9340 - epoch [027/30], train-loss: 4.7265,valid-loss: 7.9318 - epoch [028/30], train-loss: 4.6440,valid-loss: 7.9384 - epoch [029/30], train-loss: 4.6102,valid-loss: 7.9085 - epoch [030/30], train-loss: 4.6293,valid-loss: 7.9636 - - - -```python -df_train_losses = vaep_model.process_train_loss({'MSE train': train_losses}) - -# Plotting is boilerplate code: -_ = df_train_losses.plot(kind='scatter', x='steps', y='MSE train smoothed', figsize=( - 15, 8), 
title='Exponential smoothed training loss', ylim=(0, None)) -df_train_losses.tail() -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
stepsMSE trainMSE train smoothed
2652657.1724.920
2662664.4604.874
2672672.3474.621
2682684.3294.592
2692693.4764.480
-
- - - - -![png](12_experiment_01_small_example_files/12_experiment_01_small_example_41_1.png) - - - -```python -def get_pred(model, dataloader): - pred = [] - model.eval() - for data in dataloader: - data = data.to(device) - output = model(data) - pred.append(output.detach().numpy()) - return pred - - -pred = get_pred(model, dl_valid) -analysis.pred_aa_simple = vaep_model.build_df_from_pred_batches( - pred, scaler, index=analysis.df_valid.index, columns=analysis.df_valid.columns) -analysis.pred_aa_simple -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ACANPAAGSVILLENLRARFEELCSDLFRAVAEQIPLLVQGVREGPYDVVVLPGGNLGAQNLSESAAVKHGSLGFLPRIDIIPNPQERLALVTGGEIASTFDHPELVKLLEVEHPAAKSTESLQANVQRVNNSSLIGLGYTQTLKPGIK
Sample ID
20181029_QE3_nLC3_KBE_QC_MNT_HELA_0229.88628.93925.85428.18128.54329.44828.92728.15228.87829.136
20181102_QE2_NLC10_MR_QC_MNT_HELA_0131.71430.06927.63930.21929.77731.21830.78629.18330.47330.859
20181107_QE6_nLC12_MR_QC_MNT_HELA_New_0131.21029.88026.97429.57029.69630.63230.29529.19030.03530.247
20181110_QE5_nLC5_OOE_QC_MNT_HELA_15cm_250ng_RO-00329.40028.52325.51727.74527.97629.06128.41027.61928.47828.768
20181112_QE7_nLC11_MEM_QC_HeLa_0231.51229.78427.71330.15029.22731.24230.46428.46030.32330.832
20181119_QE1_nLC2_TW_QC_HeLa_130.76929.45926.82329.21528.96630.44529.69728.26729.66230.003
20181120_QE5_nLC7_AP_HeLa_231.00029.55827.08529.51029.07230.67129.96028.38929.87330.252
20181126_QE2_NLC10_MN_QC_HELA_0230.51229.39126.36528.83029.11829.99029.58828.69429.42429.650
20181205_QE5_nLC7_RJC_QC_MNT_HeLa_229.36828.51125.46627.69827.98929.00728.39727.66528.44728.731
20181215_QE2_NLC10_ANHO_QC_MNT_HELA_0430.95929.66126.81529.34329.40030.44030.03228.90529.82230.073
20181219_QE1_nLC2_GP_QC_MNT_HELA_0130.14729.13726.04828.43928.81429.65729.21428.42629.10229.341
20190107_QE5_nLC5_DS_QC_MNT_HeLa_FlashPack_0230.59729.25126.76229.11728.66130.33029.52427.99929.53629.914
20190527_QE4_LC12_AS_QC_MNT_HeLa_0131.81330.19327.54230.27130.05931.13530.98929.67530.57030.831
20190527_QE4_LC12_AS_QC_MNT_HeLa_0231.88130.19727.69830.36929.99031.27231.01829.52130.61830.961
20190701_QE4_LC12_IAH_QC_MNT_HeLa_0331.57630.04027.34230.02229.88330.93830.73729.48330.36730.621
20190708_QE6_nLC4_JE_QC_MNT_HeLa_0131.81630.12127.78230.33629.79731.35830.86329.13730.55230.987
20191128_QE8_nLC9_ASD_QC_HeLa_131.63530.18027.26130.01230.12230.92230.79929.72430.41730.565
20191128_QE8_nLC9_ASD_QC_HeLa_1_2019112816531331.49030.11327.10229.83430.05730.78230.63929.65030.28830.411
-
- - - - -```python -display(analysis.df_valid) # true values -``` - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ACANPAAGSVILLENLRARFEELCSDLFRAVAEQIPLLVQGVREGPYDVVVLPGGNLGAQNLSESAAVKHGSLGFLPRIDIIPNPQERLALVTGGEIASTFDHPELVKLLEVEHPAAKSTESLQANVQRVNNSSLIGLGYTQTLKPGIK
Sample ID
20181029_QE3_nLC3_KBE_QC_MNT_HELA_0230.15728.67226.03827.95128.61029.35629.42727.96428.61528.887
20181102_QE2_NLC10_MR_QC_MNT_HELA_0131.99929.89727.72830.00129.96030.79130.56529.34429.92730.867
20181107_QE6_nLC12_MR_QC_MNT_HELA_New_0131.13829.90627.07129.64529.73830.69730.52628.87729.16329.929
20181110_QE5_nLC5_OOE_QC_MNT_HELA_15cm_250ng_RO-00329.47828.01225.67927.57028.11929.03728.65628.03028.08228.665
20181112_QE7_nLC11_MEM_QC_HeLa_0231.43029.94527.51130.04627.13830.91130.18129.21030.59030.611
20181119_QE1_nLC2_TW_QC_HeLa_131.04529.59826.50829.26029.00030.34130.12226.89629.19729.589
20181120_QE5_nLC7_AP_HeLa_230.79528.81627.43428.97229.69330.86029.91929.01729.13530.099
20181126_QE2_NLC10_MN_QC_HELA_0230.38629.24726.28128.60129.03529.99929.69128.79428.83329.895
20181205_QE5_nLC7_RJC_QC_MNT_HeLa_229.43227.39825.82328.07828.15629.23828.27528.90928.20828.280
20181215_QE2_NLC10_ANHO_QC_MNT_HELA_0430.43629.47626.75629.21329.19830.50529.49129.42229.37630.297
20181219_QE1_nLC2_GP_QC_MNT_HELA_0130.06228.98026.38827.93228.71029.78529.05329.07628.95129.246
20190107_QE5_nLC5_DS_QC_MNT_HeLa_FlashPack_0231.31530.20425.99529.47728.72030.65625.26329.30029.70630.483
20190527_QE4_LC12_AS_QC_MNT_HeLa_0132.03930.39926.77730.25529.09531.07930.90629.71630.75730.992
20190527_QE4_LC12_AS_QC_MNT_HeLa_0231.99630.48727.19930.29428.74231.08130.93229.74430.68431.114
20190701_QE4_LC12_IAH_QC_MNT_HeLa_0331.53530.13426.61429.43629.07530.95930.33129.94630.08631.134
20190708_QE6_nLC4_JE_QC_MNT_HeLa_0130.73630.80126.10630.55728.89031.37331.42627.00030.80031.724
20191128_QE8_nLC9_ASD_QC_HeLa_131.35530.90226.81530.09830.42332.99829.81730.49430.99629.837
20191128_QE8_nLC9_ASD_QC_HeLa_1_2019112816531331.31630.79326.55229.95030.45733.20029.19530.50731.03629.427
-
- - - -```python -analysis.results['Simple AE'] = describe_abs_diff( - y_true=analysis.df_valid, y_pred=analysis.pred_aa_simple) -pd.DataFrame(analysis.results['Simple AE']) -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ACANPAAGSVILLENLRARFEELCSDLFRAVAEQIPLLVQGVREGPYDVVVLPGGNLGAQNLSESAAVKHGSLGFLPRIDIIPNPQERLALVTGGEIASTFDHPELVKLLEVEHPAAKSTESLQANVQRVNNSSLIGLGYTQTLKPGIK
count18.00018.00018.00018.00018.00018.00018.00018.00018.00018.000
mean0.2610.4020.4260.2270.4680.3760.5940.6740.4030.351
std0.2680.3300.3900.1760.5520.6940.9810.5470.2330.265
min0.0400.0260.0600.0160.0340.0090.0410.0410.0660.008
25%0.0830.1580.1670.0910.0880.0580.1320.2450.2410.156
50%0.1890.2360.3440.1970.1920.1160.2640.5730.3390.247
75%0.2790.6800.5370.3280.7620.3020.5300.8360.5710.498
max1.0801.1131.6760.5852.0882.4184.2602.1370.8720.983
-
- - - -### With noise: Denoising AE - -- noise is added during training: some values are set to zero (which is the center for standard normalized intensities) -- noise model could be adapted to reflect the observed noise in the training data - > extrapolation to near future should hold - - -```python -model = vaep_model.Autoencoder(n_features=M, n_neurons=int( - M/2), last_decoder_activation=None, dim_latent=3).double() -criterion = torch.nn.MSELoss(reduction='sum') - -learning_rate = 1e-3 - -optimizer = torch.optim.Adam( - model.parameters(), - lr=learning_rate, -) - -# Train denoising autoencoder (AE) - -train_losses, valid_losses = [], [] - -do = torch.nn.Dropout() # for denoising AE -for epoch in range(args.epochs): - # ===================train========================== - for data in dl_train: - model.train() - data = data.to(device) - noise = do(torch.ones(data.shape)).to(device) # for denoising AE - data_corrupted = (data * noise).to(device) # for denoising AE - # ===================forward===================== - output = model(data) - loss = criterion(output, data) - # ===================backward==================== - optimizer.zero_grad() - loss.backward() - optimizer.step() - train_losses.append(loss.item()) - # ===================validate======================== - for data in dl_valid: - model.eval() - data = data.to(device) - output = model(data) - loss = criterion(output, data) - valid_losses.append(loss.item()) - - # ===================log============================= - print(f'epoch [{epoch + 1:03d}/{args.epochs}], ' - f'train-loss: {np.mean(train_losses[-len(dl_train):]):.4f},' - f'valid-loss: {np.mean(valid_losses[-len(dl_valid):]):.4f}') -``` - - epoch [001/30], train-loss: 51.9933,valid-loss: 50.1242 - epoch [002/30], train-loss: 48.2566,valid-loss: 47.1076 - epoch [003/30], train-loss: 44.9220,valid-loss: 44.3742 - epoch [004/30], train-loss: 41.9375,valid-loss: 41.8572 - epoch [005/30], train-loss: 39.2574,valid-loss: 39.4566 - epoch [006/30], 
train-loss: 36.6952,valid-loss: 37.2788 - epoch [007/30], train-loss: 34.3916,valid-loss: 35.2142 - epoch [008/30], train-loss: 32.2268,valid-loss: 33.2803 - epoch [009/30], train-loss: 30.2091,valid-loss: 31.4341 - epoch [010/30], train-loss: 28.2243,valid-loss: 29.7624 - epoch [011/30], train-loss: 26.4811,valid-loss: 28.1144 - epoch [012/30], train-loss: 24.8428,valid-loss: 26.5174 - epoch [013/30], train-loss: 23.2275,valid-loss: 25.1001 - epoch [014/30], train-loss: 21.7996,valid-loss: 23.7714 - epoch [015/30], train-loss: 20.5191,valid-loss: 22.5523 - epoch [016/30], train-loss: 19.3051,valid-loss: 21.4910 - epoch [017/30], train-loss: 18.2187,valid-loss: 20.5648 - epoch [018/30], train-loss: 17.2544,valid-loss: 19.7392 - epoch [019/30], train-loss: 16.4306,valid-loss: 18.9481 - epoch [020/30], train-loss: 15.6538,valid-loss: 18.2895 - epoch [021/30], train-loss: 14.9991,valid-loss: 17.6659 - epoch [022/30], train-loss: 14.4131,valid-loss: 17.1244 - epoch [023/30], train-loss: 13.8716,valid-loss: 16.6460 - epoch [024/30], train-loss: 13.4321,valid-loss: 16.2146 - epoch [025/30], train-loss: 13.0366,valid-loss: 15.8306 - epoch [026/30], train-loss: 12.6892,valid-loss: 15.4480 - epoch [027/30], train-loss: 12.3829,valid-loss: 15.1167 - epoch [028/30], train-loss: 12.0784,valid-loss: 14.8496 - epoch [029/30], train-loss: 11.8405,valid-loss: 14.6078 - epoch [030/30], train-loss: 11.6030,valid-loss: 14.3591 - - - -```python -df_train_losses = vaep_model.process_train_loss({'MSE train': train_losses}) - -# Plotting is boilerplate code: -_ = df_train_losses.plot(kind='scatter', x='steps', y='MSE train smoothed', figsize=( - 15, 8), title='Exponential smoothed training loss', ylim=(0, None)) -df_train_losses.tail() -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
stepsMSE trainMSE train smoothed
26526512.13411.484
26626612.07311.543
26726721.64412.553
2682688.12212.110
2692699.01011.800
-
- - - - -![png](12_experiment_01_small_example_files/12_experiment_01_small_example_47_1.png) - - - -```python -pred = get_pred(model, dl_valid) -analysis.pred_aa_denoised = vaep_model.build_df_from_pred_batches( - pred, scaler, index=analysis.df_valid.index, columns=analysis.df_valid.columns) -analysis.results['denoising AE'] = describe_abs_diff( - y_true=analysis.df_valid, y_pred=analysis.pred_aa_denoised) -pd.DataFrame(analysis.results['denoising AE']) -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ACANPAAGSVILLENLRARFEELCSDLFRAVAEQIPLLVQGVREGPYDVVVLPGGNLGAQNLSESAAVKHGSLGFLPRIDIIPNPQERLALVTGGEIASTFDHPELVKLLEVEHPAAKSTESLQANVQRVNNSSLIGLGYTQTLKPGIK
count18.00018.00018.00018.00018.00018.00018.00018.00018.00018.000
mean0.5880.7330.4340.5850.6830.5580.7231.2770.5840.593
std0.3430.4680.2310.3650.5600.7171.0170.5090.4350.367
min0.0380.0140.0790.0240.0230.0580.0400.4070.0240.045
25%0.4020.4090.2730.2840.2530.1700.2810.9720.2700.390
50%0.5010.6560.3960.6090.5530.3230.3911.2260.4380.578
75%0.7490.9810.4960.8171.0320.4880.7191.4510.9920.788
max1.1821.7000.9091.2581.8182.5644.5192.2181.3601.518
-
- - - -## Collaborative Filtering setup - -Components -- each sample has an embedding vector and an intercept -- each peptide has an embedding vector and an intercept -- scalar product of embeddings yields predictions - - - - -```python -from fastai.collab import CollabDataLoaders, MSELossFlat, Learner -from types import SimpleNamespace -# data format - -analysis.collab = Analysis() -collab = analysis.collab -collab.columns = 'peptide,Sample ID,intensity'.split(',') -``` - - -```python -analysis.collab -``` - - - - - Analysis(columns=['peptide', 'Sample ID', 'intensity']) - - - - -```python -collab.df = analysis.df.unstack().reset_index(drop=False).rename( - columns={'level_0': 'peptide', 0: 'intensity'}) -collab.df.head() -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
peptideSample IDintensity
0ACANPAAGSVILLENLR20151128_QE7_UPLC11_RJC_DEV_columnsTest_HeLa_0128.553
1ACANPAAGSVILLENLR20160105_QE6_nLC4_MM_QC_MNT_HELA_01_17010620180630.451
2ACANPAAGSVILLENLR20160311_QE6_LC6_SCL_QC_MNT_HeLa_0128.763
3ACANPAAGSVILLENLR20160401_QE6_nLC6_ASD_QC_HELA_0329.676
4ACANPAAGSVILLENLR20160404_QE2_nlc1_QC_hela_16040421012528.240
-
- - - - -```python -dls = CollabDataLoaders.from_df( - collab.df, user_name='Sample ID', item_name='peptide', rating_name='intensity', bs=64) -dls.show_batch() -``` - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Sample IDpeptideintensity
020180906_QE5_nLC5_KBE_QC_MNT_HELA_03IDIIPNPQER30.787
120160404_QE2_nlc1_QC_hela_160404210125AVAEQIPLLVQGVR25.963
220160820_QE4_nLC4_SCL_QC_HeLa_04LALVTGGEIASTFDHPELVK29.768
320180927_QE1_nLC10_GP_QC_HELA_G8ARFEELCSDLFR29.601
420160404_QE2_nlc1_QC_hela_160404210125VNNSSLIGLGYTQTLKPGIK28.648
520180518_QE4_nLC6_MR_QC_MNT_HeLa_6EGPYDVVVLPGGNLGAQNLSESAAVK29.821
620180518_QE4_nLC6_MR_QC_MNT_HeLa_6AVAEQIPLLVQGVR26.952
720180926_QE7_nLC11_AL_QC_HeLa_08ARFEELCSDLFR30.108
820180226_QE10_nLC0_MR_QC_MNT_Hela_02EGPYDVVVLPGGNLGAQNLSESAAVK29.131
920180514_LUMOS1_LC4_SCL-IAH_QC_MNT_HeLa_01EGPYDVVVLPGGNLGAQNLSESAAVK29.584
- - - -```python -# dls.classes -``` - - -```python -import fastai.torch_core -device = torch.device('cpu') -fastai.torch_core.defaults.device = torch.device('cpu') - - -collab.model_args = {} -collab.model_args['n_samples'] = len(dls.classes['Sample ID']) -collab.model_args['n_peptides'] = len(dls.classes['peptide']) -collab.model_args['dim_latent_factors'] = 5 -collab.model_args['y_range'] = ( - int(collab.df['intensity'].min()), int(collab.df['intensity'].max())+1) - -collab.model_args -``` - - - - - {'n_samples': 91, - 'n_peptides': 11, - 'dim_latent_factors': 5, - 'y_range': (24, 34)} - - - - -```python -model = vaep_model.DotProductBias(**collab.model_args) -learn = Learner(dls, model, loss_func=MSELossFlat()) -learn.summary() -``` - - - - - - - - - DotProductBias (Input shape: 64) - ============================================================================ - Layer (type) Output Shape Param # Trainable - ============================================================================ - 64 x 5 - Embedding 455 True - ____________________________________________________________________________ - 64 x 1 - Embedding 91 True - ____________________________________________________________________________ - 64 x 5 - Embedding 55 True - ____________________________________________________________________________ - 64 x 1 - Embedding 11 True - ____________________________________________________________________________ - - Total params: 612 - Total trainable params: 612 - Total non-trainable params: 0 - - Optimizer used: - Loss function: FlattenedLoss of MSELoss() - - Callbacks: - - TrainEvalCallback - - Recorder - - ProgressCallback - - - - -```python -learn.fit_one_cycle(args.epochs, 5e-3) -``` - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
epochtrain_lossvalid_losstime
02.0785302.10675000:00
12.0654502.07347800:00
22.0411032.00357400:00
31.9894701.88231900:00
41.9205851.69130800:00
51.8137201.42759800:00
61.6584761.13402500:00
71.4718000.91911000:00
81.2904010.77250900:00
91.1241440.69594400:00
100.9827990.65233200:00
110.8617230.61931400:00
120.7600980.59521000:00
130.6757580.58510700:00
140.6059570.57420600:00
150.5466770.56800100:00
160.4969980.56149300:00
170.4531540.55999900:00
180.4174440.55936400:00
190.3861490.55606500:00
200.3603500.55581200:00
210.3386050.55623400:00
220.3200730.55542900:00
230.3049580.55455100:00
240.2923980.55438600:00
250.2815760.55409200:00
260.2728280.55389400:00
270.2653580.55390700:00
280.2583390.55387100:00
290.2536410.55386200:00
- - - -```python -# this shows it along the mini-batches, no easy customization -learn.recorder.plot_loss() -``` - - -![png](12_experiment_01_small_example_files/12_experiment_01_small_example_58_0.png) - - - -```python -# learn.recorder.plot_loss?? -import matplotlib.pyplot as plt -from fastcore.foundation import L - - -def plot_loss(self, skip_start=5, with_valid=True, ax=None): - if not ax: - fig, ax = plt.subplots() - ax.plot(list(range(skip_start, len(self.losses))), - self.losses[skip_start:], label='train') - if with_valid: - idx = (np.array(self.iters) < skip_start).sum() - ax.plot(self.iters[idx:], L( - self.values[idx:]).itemgot(1), label='valid') - ax.legend() - - -fig, ax = plt.subplots(figsize=(10, 8)) -plot_loss(learn.recorder, ax=ax) -``` - - -![png](12_experiment_01_small_example_files/12_experiment_01_small_example_59_0.png) - - - -```python -# epoch values -for x in learn.recorder.values: - print(x) - break -``` - - [2.0785300731658936, 2.106750249862671] - - - -```python -# get_preds is overloaded, but hardly documented https://docs.fast.ai/learner.html#Learner.get_preds -encodings, pred, target = learn.get_preds( - with_input=True) # per default validation data -``` - - - - - -The analysis concept changes. Here only the (masked) missing peptides could be assessed - without the having entire samples as validation cohorts. Although there is no need for a complete sample, one needs at least some information of a sample to train the sample embedding, leading to a change in the setup. - - - Collaborative Filtering can be trained on all available data to infer the missing peptides - - -```python -pred_df = pd.DataFrame([{'Sample ID': dls.classes['Sample ID'][obs[0]], 'peptide': dls.classes['peptide'] - [obs[1]], 'intensity': pred_intensity.item()} for obs, pred_intensity in zip(encodings, pred)]) -pred_df = pred_df.pivot(index='Sample ID', columns='peptide') -pred_df -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
intensity
peptideACANPAAGSVILLENLRARFEELCSDLFRAVAEQIPLLVQGVREGPYDVVVLPGGNLGAQNLSESAAVKHGSLGFLPRIDIIPNPQERLALVTGGEIASTFDHPELVKLLEVEHPAAKSTESLQANVQRVNNSSLIGLGYTQTLKPGIK
Sample ID
20151128_QE7_UPLC11_RJC_DEV_columnsTest_HeLa_01NaNNaN27.756NaNNaNNaNNaNNaN27.528NaN
20160105_QE6_nLC4_MM_QC_MNT_HELA_01_17010620180630.812NaNNaN29.117NaNNaN29.511NaNNaNNaN
20160311_QE6_LC6_SCL_QC_MNT_HeLa_01NaN28.138NaNNaNNaN29.065NaNNaNNaN28.786
20160401_QE6_nLC6_ASD_QC_HELA_03NaNNaN27.595NaNNaNNaN28.767NaNNaN29.293
20160404_QE2_nlc1_QC_hela_160404210125NaNNaNNaNNaN27.622NaNNaNNaN28.218NaN
.................................
20190527_QE4_LC12_AS_QC_MNT_HeLa_02NaN30.277NaNNaNNaNNaNNaNNaNNaNNaN
20190701_QE4_LC12_IAH_QC_MNT_HeLa_03NaNNaN26.69329.811NaNNaNNaN28.875NaNNaN
20190708_QE6_nLC4_JE_QC_MNT_HeLa_01NaNNaNNaNNaNNaN31.248NaN29.23030.50830.678
20191128_QE8_nLC9_ASD_QC_HeLa_1NaNNaNNaNNaNNaN30.962NaNNaNNaNNaN
20191128_QE8_nLC9_ASD_QC_HeLa_1_20191128165313NaNNaN26.942NaNNaNNaNNaN29.263NaNNaN
-

81 rows × 10 columns

-
- - - -PyTorch Model used directly: - - -```python -valid_dl = learn.dls.valid -model.to(device) -for X, target in valid_dl: - print(learn.model(X[:1])) - break -``` - - tensor([[29.6450]], grad_fn=) - - -Switching from DotProduct to FNN based on embeddings as implemented in fastai - - -```python -from fastai.collab import collab_learner -from fastai.collab import get_emb_sz -# get_emb_sz?? -``` - - -```python -get_emb_sz(dls) # default embedding sizes based on dataloader for NN -``` - - - - - [(91, 20), (11, 6)] - - - - -```python -collab.model_args # from above -``` - - - - - {'n_samples': 91, - 'n_peptides': 11, - 'dim_latent_factors': 5, - 'y_range': (24, 34)} - - - - -```python -from fastai.collab import collab_learner - -learn = collab_learner( - dls, use_nn=True, y_range=collab.model_args['y_range'], layers=[20, 10]) -learn.summary() -``` - - - - - - - - - EmbeddingNN (Input shape: 64) - ============================================================================ - Layer (type) Output Shape Param # Trainable - ============================================================================ - 64 x 20 - Embedding 1820 True - ____________________________________________________________________________ - 64 x 6 - Embedding 66 True - Dropout - BatchNorm1d 52 True - ____________________________________________________________________________ - 64 x 20 - Linear 520 True - ReLU - BatchNorm1d 40 True - ____________________________________________________________________________ - 64 x 10 - Linear 200 True - ReLU - ____________________________________________________________________________ - 64 x 1 - Linear 11 True - SigmoidRange - ____________________________________________________________________________ - - Total params: 2,709 - Total trainable params: 2,709 - Total non-trainable params: 0 - - Optimizer used: - Loss function: FlattenedLoss of MSELoss() - - Callbacks: - - TrainEvalCallback - - Recorder - - ProgressCallback - - - - -```python -learn.fit_one_cycle(30, 
5e-3, wd=0.1) -``` - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
epochtrain_lossvalid_losstime
02.3441682.17873500:00
11.9763072.22459400:00
21.6086462.11144600:00
31.2633721.71863700:00
41.0269901.11641100:00
50.8677410.83868200:00
60.7521320.53102600:00
70.6601340.48302300:00
80.5760400.48164200:00
90.5184520.48623300:00
100.4693520.45509000:00
110.4246550.44063300:00
120.3889930.49356500:00
130.3536090.45789900:00
140.3264790.46954000:00
150.2958370.46290800:00
160.2766310.49850800:00
170.2573940.49494000:00
180.2468680.50421500:00
190.2286920.46902800:00
200.2135720.49966800:00
210.1996120.48103700:00
220.1865800.47063900:00
230.1733270.48267800:00
240.1630540.49452100:00
250.1541270.48407000:00
260.1451590.48387400:00
270.1387940.48547100:00
280.1340030.48147700:00
290.1346710.48656200:00
- - -## VAE - -### Transform: Non-log transformed data (Single run) - -Scale samples according to training data - - -```python -from sklearn.preprocessing import MinMaxScaler - -args_vae = {} -args_vae['SCALER'] = MinMaxScaler -# select initial data: transformed vs not log transformed -scaler = args_vae['SCALER']().fit(analysis.df_train) -scaler.transform(analysis.df_valid.iloc[:5]) -``` - - - - - array([[0.47445019, 0.36365456, 0.36549262, 0.26387401, 0.66955657, - 0.30150902, 0.48590535, 0.64463529, 0.30895769, 0.21176791], - [0.92999583, 0.65836564, 0.87751846, 0.82593777, 0.89923995, - 0.69385728, 0.73439981, 0.8750341 , 0.6227158 , 0.7927601 ], - [0.71693449, 0.66050277, 0.67825525, 0.72851001, 0.86152074, - 0.66832776, 0.72579736, 0.79702518, 0.43990085, 0.5176384 ], - [0.30641543, 0.20464727, 0.25665079, 0.15941278, 0.58609146, - 0.21429095, 0.31743156, 0.65568975, 0.18152343, 0.14663306], - [0.78921018, 0.67000572, 0.8116531 , 0.83837227, 0.41929881, - 0.72668818, 0.65039402, 0.85251152, 0.78133607, 0.71769148]]) - - - -### Dataloaders - - -```python -from vaep.io.datasets import PeptideDatasetInMemoryNoMissings -from vaep.io.dataloaders import DataLoadersCreator - -data_loader_creator = DataLoadersCreator( - df_train=analysis.df_train, - df_valid=analysis.df_valid, - scaler=scaler, - DataSetClass=PeptideDatasetInMemoryNoMissings, - batch_size=args.batch_size) - -dl_train, dl_valid = data_loader_creator.get_dls(shuffle_train=True) - -logger.info( - "N train: {:5,d} \nN valid: {:5,d}".format( - len(dl_train.dataset), len(dl_valid.dataset)) -) -``` - - vaep - INFO N train: 72 - N valid: 18 - - -### Model - - -```python -from torch.nn import functional as F -from torch.nn import Sigmoid -from vaep.model import VAE - -n_neurons = 6 -logger.info(f'Latent layer neurons: {n_neurons}') - -model = vaep_model.VAE(n_features=n_features, - n_neurons=n_neurons, - last_decoder_activation=Sigmoid, - last_encoder_activation=None, - dim_latent=4).double() -model = 
model.to(device) - -logger.info(model) -``` - - vaep - INFO Latent layer neurons: 6 - vaep - INFO VAE( - (decoder): Sequential( - (0): Linear(in_features=4, out_features=6, bias=True) - (1): Tanh() - (2): Linear(in_features=6, out_features=10, bias=True) - (3): Sigmoid() - ) - (encoder): Sequential( - (0): Linear(in_features=10, out_features=6, bias=True) - (1): Tanh() - (2): Linear(in_features=6, out_features=8, bias=True) - ) - ) - - -### Optimizers - - - -```python -from torch import optim -optimizer = optim.Adam(params=model.parameters(), - lr=1e-3) -``` - -### Tensorboard - - -```python -if ADD_TENSORBOARD: - tensorboard_model_namer = TensorboardModelNamer( - prefix_folder='experiment_01') - writer = tensorboard_model_namer.get_writer(1, [n_neurons], 'scaler') - logger.info(f"Logging to: {writer.get_logdir()}") - - # data, mask = next(iter(dl_train)) - # writer.add_image( - # f'{len(mask)} mask for this batch of samples', mask, dataformats='HW') - - data = next(iter(dl_train)) - writer.add_image( - f'{len(data)} batch of sampled data (as heatmap)', data, dataformats='HW') - - # ToDo: compiler warning: error or tracer error? - writer.add_graph(model, input_to_model=data.to( - device)) # try to add after training? 
- writer.flush() -``` - -### Training Loop - - -```python -from collections import defaultdict -from functools import partial - -import torch - - -def run_experiment(model, dls, writer, args): - metrics = defaultdict(dict) - metrics_per_batch = defaultdict(list) - dl_train, dl_valid = dls - msg_eval_epoch = "Validation Set - Epoch: {:3d} - loss: {:7.3f} - mse: {:5.3f} - KLD: {:5.3f}" - - def _append_batch_metrics(batch_metrics_epoch, d_metrics=metrics_per_batch, dataset_name='train'): - """Append single batch metrics to global dictionary.""" - for d in batch_metrics_epoch.values(): - for key, value in d.items(): - d_metrics[(dataset_name, key)].append(d[key]) - return None # Signal in-place operation - - def _agg_metric_per_epoch(batch_metrics_epoch, epoch, d_metrics=metrics, dataset_name='train'): - keys = next(iter(batch_metrics_epoch.values())).keys() - for key in keys: - d_metrics[(dataset_name, key)][epoch] = np.mean([d[key] - for d in batch_metrics_epoch.values()]) - return None # Signal in-place operation - - for epoch in range(1, args.epochs+1): - _epoch_metrics = vaep_model.train(model=model, train_loader=dl_train, - optimizer=optimizer, device=device) - n_batches = len(dl_train) - - _append_batch_metrics(_epoch_metrics) - - _agg_metric_per_epoch(_epoch_metrics, epoch) - - _epoch_metrics_valid = vaep_model.evaluate( - model=model, data_loader=dl_valid, device=device) - n_batches = len(dl_valid) - _append_batch_metrics(_epoch_metrics_valid, dataset_name='valid') - _agg_metric_per_epoch(_epoch_metrics_valid, - epoch, dataset_name='valid') - - if writer: - writer.add_scalar('avg validation loss', - _epoch_metric_valid['loss'] / n_batchnes, - epoch) - - return dict(metrics), dict(metrics_per_batch) -``` - - -```python -metrics, metrics_per_batch = run_experiment(model=model, dls=( - dl_train, dl_valid), writer=None, args=args) # decide about format -``` - - -```python -df_train_losses = vaep_model.process_train_loss( - {'training loss': 
metrics_per_batch[('train', 'loss')]}) - -# Plotting is boilerplate code: -_ = df_train_losses.plot(kind='scatter', x='steps', y='training loss smoothed', figsize=( - 15, 8), title='Exponential smoothed training loss', ylim=(0, None)) -df_train_losses -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
stepstraining losstraining loss smoothed
007.3097.309
117.4977.408
227.2217.339
337.3267.335
447.3607.341
............
2652656.7176.810
2662666.8116.810
2672676.6376.792
2682686.7286.786
2692696.7046.778
-

270 rows × 3 columns

-
- - - - -![png](12_experiment_01_small_example_files/12_experiment_01_small_example_86_1.png) - - - -```python -metrics_per_batch.keys() -``` - - - - - dict_keys([('train', 'loss'), ('train', 'recon_loss'), ('train', 'KLD'), ('valid', 'loss'), ('valid', 'recon_loss'), ('valid', 'KLD')]) - - - -### One epoch - - -```python -logger.setLevel(logging.DEBUG) -batch_metrics_last_epoch = vaep_model.train(model=model, train_loader=dl_train, - optimizer=optimizer, device=device) -pd.DataFrame.from_dict(batch_metrics_last_epoch, orient='index') -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
lossrecon_lossKLD
06.9236.9170.006
17.0577.0490.009
26.7686.7640.005
36.4746.4700.004
46.7206.7130.007
56.8356.8270.009
66.7116.7050.007
76.7036.7000.003
86.9916.9790.013
-
- - - -Currently: No improvements - -#### Performance plots - - -```python -metrics = pd.DataFrame(metrics) -_ = metrics.plot( - figsize=(18, 6), xlim=(1, args.epochs)) -``` - - -![png](12_experiment_01_small_example_files/12_experiment_01_small_example_92_0.png) - - - -```python -metrics -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
trainvalid
lossrecon_lossKLDlossrecon_lossKLD
17.2986.9660.3681.1990.9090.322
27.2296.9770.2801.1270.9120.238
37.1286.9400.2091.0720.9170.172
47.0996.9610.1541.0290.9190.122
57.0046.9040.1110.9980.9210.085
66.9996.9270.0800.9770.9220.061
76.9616.9090.0580.9620.9210.045
86.9296.8910.0430.9490.9180.034
96.9306.9010.0330.9360.9100.029
106.9156.8910.0270.9250.9030.025
116.9186.8970.0230.9150.8960.022
126.8696.8510.0200.9080.8910.019
136.9186.9020.0170.9010.8850.018
146.9236.9090.0160.8960.8790.019
156.8476.8330.0160.8900.8740.019
166.8846.8710.0140.8850.8690.017
176.8876.8750.0140.8780.8620.017
186.8386.8260.0130.8720.8570.017
196.8626.8510.0130.8680.8530.017
206.8366.8260.0120.8630.8490.016
216.8496.8390.0110.8580.8440.015
226.8156.8060.0110.8510.8380.015
236.8516.8420.0100.8470.8340.015
246.8266.8170.0100.8420.8290.015
256.8146.8060.0090.8370.8240.015
266.8246.8160.0090.8310.8180.015
276.8166.8090.0080.8280.8140.015
286.8066.7980.0080.8220.8090.015
296.7796.7710.0080.8150.8020.014
306.7796.7710.0080.8100.7980.013
-
- - - - -```python -selected = [(_split, _metric) - for _split in ['train', 'valid'] - for _metric in ['loss'] - ] -_ = metrics[selected].plot( - figsize=(18, 6)) -``` - - -![png](12_experiment_01_small_example_files/12_experiment_01_small_example_94_0.png) - - -### Predictions validation set - - -```python -_epoch_metric_valid, pred = vaep_model.evaluate( - model=model, data_loader=dl_valid, device=device, return_pred=True) -# raw predictions -pd.DataFrame(np.vstack(pred), index=analysis.df_valid.index, - columns=analysis.df_valid.columns) -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ACANPAAGSVILLENLRARFEELCSDLFRAVAEQIPLLVQGVREGPYDVVVLPGGNLGAQNLSESAAVKHGSLGFLPRIDIIPNPQERLALVTGGEIASTFDHPELVKLLEVEHPAAKSTESLQANVQRVNNSSLIGLGYTQTLKPGIK
Sample ID
20181029_QE3_nLC3_KBE_QC_MNT_HELA_020.5650.5280.5580.5980.6690.4990.5080.6280.5390.481
20181102_QE2_NLC10_MR_QC_MNT_HELA_010.5650.5300.5580.5950.6650.4980.5090.6240.5400.480
20181107_QE6_nLC12_MR_QC_MNT_HELA_New_010.5670.5300.5580.5960.6650.4980.5090.6240.5400.481
20181110_QE5_nLC5_OOE_QC_MNT_HELA_15cm_250ng_RO-0030.5640.5260.5580.6000.6710.5000.5060.6310.5380.481
20181112_QE7_nLC11_MEM_QC_HeLa_020.5650.5280.5580.5980.6680.4990.5070.6280.5380.480
20181119_QE1_nLC2_TW_QC_HeLa_10.5660.5290.5590.5960.6650.4980.5090.6240.5400.480
20181120_QE5_nLC7_AP_HeLa_20.5650.5290.5580.5970.6670.4990.5080.6260.5390.481
20181126_QE2_NLC10_MN_QC_HELA_020.5650.5280.5580.5980.6680.4990.5070.6280.5390.480
20181205_QE5_nLC7_RJC_QC_MNT_HeLa_20.5650.5260.5580.6000.6720.5000.5060.6320.5370.481
20181215_QE2_NLC10_ANHO_QC_MNT_HELA_040.5650.5280.5580.5970.6680.4990.5070.6280.5380.480
20181219_QE1_nLC2_GP_QC_MNT_HELA_010.5650.5270.5580.5980.6690.4990.5070.6290.5380.481
20190107_QE5_nLC5_DS_QC_MNT_HeLa_FlashPack_020.5610.5240.5570.5970.6730.4990.5010.6330.5370.476
20190527_QE4_LC12_AS_QC_MNT_HeLa_010.5660.5290.5580.5970.6660.4980.5080.6260.5390.480
20190527_QE4_LC12_AS_QC_MNT_HeLa_020.5660.5290.5580.5970.6660.4980.5080.6260.5390.480
20190701_QE4_LC12_IAH_QC_MNT_HeLa_030.5650.5280.5580.5970.6680.4990.5070.6280.5380.480
20190708_QE6_nLC4_JE_QC_MNT_HeLa_010.5650.5280.5580.5950.6660.4980.5070.6250.5390.478
20191128_QE8_nLC9_ASD_QC_HeLa_10.5680.5300.5570.5980.6660.4980.5080.6270.5390.481
20191128_QE8_nLC9_ASD_QC_HeLa_1_201911281653130.5680.5290.5570.5980.6660.4980.5080.6270.5380.481
-
- - - - -```python -# integrate label in dataloader -analysis.pred_vae = vaep_model.build_df_from_pred_batches( - pred, scaler, index=analysis.df_valid.index, columns=analysis.df_valid.columns) -analysis.pred_vae -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ACANPAAGSVILLENLRARFEELCSDLFRAVAEQIPLLVQGVREGPYDVVVLPGGNLGAQNLSESAAVKHGSLGFLPRIDIIPNPQERLALVTGGEIASTFDHPELVKLLEVEHPAAKSTESLQANVQRVNNSSLIGLGYTQTLKPGIK
Sample ID
20181029_QE3_nLC3_KBE_QC_MNT_HELA_0230.52329.35426.67429.17028.60430.08029.52827.86229.57629.804
20181102_QE2_NLC10_MR_QC_MNT_HELA_0130.52529.36226.67429.16028.58030.07429.53227.84029.58029.801
20181107_QE6_nLC12_MR_QC_MNT_HELA_New_0130.53029.36326.67429.16328.58130.07529.53527.84129.57929.804
20181110_QE5_nLC5_OOE_QC_MNT_HELA_15cm_250ng_RO-00330.51929.34626.67329.17528.62130.08229.52027.88129.57129.803
20181112_QE7_nLC11_MEM_QC_HeLa_0230.52529.35426.67329.16828.60230.07729.52427.86429.57429.801
20181119_QE1_nLC2_TW_QC_HeLa_130.52629.36026.67629.16228.58530.07529.53427.84129.58029.802
20181120_QE5_nLC7_AP_HeLa_230.52529.35826.67429.16628.59330.07729.53027.85229.57729.803
20181126_QE2_NLC10_MN_QC_HELA_0230.52329.35426.67329.16828.60130.07829.52527.86229.57529.802
20181205_QE5_nLC7_RJC_QC_MNT_HeLa_230.52229.34626.67229.17828.62430.08329.52027.88629.57029.805
20181215_QE2_NLC10_ANHO_QC_MNT_HELA_0430.52329.35326.67329.16728.60230.07729.52327.86429.57429.800
20181219_QE1_nLC2_GP_QC_MNT_HELA_0130.52329.35226.67329.17128.60830.07929.52427.86929.57429.803
20190107_QE5_nLC5_DS_QC_MNT_HeLa_FlashPack_0230.50829.33726.66929.16728.62930.07829.49827.89729.56729.788
20190527_QE4_LC12_AS_QC_MNT_HeLa_0130.52729.35926.67329.16428.58930.07529.52927.85229.57729.802
20190527_QE4_LC12_AS_QC_MNT_HeLa_0230.52629.35926.67329.16528.59030.07529.52827.85229.57729.801
20190701_QE4_LC12_IAH_QC_MNT_HeLa_0330.52329.35426.67229.16728.60030.07729.52327.86329.57429.801
20190708_QE6_nLC4_JE_QC_MNT_HeLa_0130.52229.35626.67529.15928.59030.07329.52427.84929.57729.794
20191128_QE8_nLC9_ASD_QC_HeLa_130.53429.36226.67129.16928.58830.07429.53027.85629.57529.805
20191128_QE8_nLC9_ASD_QC_HeLa_1_2019112816531330.53429.36126.67129.17128.59130.07529.52827.86029.57429.805
-
- - - - -```python -analysis.df_valid -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ACANPAAGSVILLENLRARFEELCSDLFRAVAEQIPLLVQGVREGPYDVVVLPGGNLGAQNLSESAAVKHGSLGFLPRIDIIPNPQERLALVTGGEIASTFDHPELVKLLEVEHPAAKSTESLQANVQRVNNSSLIGLGYTQTLKPGIK
Sample ID
20181029_QE3_nLC3_KBE_QC_MNT_HELA_0230.15728.67226.03827.95128.61029.35629.42727.96428.61528.887
20181102_QE2_NLC10_MR_QC_MNT_HELA_0131.99929.89727.72830.00129.96030.79130.56529.34429.92730.867
20181107_QE6_nLC12_MR_QC_MNT_HELA_New_0131.13829.90627.07129.64529.73830.69730.52628.87729.16329.929
20181110_QE5_nLC5_OOE_QC_MNT_HELA_15cm_250ng_RO-00329.47828.01225.67927.57028.11929.03728.65628.03028.08228.665
20181112_QE7_nLC11_MEM_QC_HeLa_0231.43029.94527.51130.04627.13830.91130.18129.21030.59030.611
20181119_QE1_nLC2_TW_QC_HeLa_131.04529.59826.50829.26029.00030.34130.12226.89629.19729.589
20181120_QE5_nLC7_AP_HeLa_230.79528.81627.43428.97229.69330.86029.91929.01729.13530.099
20181126_QE2_NLC10_MN_QC_HELA_0230.38629.24726.28128.60129.03529.99929.69128.79428.83329.895
20181205_QE5_nLC7_RJC_QC_MNT_HeLa_229.43227.39825.82328.07828.15629.23828.27528.90928.20828.280
20181215_QE2_NLC10_ANHO_QC_MNT_HELA_0430.43629.47626.75629.21329.19830.50529.49129.42229.37630.297
20181219_QE1_nLC2_GP_QC_MNT_HELA_0130.06228.98026.38827.93228.71029.78529.05329.07628.95129.246
20190107_QE5_nLC5_DS_QC_MNT_HeLa_FlashPack_0231.31530.20425.99529.47728.72030.65625.26329.30029.70630.483
20190527_QE4_LC12_AS_QC_MNT_HeLa_0132.03930.39926.77730.25529.09531.07930.90629.71630.75730.992
20190527_QE4_LC12_AS_QC_MNT_HeLa_0231.99630.48727.19930.29428.74231.08130.93229.74430.68431.114
20190701_QE4_LC12_IAH_QC_MNT_HeLa_0331.53530.13426.61429.43629.07530.95930.33129.94630.08631.134
20190708_QE6_nLC4_JE_QC_MNT_HeLa_0130.73630.80126.10630.55728.89031.37331.42627.00030.80031.724
20191128_QE8_nLC9_ASD_QC_HeLa_131.35530.90226.81530.09830.42332.99829.81730.49430.99629.837
20191128_QE8_nLC9_ASD_QC_HeLa_1_2019112816531331.31630.79326.55229.95030.45733.20029.19530.50731.03629.427
-
- - - -Absolute differences between VAE prediction and true values - - -```python -analysis.results['VAE'] = describe_abs_diff( - y_true=analysis.df_valid, y_pred=analysis.pred_vae) -pd.DataFrame(analysis.results['VAE']) -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ACANPAAGSVILLENLRARFEELCSDLFRAVAEQIPLLVQGVREGPYDVVVLPGGNLGAQNLSESAAVKHGSLGFLPRIDIIPNPQERLALVTGGEIASTFDHPELVKLLEVEHPAAKSTESLQANVQRVNNSSLIGLGYTQTLKPGIK
count18.00018.00018.00018.00018.00018.00018.00018.00018.00018.000
mean0.7540.8470.4810.7870.7140.9700.9381.3520.8350.784
std0.4580.5310.3320.4720.5980.8080.9690.7010.4640.552
min0.0870.1070.0590.0460.0050.0790.0310.1020.1390.032
25%0.3890.5370.1490.3530.3290.5890.3470.9640.4230.316
50%0.7950.7310.4610.8590.4890.8080.7321.2760.8510.753
75%1.0341.2830.7391.1221.1431.0051.1921.7881.2121.177
max1.5121.9481.0541.6051.8653.1264.2342.6471.4891.930
-
- - - -Absolute differences in case of mean prediction using **training** data means - - -```python -(analysis.df_valid - analysis.df_train.mean()).abs().describe() -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ACANPAAGSVILLENLRARFEELCSDLFRAVAEQIPLLVQGVREGPYDVVVLPGGNLGAQNLSESAAVKHGSLGFLPRIDIIPNPQERLALVTGGEIASTFDHPELVKLLEVEHPAAKSTESLQANVQRVNNSSLIGLGYTQTLKPGIK
count18.00018.00018.00018.00018.00018.00018.00018.00018.00018.000
mean0.7260.8700.4840.8100.6300.8780.9311.1470.8350.760
std0.4350.5380.3330.4530.5830.7730.9800.6020.4730.549
min0.1220.0470.0550.1180.0150.0620.0840.1520.1610.011
25%0.4380.5090.1600.4130.1710.4370.3530.7530.3900.324
50%0.7020.7460.4590.9330.3910.6560.6801.1390.8200.683
75%1.0821.2600.7261.1530.9701.0111.2231.4721.2451.191
max1.4261.8961.0281.5201.7002.9224.3122.3241.4911.840
-
- - - -### Mean and logvar - - -```python -dl_train, dl_valid = data_loader_creator.get_dls( - shuffle_train=False) # to have know the samples - -latent_space = defaultdict(list) -model.eval() - - -def _add_pred_to_d(d, k, pred_fct): - _recon, _mu, _logvar = pred_fct(x) - _recon, _mu, _logvar = _recon.detach().numpy(), _mu.detach().numpy(), _logvar.detach().numpy() - d[(k, "mu")].append(_mu) - d[(k, "logvar")].append(_logvar) - d[(k, "recon")].append(_recon) - - -for x in dl_train: - key = 'train' - _add_pred_to_d(d=latent_space, k=key, pred_fct=model) -for x in dl_valid: - key = 'valid' - _add_pred_to_d(d=latent_space, k=key, pred_fct=model) - -# import importlib; importlib.reload(vaep_model) -for (split, stat), arrays in latent_space.items(): - _index = getattr(analysis.indices, split) - latent_space[(split, stat)] = vaep_model.build_df_from_pred_batches( - pred=arrays, index=_index) -``` - - -```python -latent_space.keys() -``` - - - - - dict_keys([('train', 'mu'), ('train', 'logvar'), ('train', 'recon'), ('valid', 'mu'), ('valid', 'logvar'), ('valid', 'recon')]) - - - -PCA plot of latent means - - -```python -# analysis.vae.mu -vae_mu = pd.concat([ - latent_space[('train', 'mu')], - latent_space[('valid', 'mu')] -], keys=['train', 'valid']) -vae_mu - -pca = run_pca(vae_mu) -``` - - -```python -split = 'train' -ax = pca.loc[split].plot.scatter( - x=cols[0], y=cols[1], title='First two PCs of mu', color='blue', label=split) -split = 'valid' -ax = pca.loc[split].plot.scatter( - x=cols[0], y=cols[1], title='First two PCs of encoding(mu)', color='orange', label=split, ax=ax) -``` - - -![png](12_experiment_01_small_example_files/12_experiment_01_small_example_108_0.png) - - - -```python -cols = list(pca.columns) - -fig, axes = plt.subplots(ncols=2, figsize=(15, 8)) - -# by split -ax = axes[0] - -split = 'train' -ax = pca.loc[split].plot.scatter( - x=cols[0], y=cols[1], title='First two PCs of mu', color='blue', label=split, ax=ax) -split = 'valid' -ax = 
pca.loc[split].plot.scatter( - x=cols[0], y=cols[1], title='First two PCs of encoding(mu)', color='orange', label=split, ax=ax) - -# by dates -ax = axes[1] -ax = scatter_plot_w_dates(ax, pca, dates=analysis.df_meta.date) - -loc = mdates.AutoDateLocator() -_ = fig.colorbar(ax, ticks=loc, - format=mdates.AutoDateFormatter(loc)) - -figures[('pca', 'vae')] = fig -``` - - -![png](12_experiment_01_small_example_files/12_experiment_01_small_example_109_0.png) - - -Compare to original PCA - - -```python -figures[('pca', 'original')] # pca on std-normalized data -``` - - - - -![png](12_experiment_01_small_example_files/12_experiment_01_small_example_111_0.png) - - - -Distance of samples from mean sample - - -```python -def euclidian_distance(df, axis=0): - X = (df - df.mean(axis=axis))**2 - axis = 1-axis - X = X.sum(axis=axis) - X = X**0.5 - return X - - -# latent -dist = {} -dist['vae_mu'] =euclidian_distance(vae_mu) - -# reconstructed -vae_recon = pd.concat([ - latent_space[('train', 'recon')], - latent_space[('valid', 'recon')] -], keys=['train', 'valid']) - -dist['vae_recon'] =euclidian_distance(vae_recon) - -#non-scaled original -dist['original'] = euclidian_distance(analysis.df_by_split) - -#scaled original -scaler = StandardScaler().fit(analysis.df_train) -X = scaler.transform(analysis.df_by_split) -dist['original_normalized'] = euclidian_distance(X) - -# can different dimensionality be compared directly? -``` - - -```python -import itertools -fig, axes = plt.subplots(nrows=len(dist), figsize=(10,7*len(dist))) -axes = itertools.chain(axes) - -for i, (key, _s) in enumerate(dist.items()): - ax = next(axes) - _ = _s.sort_values().plot(rot=90, ax=ax, title=key) -_ = fig.tight_layout() -``` - - -![png](12_experiment_01_small_example_files/12_experiment_01_small_example_114_0.png) - - - -```python -dist = pd.DataFrame(dist)#.sort_values('original') -dist -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
vae_muvae_reconoriginaloriginal_normalized
Sample ID
train20151128_QE7_UPLC11_RJC_DEV_columnsTest_HeLa_010.1880.0137.4958.231
20160105_QE6_nLC4_MM_QC_MNT_HELA_01_1701062018060.1200.0062.3762.379
20160311_QE6_LC6_SCL_QC_MNT_HeLa_010.1250.0084.7185.015
20160401_QE6_nLC6_ASD_QC_HELA_030.0350.0032.2752.696
20160404_QE2_nlc1_QC_hela_1604042101250.1370.0114.5105.219
..................
valid20190527_QE4_LC12_AS_QC_MNT_HeLa_020.0510.0033.3983.989
20190701_QE4_LC12_IAH_QC_MNT_HeLa_030.0490.0012.5862.948
20190708_QE6_nLC4_JE_QC_MNT_HeLa_010.0650.0043.9254.747
20191128_QE8_nLC9_ASD_QC_HeLa_10.1510.0054.4705.200
20191128_QE8_nLC9_ASD_QC_HeLa_1_201911281653130.1610.0044.5915.388
-

90 rows × 4 columns

-
- - - - -```python -_ = dist.sort_values(by='original').plot(rot=90) -``` - - -![png](12_experiment_01_small_example_files/12_experiment_01_small_example_116_0.png) - - -## Tensorboard - -- can be run from notebook -- or in a separate process to inspect currently running training loops - - -```python -if ADD_TENSORBOARD: - print("Run to see updates: \n\n\ttensorboard " - f"--logdir {tensorboard_model_namer.folder.absolute()}") -``` - -## Compare metrics on AE and VAE - -- Collaborative Filtering currently not comparable as setup differs - - -```python -analysis.results.keys() -``` - - - - - dict_keys(['RF baseline', 'Simple AE', 'denoising AE', 'VAE']) - - - - -```python -print("Choose from list of keys: ", - ", ".join( - list(next(iter(next(iter(analysis.results.values())).values())).keys())) - ) -_selected_metric = "50%" # median -print("Currently selected:", _selected_metric) -``` - - Choose from list of keys: count, mean, std, min, 25%, 50%, 75%, max - Currently selected: 50% - - - -```python -# # Comparison Series -# comparison = {(peptide, model_name): stats[_selected_metric] for model_name, description in analysis.results.items() for peptide, stats in description.items()} -# pd.Series(comparison).sort_index() -``` - - -```python -# # Comparison as DataFrame -comparison = {} -for model_name, description in analysis.results.items(): - comparison[model_name] = {peptide: stats[_selected_metric] - for peptide, stats in description.items()} - -pd.DataFrame(comparison).style.apply(vaep.pandas.highlight_min, axis=1) -``` - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
RF baseline Simple AE denoising AE VAE
ACANPAAGSVILLENLR0.2503870.1890530.5005090.794539
ARFEELCSDLFR0.3829970.2362050.6563740.730623
AVAEQIPLLVQGVR0.3795810.3440610.3960370.461284
EGPYDVVVLPGGNLGAQNLSESAAVK0.3190980.1965300.6093190.859122
HGSLGFLPR0.6965040.1924650.5529250.488594
IDIIPNPQER0.2484640.1163730.3230130.808358
LALVTGGEIASTFDHPELVK0.3050450.2644090.3906860.732405
LLEVEHPAAK1.0004930.5726481.2261541.276345
STESLQANVQR0.2867120.3385880.4380540.851372
VNNSSLIGLGYTQTLKPGIK0.3281240.2466870.5778050.752543
- - - -## Hyperparameter comparison - -- [x] order data by date: consecutive samples from training to validation -- [x] check stratification based on machine and column length between splits - - Do validation and training data have same proportion of machine types? -> generally no, would need to be added - - not (all) machines are running continously or are continously checked -- [x] complete meta data reading based on filenames -- [x] compare performance regarding data normalization - - in original intensity space (non-log-transformed) - > -- [ ] compare performance regarding several hyperparameters of VAE (layers, activation, etc) - - plot different losses in one plot as validation data set is the same -- [ ] increase number of samples in training set and create result plot -- [ ] increase the number of peptides (features) -- [ ] mask some values in the validation set missing (Quality Assessment) -- [ ] write main function which trains an entire model (including data transformations) -- [ ] add initial PCA plot with samples. Is there any grouping observable? (plotly express) - -Debug -- [x] Check reporting of loss again: average sample loss or average peptide loss? -- [x] take a close look at VAE tutorial of PyTorch (data normalization, etc) -- [x] reduce the features size to fewer samples - -VAE -- original inputs between 0 and 1 as decoder outputs are transformed originally using the sigmoid fct -- original model use `tanh` activations -- think about the definition of `MSE` in a mini-batch. Should be peptide wise? - - VAMB does sum over a sample and then takes the mean of the sum (alternative?) - - multi-output regression? -- learning requires active masking: Mask inputs which should be learned to be recovered. Feed original, - not masked image as target to loss. - -- [ ] Run MNIST example with MSE loss. Does it still work? -- [x] Normalize inputs to zero and one, use MNIST VAE. Does it work? 
- - yes, it learns better then -- [x] Regress M peptide intensities on 1 other peptide intensity. Does it work? (Reference performance) - - RF baseline model established -- [x] Build a normal AE without probabilistic bottleneck. Does this work? - - yes - -Refactoring - -- [x] get epoch out of train, eval etc - - -Ideas - - combine 1000 most abundant peptides as guidance for different sets of low abundant peptides - - show the difference between original and reconstruction using a cm in an Image? batch-wise? - -- Current optimum for comparision is zero - -> The comparison where relatively low abundant, but not super low-abundant peptides will be masked, could skew the comparison. - - -```python -# writer # new writer -# dls = get_dls(data_in_memory, scaler) -# model = VAE() -# writer = # new writer for each setup -# metrics = run_experiment(model, dls, writer) -# overview['experiment_name'] = metrics -``` - -### Inspect batches of the trained model - - -```python -index_valid = analysis.df_valid.index -index_train = analysis.df_train.index -columns_ = analysis.df_train.columns - -model.eval() -``` - - - - - VAE( - (decoder): Sequential( - (0): Linear(in_features=4, out_features=6, bias=True) - (1): Tanh() - (2): Linear(in_features=6, out_features=10, bias=True) - (3): Sigmoid() - ) - (encoder): Sequential( - (0): Linear(in_features=10, out_features=6, bias=True) - (1): Tanh() - (2): Linear(in_features=6, out_features=8, bias=True) - ) - ) - - - -#### Training batch example - - -```python -model.to('cpu') -iter_dl_train = iter(dl_train) -batch = next(iter_dl_train) -batch_mask = None -try: - batch, batch_mask = batch - batch_masked = batch * batch_mask -except ValueError: - batch = batch -batch_recon, mu, logvar = model(batch) -``` - - -```python -batch_recon -``` - - - - - tensor([[0.5611, 0.5230, 0.5580, 0.5997, 0.6752, 0.5011, 0.5036, 0.6337, 0.5369, - 0.4788], - [0.5634, 0.5284, 0.5590, 0.5939, 0.6653, 0.4979, 0.5080, 0.6235, 0.5399, - 0.4781], - [0.5621, 0.5247, 
0.5585, 0.5978, 0.6722, 0.4999, 0.5049, 0.6305, 0.5378, - 0.4782], - [0.5640, 0.5268, 0.5581, 0.5973, 0.6691, 0.4990, 0.5064, 0.6283, 0.5383, - 0.4793], - [0.5629, 0.5242, 0.5575, 0.5999, 0.6738, 0.5003, 0.5039, 0.6333, 0.5368, - 0.4792], - [0.5645, 0.5270, 0.5583, 0.5980, 0.6694, 0.4994, 0.5072, 0.6282, 0.5384, - 0.4801], - [0.5653, 0.5287, 0.5589, 0.5962, 0.6663, 0.4985, 0.5089, 0.6246, 0.5395, - 0.4800], - [0.5637, 0.5261, 0.5584, 0.5981, 0.6707, 0.4996, 0.5062, 0.6293, 0.5381, - 0.4794]], dtype=torch.float64, grad_fn=) - - - - -```python -_batch_metrics = vaep_model.loss_function(batch_recon, batch, mu, logvar) -_batch_metrics -``` - - - - - {'loss': tensor(6.4421, dtype=torch.float64, grad_fn=), - 'recon_loss': tensor(6.3307, dtype=torch.float64, grad_fn=), - 'KLD': tensor(0.1237, dtype=torch.float64, grad_fn=)} - - - - -```python -if batch_mask: - # avg per peptide loss -> should be close to zero (ref: std=1) - _mse = ((batch * batch_mask) - (batch_recon * batch_mask)).pow(2).sum() -else: - _mse = (batch - batch_recon).pow(2).sum() -_mse -``` - - - - - tensor(6.3307, dtype=torch.float64, grad_fn=) - - - - -```python -from torch import nn - -loss = nn.MSELoss(reduction='sum') -if batch_mask: - _mse = loss(input=batch_recon*batch_mask, target=batch * batch_mask) -else: - _mse = loss(input=batch_recon, target=batch) -_mse -``` - - - - - tensor(6.3307, dtype=torch.float64, grad_fn=) - - - - -```python -from torch.nn import functional as F -if batch_mask: - batch_sse = F.mse_loss(input=batch_recon*batch_mask, - target=batch * batch_mask, reduction='sum') -else: - batch_sse = F.mse_loss(input=batch_recon, - target=batch, reduction='sum') -batch_sse -``` - - - - - tensor(6.3307, dtype=torch.float64, grad_fn=) - - - -#### Validation batch example - - -```python -# validation data loader is not shuffled -N_valid = len(dl_valid.dataset) - -model.eval() - -iter_dl_valid = iter(dl_valid) - -batch = next(iter_dl_valid) -batch_mask = None -try: - batch, batch_mask = batch 
- batch_masked = batch * batch_mask -except ValueError: - batch = batch - -batch_recon, mu, logvar = model(batch) -``` - - -```python -batch_recon -``` - - - - - tensor([[0.5649, 0.5276, 0.5582, 0.5981, 0.6686, 0.4994, 0.5080, 0.6275, 0.5387, - 0.4809], - [0.5653, 0.5296, 0.5581, 0.5954, 0.6645, 0.4979, 0.5088, 0.6240, 0.5396, - 0.4800], - [0.5665, 0.5300, 0.5582, 0.5963, 0.6647, 0.4980, 0.5094, 0.6241, 0.5395, - 0.4808], - [0.5639, 0.5259, 0.5577, 0.5995, 0.6715, 0.5001, 0.5061, 0.6308, 0.5377, - 0.4806], - [0.5653, 0.5277, 0.5579, 0.5977, 0.6683, 0.4987, 0.5070, 0.6280, 0.5383, - 0.4800], - [0.5656, 0.5293, 0.5587, 0.5959, 0.6654, 0.4982, 0.5092, 0.6241, 0.5396, - 0.4801], - [0.5653, 0.5286, 0.5581, 0.5970, 0.6667, 0.4986, 0.5083, 0.6260, 0.5391, - 0.4806], - [0.5649, 0.5277, 0.5579, 0.5975, 0.6681, 0.4989, 0.5073, 0.6275, 0.5385, - 0.4801]], dtype=torch.float64, grad_fn=) - - - - -```python -_batch_metrics = vaep_model.loss_function(batch_recon, batch, mu, logvar) -_batch_metrics -``` - - - - - {'loss': tensor(2.9513, dtype=torch.float64, grad_fn=), - 'recon_loss': tensor(2.9164, dtype=torch.float64, grad_fn=), - 'KLD': tensor(0.0388, dtype=torch.float64, grad_fn=)} - - - - -```python -if batch_mask: - # avg per peptide loss -> should be close to zero (ref: std=1) - _mse = ((batch * batch_mask) - (batch_recon * batch_mask)).pow(2).sum() -else: - _mse = (batch - batch_recon).pow(2).sum() -_mse -``` - - - - - tensor(2.9164, dtype=torch.float64, grad_fn=) - - - - -```python -from torch import nn - -loss = nn.MSELoss(reduction='sum') -if batch_mask: - _mse = loss(input=batch_recon*batch_mask, target=batch * batch_mask) -else: - _mse = loss(input=batch_recon, target=batch) -_mse -``` - - - - - tensor(2.9164, dtype=torch.float64, grad_fn=) - - - - -```python -from torch.nn import functional as F -if batch_mask: - batch_sse = F.mse_loss(input=batch_recon*batch_mask, - target=batch * batch_mask, reduction='sum') -else: - batch_sse = F.mse_loss(input=batch_recon, - 
target=batch, reduction='sum') -batch_sse -``` - - - - - tensor(2.9164, dtype=torch.float64, grad_fn=) - - - -### Inspect Validation data - -- VAMB training epoch normalizes by number of batches, [see](https://github.com/RasmussenLab/vamb/blob/734b741b85296377937de54166b7db274bc7ba9c/vamb/encode.py#L284-L335) - - -```python -# validation data loader is not shuffled -iter_dl_valid = iter(dl_valid) - -batch = next(iter_dl_valid) -batch_mask = None -try: - batch, batch_mask = batch - batch_masked = batch * batch_mask -except ValueError: - batch = batch - -M = batch.shape[-1] -batch_recon, _, _ = model(batch) - -data = batch.detach().numpy() -if batch_mask: - mask = batch_mask.detach().numpy() -pred = batch_recon.detach().numpy() - -for batch in iter_dl_valid: - try: - # ToDo: Test if this works - if not type(batch) == torch.Tensor: - batch, batch_mask = batch - batch_masked = batch * batch_mask - except ValueError: - batch = batch - batch_recon, _, _ = model(batch) - data = np.append(data, batch.view([-1, M]), axis=0) - - if batch_mask: - mask = np.append(mask, batch_mask, axis=0) - pred = np.append(pred, batch_recon.detach().numpy().reshape(-1, M), axis=0) - -expected_shape = analysis.df_valid.shape -assert data.shape == expected_shape -assert pred.shape == expected_shape -if batch_mask: - assert mask.shape == expected_shape - -data = pd.DataFrame(data, index=index_valid, - columns=columns_).replace(0.0, np.nan) -pred = pd.DataFrame(pred, index=index_valid, columns=columns_) -mask = pd.DataFrame(mask, index=index_valid, - columns=columns_) if batch_mask else None -``` - - -```python -pd.DataFrame( - scaler.inverse_transform(pred), - index=index_valid, - columns=columns_ -) -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ACANPAAGSVILLENLRARFEELCSDLFRAVAEQIPLLVQGVREGPYDVVVLPGGNLGAQNLSESAAVKHGSLGFLPRIDIIPNPQERLALVTGGEIASTFDHPELVKLLEVEHPAAKSTESLQANVQRVNNSSLIGLGYTQTLKPGIK
Sample ID
20181029_QE3_nLC3_KBE_QC_MNT_HELA_0231.10229.70227.09429.61829.63130.64730.09028.91629.95430.219
20181102_QE2_NLC10_MR_QC_MNT_HELA_0131.10329.70327.09429.61529.62630.64630.09128.91229.95430.219
20181107_QE6_nLC12_MR_QC_MNT_HELA_New_0131.10429.70327.09429.61629.62630.64630.09228.91229.95430.219
20181110_QE5_nLC5_OOE_QC_MNT_HELA_15cm_250ng_RO-00331.10129.70027.09329.61929.63530.64730.08828.92029.95330.219
20181112_QE7_nLC11_MEM_QC_HeLa_0231.10229.70227.09429.61829.63130.64630.08928.91729.95330.219
20181119_QE1_nLC2_TW_QC_HeLa_131.10329.70327.09429.61629.62730.64630.09228.91229.95430.219
20181120_QE5_nLC7_AP_HeLa_231.10329.70227.09429.61729.62930.64630.09128.91529.95430.219
20181126_QE2_NLC10_MN_QC_HELA_0231.10229.70227.09429.61729.63130.64630.09028.91629.95330.219
20181205_QE5_nLC7_RJC_QC_MNT_HeLa_231.10229.70027.09329.62029.63630.64730.08828.92129.95330.219
20181215_QE2_NLC10_ANHO_QC_MNT_HELA_0431.10229.70227.09429.61729.63130.64630.08928.91729.95330.218
20181219_QE1_nLC2_GP_QC_MNT_HELA_0131.10229.70127.09429.61829.63230.64730.08928.91829.95330.219
20190107_QE5_nLC5_DS_QC_MNT_HeLa_FlashPack_0231.09929.69927.09329.61729.63730.64630.08428.92329.95230.216
20190527_QE4_LC12_AS_QC_MNT_HeLa_0131.10329.70327.09429.61729.62830.64630.09028.91429.95430.219
20190527_QE4_LC12_AS_QC_MNT_HeLa_0231.10329.70327.09429.61729.62830.64630.09028.91529.95430.219
20190701_QE4_LC12_IAH_QC_MNT_HeLa_0331.10229.70227.09329.61729.63030.64630.08928.91729.95330.219
20190708_QE6_nLC4_JE_QC_MNT_HeLa_0131.10229.70227.09429.61529.62830.64530.08928.91429.95430.217
20191128_QE8_nLC9_ASD_QC_HeLa_131.10429.70327.09329.61829.62830.64630.09128.91529.95330.220
20191128_QE8_nLC9_ASD_QC_HeLa_1_2019112816531331.10529.70327.09329.61829.62830.64630.09028.91629.95330.220
-
- - - - -```python -metrics.iloc[-1] # mse loss get's most weight in combined loss -``` - - - - - train loss 6.779 - recon_loss 6.771 - KLD 0.008 - valid loss 0.810 - recon_loss 0.798 - KLD 0.013 - Name: 30, dtype: float64 - - - - -```python -metrics.iloc[-1].loc[('valid', 'recon_loss')] -``` - - - - - 0.7980607149304116 - - - -Average prediction error per peptides: - -- std. dev is one, so a prediction - - -```python -# check that losses reported match loss calculated form predictions -((pred - data)**2).sum().sum() / data.notna().sum().sum() -``` - - - - - 0.06152830662682402 - - - - -```python -(pred - data).iloc[:10, :5] -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ACANPAAGSVILLENLRARFEELCSDLFRAVAEQIPLLVQGVREGPYDVVVLPGGNLGAQNLSESAAVKHGSLGFLPR
Sample ID
20181029_QE3_nLC3_KBE_QC_MNT_HELA_020.0900.1640.1930.334-0.001
20181102_QE2_NLC10_MR_QC_MNT_HELA_01-0.365-0.129-0.319-0.231-0.235
20181107_QE6_nLC12_MR_QC_MNT_HELA_New_01-0.150-0.131-0.120-0.132-0.197
20181110_QE5_nLC5_OOE_QC_MNT_HELA_15cm_250ng_RO-0030.2580.3210.3010.4400.085
20181112_QE7_nLC11_MEM_QC_HeLa_02-0.224-0.142-0.254-0.2410.249
20181119_QE1_nLC2_TW_QC_HeLa_1-0.128-0.0570.051-0.027-0.071
20181120_QE5_nLC7_AP_HeLa_2-0.0670.130-0.2300.053-0.187
20181126_QE2_NLC10_MN_QC_HELA_020.0340.0260.1190.155-0.074
20181205_QE5_nLC7_RJC_QC_MNT_HeLa_20.2700.4690.2570.3020.080
20181215_QE2_NLC10_ANHO_QC_MNT_HELA_040.021-0.030-0.025-0.013-0.101
-
- - - - -```python -((pred - data).iloc[:10, :5])**2 -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ACANPAAGSVILLENLRARFEELCSDLFRAVAEQIPLLVQGVREGPYDVVVLPGGNLGAQNLSESAAVKHGSLGFLPR
Sample ID
20181029_QE3_nLC3_KBE_QC_MNT_HELA_020.0080.0270.0370.1120.000
20181102_QE2_NLC10_MR_QC_MNT_HELA_010.1330.0170.1020.0530.055
20181107_QE6_nLC12_MR_QC_MNT_HELA_New_010.0230.0170.0140.0170.039
20181110_QE5_nLC5_OOE_QC_MNT_HELA_15cm_250ng_RO-0030.0660.1030.0910.1940.007
20181112_QE7_nLC11_MEM_QC_HeLa_020.0500.0200.0640.0580.062
20181119_QE1_nLC2_TW_QC_HeLa_10.0170.0030.0030.0010.005
20181120_QE5_nLC7_AP_HeLa_20.0040.0170.0530.0030.035
20181126_QE2_NLC10_MN_QC_HELA_020.0010.0010.0140.0240.005
20181205_QE5_nLC7_RJC_QC_MNT_HeLa_20.0730.2200.0660.0910.006
20181215_QE2_NLC10_ANHO_QC_MNT_HELA_040.0000.0010.0010.0000.010
-
- - - - -```python -(pred - data).notna().sum().sum() -``` - - - - - 180 - - - - -```python -N, M = data.shape -data.isna().sum().sum() / (N*M) # only few missings -``` - - - - - 0.0 - - - - -```python - -``` diff --git a/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_108_0.png b/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_108_0.png deleted file mode 100644 index 3d3acbdd8..000000000 Binary files a/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_108_0.png and /dev/null differ diff --git a/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_109_0.png b/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_109_0.png deleted file mode 100644 index cdf92098d..000000000 Binary files a/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_109_0.png and /dev/null differ diff --git a/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_111_0.png b/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_111_0.png deleted file mode 100644 index 80ea28cc5..000000000 Binary files a/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_111_0.png and /dev/null differ diff --git a/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_114_0.png b/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_114_0.png deleted file mode 100644 index 035021f36..000000000 Binary files a/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_114_0.png and /dev/null differ diff --git a/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_116_0.png b/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_116_0.png 
deleted file mode 100644 index 0e366f63b..000000000 Binary files a/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_116_0.png and /dev/null differ diff --git a/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_16_4.png b/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_16_4.png deleted file mode 100644 index 9e67a2995..000000000 Binary files a/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_16_4.png and /dev/null differ diff --git a/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_16_5.png b/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_16_5.png deleted file mode 100644 index 58bc1ee03..000000000 Binary files a/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_16_5.png and /dev/null differ diff --git a/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_18_0.png b/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_18_0.png deleted file mode 100644 index 80ea28cc5..000000000 Binary files a/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_18_0.png and /dev/null differ diff --git a/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_41_1.png b/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_41_1.png deleted file mode 100644 index f60e34c55..000000000 Binary files a/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_41_1.png and /dev/null differ diff --git a/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_47_1.png b/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_47_1.png deleted file mode 100644 index 
0b4ee5c28..000000000 Binary files a/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_47_1.png and /dev/null differ diff --git a/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_58_0.png b/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_58_0.png deleted file mode 100644 index 8fe9423d9..000000000 Binary files a/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_58_0.png and /dev/null differ diff --git a/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_59_0.png b/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_59_0.png deleted file mode 100644 index bfd804bea..000000000 Binary files a/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_59_0.png and /dev/null differ diff --git a/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_86_1.png b/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_86_1.png deleted file mode 100644 index 52ae5faca..000000000 Binary files a/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_86_1.png and /dev/null differ diff --git a/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_92_0.png b/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_92_0.png deleted file mode 100644 index 8d04c0b57..000000000 Binary files a/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_92_0.png and /dev/null differ diff --git a/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_94_0.png b/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_94_0.png deleted file mode 100644 index 72c50857d..000000000 Binary files 
a/project/doc/ipynbs/12_experiment_01_small_example_files/12_experiment_01_small_example_94_0.png and /dev/null differ diff --git a/project/doc/ipynbs/12_experiment_01_transforms.md b/project/doc/ipynbs/12_experiment_01_transforms.md deleted file mode 100644 index 4674c5f9b..000000000 --- a/project/doc/ipynbs/12_experiment_01_transforms.md +++ /dev/null @@ -1,2182 +0,0 @@ -# Linear vs Log Transformed data - - -```python -from pathlib import Path - -from src import config -from vaep.analyzers.analyzers import * -from vaep.transform import StandardScaler, get_df_fitted_mean_std -``` - - FOLDER_MQ_TXT_DATA = data\mq_out - - - -```python -import logging -from vaep.logging import setup_logger - -logger = logging.getLogger() # returns root-logger -logger.setLevel(logging.CRITICAL) # silence for everything else -logger.handlers = [] - - -logger = setup_logger(logger=logging.getLogger('vaep')) -logger.info("Experiment 01") -``` - - vaep - INFO Experiment 01 - - -## Load data - -- 1000 features (most abundant peptides) -- later a subset of samples is selected - - -```python -N_SAMPLES_TO_LOAD = None -FN_PEPTIDE_INTENSITIES = config.FOLDER_DATA / 'df_intensities_N07813_M01000' -analysis = AnalyzePeptides.from_csv( - fname=FN_PEPTIDE_INTENSITIES, nrows=N_SAMPLES_TO_LOAD) -analysis.df = analysis.df.sort_index() # sort by date -assert analysis.df.index.is_unique, "Non-unique training samples" -analysis -``` - - - - - AnalyzePeptides with attributes: df, stats - - - -### Select consecutives samples for training - - -```python -import random - -N_SAMPLES = 1000 -logger.info(f"Selected {N_SAMPLES}") -analysis.N_SAMPLES = N_SAMPLES - - -def get_consecutive_data_indices(index, n_samples=N_SAMPLES): - start_sample = len(index) - n_samples - start_sample = random.randint(0, start_sample) - return index[start_sample:start_sample+n_samples] - - -indices_selected = get_consecutive_data_indices(analysis.df.index) -analysis.samples = indices_selected -analysis.df = 
analysis.df.loc[indices_selected] - -FRACTION = 0.9 - -class Indices(SimpleNamespace): - pass - -indices = Indices() -indices.train, indices.valid = indices_selected[:int( - FRACTION*N_SAMPLES)], indices_selected[int(FRACTION*N_SAMPLES):] -analysis.indices = indices - -analysis.df_train = analysis.df.loc[indices.train] -analysis.df_valid = analysis.df.loc[indices.valid] - -analysis.df -``` - - vaep - INFO Selected 1000 - - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
AAAAAAALQAKAAFDDAIAELDTLSEESYKAAHSEGNTTAGLDMRAAVATFLQSVQVPEFTPKAAVEEGIVLGGGCALLRAAVPSGASTGIYEALELRAAVPSGASTGIYEALELRDNDKACANPAAGSVILLENLRACGLVASNLNLKPGECLRADLINNLGTIAK...VVFVFGPDKVVFVFGPDKKVYALPEDLVEVKPKYADLTEDQLPSCESLKYDDMAAAMKYDDMAACMKYDDMATCMKYLAEVACGDDRKYLDEDTIYHLQPSGRYRVPDVLVADPPIAR
20181102_QE2_NLC10_MR_QC_MNT_NEWHELA_021,528,300,000.0001,537,100,000.000122,850,000.000603,890,000.000381,760,000.0001,553,000,000.0002,381,800,000.0002,786,300,000.000722,020,000.0003,724,600,000.000...898,310,000.000226,610,000.000227,440,000.000610,110,000.0001,386,400,000.0001,241,500,000.000513,480,000.0001,025,700,000.000400,570,000.000420,340,000.000
20181102_QE5_nLC5_OOE_QC_HELA_15cm_250ng_RO-045980,120,000.0003,848,700,000.000326,120,000.000683,470,000.0001,779,500,000.0002,177,200,000.0003,997,400,000.0002,208,400,000.0001,154,200,000.0003,612,300,000.000...827,350,000.000279,940,000.000183,970,000.000641,010,000.000799,580,000.000949,360,000.000390,970,000.000554,550,000.000284,930,000.000361,490,000.000
20181102_QE5_nLC5_OOE_QC_MNT_HELA_15cm_250ng_RO-0031,278,200,000.0003,071,800,000.000272,870,000.000521,070,000.0001,812,800,000.0001,926,900,000.0004,697,300,000.0001,801,500,000.000979,210,000.0003,014,500,000.000...739,910,000.000239,420,000.000149,320,000.000533,610,000.000453,970,000.000881,350,000.000367,130,000.000797,950,000.000302,060,000.000280,900,000.000
20181102_QE5_nLC5_OOE_QC_MNT_HELA_15cm_250ng_RO-0041,005,200,000.0002,817,700,000.000276,740,000.000532,060,000.0001,751,300,000.0002,001,000,000.0004,227,100,000.0001,631,200,000.000931,620,000.0002,854,800,000.000...764,310,000.000224,430,000.000156,400,000.000519,150,000.000353,830,000.000729,580,000.000411,470,000.000692,640,000.000234,980,000.000275,490,000.000
20181102_QE9_nLC2_AnMu_QC_Hela_012,850,100,000.0005,735,800,000.000946,660,000.0001,110,400,000.0002,128,000,000.0003,407,000,000.0007,362,800,000.0003,465,400,000.0002,166,300,000.0006,118,300,000.000...1,569,800,000.000607,620,000.000474,790,000.000722,820,000.0002,056,700,000.0001,290,800,000.000768,210,000.0001,815,000,000.000649,070,000.000789,800,000.000
..................................................................
20190618_QX4_JiYu_MA_HeLa_500ng_centroid956,600,000.000NaN287,710,000.000229,730,000.0001,690,400,000.0002,652,800,000.000203,080,000.000512,680,000.0007,133,200.0004,499,600,000.000...532,990,000.00074,872,000.000196,530,000.00012,611,000.000269,370,000.000138,830,000.00010,082,000.000203,200,000.000159,500,000.000232,050,000.000
20190619_QE1_nLC2_GP_QC_MNT_HELA_011,970,700,000.0003,392,600,000.000340,020,000.000207,610,000.0001,599,100,000.000353,510,000.0005,461,600,000.0002,345,700,000.0001,749,400,000.0005,416,900,000.000...870,560,000.000618,200,000.000244,310,000.000651,290,000.000833,600,000.000631,290,000.000359,960,000.000875,410,000.000124,410,000.000519,200,000.000
20190619_QE2_NLC1_GP_QC_MNT_HELA_012,828,000,000.0006,333,800,000.000272,190,000.000825,240,000.0001,974,700,000.000356,200,000.0006,333,800,000.0004,008,700,000.0003,528,400,000.0005,925,800,000.000...998,360,000.0001,032,500,000.000557,840,000.000847,160,000.000604,330,000.000779,800,000.000235,340,000.000864,550,000.000232,780,000.000768,690,000.000
20190619_QE7_nLC7_AP_QC_MNT_HeLa_011,943,900,000.0007,343,300,000.000254,950,000.000844,140,000.0002,698,700,000.000880,060,000.0005,494,000,000.0003,961,500,000.0002,728,400,000.0004,173,000,000.000...776,440,000.000671,950,000.000367,100,000.000726,440,000.000414,300,000.000594,570,000.000247,920,000.000734,510,000.000190,580,000.000428,270,000.000
20190619_QE7_nLC7_AP_QC_MNT_HeLa_021,533,400,000.0008,054,600,000.000213,290,000.0001,022,400,000.0002,404,600,000.000997,420,000.0007,297,600,000.0004,953,700,000.0003,101,800,000.0004,862,300,000.000...738,070,000.000647,420,000.000468,860,000.0001,107,700,000.000468,980,000.000626,090,000.000247,360,000.000875,230,000.000218,610,000.000625,780,000.000
-

1000 rows × 1000 columns

-
- - - -## Transforms - -### Custom Transforms - -- illustrate using adapted scikit-learn [`StandardScaler`](https://scikit-learn.org/stable/modules/preprocessing.html) - - -```python -N, M = 10, 10 # Samples, Features -analysis.df_train.iloc[:N, :M] -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
AAAAAAALQAKAAFDDAIAELDTLSEESYKAAHSEGNTTAGLDMRAAVATFLQSVQVPEFTPKAAVEEGIVLGGGCALLRAAVPSGASTGIYEALELRAAVPSGASTGIYEALELRDNDKACANPAAGSVILLENLRACGLVASNLNLKPGECLRADLINNLGTIAK
20181102_QE2_NLC10_MR_QC_MNT_NEWHELA_021,528,300,000.0001,537,100,000.000122,850,000.000603,890,000.000381,760,000.0001,553,000,000.0002,381,800,000.0002,786,300,000.000722,020,000.0003,724,600,000.000
20181102_QE5_nLC5_OOE_QC_HELA_15cm_250ng_RO-045980,120,000.0003,848,700,000.000326,120,000.000683,470,000.0001,779,500,000.0002,177,200,000.0003,997,400,000.0002,208,400,000.0001,154,200,000.0003,612,300,000.000
20181102_QE5_nLC5_OOE_QC_MNT_HELA_15cm_250ng_RO-0031,278,200,000.0003,071,800,000.000272,870,000.000521,070,000.0001,812,800,000.0001,926,900,000.0004,697,300,000.0001,801,500,000.000979,210,000.0003,014,500,000.000
20181102_QE5_nLC5_OOE_QC_MNT_HELA_15cm_250ng_RO-0041,005,200,000.0002,817,700,000.000276,740,000.000532,060,000.0001,751,300,000.0002,001,000,000.0004,227,100,000.0001,631,200,000.000931,620,000.0002,854,800,000.000
20181102_QE9_nLC2_AnMu_QC_Hela_012,850,100,000.0005,735,800,000.000946,660,000.0001,110,400,000.0002,128,000,000.0003,407,000,000.0007,362,800,000.0003,465,400,000.0002,166,300,000.0006,118,300,000.000
20181103_QE1_nLC1_RG_QC_HeLa_1582,300,000.000449,810,000.000100,500,000.000153,260,000.000272,230,000.000215,700,000.000932,700,000.000379,450,000.000195,630,000.0001,327,900,000.000
20181104_QE1_nLC1_RG_QC_HeLa_1691,850,000.000435,430,000.00019,396,000.000145,320,000.000234,910,000.000299,550,000.000898,520,000.000299,230,000.000187,800,000.0001,191,600,000.000
20181105_QE2_NLC10_MR_QC_MNT_HELA_01309,600,000.0002,037,100,000.00020,302,000.000653,140,000.0001,551,800,000.0001,696,800,000.0003,094,600,000.0002,332,800,000.000338,130,000.0002,568,900,000.000
20181105_QE2_NLC10_MR_QC_MNT_HELA_02354,740,000.0002,463,200,000.000183,360,000.000705,770,000.0001,752,900,000.0002,004,400,000.0003,882,200,000.0002,380,900,000.000289,680,000.0002,777,800,000.000
20181105_QE8_nLC0_BDA_QC_MNT_HeLa_15cm_011,515,800,000.0001,547,100,000.000233,270,000.000NaN665,520,000.000766,110,000.0002,587,400,000.000NaN194,130,000.0006,050,700,000.000
-
- - - - -```python -analysis.df_train.iloc[:, :M].describe() -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
AAAAAAALQAKAAFDDAIAELDTLSEESYKAAHSEGNTTAGLDMRAAVATFLQSVQVPEFTPKAAVEEGIVLGGGCALLRAAVPSGASTGIYEALELRAAVPSGASTGIYEALELRDNDKACANPAAGSVILLENLRACGLVASNLNLKPGECLRADLINNLGTIAK
count874.000874.000856.000844.000891.000863.000846.000877.000837.000893.000
mean1,788,079,930.5493,494,782,792.792577,080,270.561925,996,773.4602,324,476,663.3002,674,107,859.0965,592,034,007.0922,444,357,716.6481,845,598,966.9066,298,977,910.078
std1,314,314,893.7912,271,403,889.022892,849,786.124932,199,047.8792,560,968,866.9794,908,542,456.6963,348,733,211.6761,396,902,640.0321,420,774,527.0555,868,156,202.236
min8,448,300.0005,228,900.0004,166,600.0001,422,200.0002,048,500.0002,371,900.00012,913,000.0005,878,500.0001,508,900.0002,279,700.000
25%930,870,000.0001,706,175,000.000127,477,500.000379,340,000.0001,003,250,000.000415,790,000.0003,199,700,000.0001,374,000,000.000643,510,000.0003,047,600,000.000
50%1,417,650,000.0003,092,050,000.000272,860,000.000676,615,000.0001,537,500,000.000699,500,000.0005,240,950,000.0002,284,300,000.0001,560,700,000.0004,668,500,000.000
75%2,227,025,000.0004,924,625,000.000458,265,000.0001,101,625,000.0002,357,750,000.0001,739,100,000.0007,781,775,000.0003,285,000,000.0002,856,200,000.0006,727,900,000.000
max7,678,700,000.00012,728,000,000.0005,324,200,000.0008,230,500,000.00016,674,000,000.00028,870,000,000.00021,767,000,000.0009,429,100,000.0008,520,800,000.00055,554,000,000.000
-
- - - -### StandardScaler on raw data - - -```python -scaler = StandardScaler().fit(analysis.df_train) -scaler_df = get_df_fitted_mean_std(scaler, index=analysis.df_train.columns) -scaler_df.head(N) -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
meanstddev
AAAAAAALQAK1,788,079,930.5491,313,562,782.202
AAFDDAIAELDTLSEESYK3,494,782,792.7922,270,104,086.976
AAHSEGNTTAGLDMR577,080,270.561892,328,109.313
AAVATFLQSVQVPEFTPK925,996,773.460931,646,633.580
AAVEEGIVLGGGCALLR2,324,476,663.3002,559,531,331.717
AAVPSGASTGIYEALELR2,674,107,859.0964,905,697,749.151
AAVPSGASTGIYEALELRDNDK5,592,034,007.0923,346,753,469.728
ACANPAAGSVILLENLR2,444,357,716.6481,396,106,003.161
ACGLVASNLNLKPGECLR1,845,598,966.9061,419,925,543.097
ADLINNLGTIAK6,298,977,910.0785,864,869,640.117
-
- - - - -```python -sample = scaler.transform(analysis.df_train.iloc[:N]) -sample.iloc[:, :M] -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
AAAAAAALQAKAAFDDAIAELDTLSEESYKAAHSEGNTTAGLDMRAAVATFLQSVQVPEFTPKAAVEEGIVLGGGCALLRAAVPSGASTGIYEALELRAAVPSGASTGIYEALELRDNDKACANPAAGSVILLENLRACGLVASNLNLKPGECLRADLINNLGTIAK
20181102_QE2_NLC10_MR_QC_MNT_NEWHELA_02-0.198-0.862-0.509-0.346-0.759-0.229-0.9590.245-0.791-0.439
20181102_QE5_nLC5_OOE_QC_HELA_15cm_250ng_RO-045-0.6150.156-0.281-0.260-0.213-0.101-0.476-0.169-0.487-0.458
20181102_QE5_nLC5_OOE_QC_MNT_HELA_15cm_250ng_RO-003-0.388-0.186-0.341-0.435-0.200-0.152-0.267-0.460-0.610-0.560
20181102_QE5_nLC5_OOE_QC_MNT_HELA_15cm_250ng_RO-004-0.596-0.298-0.337-0.423-0.224-0.137-0.408-0.582-0.644-0.587
20181102_QE9_nLC2_AnMu_QC_Hela_010.8090.9870.4140.198-0.0770.1490.5290.7310.226-0.031
20181103_QE1_nLC1_RG_QC_HeLa_1-0.918-1.341-0.534-0.829-0.802-0.501-1.392-1.479-1.162-0.848
20181104_QE1_nLC1_RG_QC_HeLa_1-0.835-1.348-0.625-0.838-0.816-0.484-1.402-1.537-1.168-0.871
20181105_QE2_NLC10_MR_QC_MNT_HELA_01-1.126-0.642-0.624-0.293-0.302-0.199-0.746-0.080-1.062-0.636
20181105_QE2_NLC10_MR_QC_MNT_HELA_02-1.091-0.454-0.441-0.236-0.223-0.137-0.511-0.045-1.096-0.600
20181105_QE8_nLC0_BDA_QC_MNT_HeLa_15cm_01-0.207-0.858-0.385NaN-0.648-0.389-0.898NaN-1.163-0.042
-
- - - - -```python -sample = scaler.inverse_transform(sample) -sample.iloc[:, :M] -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
AAAAAAALQAKAAFDDAIAELDTLSEESYKAAHSEGNTTAGLDMRAAVATFLQSVQVPEFTPKAAVEEGIVLGGGCALLRAAVPSGASTGIYEALELRAAVPSGASTGIYEALELRDNDKACANPAAGSVILLENLRACGLVASNLNLKPGECLRADLINNLGTIAK
20181102_QE2_NLC10_MR_QC_MNT_NEWHELA_021,528,300,000.0001,537,100,000.000122,850,000.000603,890,000.000381,760,000.0001,553,000,000.0002,381,800,000.0002,786,300,000.000722,020,000.0003,724,600,000.000
20181102_QE5_nLC5_OOE_QC_HELA_15cm_250ng_RO-045980,120,000.0003,848,700,000.000326,120,000.000683,470,000.0001,779,500,000.0002,177,200,000.0003,997,400,000.0002,208,400,000.0001,154,200,000.0003,612,300,000.000
20181102_QE5_nLC5_OOE_QC_MNT_HELA_15cm_250ng_RO-0031,278,200,000.0003,071,800,000.000272,870,000.000521,070,000.0001,812,800,000.0001,926,900,000.0004,697,300,000.0001,801,500,000.000979,210,000.0003,014,500,000.000
20181102_QE5_nLC5_OOE_QC_MNT_HELA_15cm_250ng_RO-0041,005,200,000.0002,817,700,000.000276,740,000.000532,060,000.0001,751,300,000.0002,001,000,000.0004,227,100,000.0001,631,200,000.000931,620,000.0002,854,800,000.000
20181102_QE9_nLC2_AnMu_QC_Hela_012,850,100,000.0005,735,800,000.000946,660,000.0001,110,400,000.0002,128,000,000.0003,407,000,000.0007,362,800,000.0003,465,400,000.0002,166,300,000.0006,118,300,000.000
20181103_QE1_nLC1_RG_QC_HeLa_1582,300,000.000449,810,000.000100,500,000.000153,260,000.000272,230,000.000215,700,000.000932,700,000.000379,450,000.000195,630,000.0001,327,900,000.000
20181104_QE1_nLC1_RG_QC_HeLa_1691,850,000.000435,430,000.00019,396,000.000145,320,000.000234,910,000.000299,550,000.000898,520,000.000299,230,000.000187,800,000.0001,191,600,000.000
20181105_QE2_NLC10_MR_QC_MNT_HELA_01309,600,000.0002,037,100,000.00020,302,000.000653,140,000.0001,551,800,000.0001,696,800,000.0003,094,600,000.0002,332,800,000.000338,130,000.0002,568,900,000.000
20181105_QE2_NLC10_MR_QC_MNT_HELA_02354,740,000.0002,463,200,000.000183,360,000.000705,770,000.0001,752,900,000.0002,004,400,000.0003,882,200,000.0002,380,900,000.000289,680,000.0002,777,800,000.000
20181105_QE8_nLC0_BDA_QC_MNT_HeLa_15cm_011,515,800,000.0001,547,100,000.000233,270,000.000NaN665,520,000.000766,110,000.0002,587,400,000.000NaN194,130,000.0006,050,700,000.000
-
- - - -### StandardScaler on log10 transformed data - - -```python -X_log10 = np.log10(analysis.df_train) -X_log10.iloc[:N, :M] -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
AAAAAAALQAKAAFDDAIAELDTLSEESYKAAHSEGNTTAGLDMRAAVATFLQSVQVPEFTPKAAVEEGIVLGGGCALLRAAVPSGASTGIYEALELRAAVPSGASTGIYEALELRDNDKACANPAAGSVILLENLRACGLVASNLNLKPGECLRADLINNLGTIAK
20181102_QE2_NLC10_MR_QC_MNT_NEWHELA_029.1849.1878.0898.7818.5829.1919.3779.4458.8599.571
20181102_QE5_nLC5_OOE_QC_HELA_15cm_250ng_RO-0458.9919.5858.5138.8359.2509.3389.6029.3449.0629.558
20181102_QE5_nLC5_OOE_QC_MNT_HELA_15cm_250ng_RO-0039.1079.4878.4368.7179.2589.2859.6729.2568.9919.479
20181102_QE5_nLC5_OOE_QC_MNT_HELA_15cm_250ng_RO-0049.0029.4508.4428.7269.2439.3019.6269.2138.9699.456
20181102_QE9_nLC2_AnMu_QC_Hela_019.4559.7598.9769.0459.3289.5329.8679.5409.3369.787
20181103_QE1_nLC1_RG_QC_HeLa_18.7658.6538.0028.1858.4358.3348.9708.5798.2919.123
20181104_QE1_nLC1_RG_QC_HeLa_18.8408.6397.2888.1628.3718.4768.9548.4768.2749.076
20181105_QE2_NLC10_MR_QC_MNT_HELA_018.4919.3097.3088.8159.1919.2309.4919.3688.5299.410
20181105_QE2_NLC10_MR_QC_MNT_HELA_028.5509.3918.2638.8499.2449.3029.5899.3778.4629.444
20181105_QE8_nLC0_BDA_QC_MNT_HeLa_15cm_019.1819.1908.368NaN8.8238.8849.413NaN8.2889.782
-
- - - - -```python -scaler_log = StandardScaler( -).fit(X=X_log10) -scaler_log_df = get_df_fitted_mean_std(scaler_log, index=analysis.df.index) -scaler_log_df.head(N) -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
meanstddev
20181102_QE2_NLC10_MR_QC_MNT_NEWHELA_029.1170.407
20181102_QE5_nLC5_OOE_QC_HELA_15cm_250ng_RO-0459.4130.413
20181102_QE5_nLC5_OOE_QC_MNT_HELA_15cm_250ng_RO-0038.4010.574
20181102_QE5_nLC5_OOE_QC_MNT_HELA_15cm_250ng_RO-0048.7910.424
20181102_QE9_nLC2_AnMu_QC_Hela_019.1840.434
20181103_QE1_nLC1_RG_QC_HeLa_18.9810.584
20181104_QE1_nLC1_RG_QC_HeLa_19.6290.399
20181105_QE2_NLC10_MR_QC_MNT_HELA_019.2940.339
20181105_QE2_NLC10_MR_QC_MNT_HELA_029.0680.511
20181105_QE8_nLC0_BDA_QC_MNT_HeLa_15cm_019.6360.457
-
- - - - -```python -sample_log10 = scaler_log.transform(X_log10.iloc[:N]) -sample_log10.iloc[:, :M] -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
AAAAAAALQAKAAFDDAIAELDTLSEESYKAAHSEGNTTAGLDMRAAVATFLQSVQVPEFTPKAAVEEGIVLGGGCALLRAAVPSGASTGIYEALELRAAVPSGASTGIYEALELRDNDKACANPAAGSVILLENLRACGLVASNLNLKPGECLRADLINNLGTIAK
20181102_QE2_NLC10_MR_QC_MNT_NEWHELA_020.165-0.548-0.543-0.024-1.3900.360-0.6330.446-0.409-0.141
20181102_QE5_nLC5_OOE_QC_HELA_15cm_250ng_RO-045-0.3090.4170.1960.1020.1520.611-0.0690.148-0.011-0.170
20181102_QE5_nLC5_OOE_QC_MNT_HELA_15cm_250ng_RO-003-0.0260.1800.061-0.1750.1710.5200.106-0.113-0.150-0.342
20181102_QE5_nLC5_OOE_QC_MNT_HELA_15cm_250ng_RO-004-0.2820.0900.072-0.1540.1360.548-0.008-0.240-0.192-0.394
20181102_QE9_nLC2_AnMu_QC_Hela_010.8300.8371.0020.5990.3310.9440.5960.7260.5240.330
20181103_QE1_nLC1_RG_QC_HeLa_1-0.865-1.840-0.694-1.427-1.728-1.108-1.655-2.110-1.518-1.121
20181104_QE1_nLC1_RG_QC_HeLa_1-0.681-1.874-1.939-1.482-1.876-0.864-1.695-2.415-1.552-1.223
20181105_QE2_NLC10_MR_QC_MNT_HELA_01-1.539-0.252-1.9040.0560.0150.426-0.3480.219-1.053-0.494
20181105_QE2_NLC10_MR_QC_MNT_HELA_02-1.394-0.052-0.2400.1350.1370.550-0.1010.245-1.184-0.420
20181105_QE8_nLC0_BDA_QC_MNT_HeLa_15cm_010.156-0.541-0.058NaN-0.833-0.166-0.543NaN-1.5240.319
-
- - - - -```python -scaler_log.inverse_transform(sample_log10).iloc[:, :M] -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
AAAAAAALQAKAAFDDAIAELDTLSEESYKAAHSEGNTTAGLDMRAAVATFLQSVQVPEFTPKAAVEEGIVLGGGCALLRAAVPSGASTGIYEALELRAAVPSGASTGIYEALELRDNDKACANPAAGSVILLENLRACGLVASNLNLKPGECLRADLINNLGTIAK
20181102_QE2_NLC10_MR_QC_MNT_NEWHELA_029.1849.1878.0898.7818.5829.1919.3779.4458.8599.571
20181102_QE5_nLC5_OOE_QC_HELA_15cm_250ng_RO-0458.9919.5858.5138.8359.2509.3389.6029.3449.0629.558
20181102_QE5_nLC5_OOE_QC_MNT_HELA_15cm_250ng_RO-0039.1079.4878.4368.7179.2589.2859.6729.2568.9919.479
20181102_QE5_nLC5_OOE_QC_MNT_HELA_15cm_250ng_RO-0049.0029.4508.4428.7269.2439.3019.6269.2138.9699.456
20181102_QE9_nLC2_AnMu_QC_Hela_019.4559.7598.9769.0459.3289.5329.8679.5409.3369.787
20181103_QE1_nLC1_RG_QC_HeLa_18.7658.6538.0028.1858.4358.3348.9708.5798.2919.123
20181104_QE1_nLC1_RG_QC_HeLa_18.8408.6397.2888.1628.3718.4768.9548.4768.2749.076
20181105_QE2_NLC10_MR_QC_MNT_HELA_018.4919.3097.3088.8159.1919.2309.4919.3688.5299.410
20181105_QE2_NLC10_MR_QC_MNT_HELA_028.5509.3918.2638.8499.2449.3029.5899.3778.4629.444
20181105_QE8_nLC0_BDA_QC_MNT_HeLa_15cm_019.1819.1908.368NaN8.8238.8849.413NaN8.2889.782
-
- - - -### Sanity checks - -#### Correlation - -- Correlation between the computed `means_` should be nearly perfect -- Correlation between peptide intensities should be high -- As taking the logarithm is a monoton, but non-linear transformation, the linear Pearson correlation can change substantially. [[link]](https://stats.stackexchange.com/questions/127121/do-logs-modify-the-correlation-between-two-variables) - - -```python -print("Correlation between mean values of linear vs. log-transformed values:", - f"{np.corrcoef(scaler.mean_, scaler_log.mean_)[1,0]:.4f}", sep='\n') -``` - - Correlation between mean values of linear vs. log-transformed values: - 0.7971 - - - -```python -pd.options.display.float_format = '{:,.3f}'.format - -analysis.corr_linear_vs_log = scaler.transform(X=analysis.df).corrwith( - other=scaler_log.transform(X_log10), - axis=0) -analysis.corr_linear_vs_log.describe() -``` - - - - - count 1,000.000 - mean 0.825 - std 0.036 - min 0.653 - 25% 0.803 - 50% 0.829 - 75% 0.849 - max 0.914 - dtype: float64 - - - - -```python -# own implemention could be slightly faster as data is already demeanded and standardized. -# pd.DataFrame.corrwith? -``` - -#### Distribution - - -```python -import seaborn as sns -from vaep.utils import sample_iterable - -columns_sampled = sample_iterable(list(analysis.df.columns), n=12) -print(columns_sampled) -``` - - ['TIAMDGTEGLVR', 'TAFDDAIAELDTLNEDSYK', 'MEGPLSVFGDR', 'ILQDGGLQVVEK', 'FYEQMNGPVAGASR', 'ILLTEPPMNPTK', 'GLCAIAQAESLR', 'LLLQVQHASK', 'SGGLGGSHALLLLR', 'AVTEQGAELSNEER', 'FVNVVPTFGK', 'VACIGAWHPAR'] - - - -```python -def plot_scaled_sample(columns_sampled: list, scaler, df: pd.DataFrame = analysis.df): - _scaled = scaler.transform(df) - display(_scaled.describe()) - _min, _max = _scaled.min().min(), _scaled.max().max() - return _min, _max - print(list(range(_min, _max, step=0.5))) - - -_min, _max = plot_scaled_sample(columns_sampled=columns_sampled, scaler=scaler) -``` - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
AAAAAAALQAKAAFDDAIAELDTLSEESYKAAHSEGNTTAGLDMRAAVATFLQSVQVPEFTPKAAVEEGIVLGGGCALLRAAVPSGASTGIYEALELRAAVPSGASTGIYEALELRDNDKACANPAAGSVILLENLRACGLVASNLNLKPGECLRADLINNLGTIAK...VVFVFGPDKVVFVFGPDKKVYALPEDLVEVKPKYADLTEDQLPSCESLKYDDMAAAMKYDDMAACMKYDDMATCMKYLAEVACGDDRKYLDEDTIYHLQPSGRYRVPDVLVADPPIAR
count965.000964.000954.000935.000990.000958.000946.000966.000930.000993.000...969.000971.000984.000990.000981.000987.000973.000992.000981.000955.000
mean0.0560.0510.0940.1080.1140.093-0.0060.005-0.0440.097...0.0720.0200.0960.0660.1130.0930.0930.0920.0930.071
std1.0361.0541.1061.1631.1431.0940.9931.0101.0011.209...1.0490.9771.1181.0531.1451.1061.1291.0971.1091.065
min-1.355-1.537-0.642-0.992-0.907-0.545-1.667-1.747-1.299-1.074...-0.851-1.433-1.116-0.858-0.857-1.199-0.910-1.012-0.702-1.420
25%-0.643-0.768-0.496-0.585-0.507-0.460-0.724-0.766-0.919-0.541...-0.492-0.793-0.662-0.537-0.499-0.607-0.536-0.559-0.455-0.661
50%-0.242-0.142-0.330-0.250-0.284-0.392-0.095-0.118-0.267-0.249...-0.338-0.055-0.235-0.300-0.236-0.171-0.261-0.282-0.317-0.146
75%0.4580.654-0.0830.2650.091-0.0900.6190.6080.6430.133...-0.0210.5870.4080.0650.1110.3280.1130.239-0.0890.573
max4.4844.9155.3207.8407.5856.0214.8335.0034.70118.435...5.4484.1507.1495.6237.0026.7246.2716.2635.7935.356
-

8 rows × 1000 columns

-
- - - -```python -# if bins should be equal between plots -# addon -import math -xlim = [-5, 5] -FACTOR = 1 -[x/FACTOR for x in range(math.floor(xlim[0])*FACTOR, - math.ceil(xlim[1])*FACTOR+1)] -``` - - - - - [-5.0, -4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0] - - - - -```python -import matplotlib.pyplot as plt - -columns_sampled = sample_iterable(list(analysis.df.columns), n=9) -subplot_kw = {'xlim': [-5, 5], 'ylim': [0, 600]} -fig, axes = plt.subplots(nrows=3, ncols=3, figsize=( - 15, 15), sharey=True, sharex=True, subplot_kw=subplot_kw) -_ = scaler_log.transform(X_log10)[columns_sampled].hist( - figsize=(15, 15), ax=axes) -axes = scaler.transform(analysis.df)[columns_sampled].hist( - figsize=(15, 15), ax=axes) -_ = fig.legend(('linear', 'log'), loc=5) -``` - - -![png](12_experiment_01_transforms_files/12_experiment_01_transforms_29_0.png) - - - -```python -caption = (f"Frequencies are capped at {subplot_kw['ylim'][1]} and " - "their standardized intensity values plotted between {} and {}.".format( - *subplot_kw['xlim']) - ) -print(caption) -``` - - Frequencies are capped at 600 and their standardized intensity values plotted between -5 and 5. 
- - -### Analysis state so far - - -```python -analysis -``` - - - - - AnalyzePeptides with attributes: N_SAMPLES, corr_linear_vs_log, df, df_train, df_valid, indices, samples, stats - - diff --git a/project/doc/ipynbs/12_experiment_01_transforms_files/12_experiment_01_transforms_29_0.png b/project/doc/ipynbs/12_experiment_01_transforms_files/12_experiment_01_transforms_29_0.png deleted file mode 100644 index 935b5233b..000000000 Binary files a/project/doc/ipynbs/12_experiment_01_transforms_files/12_experiment_01_transforms_29_0.png and /dev/null differ diff --git a/project/doc/ipynbs/3_select_data.md b/project/doc/ipynbs/3_select_data.md deleted file mode 100644 index 199f2cc24..000000000 --- a/project/doc/ipynbs/3_select_data.md +++ /dev/null @@ -1,659 +0,0 @@ -# Analyse peptides - -## Specification -- access different levels of peptides easily -- select training data per gene easily - - - -```python -import os -import time -import json -import logging - -from IPython.core.debugger import set_trace - -logging.basicConfig(level=logging.INFO) # configures root logger -logger = logging.getLogger() -logger.info("test") -``` - - -```python -import pandas as pd -from src.config import FN_FASTA_DB, FN_ID_MAP, FN_PEPTIDE_INTENSITIES, FN_PEPTIDE_STUMP, FOLDER_DATA - -pd.options.display.float_format = '{:,.1f}'.format -``` - - -```python -id_map = pd.read_json(FN_ID_MAP, orient="split") - -mask_no_gene = id_map.gene.isna() -id_map.loc[mask_no_gene, "gene"] = "-" - -with open(FN_FASTA_DB) as f: - data_fasta = json.load(f) -``` - - -```python -from pathlib import Path -l_peptides_files = list(Path(FOLDER_DATA).glob(f"{FN_PEPTIDE_STUMP}*.pkl")) -assert l_peptides_files, 'No matches found' -l_peptides_files -``` - - -```python -data_peptides = [] -for i, file_path in enumerate(l_peptides_files): - _peptides = pd.read_pickle(file_path) - loaded_dtypes = _peptides.dtypes.unique() - print(f"Current dtypes: {''.join(str(x) for x in loaded_dtypes)}\tFile:{file_path}") - if not 
isinstance(loaded_dtypes[0], pd.Int64Dtype): - print(f"try converting data: {file_path}.") - # # use less specific integer check? - # loaded_dtypes[0].is_signed_integer or loaded_dtypes[0].is_unsigned_integer - _peptides = _peptides.convert_dtypes() - _peptides.to_pickle(file_path) - data_peptides.append(_peptides) -``` - - -```python -N_total = sum([len(_data) for _data in data_peptides]) - -peptides_intensities = data_peptides.pop(0) - -while len(data_peptides) > 0: - _data = data_peptides.pop(0) - peptides_intensities = peptides_intensities.append(_data) - del _data - -assert len(peptides_intensities) == N_total -logging.info("Loaded {0} samples having a total of {1:,d} peptides.".format(*peptides_intensities.shape)) -``` - - -```python -# avoid reassambly of data? -# data_peptides.to_pickle(FN_PEPTIDE_INTENSITIES) -``` - - -```python -data_peptides = peptides_intensities -set(data_peptides.dtypes) -``` - - -```python -set_peptides = set(data_peptides.columns) -``` - -- switch between list of proteins with any support and non - - set threshold of number of peptides per protein over all samples (some peptides uniquely matched to one protein in on sample is just noise -> check razor peptides) -- show support - - -```python -peptides_2 = ('TTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGR', - 'LDLAGRDLTDYLMK') - -peptides_4 = ( "ILTERGYSFTTTAEREIVR", - "GYSFTTTAEREIVRDIK", - "EIVRDIKEK", - "DIKEKLCYVALDFEQEMATAASSSSLEK") -peptides_4[:0:-1] -``` - - -```python -# logger.setLevel(logging.DEBUG) -COLORS= ["\033[32;2m", "\033[32;1m", "0;34;47m"] -def annotate_overlap(peptides): - i = len(peptides) - if i > 3: - raise ValueError("Two many peptides provided.") - logging.debug(f"First peptide: {peptides[0]} ") - base_peptide = peptides[0][::-1] - logging.debug(f"Reversed pep: {base_peptide}") - colored_part = "" - overlaps = [] - logging.debug(peptides[:0:-1]) - for pep in peptides[:0:-1]: - - logger.debug(f"Find overlap for: {pep}") - overlap = "" - overlap_in_last_step = False - for 
j, amino_acid in enumerate(pep): - overlap += amino_acid - if overlap[::-1] != base_peptide[:len(overlap)]: - overlap_now = False - else: - overlap_in_last_step = True - logger.debug(f"Found overlap: {overlap}") - if overlap_in_last_step and not overlap_now: - overlaps.append(overlap) - break - logger.debug(f"Search remaining peptide: {base_peptide[len(overlap)::]}") - base_peptide = base_peptide[len(overlap)::] - overlaps.append(base_peptide[::-1]) - return overlaps[::-1] - -assert ''.join(annotate_overlap(peptides_2)) == "TTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGR" -# annotate_overlap(peptides_4) # should raise ValueError -assert ''.join(annotate_overlap(peptides_4[0:3])) == 'ILTERGYSFTTTAEREIVR' -assert ''.join(annotate_overlap(peptides_4[1:])) == 'GYSFTTTAEREIVRDIK' -``` - - -```python -pep_0missed = "GYSFTTTAER" -pep_1missed = ["ILTERGYSFTTTAER", - "GYSFTTTAEREIVR"] -``` - - -```python -from collections import defaultdict -import ipywidgets as w -from src.config import KEY_FASTA_HEADER, KEY_FASTA_SEQ, KEY_PEPTIDES, KEY_GENE_NAME_FASTA - - -pd.options.display.float_format = '{:,.1f}'.format - -TGREEN = "\033[32;2m" # Green Text -TGREEN_2 = "\033[32;1m" # Green Text -RESET = "\033[0;0m" - -w_first_letter = w.Dropdown( - options=id_map[KEY_GENE_NAME_FASTA].str[0].unique()) - -w_genes = w.Dropdown( - options=id_map.gene.loc[id_map[KEY_GENE_NAME_FASTA].str[0] == w_first_letter.value].unique(), - value='ACTB' -) - -mask = id_map.gene == w_genes.value -selected = id_map.loc[mask, "protein"] - - -w_proteins_ids = w.Dropdown(options=selected.index) -w_protein = w.Dropdown(options=selected.unique()) - - -def update_gene_list(first_letter): - """Update proteins when new gene is selected""" - mask_selected_genes = id_map[KEY_GENE_NAME_FASTA].str[0] == w_first_letter.value - w_genes.options = id_map[KEY_GENE_NAME_FASTA].loc[mask_selected_genes].unique() - - -_ = w.interactive_output(update_gene_list, {"first_letter": w_first_letter}) - - -def update_protein_list(gene): - mask = 
id_map[KEY_GENE_NAME_FASTA] == gene - selected = id_map.loc[mask, "protein"] - w_protein.options = selected.unique() -# w_proteins_ids.options = selected.loc[selected == w_protein.value].index - - -_ = w.interactive_output(update_protein_list, {"gene": w_genes}) - - -def update_protein_id_list(protein): - """Update isotope list when protein is selected""" - mask = id_map.protein == w_protein.value - selected = id_map.protein.loc[mask] - w_proteins_ids.options = selected.index - -_ = w.interactive_output(update_protein_id_list, {'protein': w_protein}) - -d_peptides_observed_prot_id = defaultdict(list) - -def show_sequences(prot_id): - _data = data_fasta[prot_id] - print(f"Protein_ID on Uniport: {prot_id}") - print(f"HEADER: {_data[KEY_FASTA_HEADER]}") -# print(f"Seq : {_data[KEY_FASTA_SEQ]}") - annotate_seq = "Peptides: " - global d_peptides_observed_prot_id - for i, _l in enumerate(_data[KEY_PEPTIDES]): - annotate_seq += f"\nNo. of missed K or R: {i}" - prot_seq_annotated = _data[KEY_FASTA_SEQ] - _change_color = False - for j, _pep in enumerate(_l): - if _pep in set_peptides: - d_peptides_observed_prot_id[prot_id].append(_pep) - if _change_color is False: - _pep_in_green = TGREEN + f"{_pep}" + RESET - _change_color = True - else: - _pep_in_green = TGREEN_2 + f"{_pep}" + RESET - _change_color = False - prot_seq_annotated = prot_seq_annotated.replace(_pep, _pep_in_green) - _pep = _pep_in_green - else: - _change_color = False - if j==0: - annotate_seq += "\n\t" - else: - annotate_seq += ",\n\t" - annotate_seq += _pep - - print(f"Seq {i}: {prot_seq_annotated}") - print(annotate_seq) - - - _ = data_peptides[d_peptides_observed_prot_id[prot_id]].dropna(how='all') - if _.columns.size > 2: - display(_) - display(_.describe()) - else: - print("\nNo empirical evidence for protein") - -w_out = w.interactive_output(show_sequences, {"prot_id": w_proteins_ids}) - -label_first_letter = w.Label(value='First letter of Gene') -label_genes = w.Label('Gene') -label_protein = 
w.Label('Protein') -label_proteins_ids = w.Label('Protein Isotopes') - -panel_levels = w.VBox([ - w.HBox([ - w.VBox([label_first_letter, w_first_letter]), - w.VBox([label_genes, w_genes]), - w.VBox([label_protein, w_protein]), - w.VBox([label_proteins_ids, w_proteins_ids]) - ]), - w_out] -) -panel_levels -``` - -> create styler object? - -- [ ] replace zeros with NaN -- [ ] display summary statistics on log-scale (but do not compute summary based on log-scale) - -Get meta-data - - -```python -query_template = "https://www.uniprot.org/uniprot/?query=accession:{prot_id}&format=txt" -``` - -- relatively short peptides resulting from one missed cleaveage, do not appear in the upper part. - -- `gene` `->` `Protein_ID` (contains information of `gene` `->` `protein_isotopes` -- `protein_ID` `->` `sequences` (`FN_FASTA_DB`) - - -```python -import pickle -from tqdm.notebook import tqdm -from src.config import FN_PROTEIN_SUPPORT_MAP, FN_PROTEIN_SUPPORT_FREQ -# from vaep.utils import sample_iterable - -try: - if (time.time() - os.path.getmtime(FN_PROTEIN_SUPPORT_MAP)) / 3600 / 24 > 7: - # recompute file every week - raise FileNotFoundError - df_protein_support = pd.read_pickle(FN_PROTEIN_SUPPORT_MAP) - with open(FN_PROTEIN_SUPPORT_FREQ, 'rb') as f: - d_protein_support_freq = pickle.load(f) -except FileNotFoundError: - d_protein_support = {} - d_protein_support_freq = {} - for prot_id in tqdm(data_fasta.keys()): - _data = data_fasta[prot_id] - peptides_measured = [] - for i, _l in enumerate(_data[KEY_PEPTIDES]): - for _pep in _l: - if _pep in set_peptides: - peptides_measured.append(_pep) - _d_protein_support = {} - _df_support_protein = data_peptides[peptides_measured].dropna(how='all') - - _n_samples = len(_df_support_protein) - if _n_samples > 0: - _d_protein_support['N_samples'] = _n_samples - d_protein_support_freq[prot_id] = _df_support_protein.notna().sum().to_dict() - d_protein_support[prot_id] = _d_protein_support - else: - d_protein_support[prot_id] = None - - 
df_protein_support = pd.DataFrame(d_protein_support).T.dropna() - df_protein_support = df_protein_support.join(id_map) - df_protein_support.to_pickle(FN_PROTEIN_SUPPORT_MAP) - - with open(FN_PROTEIN_SUPPORT_FREQ, 'wb') as f: - pickle.dump(d_protein_support_freq, f) -``` - - -```python -l_proteins_good_support = df_protein_support.sort_values(by='N_samples').tail(100).index.to_list() -``` - - -```python -d_protein_support_freq['I3L3I0'] -``` - -## Connect to experimental peptide data - -Check if counts by `data_fasta`. - - -```python -from tqdm.notebook import tqdm - -counts_observed_by_missed_cleavages = {} -for _protein_id, _data in tqdm(data_fasta.items()): - _peptides = _data[KEY_PEPTIDES] - _counts = {} - for i, _l in enumerate(_peptides): - _counts[i] = 0 - for _pep in _l: - if _pep in set_peptides: - _counts[i] += 1 - counts_observed_by_missed_cleavages[_protein_id] = _counts -``` - - -```python -df_counts_observed_by_missed_cleavages = pd.DataFrame( - counts_observed_by_missed_cleavages -).T -``` - - -```python -import matplotlib.pyplot as plt -from matplotlib import table - -fig, axes = plt.subplots(ncols=2, gridspec_kw={"width_ratios": [5, 1], "wspace": 0.2}, figsize=(10,4)) - -_counts_summed = df_counts_observed_by_missed_cleavages.sum() -_counts_summed.name = "frequency" - -ax = axes[0] -_ = _counts_summed.plot(kind="bar", ax=ax) -ax.set_xlabel("peptides from n miscleavages") -ax.set_ylabel("frequency") - -ax = axes[1] -ax.axis("off") -_ = pd.plotting.table(ax=ax, data=_counts_summed, loc="best", colWidths=[1], edges='open') -_ = fig.suptitle('Peptides frequencies') -``` - -These are unnormalized counts in the meaning of that _razor_ peptides are counted as often as they are matched. 
- - -```python -mask = df_counts_observed_by_missed_cleavages != 0 -df_prot_observed = df_counts_observed_by_missed_cleavages.replace(0, pd.NA) -``` - - -```python -df_prot_observed = df_prot_observed.dropna(axis=0, how="all") -df_prot_observed = df_prot_observed.fillna(0) -df_prot_observed = df_prot_observed.convert_dtypes() -``` - - -```python -from vaep.pandas import combine_value_counts - -combine_value_counts(df_prot_observed) -``` - - -```python -freq_pep_mapped_to_protID = df_prot_observed.sum(axis=1).value_counts() -freq_pep_mapped_to_protID = freq_pep_mapped_to_protID.sort_index() -``` - - -```python -freq_pep_mapped_to_protID -``` - -### Genes with support in data - -try software to identify the _most likely_ protein. [PyOpenMS](https://pyopenms.readthedocs.io/en/latest/) or [Pyteomics](https://pyteomics.readthedocs.io/en/latest/)? - - -```python - -``` - -## Imputation: Train model - -> Select Gene or Protein - -As the samples are all obtained from the same biological sample (in principal), the single run should somehow be comparable. -An description of variablity (from the Data Scientist perspective) can highlight some commenly known facts about proteomics experiments: - - batch effects: Measurements on consecutive days are have to be normalized to each other - - scoring: PSM are assigned to a peptide based on a score. Small variations can lead to different assignments - -Can a complex representation of a sample level out experimental variation on an in principle comparable data. 
- -### Strategy -- first start using peptides from single Protein_IDs -- then move to all models from genes -- explore structure - - -```python -d_peptides_observed_prot_id -``` - - -```python -data_peptides.shape -``` - - -```python -w_select_proteins_good_support = w.Dropdown(options=l_proteins_good_support) -w_select_proteins_queried = w.Dropdown(options=list(d_peptides_observed_prot_id.keys())) - -# select from top100 or above - -import vaep -from vaep.transform import log -from src.config import PROTEIN_DUMPS - -def main_trigger(prot_id): - """Explore protein data - - Global Variables used - --------------------- - data_peptides : pandas.DataFrame - id_map : pandas.DataFrame - d_peptides_observed_prot_id: dict - - - Global variables set - -------------------- - peptides_selected_log10: pandas.DataFrame - Current selection of data for protein_id. All possible features are returned. log10 transformed - prod_id : str - Passed prot_id to function exposed globally - """ - print(f'Protein Identifier: {prot_id}') - _gene_name = id_map.loc[prot_id, KEY_GENE_NAME_FASTA] # Select gene name, based on selected FASTA-File - _protein = id_map.protein.loc[prot_id] # Protein Name summarized several UNIPROT isotopes (PROT, PROT_2, PROT_3, etc) - print(f'Gene Identifier {_gene_name}') - # configure viewer above - w_first_letter.value = _gene_name[0] - w_genes.value = _gene_name - w_protein.value = _protein - w_proteins_ids.value = prot_id - - peptides_measured = d_peptides_observed_prot_id[prot_id] # get observed peptides according to pre-computed dictionary - n_peptides_in_selection = len(peptides_measured) - print(f"Found {n_peptides_in_selection} peptides measured of this protein.\n\n") - - peptides_selected = data_peptides[peptides_measured] # select subsample (as view) of peptides - mask_selected_notna = data_peptides[peptides_measured].notna() - selected_notna_summed_ax1 = mask_selected_notna.sum(axis=1) - print("How many samples have how many peptides quantified?") - for 
n_peptides, n_samples in selected_notna_summed_ax1.value_counts().sort_index().tail(10).items(): - print(f"In {n_samples:5} samples are {n_peptides:5} peptides measured.") - - PROP_DATA_COMPLETENESS = 0.5 - mask_samples_selected = selected_notna_summed_ax1 >= int(n_peptides_in_selection * PROP_DATA_COMPLETENESS) - print(f"\nUsing a share of at least {PROP_DATA_COMPLETENESS}, " - f"i.e. at least {int(n_peptides_in_selection * PROP_DATA_COMPLETENESS)} out of {n_peptides_in_selection}.", - f"In total {mask_samples_selected.sum()} samples are selected for further analysis.", sep="\n") - # from IPython.core.debugger import set_trace; set_trace() - _ = peptides_selected.loc[mask_samples_selected, peptides_measured] - _.index.name = f"protein_id {prot_id}" - # _.to_json(PROTEIN_DUMPS / f"{prot_id}.json") - - display(_) - # display(_.describe()) - global peptides_selected_log10 - peptides_selected_log10 = _.apply(log) # selected in widget overview above - display(peptides_selected_log10) - display(peptides_selected_log10.describe()) - global prot_last - prot_last = prot_id - -w.VBox([ - w.HBox( - [ - w.VBox( - [ - w.Label(f"Top {len(l_proteins_good_support)} covered proteins"), - w_select_proteins_good_support, - ] - ), - w.VBox([w.Label("Queried proteins from above"), w_select_proteins_queried]), - ] - ), - w.interactive_output(main_trigger, {"prot_id": w_select_proteins_good_support}) -]) -``` - - -```python -from datetime import datetime - -import torch -from torch import optim -from torch.utils.tensorboard import SummaryWriter - -# import importlib; importlib.reload(vaep.model) -from vaep.model import train -from vaep.model import VAE -from vaep.model import loss_function -from vaep.cmd import get_args - -# from vaep.model import PeptideDatasetInMemory -# import importlib; importlib.reload(vaep.io.datasets) -from vaep.io.datasets import PeptideDatasetInMemory -``` - - -```python -# # https://pytorch.org/docs/stable/data.html#memory-pinning -# from torch.utils.data 
import TensorDataset, DataLoader -# class SimpleCustomBatch: -# def __init__(self, data): -# transposed_data = list(zip(*data)) -# self.inp = torch.stack(transposed_data[0], 0) -# self.tgt = torch.stack(transposed_data[1], 0) - -# # custom memory pinning method on custom type -# def pin_memory(self): -# self.inp = self.inp.pin_memory() -# self.tgt = self.tgt.pin_memory() -# return self - -# def collate_wrapper(batch): -# return SimpleCustomBatch(batch) - -# inps = torch.arange(10 * 5, dtype=torch.float32).view(10, 5) -# tgts = torch.arange(10 * 5, dtype=torch.float32).view(10, 5) -# dataset = TensorDataset(inps, tgts) - -# loader = DataLoader(dataset, batch_size=2, collate_fn=collate_wrapper, -# pin_memory=True) - -# for batch_ndx, sample in enumerate(loader): -# print(sample.inp.is_pinned()) -# print(sample.tgt.is_pinned()) -``` - - -```python -args = get_args(no_cuda=True) -kwargs = {'num_workers': 2, 'pin_memory': True} if args.cuda else {} - -torch.manual_seed(args.seed) -device = torch.device("cuda" if args.cuda else "cpu") -device -``` - - -```python -w_start_training = w.Button(description='Train on new selection') - -def main_train(): - n_samples, n_features = peptides_selected_log10.shape - detection_limit = float(int(peptides_selected_log10.min().min())) - detection_limit # replace by mean of sample/ features? - - dataset_in_memory = PeptideDatasetInMemory(data=peptides_selected_log10, fill_na=detection_limit, device=device) - - train_loader = torch.utils.data.DataLoader( - dataset=dataset_in_memory, - batch_size=args.batch_size, shuffle=True, **kwargs) - #ToDo: Send data to correct device set above manually. Check docs.. 
- - data, mask = next(iter(train_loader)) - - writer = SummaryWriter(f'runs/{prot_last}_{format(datetime.now(), "%y%m%d_%H%M")}') - writer.add_image(f'{len(data)} batch of sampled data (as heatmap)', data, dataformats='HW') - writer.add_image(f'{len(mask)} mask for this batch of samples', mask, dataformats='HW') - - global model - model = VAE(n_features=n_features, n_neurons=30).double() - model = model.to(device, non_blocking=True) - writer.add_graph(model, input_to_model=data) - optimizer = optim.Adam(model.parameters(), lr=1e-4) - - for epoch in range(1, args.epochs): - train(epoch, model=model, train_loader=train_loader, optimizer=optimizer, device=device, writer=writer) - writer.flush() - writer.close() - -# w_out_training = w.interactive_output(w_start_training, w_start_training) - -w_out_training = w.Output() -display(w_start_training, w_out_training) - -def on_button_clicked(b): - with w_out_training: - main_train() - -w_start_training.on_click(on_button_clicked) -``` - - -```python -# # Load the TensorBoard notebook extension -# %load_ext tensorboard -``` - -Idea: Select a protein which leads to training. Each selection will create a dump of the selected data, which can be used in the `XZY.ipynb` for model fine-tuning. diff --git a/project/doc/ipynbs/5_0_summaries.md b/project/doc/ipynbs/5_0_summaries.md deleted file mode 100644 index b03c982e7..000000000 --- a/project/doc/ipynbs/5_0_summaries.md +++ /dev/null @@ -1,996 +0,0 @@ -# Analysis of `summaries.txt` information - -- number of raw files (no here) -- number of raw files with MQ-Output -- MS1 per file -- MS2 per file - - -```python -import ipywidgets as widgets - -from vaep.io.data_objects import MqAllSummaries -import vaep - -from src.src.config import FN_ALL_SUMMARIES - -mq_all_summaries = MqAllSummaries() -mq_all_summaries.df.describe().T -``` - - MqAllSummaries: Load summaries of 9381 folders. - - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
countmeanstdmin25%50%75%max
Enzyme first search0<NA><NA><NA><NA><NA><NA><NA>
Enzyme mode first search0<NA><NA><NA><NA><NA><NA><NA>
Multi modifications0<NA><NA><NA><NA><NA><NA><NA>
Variable modifications first search0<NA><NA><NA><NA><NA><NA><NA>
Multiplicity9,381.01.00.01.01.01.01.01.0
Max. missed cleavages9,381.02.00.02.02.02.02.02.0
Labels00<NA><NA><NA><NA><NA><NA><NA>
Time-dependent recalibration0<NA><NA><NA><NA><NA><NA><NA>
MS9,381.011,285.25,682.40.09,004.011,529.012,883.047,267.0
MS/MS9,381.079,293.143,645.30.036,327.084,141.0117,083.0189,955.0
MS39,381.046.41,220.80.00.00.00.066,980.0
MS/MS Submitted9,381.091,523.349,986.00.041,827.097,355.0133,756.0230,182.0
MS/MS Submitted (SIL)9,381.067,044.937,788.40.030,534.071,627.0100,832.0157,459.0
MS/MS Submitted (ISO)9,381.00.00.00.00.00.00.00.0
MS/MS Submitted (PEAK)9,381.024,478.415,152.10.012,960.025,388.032,900.0126,478.0
MS/MS Identified9,381.031,944.820,572.50.09,697.039,963.049,095.076,652.0
MS/MS Identified (SIL)9,381.030,576.119,735.90.09,323.038,040.047,185.074,314.0
MS/MS Identified (ISO)9,381.00.00.00.00.00.00.00.0
MS/MS Identified (PEAK)9,381.01,368.7997.60.0406.01,513.02,047.07,729.0
MS/MS Identified [%]9,381.031.316.00.022.036.041.078.0
MS/MS Identified (SIL) [%]9,381.040.219.20.031.046.052.080.0
MS/MS Identified (ISO) [%]9,381.00.00.00.00.00.00.00.0
MS/MS Identified (PEAK) [%]9,381.05.84.00.03.45.77.350.0
Peptide Sequences Identified9,381.024,232.415,771.50.07,245.029,946.036,782.054,316.0
Peaks9,381.01,189,631.5499,261.20.0935,204.01,317,764.01,462,935.05,503,705.0
Peaks Sequenced9,381.073,119.742,111.20.034,084.078,421.0109,051.0176,131.0
Peaks Sequenced [%]9,378.05.92.70.04.46.47.919.0
Peaks Repeatedly Sequenced9,381.03,533.22,913.50.01,442.02,997.04,884.022,374.0
Peaks Repeatedly Sequenced [%]9,287.07.611.80.02.84.16.585.0
Isotope Patterns9,381.0155,514.466,895.80.0107,488.0176,627.0202,098.01,013,471.0
Isotope Patterns Sequenced9,381.058,522.633,454.20.027,623.064,766.087,417.0123,727.0
Isotope Patterns Sequenced (z>1)9,381.057,494.232,957.60.027,193.063,874.085,933.0121,666.0
Isotope Patterns Sequenced [%]9,378.034.413.90.026.038.045.068.0
Isotope Patterns Sequenced (z>1) [%]9,378.038.114.60.030.042.048.074.0
Isotope Patterns Repeatedly Sequenced9,381.06,585.24,831.20.02,121.06,367.09,853.028,582.0
Isotope Patterns Repeatedly Sequenced [%]9,281.013.212.40.07.310.014.091.0
Av. Absolute Mass Deviation [ppm]9,240.00.70.20.00.60.70.84.4
Mass Standard Deviation [ppm]9,240.01.00.20.00.91.01.14.4
Av. Absolute Mass Deviation [mDa]9,240.00.50.20.00.40.40.52.9
Mass Standard Deviation [mDa]9,240.00.70.20.00.60.70.73.1
-
- - - -Find unique columns, see [post](https://stackoverflow.com/a/54405767/9684872) - - -```python -from vaep.pandas import unique_cols -unique_cols(mq_all_summaries.df.Multiplicity), unique_cols(mq_all_summaries.df["Variable modifications first search"]) # int, NA -``` - - - - - (True, True) - - - - -```python -from vaep.pandas import get_unique_non_unique_columns -columns = get_unique_non_unique_columns(mq_all_summaries.df) -mq_all_summaries.df[columns.unique] -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
EnzymeEnzyme modeEnzyme first searchEnzyme mode first searchUse enzyme first searchVariable modificationsFixed modificationsMulti modificationsVariable modifications first searchUse variable modifications first searchRequantifyMultiplicityMax. missed cleavagesLabels0LC-MS run typeTime-dependent recalibrationMS/MS Submitted (ISO)MS/MS Identified (ISO)MS/MS Identified (ISO) [%]
20190819_QX2_SeVW_MA_HeLa_500ng_CTCDoff_LC05Trypsin/PSpecific<NA><NA>FalseOxidation (M);Acetyl (Protein N-term)Carbamidomethyl (C)<NA><NA>FalseFalse12<NA>Standard<NA>000
20200924_EXPL6_nLC09_MBK_QC_MNT_HeLa_42cm_FAIMS_500ng_Short_05Trypsin/PSpecific<NA><NA>FalseOxidation (M);Acetyl (Protein N-term)Carbamidomethyl (C)<NA><NA>FalseFalse12<NA>Standard<NA>000
20170509_QE4_LC12_IAH_QC_MNT_HeLa_01Trypsin/PSpecific<NA><NA>FalseOxidation (M);Acetyl (Protein N-term)Carbamidomethyl (C)<NA><NA>FalseFalse12<NA>Standard<NA>000
20190624_QE4_nLC12_MM_QC_MNT_HELA_01_20190625144904Trypsin/PSpecific<NA><NA>FalseOxidation (M);Acetyl (Protein N-term)Carbamidomethyl (C)<NA><NA>FalseFalse12<NA>Standard<NA>000
20190110_QE2_NLC10_GP_QC_MNT_HELA_01Trypsin/PSpecific<NA><NA>FalseOxidation (M);Acetyl (Protein N-term)Carbamidomethyl (C)<NA><NA>FalseFalse12<NA>Standard<NA>000
............................................................
20160609_QE2_nLC1_BTW_SA_hela_W_proteome_exp2_08Trypsin/PSpecific<NA><NA>FalseOxidation (M);Acetyl (Protein N-term)Carbamidomethyl (C)<NA><NA>FalseFalse12<NA>Standard<NA>000
20160304_LUMOS1_nLC9_ChKe_DEV_HeLa_10xPatch_AGC4e4_cutoff5e5_01Trypsin/PSpecific<NA><NA>FalseOxidation (M);Acetyl (Protein N-term)Carbamidomethyl (C)<NA><NA>FalseFalse12<NA>Standard<NA>000
20160507_LUMOS1_nLC14_RJC_QC_MNTv3_HeLa_03Trypsin/PSpecific<NA><NA>FalseOxidation (M);Acetyl (Protein N-term)Carbamidomethyl (C)<NA><NA>FalseFalse12<NA>Standard<NA>000
20160607_QE1_nlc2_BTW_SA_hela_100pctAc-L_PCA-H_1-10_ACK_01Trypsin/PSpecific<NA><NA>FalseOxidation (M);Acetyl (Protein N-term)Carbamidomethyl (C)<NA><NA>FalseFalse12<NA>Standard<NA>000
20160714_QE2_nLC0_SS_SA_hela_L_1Gy_M_10Gy_H_2hrs_400mM_pH11Trypsin/PSpecific<NA><NA>FalseOxidation (M);Acetyl (Protein N-term)Carbamidomethyl (C)<NA><NA>FalseFalse12<NA>Standard<NA>000
-

9381 rows × 19 columns

-
- - - - -```python -mq_all_summaries.df[columns.unique].dtypes -``` - - - - - Enzyme category - Enzyme mode category - Enzyme first search Int64 - Enzyme mode first search Int64 - Use enzyme first search boolean - Variable modifications category - Fixed modifications category - Multi modifications Int64 - Variable modifications first search Int64 - Use variable modifications first search boolean - Requantify boolean - Multiplicity Int64 - Max. missed cleavages Int64 - Labels0 Int64 - LC-MS run type category - Time-dependent recalibration Int64 - MS/MS Submitted (ISO) Int64 - MS/MS Identified (ISO) Int64 - MS/MS Identified (ISO) [%] Int64 - dtype: object - - - - -```python -mq_all_summaries.df[columns.unique].iloc[0,:] -``` - - - - - Enzyme Trypsin/P - Enzyme mode Specific - Enzyme first search - Enzyme mode first search - Use enzyme first search False - Variable modifications Oxidation (M);Acetyl (Protein N-term) - Fixed modifications Carbamidomethyl (C) - Multi modifications - Variable modifications first search - Use variable modifications first search False - Requantify False - Multiplicity 1 - Max. 
missed cleavages 2 - Labels0 - LC-MS run type Standard - Time-dependent recalibration - MS/MS Submitted (ISO) 0 - MS/MS Identified (ISO) 0 - MS/MS Identified (ISO) [%] 0 - Name: 20190819_QX2_SeVW_MA_HeLa_500ng_CTCDoff_LC05, dtype: object - - - -## Analysis of completeness - - -```python -class col_summary: - MS = 'MS' - MS2 = 'MS/MS Identified' - -MS_spectra = mq_all_summaries.df[[col_summary.MS, col_summary.MS2]] -def compute_summary(threshold_ms2_identified): - mask = MS_spectra[col_summary.MS2] >= threshold_ms2_identified - display(MS_spectra.loc[mask].describe()) - -w_ions_range = widgets.IntSlider(value=0.0, min=.0, max=MS_spectra[col_summary.MS2].max()) -display(widgets.interactive(compute_summary, threshold_ms2_identified=w_ions_range)) -``` - - - interactive(children=(IntSlider(value=0, description='threshold_ms2_identified', max=76652), Output()), _dom_c… - - - -```python -mask = (MS_spectra < 1).any(axis=1) -MS_spectra.loc[mask] -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
MSMS/MS Identified
20190423_QE7_Evo1_UHG_QC_MNT_HELA_02_2019042419561954180
20131112_QE2_UPLC4_Vyt_MNT_HeLa_1218640
20190906_QE8_nLC14_FM_QC_MNT_HeLa_50cm_test476980
20191021_QE7_Evo4_QC_MNT_Hela_100ng_21min_0354020
20191029_QE8_nLC01_FaCo_MNT_QC_Hela_15cm76040
.........
20180509_QE1_nLC10_BTW_SA_HeLa_AQUA_Ac_E2_0-1pct_F02215730
20180306_QE8_nLC1_BDA_QC_MNT_HeLa_02265060
20160627_LUMOS1_nLC13_ChKe_DEV_HeLa_FullMS_DTSon_01_160628223911240870
20160627_LUMOS1_nLC13_ChKe_DEV_HeLa_FullMS_DTSoff_01240520
20160715_LUMOS1_nLC13_ChKe_DEV_HeLa_FullMS_DTSon_01242270
-

112 rows × 2 columns

-
- - - - -```python - -``` diff --git a/project/doc/ipynbs/json_formats.md b/project/doc/ipynbs/json_formats.md deleted file mode 100644 index c632a63d6..000000000 --- a/project/doc/ipynbs/json_formats.md +++ /dev/null @@ -1,900 +0,0 @@ -# Json Formats - -- object is loaded with the correct conversions (but this is re-computed) -- can shared information be saved as "meta" information? - -- [`pd.json_normalize`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.json_normalize.html) should be able to efficiently combine information - - -```python -import pandas as pd -from vaep.io.data_objects import MqAllSummaries -from vaep.pandas import get_unique_non_unique_columns - -mq_all_summaries = MqAllSummaries() -``` - - MqAllSummaries: Load summaries of 9381 folders. - - -## summaries.json - -### Table format with schema - - -```python -# json format with categories -columns = get_unique_non_unique_columns(mq_all_summaries.df) -columns.unique[:2] -``` - - - - - Index(['Enzyme', 'Enzyme mode'], dtype='object') - - - - -```python -mq_all_summaries.df[columns.unique[:3]].dtypes -``` - - - - - Enzyme category - Enzyme mode category - Enzyme first search Int64 - dtype: object - - - - -```python -type(mq_all_summaries.df.iloc[0,3]) -``` - - - - - pandas._libs.missing.NAType - - - - -```python -meta = mq_all_summaries.df[columns.unique].iloc[0].to_json(indent=4, orient='table') -# print(meta) -``` - - -```python -pd.read_json(meta, orient='table').T.convert_dtypes() -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
EnzymeEnzyme modeEnzyme first searchEnzyme mode first searchUse enzyme first searchVariable modificationsFixed modificationsMulti modificationsVariable modifications first searchUse variable modifications first searchRequantifyMultiplicityMax. missed cleavagesLabels0LC-MS run typeTime-dependent recalibrationMS/MS Submitted (ISO)MS/MS Identified (ISO)MS/MS Identified (ISO) [%]
20190819_QX2_SeVW_MA_HeLa_500ng_CTCDoff_LC05Trypsin/PSpecific<NA><NA>FalseOxidation (M);Acetyl (Protein N-term)Carbamidomethyl (C)<NA><NA>FalseFalse12<NA>Standard<NA>000
-
- - - - -```python -pd.read_json(meta, orient='table') # produce errors when having int columns has NaN -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
20190819_QX2_SeVW_MA_HeLa_500ng_CTCDoff_LC05
EnzymeTrypsin/P
Enzyme modeSpecific
Enzyme first searchNaN
Enzyme mode first searchNaN
Use enzyme first searchFalse
Variable modificationsOxidation (M);Acetyl (Protein N-term)
Fixed modificationsCarbamidomethyl (C)
Multi modificationsNaN
Variable modifications first searchNaN
Use variable modifications first searchFalse
RequantifyFalse
Multiplicity1
Max. missed cleavages2
Labels0NaN
LC-MS run typeStandard
Time-dependent recalibrationNaN
MS/MS Submitted (ISO)0
MS/MS Identified (ISO)0
MS/MS Identified (ISO) [%]0
-
- - - - -```python -pd.options.display.max_columns = len(columns.non_unique) -# mq_all_summaries.df[columns.non_unique] -``` - - -```python -data = mq_all_summaries.df[columns.non_unique].iloc[0:3].to_json() -data = pd.read_json(data) -data -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
MSMS/MSMS3MS/MS SubmittedMS/MS Submitted (SIL)MS/MS Submitted (PEAK)MS/MS IdentifiedMS/MS Identified (SIL)MS/MS Identified (PEAK)MS/MS Identified [%]MS/MS Identified (SIL) [%]MS/MS Identified (PEAK) [%]Peptide Sequences IdentifiedPeaksPeaks SequencedPeaks Sequenced [%]Peaks Repeatedly SequencedPeaks Repeatedly Sequenced [%]Isotope PatternsIsotope Patterns SequencedIsotope Patterns Sequenced (z>1)Isotope Patterns Sequenced [%]Isotope Patterns Sequenced (z>1) [%]Isotope Patterns Repeatedly SequencedIsotope Patterns Repeatedly Sequenced [%]RecalibratedAv. Absolute Mass Deviation [ppm]Mass Standard Deviation [ppm]Av. Absolute Mass Deviation [mDa]Mass Standard Deviation [mDa]
20190819_QX2_SeVW_MA_HeLa_500ng_CTCDoff_LC0512,136123,1870138,029108,34529,68455,34153,6791,66240505.646,8961,323,164121,0249.11,4201.2224,886102,240101,14245505,5865.5+0.71.00.50.7
20200924_EXPL6_nLC09_MBK_QC_MNT_HeLa_42cm_FAIMS_500ng_Short_0520,25318,887023,17514,5998,5765,0304,89913122341.54,005597,95417,0712.94622.756,37413,00612,89523251,37211.0+1.11.40.81.0
20170509_QE4_LC12_IAH_QC_MNT_HeLa_0114,78681,890092,46271,31821,14451,46448,9302,534566912.039,0111,480,44876,8235.22,6573.5202,66365,45764,65732355,0107.7+0.60.90.40.6
-
- - - - -```python -mq_all_summaries.fp_summaries.parent / mq_all_summaries.fp_summaries.stem / '_meta.json' -``` - - - - - WindowsPath('data/processed/all_summaries/_meta.json') - - - - -```python -meta = mq_all_summaries.df[columns.unique].iloc[0].to_json(indent=4) -meta = pd.read_json(meta, typ='series') -meta -``` - - - - - Enzyme Trypsin/P - Enzyme mode Specific - Enzyme first search None - Enzyme mode first search None - Use enzyme first search False - Variable modifications Oxidation (M);Acetyl (Protein N-term) - Fixed modifications Carbamidomethyl (C) - Multi modifications None - Variable modifications first search None - Use variable modifications first search False - Requantify False - Multiplicity 1 - Max. missed cleavages 2 - Labels0 None - LC-MS run type Standard - Time-dependent recalibration None - MS/MS Submitted (ISO) 0 - MS/MS Identified (ISO) 0 - MS/MS Identified (ISO) [%] 0 - dtype: object - - - - -```python -for col, value in meta.items(): - data[col] = value -``` - - -```python -data -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
MSMS/MSMS3MS/MS SubmittedMS/MS Submitted (SIL)MS/MS Submitted (PEAK)MS/MS IdentifiedMS/MS Identified (SIL)MS/MS Identified (PEAK)MS/MS Identified [%]MS/MS Identified (SIL) [%]MS/MS Identified (PEAK) [%]Peptide Sequences IdentifiedPeaksPeaks Sequenced...Use enzyme first searchVariable modificationsFixed modificationsMulti modificationsVariable modifications first searchUse variable modifications first searchRequantifyMultiplicityMax. missed cleavagesLabels0LC-MS run typeTime-dependent recalibrationMS/MS Submitted (ISO)MS/MS Identified (ISO)MS/MS Identified (ISO) [%]
20190819_QX2_SeVW_MA_HeLa_500ng_CTCDoff_LC0512,136123,1870138,029108,34529,68455,34153,6791,66240505.646,8961,323,164121,024...FalseOxidation (M);Acetyl (Protein N-term)Carbamidomethyl (C)NoneNoneFalseFalse12NoneStandardNone000
20200924_EXPL6_nLC09_MBK_QC_MNT_HeLa_42cm_FAIMS_500ng_Short_0520,25318,887023,17514,5998,5765,0304,89913122341.54,005597,95417,071...FalseOxidation (M);Acetyl (Protein N-term)Carbamidomethyl (C)NoneNoneFalseFalse12NoneStandardNone000
20170509_QE4_LC12_IAH_QC_MNT_HeLa_0114,78681,890092,46271,31821,14451,46448,9302,534566912.039,0111,480,44876,823...FalseOxidation (M);Acetyl (Protein N-term)Carbamidomethyl (C)NoneNoneFalseFalse12NoneStandardNone000
-

3 rows × 49 columns

-
- - - -## Table schema bug - -- filed bug report on pandas [#40255](https://github.com/pandas-dev/pandas/issues/40255) - - -```python -pd.show_versions() -``` - - - INSTALLED VERSIONS - ------------------ - commit : f2c8480af2f25efdbd803218b9d87980f416563e - python : 3.8.5.final.0 - python-bits : 64 - OS : Windows - OS-release : 10 - Version : 10.0.19041 - machine : AMD64 - processor : Intel64 Family 6 Model 165 Stepping 2, GenuineIntel - byteorder : little - LC_ALL : None - LANG : None - LOCALE : Danish_Denmark.1252 - - pandas : 1.2.3 - numpy : 1.18.5 - pytz : 2020.1 - dateutil : 2.8.1 - pip : 20.2.4 - setuptools : 50.3.1.post20201107 - Cython : None - pytest : None - hypothesis : None - sphinx : 3.5.1 - blosc : None - feather : None - xlsxwriter : None - lxml.etree : None - html5lib : None - pymysql : None - psycopg2 : None - jinja2 : 2.11.2 - IPython : 7.19.0 - pandas_datareader: None - bs4 : None - bottleneck : None - fsspec : None - fastparquet : None - gcsfs : None - matplotlib : 3.3.2 - numexpr : None - odfpy : None - openpyxl : 3.0.5 - pandas_gbq : None - pyarrow : None - pyxlsb : None - s3fs : None - scipy : 1.5.2 - sqlalchemy : None - tables : None - tabulate : None - xarray : None - xlrd : None - xlwt : None - numba : None - - - -```python -pd.__version__ -``` - - - - - '1.2.3' - - - - -```python -import pandas -data = {'A' : [1, 2, 2, pd.NA, 4, 8, 8, 8, 8, 9], - 'B': [pd.NA] * 10} -data = pd.DataFrame(data) -data = data.astype(pd.Int64Dtype()) # in my example I get this from data.convert_dtypes() -data_json = data.to_json(orient='table', indent=4) -pd.read_json(data_json, orient='table') #ValueError: Cannot convert non-finite values (NA or inf) to integer -``` - - - --------------------------------------------------------------------------- - - ValueError Traceback (most recent call last) - - in - 5 data = data.astype(pd.Int64Dtype()) # in my example I get this from data.convert_dtypes() - 6 data_json = data.to_json(orient='table', indent=4) - ----> 7 
pd.read_json(data_json, orient='table') #ValueError: Cannot convert non-finite values (NA or inf) to integer - - - ~\Anaconda3\envs\vaep\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs) - 197 else: - 198 kwargs[new_arg_name] = new_arg_value - --> 199 return func(*args, **kwargs) - 200 - 201 return cast(F, wrapper) - - - ~\Anaconda3\envs\vaep\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs) - 297 ) - 298 warnings.warn(msg, FutureWarning, stacklevel=stacklevel) - --> 299 return func(*args, **kwargs) - 300 - 301 return wrapper - - - ~\Anaconda3\envs\vaep\lib\site-packages\pandas\io\json\_json.py in read_json(path_or_buf, orient, typ, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, date_unit, encoding, lines, chunksize, compression, nrows, storage_options) - 561 - 562 with json_reader: - --> 563 return json_reader.read() - 564 - 565 - - - ~\Anaconda3\envs\vaep\lib\site-packages\pandas\io\json\_json.py in read(self) - 692 obj = self._get_object_parser(self._combine_lines(data_lines)) - 693 else: - --> 694 obj = self._get_object_parser(self.data) - 695 self.close() - 696 return obj - - - ~\Anaconda3\envs\vaep\lib\site-packages\pandas\io\json\_json.py in _get_object_parser(self, json) - 714 obj = None - 715 if typ == "frame": - --> 716 obj = FrameParser(json, **kwargs).parse() - 717 - 718 if typ == "series" or obj is None: - - - ~\Anaconda3\envs\vaep\lib\site-packages\pandas\io\json\_json.py in parse(self) - 829 - 830 else: - --> 831 self._parse_no_numpy() - 832 - 833 if self.obj is None: - - - ~\Anaconda3\envs\vaep\lib\site-packages\pandas\io\json\_json.py in _parse_no_numpy(self) - 1093 ) - 1094 elif orient == "table": - -> 1095 self.obj = parse_table_schema(json, precise_float=self.precise_float) - 1096 else: - 1097 self.obj = DataFrame( - - - ~\Anaconda3\envs\vaep\lib\site-packages\pandas\io\json\_table_schema.py in parse_table_schema(json, precise_float) - 330 ) - 331 - --> 332 df = 
df.astype(dtypes) - 333 - 334 if "primaryKey" in table["schema"]: - - - ~\Anaconda3\envs\vaep\lib\site-packages\pandas\core\generic.py in astype(self, dtype, copy, errors) - 5860 if col_name in dtype: - 5861 results.append( - -> 5862 col.astype(dtype=dtype[col_name], copy=copy, errors=errors) - 5863 ) - 5864 else: - - - ~\Anaconda3\envs\vaep\lib\site-packages\pandas\core\generic.py in astype(self, dtype, copy, errors) - 5875 else: - 5876 # else, only a single dtype is given - -> 5877 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors) - 5878 return self._constructor(new_data).__finalize__(self, method="astype") - 5879 - - - ~\Anaconda3\envs\vaep\lib\site-packages\pandas\core\internals\managers.py in astype(self, dtype, copy, errors) - 629 self, dtype, copy: bool = False, errors: str = "raise" - 630 ) -> "BlockManager": - --> 631 return self.apply("astype", dtype=dtype, copy=copy, errors=errors) - 632 - 633 def convert( - - - ~\Anaconda3\envs\vaep\lib\site-packages\pandas\core\internals\managers.py in apply(self, f, align_keys, ignore_failures, **kwargs) - 425 applied = b.apply(f, **kwargs) - 426 else: - --> 427 applied = getattr(b, f)(**kwargs) - 428 except (TypeError, NotImplementedError): - 429 if not ignore_failures: - - - ~\Anaconda3\envs\vaep\lib\site-packages\pandas\core\internals\blocks.py in astype(self, dtype, copy, errors) - 671 vals1d = values.ravel() - 672 try: - --> 673 values = astype_nansafe(vals1d, dtype, copy=True) - 674 except (ValueError, TypeError): - 675 # e.g. 
astype_nansafe can fail on object-dtype of strings - - - ~\Anaconda3\envs\vaep\lib\site-packages\pandas\core\dtypes\cast.py in astype_nansafe(arr, dtype, copy, skipna) - 1066 - 1067 if not np.isfinite(arr).all(): - -> 1068 raise ValueError("Cannot convert non-finite values (NA or inf) to integer") - 1069 - 1070 elif is_object_dtype(arr): - - - ValueError: Cannot convert non-finite values (NA or inf) to integer - - - -```python -print(data.to_string()) -``` - - A B - 0 1 - 1 2 - 2 2 - 3 - 4 4 - 5 8 - 6 8 - 7 8 - 8 8 - 9 9 - - - -```python -N = 3 -meta = mq_all_summaries.df[columns.unique[:N]].iloc[0:2].reset_index(drop=True) -meta.to_dict() -``` - - - - - {'Enzyme': {0: 'Trypsin/P', 1: 'Trypsin/P'}, - 'Enzyme mode': {0: 'Specific', 1: 'Specific'}, - 'Enzyme first search': {0: , 1: }} - - diff --git a/project/doc/ipynbs/postprocess.py b/project/doc/ipynbs/postprocess.py deleted file mode 100644 index 1f5122443..000000000 --- a/project/doc/ipynbs/postprocess.py +++ /dev/null @@ -1,77 +0,0 @@ -import re -import argparse -from pathlib import Path, PosixPath - -collection_regex = {} - -l_regexes = [ - '', - ' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
intensity
Sample IDpeptide
sample_000feat_0013.471
feat_0213.852
feat_0324.578
feat_0410.293
feat_0620.624
.........
sample_099feat_1027.959
feat_115.352
feat_1228.055
feat_1327.344
feat_146.033
-

1341 rows × 1 columns

- - - - - -```python -print(f"Based on total number of rows, 95% is roughly: {int(len(X) * 0.95)}") -print("Based on each sample's 95% obs, it is roughly: {}".format( - X.groupby('Sample ID').apply(lambda df: int(len(df) * 0.95)).sum())) -``` - - Based on total number of rows, 95% is roughly: 1273 - Based on each sample's 95% obs, it is roughly: 1241 - - -## Sampling using a column with the weights - - -```python -X = X.join(freq, on='peptide') -X -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
intensityfreq
Sample IDpeptide
sample_000feat_0013.47188
feat_0213.85294
feat_0324.57891
feat_0410.29387
feat_0620.62486
............
sample_099feat_1027.95991
feat_115.35290
feat_1228.05594
feat_1327.34487
feat_146.03387
-

1341 rows × 2 columns

-
- - - - -```python -t = X.groupby('Sample ID').get_group('sample_003') -t -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
intensityfreq
Sample IDpeptide
sample_003feat_006.09988
feat_0124.74688
feat_023.47594
feat_037.47991
feat_0424.09587
feat_0522.28788
feat_0616.55786
feat_0718.80993
feat_0813.07287
feat_1026.14291
feat_1127.63690
feat_1219.93494
feat_1326.15787
feat_1420.42787
-
- - - - -```python -t.sample(frac=0.75, weights='freq') -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
intensityfreq
Sample IDpeptide
sample_003feat_023.47594
feat_1326.15787
feat_0124.74688
feat_0522.28788
feat_0813.07287
feat_037.47991
feat_1127.63690
feat_006.09988
feat_1420.42787
feat_0424.09587
-
- - - -Sampling the entire DataFrame with `freq` as weights normalizes the weights over all N rows. The normalization leaves the relative frequencies unchanged (up to floating-point imprecision). - - -```python -# number of rows not the same as when using groupby (see above) -X.sample(frac=0.95, weights='freq') -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
intensityfreq
Sample IDpeptide
sample_033feat_036.94991
sample_015feat_023.21994
sample_056feat_0528.55588
sample_009feat_0618.60186
sample_058feat_0816.58887
............
sample_059feat_0318.74991
sample_082feat_037.33491
sample_063feat_0729.33793
sample_047feat_0823.54087
sample_064feat_0228.61794
-

1274 rows × 2 columns

-
- - - -### Sampling fails with groupby, reindexing needed - -The call above does not map one-to-one onto the groupby `sample` method. It has to be applied to each group's DataFrame individually. - - -```python -# X.groupby('Sample ID').sample(frac=0.95, weights='freq') # does not work -X.groupby('Sample ID').apply( - lambda df: df.reset_index(0, drop=True).sample(frac=0.95, weights='freq') -).drop('freq', axis=1) -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
intensity
Sample IDpeptide
sample_000feat_0410.293
feat_1015.041
feat_0013.471
feat_0324.578
feat_1127.786
.........
sample_099feat_005.455
feat_069.720
feat_1228.055
feat_146.033
feat_0920.538
-

1243 rows × 1 columns

-
- - - -Passing a Series as weights requires the original X to be indexed the same way as the Series (multi-indices are not supported) - - -```python -# for i, t in X.groupby('Sample ID'): -# t = t.sample(frac=0.75, weights=freq) -# t -``` - - -```python -X = X.reset_index('Sample ID') -X -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Sample IDintensityfreq
peptide
feat_00sample_00013.47188
feat_02sample_00013.85294
feat_03sample_00024.57891
feat_04sample_00010.29387
feat_06sample_00020.62486
............
feat_10sample_09927.95991
feat_11sample_0995.35290
feat_12sample_09928.05594
feat_13sample_09927.34487
feat_14sample_0996.03387
-

1341 rows × 3 columns

-
- - - - -```python -X.groupby(by='Sample ID').sample(frac=0.95, weights=freq) -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Sample IDintensityfreq
peptide
feat_11sample_00027.78690
feat_03sample_00024.57891
feat_02sample_00013.85294
feat_06sample_00020.62486
feat_07sample_0007.05393
............
feat_07sample_09914.48393
feat_06sample_0999.72086
feat_00sample_0995.45588
feat_08sample_0996.91087
feat_11sample_0995.35290
-

1243 rows × 3 columns

-
- - - - -```python -X.groupby(by='Sample ID').get_group('sample_002') -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Sample IDintensityfreq
peptide
feat_00sample_00227.59588
feat_01sample_00212.96788
feat_02sample_00211.77194
feat_03sample_00214.88091
feat_04sample_0024.00887
feat_05sample_0025.05688
feat_06sample_0029.37586
feat_07sample_00225.79893
feat_08sample_0029.93087
feat_09sample_00216.75490
feat_10sample_00224.12091
feat_11sample_0024.31490
feat_12sample_00225.84494
feat_13sample_00221.95887
-
- - - -## Sanity check: Downsampling the first feature - - -```python -freq.loc['feat_00'] = 1 # none should be selected -``` - - -```python -freq = freq / freq.sum() -freq -``` - - - - - feat_00 0.001 - feat_01 0.070 - feat_02 0.075 - feat_03 0.073 - feat_04 0.069 - feat_05 0.070 - feat_06 0.069 - feat_07 0.074 - feat_08 0.069 - feat_09 0.072 - feat_10 0.073 - feat_11 0.072 - feat_12 0.075 - feat_13 0.069 - feat_14 0.069 - Name: freq, dtype: float64 - - - - -```python -X.groupby(by='Sample ID').sample( - frac=0.5, weights=freq).sort_index().reset_index().peptide.value_counts() -``` - - - - - feat_05 59 - feat_02 54 - feat_12 53 - feat_03 53 - feat_14 50 - feat_11 49 - feat_09 48 - feat_07 47 - feat_13 47 - feat_06 45 - feat_01 42 - feat_04 42 - feat_10 41 - feat_08 36 - Name: peptide, dtype: int64 - - - -## Using a series - -- in the above approach, sampling weights might be readjusted based on the values present in `sample` as `NAN`s lead to the weights not summing up. Alteratively one could loop through the wide format rows and sample values from these. - - -```python -freq -``` - - - - - feat_00 0.001 - feat_01 0.070 - feat_02 0.075 - feat_03 0.073 - feat_04 0.069 - feat_05 0.070 - feat_06 0.069 - feat_07 0.074 - feat_08 0.069 - feat_09 0.072 - feat_10 0.073 - feat_11 0.072 - feat_12 0.075 - feat_13 0.069 - feat_14 0.069 - Name: freq, dtype: float64 - - - - -```python -X = X.drop('freq', axis=1).set_index( - 'Sample ID', append=True).squeeze().unstack(0) -X -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
peptidefeat_00feat_01feat_02feat_03feat_04feat_05feat_06feat_07feat_08feat_09feat_10feat_11feat_12feat_13feat_14
Sample ID
sample_00013.471NaN13.85224.57810.293NaN20.6247.05324.53520.48115.04127.78619.603NaNNaN
sample_00126.47227.28112.52826.6386.44327.94813.542NaN6.11917.85125.51224.46626.7413.57921.051
sample_00227.59512.96711.77114.8804.0085.0569.37525.7989.93016.75424.1204.31425.84421.958NaN
sample_0036.09924.7463.4757.47924.09522.28716.55718.80913.072NaN26.14227.63619.93426.15720.427
sample_0043.27622.40926.39314.98927.34418.466NaN19.6155.13315.27019.781NaN11.36918.72515.991
................................................
sample_095NaN17.7695.0935.2687.37913.0065.19415.4029.70120.4494.25313.47711.30621.89920.080
sample_09624.642NaN25.37422.54020.75122.6504.8589.490NaN15.8826.68122.29726.42120.09411.791
sample_097NaN15.63628.46119.60511.8618.05526.53922.26221.05426.53316.9308.53213.95724.64722.729
sample_0988.36323.546NaN11.4968.0207.4158.39429.37710.09811.37624.1365.80020.1784.77210.712
sample_0995.45512.11825.23012.28128.02418.2499.72014.4836.91020.53827.9595.35228.05527.3446.033
-

100 rows × 15 columns

-
- - - - -```python -X.iloc[0].sample(frac=0.8, weights=freq).sort_index() -``` - - - - - peptide - feat_01 NaN - feat_02 13.852 - feat_03 24.578 - feat_05 NaN - feat_06 20.624 - feat_07 7.053 - feat_08 24.535 - feat_09 20.481 - feat_10 15.041 - feat_12 19.603 - feat_13 NaN - feat_14 NaN - Name: sample_000, dtype: float64 - - - -Sampling using the wide format would garuantee that the weights are not adjusted based on missing values, but that instead missing values are sample into on or the other set. Ultimately `NaN`s are dropped also in this approach. - - -```python -import pandas as pd -data = {} -for row_key in X.index: - data[row_key] = X.loc[row_key].sample(frac=0.8, weights=freq) -pd.DataFrame(data).stack() -``` - - - - - peptide - feat_00 sample_004 3.276 - sample_012 8.782 - sample_035 7.149 - sample_044 15.908 - sample_090 29.630 - ... - feat_14 sample_095 20.080 - sample_096 11.791 - sample_097 22.729 - sample_098 10.712 - sample_099 6.033 - Length: 1078, dtype: float64 - - diff --git a/project/doc/lab_book.md b/project/doc/lab_book.md deleted file mode 100644 index 5a7153db1..000000000 --- a/project/doc/lab_book.md +++ /dev/null @@ -1,21 +0,0 @@ -Most ideas will be logged using the issues of the repository. - -### Clustering Hela data - -### Imputation -See if there is literature how to impute missing data in the field of proteomics. -For missing values on the dependent variable zero inflated models have been -developed. Maybe there is similar things for having yero values. - -### Pride cell lines -Select one other cell line which is on Pride processed by MaxQuant. See how -it compares?! - -- number of proteins overlapping of different share of most abundant? -- non-overlapping proteins: most abundant by cellline - (DDA fails to find peptides reliably in every run) - -### DDA workflow -The preprocessing over raw datafiles needs to be done uniquely -having the samples from different sources. Select a DDA workflow -in order to process raw data. 
diff --git a/project/doc/proteomics_intro.md b/project/doc/proteomics_intro.md deleted file mode 100644 index 0ff882882..000000000 --- a/project/doc/proteomics_intro.md +++ /dev/null @@ -1,173 +0,0 @@ -## Lennart Martens - [Introduction to Proteomics videos](https://www.youtube.com/watch?v=Wy1SwrMzhYk&list=PLXxp6nsBenSX_W8DiOocKJ0laNauYNdYl) -- Digitizer: Transform an analog signal into a digital signal - -- Ionization Types - - MALDI: Analyte (peptide) is charged by matrix molecules by *one* ion only (mechanism is not so clear) - - three common matrix molecules in proteomics (CHCA, SA, DHB): Benzene group taking up laser energy. - Inherent bias to three amino-acids by MALDI due to three amino-acids sharing conformational similarity with benzene group - - Electrospray ionization (ESI): Acid is added to the liquid in order to obtain charged peptides. - - temperature of the needle - - peptides can take more than one ion - -- roughly 150.000 possible peptides if 5000 genes are expressed in a tissue (without any PTMs) - -- Detectors get worse over time. In the maintenance sample one should see a spike when the old - detector is replaced by a new one. - -- Fragmentation of peptides - - collision-induced dissociation (CID): b and y ions - - ergodic process, continuously stronger vibration of peptide cleaves PSMs - - electron-capture dissociation (ECD): c and z ions - - preserves PTMs as the fracturing is non-ergodic (vibration induced) - -## Intro (OpenMS from Tübingen) - [Video](https://www.youtube.com/watch?v=tnS-w8ggEAA&list=PL_6fafgzU1nHd13qqjm3uNvyHh62JpPcB&index=2&t=0s) -- *Ion mobility* as a way to identify number of charges (ions), adding another dimension to the data -- predict MS2 peptide intensities in order to better identify peptides ([MS2PIP](https://iomics.ugent.be/ms2pip/), [DeepMass: Prism](https://github.com/verilylifesciences/deepmass), Prosit)- - - Question: Does this take amino-acid sequences and provide spectra? 
-- number of mappings from peptides to protein (How many peptides per protein?) -- absolute quantification: SISCAPA, AQUA -- feature-based label-free quantification - - does scale to (100?) - - quantification of isotopes (3D integral: intensity over retention time and m/z ) -- [SWATH-MS](https://imsb.ethz.ch/research/aebersold/research/swath-ms.html): DIA in DDA setting? -- [pyOpenMS](https://pyopenms.readthedocs.io/en/latest/) -- HUPO PSI Standards Formats: Machines do not all provide the same standardized data format. -- KNIME is popular for custom machines. `Nextflow` for cloud providers - - - -## Introduction to proteomics -Given by Jeppe Madsen and Martin Rzkær - -> Support-Request: SUND-CPR-Mssupport -> E-Mail Subject: CC-QE1-ISSUE - -## Blog articles - -- FAIMS, DDA and DIA - ideas and performance comparisons, [thermo-fischer 2020-04-23](https://www.technologynetworks.com/analysis/articles/how-faims-is-changing-the-game-in-proteomics-333843) - - -## Mass Spectrometry -- Unbiased analysis that does not require prior knowledge of the sample composition -- Analytical technique which identifies molecules based on their mass and charge (m/z) -- Proteomics: the large-scale study of proteins. - - -## Pipeline - -![Proteomics Pipeline](Figures/fig_proteomics_pipeline.png) - -### Liquid Chromatography (LC) -- Peptide separation by hydrophobicity - - hydrophilic vs hydrophobic liquids (Acetonitrile) - -### Column -- reverse phase (chromatography), see [wikipedia](https://en.wikipedia.org/wiki/Reversed-phase_chromatography) - > Reversed-phase chromatography (also called RPC, reverse-phase chromatography, or hydrophobic chromatography) includes any chromatographic method that uses a hydrophobic stationary phase. RPC refers to liquid (rather than gas) chromatography. - > (...) 
Reversed-phase chromatography is a technique using alkyl chains covalently bonded to the stationary phase particles in order to create a hydrophobic stationary phase, which has a stronger affinity for hydrophobic or less polar compounds. The use of a hydrophobic stationary phase is essentially the reverse of normal phase chromatography, since the polarity of the mobile and stationary phases have been inverted – hence the term reversed-phase chromatography. - -- 75um ID packed with 3um/1.9um reverse phase C18 beads. Pulled fused silica -- Column performance is very important for your experiments. - - If the column is not packed perfectly you will have dead volumes and peak tailing. - - You will pick the same peptides for identification - -### Mass Spectrometry -> "One of the most significant differences between transcriptomics and proteomics is in the dynamic range of mRNA and protein concentrations inside the cell. While the protein abundances stretch over at least seven orders of magnitude, from one copy per cell to ten million copies per cell, the mRNA dynamic range covers only three or four orders of magnitude." (https://doi.org/10.1002/pmic.201200451) - -Claim: Around 5000 proteins should be identified for each sample. - -#### Data Dependent Acquisition (DDA) -Orbitrap specific steps: -1. MS1: mix of peptides to identify most candidates for MS2 scan -2. MS2: one peptide (m/z ratio) which is then fragmented and scanned - -> Default: 12 MS2 and 1 MS1 scan in parallel - -### Peptide Identification -- How do we get from acquired spectra to protein and peptide identifications? 
-- some peptides have the same mass -- To identify peptides the mass spectrometer performs a fragment (MS2) scan on an isolated peptide ion - - peptides with the same m/z ratio are fragmented and then analyzed ("de novo" sequencing) - - -## Amino Acids and residues - -Name | abr | code | Residue Mass ---- | --- | --- | --- -Alanine | Ala | A | 71.03711 -Arginine | Arg | R | 156.10111 -Asparagine | Asn | N | 114.04293 -Aspartic Acid | Asp | D | 115.02694 -Cysteine | Cys | C | 103.00919 -Glutamic Acid | Glu | E | 129.04259 -Glutamine | Gln | Q | 128.05858 -Glycine | Gly | G | 57.02146 -Histidine | His | H | 137.05891 -Isoleucine | Ile | I | 113.08406 -Leucine | Leu | L | 113.08406 -Lysine | Lys | K | 128.09496 -Methionine | Met | M | 131.04049 -Phenylalanine | Phe | F | 147.06841 -Proline | Pro | P | 97.05276 -Serine | Ser | S | 87.03203 -Threonine | Thr | T | 101.04768 -Tryptophan | Trp | W | 186.07931 -Tyrosine | Tyr | Y | 163.06333 -Valine | Val | V | 99.06841 - -- Residue mass refers to the mass of an amino acid within a peptide - - -## Confounding Factors (or Hyperparameters) -> Critical parameters for DDA methods. Recommendation for machines at CPR - -- Max Injection Time - - The maximum time which the instrument will use to reach the target amount of ions in the C-trap - - Low max injection times give faster scan speed. - - High max injection times give better intensity and dynamic range - -- Automatic Gain Control (AGC) Target - - The target ion amount which will be accumulated in the C-trap - - A higher AGC target will give higher intensity - -- Dynamic Exclusion Time - - The time which the instrument will exclude precursors already selected for MS2. - - Exclusion time is dependent on the length of your gradient. - - For 145 minutes, we usually have 30 seconds -- Number of MS2 scans (Top N) - - More MS2 scans give deeper protein coverage but slower speed. - - -- sample overloading (too much liquid) messes up scan (relation to _Dynamic Range_?) 
-- dwell time - - the time a particular ion (m/z) signal is monitored -- cycle time - - -## Techniques -- TOF -- ORBITRAP -- PASEF -- FAIMS -- HCD Cell - -### Orbitrap -![Schema Orbitrap](Figures/schema_orbitrap_instrument.jpg) - -## Amino Acid weights -- fragments of peptides are identified on their weights - - -## Glossar - -Term | meaning ----- | --- -c-Trap | Meaning of c? collects one million particles before forwarding them -cv | compensation voltage (in FAIMS) -elute | remove (an adsorbed substance) by washing with a solvent, especially in chromatography -FAIMS | high-field asymmetric waveform ion mobility spectrometry -HCD | -gcf | gas phase fractionation -XIC | Extracted Ion Current diff --git a/project/doc/vae_notes.md b/project/doc/vae_notes.md deleted file mode 100644 index 0595eebc4..000000000 --- a/project/doc/vae_notes.md +++ /dev/null @@ -1,9 +0,0 @@ -# Autoencoders (AE) - -## Linear AE - -## Denoising AE - -## Variational AE - -## Denoising Variational AE diff --git a/project/doc/venv_setup.md b/project/doc/venv_setup.md deleted file mode 100644 index ba55c04a8..000000000 --- a/project/doc/venv_setup.md +++ /dev/null @@ -1,15 +0,0 @@ -# Setting up an Virtual Environment - -## Conda env - -Installing the development version, run -``` -conda env create -f environment.yml -``` -and register the Ipython Kernel (alternatively install the whole Jupyter Suite -in the virtual env) -``` -python -m ipykernel install --user --name other-env --display-name "Python (other-env)" -``` - -## Virtual env diff --git a/project/erda_05_parse_paramter_files.ipynb b/project/erda_05_parse_paramter_files.ipynb new file mode 100644 index 000000000..60a05b72d --- /dev/null +++ b/project/erda_05_parse_paramter_files.ipynb @@ -0,0 +1,548 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "368f6451-9bec-4ca6-9921-c5ab69c23153", + "metadata": {}, + "source": [ + "# Parse parameter files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"3c3a8745-20e5-4353-a3d6-950a3bc1dd6c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from pprint import pprint\n", + "import collections\n", + "from pathlib import Path\n", + "from tqdm.notebook import tqdm\n", + "\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92c20a10-a814-4a50-8186-d05fa1e14498", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "test_file = 'data/mqpar_example.xml'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1645c86-7799-46db-92cd-cd5157cd11d8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def extend_tuple(t, target_length: int):\n", + " if not isinstance(t, tuple):\n", + " raise TypeError(\n", + " f\"Wrong type provided. Expected tuple, got {type(t)} : {t!r}\")\n", + " if len(t) > target_length:\n", + " raise ValueError(\n", + " f\"Tuple is too long (got {len(t)}, expected {target_length}: {t!r}\")\n", + " return t + (None,) * (target_length - len(t))\n", + "# extend_tuple(\"test\", 4)\n", + "# extend_tuple(('k1', 'k2'), 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4341046b-13d2-49c5-924c-a73fd9f366d1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def extend_tuples_with_none(list_of_tuples, target_length):\n", + " extended_tuples = []\n", + " for tuple_ in list_of_tuples:\n", + " # if len(tuple_) > target_length:\n", + " # raise ValueError(f\"tuple is too long: {len(tuple_)}\")\n", + " extended_tuple = extend_tuple(tuple_, target_length)\n", + " extended_tuples.append(extended_tuple)\n", + " return extended_tuples\n", + "\n", + "\n", + "list_of_tuples = [(1, 2), (3, 4, 5), (6,)]\n", + "extend_tuples_with_none(list_of_tuples, 3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8691214b-65a1-4c27-92d7-f927dbac61bf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import xml.etree.ElementTree as ET\n", + 
"\n", + "def add_record(data, tag, record):\n", + " if tag in data:\n", + " if isinstance(data[tag], list):\n", + " data[tag].append(record)\n", + " else:\n", + " data[tag] = [data[tag], record]\n", + " else:\n", + " data[tag] = record\n", + " return data\n", + "\n", + "\n", + "def read_xml_record(element):\n", + " data = dict()\n", + " for child in element:\n", + " if len(child) > 1 and child.tag:\n", + " # if there is a list, process each element one by one\n", + " # either nested or a plain text\n", + " data[child.tag] = [add_record(dict(), tag=child.tag, record=read_xml_record(child) if not (\n", + " child.text and child.text.strip()) else child.text.strip()) for child in child]\n", + " elif child.text and child.text.strip():\n", + " # just plain text record\n", + " data = add_record(data=data, tag=child.tag,\n", + " record=child.text.strip())\n", + " else:\n", + " record = read_xml_record(child)\n", + " data = add_record(data, child.tag, record)\n", + " if not data:\n", + " # empty strings and None are normalzied to None\n", + " return None\n", + " return data\n", + "\n", + "\n", + "tree = ET.parse(test_file)\n", + "root = tree.getroot()\n", + "\n", + "record_example = read_xml_record(root)\n", + "record_example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10f3103a-8133-4d01-9c6a-efcc75d85295", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "\n", + "\n", + "def flatten_dict_of_dicts(d: dict, parent_key: str = '') -> dict:\n", + " \"\"\"Build tuples for nested dictionaries for use as `pandas.MultiIndex`.\n", + "\n", + " Parameters\n", + " ----------\n", + " d : dict\n", + " Nested dictionary for which all keys are flattened to tuples.\n", + " parent_key : str, optional\n", + " Outer key (used for recursion), by default ''\n", + "\n", + " Returns\n", + " -------\n", + " dict\n", + " Flattend dictionary with tuple keys: {(outer_key, ..., inner_key) : value}\n", + " \"\"\"\n", + " # simplified and adapted from: 
https://stackoverflow.com/a/6027615/9684872\n", + " items = []\n", + " for k, v in d.items():\n", + " new_key = parent_key + (k,) if parent_key else (k,)\n", + " if isinstance(v, collections.abc.MutableMapping):\n", + " items.extend(flatten_dict_of_dicts(v, parent_key=new_key))\n", + " elif isinstance(v, list):\n", + " for item in v:\n", + " if isinstance(item, collections.abc.MutableMapping):\n", + " items.extend(flatten_dict_of_dicts(\n", + " item, parent_key=new_key))\n", + " elif isinstance(item, str):\n", + " items.append((new_key, item))\n", + " else:\n", + " raise ValueError(f\"Unknown item: {item:r}\")\n", + " else:\n", + " items.append((new_key, v))\n", + " return items\n", + "\n", + "\n", + "case_1 = {'k': 'v'}\n", + "case_2 = {'k1': {'k2': 'v1', 'k3': 'v2'}}\n", + "case_3 = {'k1': {'k2': [{'k4': 'v1'}, {'k4': 'v2'}]}}\n", + "case_4 = {'k1': [{'k2': {'k4': 'v1', 'k5': 'v2'}},\n", + " {'k2': {'k4': 'v1', 'k5': 'v2'}}]}\n", + "case_5 = {'restrictMods': [{'string': 'Oxidation (M)'},\n", + " {'string': 'Acetyl (Protein N-term)'}]}\n", + "case_6 = {'variableModifications': {\n", + " 'string': ['Oxidation (M)',\n", + " 'Acetyl (Protein N-term)']}}\n", + "\n", + "test_cases = [case_1, case_2, case_3, case_4, case_5, case_6]\n", + "\n", + "for case in (test_cases):\n", + " pprint(flatten_dict_of_dicts(case))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8e19d55-d012-44be-8869-0271e16a7093", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "entries = list()\n", + "for case in test_cases:\n", + " entries.extend(flatten_dict_of_dicts(case))\n", + "[(extend_tuple(k, 4), v) for (k, v) in entries]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "76092ebb-e31e-4bf2-b350-090c51d1e1bc", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def build_Series_from_records(records, index_length=4):\n", + " records = flatten_dict_of_dicts(records)\n", + " idx = pd.MultiIndex.from_tuples(\n", + 
" (extend_tuple(k, index_length) for (k, v) in records))\n", + " return pd.Series((v for (k, v) in records), index=idx)\n", + "\n", + "\n", + "tree = ET.parse(test_file)\n", + "root = tree.getroot()\n", + "\n", + "record_example = read_xml_record(root)\n", + "flattend = build_Series_from_records(record_example, 4)\n", + "flattend.to_frame('example')" + ] + }, + { + "cell_type": "markdown", + "id": "e63a712a-a6e8-46dc-befc-bc6a98a6a153", + "metadata": {}, + "source": [ + "## Parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e718825e-428a-4d99-81e6-03cce50da2fc", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# folders to check\n", + "folder_w_params = Path('/home/jovyan/work/mqpar_files')\n", + "root = Path('/home/jovyan/work/')\n", + "dumped_folder = 'mq_out'\n", + "dumped_folder_names = 'mq_out_folder.txt'\n", + "# out\n", + "fname_out = 'data/all_parameter_files.csv'" + ] + }, + { + "cell_type": "markdown", + "id": "891ee5ec-03a2-4d66-845b-a2938c9018f7", + "metadata": {}, + "source": [ + "## Dump of some parameter files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fbdc4c32-9995-43ae-aff3-9b6358cf9ea2", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def read_file(file, name, idx_levels=4) -> pd.Series:\n", + " tree = ET.parse(file)\n", + " root = tree.getroot()\n", + " record = read_xml_record(root)\n", + " s = build_Series_from_records(record, idx_levels)\n", + " s.name = name\n", + " return s" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c779378e-8b0c-440b-a43a-c1a10939cf8f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "parameter_files_part_1 = list()\n", + "for file in tqdm(folder_w_params.iterdir()):\n", + " s_parameters = read_file(file, name=file.stem[6:])\n", + " parameter_files_part_1.append(s_parameters)\n", + "\n", + "parameter_files_part_1 = pd.concat(parameter_files_part_1, axis=1).T\n", + 
"parameter_files_part_1" + ] + }, + { + "cell_type": "markdown", + "id": "a5db69d3-89ed-4670-ae9a-d5e548e43106", + "metadata": {}, + "source": [ + "## Search for parameter files in output folders\n", + "\n", + "- read folders from dump (for stable execution on erda)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6dea94bd-dc1b-4e5b-ad99-0c6f1dc34682", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# # read as generator if file does not exist:\n", + "# folders = list(Path('/home/jovyan/work/mq_out').iterdir())\n", + "\n", + "root = Path('/home/jovyan/work/')\n", + "with open(root / dumped_folder_names) as f:\n", + " folders = list()\n", + " for line in f:\n", + " fpath = root / dumped_folder / line.strip()\n", + " folders.append(fpath)" + ] + }, + { + "cell_type": "markdown", + "id": "fd65f35b-7818-4275-961f-816aedfaa486", + "metadata": {}, + "source": [ + "read paramter files:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32add481-89ba-4c22-b419-f025f98c2f2c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "parameter_files_part_2 = list()\n", + "i = 0\n", + "for folder in tqdm(folders):\n", + " for file in folder.iterdir():\n", + " if file.suffix == '.xml':\n", + " s_parameters = read_file(file, file.parent.name)\n", + " parameter_files_part_2.append(s_parameters)\n", + " i += 1\n", + "\n", + "parameter_files_part_2 = pd.concat(parameter_files_part_2, axis=1).T\n", + "parameter_files_part_2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd945d90-8416-4ddd-9b46-ab7e47ed1840", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "print(f\"Found {i} parameter files\")" + ] + }, + { + "cell_type": "markdown", + "id": "71cf0a35-0cf9-4fb1-9abc-08cad21d4fae", + "metadata": {}, + "source": [ + "## Combine both sets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "985d4fab-49ad-45b7-ae2d-de66cbdae5a4", + "metadata": { + 
"tags": [] + }, + "outputs": [], + "source": [ + "parameter_files = pd.concat([parameter_files_part_1, parameter_files_part_2])\n", + "# del parameter_files_part_1, parameter_files_part_2\n", + "parameter_files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81944709-5e74-4067-ab4a-d20e2054ecd0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# 11066" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "48197990-124f-4abc-8c42-3b4982f3cd4b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "parameter_files = parameter_files.infer_objects()\n", + "parameter_files.dtypes.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e0627e5-22c2-4b08-be5b-b57866b15d13", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "parameter_files.to_csv(fname_out)" + ] + }, + { + "cell_type": "markdown", + "id": "33bbf09d-6059-4abc-96d6-677f1dfb3eb5", + "metadata": {}, + "source": [ + "Read aggregated parameters dump" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d749fdc5-9f56-45ab-879c-1e01977e733a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "parameter_files = pd.read_csv(fname_out, index_col=0, header=list(range(4)))\n", + "parameter_files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "858a7dbf-a5e3-47d3-98e4-f130306cfbf0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "parameter_files.dtypes.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88c9f10e-5a4d-4653-b6f8-cfe6152f1b5a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "parameter_files.loc[:, parameter_files.dtypes == 'object']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a6c70d1-c5eb-49ab-82f1-fe919a8b60e7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "parameter_files['fastaFiles']" + ] + 
}, + { + "cell_type": "code", + "execution_count": null, + "id": "319e99f2-9236-406c-b95a-493864dcbf03", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "parameter_files.droplevel(-1, axis=1)['fastaFiles']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f1a185a-b0e8-40bd-b35c-4c9be49099f7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "parameter_files.columns.to_list()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5319b502-58ac-4a5c-94dc-4c9915f302ee", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/project/erda_05_parse_paramter_files.py b/project/erda_05_parse_paramter_files.py new file mode 100644 index 000000000..7f3cb7603 --- /dev/null +++ b/project/erda_05_parse_paramter_files.py @@ -0,0 +1,290 @@ +# --- +# jupyter: +# jupytext: +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.15.1 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Parse parameter files + +# %% +from pprint import pprint +import collections +from pathlib import Path +from tqdm.notebook import tqdm + +import pandas as pd + +# %% +import logging + +import xml.etree.ElementTree as ET + +logger = logging.getLogger() + +test_file = 'data/mqpar_example.xml' + + +# %% +def extend_tuple(t, target_length: int): + if not isinstance(t, tuple): + raise TypeError( + f"Wrong type provided. 
Expected tuple, got {type(t)} : {t!r}") + if len(t) > target_length: + raise ValueError( + f"Tuple is too long (got {len(t)}, expected {target_length}: {t!r}") + return t + (None,) * (target_length - len(t)) +# extend_tuple("test", 4) +# extend_tuple(('k1', 'k2'), 1) + + +# %% +def extend_tuples_with_none(list_of_tuples, target_length): + extended_tuples = [] + for tuple_ in list_of_tuples: + # if len(tuple_) > target_length: + # raise ValueError(f"tuple is too long: {len(tuple_)}") + extended_tuple = extend_tuple(tuple_, target_length) + extended_tuples.append(extended_tuple) + return extended_tuples + + +list_of_tuples = [(1, 2), (3, 4, 5), (6,)] +extend_tuples_with_none(list_of_tuples, 3) + +# %% + + +def add_record(data, tag, record): + if tag in data: + if isinstance(data[tag], list): + data[tag].append(record) + else: + data[tag] = [data[tag], record] + else: + data[tag] = record + return data + + +def read_xml_record(element): + data = dict() + for child in element: + if len(child) > 1 and child.tag: + # if there is a list, process each element one by one + # either nested or a plain text + data[child.tag] = [add_record(dict(), tag=child.tag, record=read_xml_record(child) if not ( + child.text and child.text.strip()) else child.text.strip()) for child in child] + elif child.text and child.text.strip(): + # just plain text record + data = add_record(data=data, tag=child.tag, + record=child.text.strip()) + else: + record = read_xml_record(child) + data = add_record(data, child.tag, record) + if not data: + # empty strings and None are normalzied to None + return None + return data + + +tree = ET.parse(test_file) +root = tree.getroot() + +record_example = read_xml_record(root) +record_example + +# %% + + +def flatten_dict_of_dicts(d: dict, parent_key: str = '') -> dict: + """Build tuples for nested dictionaries for use as `pandas.MultiIndex`. + + Parameters + ---------- + d : dict + Nested dictionary for which all keys are flattened to tuples. 
+ parent_key : str, optional + Outer key (used for recursion), by default '' + + Returns + ------- + dict + Flattend dictionary with tuple keys: {(outer_key, ..., inner_key) : value} + """ + # simplified and adapted from: https://stackoverflow.com/a/6027615/9684872 + items = [] + for k, v in d.items(): + new_key = parent_key + (k,) if parent_key else (k,) + if isinstance(v, collections.abc.MutableMapping): + items.extend(flatten_dict_of_dicts(v, parent_key=new_key)) + elif isinstance(v, list): + for item in v: + if isinstance(item, collections.abc.MutableMapping): + items.extend(flatten_dict_of_dicts( + item, parent_key=new_key)) + elif isinstance(item, str): + items.append((new_key, item)) + else: + raise ValueError(f"Unknown item: {item:r}") + else: + items.append((new_key, v)) + return items + + +case_1 = {'k': 'v'} +case_2 = {'k1': {'k2': 'v1', 'k3': 'v2'}} +case_3 = {'k1': {'k2': [{'k4': 'v1'}, {'k4': 'v2'}]}} +case_4 = {'k1': [{'k2': {'k4': 'v1', 'k5': 'v2'}}, + {'k2': {'k4': 'v1', 'k5': 'v2'}}]} +case_5 = {'restrictMods': [{'string': 'Oxidation (M)'}, + {'string': 'Acetyl (Protein N-term)'}]} +case_6 = {'variableModifications': { + 'string': ['Oxidation (M)', + 'Acetyl (Protein N-term)']}} + +test_cases = [case_1, case_2, case_3, case_4, case_5, case_6] + +for case in (test_cases): + pprint(flatten_dict_of_dicts(case)) + +# %% +entries = list() +for case in test_cases: + entries.extend(flatten_dict_of_dicts(case)) +[(extend_tuple(k, 4), v) for (k, v) in entries] + + +# %% +def build_Series_from_records(records, index_length=4): + records = flatten_dict_of_dicts(records) + idx = pd.MultiIndex.from_tuples( + (extend_tuple(k, index_length) for (k, v) in records)) + return pd.Series((v for (k, v) in records), index=idx) + + +tree = ET.parse(test_file) +root = tree.getroot() + +record_example = read_xml_record(root) +flattend = build_Series_from_records(record_example, 4) +flattend.to_frame('example') + +# %% [markdown] +# ## Parameters + +# %% +# folders to 
check +folder_w_params = Path('/home/jovyan/work/mqpar_files') +root = Path('/home/jovyan/work/') +dumped_folder = 'mq_out' +dumped_folder_names = 'mq_out_folder.txt' +# out +fname_out = 'data/all_parameter_files.csv' + + +# %% [markdown] +# ## Dump of some parameter files + +# %% +def read_file(file, name, idx_levels=4) -> pd.Series: + tree = ET.parse(file) + root = tree.getroot() + record = read_xml_record(root) + s = build_Series_from_records(record, idx_levels) + s.name = name + return s + + +# %% +parameter_files_part_1 = list() +for file in tqdm(folder_w_params.iterdir()): + s_parameters = read_file(file, name=file.stem[6:]) + parameter_files_part_1.append(s_parameters) + +parameter_files_part_1 = pd.concat(parameter_files_part_1, axis=1).T +parameter_files_part_1 + +# %% [markdown] +# ## Search for parameter files in output folders +# +# - read folders from dump (for stable execution on erda) + +# %% +# # read as generator if file does not exist: +# folders = list(Path('/home/jovyan/work/mq_out').iterdir()) + +root = Path('/home/jovyan/work/') +with open(root / dumped_folder_names) as f: + folders = list() + for line in f: + fpath = root / dumped_folder / line.strip() + folders.append(fpath) + +# %% [markdown] +# read paramter files: + +# %% +parameter_files_part_2 = list() +i = 0 +for folder in tqdm(folders): + for file in folder.iterdir(): + if file.suffix == '.xml': + s_parameters = read_file(file, file.parent.name) + parameter_files_part_2.append(s_parameters) + i += 1 + +parameter_files_part_2 = pd.concat(parameter_files_part_2, axis=1).T +parameter_files_part_2 + +# %% +print(f"Found {i} parameter files") + +# %% [markdown] +# ## Combine both sets + +# %% +parameter_files = pd.concat([parameter_files_part_1, parameter_files_part_2]) +# del parameter_files_part_1, parameter_files_part_2 +parameter_files + +# %% +# 11066 + +# %% +parameter_files = parameter_files.infer_objects() +parameter_files.dtypes.value_counts() + +# %% 
+parameter_files.to_csv(fname_out) + +# %% [markdown] +# Read aggregated parameters dump + +# %% +parameter_files = pd.read_csv(fname_out, index_col=0, header=list(range(4))) +parameter_files + +# %% +parameter_files.dtypes.value_counts() + +# %% +parameter_files.loc[:, parameter_files.dtypes == 'object'] + +# %% +parameter_files['fastaFiles'] + +# %% +parameter_files.droplevel(-1, axis=1)['fastaFiles'] + +# %% +parameter_files.columns.to_list() + +# %% diff --git a/project/erda_06_analyze_parameters.ipynb b/project/erda_06_analyze_parameters.ipynb new file mode 100644 index 000000000..9887c58b8 --- /dev/null +++ b/project/erda_06_analyze_parameters.ipynb @@ -0,0 +1,153 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8154dacf", + "metadata": {}, + "source": [ + "# Analyzse and rename dumped parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad32c7c1", + "metadata": {}, + "outputs": [], + "source": [ + "import vaep\n", + "import pandas as pd\n", + "\n", + "fname_mq_params = 'data/all_parameter_files.csv'\n", + "fname_id_mappings = 'data/rename/selected_old_new_id_mapping.csv'\n", + "\n", + "fname_out = 'data/selected_parameter_files.csv'\n", + "\n", + "parameter_files = pd.read_csv(fname_mq_params, index_col=0, header=list(range(4))\n", + " )\n", + "parameter_files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09d029d6", + "metadata": {}, + "outputs": [], + "source": [ + "# thread experiments...\n", + "vaep.pandas.show_columns_with_variation(\n", + " parameter_files\n", + " .loc[parameter_files.index.duplicated(keep=False)])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05f5fc43", + "metadata": {}, + "outputs": [], + "source": [ + "parameter_files = parameter_files.loc[~parameter_files.index.duplicated()]\n", + "parameter_files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b92ac981", + "metadata": {}, + "outputs": [], + "source": [ + 
"id_mappings = pd.read_csv(fname_id_mappings, index_col=0, usecols=['Sample ID', 'new_sample_id'])\n", + "id_mappings.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "245e795a", + "metadata": {}, + "outputs": [], + "source": [ + "parameter_files.loc[id_mappings.index]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ef8ffc10", + "metadata": {}, + "outputs": [], + "source": [ + "sel = (parameter_files\n", + " .loc[id_mappings.index]\n", + " .drop('filePaths', axis=1)\n", + " .rename(id_mappings['new_sample_id']))\n", + "sel.to_csv(fname_out)\n", + "sel" + ] + }, + { + "cell_type": "markdown", + "id": "b1f69026", + "metadata": {}, + "source": [ + "-inf and + inf cannot be handled correctly (fullMinMz, fullMaxMz)\n", + "number of Threads differs as the setting was varied in the beginning (most runs used 4 threads)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b5ae165", + "metadata": {}, + "outputs": [], + "source": [ + "sel_with_diffs = vaep.pandas.show_columns_with_variation(sel)\n", + "sel_with_diffs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15f413d3", + "metadata": {}, + "outputs": [], + "source": [ + "sel_with_diffs.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee554c42", + "metadata": {}, + "outputs": [], + "source": [ + "sel[('numThreads', 'nan', 'nan', 'nan')].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8dc29350", + "metadata": {}, + "outputs": [], + "source": [ + "# 388 columns are identical\n", + "sel.drop(sel_with_diffs.columns, axis=1\n", + " ).drop_duplicates()" + ] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "-all", + "main_language": "python", + "notebook_metadata_filter": "-all" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/project/erda_06_analyze_parameters.py b/project/erda_06_analyze_parameters.py new file mode 100644 
index 000000000..e2449ae5d --- /dev/null +++ b/project/erda_06_analyze_parameters.py @@ -0,0 +1,59 @@ +# %% [markdown] +# # Analyzse and rename dumped parameters + +# %% +import vaep +import pandas as pd + +fname_mq_params = 'data/all_parameter_files.csv' +fname_id_mappings = 'data/rename/selected_old_new_id_mapping.csv' + +fname_out = 'data/selected_parameter_files.csv' + +parameter_files = pd.read_csv(fname_mq_params, index_col=0, header=list(range(4)) + ) +parameter_files + +# %% +# thread experiments... +vaep.pandas.show_columns_with_variation( + parameter_files + .loc[parameter_files.index.duplicated(keep=False)]) + +# %% +parameter_files = parameter_files.loc[~parameter_files.index.duplicated()] +parameter_files + +# %% +id_mappings = pd.read_csv(fname_id_mappings, index_col=0, usecols=['Sample ID', 'new_sample_id']) +id_mappings.head() + +# %% +parameter_files.loc[id_mappings.index] + +# %% +sel = (parameter_files + .loc[id_mappings.index] + .drop('filePaths', axis=1) + .rename(id_mappings['new_sample_id'])) +sel.to_csv(fname_out) +sel + +# %% [markdown] +# -inf and + inf cannot be handled correctly (fullMinMz, fullMaxMz) +# number of Threads differs as the setting was varied in the beginning (most runs used 4 threads) + +# %% +sel_with_diffs = vaep.pandas.show_columns_with_variation(sel) +sel_with_diffs + +# %% +sel_with_diffs.describe() + +# %% +sel[('numThreads', 'nan', 'nan', 'nan')].value_counts() + +# %% +# 388 columns are identical +sel.drop(sel_with_diffs.columns, axis=1 + ).drop_duplicates() diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..07de284aa --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ["setuptools", "wheel"] +build-backend = "setuptools.build_meta" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 000000000..adebc5050 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,37 @@ +python >= 3.8, < 3.9 +numpy +pandas < 2.0 +scipy 
>= 1.6 +# plotting +matplotlib +python-kaleido +plotly +seaborn +pip +# ML +pytorch < 2.0 +# pytorch-cuda +scikit-learn +fastai +torchvision +# cudatoolkit=11.7 +tensorboard +umap-learn +# stats +pingouin +statsmodels +# other +tqdm # progress bars +xmltodict # configs +openpyxl # xml +omegaconf +# snakemake +snakemake-minimal < 7.26 +# jupyter +ipykernel +ipython +ipywidgets +jupyterlab # standalone jupyter installation +# jupyter_contrib_nbextensions # delete configuration file if you see an error: https://github.com/jupyter/nbconvert/issues/526#issuecomment-277552771 +jupyter-dash +papermill # execute ipynb's \ No newline at end of file diff --git a/requirements_R.txt b/requirements_R.txt new file mode 100644 index 000000000..c52c1fb66 --- /dev/null +++ b/requirements_R.txt @@ -0,0 +1,15 @@ +# R3.6.3 is not available as binaries for M1 and M2 amd64 atm (2023-08-11) +r-base # >= 3.6.0, < 4.0 # would be good if it could be relaxed +r-irkernel +# r-biocmanager +r-reshape2 +r-stringi # + rmarkdown hack for reshape2 +r-stringr # reshape2 +r-tidyverse +r-gdata +r-glmnet +r-e1071 +r-norm +r-missforest +r-vim +r-mice \ No newline at end of file diff --git a/requirements_dev.txt b/requirements_dev.txt new file mode 100644 index 000000000..92626e74b --- /dev/null +++ b/requirements_dev.txt @@ -0,0 +1,13 @@ +# dev +pytest +pytest-cov +jupytext +flake8 +flake8-bugbear +build +wheel +setuptools +pre-commit +pre-commit +jupyterlab_code_formatter +jupyterlab-git \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 000000000..4e33952ce --- /dev/null +++ b/setup.cfg @@ -0,0 +1,70 @@ +[metadata] +name = pimms-learn +version = attr:vaep.__version__ +description = Imputing (MS-based prote-) omics data using self supervised deep learning models +long_description = file: README.md +long_description_content_type = text/markdown +url = https://github.com/RasmussenLab/pimms +author = Henry Webel +author_email = "Henry Webel" +license = gpl-3.0 
+license_files = LICENSE +classifiers = + License :: OSI Approved :: GNU General Public License v3 (GPLv3) + Intended Audience :: Healthcare Industry + Intended Audience :: Science/Research + Programming Language :: Python :: 3 + Topic :: Scientific/Engineering :: Bio-Informatics +[options] +# package_dir = +packages = find: +include_package_data = True +install_requires = + numpy + matplotlib + pandas<2 + plotly + torch<2 + scikit-learn>=1.0 + scipy + seaborn + fastai + omegaconf + tqdm + mrmr-selection + pingouin + +[options.extras_require] +docs = + sphinx + sphinx-book-theme + myst-parser + + +[options.packages.find] +# where = vaep +exclude = + test* + + +###################### +# Tool configuration # +###################### + +[flake8] +max-line-length = 120 +aggressive = 2 + + +[tool:pytest] +testpaths = test +addopts = --cov --strict-markers +xfail_strict = True + +[coverage:run] +source = vaep +branch = True + +[coverage:report] +show_missing = True +skip_covered = True diff --git a/setup.py b/setup.py index ea17c567e..8ab824cc7 100644 --- a/setup.py +++ b/setup.py @@ -1,35 +1,2 @@ -from setuptools import setup, find_packages -setup( - name="vaep", - version="0.1", - packages=find_packages(), - # scripts=['say_hello.py'], - - # Project uses reStructuredText, so ensure that the docutils get - # installed or upgraded on the target machine - install_requires=['docutils>=0.3'], - - package_data={ - # If any package contains *.txt or *.rst files, include them: - '': ['*.txt', '*.rst'], - # And include any *.msg files found in the 'hello' package, too: - 'hello': ['*.msg'], - }, - - # metadata to display on PyPI - author="Henry Webel", - author_email="henry.webel@sund.ku.dk", - description="Variational Autoencoder for Proteomics data.", - keywords="proteomics", - # url="http://example.com/HelloWorld/", # project home page, if any - # project_urls={ - # "Bug Tracker": "https://bugs.example.com/HelloWorld/", - # "Documentation": 
"https://docs.example.com/HelloWorld/", - # "Source Code": "https://code.example.com/HelloWorld/", - # }, - # classifiers=[ - # 'License :: OSI Approved :: Python Software Foundation License' - # ] - - # could also include long_description, download_url, etc. -) \ No newline at end of file +from setuptools import setup +setup() \ No newline at end of file diff --git a/vaep/__init__.py b/vaep/__init__.py index 0378b3aa4..7402b9bc1 100644 --- a/vaep/__init__.py +++ b/vaep/__init__.py @@ -16,18 +16,25 @@ import vaep.pandas import vaep.plotting import vaep.logging -from vaep.plotting import savefig +import vaep.plotting + +import vaep.nb + +savefig = vaep.plotting.savefig -from . import nb +__license__ = 'GPLv3' +__version__ = (0, 1, 0) -## set some defaults + +# set some defaults class IntArrayFormatter(pf.GenericArrayFormatter): def _format_strings(self): formatter = self.formatter or '{:,d}'.format fmt_values = [formatter(x) for x in self.values] return fmt_values + pd.options.display.float_format = '{:,.3f}'.format pf.IntArrayFormatter = IntArrayFormatter -vaep.plotting.make_large_descriptors('x-large') \ No newline at end of file +vaep.plotting.make_large_descriptors('x-large') diff --git a/vaep/data_handling.py b/vaep/data_handling.py index 519e5ad0a..1f0bc5404 100644 --- a/vaep/data_handling.py +++ b/vaep/data_handling.py @@ -1,25 +1,27 @@ """ -Functionality to handle protein and peptide datasets. +Functionality to handle protein and peptide datasets. """ import numpy as np import pandas as pd -#coverage -def coverage(X:pd.DataFrame, coverage_col:float, coverage_row:float): - """Select proteins by column depending on their coverage. +# coverage + + +def coverage(X: pd.DataFrame, coverage_col: float, coverage_row: float): + """Select proteins by column depending on their coverage. Of these selected proteins, where the rows have a certain number of overall proteins. 
""" - mask_col = X.isnull().mean() <= 1-coverage_col - df = X.loc[:,mask_col] - mask_row = df.isnull().mean(axis=1) <= 1-coverage_row - df = df.loc[mask_row,:] + mask_col = X.isnull().mean() <= 1 - coverage_col + df = X.loc[:, mask_col] + mask_row = df.isnull().mean(axis=1) <= 1 - coverage_row + df = df.loc[mask_row, :] return df -def compute_stats_missing(X:pd.DataFrame, - col_no_missing:str='no_missing', - col_no_identified:str='no_identified', - col_prop_samples:str='prop_samples') -> pd.DataFrame: +def compute_stats_missing(X: pd.DataFrame, + col_no_missing: str = 'no_missing', + col_no_identified: str = 'no_identified', + col_prop_samples: str = 'prop_samples') -> pd.DataFrame: """Dataset of repeated samples indicating if an observation has the variables observed or missing x in {0,1}""" if X.index.name: @@ -38,12 +40,12 @@ def compute_stats_missing(X:pd.DataFrame, sample_stats = sample_stats.sort_values( by=col_no_identified, ascending=False) sample_stats[col_prop_samples] = np.array( - range(1, len(sample_stats)+1)) / len(sample_stats) + range(1, len(sample_stats) + 1)) / len(sample_stats) return sample_stats def get_sorted_not_missing(X: pd.DataFrame) -> pd.DataFrame: - """Return a Dataframe with missing values. Order columns by degree of completness + """Return a Dataframe with missing values. 
Order columns by degree of completness over columns from variables least to most shared among observations.""" X = X.notna().astype(int) - return X[X.mean().sort_values().index] \ No newline at end of file + return X[X.mean().sort_values().index] diff --git a/vaep/fasta.py b/vaep/fasta.py index c9084f9fe..bf1351ded 100644 --- a/vaep/fasta.py +++ b/vaep/fasta.py @@ -17,7 +17,7 @@ def get_n_miscleaved(pep_sequences: list, num_missed: int): _miscleaved = [] for i in range(len(pep_sequences)): if i >= num_missed: - _miscleaved.append(''.join(pep_sequences[i-num_missed:i+1])) + _miscleaved.append(''.join(pep_sequences[i - num_missed:i + 1])) return _miscleaved @@ -41,7 +41,7 @@ def cleave_to_tryptic(seq, num_missed_cleavages=1, reversed=False, add_rxk=False peps_seq = [seq, ] - for i in range(1, num_missed_cleavages+1): + for i in range(1, num_missed_cleavages + 1): _seq = get_n_miscleaved(seq, num_missed=i) peps_seq.append(_seq) @@ -53,9 +53,9 @@ def cleave_to_tryptic(seq, num_missed_cleavages=1, reversed=False, add_rxk=False def find_rxk_peptides(l_peptides): - """Combine 3 peptides to one, if the first is an - 'RxK'-peptide: RX, XR, KX, XK - where the X can - be any other amino-acid. + """Combine 3 peptides to one, if the first is an + 'RxK'-peptide: RX, XR, KX, XK - where the X can + be any other amino-acid. 
Returns ------- @@ -67,7 +67,7 @@ def find_rxk_peptides(l_peptides): for i in range(len(l_peptides) - 2): if len(l_peptides[i]) <= 2: rdx_peptides.append( - ''.join(l_peptides[i:i+3]) + ''.join(l_peptides[i:i + 3]) ) return rdx_peptides else: diff --git a/vaep/imputation.py b/vaep/imputation.py index 98b715ff2..4dd553ae9 100644 --- a/vaep/imputation.py +++ b/vaep/imputation.py @@ -6,7 +6,6 @@ """ from typing import Tuple, Dict -from sklearn.impute import KNNImputer from sklearn.neighbors import NearestNeighbors import scipy import numpy as np @@ -21,9 +20,9 @@ def impute_missing(protein_values, mean=None, std=None): """ - Imputation is based on the mean and standard deviation + Imputation is based on the mean and standard deviation from the protein_values. - If mean and standard deviation (std) are given, + If mean and standard deviation (std) are given, missing values are imputed and protein_values are returned imputed. If no mean and std are given, the mean and std are computed from the non-missing protein_values. @@ -39,7 +38,7 @@ def impute_missing(protein_values, mean=None, std=None): protein_values: pandas.Series """ raise NotImplementedError('Will be the main function combining features') - #clip by zero? + # clip by zero? def _select_data(data: pd.DataFrame, threshold: float): @@ -59,7 +58,7 @@ def _select_data(data: pd.DataFrame, threshold: float): def _sparse_coo_array(data: pd.DataFrame): - """Return a sparse scipy matrix from dense `pandas.DataFrame` with many + """Return a sparse scipy matrix from dense `pandas.DataFrame` with many missing values. 
""" indices = np.nonzero(~np.isnan(data.to_numpy())) @@ -84,7 +83,7 @@ def _get_weighted_mean(distances, data): # could be done in PCA transformed space def imputation_KNN(data, alone=True, threshold=0.5): """ - + Parameters ---------- @@ -110,7 +109,7 @@ def imputation_KNN(data, alone=True, threshold=0.5): "for ids: {}".format(ids[distances == 0.0]) ) mask = data_selected.iloc[i].isna() - data_selected.loc[i, mask] = mean_imputed.loc[mask] # SettingWithCopyError + data_selected.loc[i, mask] = mean_imputed.loc[mask] # SettingWithCopyError data.update(data_selected) return data @@ -132,7 +131,7 @@ def imputation_normal_distribution(log_intensities: pd.Series, Shift the mean of the log_intensities by factors of their standard deviation to the negative. std_shrinkage: float - Value greater than zero by which to shrink (or inflate) the + Value greater than zero by which to shrink (or inflate) the standard deviation of the log_intensities. """ np.random.seed(RANDOMSEED) @@ -140,7 +139,7 @@ def imputation_normal_distribution(log_intensities: pd.Series, try: log_intensities.Series(log_intensities) logger.warning("Series created of Iterable.") - except: + except BaseException: raise ValueError( "Plese provided data which is a pandas.Series or an Iterable") if mean_shift < 0: @@ -165,10 +164,10 @@ def imputation_normal_distribution(log_intensities: pd.Series, np.random.normal(mean_shifted, std_shrinked)) -def impute_shifted_normal(df_wide:pd.DataFrame, - mean_shift:float=1.8, - std_shrinkage:float=0.3, - completeness:float=0.6, +def impute_shifted_normal(df_wide: pd.DataFrame, + mean_shift: float = 1.8, + std_shrinkage: float = 0.3, + completeness: float = 0.6, axis=1, seed=RANDOMSEED) -> pd.Series: """Get replacements for missing values. 
@@ -212,15 +211,15 @@ def impute_shifted_normal(df_wide:pd.DataFrame, N, M = df_wide.shape if axis == 1: imputed_shifted_normal = pd.DataFrame( - np.random.normal(mean_shifted, std_shrinked, size=(M, N)), - index=df_wide.columns, - columns=df_wide.index) + np.random.normal(mean_shifted, std_shrinked, size=(M, N)), + index=df_wide.columns, + columns=df_wide.index) imputed_shifted_normal = imputed_shifted_normal.T else: imputed_shifted_normal = pd.DataFrame( - np.random.normal(mean_shifted, std_shrinked, size=(N, M)), - index=df_wide.index, - columns=df_wide.columns) + np.random.normal(mean_shifted, std_shrinked, size=(N, M)), + index=df_wide.index, + columns=df_wide.columns) imputed_shifted_normal = imputed_shifted_normal[df_wide.isna()].stack() return imputed_shifted_normal @@ -234,19 +233,19 @@ def imputation_mixed_norm_KNN(data): return data -def compute_moments_shift(observed: pd.Series, imputed: pd.Series, names:Tuple[str, str]=('observed', 'imputed')) -> Dict[str, float]: +def compute_moments_shift(observed: pd.Series, imputed: pd.Series, + names: Tuple[str, str] = ('observed', 'imputed')) -> Dict[str, float]: """Summary of overall shift of mean and std. dev. of predictions for a imputation method.""" name_obs, name_model = names data = {name: {'mean': series.mean(), 'std': series.std()} for series, name in zip([observed, imputed], names)} observed, imputed = data[name_obs], data[name_model] - shifts = dict() data[name_model]['mean shift (in std)'] = (observed["mean"] - imputed["mean"]) / observed["std"] data[name_model]['std shrinkage'] = imputed["std"] / observed["std"] return data -def stats_by_level(series:pd.Series, index_level:int=0, min_count:int=5) -> pd.Series: +def stats_by_level(series: pd.Series, index_level: int = 0, min_count: int = 5) -> pd.Series: """Count, mean and std. dev. 
by index level.""" agg = series.groupby(level=index_level).agg(['count', 'mean', 'std']) agg = agg.loc[agg['count'] > min_count] - return agg.mean() \ No newline at end of file + return agg.mean() diff --git a/vaep/io/dataloaders.py b/vaep/io/dataloaders.py index 88128396e..e98d2dd03 100644 --- a/vaep/io/dataloaders.py +++ b/vaep/io/dataloaders.py @@ -67,7 +67,8 @@ def __repr__(self): def get_dls(train_X: pandas.DataFrame, valid_X: pandas.DataFrame, transformer: VaepPipeline, - bs: int = 64) -> DataLoaders: + bs: int = 64, + num_workers=0) -> DataLoaders: """Create training and validation dataloaders Parameters @@ -112,7 +113,10 @@ def get_dls(train_X: pandas.DataFrame, valid_ds = datasets.DatasetWithTargetSpecifyTarget(df=train_X, targets=valid_X, transformer=transformer) - return DataLoaders.from_dsets(train_ds, valid_ds, bs=bs, drop_last=False) + # ! Need for script exection (as plain python file) + # https://pytorch.org/docs/stable/notes/windows.html#multiprocessing-error-without-if-clause-protection + return DataLoaders.from_dsets(train_ds, valid_ds, bs=bs, drop_last=False, + num_workers=num_workers) # dls.test_dl diff --git a/vaep/io/datasplits.py b/vaep/io/datasplits.py index 9e0e08b93..a6de3ace5 100644 --- a/vaep/io/datasplits.py +++ b/vaep/io/datasplits.py @@ -51,10 +51,10 @@ class DataSplits(): test_y: pd.DataFrame = None - def __post_init__(self, is_wide_format=True): + def __post_init__(self): self._items = sorted(self.__dict__) + self._is_wide = self.is_wide_format self._items.remove('is_wide_format') - self._is_wide = is_wide_format # needs to be set explicitly when using the init def __getitem__(self, index): return (self._items[index], getattr(self, self._items[index])) diff --git a/vaep/logging.py b/vaep/logging.py index ddbe54408..6158b5ba8 100644 --- a/vaep/logging.py +++ b/vaep/logging.py @@ -1,3 +1,4 @@ +"""Custom logging setup for notebooks.""" from pathlib import Path from datetime import datetime import logging @@ -8,10 +9,10 @@ def 
setup_nb_logger(level: int = logging.INFO, - format_str: str = f'%(name)s - %(levelname)-8s %(message)s') -> None: + format_str: str = '%(name)s - %(levelname)-8s %(message)s') -> None: logging.basicConfig(level=level, format=format_str) root_logger = logging.getLogger() - root_logger.setLevel(level) # in case root_logger existed already before calling basicConfig + root_logger.setLevel(level) # in case root_logger existed already before calling basicConfig c_format = logging.Formatter(format_str) if root_logger.handlers: handler = root_logger.handlers[0] @@ -20,6 +21,7 @@ def setup_nb_logger(level: int = logging.INFO, handler.setFormatter(c_format) return root_logger + def setup_logger_w_file(logger, level=logging.INFO, fname_base=None): """Setup logging in project. Takes a logger an creates @@ -43,13 +45,13 @@ def setup_logger_w_file(logger, level=logging.INFO, fname_base=None): >>> logger = logging.getLogger('vaep') >>> _ = setup_logger_w_file(logger) # no logging to file >>> logger.handlers = [] # reset logger - >>> _ = setup_logger_w_file() # + >>> _ = setup_logger_w_file() # """ logger.setLevel(level) logger.handlers = [] # remove any handler in case you reexecute the cell - c_format = logging.Formatter(f'%(name)s - %(levelname)-8s %(message)s') + c_format = logging.Formatter('%(name)s - %(levelname)-8s %(message)s') c_handler = logging.StreamHandler(sys.stdout) c_handler.setLevel(level) diff --git a/vaep/models/__init__.py b/vaep/models/__init__.py index 949ceabba..55ac20670 100644 --- a/vaep/models/__init__.py +++ b/vaep/models/__init__.py @@ -263,10 +263,10 @@ def calculte_metrics(pred_df: pd.DataFrame, else: if issubclass(type(true_col), int): y_true = pred_df.iloc[:, true_col] - pred_df = pred_df.drop(y_true.name, axis=1) + y_pred = pred_df.drop(y_true.name, axis=1) elif issubclass(type(true_col), str): y_true = pred_df[true_col] - pred_df = pred_df.drop(true_col, axis=1) + y_pred = pred_df.drop(true_col, axis=1) else: raise ValueError( f'true_col has to 
be of type str or int, not {type(true_col)}') diff --git a/vaep/models/ae.py b/vaep/models/ae.py index c3a8cd8e6..29cc99dff 100644 --- a/vaep/models/ae.py +++ b/vaep/models/ae.py @@ -70,7 +70,7 @@ def get_preds_from_df(df: pd.DataFrame, - +leaky_relu_default = nn.LeakyReLU(.1) class Autoencoder(nn.Module): """Autoencoder base class. @@ -79,8 +79,8 @@ class Autoencoder(nn.Module): def __init__(self, n_features: int, - n_neurons: Union[int, list], - activation=nn.LeakyReLU(.1), + n_neurons: Union[int, List[int]], + activation=leaky_relu_default, last_decoder_activation=None, dim_latent: int = 10): """Initialize an Autoencoder @@ -106,7 +106,7 @@ def __init__(self, self.layers = [n_features, *self.n_neurons] self.dim_latent = dim_latent - #define architecture hidden layer + # define architecture hidden layer def build_layer(in_feat, out_feat): return [nn.Linear(in_feat, out_feat), nn.Dropout(0.2), @@ -192,15 +192,15 @@ def get_missing_values(df_train_wide: pd.DataFrame, # def __init__(self, # n_features: int, -# h_layers: int, +# n_neurons: int, # activation=nn.LeakyReLU, # last_encoder_activation=nn.LeakyReLU, # last_decoder_activation=None, # dim_latent: int = 10): # super().__init__() -# self.n_features, self.h_layers = n_features, list(L(h_layers)) -# self.layers = [n_features, *self.h_layers] +# self.n_features, self.n_neurons = n_features, list(L(n_neurons)) +# self.layers = [n_features, *self.n_neurons] # self.dim_latent = dim_latent # # Encoder diff --git a/vaep/models/vae.py b/vaep/models/vae.py index 79436acd1..e7e2e8401 100644 --- a/vaep/models/vae.py +++ b/vaep/models/vae.py @@ -13,23 +13,23 @@ from torch import nn import torch.nn.functional as F - +leaky_relu_default = nn.LeakyReLU(.1) class VAE(nn.Module): def __init__(self, n_features: int, - h_layers: List[str], - activation=nn.LeakyReLU(.1), - # last_encoder_activation=nn.LeakyReLU(.1), + n_neurons: List[int], + activation=leaky_relu_default, + # last_encoder_activation=leaky_relu_default, 
last_decoder_activation=None, dim_latent: int = 10): super().__init__() - #set up hyperparameters - self.n_features, self.h_layers = n_features, list(h_layers) - self.layers = [n_features, *self.h_layers] + # set up hyperparameters + self.n_features, self.n_neurons = n_features, list(n_neurons) + self.layers = [n_features, *self.n_neurons] self.dim_latent = dim_latent - #define architecture hidden layer + # define architecture hidden layer def build_layer(in_feat, out_feat): return [nn.Linear(in_feat, out_feat), nn.Dropout(0.2), diff --git a/vaep/plotting/data.py b/vaep/plotting/data.py index 1ec2f291e..1fbd31d7a 100644 --- a/vaep/plotting/data.py +++ b/vaep/plotting/data.py @@ -26,11 +26,12 @@ def get_min_max_iterable(series: Iterable[pd.Series]) -> Tuple[int]: return min_bin, max_bin -def plot_histogram_intensites(s: pd.Series, +def plot_histogram_intensities(s: pd.Series, interval_bins=1, min_max=(15, 40), ax=None, **kwargs) -> Tuple[Axes, range]: + """Plot intensities in Series in a certain range and equally spaced intervals.""" min_bin, max_bin = min_max bins = range(min_bin, max_bin, interval_bins) ax = s.plot.hist(bins=bins, xticks=list(bins), diff --git a/vaep/sklearn/__init__.py b/vaep/sklearn/__init__.py index 810959779..fcad94eea 100644 --- a/vaep/sklearn/__init__.py +++ b/vaep/sklearn/__init__.py @@ -1,22 +1,28 @@ +"""Scikit-learn related functions for the project for ALD part. + +Might be moved to a separate package in the future. 
+""" import pandas as pd import sklearn -from sklearn.model_selection import RepeatedStratifiedKFold, cross_validate +import sklearn.model_selection from mrmr import mrmr_classif from .types import Splits, ResultsSplit, Results, AucRocCurve, PrecisionRecallCurve +default_model: sklearn.linear_model.LogisticRegression = sklearn.linear_model.LogisticRegression( + random_state=42, + solver='liblinear') + def run_model(splits: Splits, - model: sklearn.base.BaseEstimator = sklearn.linear_model.LogisticRegression(random_state=42, solver='liblinear'), + model: sklearn.base.BaseEstimator = default_model, n_feat_to_select=9, ) -> Results: selected_features = mrmr_classif(X=splits.X_train, y=splits.y_train, K=n_feat_to_select) model.fit(splits.X_train[selected_features], splits.y_train) - - pred_score_test = model.predict_proba( splits.X_test[selected_features])[:, 1] results_test = get_results_split(y_true=splits.y_test, y_score=pred_score_test) @@ -25,7 +31,6 @@ def run_model(splits: Splits, splits.X_train[selected_features])[:, 1] results_train = get_results_split(y_true=splits.y_train, y_score=pred_score_train) - ret = Results(model=model, selected_features=selected_features, train=results_train, @@ -48,13 +53,15 @@ def get_results_split(y_true, y_score): return ret -# model = LogisticRegression(random_state=random_state, solver='liblinear') +scoring_defaults = ['precision', 'recall', 'f1', 'balanced_accuracy', 'roc_auc'] + + def find_n_best_features(X, y, name, - model=sklearn.linear_model.LogisticRegression(random_state=42, solver='liblinear'), + model=default_model, groups=None, n_features_max=15, random_state=42, - scoring=['precision', 'recall', 'f1', 'balanced_accuracy', 'roc_auc']): + scoring=scoring_defaults): summary = [] cv = sklearn.model_selection.RepeatedStratifiedKFold( n_splits=5, n_repeats=10, random_state=random_state) @@ -62,7 +69,7 @@ def find_n_best_features(X, y, name, # could have a warning in case _X = X.loc[in_both] _y = y.loc[in_both] - for 
n_features in range(1, n_features_max+1): + for n_features in range(1, n_features_max + 1): selected_features = mrmr_classif(_X, _y, K=n_features) _X_mrmr = _X[selected_features] scores = sklearn.model_selection.cross_validate( diff --git a/vaep/sklearn/ae_transformer.py b/vaep/sklearn/ae_transformer.py new file mode 100644 index 000000000..6dcec4521 --- /dev/null +++ b/vaep/sklearn/ae_transformer.py @@ -0,0 +1,147 @@ +"""Scikit-learn style interface for Denoising and Variational Autoencoder model.""" +from __future__ import annotations + +from sklearn.impute import SimpleImputer +from sklearn.preprocessing import StandardScaler +import sklearn + +from pathlib import Path + + +from fastai.basics import * +from fastai.callback.all import * +from fastai.torch_basics import * + +from fastai import learner + +from sklearn.utils.validation import check_is_fitted +from sklearn.base import BaseEstimator, TransformerMixin + +from typing import Optional + +import vaep.models as models +from vaep.models import ae + + +# patch plotting function +from vaep.models import plot_loss +learner.Recorder.plot_loss = plot_loss + + +default_pipeline = sklearn.pipeline.Pipeline( + [ + ('normalize', StandardScaler()), + ('impute', SimpleImputer(add_indicator=False)) + ]) + + +class AETransformer(TransformerMixin, BaseEstimator): + """Collaborative Filtering transformer. + + + Parameters + ---------- + demo_param : str, default='demo' + A parameter used for demonstation of how to pass and store paramters. + + Attributes + ---------- + n_features_ : int + The number of features of the data passed to :meth:`fit`. 
+ """ + + def __init__(self, + hidden_layers: list[int], + latent_dim: int = 15, + out_folder: str = '.', + model='VAE', + # y_range:Optional[tuple[int]]=None, + batch_size: int = 64, + ): + self.hidden_layers = hidden_layers + self.latent_dim = latent_dim + self.batch_size = batch_size + self.out_folder = Path(out_folder) + + if model == 'VAE': + self.model = models.vae.VAE + self.cbs = [ae.ModelAdapterVAE()] + self.loss_fct = models.vae.loss_fct + elif model == 'DAE': + self.model = ae.Autoencoder + self.cbs = [ae.ModelAdapter(p=0.2)] + self.loss_fct = MSELossFlat(reduction='sum') + else: + raise ValueError(f'Unknown model {model}, choose either "VAE" or "DAE"') + self.model_name = model + # ! patience? + # EarlyStoppingCallback(patience=args.patience) + + def fit(self, X, y, + epochs_max: int = 100, + cuda: bool = True, + patience: Optional[int] = None): + self.analysis = ae.AutoEncoderAnalysis( # datasplits=data, + train_df=X, + val_df=y, + model=self.model, + model_kwargs=dict(n_features=X.shape[-1], + n_neurons=self.hidden_layers, + last_decoder_activation=None, + dim_latent=self.latent_dim), + transform=default_pipeline, + decode=['normalize'], + bs=self.batch_size) + + self.n_params = self.analysis.n_params_ae + if cuda: + self.analysis.model = self.analysis.model.cuda() + + # results = [] + # loss_fct = partial(models.vae.loss_fct, results=results) + cbs = self.cbs + if patience is not None: + cbs = [*self.cbs, EarlyStoppingCallback(patience=patience)] + self.analysis.learn = Learner(dls=self.analysis.dls, + model=self.analysis.model, + loss_func=self.loss_fct, + cbs=cbs + ) + + suggested_lr = self.analysis.learn.lr_find() + self.analysis.params['suggested_inital_lr'] = suggested_lr.valley + self.analysis.learn.fit_one_cycle(epochs_max, lr_max=suggested_lr.valley) + self.epochs_trained_ = self.analysis.learn.epoch + 1 + N_train_notna = X.notna().sum().sum() + N_val_notna = y.notna().sum().sum() + self.fig_loss_ = models.plot_training_losses( + 
self.analysis.learn, self.model_name, + folder=self.out_folder, + norm_factors=[N_train_notna, N_val_notna]) + return self + + def transform(self, X): + """ A reference implementation of a transform function. + + Parameters + ---------- + X : {array-like, sparse-matrix}, shape (n_samples, n_features) + The input samples. + + Returns + ------- + X_transformed : array, shape (n_samples, n_features) + The array containing the element-wise square roots of the values + in ``X``. + """ + # Check is fit had been called + check_is_fitted(self, 'epochs_trained_') + + self.analysis.model.eval() + + pred, target = ae.get_preds_from_df( + df=X, + learn=self.analysis.learn, + position_pred_tuple=0, + transformer=self.analysis.transform) + return X.fillna(pred) diff --git a/vaep/sklearn/cf_transformer.py b/vaep/sklearn/cf_transformer.py new file mode 100644 index 000000000..42f7bcc8c --- /dev/null +++ b/vaep/sklearn/cf_transformer.py @@ -0,0 +1,179 @@ +"""Scikit-learn style interface for Collaborative Filtering model.""" +from __future__ import annotations + +from pathlib import Path + +from fastai.tabular.all import * +from fastai.collab import * + +from fastai import learner + +import pandas as pd + +from sklearn.utils.validation import check_is_fitted +from sklearn.base import BaseEstimator, TransformerMixin + +import vaep +from vaep.models import collab +import vaep.models as models + + +# patch plotting function +from vaep.models import plot_loss +learner.Recorder.plot_loss = plot_loss + + +class CollaborativeFilteringTransformer(TransformerMixin, BaseEstimator): + """Collaborative Filtering transformer. + + + Parameters + ---------- + demo_param : str, default='demo' + A parameter used for demonstation of how to pass and store paramters. + + Attributes + ---------- + n_features_ : int + The number of features of the data passed to :meth:`fit`. 
+ """ + + def __init__(self, + target_column: str, + sample_column: str, + item_column: str, + n_factors: int = 15, + out_folder: str = '.', + # y_range:Optional[tuple[int]]=None, + batch_size: int = 4096, + ): + self.target_column = target_column + self.item_column = item_column + self.sample_column = sample_column + self.n_factors = n_factors + self.out_folder = Path(out_folder) + self.batch_size = batch_size + + def fit(self, X: pd.Series, y: pd.Series = None, + cuda: bool = True, + patience: int = 1, + epochs_max=20,): + """A reference implementation of a fitting function for a transformer. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The training input samples. + y : None + There is no need of a target in a transformer, yet the pipeline API + requires this parameter. + + Returns + ------- + self : object + Returns self. + """ + # ! X = check_array(X, accept_sparse=True) + self.model_kwargs = dict( + + + n_factors=self.n_factors, + y_range=(int(X.squeeze().min()), + int(X.squeeze().max()) + 1) + ) + + if y is not None: + X, frac = collab.combine_data(X, y) + else: + X, frac = X.reset_index(), 0.0 + + self.dls = CollabDataLoaders.from_df( + X, + valid_pct=frac, + seed=42, + user_name=self.sample_column, + item_name=self.item_column, + rating_name=self.target_column, + bs=self.batch_size) + splits = None + if y is not None: + idx_splitter = IndexSplitter(list(range(len(X), len(X) + len(y)))) + splits = idx_splitter(X) + + self.cat_names = [self.sample_column, self.item_column] + self.to = TabularCollab(df=X, + procs=[Categorify], + cat_names=self.cat_names, + y_names=[self.target_column], + y_block=TransformBlock(), + splits=splits) + self.dls = self.to.dataloaders(path='.', bs=self.batch_size) + + self.model = EmbeddingDotBias.from_classes( + classes=self.dls.classes, + **self.model_kwargs) + + self.n_params = models.calc_net_weight_count(self.model) + # ana_collab.params['n_parameters'] = args.n_params + 
self.learn = Learner(dls=self.dls, + model=self.model, + loss_func=MSELossFlat(), + cbs=EarlyStoppingCallback(patience=patience) if y is not None else None, + model_dir=self.out_folder) + if cuda: + self.learn.model = self.learn.model.cuda() + + # + suggested_lr = self.learn.lr_find() + print(f"{suggested_lr.valley = :.5f}") + + self.learn.fit_one_cycle(epochs_max, + + lr_max=suggested_lr.valley) + self.plot_loss(y) + self.epochs_trained_ = self.learn.epoch + 1 + self.model_kwargs['suggested_inital_lr'] = suggested_lr.valley + # ? own method? + # self.learn.save('collab_model') + + return self + + def transform(self, X): + """ A reference implementation of a transform function. + + Parameters + ---------- + X : {array-like, sparse-matrix}, shape (n_samples, n_features) + The input samples. + + Returns + ------- + X_transformed : array, shape (n_samples, n_features) + The array containing the element-wise square roots of the values + in ``X``. + """ + # Check is fit had been called + check_is_fitted(self, 'epochs_trained_') + + # ! 
Input validation + # X = check_array(X, accept_sparse=True) + + X = X.squeeze() + mask = X.unstack().isna().stack() + idx_na = mask.loc[mask].index + dl_real_na = self.dls.test_dl(idx_na.to_frame()) + pred_na, _ = self.learn.get_preds(dl=dl_real_na) + pred_na = pd.Series(pred_na, idx_na, name=self.target_column) + return pd.concat([X, pred_na]) + + def plot_loss(self, y, figsize=(8, 4)): # -> Axes: + fig, ax = plt.subplots(figsize=figsize) + ax.set_title('CF loss: Reconstruction loss') + self.learn.recorder.plot_loss(skip_start=5, ax=ax, + with_valid=True if y is not None else False) + vaep.savefig(fig, name='collab_training', + folder=self.out_folder) + self.model_kwargs['batch_size'] = self.batch_size + vaep.io.dump_json(self.model_kwargs, self.out_folder / + 'model_params_{}.json'.format('CF')) + return ax diff --git a/vaep/sklearn/types.py b/vaep/sklearn/types.py index c2cdac654..70dfabe48 100644 --- a/vaep/sklearn/types.py +++ b/vaep/sklearn/types.py @@ -1,3 +1,4 @@ +"""Types used in scikit-learn pipelines.""" import pickle from dataclasses import dataclass from collections import namedtuple @@ -11,8 +12,8 @@ @dataclass class ResultsSplit: - auc: float = None # receiver operation curve area under the curve - aps: float = None # average precision score + auc: float = None # receiver operation curve area under the curve + aps: float = None # average precision score roc: AucRocCurve = None prc: PrecisionRecallCurve = None diff --git a/vaep/tests/test_tf_board.py b/vaep/tests/test_tf_board.py deleted file mode 100644 index 3bbea3398..000000000 --- a/vaep/tests/test_tf_board.py +++ /dev/null @@ -1,20 +0,0 @@ -from vaep.tf_board import TensorboardModelNamer -from vaep.transform import StandardScaler -import pytest - -def test_TensorboardModelNamer(): - expected = 'model_hl01_12_13_14_scaler' - - tensorboard_model_namer = TensorboardModelNamer(prefix_folder='experiment') - - assert tensorboard_model_namer.get_model_name( - hidden_layers=1, neurons=[12, 13, 14], 
scaler='scaler') == expected - assert tensorboard_model_namer.get_model_name( - hidden_layers=1, neurons='12 13 14', scaler='scaler') == expected - assert tensorboard_model_namer.get_model_name( - hidden_layers=1, neurons='12_13_14', scaler='scaler') == expected - scaler=StandardScaler() - assert tensorboard_model_namer.get_model_name( - hidden_layers=1, neurons='12_13_14', scaler=scaler) == 'model_hl01_12_13_14_StandardScaler()' - with pytest.raises(TypeError): - tensorboard_model_namer.get_writer(hidden_layers=1, neurons=1, scaler=scaler) \ No newline at end of file diff --git a/vaep/tests/test_transfrom.py b/vaep/tests/test_transfrom.py index e329ff1fa..0aa0176e8 100644 --- a/vaep/tests/test_transfrom.py +++ b/vaep/tests/test_transfrom.py @@ -13,10 +13,11 @@ from vaep.transform import StandardScaler, ShiftedStandardScaler, VaepPipeline from vaep.io.datasets import to_tensor -def test_log(): - row = pd.Series([np.NaN, 0.0, np.exp(1), np.exp(2)]) - row = log(row) - assert row.equals(pd.Series([np.NaN, np.NaN, 1.0, 2.0])) +# not used anywhere +# def test_log(): +# row = pd.Series([np.NaN, 0.0, np.exp(1), np.exp(2)]) +# row = log(row) +# assert row.equals(pd.Series([np.NaN, np.NaN, 1.0, 2.0])) def test_StandardScaler(): diff --git a/vaep/transform.py b/vaep/transform.py index 8482df4ee..ee3b9ac87 100644 --- a/vaep/transform.py +++ b/vaep/transform.py @@ -18,7 +18,7 @@ def log(row: pd.Series): - """Apply log Transformation to values.""" + """Apply log Transformation to values setting zeros to NaN.""" return np.log(row.where(row != 0.0)) @@ -35,7 +35,7 @@ def log(row: pd.Series): # axis=0) # analysis.corr_linear_vs_log.describe() -# Can this be a MixIn class? +# ? Can this be a MixIn class? class StandardScaler(preprocessing.StandardScaler): def transform(self, X, copy=None): res = super().transform(X, copy)