diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 48e101087..3fe7ca96b 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -65,7 +65,7 @@ jobs: run: | cd project snakemake -p -c1 --configfile config/single_dev_dataset/example/config.yaml -n - snakemake -p -c2 -k --configfile config/single_dev_dataset/example/config.yaml + snakemake -p -c1 -k --configfile config/single_dev_dataset/example/config.yaml - name: Archive results uses: actions/upload-artifact@v3 with: diff --git a/README.md b/README.md index 04dacdbe6..65782b91b 100644 --- a/README.md +++ b/README.md @@ -196,7 +196,7 @@ Packages either are based on this repository, or were referenced by NAGuideR (Ta From the brief description in the table the exact procedure is not always clear. | Method | Package | source | status | name | -| ------------- | ----------------- | ------ | --- |------------------ | +| ------------- | ----------------- | ------ | ------ |------------------ | | CF | pimms | pip | | Collaborative Filtering | | DAE | pimms | pip | | Denoising Autoencoder | | VAE | pimms | pip | | Variational Autoencoder | @@ -206,7 +206,7 @@ From the brief description in the table the exact procedure is not always clear. | COLMEDIAN | e1071 | CRAN | | replace NA with column median | | ROWMEDIAN | e1071 | CRAN | | replace NA with row median | | KNN_IMPUTE | impute | BIOCONDUCTOR | | k nearest neighbor imputation | -| SEQKNN | SeqKnn | tar file | | Sequential k- nearest neighbor imputation
start with feature with least missing values and re-use imputed values for not yet imputed features +| SEQKNN | SeqKnn | tar file | | Sequential k-nearest neighbor imputation
starts with the feature with the fewest missing values and re-uses imputed values for not yet imputed features | BPCA | pcaMethods | BIOCONDUCTOR | | Bayesian PCA missing value imputation | SVDMETHOD | pcaMethods | BIOCONDUCTOR | | replace NA initially with zero, use k most significant eigenvalues using Singular Value Decomposition for imputation until convergence | LLS | pcaMethods | BIOCONDUCTOR | | Local least squares imputation of a feature based on k most correlated features @@ -222,26 +222,12 @@ From the brief description in the table the exact procedure is not always clear. | TRKNN | - | script | | truncation k-nearest neighbor imputation | RF | missForest | CRAN | | Random Forest imputation (one feature at a time) | PI | - | - | | Downshifted normal distribution (per sample) +| GSIMP | - | script | | QRILC initialization and iterative Gibbs sampling with generalized linear models (glmnet) +| MSIMPUTE | msImpute | BIOCONDUCTOR | | Missing at random algorithm using low rank approximation +| MSIMPUTE_MNAR | msImpute | BIOCONDUCTOR | | Missing not at random algorithm using low rank approximation | ~~grr~~ | DreamAI | - | Fails to install | Rigde regression | ~~GMS~~ | GMSimpute | tar file | Fails on Windows | Lasso model - -## Workflows - -The workflows folder in the repository contains snakemake workflows used for rawfile data processing, -both for running MaxQuant over a large set of HeLa raw files -and ThermoRawFileParser on a list of raw files to extract their meta data. For details see: - -> Webel, Henry, Yasset Perez-Riverol, Annelaura Bach Nielson, and Simon Rasmussen. 2023. “Mass Spectrometry-Based Proteomics Data from Thousands of HeLa Control Samples.” Research Square. https://doi.org/10.21203/rs.3.rs-3083547/v1. - -### MaxQuant - -Process single raw files using MaxQuant. See [README](workflows/maxquant/README.md) for details. - -### Metadata - -Read metadata from single raw files using MaxQuant. See [README](workflows/metadata/README.md) for details. 
- ## Build status [![Documentation Status](https://readthedocs.org/projects/pimms/badge/?version=latest)](https://pimms.readthedocs.io/en/latest/?badge=latest) \ No newline at end of file diff --git a/environment.yml b/environment.yml index ce3760fba..8bcde8e71 100644 --- a/environment.yml +++ b/environment.yml @@ -47,8 +47,8 @@ dependencies: - jupyter-dash - papermill # execute ipynb's # R packages (listed in NAGuideR) - - r-base #=3.6 - - r-devtools # is it needed for source installs on windows server? + - r-base + - r-devtools # is it needed for source installs on windows server? - r-irkernel - r-reshape2 - r-stringi # + rmarkdown hack for reshape2 @@ -66,6 +66,7 @@ dependencies: - r-rrcov - r-gmm - r-tmvtnorm + - r-igraph # - bioconductor-biocinstaller # - r-imputelcmd # bioconda # - bioconductor-impute @@ -83,6 +84,5 @@ dependencies: # - jupyterlab_code_formatter # - jupyterlab-git - pip: - - -e . - - mrmr-selection - \ No newline at end of file + - -e . + - mrmr-selection diff --git a/project/00_0_0_lftp_upload_commands.ipynb b/project/00_0_0_lftp_upload_commands.ipynb deleted file mode 100644 index 81dd8c796..000000000 --- a/project/00_0_0_lftp_upload_commands.ipynb +++ /dev/null @@ -1,613 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "56becfbb-813a-4ea4-b29f-c234b1c45098", - "metadata": {}, - "source": [ - "# Rawfile and MaxQuant output folder renaming\n", - "\n", - "- generated using `workflows/metadata`\n", - "- all raw files collected ~50,000\n", - "- creates lftp upload commands" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0c1cab72-7447-473b-a3d5-1aee8c4815e8", - "metadata": {}, - "outputs": [], - "source": [ - "from pathlib import Path, PurePosixPath\n", - "import pandas as pd\n", - "import yaml\n", - "\n", - "\n", - "def rename(fname, new_sample_id, new_folder=None, ext=None):\n", - " fname = PurePosixPath(fname)\n", - " if ext is None:\n", - " ext = fname.suffix\n", - " if new_folder is None:\n", - " 
new_folder = fname.parent\n", - " else:\n", - " new_folder = PurePosixPath(new_folder)\n", - " fname = new_folder / new_sample_id\n", - " fname = fname.with_suffix(ext)\n", - " return fname.as_posix()" - ] - }, - { - "cell_type": "markdown", - "id": "d195963d-21b6-4766-a82e-761f31b288bf", - "metadata": {}, - "source": [ - "## Arguments" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7e1629d8-3768-4113-9835-3bed95f219a6", - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "fn_rawfile_metadata: str = 'data/rawfile_metadata.csv' # Machine parsed metadata from rawfile workflow\n", - "fn_mq_summaries: str = 'data/samples_selected_summaries.csv' # MaxQuant summary files\n", - "fn_files_selected: str = 'data/samples_selected.yaml' # selected files based on threshold of identified peptides\n", - "out_folder: str = 'data/rename' # output folder\n", - "fn_server_log: str = 'data/rename/mq_out_server.log' # server log of all uploaded files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "06ff016b-f6e0-4cfd-bdd1-fcae5dfc05e9", - "metadata": {}, - "outputs": [], - "source": [ - "out_folder = Path(out_folder)\n", - "out_folder.mkdir(exist_ok=True)\n", - "\n", - "files_out = dict()" - ] - }, - { - "cell_type": "markdown", - "id": "2232196d-7d24-419a-be70-2fac76428eae", - "metadata": {}, - "source": [ - "### Machine metadata\n", - "\n", - "- read from file using [ThermoRawFileParser](https://github.com/compomics/ThermoRawFileParser)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9869ac5e-fab3-4c66-a32c-48ae4fadc0a3", - "metadata": { - "lines_to_next_cell": 2, - "tags": [] - }, - "outputs": [], - "source": [ - "df_meta = pd.read_csv(fn_rawfile_metadata, header=[0, 1], index_col=0, low_memory=False)\n", - "date_col = ('FileProperties', 'Content Creation Date')\n", - "df_meta[date_col] = pd.to_datetime(\n", - " df_meta[date_col])\n", - "df_meta.sort_values(date_col, 
inplace=True)\n", - "msg = f\"A total of {len(df_meta)} raw files could be read using the ThermoFisherRawFileParser.\"\n", - "print(msg)\n", - "df_meta" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ab1b1bc2-c531-483c-b498-0fb0654bc7dc", - "metadata": {}, - "outputs": [], - "source": [ - "meta_stats = df_meta.describe(include='all', datetime_is_numeric=True)\n", - "meta_stats.T" - ] - }, - { - "cell_type": "markdown", - "id": "a27b9295-47b4-487f-9ef3-84aa2890d843", - "metadata": {}, - "source": [ - "# Erda Paths" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d1c30a7b-fe2e-4a71-a85a-d684cc62ce05", - "metadata": {}, - "outputs": [], - "source": [ - "cols_identifies = [('FileProperties', 'Pathname'),\n", - " ('FileProperties', 'Version'),\n", - " ('FileProperties', 'Content Creation Date'),\n", - " ('InstrumentProperties', 'Thermo Scientific instrument model'),\n", - " ('InstrumentProperties', 'instrument attribute'),\n", - " ('InstrumentProperties', 'instrument serial number'),\n", - " ('InstrumentProperties', 'Software Version'),\n", - " ('InstrumentProperties', 'firmware version'),\n", - "]\n", - "\n", - "df_meta = df_meta[cols_identifies]\n", - "df_meta.columns = [t[-1] for t in cols_identifies]\n", - "df_meta" - ] - }, - { - "cell_type": "markdown", - "id": "7f61bf7b-3473-487b-b8eb-ac5a6191d507", - "metadata": {}, - "source": [ - "Replace `tmp/` with `./` (artefact)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "17dc5955-3265-4709-a0ec-dc1df505e7d5", - "metadata": {}, - "outputs": [], - "source": [ - "df_meta['Pathname'] = df_meta['Pathname'].str.replace('tmp/', './')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "244501b8-2fbd-461a-844a-5df91b61e18d", - "metadata": {}, - "outputs": [], - "source": [ - "df_meta[\"Instrument_name\"] = (\n", - " df_meta[\"Thermo Scientific instrument model\"].str.replace(' ', '-')\n", - " + '_'\n", - " + df_meta[\"instrument 
serial number\"].str.split('#').str[-1]\n", - ").str.replace(' ', '-')\n", - "\n", - "df_meta[\"Instrument_name\"].value_counts().index" - ] - }, - { - "cell_type": "markdown", - "id": "6add2fad-2c1c-4542-bcfb-efd1a0ea108f", - "metadata": {}, - "source": [ - "Create new sample identifier" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b42f31ba-e0d5-4afc-8328-f5200213ff85", - "metadata": {}, - "outputs": [], - "source": [ - "date_col = \"Content Creation Date\"\n", - "idx_all = (pd.to_datetime(df_meta[date_col]).dt.strftime(\"%Y_%m_%d_%H_%M\")\n", - " + '_'\n", - " + df_meta[\"Instrument_name\"]\n", - ").str.replace(' ', '-')\n", - "\n", - "mask = idx_all.duplicated(keep=False)\n", - "duplicated_sample_idx = idx_all.loc[mask].sort_values() # duplicated dumps\n", - "duplicated_sample_idx" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1ddbec78-97bf-4238-9b46-533d20605973", - "metadata": {}, - "outputs": [], - "source": [ - "df_meta['new_sample_id'] = idx_all\n", - "\n", - "\n", - "\n", - "_n = df_meta.groupby(\"new_sample_id\").cumcount().astype('string').str.replace('0', '')\n", - "_n[_n != ''] = '_r' + _n[_n != '']\n", - "_n.value_counts()\n", - "\n", - "df_meta.loc[mask, \"new_sample_id\"] = df_meta.loc[mask, \"new_sample_id\"] + _n\n", - "\n", - "\n", - "df_meta.loc[mask, [\"Pathname\", \"new_sample_id\"]]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "21962b78-d9c8-4037-aea5-b13e0d5d84ca", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "df_meta.loc[~mask, [\"Pathname\", \"new_sample_id\"]]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d4e0b289-ba7d-4c9b-a08b-c0aafc733855", - "metadata": {}, - "outputs": [], - "source": [ - "assert df_meta[\"Pathname\"].is_unique\n", - "assert df_meta[\"new_sample_id\"].is_unique" - ] - }, - { - "cell_type": "markdown", - "id": "fb446855-eb2f-4000-8c22-a84e58ce8130", - "metadata": {}, - "source": 
[ - "### Save new paths to disk" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bef2adce-afc7-4698-8aea-4c6415792133", - "metadata": {}, - "outputs": [], - "source": [ - "df_meta[\"Path_old\"] = df_meta[\"Pathname\"]\n", - "\n", - "df_meta[[\"Path_old\", \"new_sample_id\"]]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a67ac714-c966-47ce-8faf-38ca8c94fca7", - "metadata": {}, - "outputs": [], - "source": [ - "df_meta" - ] - }, - { - "cell_type": "markdown", - "id": "3a738dd3-cb4b-4940-bf48-5192186e3614", - "metadata": {}, - "source": [ - "## Selected Files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d630ba07-0674-40b7-b6e9-92a34f4e4788", - "metadata": {}, - "outputs": [], - "source": [ - "with open(fn_files_selected) as f:\n", - " files_selected = yaml.safe_load(f)\n", - "print(f'Threshold: {files_selected[\"threshold\"]:,d}')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c2204d45-20b5-4b24-8af1-04614769b275", - "metadata": {}, - "outputs": [], - "source": [ - "df_meta.loc[files_selected[\"files\"]]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f79366bc-38ba-415b-bccf-0a842241ca03", - "metadata": {}, - "outputs": [], - "source": [ - "mask = idx_all.duplicated()\n", - "selected = df_meta.loc[~mask].index.intersection(files_selected[\"files\"])\n", - "df_meta.loc[selected]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ccb44ce0-253c-485b-8533-f6716e9855d3", - "metadata": {}, - "outputs": [], - "source": [ - "def build_instrument_name(s):\n", - " \"\"\"Process in order, only keep one name\"\"\"\n", - " ret = ''\n", - " used_before = set()\n", - " for string_w_withspaces in s:\n", - " strings_ = string_w_withspaces.split()\n", - " for string_ in strings_:\n", - " if string_ not in used_before:\n", - " ret += f'_{string_}'\n", - " used_before |= set(strings_)\n", - " ret = (ret[1:] # remove _ from start\n", - " 
.replace('Slot_#', '')\n", - " .replace('slot_#', '')\n", - " )\n", - " return ret\n", - "\n", - "\n", - "(df_meta[\n", - " [\n", - " \"Thermo Scientific instrument model\",\n", - " \"instrument attribute\",\n", - " \"instrument serial number\",\n", - " ]\n", - " ]\n", - " .sample(20)\n", - " .apply(build_instrument_name, axis=1)\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d1a3d03d-e79d-4cb3-9f9e-7b415060f38b", - "metadata": {}, - "outputs": [], - "source": [ - "fname = out_folder / 'selected_old_new_id_mapping.csv'\n", - "files_out[fname.name] = fname.as_posix()\n", - "df_meta.loc[selected].to_csv(fname)\n", - "fname" - ] - }, - { - "cell_type": "markdown", - "id": "b0c69dd0-53a5-480e-a7cc-bb43b49a09cb", - "metadata": {}, - "source": [ - "### OS rename" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "801ccc99-a0f6-44bb-9605-5cf01cf57d21", - "metadata": {}, - "outputs": [], - "source": [ - "df_meta.loc[selected][[\"Path_old\", \"new_sample_id\"]]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "447c9308", - "metadata": {}, - "outputs": [], - "source": [ - "(df_meta\n", - " .loc[selected, \"Path_old\"]\n", - " .iloc[:3]\n", - " .to_csv(out_folder / 'rawfiles_to_checksum.txt',\n", - " index=False,\n", - " header=False)\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "f07d1e53-f017-4d61-970d-3eb4ca2905c5", - "metadata": {}, - "source": [ - "Save summaries for selected files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "62c3084e", - "metadata": {}, - "outputs": [], - "source": [ - "df_summaries = pd.read_csv(fn_mq_summaries, index_col=0)\n", - "df_summaries = df_summaries.loc[selected].rename(df_meta.loc[selected, 'new_sample_id'])\n", - "df_summaries.to_csv(out_folder / 'mq_summaries.csv')\n", - "del df_summaries" - ] - }, - { - "cell_type": "markdown", - "id": "556a7087", - "metadata": {}, - "source": [ - "## Put files on PRgIDE FTP server\n", - 
"\n", - "rename using `new_sample_id`" - ] - }, - { - "cell_type": "markdown", - "id": "c9f014ce-8efc-48a0-9779-435385dfc792", - "metadata": {}, - "source": [ - "### LFTP commands - raw files\n", - "\n", - "`-f` option allows to pass commands from a file\n", - "One needs to at least an `open` as the first line to log in to an ftp server\n", - "For pride one needs to additionally `cd` to the correct folder:\n", - "```bash\n", - "> open ...\n", - "> cd ...\n", - "```\n", - "to allow parallell commands, use the runtime setting\n", - "```bash\n", - ">>> cat ~/.lftprc \n", - "set cmd:parallel 2\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "8d5998e4", - "metadata": {}, - "source": [ - "Create folders on pride for raw files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "080b3773", - "metadata": {}, - "outputs": [], - "source": [ - "df_meta[\"folder_raw\"] = \"./raw_files/\" + df_meta[\"Instrument_name\"]\n", - "df_meta[\"folder_raw\"].unique()\n", - "\n", - "fname = out_folder / 'raw_file_directories.txt'\n", - "\n", - "commands = 'mkdir -p ' + df_meta.loc[selected, \"folder_raw\"].drop_duplicates()\n", - "commands.to_csv(fname, header=False, index=False)" - ] - }, - { - "cell_type": "markdown", - "id": "1a124d24", - "metadata": {}, - "source": [ - "Create upload commands of raw files to create folders (could be combined with above)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "790c66e2-1a1e-46e2-9fd9-5ad7b86fc793", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "commands = df_meta.loc[selected]\n", - "commands = (\n", - " 'put ' \n", - " + commands['Path_old'].astype('string')\n", - " + ' -o ' \n", - " + \"./raw_files/\" \n", - " + commands[\"Instrument_name\"] \n", - " + '/'\n", - " + commands['new_sample_id'] + '.raw'\n", - ")\n", - "print(commands.sample(10).to_csv(sep=' ', header=False, index=False))" - ] - }, - { - "cell_type": "markdown", - "id": "b8a6ebb2", - 
"metadata": {}, - "source": [ - "write all to file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9d1263b3", - "metadata": {}, - "outputs": [], - "source": [ - "fname = out_folder / 'lftp_commands_rawfiles.txt'\n", - "commands.to_csv(fname, header=False, index=False)" - ] - }, - { - "cell_type": "markdown", - "id": "bddf9902-71fe-4b86-930e-ced060d867ff", - "metadata": {}, - "source": [ - "### LFTP commands - MaxQuant output" - ] - }, - { - "cell_type": "markdown", - "id": "2b6faff1-92a4-46ff-a14f-f6fb241265d7", - "metadata": {}, - "source": [ - "Create upload commands of MaxQuant output folders to pride using mirror\n", - "\n", - "- `mq_out` folder\n", - "- move from `Sample ID` folder into `new_sample_id` on erda" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d60b149d-d32a-4d97-b908-1c778a58a224", - "metadata": {}, - "outputs": [], - "source": [ - "commands = df_meta.loc[selected]\n", - "commands = (\n", - " \"mirror -R --only-missing --log log_lftp_mirror.log --exclude-glob *.pdf \" # command\n", - " + \"mq_out/\" + commands.index # source\n", - " + \" ./MQ_tables/\" + commands[\"Instrument_name\"]+ \"/\" + commands[\"new_sample_id\"] # dest\n", - ")\n", - "\n", - "print(commands.sample(10).to_csv(header=False, index=False))" - ] - }, - { - "cell_type": "markdown", - "id": "6989ded5", - "metadata": {}, - "source": [ - "write all to file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "83c04b90-0c4e-4fe7-88f6-ed02cef93a23", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "fname = out_folder / 'lftp_commands_mq_output.txt'\n", - "commands.to_csv(fname, header=False, index=False)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": 
"python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/project/00_0_0_lftp_upload_commands.py b/project/00_0_0_lftp_upload_commands.py deleted file mode 100644 index 921a5733f..000000000 --- a/project/00_0_0_lftp_upload_commands.py +++ /dev/null @@ -1,315 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.15.0 -# kernelspec: -# display_name: Python 3 -# language: python -# name: python3 -# --- - -# %% [markdown] -# # Rawfile and MaxQuant output folder renaming -# -# - generated using `workflows/metadata` -# - all raw files collected ~50,000 -# - creates lftp upload commands - -# %% -from pathlib import Path, PurePosixPath -import pandas as pd -import yaml - - -def rename(fname, new_sample_id, new_folder=None, ext=None): - fname = PurePosixPath(fname) - if ext is None: - ext = fname.suffix - if new_folder is None: - new_folder = fname.parent - else: - new_folder = PurePosixPath(new_folder) - fname = new_folder / new_sample_id - fname = fname.with_suffix(ext) - return fname.as_posix() - - -# %% [markdown] -# ## Arguments - -# %% tags=["parameters"] -fn_rawfile_metadata: str = 'data/rawfile_metadata.csv' # Machine parsed metadata from rawfile workflow -fn_mq_summaries: str = 'data/samples_selected_summaries.csv' # MaxQuant summary files -fn_files_selected: str = 'data/samples_selected.yaml' # selected files based on threshold of identified peptides -out_folder: str = 'data/rename' # output folder -fn_server_log: str = 'data/rename/mq_out_server.log' # server log of all uploaded files - -# %% -out_folder = Path(out_folder) -out_folder.mkdir(exist_ok=True) - -files_out = dict() - -# %% [markdown] -# ### Machine metadata -# -# - read from file using [ThermoRawFileParser](https://github.com/compomics/ThermoRawFileParser) - -# %% -df_meta = 
pd.read_csv(fn_rawfile_metadata, header=[0, 1], index_col=0, low_memory=False) -date_col = ('FileProperties', 'Content Creation Date') -df_meta[date_col] = pd.to_datetime( - df_meta[date_col]) -df_meta.sort_values(date_col, inplace=True) -msg = f"A total of {len(df_meta)} raw files could be read using the ThermoFisherRawFileParser." -print(msg) -df_meta - - -# %% -meta_stats = df_meta.describe(include='all', datetime_is_numeric=True) -meta_stats.T - -# %% [markdown] -# # Erda Paths - -# %% -cols_identifies = [('FileProperties', 'Pathname'), - ('FileProperties', 'Version'), - ('FileProperties', 'Content Creation Date'), - ('InstrumentProperties', 'Thermo Scientific instrument model'), - ('InstrumentProperties', 'instrument attribute'), - ('InstrumentProperties', 'instrument serial number'), - ('InstrumentProperties', 'Software Version'), - ('InstrumentProperties', 'firmware version'), -] - -df_meta = df_meta[cols_identifies] -df_meta.columns = [t[-1] for t in cols_identifies] -df_meta - -# %% [markdown] -# Replace `tmp/` with `./` (artefact) - -# %% -df_meta['Pathname'] = df_meta['Pathname'].str.replace('tmp/', './') - -# %% -df_meta["Instrument_name"] = ( - df_meta["Thermo Scientific instrument model"].str.replace(' ', '-') - + '_' - + df_meta["instrument serial number"].str.split('#').str[-1] -).str.replace(' ', '-') - -df_meta["Instrument_name"].value_counts().index - -# %% [markdown] -# Create new sample identifier - -# %% -date_col = "Content Creation Date" -idx_all = (pd.to_datetime(df_meta[date_col]).dt.strftime("%Y_%m_%d_%H_%M") - + '_' - + df_meta["Instrument_name"] -).str.replace(' ', '-') - -mask = idx_all.duplicated(keep=False) -duplicated_sample_idx = idx_all.loc[mask].sort_values() # duplicated dumps -duplicated_sample_idx - -# %% -df_meta['new_sample_id'] = idx_all - - - -_n = df_meta.groupby("new_sample_id").cumcount().astype('string').str.replace('0', '') -_n[_n != ''] = '_r' + _n[_n != ''] -_n.value_counts() - -df_meta.loc[mask, "new_sample_id"] = 
df_meta.loc[mask, "new_sample_id"] + _n - - -df_meta.loc[mask, ["Pathname", "new_sample_id"]] - -# %% -df_meta.loc[~mask, ["Pathname", "new_sample_id"]] - - -# %% -assert df_meta["Pathname"].is_unique -assert df_meta["new_sample_id"].is_unique - -# %% [markdown] -# ### Save new paths to disk - -# %% -df_meta["Path_old"] = df_meta["Pathname"] - -df_meta[["Path_old", "new_sample_id"]] - -# %% -df_meta - -# %% [markdown] -# ## Selected Files - -# %% -with open(fn_files_selected) as f: - files_selected = yaml.safe_load(f) -print(f'Threshold: {files_selected["threshold"]:,d}') - -# %% -df_meta.loc[files_selected["files"]] - -# %% -mask = idx_all.duplicated() -selected = df_meta.loc[~mask].index.intersection(files_selected["files"]) -df_meta.loc[selected] - - -# %% -def build_instrument_name(s): - """Process in order, only keep one name""" - ret = '' - used_before = set() - for string_w_withspaces in s: - strings_ = string_w_withspaces.split() - for string_ in strings_: - if string_ not in used_before: - ret += f'_{string_}' - used_before |= set(strings_) - ret = (ret[1:] # remove _ from start - .replace('Slot_#', '') - .replace('slot_#', '') - ) - return ret - - -(df_meta[ - [ - "Thermo Scientific instrument model", - "instrument attribute", - "instrument serial number", - ] - ] - .sample(20) - .apply(build_instrument_name, axis=1) -) - -# %% -fname = out_folder / 'selected_old_new_id_mapping.csv' -files_out[fname.name] = fname.as_posix() -df_meta.loc[selected].to_csv(fname) -fname - -# %% [markdown] -# ### OS rename - -# %% -df_meta.loc[selected][["Path_old", "new_sample_id"]] - -# %% -(df_meta - .loc[selected, "Path_old"] - .iloc[:3] - .to_csv(out_folder / 'rawfiles_to_checksum.txt', - index=False, - header=False) - ) - -# %% [markdown] -# Save summaries for selected files - -# %% -df_summaries = pd.read_csv(fn_mq_summaries, index_col=0) -df_summaries = df_summaries.loc[selected].rename(df_meta.loc[selected, 'new_sample_id']) -df_summaries.to_csv(out_folder / 
'mq_summaries.csv') -del df_summaries - -# %% [markdown] -# ## Put files on PRgIDE FTP server -# -# rename using `new_sample_id` - -# %% [markdown] -# ### LFTP commands - raw files -# -# `-f` option allows to pass commands from a file -# One needs to at least an `open` as the first line to log in to an ftp server -# For pride one needs to additionally `cd` to the correct folder: -# ```bash -# > open ... -# > cd ... -# ``` -# to allow parallell commands, use the runtime setting -# ```bash -# >>> cat ~/.lftprc -# set cmd:parallel 2 -# ``` - -# %% [markdown] -# Create folders on pride for raw files - -# %% -df_meta["folder_raw"] = "./raw_files/" + df_meta["Instrument_name"] -df_meta["folder_raw"].unique() - -fname = out_folder / 'raw_file_directories.txt' - -commands = 'mkdir -p ' + df_meta.loc[selected, "folder_raw"].drop_duplicates() -commands.to_csv(fname, header=False, index=False) - -# %% [markdown] -# Create upload commands of raw files to create folders (could be combined with above) - -# %% -commands = df_meta.loc[selected] -commands = ( - 'put ' - + commands['Path_old'].astype('string') - + ' -o ' - + "./raw_files/" - + commands["Instrument_name"] - + '/' - + commands['new_sample_id'] + '.raw' -) -print(commands.sample(10).to_csv(sep=' ', header=False, index=False)) - - -# %% [markdown] -# write all to file - -# %% -fname = out_folder / 'lftp_commands_rawfiles.txt' -commands.to_csv(fname, header=False, index=False) - -# %% [markdown] -# ### LFTP commands - MaxQuant output - -# %% [markdown] -# Create upload commands of MaxQuant output folders to pride using mirror -# -# - `mq_out` folder -# - move from `Sample ID` folder into `new_sample_id` on erda - -# %% -commands = df_meta.loc[selected] -commands = ( - "mirror -R --only-missing --log log_lftp_mirror.log --exclude-glob *.pdf " # command - + "mq_out/" + commands.index # source - + " ./MQ_tables/" + commands["Instrument_name"]+ "/" + commands["new_sample_id"] # dest -) - 
-print(commands.sample(10).to_csv(header=False, index=False)) - -# %% [markdown] -# write all to file - -# %% -fname = out_folder / 'lftp_commands_mq_output.txt' -commands.to_csv(fname, header=False, index=False) - diff --git a/project/00_0_1_check_filesizes.ipynb b/project/00_0_1_check_filesizes.ipynb deleted file mode 100644 index 36b937a6b..000000000 --- a/project/00_0_1_check_filesizes.ipynb +++ /dev/null @@ -1,398 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "f581713e", - "metadata": {}, - "source": [ - "# Check if filesizes of local and uploaded files match\n", - "- could be replaced with checksums, but it's too slow on erda\n", - "- instead: compare if filesizes in bytes more or less match (tolerance of 5 bytes)\n", - "\n", - "many things could be refactored in case a tool should be created from this" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e3e033c1", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "from collections import namedtuple\n", - "from pathlib import Path, PurePosixPath\n", - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7ad76c5a", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "# Parameters\n", - "FOLDER = Path('data/rename')\n", - "fname_mq_out_pride = FOLDER / 'mq_out_filesizes_pride.log'\n", - "fname_mq_out_erda = FOLDER / 'mq_out_filesizes_erda.log'\n", - "fname_rawfiles_pride = FOLDER / 'rawfiles_filesizes_pride.log'\n", - "fname_rawfiles_erda = FOLDER / 'rawfiles_filesizes_erda.log'\n", - "fname_filenames_mapping = FOLDER / 'selected_old_new_id_mapping.csv'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eb2fab6a", - "metadata": {}, - "outputs": [], - "source": [ - "df_meta = pd.read_csv(fname_filenames_mapping, index_col='Path_old')\n", - "df_meta" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1f1a85ce", - "metadata": {}, - 
"outputs": [], - "source": [ - "df_meta['path_pride'] = 'raw_files/' + df_meta['Instrument_name'] + '/' + df_meta[\"new_sample_id\"] + '.raw'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cd5f1ae0", - "metadata": {}, - "outputs": [], - "source": [ - "entries = list()\n", - "Entry = namedtuple('Entry', 'size_erda fname name_erda')\n", - "with open(fname_rawfiles_erda) as f:\n", - " for line in f:\n", - " size, fname = line.strip().split('\\t')\n", - " fname = PurePosixPath(fname)\n", - " if fname.suffix:\n", - " entry = Entry(int(size), str(fname).replace('share_hela_raw/', './'), fname.name)\n", - " if entry.fname in df_meta.index:\n", - " entries.append(entry)\n", - "print(f\"{len(entries) =: }\")\n", - "entries[:3]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0494dd62", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "entries = pd.DataFrame(entries).set_index('fname')\n", - "entries = (entries\n", - " .join(df_meta.loc[entries.index, 'path_pride'])\n", - " .reset_index()\n", - " .set_index('path_pride')\n", - " .sort_index())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "27b0c5ff", - "metadata": {}, - "outputs": [], - "source": [ - "entries_pride = list()\n", - "Entry = namedtuple('Entry', ['size_pride', 'path_pride', 'name_pride', 'instrument'])\n", - "with open(fname_rawfiles_pride) as f:\n", - " for line in f:\n", - " size, fname = line.strip().split()\n", - " fname = PurePosixPath(fname)\n", - " if fname.suffix:\n", - " entry = Entry(int(size), str(fname), fname.name, fname.parent.name)\n", - " entries_pride.append(entry)\n", - "print(f\"{len(entries_pride) =: }\")\n", - "entries_pride[:3]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "507995c3", - "metadata": {}, - "outputs": [], - "source": [ - "entries_pride = pd.DataFrame(entries_pride).set_index('path_pride').sort_index()\n", - "entries_pride" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "id": "b37640f7", - "metadata": {}, - "outputs": [], - "source": [ - "entries = entries.join(entries_pride, on='path_pride', how='left')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "720feda4", - "metadata": {}, - "outputs": [], - "source": [ - "mask = (entries['size_pride'] - entries['size_erda']).abs() > 5\n", - "to_redo = entries.loc[mask].reset_index()\n", - "to_redo " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8aff71d9", - "metadata": {}, - "outputs": [], - "source": [ - "commands = 'put ' + to_redo['fname'] + ' -o ' + to_redo['path_pride']\n", - "print(commands.to_csv(header=False, index=False))" - ] - }, - { - "cell_type": "markdown", - "id": "b6087751", - "metadata": {}, - "source": [ - "## Check MaxQuant output filesizes " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4290a2b6", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "df_meta = df_meta.reset_index().set_index('Sample ID')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a32ac1ba", - "metadata": {}, - "outputs": [], - "source": [ - "files = list()\n", - "folder = set()\n", - "Entry = namedtuple('Entry', 'size_erda path_erda id_old filename')\n", - "with open(fname_mq_out_erda) as f:\n", - " for line in f:\n", - " size, fname = line.strip().split('\\t')\n", - " fname = PurePosixPath(fname)\n", - " if fname.suffix and fname.suffix != '.pdf':\n", - " entry = Entry(int(size), str(fname), fname.parent.name, fname.name)\n", - " if entry.id_old in df_meta.index:\n", - " files.append(entry)\n", - " if entry.id_old not in folder:\n", - " folder.add(entry.id_old)\n", - " \n", - "print(f\"{len(folder) =: }\")\n", - "print(f\"{len(files) =: }\")\n", - "files[:3]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d63f458a", - "metadata": {}, - "outputs": [], - "source": [ - "files = 
pd.DataFrame(files).set_index('id_old')\n", - "files = files.join(df_meta[['Instrument_name', 'new_sample_id']])\n", - "files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "082e1cc9", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "files['path_pride'] = ('MQ_tables/'\n", - " + files['Instrument_name']\n", - " + '/' \n", - " + files[\"new_sample_id\"]\n", - " + '/'\n", - " + files[\"filename\"])\n", - "files['path_pride'].iloc[:4].to_list()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4a27d1e1", - "metadata": {}, - "outputs": [], - "source": [ - "files['filename'].value_counts() # except mqpar.xml all present on erda" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "111c6607", - "metadata": {}, - "outputs": [], - "source": [ - "files_pride = list()\n", - "Entry = namedtuple('Entry', ['size_pride', 'path_pride', 'id_new', 'instrument'])\n", - "with open(fname_mq_out_pride) as f:\n", - " for line in f:\n", - " size, fname = line.strip().split('\\t')\n", - " fname = PurePosixPath(fname)\n", - " if fname.suffix:\n", - " entry = Entry(int(size), str(fname), fname.parent.name, fname.parent.parent.name)\n", - " files_pride.append(entry)\n", - "print(f\"{len(files_pride) =: }\")\n", - "files_pride[:3]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b7776e97", - "metadata": { - "lines_to_next_cell": 0 - }, - "outputs": [], - "source": [ - "files_pride = pd.DataFrame(files_pride).set_index('path_pride')\n", - "files_pride" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d82d3a19", - "metadata": { - "lines_to_next_cell": 0 - }, - "outputs": [], - "source": [ - "files = files.set_index('path_pride').join(files_pride, how='left')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "93722b9a", - "metadata": { - "lines_to_next_cell": 0 - }, - "outputs": [], - "source": [ - "missing_on_pride = 
files.loc[files['size_pride'].isna()]\n", - "missing_on_pride" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "42059238", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "missing_on_pride['filename'].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "466e2ba8", - "metadata": { - "lines_to_next_cell": 0 - }, - "outputs": [], - "source": [ - "files['size_diff'] = files['size_pride'] - files['size_erda']\n", - "files['size_diff'].abs().describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f7e03424", - "metadata": {}, - "outputs": [], - "source": [ - "files_redo = files.loc[files['size_diff'].abs() > 5]\n", - "files_redo" - ] - }, - { - "cell_type": "markdown", - "id": "59bbd393", - "metadata": {}, - "source": [ - "ensure quoted paths as they might contain whitespaces" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3fc22aef", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "to_do = pd.concat([missing_on_pride, files_redo])\n", - "commands = 'put -e \\'' + to_do['path_erda'] + \"' -o '\" + to_do.index + \"'\"\n", - "commands.to_csv(FOLDER / 'mq_out_remaining.txt', header=False, index=False)" - ] - } - ], - "metadata": { - "jupytext": { - "cell_metadata_filter": "-all", - "notebook_metadata_filter": "-all" - }, - "kernelspec": { - "display_name": "vaep", - "language": "python", - "name": "vaep" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/project/00_0_1_check_filesizes.py b/project/00_0_1_check_filesizes.py deleted file mode 100644 index 64fc10c7f..000000000 --- a/project/00_0_1_check_filesizes.py +++ /dev/null @@ -1,166 
+0,0 @@ -# %% [markdown] -# # Check if filesizes of local and uploaded files match -# - could be replaced with checksums, but it's too slow on erda -# - instead: compare if filesizes in bytes more or less match (tolerance of 5 bytes) -# -# many things could be refactored in case a tool should be created from this - -# %% -from collections import namedtuple -from pathlib import Path, PurePosixPath -import pandas as pd - - -# %% -# Parameters -FOLDER = Path('data/rename') -fname_mq_out_pride = FOLDER / 'mq_out_filesizes_pride.log' -fname_mq_out_erda = FOLDER / 'mq_out_filesizes_erda.log' -fname_rawfiles_pride = FOLDER / 'rawfiles_filesizes_pride.log' -fname_rawfiles_erda = FOLDER / 'rawfiles_filesizes_erda.log' -fname_filenames_mapping = FOLDER / 'selected_old_new_id_mapping.csv' - - -# %% -df_meta = pd.read_csv(fname_filenames_mapping, index_col='Path_old') -df_meta - -# %% -df_meta['path_pride'] = 'raw_files/' + df_meta['Instrument_name'] + '/' + df_meta["new_sample_id"] + '.raw' - -# %% -entries = list() -Entry = namedtuple('Entry', 'size_erda fname name_erda') -with open(fname_rawfiles_erda) as f: - for line in f: - size, fname = line.strip().split('\t') - fname = PurePosixPath(fname) - if fname.suffix: - entry = Entry(int(size), str(fname).replace('share_hela_raw/', './'), fname.name) - if entry.fname in df_meta.index: - entries.append(entry) -print(f"{len(entries) =: }") -entries[:3] - -# %% -entries = pd.DataFrame(entries).set_index('fname') -entries = (entries - .join(df_meta.loc[entries.index, 'path_pride']) - .reset_index() - .set_index('path_pride') - .sort_index()) - - -# %% -entries_pride = list() -Entry = namedtuple('Entry', ['size_pride', 'path_pride', 'name_pride', 'instrument']) -with open(fname_rawfiles_pride) as f: - for line in f: - size, fname = line.strip().split() - fname = PurePosixPath(fname) - if fname.suffix: - entry = Entry(int(size), str(fname), fname.name, fname.parent.name) - entries_pride.append(entry) -print(f"{len(entries_pride) =: 
}") -entries_pride[:3] - -# %% -entries_pride = pd.DataFrame(entries_pride).set_index('path_pride').sort_index() -entries_pride - -# %% -entries = entries.join(entries_pride, on='path_pride', how='left') - -# %% -mask = (entries['size_pride'] - entries['size_erda']).abs() > 5 -to_redo = entries.loc[mask].reset_index() -to_redo - -# %% -commands = 'put ' + to_redo['fname'] + ' -o ' + to_redo['path_pride'] -print(commands.to_csv(header=False, index=False)) - -# %% [markdown] -# ## Check MaxQuant output filesizes - -# %% -df_meta = df_meta.reset_index().set_index('Sample ID') - - -# %% -files = list() -folder = set() -Entry = namedtuple('Entry', 'size_erda path_erda id_old filename') -with open(fname_mq_out_erda) as f: - for line in f: - size, fname = line.strip().split('\t') - fname = PurePosixPath(fname) - if fname.suffix and fname.suffix != '.pdf': - entry = Entry(int(size), str(fname), fname.parent.name, fname.name) - if entry.id_old in df_meta.index: - files.append(entry) - if entry.id_old not in folder: - folder.add(entry.id_old) - -print(f"{len(folder) =: }") -print(f"{len(files) =: }") -files[:3] - -# %% -files = pd.DataFrame(files).set_index('id_old') -files = files.join(df_meta[['Instrument_name', 'new_sample_id']]) -files - -# %% -files['path_pride'] = ('MQ_tables/' - + files['Instrument_name'] - + '/' - + files["new_sample_id"] - + '/' - + files["filename"]) -files['path_pride'].iloc[:4].to_list() - - -# %% -files['filename'].value_counts() # except mqpar.xml all present on erda - -# %% -files_pride = list() -Entry = namedtuple('Entry', ['size_pride', 'path_pride', 'id_new', 'instrument']) -with open(fname_mq_out_pride) as f: - for line in f: - size, fname = line.strip().split('\t') - fname = PurePosixPath(fname) - if fname.suffix: - entry = Entry(int(size), str(fname), fname.parent.name, fname.parent.parent.name) - files_pride.append(entry) -print(f"{len(files_pride) =: }") -files_pride[:3] - -# %% -files_pride = 
pd.DataFrame(files_pride).set_index('path_pride') -files_pride -# %% -files = files.set_index('path_pride').join(files_pride, how='left') -# %% -missing_on_pride = files.loc[files['size_pride'].isna()] -missing_on_pride -# %% -missing_on_pride['filename'].value_counts() - - -# %% -files['size_diff'] = files['size_pride'] - files['size_erda'] -files['size_diff'].abs().describe() -# %% -files_redo = files.loc[files['size_diff'].abs() > 5] -files_redo - -# %% [markdown] -# ensure quoted paths as they might contain whitespaces - -# %% -to_do = pd.concat([missing_on_pride, files_redo]) -commands = 'put -e \'' + to_do['path_erda'] + "' -o '" + to_do.index + "'" -commands.to_csv(FOLDER / 'mq_out_remaining.txt', header=False, index=False) - diff --git a/project/00_0_2_mqout_renaming.ipynb b/project/00_0_2_mqout_renaming.ipynb deleted file mode 100644 index 2f56c1303..000000000 --- a/project/00_0_2_mqout_renaming.ipynb +++ /dev/null @@ -1,165 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "349432fa", - "metadata": {}, - "source": [ - "# Rename file names in MaxQuant output files\n", - "renaming the folder of outputs does not delete all occurences of the names\n", - "in the text files. 
This needs to be done manually by the PRIDE team using a shell script\n", - "that uses `sed` to replace the old names with the new ones.\n", - "\n", - "uses the list of output as stored on pride dropbox server and meta data of old and new name" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8a10bc6a", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "from collections import defaultdict\n", - "from pathlib import Path, PurePosixPath\n", - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "614edb34", - "metadata": {}, - "outputs": [], - "source": [ - "FOLDER = Path('data/rename')\n", - "meta_in = FOLDER / 'selected_old_new_id_mapping.csv'\n", - "fn_server_log: str = 'data/rename/mq_out_server.log' # server log of all uploaded files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d67125f5", - "metadata": {}, - "outputs": [], - "source": [ - "df_meta = pd.read_csv(meta_in, index_col='new_sample_id')\n", - "df_meta" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f1acf13f", - "metadata": { - "title": "[makrdown]" - }, - "outputs": [], - "source": [ - "# ## Create commands to rename file names in text files itself\n", - "# - only subset of files contain original file names on exection of MaxQuant" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e700c55f", - "metadata": {}, - "outputs": [], - "source": [ - "files_types = [\"modificationSpecificPeptides.txt\",\n", - "\"mqpar.xml\",\n", - "\"mzRange.txt\",\n", - "\"Oxidation (M)Sites.txt\",\n", - "\"summary.txt\",]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2c31e15b", - "metadata": {}, - "outputs": [], - "source": [ - "name_lookup = df_meta[\"Sample ID\"].reset_index().set_index(\"new_sample_id\")\n", - "name_lookup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4fff3884", - "metadata": {}, - 
"outputs": [], - "source": [ - "to_rename = list()\n", - "command_template = 'sed -i \"s/{old_name}/{new_name}/g\" \"{fn}\"'\n", - "counter = defaultdict(int)\n", - "\n", - "with open(fn_server_log) as f:\n", - " for line in f:\n", - " fname = PurePosixPath(line.strip())\n", - " if fname.name in files_types:\n", - " new_name = fname.parent.name\n", - " old_name = name_lookup.loc[new_name, 'Sample ID']\n", - " command = command_template.format(old_name=old_name,\n", - " new_name=new_name,\n", - " fn=fname)\n", - " to_rename.append(command)\n", - " \n", - " counter[fname.name] += 1\n", - "len(to_rename)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "87426e9a", - "metadata": {}, - "outputs": [], - "source": [ - "# mqpar.xml missing in some folders\n", - "pd.Series(counter) # maybe one folder has some missing?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "46727631", - "metadata": {}, - "outputs": [], - "source": [ - "with open(FOLDER / 'sed_rename_commands.sh', 'w') as f:\n", - " f.writelines('\\n'.join(to_rename))" - ] - } - ], - "metadata": { - "jupytext": { - "cell_metadata_filter": "title,-all", - "notebook_metadata_filter": "-all" - }, - "kernelspec": { - "display_name": "vaep", - "language": "python", - "name": "vaep" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/project/00_0_2_mqout_renaming.py b/project/00_0_2_mqout_renaming.py deleted file mode 100644 index b61e127e9..000000000 --- a/project/00_0_2_mqout_renaming.py +++ /dev/null @@ -1,64 +0,0 @@ -# %% [markdown] -# # Rename file names in MaxQuant output files -# renaming the folder of outputs does not delete all occurences of the names -# in the text files. 
This needs to be done manually by the PRIDE team using a shell script -# that uses `sed` to replace the old names with the new ones. -# -# uses the list of output as stored on pride dropbox server and meta data of old and new name - -# %% -from collections import defaultdict -from pathlib import Path, PurePosixPath -import pandas as pd - - -# %% -FOLDER = Path('data/rename') -meta_in = FOLDER / 'selected_old_new_id_mapping.csv' -fn_server_log: str = 'data/rename/mq_out_server.log' # server log of all uploaded files - -# %% -df_meta = pd.read_csv(meta_in, index_col='new_sample_id') -df_meta - -# %% [makrdown] -# ## Create commands to rename file names in text files itself -# - only subset of files contain original file names on exection of MaxQuant - -# %% -files_types = ["modificationSpecificPeptides.txt", -"mqpar.xml", -"mzRange.txt", -"Oxidation (M)Sites.txt", -"summary.txt",] - -# %% -name_lookup = df_meta["Sample ID"].reset_index().set_index("new_sample_id") -name_lookup - -# %% -to_rename = list() -command_template = 'sed -i "s/{old_name}/{new_name}/g" "{fn}"' -counter = defaultdict(int) - -with open(fn_server_log) as f: - for line in f: - fname = PurePosixPath(line.strip()) - if fname.name in files_types: - new_name = fname.parent.name - old_name = name_lookup.loc[new_name, 'Sample ID'] - command = command_template.format(old_name=old_name, - new_name=new_name, - fn=fname) - to_rename.append(command) - - counter[fname.name] += 1 -len(to_rename) - -# %% -# mqpar.xml missing in some folders -pd.Series(counter) # maybe one folder has some missing? 
- -# %% -with open(FOLDER / 'sed_rename_commands.sh', 'w') as f: - f.writelines('\n'.join(to_rename)) diff --git a/project/00_0_3_create_sdrf.ipynb b/project/00_0_3_create_sdrf.ipynb deleted file mode 100644 index e08e3e973..000000000 --- a/project/00_0_3_create_sdrf.ipynb +++ /dev/null @@ -1,208 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "da553f6b", - "metadata": {}, - "source": [ - "# Create SDRF file\n", - "- [example](https://github.com/bigbio/proteomics-sample-metadata/blob/6f31044f0bcf545ae2da6e853f8ccad011ea4703/annotated-projects/PXD000895/PXD000895.sdrf.tsv)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "85a2d719", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "from pathlib import Path\n", - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2e9e102d", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "fn_sdrf_cellline_template = Path('data') / 'sdrf-cell-line-template.tsv'\n", - "fn_meta = Path('data/rename') / 'selected_old_new_id_mapping.csv'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8426e2f1", - "metadata": {}, - "outputs": [], - "source": [ - "df_meta = pd.read_csv(fn_meta, index_col='new_sample_id')\n", - "df_meta" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6712b0d9", - "metadata": { - "lines_to_next_cell": 0 - }, - "outputs": [], - "source": [ - "sdrf = pd.DataFrame() # pd.read_table(fn_sdrf_cellline_template)\n", - "sdrf['source name'] = df_meta.index\n", - "sdrf = sdrf.set_index('source name')\n", - "sdrf['characteristics[organism]'] = 'Homo sapiens'\n", - "sdrf['characteristics[organism part]'] = 'cervex'\n", - "sdrf['characteristics[ancestry category]'] = 'Black'\n", - "sdrf['characteristics[age]'] = '31Y'\n", - "sdrf['characteristics[developmental stage]'] = 'adult'\n", - "sdrf['characteristics[sex]'] = 'female'\n", - 
"sdrf['characteristics[cell line]'] = 'HeLa cells'\n", - "sdrf['characteristics[cell type]'] = 'epithelial'\n", - "sdrf['characteristics[disease]'] = 'adenocarcinoma'\n", - "sdrf['characteristics[cell line]'] = 'HeLa cells'\n", - "sdrf['characteristics[biological replicate]'] = 1\n", - "sdrf['assay name'] = sdrf.index\n", - "sdrf['technology type'] = 'proteomic profiling by mass spectrometer'\n", - "sdrf['comment[technical replicate]'] = range(0, len(sdrf))\n", - "sdrf['comment[data file]'] = sdrf.index + '.raw'\n", - "sdrf['comment[fraction identifier]'] = 1\n", - "sdrf['comment[label]'] = 'NT=label free sample;AC=MS:1002038' # To check\n", - "sdrf['comment[cleavage agent details]'] = 'NT=Trypsin;AC=MS:1001251'\n", - "sdrf['comment[instrument]'] = df_meta['Instrument_name']\n", - "\n", - "sdrf" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c14ca4f0", - "metadata": {}, - "outputs": [], - "source": [ - "# based on https://www.ebi.ac.uk/ols4/\n", - "#\n", - "# Q Exactive HF-X MS:1002877\n", - "# Q Exactive HF MS:1002523\n", - "# Orbitrap Exploris 480 MS:1003028\n", - "# Exactive Plus MS:1002526\n", - "# Q Exactive MS:1001911\n", - "# Orbitrap Fusion Lumos MS:1002732\n", - "\n", - "\n", - "instrument_ms_mapping = {\n", - " 'Q-Exactive-HF-X-Orbitrap_6070': 'NT=Q Exactive HF-X;AC=MS:1002877:',\n", - " 'Q-Exactive-HF-X-Orbitrap_6071': 'NT=Q Exactive HF-X;AC=MS:1002877:',\n", - " 'Q-Exactive-HF-X-Orbitrap_6075': 'NT=Q Exactive HF-X;AC=MS:1002877:',\n", - " 'Q-Exactive-HF-X-Orbitrap_6101': 'NT=Q Exactive HF-X;AC=MS:1002877:',\n", - " 'Q-Exactive-HF-Orbitrap_207': 'NT=Q Exactive HF;AC=MS:1002523',\n", - " 'Q-Exactive-HF-X-Orbitrap_6096': 'NT=Q Exactive HF-X;AC=MS:1002877:',\n", - " 'Q-Exactive-HF-X-Orbitrap_6078': 'NT=Q Exactive HF-X;AC=MS:1002877:',\n", - " 'Q-Exactive-HF-Orbitrap_147': 'NT=Q Exactive HF;AC=MS:1002523',\n", - " 'Q-Exactive-Orbitrap_1': 'NT=Q Exactive;AC=MS:1001911',\n", - " 'Q-Exactive-HF-Orbitrap_143': 'NT=Q Exactive 
HF;AC=MS:1002523',\n", - " 'Q-Exactive-HF-Orbitrap_204': 'NT=Q Exactive HF;AC=MS:1002523',\n", - " 'Q-Exactive-HF-X-Orbitrap_6011': 'NT=Q Exactive HF-X;AC=MS:1002877:',\n", - " 'Q-Exactive-HF-Orbitrap_206': 'NT=Q Exactive HF;AC=MS:1002523',\n", - " 'Q-Exactive-HF-X-Orbitrap_6073': 'NT=Q Exactive HF-X;AC=MS:1002877:',\n", - " 'Q-Exactive-HF-Orbitrap_1': 'NT=Q Exactive HF;AC=MS:1002523',\n", - " 'Q-Exactive-HF-Orbitrap_148': 'NT=Q Exactive HF;AC=MS:1002523',\n", - " 'Orbitrap-Fusion-Lumos_FSN20115': 'NT=Orbitrap Fusion Lumos;AC=MS:1002732',\n", - " 'Q-Exactive-HF-X-Orbitrap_6016': 'NT=Q Exactive HF-X;AC=MS:1002877:',\n", - " 'Q-Exactive-HF-X-Orbitrap_6004': 'NT=Q Exactive HF-X;AC=MS:1002877:',\n", - " 'Orbitrap-Exploris-480_MA10132C': 'NT=Orbitrap Exploris 480;AC=MS:1003028',\n", - " 'Q-Exactive-HF-X-Orbitrap_6028': 'NT=Q Exactive HF-X;AC=MS:1002877:',\n", - " 'Q-Exactive-HF-X-Orbitrap_6044': 'NT=Q Exactive HF-X;AC=MS:1002877:',\n", - " 'Q-Exactive-HF-X-Orbitrap_6025': 'NT=Q Exactive HF-X;AC=MS:1002877:',\n", - " 'Q-Exactive-HF-X-Orbitrap_6324': 'NT=Q Exactive HF-X;AC=MS:1002877:',\n", - " 'Orbitrap-Exploris-480_MA10134C': 'NT=Orbitrap Exploris 480;AC=MS:1003028',\n", - " 'Q-Exactive-HF-X-Orbitrap_6022': 'NT=Q Exactive HF-X;AC=MS:1002877:',\n", - " 'Q-Exactive-HF-X-Orbitrap_6043': 'NT=Q Exactive HF-X;AC=MS:1002877:',\n", - " 'Q-Exactive-HF-X-Orbitrap_6013': 'NT=Q Exactive HF-X;AC=MS:1002877:',\n", - " 'Q-Exactive-HF-X-Orbitrap_6023': 'NT=Q Exactive HF-X;AC=MS:1002877:',\n", - " 'Exactive-Series-Orbitrap_6004': 'NT=Q Exactive;AC=MS:1001911',\n", - " 'Orbitrap-Exploris-480_Invalid_SN_0001': 'NT=Orbitrap Exploris 480;AC=MS:1003028',\n", - " 'Orbitrap-Exploris-480_MA10215C': 'NT=Orbitrap Exploris 480;AC=MS:1003028',\n", - " 'Q-Exactive-HF-Orbitrap_2612': 'NT=Q Exactive HF;AC=MS:1002523',\n", - " 'Q-Exactive-Plus-Orbitrap_1': 'NT=Exactive Plus;AC=MS:1002526',\n", - " 'Q-Exactive-Plus-Orbitrap_143': 'NT=Exactive Plus;AC=MS:1002526',\n", - " 
'Orbitrap-Exploris-480_MA10130C': 'NT=Orbitrap Exploris 480;AC=MS:1003028',\n", - "}\n", - "sdrf['comment[instrument]'] = sdrf['comment[instrument]'].replace(\n", - " instrument_ms_mapping)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "57c38cb7", - "metadata": {}, - "outputs": [], - "source": [ - "# change order: The column `technology type`` cannot be before the `assay name`` -- ERROR\n", - "# template has wrong order (open PR)\n", - "# -> done now above\n", - "# order = ['characteristics[organism]',\n", - "# 'characteristics[organism part]',\n", - "# 'characteristics[ancestry category]',\n", - "# 'characteristics[cell type]',\n", - "# 'characteristics[disease]',\n", - "# 'characteristics[cell line]',\n", - "# 'characteristics[biological replicate]',\n", - "# 'assay name',\n", - "# 'technology type',\n", - "# 'comment[technical replicate]',\n", - "# 'comment[data file]',\n", - "# 'comment[fraction identifier]',\n", - "# 'comment[label]',\n", - "# 'comment[cleavage agent details]',\n", - "# 'comment[instrument]']\n", - "\n", - "# sdrf = sdrf[order]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d81cff5f", - "metadata": { - "lines_to_next_cell": 0 - }, - "outputs": [], - "source": [ - "fname = Path('data') / 'dev_datasets' / 'Experimental-Design.sdrf.tsv'\n", - "sdrf.to_csv(fname, sep='\\t')\n", - "fname" - ] - }, - { - "cell_type": "markdown", - "id": "b80c5166", - "metadata": {}, - "source": [ - "## Validate SDRF file\n", - "```\n", - "pip install sdrf-pipelines\n", - "parse_sdrf validate-sdrf --sdrf_file project\\data\\dev_datasets\\sdrf.tsv\n", - "```" - ] - } - ], - "metadata": { - "jupytext": { - "cell_metadata_filter": "-all", - "main_language": "python", - "notebook_metadata_filter": "-all" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/project/00_0_3_create_sdrf.py b/project/00_0_3_create_sdrf.py deleted file mode 100644 index 14e033b34..000000000 --- a/project/00_0_3_create_sdrf.py +++ 
/dev/null @@ -1,127 +0,0 @@ -# %% [markdown] -# # Create SDRF file -# - [example](https://github.com/bigbio/proteomics-sample-metadata/blob/6f31044f0bcf545ae2da6e853f8ccad011ea4703/annotated-projects/PXD000895/PXD000895.sdrf.tsv) - -# %% -from pathlib import Path -import pandas as pd - - -# %% -fn_sdrf_cellline_template = Path('data') / 'sdrf-cell-line-template.tsv' -fn_meta = Path('data/rename') / 'selected_old_new_id_mapping.csv' - - -# %% -df_meta = pd.read_csv(fn_meta, index_col='new_sample_id') -df_meta - -# %% -sdrf = pd.DataFrame() # pd.read_table(fn_sdrf_cellline_template) -sdrf['source name'] = df_meta.index -sdrf = sdrf.set_index('source name') -sdrf['characteristics[organism]'] = 'Homo sapiens' -sdrf['characteristics[organism part]'] = 'cervex' -sdrf['characteristics[ancestry category]'] = 'Black' -sdrf['characteristics[age]'] = '31Y' -sdrf['characteristics[developmental stage]'] = 'adult' -sdrf['characteristics[sex]'] = 'female' -sdrf['characteristics[cell line]'] = 'HeLa cells' -sdrf['characteristics[cell type]'] = 'epithelial' -sdrf['characteristics[disease]'] = 'adenocarcinoma' -sdrf['characteristics[cell line]'] = 'HeLa cells' -sdrf['characteristics[biological replicate]'] = 1 -sdrf['assay name'] = sdrf.index -sdrf['technology type'] = 'proteomic profiling by mass spectrometer' -sdrf['comment[technical replicate]'] = range(0, len(sdrf)) -sdrf['comment[data file]'] = sdrf.index + '.raw' -sdrf['comment[fraction identifier]'] = 1 -sdrf['comment[label]'] = 'NT=label free sample;AC=MS:1002038' # To check -sdrf['comment[cleavage agent details]'] = 'NT=Trypsin;AC=MS:1001251' -sdrf['comment[instrument]'] = df_meta['Instrument_name'] - -sdrf -# %% -# based on https://www.ebi.ac.uk/ols4/ -# -# Q Exactive HF-X MS:1002877 -# Q Exactive HF MS:1002523 -# Orbitrap Exploris 480 MS:1003028 -# Exactive Plus MS:1002526 -# Q Exactive MS:1001911 -# Orbitrap Fusion Lumos MS:1002732 - - -instrument_ms_mapping = { - 'Q-Exactive-HF-X-Orbitrap_6070': 'NT=Q Exactive 
HF-X;AC=MS:1002877:', - 'Q-Exactive-HF-X-Orbitrap_6071': 'NT=Q Exactive HF-X;AC=MS:1002877:', - 'Q-Exactive-HF-X-Orbitrap_6075': 'NT=Q Exactive HF-X;AC=MS:1002877:', - 'Q-Exactive-HF-X-Orbitrap_6101': 'NT=Q Exactive HF-X;AC=MS:1002877:', - 'Q-Exactive-HF-Orbitrap_207': 'NT=Q Exactive HF;AC=MS:1002523', - 'Q-Exactive-HF-X-Orbitrap_6096': 'NT=Q Exactive HF-X;AC=MS:1002877:', - 'Q-Exactive-HF-X-Orbitrap_6078': 'NT=Q Exactive HF-X;AC=MS:1002877:', - 'Q-Exactive-HF-Orbitrap_147': 'NT=Q Exactive HF;AC=MS:1002523', - 'Q-Exactive-Orbitrap_1': 'NT=Q Exactive;AC=MS:1001911', - 'Q-Exactive-HF-Orbitrap_143': 'NT=Q Exactive HF;AC=MS:1002523', - 'Q-Exactive-HF-Orbitrap_204': 'NT=Q Exactive HF;AC=MS:1002523', - 'Q-Exactive-HF-X-Orbitrap_6011': 'NT=Q Exactive HF-X;AC=MS:1002877:', - 'Q-Exactive-HF-Orbitrap_206': 'NT=Q Exactive HF;AC=MS:1002523', - 'Q-Exactive-HF-X-Orbitrap_6073': 'NT=Q Exactive HF-X;AC=MS:1002877:', - 'Q-Exactive-HF-Orbitrap_1': 'NT=Q Exactive HF;AC=MS:1002523', - 'Q-Exactive-HF-Orbitrap_148': 'NT=Q Exactive HF;AC=MS:1002523', - 'Orbitrap-Fusion-Lumos_FSN20115': 'NT=Orbitrap Fusion Lumos;AC=MS:1002732', - 'Q-Exactive-HF-X-Orbitrap_6016': 'NT=Q Exactive HF-X;AC=MS:1002877:', - 'Q-Exactive-HF-X-Orbitrap_6004': 'NT=Q Exactive HF-X;AC=MS:1002877:', - 'Orbitrap-Exploris-480_MA10132C': 'NT=Orbitrap Exploris 480;AC=MS:1003028', - 'Q-Exactive-HF-X-Orbitrap_6028': 'NT=Q Exactive HF-X;AC=MS:1002877:', - 'Q-Exactive-HF-X-Orbitrap_6044': 'NT=Q Exactive HF-X;AC=MS:1002877:', - 'Q-Exactive-HF-X-Orbitrap_6025': 'NT=Q Exactive HF-X;AC=MS:1002877:', - 'Q-Exactive-HF-X-Orbitrap_6324': 'NT=Q Exactive HF-X;AC=MS:1002877:', - 'Orbitrap-Exploris-480_MA10134C': 'NT=Orbitrap Exploris 480;AC=MS:1003028', - 'Q-Exactive-HF-X-Orbitrap_6022': 'NT=Q Exactive HF-X;AC=MS:1002877:', - 'Q-Exactive-HF-X-Orbitrap_6043': 'NT=Q Exactive HF-X;AC=MS:1002877:', - 'Q-Exactive-HF-X-Orbitrap_6013': 'NT=Q Exactive HF-X;AC=MS:1002877:', - 'Q-Exactive-HF-X-Orbitrap_6023': 'NT=Q Exactive HF-X;AC=MS:1002877:', - 
'Exactive-Series-Orbitrap_6004': 'NT=Q Exactive;AC=MS:1001911', - 'Orbitrap-Exploris-480_Invalid_SN_0001': 'NT=Orbitrap Exploris 480;AC=MS:1003028', - 'Orbitrap-Exploris-480_MA10215C': 'NT=Orbitrap Exploris 480;AC=MS:1003028', - 'Q-Exactive-HF-Orbitrap_2612': 'NT=Q Exactive HF;AC=MS:1002523', - 'Q-Exactive-Plus-Orbitrap_1': 'NT=Exactive Plus;AC=MS:1002526', - 'Q-Exactive-Plus-Orbitrap_143': 'NT=Exactive Plus;AC=MS:1002526', - 'Orbitrap-Exploris-480_MA10130C': 'NT=Orbitrap Exploris 480;AC=MS:1003028', -} -sdrf['comment[instrument]'] = sdrf['comment[instrument]'].replace( - instrument_ms_mapping) - -# %% -# change order: The column `technology type`` cannot be before the `assay name`` -- ERROR -# template has wrong order (open PR) -# -> done now above -# order = ['characteristics[organism]', -# 'characteristics[organism part]', -# 'characteristics[ancestry category]', -# 'characteristics[cell type]', -# 'characteristics[disease]', -# 'characteristics[cell line]', -# 'characteristics[biological replicate]', -# 'assay name', -# 'technology type', -# 'comment[technical replicate]', -# 'comment[data file]', -# 'comment[fraction identifier]', -# 'comment[label]', -# 'comment[cleavage agent details]', -# 'comment[instrument]'] - -# sdrf = sdrf[order] - -# %% -fname = Path('data') / 'dev_datasets' / 'Experimental-Design.sdrf.tsv' -sdrf.to_csv(fname, sep='\t') -fname -# %% [markdown] -# ## Validate SDRF file -# ``` -# pip install sdrf-pipelines -# parse_sdrf validate-sdrf --sdrf_file project\data\dev_datasets\sdrf.tsv -# ``` diff --git a/project/00_0_4_create_submission_folder.ipynb b/project/00_0_4_create_submission_folder.ipynb deleted file mode 100644 index 891af6559..000000000 --- a/project/00_0_4_create_submission_folder.ipynb +++ /dev/null @@ -1,185 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "5466db14", - "metadata": { - "lines_to_next_cell": 2 - }, - "source": [ - "# Submission file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"id": "74dcae2a", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "from collections import defaultdict\n", - "from pathlib import Path, PurePosixPath" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5e780185", - "metadata": {}, - "outputs": [], - "source": [ - "# Parameters\n", - "FOLDER = Path('data/rename')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b5c99d62", - "metadata": {}, - "outputs": [], - "source": [ - "file = FOLDER / 'files_on_pride.log'\n", - "# file = FOLDER / 'files_pride_server_toplevel.log'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2f71d21a", - "metadata": {}, - "outputs": [], - "source": [ - "counts = defaultdict(int)\n", - "with open(file) as f:\n", - " for line in f:\n", - " fname = line.strip()\n", - " suffix = PurePosixPath(fname).suffix\n", - " counts[suffix] += 1\n", - "dict(counts)" - ] - }, - { - "cell_type": "markdown", - "id": "2f18e74e", - "metadata": {}, - "source": [ - "Only create a few files for creation a submission.px template...\n", - "\n", - "# %%\n", - "SUBMISSON_FOLDER = Path('data/rename/submission')\n", - "SUBMISSON_FOLDER.mkdir(exist_ok=True)\n", - "with open(file) as f:\n", - " hash = 'placeholder'\n", - " for line in f:\n", - " # fname = line.strip().split()\n", - " fname = line.strip()\n", - " fname = PurePosixPath(fname).name\n", - " with open(SUBMISSON_FOLDER / fname, 'w') as f_out:\n", - " f_out.write(f'{hash} {fname}')\n", - "# %%\n", - "files = list(SUBMISSON_FOLDER.iterdir())\n", - "print(f\"{len(files) = :,d}\")" - ] - }, - { - "cell_type": "markdown", - "id": "80f86e6a", - "metadata": {}, - "source": [ - "7444 raw files\n", - "7444 zip files with MaxQuant results\n", - "3 zip files with aggregated MaxQuant results\n", - "1 SDRF file as tsv\n", - "2 csv files with metadata of the raw files and the MaxQuant results summaries" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": 
"91903d5e", - "metadata": {}, - "outputs": [], - "source": [ - "# len(files) == 7444*2 + 6 # expected number of files" - ] - }, - { - "cell_type": "markdown", - "id": "64114469", - "metadata": {}, - "source": [ - "This was not really necessary..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5d8ccc0f", - "metadata": {}, - "outputs": [], - "source": [ - "file_types = {'.zip': 'SEARCH',\n", - " '.raw': 'RAW',\n", - " '.csv': 'SEARCH',\n", - " '.tsv': 'EXPERIMENTAL_DESIGN'}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "92389c3b", - "metadata": { - "lines_to_next_cell": 0 - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "files = pd.DataFrame(columns='FMH\tfile_id\tfile_type\tfile_path\tfile_mapping'.split('\\t'))\n", - "files['file_path'] = pd.read_csv(file, header=None)\n", - "files['FMH'] = 'FMH'\n", - "files['file_id'] = files.index\n", - "files['file_type'] = files['file_path'].map(lambda x: file_types[Path(x).suffix])\n", - "files['file_mapping'] = files['file_id'] - 1\n", - "files.loc[\n", - " files['file_type'] != 'SEARCH', 'file_mapping'] = np.nan\n", - "files = files.astype({'file_id': int, 'file_mapping': pd.Int32Dtype()})\n", - "files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "249d011d", - "metadata": { - "lines_to_next_cell": 0 - }, - "outputs": [], - "source": [ - "files.to_csv(FOLDER / 'submiss.px_to_add.tsv', sep='\\t', index=False)" - ] - }, - { - "cell_type": "markdown", - "id": "9107e219", - "metadata": { - "lines_to_next_cell": 2 - }, - "source": [ - "Some manuel adding of the last files still required..." 
- ] - } - ], - "metadata": { - "jupytext": { - "cell_metadata_filter": "-all", - "main_language": "python", - "notebook_metadata_filter": "-all" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/project/00_0_4_create_submission_folder.py b/project/00_0_4_create_submission_folder.py deleted file mode 100644 index 3078981f5..000000000 --- a/project/00_0_4_create_submission_folder.py +++ /dev/null @@ -1,81 +0,0 @@ -# %% [markdown] -# # Submission file - - -# %% -from collections import defaultdict -from pathlib import Path, PurePosixPath - - -# %% -# Parameters -FOLDER = Path('data/rename') - -# %% -file = FOLDER / 'files_on_pride.log' -# file = FOLDER / 'files_pride_server_toplevel.log' - -# %% -counts = defaultdict(int) -with open(file) as f: - for line in f: - fname = line.strip() - suffix = PurePosixPath(fname).suffix - counts[suffix] += 1 -dict(counts) - -# %% [markdown] -# Only create a few files for creation a submission.px template... -# -# # %% -# SUBMISSON_FOLDER = Path('data/rename/submission') -# SUBMISSON_FOLDER.mkdir(exist_ok=True) -# with open(file) as f: -# hash = 'placeholder' -# for line in f: -# # fname = line.strip().split() -# fname = line.strip() -# fname = PurePosixPath(fname).name -# with open(SUBMISSON_FOLDER / fname, 'w') as f_out: -# f_out.write(f'{hash} {fname}') -# # %% -# files = list(SUBMISSON_FOLDER.iterdir()) -# print(f"{len(files) = :,d}") - -# %% [markdown] -# 7444 raw files -# 7444 zip files with MaxQuant results -# 3 zip files with aggregated MaxQuant results -# 1 SDRF file as tsv -# 2 csv files with metadata of the raw files and the MaxQuant results summaries - -# %% -# len(files) == 7444*2 + 6 # expected number of files - -# %% [markdown] -# This was not really necessary... 
- -# %% -file_types = {'.zip': 'SEARCH', - '.raw': 'RAW', - '.csv': 'SEARCH', - '.tsv': 'EXPERIMENTAL_DESIGN'} - -# %% -import numpy as np -import pandas as pd -files = pd.DataFrame(columns='FMH file_id file_type file_path file_mapping'.split('\t')) -files['file_path'] = pd.read_csv(file, header=None) -files['FMH'] = 'FMH' -files['file_id'] = files.index -files['file_type'] = files['file_path'].map(lambda x: file_types[Path(x).suffix]) -files['file_mapping'] = files['file_id'] - 1 -files.loc[ - files['file_type'] != 'SEARCH', 'file_mapping'] = np.nan -files = files.astype({'file_id': int, 'file_mapping': pd.Int32Dtype()}) -files -# %% -files.to_csv(FOLDER / 'submiss.px_to_add.tsv', sep='\t', index=False) -# %% [markdown] -# Some manuel adding of the last files still required... - diff --git a/project/00_0_hela_metadata_rawfiles.ipynb b/project/00_0_hela_metadata_rawfiles.ipynb deleted file mode 100644 index 820e3bfec..000000000 --- a/project/00_0_hela_metadata_rawfiles.ipynb +++ /dev/null @@ -1,448 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "8d7f70a2-ce2b-450c-ad95-9e2c879cbdae", - "metadata": {}, - "source": [ - "# Rawfile metadata\n", - "\n", - "- generated using `workflows/metadata`\n", - "- all raw files collected ~50,000" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18bc909e-536b-430d-984e-ddf45cf16726", - "metadata": {}, - "outputs": [], - "source": [ - "from collections import namedtuple\n", - "from collections import defaultdict\n", - "\n", - "import yaml\n", - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "import vaep.pandas" - ] - }, - { - "cell_type": "markdown", - "id": "3ce6c1cc-6ffc-411c-b3e4-8780939028e0", - "metadata": {}, - "source": [ - "## Arguments" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "204776e0-5693-4621-8380-4e127f3fe290", - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "fn_rawfile_metadata: str = 
'data/rawfile_metadata.csv' # Machine parsed metadata from rawfile workflow\n", - "# outputs\n", - "fn_files_per_instrument: str = 'data/files_per_instrument.yaml' # All parsed raw files nested by instrument (model, attribute, serial number)\n", - "fn_files_selected: str = 'data/samples_selected.yaml' # selected files based on threshold of identified peptides\n", - "fn_files_per_instrument_selected: str = 'data/files_selected_per_instrument.yaml' # Selected parsed raw files nested by instrument (model, attribute, serial number)" - ] - }, - { - "cell_type": "markdown", - "id": "f1ff9c99-9162-4a53-99c5-b5691ee0b12a", - "metadata": {}, - "source": [ - "### Machine metadata\n", - "\n", - "- read from file using [ThermoRawFileParser](https://github.com/compomics/ThermoRawFileParser)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "39c2b9ea-2524-4bbd-9780-127873a2c18b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df_meta_rawfiles = pd.read_csv(fn_rawfile_metadata, header=[0, 1], index_col=0, low_memory=False)\n", - "date_col = ('FileProperties', 'Content Creation Date')\n", - "df_meta_rawfiles[date_col] = pd.to_datetime(\n", - " df_meta_rawfiles[date_col])\n", - "df_meta_rawfiles.sort_values(date_col, inplace=True)\n", - "df_meta_rawfiles" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "32b42511", - "metadata": {}, - "outputs": [], - "source": [ - "msg = f\"A total of {len(df_meta_rawfiles)} raw files could be read using the ThermoFisherRawFileParser.\"\n", - "print(msg)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "083e076f-f584-4236-9b9c-0bbd1dfa34bf", - "metadata": {}, - "outputs": [], - "source": [ - "meta_stats = df_meta_rawfiles.describe(include='all', datetime_is_numeric=True)\n", - "meta_stats.T" - ] - }, - { - "cell_type": "markdown", - "id": "0b3ef962-fe95-4af7-9d8e-4427a5950a78", - "metadata": {}, - "source": [ - "subset with variation" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "id": "5f63f93e-4bbf-4445-80c4-de7a11fe3fee", - "metadata": {}, - "outputs": [], - "source": [ - "meta_stats.loc[:, (meta_stats.loc['unique'] > 1) | (meta_stats.loc['std'] > 0.1)].T" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "abc0cef2-e8d1-4178-b6ea-e073cc9bcd8a", - "metadata": {}, - "outputs": [], - "source": [ - "# needs to go to Config which is not overwriteable by attribute selection\n", - "df_meta_rawfiles_columns = df_meta_rawfiles.columns\n", - "meta_raw_names = df_meta_rawfiles.columns.droplevel()\n", - "assert meta_raw_names.is_unique\n", - "df_meta_rawfiles.columns = meta_raw_names\n", - "df_meta_rawfiles" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2fe29d9e-a7af-4bc1-95c0-67c492431ed1", - "metadata": {}, - "outputs": [], - "source": [ - "meta_raw_selected = [\n", - " 'Content Creation Date',\n", - " 'Thermo Scientific instrument model',\n", - " 'instrument serial number',\n", - " 'Software Version',\n", - " 'Number of MS1 spectra',\n", - " 'Number of MS2 spectra',\n", - " 'Number of scans',\n", - " 'MS max charge',\n", - " 'MS max RT',\n", - " 'MS min MZ',\n", - " 'MS max MZ',\n", - " 'MS scan range',\n", - " 'mass resolution',\n", - " 'Retention time range',\n", - " 'Mz range',\n", - " 'beam-type collision-induced dissociation',\n", - " 'injection volume setting',\n", - " 'dilution factor',\n", - "]\n", - "df_meta_rawfiles[meta_raw_selected].describe(percentiles=np.linspace(0.05, 0.95, 10))" - ] - }, - { - "cell_type": "markdown", - "id": "ab323148-8af6-4ca8-99dc-0c5e9ef86714", - "metadata": {}, - "source": [ - "- `MS min MZ`: outlier clearly shifts means\n", - "- `mass resolution` is unique (can this be?)\n", - "- `dillution factor` is unique (can this be?)" - ] - }, - { - "cell_type": "markdown", - "id": "7a34bb67-6e9b-4f6f-a79c-c953afc0aa4d", - "metadata": {}, - "source": [ - "## Instrument type and settings\n", - "\n", - "check some columns 
describing settings\n", - " - quite some variation due to `MS max charge`: Is it a parameter?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9accd23f-06f5-47ef-b9fd-110fbdb12752", - "metadata": {}, - "outputs": [], - "source": [ - "MetaRawSettings = namedtuple(\n", - " 'MetaRawSettings',\n", - " 'ms_model ms_attr ms_sn ms_firmware max_charge mass_res cid_type inject_volume dill_factor')\n", - "meta_raw_settings = [\n", - " 'Thermo Scientific instrument model',\n", - " 'instrument attribute',\n", - " 'instrument serial number',\n", - " 'Software Version',\n", - " 'MS max charge',\n", - " 'mass resolution',\n", - " 'beam-type collision-induced dissociation',\n", - " 'injection volume setting',\n", - " 'dilution factor',\n", - "]\n", - "meta_raw_settings = MetaRawSettings(*meta_raw_settings)\n", - "meta_raw_settings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9a7cbb3b-f97c-42c2-a094-4842d9b722dd", - "metadata": {}, - "outputs": [], - "source": [ - "# index gives first example with this combination\n", - "# df_meta_rawfiles[list(meta_raw_settings)].drop_duplicates()\n", - "df_meta_rawfiles[list(meta_raw_settings)].drop_duplicates(ignore_index=True)" - ] - }, - { - "cell_type": "markdown", - "id": "50f85fa8-a0c5-4c4b-b292-22721989c21a", - "metadata": {}, - "source": [ - "view without `MS max charge`:\n", - " - software can be updated\n", - " - variation by `injection volume setting` and instrument over time\n", - " - missing `dilution factor`\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "35682fcc-503c-444f-ace9-4d50a1726ca3", - "metadata": {}, - "outputs": [], - "source": [ - "to_drop = ['MS max charge']\n", - "# df_meta_rawfiles[list(meta_raw_settings)].drop(to_drop,\n", - "# axis=1).drop_duplicates(ignore_index=False) # index gives first example\n", - "# with this combination\n", - "df_meta_rawfiles[list(meta_raw_settings)].drop(to_drop, axis=1).drop_duplicates(ignore_index=True)" - ] 
- }, - { - "cell_type": "markdown", - "id": "031c4e4e-600d-48cb-b025-af851b4d5e26", - "metadata": {}, - "source": [ - "Relatively big samples for different machines of the same kind running with the same firmware:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8cf957f5-d3e6-46c2-b15e-f82cd37e5488", - "metadata": {}, - "outputs": [], - "source": [ - "df_meta_rawfiles.groupby([meta_raw_settings.ms_model, meta_raw_settings.ms_firmware])[\n", - " meta_raw_settings.ms_model].count().sort_values().tail(10)" - ] - }, - { - "cell_type": "markdown", - "id": "b890cf2f-3d6e-4a87-adda-0bf947900971", - "metadata": {}, - "source": [ - "Ignoring instrument software" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0caecd7f-a804-4b11-b761-58f68bbc6a20", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "grouping = df_meta_rawfiles.groupby(list(meta_raw_settings[:3]))\n", - "instrument_counts = grouping[meta_raw_settings.ms_model].count().sort_values()\n", - "msg += (f\" There are a total of {len(instrument_counts)} unique instruments in the entire dataset (based on the instrument name, attributs and serial number)\"\n", - " f\", of which at least {(instrument_counts >= 1000).sum()} have 1,000 raw files assigned to them. 
Note that the entire dataset contains fractionated measurements.\")\n", - "instrument_counts" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e763c1f0-3ca4-404f-a5f5-28a9acbea6f5", - "metadata": {}, - "outputs": [], - "source": [ - "ms_groups = vaep.pandas.create_dict_of_dicts(grouping.groups, verbose=True, transform_values=list)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "29e8caa4-5a7e-46b6-9758-26661548cf18", - "metadata": {}, - "outputs": [], - "source": [ - "# d = dict()\n", - "# for (k1, k2, k3), v in grouping.groups.items():\n", - "# print(f\"{str((k1,k2,k3)):90}: {len(v):>5}\")\n", - "# if not k1 in d:\n", - "# d[k1] = dict()\n", - "# if not k2 in d[k1]:\n", - "# d[k1][k2] = dict()\n", - "# d[k1][k2][k3] = list(v)\n", - "# assert ms_groups == d" - ] - }, - { - "cell_type": "markdown", - "id": "5719cd88-f9dc-4edb-a5f0-61d27cf67599", - "metadata": {}, - "source": [ - "Save selection yaml" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "163089c0-8889-4421-ba88-00ec28594d3b", - "metadata": {}, - "outputs": [], - "source": [ - "with open(fn_files_per_instrument, 'w') as f:\n", - " yaml.dump(ms_groups, f)" - ] - }, - { - "cell_type": "markdown", - "id": "f09f8a60-bd80-4d3c-b86c-ddd2d25417a2", - "metadata": {}, - "source": [ - "## Quantified files\n", - "\n", - "- export nested files with quantified files based on selection based on identified peptides threshold" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ca9b7ce5-f36c-43ef-8912-95ea9421c5a8", - "metadata": {}, - "outputs": [], - "source": [ - "with open(fn_files_selected) as f:\n", - " files_selected = yaml.safe_load(f)\n", - "print(f'Threshold: {files_selected[\"threshold\"]:,d}')" - ] - }, - { - "cell_type": "markdown", - "id": "c50d4263-d44a-4315-aea2-793473452595", - "metadata": {}, - "source": [ - "- save metadata for selected, quantified samples / raw files" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "a8ed450f-c9aa-4387-aa97-a33444abd5a9", - "metadata": {}, - "outputs": [], - "source": [ - "df_meta_rawfiles.loc[files_selected['files']].to_csv('data/files_selected_metadata.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e438ba02-93ae-4fb1-9947-d2b741269feb", - "metadata": {}, - "outputs": [], - "source": [ - "grouping = df_meta_rawfiles.loc[files_selected['files']].groupby(list(meta_raw_settings[:3]))\n", - "instrument_counts = grouping[meta_raw_settings.ms_model].count().sort_values()\n", - "N = 500\n", - "msg += (\n", - " f\" Among the {len(files_selected['files'])} raw files with a minimum of {files_selected['threshold']:,d} identified peptides there are a total of {len(instrument_counts)} unique instruments with quantified runs\"\n", - " f\", of which {(instrument_counts >= N).sum()} have at least {N:,d} rawfiles assigned to them.\")\n", - "instrument_counts.to_csv('data/files_selected_per_instrument_counts.csv')\n", - "instrument_counts.to_frame('No. 
samples')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8c61da71-96a0-45a2-a89f-6ea056948d61", - "metadata": {}, - "outputs": [], - "source": [ - "ms_groups = vaep.pandas.create_dict_of_dicts(grouping.groups, verbose=True, transform_values=list)\n", - "with open(fn_files_per_instrument_selected, 'w') as f:\n", - " yaml.dump(ms_groups, f)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d4d50dcb-0715-4eaf-8d47-ce16385e362e", - "metadata": {}, - "outputs": [], - "source": [ - "print(msg)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - }, - "vscode": { - "interpreter": { - "hash": "cf83e9cb890c7f96eb0ae04f39a82254555f56a1a0ed2f03b23a8b40fe6cd31c" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/project/00_0_hela_metadata_rawfiles.py b/project/00_0_hela_metadata_rawfiles.py deleted file mode 100644 index b8f1248df..000000000 --- a/project/00_0_hela_metadata_rawfiles.py +++ /dev/null @@ -1,217 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.15.0 -# kernelspec: -# display_name: Python 3 -# language: python -# name: python3 -# --- - -# %% [markdown] -# # Rawfile metadata -# -# - generated using `workflows/metadata` -# - all raw files collected ~50,000 - -# %% -from collections import namedtuple -from collections import defaultdict - -import yaml -import numpy as np -import pandas as pd - -import vaep.pandas - -# %% [markdown] -# ## Arguments - -# %% tags=["parameters"] -fn_rawfile_metadata: str = 'data/rawfile_metadata.csv' # Machine parsed metadata from rawfile workflow -# 
outputs -fn_files_per_instrument: str = 'data/files_per_instrument.yaml' # All parsed raw files nested by instrument (model, attribute, serial number) -fn_files_selected: str = 'data/samples_selected.yaml' # selected files based on threshold of identified peptides -fn_files_per_instrument_selected: str = 'data/files_selected_per_instrument.yaml' # Selected parsed raw files nested by instrument (model, attribute, serial number) - -# %% [markdown] -# ### Machine metadata -# -# - read from file using [ThermoRawFileParser](https://github.com/compomics/ThermoRawFileParser) - -# %% -df_meta_rawfiles = pd.read_csv(fn_rawfile_metadata, header=[0, 1], index_col=0, low_memory=False) -date_col = ('FileProperties', 'Content Creation Date') -df_meta_rawfiles[date_col] = pd.to_datetime( - df_meta_rawfiles[date_col]) -df_meta_rawfiles.sort_values(date_col, inplace=True) -df_meta_rawfiles - -# %% -msg = f"A total of {len(df_meta_rawfiles)} raw files could be read using the ThermoFisherRawFileParser." -print(msg) - -# %% -meta_stats = df_meta_rawfiles.describe(include='all', datetime_is_numeric=True) -meta_stats.T - -# %% [markdown] -# subset with variation - -# %% -meta_stats.loc[:, (meta_stats.loc['unique'] > 1) | (meta_stats.loc['std'] > 0.1)].T - -# %% -# needs to go to Config which is not overwriteable by attribute selection -df_meta_rawfiles_columns = df_meta_rawfiles.columns -meta_raw_names = df_meta_rawfiles.columns.droplevel() -assert meta_raw_names.is_unique -df_meta_rawfiles.columns = meta_raw_names -df_meta_rawfiles - -# %% -meta_raw_selected = [ - 'Content Creation Date', - 'Thermo Scientific instrument model', - 'instrument serial number', - 'Software Version', - 'Number of MS1 spectra', - 'Number of MS2 spectra', - 'Number of scans', - 'MS max charge', - 'MS max RT', - 'MS min MZ', - 'MS max MZ', - 'MS scan range', - 'mass resolution', - 'Retention time range', - 'Mz range', - 'beam-type collision-induced dissociation', - 'injection volume setting', - 'dilution 
factor', -] -df_meta_rawfiles[meta_raw_selected].describe(percentiles=np.linspace(0.05, 0.95, 10)) - -# %% [markdown] -# - `MS min MZ`: outlier clearly shifts means -# - `mass resolution` is unique (can this be?) -# - `dillution factor` is unique (can this be?) - -# %% [markdown] -# ## Instrument type and settings -# -# check some columns describing settings -# - quite some variation due to `MS max charge`: Is it a parameter? - -# %% -MetaRawSettings = namedtuple( - 'MetaRawSettings', - 'ms_model ms_attr ms_sn ms_firmware max_charge mass_res cid_type inject_volume dill_factor') -meta_raw_settings = [ - 'Thermo Scientific instrument model', - 'instrument attribute', - 'instrument serial number', - 'Software Version', - 'MS max charge', - 'mass resolution', - 'beam-type collision-induced dissociation', - 'injection volume setting', - 'dilution factor', -] -meta_raw_settings = MetaRawSettings(*meta_raw_settings) -meta_raw_settings - -# %% -# index gives first example with this combination -# df_meta_rawfiles[list(meta_raw_settings)].drop_duplicates() -df_meta_rawfiles[list(meta_raw_settings)].drop_duplicates(ignore_index=True) - -# %% [markdown] -# view without `MS max charge`: -# - software can be updated -# - variation by `injection volume setting` and instrument over time -# - missing `dilution factor` -# - -# %% -to_drop = ['MS max charge'] -# df_meta_rawfiles[list(meta_raw_settings)].drop(to_drop, -# axis=1).drop_duplicates(ignore_index=False) # index gives first example -# with this combination -df_meta_rawfiles[list(meta_raw_settings)].drop(to_drop, axis=1).drop_duplicates(ignore_index=True) - -# %% [markdown] -# Relatively big samples for different machines of the same kind running with the same firmware: - -# %% -df_meta_rawfiles.groupby([meta_raw_settings.ms_model, meta_raw_settings.ms_firmware])[ - meta_raw_settings.ms_model].count().sort_values().tail(10) - -# %% [markdown] -# Ignoring instrument software - -# %% -grouping = 
df_meta_rawfiles.groupby(list(meta_raw_settings[:3])) -instrument_counts = grouping[meta_raw_settings.ms_model].count().sort_values() -msg += (f" There are a total of {len(instrument_counts)} unique instruments in the entire dataset (based on the instrument name, attributs and serial number)" - f", of which at least {(instrument_counts >= 1000).sum()} have 1,000 raw files assigned to them. Note that the entire dataset contains fractionated measurements.") -instrument_counts - -# %% -ms_groups = vaep.pandas.create_dict_of_dicts(grouping.groups, verbose=True, transform_values=list) - -# %% -# d = dict() -# for (k1, k2, k3), v in grouping.groups.items(): -# print(f"{str((k1,k2,k3)):90}: {len(v):>5}") -# if not k1 in d: -# d[k1] = dict() -# if not k2 in d[k1]: -# d[k1][k2] = dict() -# d[k1][k2][k3] = list(v) -# assert ms_groups == d - -# %% [markdown] -# Save selection yaml - -# %% -with open(fn_files_per_instrument, 'w') as f: - yaml.dump(ms_groups, f) - -# %% [markdown] -# ## Quantified files -# -# - export nested files with quantified files based on selection based on identified peptides threshold - -# %% -with open(fn_files_selected) as f: - files_selected = yaml.safe_load(f) -print(f'Threshold: {files_selected["threshold"]:,d}') - -# %% [markdown] -# - save metadata for selected, quantified samples / raw files - -# %% -df_meta_rawfiles.loc[files_selected['files']].to_csv('data/files_selected_metadata.csv') - -# %% -grouping = df_meta_rawfiles.loc[files_selected['files']].groupby(list(meta_raw_settings[:3])) -instrument_counts = grouping[meta_raw_settings.ms_model].count().sort_values() -N = 500 -msg += ( - f" Among the {len(files_selected['files'])} raw files with a minimum of {files_selected['threshold']:,d} identified peptides there are a total of {len(instrument_counts)} unique instruments with quantified runs" - f", of which {(instrument_counts >= N).sum()} have at least {N:,d} rawfiles assigned to them.") 
-instrument_counts.to_csv('data/files_selected_per_instrument_counts.csv') -instrument_counts.to_frame('No. samples') - -# %% -ms_groups = vaep.pandas.create_dict_of_dicts(grouping.groups, verbose=True, transform_values=list) -with open(fn_files_per_instrument_selected, 'w') as f: - yaml.dump(ms_groups, f) - -# %% -print(msg) diff --git a/project/00_1_hela_MQ_summaries.ipynb b/project/00_1_hela_MQ_summaries.ipynb deleted file mode 100644 index 9cd1dee49..000000000 --- a/project/00_1_hela_MQ_summaries.ipynb +++ /dev/null @@ -1,198 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Analysis of `summaries.txt` information\n", - "\n", - "- number of raw files (no here)\n", - "- number of raw files with MQ-Output\n", - "- MS1 per file\n", - "- MS2 per file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import ipywidgets as widgets\n", - "import yaml\n", - "import numpy as np\n", - "import pandas as pd\n", - "import vaep\n", - "\n", - "from config import FN_ALL_SUMMARIES\n", - "print(f\"{FN_ALL_SUMMARIES = }\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "lines_to_next_cell": 2, - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "FN_ALL_SUMMARIES: str = 'data/mq_summaries.csv' # MqAllSummaries json" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mq_all_summaries = pd.read_csv(FN_ALL_SUMMARIES, index_col=0)\n", - "mq_all_summaries" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Find unique columns, see [post](https://stackoverflow.com/a/54405767/9684872)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from vaep.pandas import unique_cols\n", - "unique_cols(mq_all_summaries.Multiplicity), unique_cols(\n", - " mq_all_summaries[\"Variable modifications 
first search\"]) # int, NA" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from vaep.pandas import get_unique_non_unique_columns\n", - "columns = get_unique_non_unique_columns(mq_all_summaries)\n", - "mq_all_summaries[columns.unique]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mq_all_summaries[columns.unique].dtypes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mq_all_summaries[columns.unique].iloc[0, :]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Analysis of completeness" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class col_summary:\n", - " MS1 = 'MS'\n", - " MS2 = 'MS/MS' \n", - " MS2_identified = 'MS/MS Identified'\n", - " peptides_identified = 'Peptide Sequences Identified'\n", - "\n", - "if mq_all_summaries is None:\n", - " raise ValueError(\"No data assigned\")\n", - " \n", - "MS_spectra = mq_all_summaries[[col_summary.MS1, col_summary.MS2, col_summary.MS2_identified, col_summary.peptides_identified]]\n", - "\n", - "def compute_summary(threshold_identified):\n", - " mask = MS_spectra[col_summary.peptides_identified] >= threshold_identified\n", - " display(MS_spectra.loc[mask].describe(np.linspace(0.05, 0.95, 10)))\n", - "\n", - "w_ions_range = widgets.IntSlider(value=15_000, min=15_000, max=MS_spectra[col_summary.peptides_identified].max())\n", - "display(widgets.interactive(compute_summary, threshold_identified=w_ions_range))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "List of samples without any identified peptides:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mask = (MS_spectra < 1).any(axis=1)\n", - "MS_spectra.loc[mask]" - ] - }, - { - 
"cell_type": "markdown", - "metadata": {}, - "source": [ - "## Export selected list of quantified samples\n", - "\n", - "Based on threshold, save a list of the specified samples" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "dump_dict = {'threshold': int(w_ions_range.value)}\n", - "mask = MS_spectra[col_summary.peptides_identified] >= w_ions_range.value\n", - "dump_dict['files'] = MS_spectra.loc[mask].index.to_list()\n", - "\n", - "with open('data/samples_selected.yaml', 'w') as f:\n", - " yaml.dump(dump_dict, stream=f)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "vaep", - "language": "python", - "name": "vaep" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/project/00_1_hela_MQ_summaries.py b/project/00_1_hela_MQ_summaries.py deleted file mode 100644 index 496b0da07..000000000 --- a/project/00_1_hela_MQ_summaries.py +++ /dev/null @@ -1,102 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.14.5 -# kernelspec: -# display_name: vaep -# language: python -# name: vaep -# --- - -# %% [markdown] -# # Analysis of `summaries.txt` information -# -# - number of raw files (no here) -# - number of raw files with MQ-Output -# - MS1 per file -# - MS2 per file - -# %% -import ipywidgets as widgets -import yaml -import numpy as np -import pandas as pd -import vaep - -from config import FN_ALL_SUMMARIES -print(f"{FN_ALL_SUMMARIES = }") - -# %% tags=["parameters"] -FN_ALL_SUMMARIES: str = 'data/mq_summaries.csv' # MqAllSummaries json - - -# %% -mq_all_summaries = pd.read_csv(FN_ALL_SUMMARIES, 
index_col=0) -mq_all_summaries - -# %% [markdown] -# Find unique columns, see [post](https://stackoverflow.com/a/54405767/9684872) - -# %% -from vaep.pandas import unique_cols -unique_cols(mq_all_summaries.Multiplicity), unique_cols( - mq_all_summaries["Variable modifications first search"]) # int, NA - -# %% -from vaep.pandas import get_unique_non_unique_columns -columns = get_unique_non_unique_columns(mq_all_summaries) -mq_all_summaries[columns.unique] - -# %% -mq_all_summaries[columns.unique].dtypes - -# %% -mq_all_summaries[columns.unique].iloc[0, :] - - -# %% [markdown] -# ## Analysis of completeness - -# %% -class col_summary: - MS1 = 'MS' - MS2 = 'MS/MS' - MS2_identified = 'MS/MS Identified' - peptides_identified = 'Peptide Sequences Identified' - -if mq_all_summaries is None: - raise ValueError("No data assigned") - -MS_spectra = mq_all_summaries[[col_summary.MS1, col_summary.MS2, col_summary.MS2_identified, col_summary.peptides_identified]] - -def compute_summary(threshold_identified): - mask = MS_spectra[col_summary.peptides_identified] >= threshold_identified - display(MS_spectra.loc[mask].describe(np.linspace(0.05, 0.95, 10))) - -w_ions_range = widgets.IntSlider(value=15_000, min=15_000, max=MS_spectra[col_summary.peptides_identified].max()) -display(widgets.interactive(compute_summary, threshold_identified=w_ions_range)) - -# %% [markdown] -# List of samples without any identified peptides: - -# %% -mask = (MS_spectra < 1).any(axis=1) -MS_spectra.loc[mask] - -# %% [markdown] -# ## Export selected list of quantified samples -# -# Based on threshold, save a list of the specified samples - -# %% -dump_dict = {'threshold': int(w_ions_range.value)} -mask = MS_spectra[col_summary.peptides_identified] >= w_ions_range.value -dump_dict['files'] = MS_spectra.loc[mask].index.to_list() - -with open('data/samples_selected.yaml', 'w') as f: - yaml.dump(dump_dict, stream=f) - diff --git a/project/00_2_hela_all_raw_files.ipynb b/project/00_2_hela_all_raw_files.ipynb 
deleted file mode 100644 index 40ff06b32..000000000 --- a/project/00_2_hela_all_raw_files.ipynb +++ /dev/null @@ -1,801 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# RawFiles Database\n", - "\n", - "- overview of raw files, among others\n", - " - filesize\n", - " - duplicates of raw files\n", - " - search for substrings to find special cases (e.g. fractionated samples)\n", - "\n", - "**Outputs**\n", - "\n", - "Created data and figures\n", - "\n", - "```bash\n", - "'data/all_raw_files_dump_duplicated.txt'\n", - "'data/all_raw_files_dump_unique.csv' # csv file\n", - "'Figures/raw_file_overview.pdf'\n", - "```\n", - "\n", - "**Inputs**\n", - "\n", - "```bash\n", - "'data/all_raw_files_dump.txt'\n", - "```\n", - "\n", - "The ladder can be created using `find` on a server:\n", - "\n", - "```bash\n", - "find . -name '*.raw' -exec ls -l {} \\; > all_raw_files_dump_2021_10_27.txt\n", - "# alternative (changes the format)\n", - "find . -name '*.raw' -ls > all_raw_files_dump_2021_10_27.txt\n", - "```\n", - "\n", - "which was executed in the " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pathlib import Path, PurePosixPath\n", - "from collections import namedtuple\n", - "from functools import partial\n", - "import yaml\n", - "\n", - "import ipywidgets as widgets\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "import logging\n", - "from vaep.logging import setup_logger\n", - "from vaep.analyzers.analyzers import AnalyzePeptides\n", - "from vaep.io.data_objects import MqAllSummaries\n", - "from vaep.io.rawfiles import RawFileViewer, get_unique_stem, find_indices_containing_query, show_fractions\n", - "import config\n", - "from vaep.nb import Config\n", - "from vaep import utils\n", - "\n", - "cfg = Config()\n", - "\n", - "logger = logging.getLogger('vaep')\n", - "logger = setup_logger(logger, 
fname_base='00_2_hela_all_raw_files_ipynb')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "# FN_ALL_RAW_FILES = config.FOLDER_DATA / config.FN_ALL_RAW_FILES\n", - "FN_ALL_RAW_FILES: str = config.FOLDER_DATA / 'all_raw_files_dump_2021_10_29.txt'\n", - "FN_ALL_SUMMARIES: str = config.FN_ALL_SUMMARIES\n", - "FN_PEPTIDE_INTENSITIES = config.FOLDER_DATA / 'df_intensities_N07285_M01000' " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cfg.FN_ALL_RAW_FILES = FN_ALL_RAW_FILES\n", - "cfg.FN_ALL_SUMMARIES = FN_ALL_SUMMARIES\n", - "cfg.FN_PEPTIDE_INTENSITIES = FN_PEPTIDE_INTENSITIES" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "RawFile = namedtuple('RawFile', 'name path bytes')\n", - "\n", - "data = []\n", - "with open(cfg.FN_ALL_RAW_FILES) as f:\n", - " for line in f:\n", - " line = line.split(maxsplit=8) # ignore white spaces in file names, example:\n", - " #'-rw-r--r--. 
1 501 501 282917566 Dec 3 2022 ./share_hela_raw/MNT_202220220921_EXLP1_Evo1_LiNi_ - Copy1.raw'\n", - " path = Path(line[-1].strip())\n", - " data.append(RawFile(path.stem, path, int(line[4])))\n", - "\n", - "data = pd.DataFrame.from_records(\n", - " data, columns=RawFile._fields, index=RawFile._fields[0])\n", - "\n", - "data.sort_values(by='path', inplace=True)\n", - "data.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data['size_gb'] = data['bytes'] / 1024 ** 3\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fname = 'data/processed/all_raw_file_sizes.csv'\n", - "data.to_csv(fname)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Finding duplicates\n", - "\n", - "- add a numeric index column to identify samples" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data['num_index'] = pd.RangeIndex(stop=len(data))\n", - "mask_non_unique = data.reset_index().duplicated(subset=['name', 'bytes'])\n", - "mask_non_unique.index = data.index\n", - "idx_non_unique = data.loc[mask_non_unique].index.unique()\n", - "idx_non_unique # min number of files to remove" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def check_for_duplicates(df):\n", - " if df.index.is_unique:\n", - " print('Only unique files in index.')\n", - " return None\n", - " else:\n", - " non_unique = df.index.value_counts()\n", - " non_unique = non_unique[non_unique > 1]\n", - " # should this be browseable?\n", - " print(f'Number of files with more than 2 duplicates: {(non_unique > 2).sum()}')\n", - " return non_unique\n", - "\n", - "non_unique = check_for_duplicates(df=data)\n", - "non_unique" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Are there cases where only two 
files share the same name and have different file sizes:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data.loc[\n", - " non_unique.index.difference(idx_non_unique) ]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For same sized groups, remove first the onces in the `MNT` folder:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_in_MNT_to_remove = None\n", - "non_unique_remaining = None\n", - "if not data.index.is_unique:\n", - " _data_to_remove = data.loc[idx_non_unique]\n", - " data_in_MNT_to_remove = pd.DataFrame()\n", - " non_unique_remaining = pd.DataFrame()\n", - " for idx, g in _data_to_remove.groupby(level=0):\n", - " mask = ['\\\\MNT' in str(x) for x in g.path]\n", - " assert len(mask) != sum(mask) , f'All files in MNT subfolders: {idx}'\n", - " data_in_MNT_to_remove = data_in_MNT_to_remove.append(g[mask])\n", - " non_unique_remaining = non_unique_remaining.append(g[[x!=True for x in mask]])\n", - "\n", - " del _data_to_remove, mask, idx, g\n", - "\n", - "assert len(data.loc[idx_non_unique]) == len(non_unique_remaining) + len(data_in_MNT_to_remove)\n", - "assert len(non_unique_remaining.loc[['\\\\MNT' in str(x) for x in non_unique_remaining.path]]) == 0, \"There are files in MNT folder left\"\n", - "data_in_MNT_to_remove" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The main junk of duplicated files in in `MNT` subfolders" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "non_unique_remaining_counts = check_for_duplicates(non_unique_remaining)\n", - "non_unique_remaining.loc[non_unique_remaining_counts.index.unique()]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Files with the same name and the same size are considered the same." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mask_non_unique_remaining = non_unique_remaining.reset_index().duplicated(subset=['name', 'bytes'])\n", - "mask_non_unique_remaining.index = non_unique_remaining.index\n", - "data_to_remove = data_in_MNT_to_remove.append(\n", - " non_unique_remaining.loc[mask_non_unique_remaining]\n", - ")\n", - "data_to_remove" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(f\"Save {data_to_remove['size_gb'].sum():1.0f} GB disk space by deleting {len(data_to_remove)} files.\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_unique = data.reset_index().set_index('num_index').drop(data_to_remove.set_index('num_index').index).set_index('name')\n", - "data_unique" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Make sure that every index to remove is still present in `data_unique` which is data to keep" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_unique.loc[data_to_remove.index.unique()]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "assert len(data_unique) + len(data_to_remove) == len(data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Show files which are duplicated, but have different sizes:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# two files have the same name, but different sizes\n", - "data_unique.loc[data_unique.index.duplicated(False)] if not data_unique.index.is_unique else None" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Save unique files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"metadata": {}, - "outputs": [], - "source": [ - "cfg.FN_ALL_RAW_FILES_UNIQUE = utils.append_to_filepath(cfg.FN_ALL_RAW_FILES, config.build_df_fname(data_unique, 'unique'), new_suffix='csv')\n", - "data_unique.to_csv(cfg.FN_ALL_RAW_FILES_UNIQUE)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Export file paths to file to remove them, e.g using `rm $( all_raw_files_dump_duplicated_cleaned.txt\n", - "ls `cat all_raw_files_dump_duplicated_cleaned`\n", - "rm -i `cat all_raw_files_dump_duplicated_cleaned`\n", - "rm -i $( They can be duplicated files with the same file size. Not the case for now" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "idx_shared = mq_summaries.df.index.intersection(data_unique.index)\n", - "\n", - "_file_sizes = data_unique.loc[idx_shared, 'size_gb']\n", - "_file_sizes.loc[_file_sizes.index.duplicated(False)]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "_file_sizes = _file_sizes.loc[~_file_sizes.index.duplicated(keep='last')]\n", - "mq_summaries.df.loc[idx_shared, 'file size in GB'] = _file_sizes\n", - "cols = ['Peptide Sequences Identified', 'file size in GB']\n", - "mq_summaries.df[cols]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mq_summaries.df[cols].describe(np.linspace(0.05, 0.95, 10))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fig, axes = plt.subplots(ncols=3, gridspec_kw={\"width_ratios\": [\n", - " 5, 1, 1], \"wspace\": 0.3}, figsize=(20, 8))\n", - "\n", - "ax = axes[0]\n", - "ax = mq_summaries.df.plot.scatter(x=cols[0], y=cols[1], ax=ax)\n", - "ax.axvline(x=15000)\n", - "\n", - "ax = axes[1]\n", - "ax = mq_summaries.df[cols[0]].plot(kind='box', ax=ax)\n", - "\n", - "\n", - "ax = axes[2]\n", - "ax = 
mq_summaries.df[cols[1]].plot(kind='box', ax=ax)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For some files with a large number of identified peptides, the file size information seems to be missing." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cfg.figure_1 = config.FIGUREFOLDER / 'figure_1.pdf'\n", - "\n", - "fig.savefig(cfg.figure_1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "threshold = 15_000\n", - "mask = mq_summaries.df[cols[0]] > threshold\n", - "print(\n", - " f\"for threshold of {threshold:,d} quantified peptides:\\n\"\n", - " f\"Total number of files is {mask.sum()}\\n\"\n", - " \"Minimum file-size is {:.3f} GB.\\n\".format(\n", - " mq_summaries.df.loc[mask, cols[1]].min())\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Meta data for all samples" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### From raw file reading" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "files_to_parse = data_unique.loc[idx_shared, 'path'].apply(lambda path: str(PurePosixPath(path)).strip())\n", - "files_to_parse = dict(files=files_to_parse.to_list())\n", - "cfg.remote_files = config.FOLDER_DATA / 'remote_files.yaml'\n", - "with open(cfg.remote_files, 'w') as f:\n", - " yaml.dump(files_to_parse, f)\n", - "print(f\"Saved list of files to: {cfg.remote_files}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### From file name" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "analysis = AnalyzePeptides.from_csv(cfg.FN_ALL_RAW_FILES_UNIQUE,index_col='name') # ToDo: Add numbers to file names\n", - "analysis.df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "analysis.add_metadata(add_prop_not_na=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "Metadata has fewer cases due to duplicates with differnt file sizes ( see above)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "analysis.df.loc[analysis.df.index.duplicated(False)] # keep the larger one" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## cfg" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vars(cfg) # return a dict which is rendered differently in ipython" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - }, - "vscode": { - "interpreter": { - "hash": "cf83e9cb890c7f96eb0ae04f39a82254555f56a1a0ed2f03b23a8b40fe6cd31c" - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/project/00_2_hela_all_raw_files.py b/project/00_2_hela_all_raw_files.py deleted file mode 100644 index bde0f37db..000000000 --- a/project/00_2_hela_all_raw_files.py +++ /dev/null @@ -1,427 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.14.5 -# kernelspec: -# display_name: Python 3 -# language: python -# name: python3 -# --- - -# %% [markdown] -# # RawFiles Database -# -# - overview of raw files, among others -# - filesize -# - duplicates of raw files -# - search for substrings 
to find special cases (e.g. fractionated samples) -# -# **Outputs** -# -# Created data and figures -# -# ```bash -# 'data/all_raw_files_dump_duplicated.txt' -# 'data/all_raw_files_dump_unique.csv' # csv file -# 'Figures/raw_file_overview.pdf' -# ``` -# -# **Inputs** -# -# ```bash -# 'data/all_raw_files_dump.txt' -# ``` -# -# The ladder can be created using `find` on a server: -# -# ```bash -# find . -name '*.raw' -exec ls -l {} \; > all_raw_files_dump_2021_10_27.txt -# # alternative (changes the format) -# find . -name '*.raw' -ls > all_raw_files_dump_2021_10_27.txt -# ``` -# -# which was executed in the - -# %% -from pathlib import Path, PurePosixPath -from collections import namedtuple -from functools import partial -import yaml - -import ipywidgets as widgets -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd - -import logging -from vaep.logging import setup_logger -from vaep.analyzers.analyzers import AnalyzePeptides -from vaep.io.data_objects import MqAllSummaries -from vaep.io.rawfiles import RawFileViewer, get_unique_stem, find_indices_containing_query, show_fractions -import config -from vaep.nb import Config -from vaep import utils - -cfg = Config() - -logger = logging.getLogger('vaep') -logger = setup_logger(logger, fname_base='00_2_hela_all_raw_files_ipynb') - -# %% tags=["parameters"] -# FN_ALL_RAW_FILES = config.FOLDER_DATA / config.FN_ALL_RAW_FILES -FN_ALL_RAW_FILES: str = config.FOLDER_DATA / 'all_raw_files_dump_2021_10_29.txt' -FN_ALL_SUMMARIES: str = config.FN_ALL_SUMMARIES -FN_PEPTIDE_INTENSITIES = config.FOLDER_DATA / 'df_intensities_N07285_M01000' - -# %% -cfg.FN_ALL_RAW_FILES = FN_ALL_RAW_FILES -cfg.FN_ALL_SUMMARIES = FN_ALL_SUMMARIES -cfg.FN_PEPTIDE_INTENSITIES = FN_PEPTIDE_INTENSITIES - -# %% -RawFile = namedtuple('RawFile', 'name path bytes') - -data = [] -with open(cfg.FN_ALL_RAW_FILES) as f: - for line in f: - line = line.split(maxsplit=8) # ignore white spaces in file names, example: - #'-rw-r--r--. 
1 501 501 282917566 Dec 3 2022 ./share_hela_raw/MNT_202220220921_EXLP1_Evo1_LiNi_ - Copy1.raw' - path = Path(line[-1].strip()) - data.append(RawFile(path.stem, path, int(line[4]))) - -data = pd.DataFrame.from_records( - data, columns=RawFile._fields, index=RawFile._fields[0]) - -data.sort_values(by='path', inplace=True) -data.head() - -# %% -data['size_gb'] = data['bytes'] / 1024 ** 3 -data - -# %% -fname = 'data/processed/all_raw_file_sizes.csv' -data.to_csv(fname) - -# %% [markdown] -# ## Finding duplicates -# -# - add a numeric index column to identify samples - -# %% -data['num_index'] = pd.RangeIndex(stop=len(data)) -mask_non_unique = data.reset_index().duplicated(subset=['name', 'bytes']) -mask_non_unique.index = data.index -idx_non_unique = data.loc[mask_non_unique].index.unique() -idx_non_unique # min number of files to remove - - -# %% -def check_for_duplicates(df): - if df.index.is_unique: - print('Only unique files in index.') - return None - else: - non_unique = df.index.value_counts() - non_unique = non_unique[non_unique > 1] - # should this be browseable? 
- print(f'Number of files with more than 2 duplicates: {(non_unique > 2).sum()}') - return non_unique - -non_unique = check_for_duplicates(df=data) -non_unique - -# %% [markdown] -# Are there cases where only two files share the same name and have different file sizes: - -# %% -data.loc[ - non_unique.index.difference(idx_non_unique) ] - -# %% [markdown] -# For same sized groups, remove first the onces in the `MNT` folder: - -# %% -data_in_MNT_to_remove = None -non_unique_remaining = None -if not data.index.is_unique: - _data_to_remove = data.loc[idx_non_unique] - data_in_MNT_to_remove = pd.DataFrame() - non_unique_remaining = pd.DataFrame() - for idx, g in _data_to_remove.groupby(level=0): - mask = ['\\MNT' in str(x) for x in g.path] - assert len(mask) != sum(mask) , f'All files in MNT subfolders: {idx}' - data_in_MNT_to_remove = data_in_MNT_to_remove.append(g[mask]) - non_unique_remaining = non_unique_remaining.append(g[[x!=True for x in mask]]) - - del _data_to_remove, mask, idx, g - -assert len(data.loc[idx_non_unique]) == len(non_unique_remaining) + len(data_in_MNT_to_remove) -assert len(non_unique_remaining.loc[['\\MNT' in str(x) for x in non_unique_remaining.path]]) == 0, "There are files in MNT folder left" -data_in_MNT_to_remove - -# %% [markdown] -# The main junk of duplicated files in in `MNT` subfolders - -# %% -non_unique_remaining_counts = check_for_duplicates(non_unique_remaining) -non_unique_remaining.loc[non_unique_remaining_counts.index.unique()] - -# %% [markdown] -# Files with the same name and the same size are considered the same. 
- -# %% -mask_non_unique_remaining = non_unique_remaining.reset_index().duplicated(subset=['name', 'bytes']) -mask_non_unique_remaining.index = non_unique_remaining.index -data_to_remove = data_in_MNT_to_remove.append( - non_unique_remaining.loc[mask_non_unique_remaining] -) -data_to_remove - -# %% -print(f"Save {data_to_remove['size_gb'].sum():1.0f} GB disk space by deleting {len(data_to_remove)} files.") - -# %% -data_unique = data.reset_index().set_index('num_index').drop(data_to_remove.set_index('num_index').index).set_index('name') -data_unique - -# %% [markdown] -# Make sure that every index to remove is still present in `data_unique` which is data to keep - -# %% -data_unique.loc[data_to_remove.index.unique()] - -# %% -assert len(data_unique) + len(data_to_remove) == len(data) - -# %% [markdown] -# Show files which are duplicated, but have different sizes: - -# %% -# two files have the same name, but different sizes -data_unique.loc[data_unique.index.duplicated(False)] if not data_unique.index.is_unique else None - -# %% [markdown] -# Save unique files - -# %% -cfg.FN_ALL_RAW_FILES_UNIQUE = utils.append_to_filepath(cfg.FN_ALL_RAW_FILES, config.build_df_fname(data_unique, 'unique'), new_suffix='csv') -data_unique.to_csv(cfg.FN_ALL_RAW_FILES_UNIQUE) - -# %% [markdown] -# Export file paths to file to remove them, e.g using `rm $( all_raw_files_dump_duplicated_cleaned.txt -# ls `cat all_raw_files_dump_duplicated_cleaned` -# rm -i `cat all_raw_files_dump_duplicated_cleaned` -# rm -i $( They can be duplicated files with the same file size. 
Not the case for now - -# %% -idx_shared = mq_summaries.df.index.intersection(data_unique.index) - -_file_sizes = data_unique.loc[idx_shared, 'size_gb'] -_file_sizes.loc[_file_sizes.index.duplicated(False)] - -# %% -_file_sizes = _file_sizes.loc[~_file_sizes.index.duplicated(keep='last')] -mq_summaries.df.loc[idx_shared, 'file size in GB'] = _file_sizes -cols = ['Peptide Sequences Identified', 'file size in GB'] -mq_summaries.df[cols] - -# %% -mq_summaries.df[cols].describe(np.linspace(0.05, 0.95, 10)) - -# %% -fig, axes = plt.subplots(ncols=3, gridspec_kw={"width_ratios": [ - 5, 1, 1], "wspace": 0.3}, figsize=(20, 8)) - -ax = axes[0] -ax = mq_summaries.df.plot.scatter(x=cols[0], y=cols[1], ax=ax) -ax.axvline(x=15000) - -ax = axes[1] -ax = mq_summaries.df[cols[0]].plot(kind='box', ax=ax) - - -ax = axes[2] -ax = mq_summaries.df[cols[1]].plot(kind='box', ax=ax) - -# %% [markdown] -# For some files with a large number of identified peptides, the file size information seems to be missing. - -# %% -cfg.figure_1 = config.FIGUREFOLDER / 'figure_1.pdf' - -fig.savefig(cfg.figure_1) - -# %% -threshold = 15_000 -mask = mq_summaries.df[cols[0]] > threshold -print( - f"for threshold of {threshold:,d} quantified peptides:\n" - f"Total number of files is {mask.sum()}\n" - "Minimum file-size is {:.3f} GB.\n".format( - mq_summaries.df.loc[mask, cols[1]].min()) -) - -# %% [markdown] -# ## Meta data for all samples - -# %% [markdown] -# ### From raw file reading - -# %% -files_to_parse = data_unique.loc[idx_shared, 'path'].apply(lambda path: str(PurePosixPath(path)).strip()) -files_to_parse = dict(files=files_to_parse.to_list()) -cfg.remote_files = config.FOLDER_DATA / 'remote_files.yaml' -with open(cfg.remote_files, 'w') as f: - yaml.dump(files_to_parse, f) -print(f"Saved list of files to: {cfg.remote_files}") - -# %% [markdown] -# ### From file name - -# %% -analysis = AnalyzePeptides.from_csv(cfg.FN_ALL_RAW_FILES_UNIQUE,index_col='name') # ToDo: Add numbers to file names 
-analysis.df - -# %% -analysis.add_metadata(add_prop_not_na=False) - -# %% [markdown] -# Metadata has fewer cases due to duplicates with differnt file sizes ( see above) - -# %% -analysis.df.loc[analysis.df.index.duplicated(False)] # keep the larger one - -# %% [markdown] -# ## cfg - -# %% -vars(cfg) # return a dict which is rendered differently in ipython - -# %% diff --git a/project/00_3_0_pride_metadata_creation.ipynb b/project/00_3_0_pride_metadata_creation.ipynb deleted file mode 100644 index 624189c69..000000000 --- a/project/00_3_0_pride_metadata_creation.ipynb +++ /dev/null @@ -1,362 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "823351d5", - "metadata": {}, - "source": [ - "# Selected files\n", - "\n", - "- document metadata and file sizes of published dataset in Scientific Data Report \n", - "\n", - "## Contents\n", - "\n", - "1. Number of files per instrument\n", - "2. Rawfile sizes per instrument\n", - "3. peptide - rawfile map (protein group, precursor)?\n", - " - based on selected samples" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "123e3468", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "from pathlib import Path\n", - "import pandas as pd" - ] - }, - { - "cell_type": "markdown", - "id": "6f7c2327", - "metadata": {}, - "source": [ - "## PARAMETERS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7e148b4a", - "metadata": {}, - "outputs": [], - "source": [ - "fn_id_old_new: str = 'data/rename/selected_old_new_id_mapping.csv' # selected samples with pride and original id\n", - "fn_raw_file_size: str = 'processed/all_raw_file_sizes.csv' # raw file sizes\n", - "fn_rawfile_metadata: str = 'data/rawfile_metadata.csv'\n", - "fn_summaries:str = 'data/processed/all_summaries.json'\n", - "date_col:str = 'Content Creation Date'\n", - "out_folder: str = 'data/dev_datasets/pride_upload'" - ] - }, - { - "cell_type": "markdown", - "id": "5ff07632", - "metadata": 
{}, - "source": [ - "## Prepare outputs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "14eedb3e", - "metadata": {}, - "outputs": [], - "source": [ - "out_folder = Path(out_folder)\n", - "out_folder.mkdir(exist_ok=True)\n", - "files_out = dict()" - ] - }, - { - "cell_type": "markdown", - "id": "9d43c889", - "metadata": {}, - "source": [ - "## ID mapping\n", - "\n", - "- artefact of local vs pride data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "12762362", - "metadata": {}, - "outputs": [], - "source": [ - "df_ids = pd.read_csv(fn_id_old_new, index_col=0)\n", - "df_ids" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d7b493f5", - "metadata": {}, - "outputs": [], - "source": [ - "df_ids.index.is_unique" - ] - }, - { - "cell_type": "markdown", - "id": "9defcf5a", - "metadata": {}, - "source": [ - "## Raw file sizes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fe25de60", - "metadata": {}, - "outputs": [], - "source": [ - "df_raw_file_size = pd.read_csv(fn_raw_file_size, index_col=0)\n", - "df_raw_file_size" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c08a9411", - "metadata": {}, - "outputs": [], - "source": [ - "df_raw_file_size.index.is_unique" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "36527734", - "metadata": {}, - "outputs": [], - "source": [ - "from pathlib import Path\n", - "df_raw_file_size['path'] = df_raw_file_size['path'].apply(lambda x: Path(x).as_posix())\n", - "df_raw_file_size = df_raw_file_size.reset_index().set_index('path')\n", - "df_raw_file_size" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a8b484c0", - "metadata": {}, - "outputs": [], - "source": [ - "df_raw_file_size = df_raw_file_size.loc[df_ids['Path_old'].str[2:].to_list()]\n", - "df_raw_file_size" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6f0ac8b4", - "metadata": {}, - 
"outputs": [], - "source": [ - "df_raw_file_size = df_raw_file_size.reset_index().set_index('name')" - ] - }, - { - "cell_type": "markdown", - "id": "6d63ff8a", - "metadata": {}, - "source": [ - "## Raw file metadata extracted from ThermoRawFileParser" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1fab8bf8", - "metadata": {}, - "outputs": [], - "source": [ - "df_meta = pd.read_csv(fn_rawfile_metadata, header=[0, 1], index_col=0, low_memory=False)\n", - "assert df_meta.index.is_unique\n", - "df_meta" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "476ecb97", - "metadata": {}, - "outputs": [], - "source": [ - "df_meta = df_meta.loc[df_ids.index]\n", - "df_meta.columns = df_meta.columns.droplevel() # remove top level name\n", - "df_meta" - ] - }, - { - "cell_type": "markdown", - "id": "a97c9046", - "metadata": {}, - "source": [ - "## Summary files from MaxQuant search" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "11a6f39c", - "metadata": {}, - "outputs": [], - "source": [ - "df_summaries = pd.read_json(fn_summaries, orient='index')\n", - "assert df_summaries.index.is_unique\n", - "df_summaries = df_summaries.loc[df_meta.index]\n", - "df_summaries" - ] - }, - { - "cell_type": "markdown", - "id": "934fca92", - "metadata": {}, - "source": [ - "# Combine data and dump" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "63e0cf03", - "metadata": {}, - "outputs": [], - "source": [ - "df_meta = (df_ids\n", - " .join(df_raw_file_size)\n", - " .join(df_meta)\n", - " .join(df_summaries)\n", - " )\n", - "df_meta" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "febfc785", - "metadata": {}, - "outputs": [], - "source": [ - "df_meta = df_meta.set_index('new_sample_id')\n", - "df_meta.index.name = 'Sample ID'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ee5caddf", - "metadata": {}, - "outputs": [], - "source": [ - "df_meta = 
(df_meta\n", - " .drop(['Path_old', 'Pathname', 'path'], axis=1)\n", - " .rename({'Path_new':'Pathname'}, axis=1)\n", - " .dropna(how='all', axis=1)\n", - " .convert_dtypes()\n", - " .assign(**{date_col: lambda df_meta: pd.to_datetime(df_meta[date_col])})\n", - ")\n", - "df_meta" - ] - }, - { - "cell_type": "markdown", - "id": "99fc9713", - "metadata": {}, - "source": [ - "Save curated data for dumped files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "717eb728", - "metadata": {}, - "outputs": [], - "source": [ - "fname = out_folder / 'pride_metadata.csv'\n", - "files_out[fname.name] = fname.as_posix()\n", - "df_meta.to_csv(fname)\n", - "\n", - "fname = out_folder / 'pride_metadata_schema.json'\n", - "files_out[fname.name] = fname.as_posix()\n", - "df_meta.dtypes.astype('string').to_json(fname)" - ] - }, - { - "cell_type": "markdown", - "id": "a68385fe", - "metadata": {}, - "source": [ - "# Analysis" - ] - }, - { - "cell_type": "markdown", - "id": "01760dca", - "metadata": {}, - "source": [ - "How to load dumped file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9a7c801e", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "dtypes = pd.read_json(\n", - " files_out['pride_metadata_schema.json'],\n", - " orient='index'\n", - " ).squeeze()\n", - "mask_dates = dtypes.str.contains('datetime') # date columns need to be provide separately\n", - "pd.read_csv(files_out['pride_metadata.csv'],\n", - " parse_dates=mask_dates.loc[mask_dates].index.to_list(),\n", - " dtype=dtypes.loc[~mask_dates].to_dict()\n", - ").dtypes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "566cf8b6", - "metadata": {}, - "outputs": [], - "source": [ - "files_out" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "67d001a5", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - 
"name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/project/00_3_0_pride_metadata_creation.py b/project/00_3_0_pride_metadata_creation.py deleted file mode 100644 index 17594453d..000000000 --- a/project/00_3_0_pride_metadata_creation.py +++ /dev/null @@ -1,166 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.14.5 -# kernelspec: -# display_name: Python 3 -# language: python -# name: python3 -# --- - -# %% [markdown] -# # Selected files -# -# - document metadata and file sizes of published dataset in Scientific Data Report -# -# ## Contents -# -# 1. Number of files per instrument -# 2. Rawfile sizes per instrument -# 3. peptide - rawfile map (protein group, precursor)? -# - based on selected samples - -# %% -from pathlib import Path -import pandas as pd - - -# %% [markdown] -# ## PARAMETERS - -# %% -fn_id_old_new: str = 'data/rename/selected_old_new_id_mapping.csv' # selected samples with pride and original id -fn_raw_file_size: str = 'processed/all_raw_file_sizes.csv' # raw file sizes -fn_rawfile_metadata: str = 'data/rawfile_metadata.csv' -fn_summaries:str = 'data/processed/all_summaries.json' -date_col:str = 'Content Creation Date' -out_folder: str = 'data/dev_datasets/pride_upload' - -# %% [markdown] -# ## Prepare outputs - -# %% -out_folder = Path(out_folder) -out_folder.mkdir(exist_ok=True) -files_out = dict() - -# %% [markdown] -# ## ID mapping -# -# - artefact of local vs pride data - -# %% -df_ids = pd.read_csv(fn_id_old_new, index_col=0) -df_ids - -# %% -df_ids.index.is_unique - -# %% [markdown] -# ## Raw file sizes - -# %% -df_raw_file_size = pd.read_csv(fn_raw_file_size, index_col=0) -df_raw_file_size - -# %% -df_raw_file_size.index.is_unique - -# %% -from pathlib import Path -df_raw_file_size['path'] = df_raw_file_size['path'].apply(lambda x: Path(x).as_posix()) -df_raw_file_size = 
df_raw_file_size.reset_index().set_index('path') -df_raw_file_size - -# %% -df_raw_file_size = df_raw_file_size.loc[df_ids['Path_old'].str[2:].to_list()] -df_raw_file_size - -# %% -df_raw_file_size = df_raw_file_size.reset_index().set_index('name') - -# %% [markdown] -# ## Raw file metadata extracted from ThermoRawFileParser - -# %% -df_meta = pd.read_csv(fn_rawfile_metadata, header=[0, 1], index_col=0, low_memory=False) -assert df_meta.index.is_unique -df_meta - -# %% -df_meta = df_meta.loc[df_ids.index] -df_meta.columns = df_meta.columns.droplevel() # remove top level name -df_meta - -# %% [markdown] -# ## Summary files from MaxQuant search - -# %% -df_summaries = pd.read_json(fn_summaries, orient='index') -assert df_summaries.index.is_unique -df_summaries = df_summaries.loc[df_meta.index] -df_summaries - -# %% [markdown] -# # Combine data and dump - -# %% -df_meta = (df_ids - .join(df_raw_file_size) - .join(df_meta) - .join(df_summaries) - ) -df_meta - -# %% -df_meta = df_meta.set_index('new_sample_id') -df_meta.index.name = 'Sample ID' - -# %% -df_meta = (df_meta - .drop(['Path_old', 'Pathname', 'path'], axis=1) - .rename({'Path_new':'Pathname'}, axis=1) - .dropna(how='all', axis=1) - .convert_dtypes() - .assign(**{date_col: lambda df_meta: pd.to_datetime(df_meta[date_col])}) -) -df_meta - -# %% [markdown] -# Save curated data for dumped files - -# %% -fname = out_folder / 'pride_metadata.csv' -files_out[fname.name] = fname.as_posix() -df_meta.to_csv(fname) - -fname = out_folder / 'pride_metadata_schema.json' -files_out[fname.name] = fname.as_posix() -df_meta.dtypes.astype('string').to_json(fname) - -# %% [markdown] -# # Analysis - -# %% [markdown] -# How to load dumped file - -# %% -dtypes = pd.read_json( - files_out['pride_metadata_schema.json'], - orient='index' - ).squeeze() -mask_dates = dtypes.str.contains('datetime') # date columns need to be provide separately -pd.read_csv(files_out['pride_metadata.csv'], - 
parse_dates=mask_dates.loc[mask_dates].index.to_list(), - dtype=dtypes.loc[~mask_dates].to_dict() -).dtypes - - -# %% -files_out - -# %% diff --git a/project/00_3_1_pride_metadata_analysis.ipynb b/project/00_3_1_pride_metadata_analysis.ipynb deleted file mode 100644 index 75f62cdf8..000000000 --- a/project/00_3_1_pride_metadata_analysis.ipynb +++ /dev/null @@ -1,528 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "d9988d7d", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "58232092", - "metadata": {}, - "source": [ - "## Output Excel for Analysis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fba8c071", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "from pathlib import Path\n", - "import matplotlib.pyplot as plt\n", - "import pandas as pd\n", - "import seaborn\n", - "\n", - "\n", - "from vaep.io import thermo_raw_files\n", - "import vaep.pandas\n", - "\n", - "plt.rcParams['figure.figsize'] = [4, 3]\n", - "vaep.plotting.make_large_descriptors(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "af37ae56", - "metadata": { - "incorrectly_encoded_metadata": "[tags=[\"parameters\"]]" - }, - "outputs": [], - "source": [ - "fn_meta = 'data/pride_metadata.csv'\n", - "date_col: str = 'Content Creation Date'\n", - "out_folder: str = 'data/dev_datasets/pride_upload'" - ] - }, - { - "cell_type": "markdown", - "id": "e2dc3abc", - "metadata": {}, - "source": [ - "## Prepare outputs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "53d6f5dd", - "metadata": {}, - "outputs": [], - "source": [ - "out_folder = Path(out_folder)\n", - "out_folder.mkdir(exist_ok=True)\n", - "files_out = dict()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dc84eb5d", - "metadata": {}, - "outputs": [], - "source": [ - "df_meta = pd.read_csv(fn_meta, index_col=0)\n", - "df_meta" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "id": "ce35f226", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "df_meta['instrument_label'] = (\n", - " df_meta[\"Thermo Scientific instrument model\"].str.replace(' ', '-')\n", - " + '_'\n", - " + df_meta[\"instrument serial number\"].str.split('#').str[-1]\n", - ")\n", - "\n", - "# {k: k.replace('-Orbitrap_', ' ').replace('-', ' ').replace('_', ' ')\n", - "# for k in df_meta['instrument_label'].unique()}\n", - "# further small changes applied manually\n", - "# based on https://www.ebi.ac.uk/ols4/\n", - "#\n", - "# Q Exactive HF-X MS:1002877\n", - "# Q Exactive HF MS:1002523\n", - "# Orbitrap Exploris 480 MS:1003028\n", - "# Exactive Plus MS:1002526\n", - "# Q Exactive MS:1001911\n", - "# Orbitrap Fusion Lumos MS:1002732\n", - "\n", - "instrument_labels = {'Q-Exactive-Orbitrap_1': 'Q Exactive 1',\n", - " 'Q-Exactive-Plus-Orbitrap_1': 'Exactive Plus 1',\n", - " 'Q-Exactive-HF-Orbitrap_206': 'Q Exactive HF 206',\n", - " 'Q-Exactive-Plus-Orbitrap_143': 'Exactive Plus 143',\n", - " 'Q-Exactive-HF-Orbitrap_1': 'Q Exactive HF 1',\n", - " 'Q-Exactive-HF-Orbitrap_147': 'Q Exactive HF 147',\n", - " 'Q-Exactive-HF-Orbitrap_204': 'Q Exactive HF 204',\n", - " 'Q-Exactive-HF-Orbitrap_148': 'Q Exactive HF 148',\n", - " 'Q-Exactive-HF-Orbitrap_207': 'Q Exactive HF 207',\n", - " 'Q-Exactive-HF-Orbitrap_143': 'Q Exactive HF 143',\n", - " 'Orbitrap-Fusion-Lumos_FSN20115': 'Orbitrap Fusion Lumos FSN20115',\n", - " 'Q-Exactive-HF-Orbitrap_2612': 'Q Exactive HF 2612',\n", - " 'Q-Exactive-HF-X-Orbitrap_6016': 'Q Exactive HF-X 6016',\n", - " 'Q-Exactive-HF-X-Orbitrap_6004': 'Q Exactive HF-X 6004',\n", - " 'Q-Exactive-HF-X-Orbitrap_6075': 'Q Exactive HF-X 6075',\n", - " 'Q-Exactive-HF-X-Orbitrap_6078': 'Q Exactive HF-X 6078',\n", - " 'Q-Exactive-HF-X-Orbitrap_6070': 'Q Exactive HF-X 6070',\n", - " 'Q-Exactive-HF-X-Orbitrap_6071': 'Q Exactive HF-X 6071',\n", - " 'Q-Exactive-HF-X-Orbitrap_6011': 'Q Exactive HF-X 6011',\n", - " 
'Q-Exactive-HF-X-Orbitrap_6073': 'Q Exactive HF-X 6073',\n", - " 'Q-Exactive-HF-X-Orbitrap_6101': 'Q Exactive HF-X 6101',\n", - " 'Q-Exactive-HF-X-Orbitrap_6096': 'Q Exactive HF-X 6096',\n", - " 'Exactive-Series-Orbitrap_6004': 'Exactive Series 6004',\n", - " 'Q-Exactive-HF-X-Orbitrap_6043': 'Q Exactive HF-X 6043',\n", - " 'Q-Exactive-HF-X-Orbitrap_6025': 'Q Exactive HF-X 6025',\n", - " 'Q-Exactive-HF-X-Orbitrap_6022': 'Q Exactive HF-X 6022',\n", - " 'Q-Exactive-HF-X-Orbitrap_6023': 'Q Exactive HF-X 6023',\n", - " 'Q-Exactive-HF-X-Orbitrap_6028': 'Q Exactive HF-X 6028',\n", - " 'Q-Exactive-HF-X-Orbitrap_6013': 'Q Exactive HF-X 6013',\n", - " 'Q-Exactive-HF-X-Orbitrap_6044': 'Q Exactive HF-X 6044',\n", - " 'Q-Exactive-HF-X-Orbitrap_6324': 'Q Exactive HF-X 6324',\n", - " 'Orbitrap-Exploris-480_Invalid_SN_0001': 'Orbitrap Exploris 480 Invalid SN 0001',\n", - " 'Orbitrap-Exploris-480_MA10134C': 'Orbitrap Exploris 480 MA10134C',\n", - " 'Orbitrap-Exploris-480_MA10132C': 'Orbitrap Exploris 480 MA10132C',\n", - " 'Orbitrap-Exploris-480_MA10130C': 'Orbitrap Exploris 480 MA10130C',\n", - " 'Orbitrap-Exploris-480_MA10215C': 'Orbitrap Exploris 480 MA10215C'}\n", - "\n", - "df_meta[\"instrument_label\"] = df_meta[\"instrument_label\"].replace(instrument_labels)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5a6e9cd8", - "metadata": {}, - "outputs": [], - "source": [ - "writer_args = dict(float_format='%.3f')\n", - "fname = out_folder / 'pride_data_infos.xlsx'\n", - "files_out[fname.name] = fname.as_posix()\n", - "excel_writer = pd.ExcelWriter(fname)" - ] - }, - { - "cell_type": "markdown", - "id": "afcf4bf8", - "metadata": {}, - "source": [ - "## Varying data between runs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "17db54de", - "metadata": {}, - "outputs": [], - "source": [ - "meta_stats = df_meta.describe(include='all', datetime_is_numeric=True)\n", - "meta_stats.T.to_excel(excel_writer, sheet_name='des_stats', 
**writer_args)\n", - "\n", - "view = meta_stats.loc[:, (meta_stats.loc['unique'] > 1)\n", - " | (meta_stats.loc['std'] > 0.01)].T\n", - "view.to_excel(excel_writer, sheet_name='des_stats_varying', **writer_args)" - ] - }, - { - "cell_type": "markdown", - "id": "fec4afa2", - "metadata": {}, - "source": [ - "## Instruments in selection" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "efce5cbd", - "metadata": {}, - "outputs": [], - "source": [ - "thermo_raw_files.cols_instrument" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "577be1d5", - "metadata": {}, - "outputs": [], - "source": [ - "df_meta[date_col] = pd.to_datetime(df_meta[date_col])\n", - "\n", - "counts_instrument = (df_meta\n", - " .groupby(thermo_raw_files.cols_instrument)[date_col]\n", - " .agg(['count', 'min', 'max'])\n", - " .sort_values(by=thermo_raw_files.cols_instrument[:2] + ['count'], ascending=False))\n", - "\n", - "counts_instrument = counts_instrument.join(\n", - " (df_meta\n", - " [[*thermo_raw_files.cols_instrument, 'instrument_label']]\n", - " .drop_duplicates()\n", - " .set_index(thermo_raw_files.cols_instrument)\n", - " )\n", - " .set_index('instrument_label', append=True)\n", - ")\n", - "counts_instrument.to_excel(\n", - " excel_writer, sheet_name='instruments', **writer_args)\n", - "counts_instrument" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "087c2c08", - "metadata": {}, - "outputs": [], - "source": [ - "top10_instruments = counts_instrument['count'].nlargest(10)\n", - "top10_instruments" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7aceab19", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "mask_top10_instruments = (df_meta[thermo_raw_files.cols_instrument]\n", - " .apply(\n", - " lambda x: tuple(x) in top10_instruments.index, axis=1))\n", - "assert mask_top10_instruments.sum() == top10_instruments.sum()" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "74377ab0", - "metadata": {}, - "outputs": [], - "source": [ - "# counts_instrument = (df_meta\n", - "# .groupby(['instrument_label'])[date_col]\n", - "# .agg(['count', 'min', 'max'])\n", - "# .sort_values('count', ascending=False)\n", - "# )\n", - "counts_instrument = (counts_instrument\n", - " .reset_index()\n", - " .set_index('instrument_label')\n", - " ['count']\n", - " .sort_values(ascending=False)\n", - " )\n", - "counts_instrument" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2d48b585", - "metadata": {}, - "outputs": [], - "source": [ - "fig, ax = plt.subplots()\n", - "ax = (counts_instrument\n", - " .plot\n", - " .bar(\n", - " ax=ax,\n", - " )\n", - ")\n", - "ax.set_xlabel('')\n", - "ax.set_ylabel('number of samples (runs)')\n", - "fname = out_folder / 'number_of_samples_per_instrument.pdf'\n", - "files_out[fname.name] = fname.as_posix()\n", - "vaep.savefig(fig, fname)" - ] - }, - { - "cell_type": "markdown", - "id": "f44798a4", - "metadata": {}, - "source": [ - "## File size and number of identifications" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "254018bd", - "metadata": {}, - "outputs": [], - "source": [ - "cols = ['Peptide Sequences Identified', 'size_gb']\n", - "\n", - "mask = ((df_meta[cols[0]] < 20_000) & (df_meta[cols[1]] > 3.5)\n", - " | (df_meta[cols[1]] > 5)\n", - " )\n", - "\n", - "cols = ['Peptide Sequences Identified', 'size_gb']\n", - "ax = (df_meta\n", - " .loc[~mask, cols]\n", - " .plot\n", - " .scatter(cols[0], cols[1],\n", - " label='large files',\n", - " s=2,\n", - " )\n", - " )\n", - "ax = (df_meta\n", - " .loc[mask, cols]\n", - " .plot\n", - " .scatter(cols[0], cols[1],\n", - " color='orange',\n", - " label='normal files',\n", - " ylabel='filesize (in GB)',\n", - " ax=ax,\n", - " s=2,\n", - " )\n", - " )\n", - "ax.xaxis.set_major_formatter(\"{x:,.0f}\")\n", - "fname = out_folder / 'filesize_vs_iden_peptides.pdf'\n", - "files_out[fname.name] = 
fname.as_posix()\n", - "vaep.savefig(ax.get_figure(), fname)\n", - "\n", - "\n", - "view = df_meta.loc[mask].sort_values(by=cols)\n", - "view.to_excel(excel_writer, sheet_name='instrument_outliers', **writer_args)\n", - "view" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c718ce51", - "metadata": {}, - "outputs": [], - "source": [ - "cols = ['Number of MS1 spectra', 'Number of MS2 spectra',\n", - " 'Peptide Sequences Identified']\n", - "cols = vaep.pandas.get_columns_accessor_from_iterable(cols)\n", - "\n", - "view = df_meta.loc[mask_top10_instruments]\n", - "view[\"instrument_label+N\"] = view[\"instrument_label\"].replace(counts_instrument.to_frame().apply( lambda s: f\"{s.name} (N={s['count']:03d})\" , axis=1))\n", - "view" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f55d77d2", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "fig, ax = plt.subplots()\n", - "\n", - "ax = seaborn.scatterplot(view,\n", - " x=cols.Number_of_MS1_spectra,\n", - " y=cols.Number_of_MS2_spectra,\n", - " hue='instrument_label+N',\n", - " legend='brief',\n", - " ax=ax,\n", - " s=5,\n", - " palette='deep')\n", - "_ = ax.legend(fontsize=5,\n", - " title_fontsize=5,\n", - " markerscale=0.4,\n", - " title='instrument label',\n", - " loc='upper right',\n", - " # alignment='left',\n", - ")\n", - "ax.xaxis.set_major_formatter(\"{x:,.0f}\")\n", - "ax.yaxis.set_major_formatter(\"{x:,.0f}\")\n", - "fname = out_folder / 'ms1_to_ms2_top10_instruments.pdf'\n", - "files_out[fname.name] = fname.as_posix()\n", - "vaep.savefig(fig, fname)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e8873c50", - "metadata": {}, - "outputs": [], - "source": [ - "fig, ax = plt.subplots()\n", - "ax = view.plot.scatter(x=cols.Peptide_Sequences_Identified,\n", - " y=cols.Number_of_MS1_spectra,\n", - " label=cols.Number_of_MS1_spectra,\n", - " s=2,\n", - " c='green',\n", - " ax=ax)\n", - "ax = 
view.plot.scatter(x=cols.Peptide_Sequences_Identified,\n", - " y=cols.Number_of_MS2_spectra,\n", - " label=cols.Number_of_MS2_spectra,\n", - " ylabel='# spectra',\n", - " s=2,\n", - " ax=ax)\n", - "fname = out_folder / 'ms1_vs_ms2.pdf'\n", - "ax.xaxis.set_major_formatter(\"{x:,.0f}\")\n", - "ax.yaxis.set_major_formatter(\"{x:,.0f}\")\n", - "files_out[fname.name] = fname.as_posix()\n", - "vaep.savefig(fig, fname)" - ] - }, - { - "cell_type": "markdown", - "id": "455debba", - "metadata": {}, - "source": [ - "## run length to number of identified peptides" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5d50ea32", - "metadata": {}, - "outputs": [], - "source": [ - "df_meta.filter(like='RT', axis=1).describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2ef17002", - "metadata": {}, - "outputs": [], - "source": [ - "cols = ['MS max RT',\n", - " 'Peptide Sequences Identified']\n", - "cols = vaep.pandas.get_columns_accessor_from_iterable(cols)\n", - "\n", - "fig, ax = plt.subplots()\n", - "\n", - "ax = ax = seaborn.scatterplot(\n", - " view,\n", - " x=cols.MS_max_RT,\n", - " y=cols.Peptide_Sequences_Identified,\n", - " hue='instrument_label+N',\n", - " legend='brief',\n", - " ax=ax,\n", - " s=5,\n", - " palette='deep')\n", - "_ = ax.legend(fontsize=5,\n", - " title_fontsize=5,\n", - " markerscale=0.4,\n", - " title='instrument label',\n", - " )\n", - "ax.yaxis.set_major_formatter(\"{x:,.0f}\")\n", - "fname = out_folder / 'RT_vs_identified_peptides_top10_instruments.pdf'\n", - "files_out[fname.name] = fname.as_posix()\n", - "vaep.savefig(ax.get_figure(), fname)" - ] - }, - { - "cell_type": "markdown", - "id": "e51b090d", - "metadata": {}, - "source": [ - "## Outputs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "219fb5c9", - "metadata": {}, - "outputs": [], - "source": [ - "excel_writer.close()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4eaaeba3", - "metadata": { - 
"lines_to_next_cell": 0 - }, - "outputs": [], - "source": [ - "files_out" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2c79fa00", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "jupytext": { - "cell_metadata_filter": "incorrectly_encoded_metadata,-all", - "main_language": "python", - "notebook_metadata_filter": "-all" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/project/00_3_1_pride_metadata_analysis.py b/project/00_3_1_pride_metadata_analysis.py deleted file mode 100644 index 2ec9a8fae..000000000 --- a/project/00_3_1_pride_metadata_analysis.py +++ /dev/null @@ -1,312 +0,0 @@ -# %% - -# %% [markdown] -# ## Output Excel for Analysis - -# %% -from pathlib import Path -import matplotlib.pyplot as plt -import pandas as pd -import seaborn - - -from vaep.io import thermo_raw_files -import vaep.pandas - -plt.rcParams['figure.figsize'] = [4, 3] -vaep.plotting.make_large_descriptors(5) - - -# %% [tags=["parameters"]] -fn_meta = 'data/pride_metadata.csv' -date_col: str = 'Content Creation Date' -out_folder: str = 'data/dev_datasets/pride_upload' - -# %% [markdown] -# ## Prepare outputs - -# %% -out_folder = Path(out_folder) -out_folder.mkdir(exist_ok=True) -files_out = dict() - -# %% -df_meta = pd.read_csv(fn_meta, index_col=0) -df_meta - -# %% - -df_meta['instrument_label'] = ( - df_meta["Thermo Scientific instrument model"].str.replace(' ', '-') - + '_' - + df_meta["instrument serial number"].str.split('#').str[-1] -) - -# {k: k.replace('-Orbitrap_', ' ').replace('-', ' ').replace('_', ' ') -# for k in df_meta['instrument_label'].unique()} -# further small changes applied manually -# based on https://www.ebi.ac.uk/ols4/ -# -# Q Exactive HF-X MS:1002877 -# Q Exactive HF MS:1002523 -# Orbitrap Exploris 480 MS:1003028 -# Exactive Plus MS:1002526 -# Q Exactive MS:1001911 -# Orbitrap Fusion Lumos MS:1002732 - -instrument_labels = {'Q-Exactive-Orbitrap_1': 'Q Exactive 1', - 'Q-Exactive-Plus-Orbitrap_1': 
'Exactive Plus 1', - 'Q-Exactive-HF-Orbitrap_206': 'Q Exactive HF 206', - 'Q-Exactive-Plus-Orbitrap_143': 'Exactive Plus 143', - 'Q-Exactive-HF-Orbitrap_1': 'Q Exactive HF 1', - 'Q-Exactive-HF-Orbitrap_147': 'Q Exactive HF 147', - 'Q-Exactive-HF-Orbitrap_204': 'Q Exactive HF 204', - 'Q-Exactive-HF-Orbitrap_148': 'Q Exactive HF 148', - 'Q-Exactive-HF-Orbitrap_207': 'Q Exactive HF 207', - 'Q-Exactive-HF-Orbitrap_143': 'Q Exactive HF 143', - 'Orbitrap-Fusion-Lumos_FSN20115': 'Orbitrap Fusion Lumos FSN20115', - 'Q-Exactive-HF-Orbitrap_2612': 'Q Exactive HF 2612', - 'Q-Exactive-HF-X-Orbitrap_6016': 'Q Exactive HF-X 6016', - 'Q-Exactive-HF-X-Orbitrap_6004': 'Q Exactive HF-X 6004', - 'Q-Exactive-HF-X-Orbitrap_6075': 'Q Exactive HF-X 6075', - 'Q-Exactive-HF-X-Orbitrap_6078': 'Q Exactive HF-X 6078', - 'Q-Exactive-HF-X-Orbitrap_6070': 'Q Exactive HF-X 6070', - 'Q-Exactive-HF-X-Orbitrap_6071': 'Q Exactive HF-X 6071', - 'Q-Exactive-HF-X-Orbitrap_6011': 'Q Exactive HF-X 6011', - 'Q-Exactive-HF-X-Orbitrap_6073': 'Q Exactive HF-X 6073', - 'Q-Exactive-HF-X-Orbitrap_6101': 'Q Exactive HF-X 6101', - 'Q-Exactive-HF-X-Orbitrap_6096': 'Q Exactive HF-X 6096', - 'Exactive-Series-Orbitrap_6004': 'Exactive Series 6004', - 'Q-Exactive-HF-X-Orbitrap_6043': 'Q Exactive HF-X 6043', - 'Q-Exactive-HF-X-Orbitrap_6025': 'Q Exactive HF-X 6025', - 'Q-Exactive-HF-X-Orbitrap_6022': 'Q Exactive HF-X 6022', - 'Q-Exactive-HF-X-Orbitrap_6023': 'Q Exactive HF-X 6023', - 'Q-Exactive-HF-X-Orbitrap_6028': 'Q Exactive HF-X 6028', - 'Q-Exactive-HF-X-Orbitrap_6013': 'Q Exactive HF-X 6013', - 'Q-Exactive-HF-X-Orbitrap_6044': 'Q Exactive HF-X 6044', - 'Q-Exactive-HF-X-Orbitrap_6324': 'Q Exactive HF-X 6324', - 'Orbitrap-Exploris-480_Invalid_SN_0001': 'Orbitrap Exploris 480 Invalid SN 0001', - 'Orbitrap-Exploris-480_MA10134C': 'Orbitrap Exploris 480 MA10134C', - 'Orbitrap-Exploris-480_MA10132C': 'Orbitrap Exploris 480 MA10132C', - 'Orbitrap-Exploris-480_MA10130C': 'Orbitrap Exploris 480 MA10130C', - 
'Orbitrap-Exploris-480_MA10215C': 'Orbitrap Exploris 480 MA10215C'} - -df_meta["instrument_label"] = df_meta["instrument_label"].replace(instrument_labels) - -# %% -writer_args = dict(float_format='%.3f') -fname = out_folder / 'pride_data_infos.xlsx' -files_out[fname.name] = fname.as_posix() -excel_writer = pd.ExcelWriter(fname) - -# %% [markdown] -# ## Varying data between runs - -# %% -meta_stats = df_meta.describe(include='all', datetime_is_numeric=True) -meta_stats.T.to_excel(excel_writer, sheet_name='des_stats', **writer_args) - -view = meta_stats.loc[:, (meta_stats.loc['unique'] > 1) - | (meta_stats.loc['std'] > 0.01)].T -view.to_excel(excel_writer, sheet_name='des_stats_varying', **writer_args) - -# %% [markdown] -# ## Instruments in selection - -# %% -thermo_raw_files.cols_instrument - -# %% -df_meta[date_col] = pd.to_datetime(df_meta[date_col]) - -counts_instrument = (df_meta - .groupby(thermo_raw_files.cols_instrument)[date_col] - .agg(['count', 'min', 'max']) - .sort_values(by=thermo_raw_files.cols_instrument[:2] + ['count'], ascending=False)) - -counts_instrument = counts_instrument.join( - (df_meta - [[*thermo_raw_files.cols_instrument, 'instrument_label']] - .drop_duplicates() - .set_index(thermo_raw_files.cols_instrument) - ) - .set_index('instrument_label', append=True) -) -counts_instrument.to_excel( - excel_writer, sheet_name='instruments', **writer_args) -counts_instrument - -# %% -top10_instruments = counts_instrument['count'].nlargest(10) -top10_instruments - -# %% -mask_top10_instruments = (df_meta[thermo_raw_files.cols_instrument] - .apply( - lambda x: tuple(x) in top10_instruments.index, axis=1)) -assert mask_top10_instruments.sum() == top10_instruments.sum() - - -# %% -# counts_instrument = (df_meta -# .groupby(['instrument_label'])[date_col] -# .agg(['count', 'min', 'max']) -# .sort_values('count', ascending=False) -# ) -counts_instrument = (counts_instrument - .reset_index() - .set_index('instrument_label') - ['count'] - 
.sort_values(ascending=False) - ) -counts_instrument - -# %% -fig, ax = plt.subplots() -ax = (counts_instrument - .plot - .bar( - ax=ax, - ) -) -ax.set_xlabel('') -ax.set_ylabel('number of samples (runs)') -fname = out_folder / 'number_of_samples_per_instrument.pdf' -files_out[fname.name] = fname.as_posix() -vaep.savefig(fig, fname) - -# %% [markdown] -# ## File size and number of identifications - -# %% -cols = ['Peptide Sequences Identified', 'size_gb'] - -mask = ((df_meta[cols[0]] < 20_000) & (df_meta[cols[1]] > 3.5) - | (df_meta[cols[1]] > 5) - ) - -cols = ['Peptide Sequences Identified', 'size_gb'] -ax = (df_meta - .loc[~mask, cols] - .plot - .scatter(cols[0], cols[1], - label='large files', - s=2, - ) - ) -ax = (df_meta - .loc[mask, cols] - .plot - .scatter(cols[0], cols[1], - color='orange', - label='normal files', - ylabel='filesize (in GB)', - ax=ax, - s=2, - ) - ) -ax.xaxis.set_major_formatter("{x:,.0f}") -fname = out_folder / 'filesize_vs_iden_peptides.pdf' -files_out[fname.name] = fname.as_posix() -vaep.savefig(ax.get_figure(), fname) - - -view = df_meta.loc[mask].sort_values(by=cols) -view.to_excel(excel_writer, sheet_name='instrument_outliers', **writer_args) -view - -# %% -cols = ['Number of MS1 spectra', 'Number of MS2 spectra', - 'Peptide Sequences Identified'] -cols = vaep.pandas.get_columns_accessor_from_iterable(cols) - -view = df_meta.loc[mask_top10_instruments] -view["instrument_label+N"] = view["instrument_label"].replace(counts_instrument.to_frame().apply( lambda s: f"{s.name} (N={s['count']:03d})" , axis=1)) -view - -# %% -fig, ax = plt.subplots() - -ax = seaborn.scatterplot(view, - x=cols.Number_of_MS1_spectra, - y=cols.Number_of_MS2_spectra, - hue='instrument_label+N', - legend='brief', - ax=ax, - s=5, - palette='deep') -_ = ax.legend(fontsize=5, - title_fontsize=5, - markerscale=0.4, - title='instrument label', - loc='upper right', - # alignment='left', -) -ax.xaxis.set_major_formatter("{x:,.0f}") 
-ax.yaxis.set_major_formatter("{x:,.0f}") -fname = out_folder / 'ms1_to_ms2_top10_instruments.pdf' -files_out[fname.name] = fname.as_posix() -vaep.savefig(fig, fname) - - -# %% -fig, ax = plt.subplots() -ax = view.plot.scatter(x=cols.Peptide_Sequences_Identified, - y=cols.Number_of_MS1_spectra, - label=cols.Number_of_MS1_spectra, - s=2, - c='green', - ax=ax) -ax = view.plot.scatter(x=cols.Peptide_Sequences_Identified, - y=cols.Number_of_MS2_spectra, - label=cols.Number_of_MS2_spectra, - ylabel='# spectra', - s=2, - ax=ax) -fname = out_folder / 'ms1_vs_ms2.pdf' -ax.xaxis.set_major_formatter("{x:,.0f}") -ax.yaxis.set_major_formatter("{x:,.0f}") -files_out[fname.name] = fname.as_posix() -vaep.savefig(fig, fname) - -# %% [markdown] -# ## run length to number of identified peptides - -# %% -df_meta.filter(like='RT', axis=1).describe() - -# %% -cols = ['MS max RT', - 'Peptide Sequences Identified'] -cols = vaep.pandas.get_columns_accessor_from_iterable(cols) - -fig, ax = plt.subplots() - -ax = seaborn.scatterplot( - view, - x=cols.MS_max_RT, - y=cols.Peptide_Sequences_Identified, - hue='instrument_label+N', - legend='brief', - ax=ax, - s=5, - palette='deep') -_ = ax.legend(fontsize=5, - title_fontsize=5, - markerscale=0.4, - title='instrument label', - ) -ax.yaxis.set_major_formatter("{x:,.0f}") -fname = out_folder / 'RT_vs_identified_peptides_top10_instruments.pdf' -files_out[fname.name] = fname.as_posix() -vaep.savefig(ax.get_figure(), fname) - -# %% [markdown] -# ## Outputs - -# %% -excel_writer.close() - -# %% -files_out -# %% diff --git a/project/00_4_development_dataset_support.py b/project/00_4_development_dataset_support.py deleted file mode 100644 index c6f34a9dd..000000000 --- a/project/00_4_development_dataset_support.py +++ /dev/null @@ -1,50 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.14.5 -# kernelspec: -# display_name: vaep -# language: python -# 
name: python3 -# --- - -# %% [markdown] -# # Support of dumped data - -# %% -import numpy as np -import pandas as pd -import plotly.express as px - -import vaep # set formatting defaults - -# %% [markdown] -# ## Parameters - -# %% tags=["parameters"] -support_json: str = 'data\dev_datasets\df_intensities_proteinGroups_long\Q_Exactive_HF_X_Orbitrap_6070_support.json' # Path to json support file - -# %% [markdown] -# ## Completeness of samples - -# %% -support = pd.read_json(support_json, typ='series').sort_values().to_frame('no. of features') -support.head() - -# %% -support.describe(percentiles=np.linspace(0.1,1,10)) - -# %% -ax = support.plot(rot=90, figsize=(20,10), legend=False) -ax.set_ylabel('number of features') -ax.yaxis.set_major_formatter("{x:,.0f}") - -# %% -px.line(support, height=1000) - -# %% [markdown] -# The one with very few identification are mainly fractions of entire samples diff --git a/project/00_4_hela_development_dataset_splitting.ipynb b/project/00_4_hela_development_dataset_splitting.ipynb deleted file mode 100644 index 2a1a337cf..000000000 --- a/project/00_4_hela_development_dataset_splitting.ipynb +++ /dev/null @@ -1,828 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Split up data into single datasets\n", - "\n", - "- create datasets per (set of) instruments for a specific experiments\n", - "- drop some samples based on quality criteria" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pathlib import Path\n", - "\n", - "import numpy as np\n", - "import pandas as pd\n", - "import matplotlib\n", - "import matplotlib.pyplot as plt\n", - "import matplotlib.dates\n", - "import seaborn as sns\n", - "\n", - "import umap\n", - "\n", - "from vaep.io import thermo_raw_files\n", - "from vaep.analyzers import analyzers\n", - "\n", - "from config import erda_dumps\n", - "from config import defaults\n", - "\n", - "import vaep\n", - 
"import vaep.io.filenames\n", - "from vaep.logging import setup_nb_logger\n", - "\n", - "logger = setup_nb_logger()\n", - "\n", - "FOLDER_DATA = defaults.FOLDER_DATA" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vaep.plotting.make_large_descriptors()\n", - "FIGSIZE = (15, 10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Parameters" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "N_MIN_INSTRUMENT = 300\n", - "META_DATA: str = 'data/files_selected_metadata.csv'\n", - "FILE_EXT = 'pkl' # 'csv' or 'pkl'\n", - "SAMPLE_ID = 'Sample ID'\n", - "\n", - "DUMP: str = erda_dumps.FN_PROTEIN_GROUPS # Filepath to erda dump\n", - "OUT_NAME = 'protein group' # for legends labels\n", - "# DUMP: str = erda_dumps.FN_PEPTIDES\n", - "# OUT_NAME = 'peptide' # for legends labels\n", - "# DUMP: str = erda_dumps.FN_EVIDENCE\n", - "# OUT_NAME = 'precursor' # for legends labels\n", - "\n", - "FOLDER_DATASETS: str = f'dev_datasets/{DUMP.stem}'\n", - "\n", - "INSTRUMENT_LEGEND_TITLE = 'Q Exactive HF-X Orbitrap'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# FILE_EXT = 'csv'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Make sure output folder exists" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "DUMP = Path(DUMP) # set parameter from cli or yaml to Path\n", - "FOLDER_DATASETS = defaults.FOLDER_DATA / FOLDER_DATASETS\n", - "FOLDER_DATASETS.mkdir(exist_ok=True, parents=True)\n", - "logger.info(f\"Folder for datasets to be created: {FOLDER_DATASETS.absolute()}\")\n", - "\n", - "files_out = dict()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Dumps" - ] - }, - { - "cell_type": "markdown", - 
"metadata": {}, - "source": [ - "- load dumps\n", - "- load file to machine mappings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data = pd.read_pickle(DUMP)\n", - "data = data.squeeze() # In case it is a DataFrame, not a series (-> leads to MultiIndex)\n", - "# name_data = data.name\n", - "logger.info(\n", - " f\"Number of rows (row = sample, feature, intensity): {len(data):,d}\")\n", - "data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Make categorical index a normal string index (this lead to problems when selecting data using `loc` and grouping data as level of data could not easily be removed from MultiIndex)\n", - "\n", - "- see [blog](https://towardsdatascience.com/staying-sane-while-adopting-pandas-categorical-datatypes-78dbd19dcd8a)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# index_columns = data.index.names\n", - "# data = data.reset_index()\n", - "# print(data.memory_usage(deep=True))\n", - "# cat_columns = data.columns[data.dtypes == 'category']\n", - "# if not cat_columns.empty:\n", - "# data[cat_columns] = data[cat_columns].astype('object')\n", - "# print(\"non categorical: \\n\", data.memory_usage(deep=True))\n", - "# logger.warning(\n", - "# \"if time allows, this should be investigate -> use of loc with data which is not categorical\")\n", - "# data = data.set_index(index_columns)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# feat_name = list(data.index.names)\n", - "# feat_name.remove(SAMPLE_ID)\n", - "feat_name = (OUT_NAME,)\n", - "feat_name # index name(s) which are not the sample index" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# M = len(data.index.levels[-1])\n", - "N, M = data.shape\n", - "logger.info(f\"Number of unqiue 
features: {M}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Filter data by metadata" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# sample_ids = data.index.levels[0] # assume first index position is Sample ID?\n", - "sample_ids = data.index.unique() #.get_level_values(SAMPLE_ID).unique() # more explict\n", - "sample_ids" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "lines_to_next_cell": 0 - }, - "source": [ - "### Meta Data\n", - "\n", - "- based on ThermoRawFileParser" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_meta = pd.read_csv(META_DATA, index_col=SAMPLE_ID)\n", - "date_col = 'Content Creation Date'\n", - "df_meta[date_col] = pd.to_datetime(df_meta[date_col])\n", - "df_meta = df_meta.loc[sample_ids]\n", - "df_meta" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "lines_to_next_cell": 0 - }, - "source": [ - "## Rename samples\n", - "- to \"YEAR_MONTH_DAY_HOUR_MIN_INSTRUMENT\" (no encoding of information intended)\n", - "- check that instrument names are unique\n", - "- drop metadata (entire)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "lines_to_next_cell": 0 - }, - "outputs": [], - "source": [ - "idx_all = (pd.to_datetime(df_meta[\"Content Creation Date\"]).dt.strftime(\"%Y_%m_%d_%H_%M\")\n", - " + '_'\n", - " + df_meta[\"Thermo Scientific instrument model\"].str.replace(' ', '-')\n", - " + '_'\n", - " + df_meta[\"instrument serial number\"].str.split('#').str[-1])\n", - "\n", - "mask = idx_all.duplicated(keep=False)\n", - "duplicated_sample_idx = idx_all.loc[mask].sort_values() # duplicated dumps\n", - "duplicated_sample_idx\n", - "\n", - "#" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_duplicates = data.loc[duplicated_sample_idx.index] 
#.unstack()\n", - "# data_duplicates.T.corr() # same samples are have corr. of 1\n", - "data_duplicates.sum(axis=1) # keep only one seems okay" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "idx_unique = idx_all.drop_duplicates()\n", - "idx_unique" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_meta = df_meta.loc[idx_unique.index].rename(idx_unique)\n", - "df_meta" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# data = data.unstack(feat_name) # needed later anyways\n", - "data = data.loc[idx_unique.index].rename(idx_unique)\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "meta_to_drop = ['Pathname']\n", - "fname = FOLDER_DATASETS / 'metadata.csv'\n", - "files_out[fname.name] = fname\n", - "df_meta.drop(meta_to_drop, axis=1).to_csv(fname)\n", - "logger.info(f\"{fname = }\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Support per sample in entire data set" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "counts = data.count(axis=1) # wide format\n", - "N = len(counts)\n", - "fname = FOLDER_DATASETS / 'support_all.json'\n", - "files_out[fname.name] = fname\n", - "counts.to_json(fname, indent=4)\n", - "ax = (counts\n", - " .sort_values() # will raise an error with a DataFrame\n", - " .reset_index(drop=True)\n", - " .plot(rot=45,\n", - " figsize=FIGSIZE,\n", - " grid=True,\n", - " ylabel='number of features in sample',\n", - " xlabel='Sample rank ordered by number of features',\n", - " title=f'Support of {N:,d} samples features over {M} features ({\", \".join(feat_name)})',\n", - " ))\n", - 
"vaep.plotting.add_prop_as_second_yaxis(ax, M)\n", - "fig = ax.get_figure()\n", - "fig.tight_layout()\n", - "fname = FOLDER_DATASETS / 'support_all.pdf'\n", - "files_out[fname.name] = fname\n", - "vaep.plotting.savefig(fig, fname)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "counts = data.count(axis=0) # wide format\n", - "counts.to_json(FOLDER_DATASETS / 'feat_completeness_all.json', indent=4)\n", - "ax = (counts\n", - " .sort_values() # will raise an error with a DataFrame\n", - " .reset_index(drop=True)\n", - " .plot(rot=45,\n", - " figsize=FIGSIZE,\n", - " grid=True,\n", - " ylabel='number of samples per feature',\n", - " xlabel='Feature rank ordered by number of samples',\n", - " title=f'Support of {len(counts):,d} features over {N} samples ({\", \".join(feat_name)})',\n", - " ))\n", - "vaep.plotting.add_prop_as_second_yaxis(ax, N)\n", - "fig = ax.get_figure()\n", - "fname = FOLDER_DATASETS / 'feat_per_sample_all.pdf'\n", - "files_out[fname.stem] = fname\n", - "vaep.plotting.savefig(fig, fname)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Available instruments" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "counts_instrument = df_meta.groupby(thermo_raw_files.cols_instrument)[date_col].agg(\n", - " ['count', 'min', 'max']).sort_values(by=thermo_raw_files.cols_instrument[:2] + ['count'], ascending=False)\n", - "counts_instrument" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "len(counts_instrument)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "selected_instruments = counts_instrument.query(f\"count >= {N_MIN_INSTRUMENT}\")\n", - "fname = FOLDER_DATASETS / 'dataset_info.xlsx'\n", - 
"files_out[fname.name] = fname\n", - "selected_instruments.to_latex(fname.with_suffix('.tex'))\n", - "selected_instruments.to_excel(fname)\n", - "logger.info(f\"Save Information to: {fname} (as xlsx and tex)\")\n", - "selected_instruments" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Summary plot - UMAP\n", - "\n", - "- embedding based on all samples\n", - "- visualization of top 5 instruments" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "reducer = umap.UMAP(random_state=42)\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "embedding = reducer.fit_transform(data.fillna(data.median()))\n", - "embedding = pd.DataFrame(embedding, index=data.index,\n", - " columns=['UMAP 1', 'UMAP 2'])\n", - "embedding = embedding.join(\n", - " df_meta[[\"Content Creation Date\", \"instrument serial number\"]])\n", - "d_instrument_counts = counts_instrument['count'].reset_index(\n", - " level=[0, 1], drop=True).to_dict()\n", - "embedding[\"count\"] = embedding[\"instrument serial number\"].replace(\n", - " d_instrument_counts)\n", - "embedding" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "digits = int(np.ceil(np.log10(embedding[\"count\"].max())))\n", - "digits" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "embedding[\"instrument with N\"] = embedding[[\"instrument serial number\",\n", - " \"count\"]].apply(lambda s: f\"{s[0]} (N={s[1]:{digits}d})\", axis=1)\n", - "embedding[\"instrument with N\"] = embedding[\"instrument with N\"].str.replace(\n", - " 'Exactive Series slot', 'Instrument')\n", - "embedding" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "define top five instruments" - ] - }, - { - "cell_type": "code", - "execution_count": null, 
- "metadata": {}, - "outputs": [], - "source": [ - "top_5 = counts_instrument[\"count\"].nlargest(5)\n", - "top_5 = top_5.index.levels[-1]\n", - "embedding[\"instrument\"] = embedding[\"instrument serial number\"].apply(\n", - " lambda x: x if x in top_5 else 'other')\n", - "mask_top_5 = embedding[\"instrument\"] != 'other'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "embedding[\"Date (90 days intervals)\"] = embedding[\"Content Creation Date\"].dt.round(\n", - " \"90D\").astype(str)\n", - "to_plot = embedding.loc[mask_top_5]\n", - "print(f\"N samples in plot: {len(to_plot):,d}\")\n", - "fig, ax = plt.subplots(figsize=(20, 10))\n", - "\n", - "ax = sns.scatterplot(data=to_plot, x='UMAP 1', y='UMAP 2', style=\"instrument with N\",\n", - " hue=\"Date (90 days intervals)\", ax=ax) # =\"Content Creation Date\")\n", - "\n", - "fname = FOLDER_DATASETS / 'umap_interval90days_top5_instruments.pdf'\n", - "files_out[fname.name] = fname\n", - "vaep.savefig(fig, name=fname)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "markers = ['o', 'x', 's', 'P', 'D', '.']\n", - "alpha = 0.6\n", - "fig, ax = plt.subplots(figsize=(12, 8))\n", - "groups = list()\n", - "\n", - "vaep.plotting.make_large_descriptors()\n", - "embedding[\"Content Creation Date\"] = embedding[\"Content Creation Date\"].dt.round(\n", - " \"D\")\n", - "embedding[\"mdate\"] = embedding[\"Content Creation Date\"].apply(\n", - " matplotlib.dates.date2num)\n", - "\n", - "to_plot = embedding.loc[mask_top_5]\n", - "\n", - "norm = matplotlib.colors.Normalize(\n", - " embedding[\"mdate\"].quantile(0.05), embedding[\"mdate\"].quantile(0.95))\n", - "cmap = sns.color_palette(\"cubehelix\", as_cmap=True)\n", - "\n", - "\n", - "for k, _to_plot in to_plot.groupby('instrument with N'):\n", - " if markers:\n", - " marker = markers.pop(0)\n", - " _ = ax.scatter(\n", - " x=_to_plot[\"UMAP 
1\"],\n", - " y=_to_plot[\"UMAP 2\"],\n", - " c=_to_plot[\"mdate\"],\n", - " alpha=alpha,\n", - " marker=marker,\n", - " cmap=cmap,\n", - " norm=norm\n", - " )\n", - " groups.append(k)\n", - "\n", - "cbar = vaep.analyzers.analyzers.add_date_colorbar(\n", - " ax.collections[0], ax=ax)\n", - "cbar.ax.set_ylabel(\"date of measurement\", labelpad=-115, loc='center')\n", - "ax.legend(ax.collections, groups,\n", - " title=INSTRUMENT_LEGEND_TITLE, fontsize='xx-large')\n", - "ax.set_xlabel('UMAP 1') # , fontdict={'size': 16})\n", - "ax.set_ylabel('UMAP 2')\n", - "\n", - "fname = FOLDER_DATASETS / 'umap_date_top5_instruments.pdf'\n", - "files_out[fname.name] = fname\n", - "vaep.savefig(fig, name=fname)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Summary statistics for top 5 instruments " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "fig, ax = plt.subplots(1, 1, figsize=(6, 6))\n", - "# boxplot: number of available sample for included features\n", - "to_plot = (data\n", - " .loc[mask_top_5]\n", - " .notna()\n", - " .sum(axis=0)\n", - " .reset_index(drop=True)\n", - " .to_frame(f'{OUT_NAME.capitalize()} prevalence')\n", - " )\n", - "# boxplot: number of features per sample\n", - "to_plot = (to_plot\n", - " .join(data\n", - " .loc[mask_top_5]\n", - " .notna()\n", - " .sum(axis=1)\n", - " .reset_index(drop=True)\n", - " .to_frame(f'{OUT_NAME.capitalize()}s per sample'))\n", - " )\n", - "to_plot = (to_plot\n", - " .join(counts_instrument\n", - " .reset_index([0, 1], drop=True)\n", - " .loc[top_5, 'count']\n", - " .reset_index(drop=True)\n", - " .rename('Samples per instrument', axis='index'))\n", - " )\n", - "ax = to_plot.plot(kind='box', ax=ax, fontsize=16, )\n", - "ax.set_ylabel('number of observations',\n", - " fontdict={'fontsize': 14})\n", - "ax.set_xticklabels(ax.get_xticklabels(), rotation=45,\n", - " horizontalalignment='right')\n", - 
"to_plot.to_csv(FOLDER_DATASETS / 'summary_statistics_dump_data.csv')\n", - "\n", - "fname = FOLDER_DATASETS / 'summary_statistics_dump.pdf'\n", - "files_out[fname.name] = fname\n", - "vaep.savefig(fig, name=fname)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "top_5_meta = df_meta.loc[mask_top_5] \n", - "top_5_meta[['injection volume setting', 'dilution factor']].describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Meta data stats for top 5" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for _instrument, _df_meta_instrument in top_5_meta.groupby(by=thermo_raw_files.cols_instrument):\n", - " print('#'* 80, ' - '.join(_instrument), sep='\\n')\n", - " display(_df_meta_instrument.describe())\n", - " display(_df_meta_instrument['injection volume setting'].value_counts())\n", - " break" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Dump single experiments\n", - "\n", - "in wide format" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# data = data.stack(feat_name)\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cols = selected_instruments.index.names\n", - "\n", - "file_formats = {'pkl': 'to_pickle',\n", - " 'pickle': 'to_pickle',\n", - " 'csv': 'to_csv'}\n", - "\n", - "\n", - "for values in selected_instruments.index:\n", - " mask = df_meta[cols] == values\n", - " logger.info(f\"Samples: {mask.all(axis=1).sum()}\")\n", - " sample_ids = df_meta.loc[mask.all(axis=1)]\n", - " display(sample_ids.sort_index())\n", - " sample_ids = sample_ids.index\n", - " # which categorical this might need to be a categorical Index as well?\n", - " dataset = data.loc[sample_ids]\n", - " # dataset.index = 
dataset.index.remove_unused_levels()\n", - "\n", - " display(dataset\n", - " # .unstack(dataset.index.names[1:])\n", - " .sort_index()\n", - " )\n", - "\n", - " fname_dataset = vaep.io.get_fname_from_keys(values,\n", - " file_ext=f\".{FILE_EXT}\")\n", - " fname_dataset = (FOLDER_DATASETS /\n", - " fname_dataset.name.replace('Exactive_Series_slot_#', ''))\n", - " files_out[fname_dataset.name] = fname_dataset\n", - " logger.info(f'Dump dataset with N = {len(dataset)} to {fname_dataset}')\n", - " _to_file_format = getattr(dataset, file_formats[FILE_EXT])\n", - " _to_file_format(fname_dataset)\n", - "\n", - " # calculate support\n", - " counts = dataset.count(axis=1).squeeze()\n", - " ## to disk\n", - " fname_support = vaep.io.get_fname_from_keys(values,\n", - " folder='.',\n", - " file_ext=\"\")\n", - " fname_support = (FOLDER_DATASETS /\n", - " (fname_support.stem + '_support.json').replace('Exactive_Series_slot_#', ''))\n", - " files_out[fname_support.name] = fname_support\n", - " logger.info(f\"Dump support to: {fname_support.as_posix()}\")\n", - " \n", - " counts.to_json(fname_support, indent=4)\n", - "\n", - " # very slow alternative, but 100% correct\n", - " # M = dataset.index.droplevel(SAMPLE_ID).nunique()\n", - " N, M = dataset.shape\n", - "\n", - " # plot support:\n", - " fig, ax = plt.subplots()\n", - " ax = (counts\n", - " .sort_values() # will raise an error with a DataFrame\n", - " .reset_index(drop=True)\n", - " .plot(rot=45,\n", - " ax=ax,\n", - " figsize=FIGSIZE,\n", - " grid=True,\n", - " xlabel='Count of samples ordered by number of features',\n", - " title=f'Support of {len(counts):,d} samples features over {M} features ({\", \".join(feat_name)})',\n", - " ))\n", - " vaep.plotting.add_prop_as_second_yaxis(ax, M)\n", - " fig.tight_layout()\n", - " fname_support = fname_support.with_suffix('.pdf') \n", - " files_out[fname_support.name] = fname_support\n", - " vaep.plotting.savefig(fig, name=fname_support)" - ] - }, - { - "cell_type": "markdown", - 
"metadata": {}, - "source": [ - "## Last example dumped" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# add json dump as target file for script for workflows\n", - "fname = FOLDER_DATASETS / 'selected_instruments.json'\n", - "files_out[fname.name] = fname\n", - "selected_instruments.to_json(fname, indent=4)\n", - "logger.info(f\"Saved: {fname}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "files_out" - ] - } - ], - "metadata": { - "interpreter": { - "hash": "cf83e9cb890c7f96eb0ae04f39a82254555f56a1a0ed2f03b23a8b40fe6cd31c" - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/project/00_4_hela_development_dataset_splitting.py b/project/00_4_hela_development_dataset_splitting.py deleted file mode 100644 index bafc8dac3..000000000 --- a/project/00_4_hela_development_dataset_splitting.py +++ /dev/null @@ -1,505 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.14.5 -# kernelspec: -# display_name: Python 3 -# language: python -# name: python3 -# --- - -# %% [markdown] -# # Split up data into single datasets -# -# - create datasets per (set of) instruments for a specific experiments -# - drop some samples based on quality criteria - -# %% -from pathlib import Path - -import numpy as np -import pandas as pd -import matplotlib -import matplotlib.pyplot 
as plt -import matplotlib.dates -import seaborn as sns - -import umap - -from vaep.io import thermo_raw_files -from vaep.analyzers import analyzers - -from config import erda_dumps -from config import defaults - -import vaep -import vaep.io.filenames -from vaep.logging import setup_nb_logger - -logger = setup_nb_logger() - -FOLDER_DATA = defaults.FOLDER_DATA - -# %% -vaep.plotting.make_large_descriptors() -FIGSIZE = (15, 10) - -# %% [markdown] -# ## Parameters - -# %% tags=["parameters"] -N_MIN_INSTRUMENT = 300 -META_DATA: str = 'data/files_selected_metadata.csv' -FILE_EXT = 'pkl' # 'csv' or 'pkl' -SAMPLE_ID = 'Sample ID' - -DUMP: str = erda_dumps.FN_PROTEIN_GROUPS # Filepath to erda dump -OUT_NAME = 'protein group' # for legends labels -# DUMP: str = erda_dumps.FN_PEPTIDES -# OUT_NAME = 'peptide' # for legends labels -# DUMP: str = erda_dumps.FN_EVIDENCE -# OUT_NAME = 'precursor' # for legends labels - -FOLDER_DATASETS: str = f'dev_datasets/{DUMP.stem}' - -INSTRUMENT_LEGEND_TITLE = 'Q Exactive HF-X Orbitrap' - -# %% -# FILE_EXT = 'csv' - -# %% [markdown] -# Make sure output folder exists - -# %% -DUMP = Path(DUMP) # set parameter from cli or yaml to Path -FOLDER_DATASETS = defaults.FOLDER_DATA / FOLDER_DATASETS -FOLDER_DATASETS.mkdir(exist_ok=True, parents=True) -logger.info(f"Folder for datasets to be created: {FOLDER_DATASETS.absolute()}") - -files_out = dict() - -# %% [markdown] -# ## Dumps - -# %% [markdown] -# - load dumps -# - load file to machine mappings - -# %% -data = pd.read_pickle(DUMP) -data = data.squeeze() # In case it is a DataFrame, not a series (-> leads to MultiIndex) -# name_data = data.name -logger.info( - f"Number of rows (row = sample, feature, intensity): {len(data):,d}") -data - -# %% [markdown] -# Make categorical index a normal string index (this lead to problems when selecting data using `loc` and grouping data as level of data could not easily be removed from MultiIndex) -# -# - see 
[blog](https://towardsdatascience.com/staying-sane-while-adopting-pandas-categorical-datatypes-78dbd19dcd8a) - -# %% -# index_columns = data.index.names -# data = data.reset_index() -# print(data.memory_usage(deep=True)) -# cat_columns = data.columns[data.dtypes == 'category'] -# if not cat_columns.empty: -# data[cat_columns] = data[cat_columns].astype('object') -# print("non categorical: \n", data.memory_usage(deep=True)) -# logger.warning( -# "if time allows, this should be investigate -> use of loc with data which is not categorical") -# data = data.set_index(index_columns) - -# %% -# feat_name = list(data.index.names) -# feat_name.remove(SAMPLE_ID) -feat_name = (OUT_NAME,) -feat_name # index name(s) which are not the sample index - -# %% -# M = len(data.index.levels[-1]) -N, M = data.shape -logger.info(f"Number of unqiue features: {M}") - -# %% [markdown] -# ## Filter data by metadata - -# %% -# sample_ids = data.index.levels[0] # assume first index position is Sample ID? -sample_ids = data.index.unique() #.get_level_values(SAMPLE_ID).unique() # more explict -sample_ids - -# %% [markdown] -# ### Meta Data -# -# - based on ThermoRawFileParser -# %% -df_meta = pd.read_csv(META_DATA, index_col=SAMPLE_ID) -date_col = 'Content Creation Date' -df_meta[date_col] = pd.to_datetime(df_meta[date_col]) -df_meta = df_meta.loc[sample_ids] -df_meta - -# %% [markdown] -# ## Rename samples -# - to "YEAR_MONTH_DAY_HOUR_MIN_INSTRUMENT" (no encoding of information intended) -# - check that instrument names are unique -# - drop metadata (entire) -# %% -idx_all = (pd.to_datetime(df_meta["Content Creation Date"]).dt.strftime("%Y_%m_%d_%H_%M") - + '_' - + df_meta["Thermo Scientific instrument model"].str.replace(' ', '-') - + '_' - + df_meta["instrument serial number"].str.split('#').str[-1]) - -mask = idx_all.duplicated(keep=False) -duplicated_sample_idx = idx_all.loc[mask].sort_values() # duplicated dumps -duplicated_sample_idx - -# -# %% -data_duplicates = 
data.loc[duplicated_sample_idx.index] #.unstack() -# data_duplicates.T.corr() # same samples are have corr. of 1 -data_duplicates.sum(axis=1) # keep only one seems okay - -# %% -idx_unique = idx_all.drop_duplicates() -idx_unique - -# %% -df_meta = df_meta.loc[idx_unique.index].rename(idx_unique) -df_meta - -# %% -# data = data.unstack(feat_name) # needed later anyways -data = data.loc[idx_unique.index].rename(idx_unique) -data - -# %% -meta_to_drop = ['Pathname'] -fname = FOLDER_DATASETS / 'metadata.csv' -files_out[fname.name] = fname -df_meta.drop(meta_to_drop, axis=1).to_csv(fname) -logger.info(f"{fname = }") - - -# %% [markdown] -# ## Support per sample in entire data set - -# %% -counts = data.count(axis=1) # wide format -N = len(counts) -fname = FOLDER_DATASETS / 'support_all.json' -files_out[fname.name] = fname -counts.to_json(fname, indent=4) -ax = (counts - .sort_values() # will raise an error with a DataFrame - .reset_index(drop=True) - .plot(rot=45, - figsize=FIGSIZE, - grid=True, - ylabel='number of features in sample', - xlabel='Sample rank ordered by number of features', - title=f'Support of {N:,d} samples features over {M} features ({", ".join(feat_name)})', - )) -vaep.plotting.add_prop_as_second_yaxis(ax, M) -fig = ax.get_figure() -fig.tight_layout() -fname = FOLDER_DATASETS / 'support_all.pdf' -files_out[fname.name] = fname -vaep.plotting.savefig(fig, fname) - - -# %% -counts = data.count(axis=0) # wide format -counts.to_json(FOLDER_DATASETS / 'feat_completeness_all.json', indent=4) -ax = (counts - .sort_values() # will raise an error with a DataFrame - .reset_index(drop=True) - .plot(rot=45, - figsize=FIGSIZE, - grid=True, - ylabel='number of samples per feature', - xlabel='Feature rank ordered by number of samples', - title=f'Support of {len(counts):,d} features over {N} samples ({", ".join(feat_name)})', - )) -vaep.plotting.add_prop_as_second_yaxis(ax, N) -fig = ax.get_figure() -fname = FOLDER_DATASETS / 'feat_per_sample_all.pdf' 
-files_out[fname.stem] = fname -vaep.plotting.savefig(fig, fname) - - -# %% [markdown] -# ## Available instruments - -# %% -counts_instrument = df_meta.groupby(thermo_raw_files.cols_instrument)[date_col].agg( - ['count', 'min', 'max']).sort_values(by=thermo_raw_files.cols_instrument[:2] + ['count'], ascending=False) -counts_instrument - -# %% -len(counts_instrument) - -# %% -selected_instruments = counts_instrument.query(f"count >= {N_MIN_INSTRUMENT}") -fname = FOLDER_DATASETS / 'dataset_info.xlsx' -files_out[fname.name] = fname -selected_instruments.to_latex(fname.with_suffix('.tex')) -selected_instruments.to_excel(fname) -logger.info(f"Save Information to: {fname} (as xlsx and tex)") -selected_instruments - - -# %% [markdown] -# ## Summary plot - UMAP -# -# - embedding based on all samples -# - visualization of top 5 instruments - -# %% -reducer = umap.UMAP(random_state=42) -data - -# %% -embedding = reducer.fit_transform(data.fillna(data.median())) -embedding = pd.DataFrame(embedding, index=data.index, - columns=['UMAP 1', 'UMAP 2']) -embedding = embedding.join( - df_meta[["Content Creation Date", "instrument serial number"]]) -d_instrument_counts = counts_instrument['count'].reset_index( - level=[0, 1], drop=True).to_dict() -embedding["count"] = embedding["instrument serial number"].replace( - d_instrument_counts) -embedding - -# %% -digits = int(np.ceil(np.log10(embedding["count"].max()))) -digits - -# %% -embedding["instrument with N"] = embedding[["instrument serial number", - "count"]].apply(lambda s: f"{s[0]} (N={s[1]:{digits}d})", axis=1) -embedding["instrument with N"] = embedding["instrument with N"].str.replace( - 'Exactive Series slot', 'Instrument') -embedding - -# %% [markdown] -# define top five instruments - -# %% -top_5 = counts_instrument["count"].nlargest(5) -top_5 = top_5.index.levels[-1] -embedding["instrument"] = embedding["instrument serial number"].apply( - lambda x: x if x in top_5 else 'other') -mask_top_5 = embedding["instrument"] != 
'other' - -# %% -embedding["Date (90 days intervals)"] = embedding["Content Creation Date"].dt.round( - "90D").astype(str) -to_plot = embedding.loc[mask_top_5] -print(f"N samples in plot: {len(to_plot):,d}") -fig, ax = plt.subplots(figsize=(20, 10)) - -ax = sns.scatterplot(data=to_plot, x='UMAP 1', y='UMAP 2', style="instrument with N", - hue="Date (90 days intervals)", ax=ax) # ="Content Creation Date") - -fname = FOLDER_DATASETS / 'umap_interval90days_top5_instruments.pdf' -files_out[fname.name] = fname -vaep.savefig(fig, name=fname) - -# %% -markers = ['o', 'x', 's', 'P', 'D', '.'] -alpha = 0.6 -fig, ax = plt.subplots(figsize=(12, 8)) -groups = list() - -vaep.plotting.make_large_descriptors() -embedding["Content Creation Date"] = embedding["Content Creation Date"].dt.round( - "D") -embedding["mdate"] = embedding["Content Creation Date"].apply( - matplotlib.dates.date2num) - -to_plot = embedding.loc[mask_top_5] - -norm = matplotlib.colors.Normalize( - embedding["mdate"].quantile(0.05), embedding["mdate"].quantile(0.95)) -cmap = sns.color_palette("cubehelix", as_cmap=True) - - -for k, _to_plot in to_plot.groupby('instrument with N'): - if markers: - marker = markers.pop(0) - _ = ax.scatter( - x=_to_plot["UMAP 1"], - y=_to_plot["UMAP 2"], - c=_to_plot["mdate"], - alpha=alpha, - marker=marker, - cmap=cmap, - norm=norm - ) - groups.append(k) - -cbar = vaep.analyzers.analyzers.add_date_colorbar( - ax.collections[0], ax=ax) -cbar.ax.set_ylabel("date of measurement", labelpad=-115, loc='center') -ax.legend(ax.collections, groups, - title=INSTRUMENT_LEGEND_TITLE, fontsize='xx-large') -ax.set_xlabel('UMAP 1') # , fontdict={'size': 16}) -ax.set_ylabel('UMAP 2') - -fname = FOLDER_DATASETS / 'umap_date_top5_instruments.pdf' -files_out[fname.name] = fname -vaep.savefig(fig, name=fname) - -# %% [markdown] -# ## Summary statistics for top 5 instruments - -# %% -fig, ax = plt.subplots(1, 1, figsize=(6, 6)) -# boxplot: number of available sample for included features -to_plot = 
(data - .loc[mask_top_5] - .notna() - .sum(axis=0) - .reset_index(drop=True) - .to_frame(f'{OUT_NAME.capitalize()} prevalence') - ) -# boxplot: number of features per sample -to_plot = (to_plot - .join(data - .loc[mask_top_5] - .notna() - .sum(axis=1) - .reset_index(drop=True) - .to_frame(f'{OUT_NAME.capitalize()}s per sample')) - ) -to_plot = (to_plot - .join(counts_instrument - .reset_index([0, 1], drop=True) - .loc[top_5, 'count'] - .reset_index(drop=True) - .rename('Samples per instrument', axis='index')) - ) -ax = to_plot.plot(kind='box', ax=ax, fontsize=16, ) -ax.set_ylabel('number of observations', - fontdict={'fontsize': 14}) -ax.set_xticklabels(ax.get_xticklabels(), rotation=45, - horizontalalignment='right') -to_plot.to_csv(FOLDER_DATASETS / 'summary_statistics_dump_data.csv') - -fname = FOLDER_DATASETS / 'summary_statistics_dump.pdf' -files_out[fname.name] = fname -vaep.savefig(fig, name=fname) - - -# %% -top_5_meta = df_meta.loc[mask_top_5] -top_5_meta[['injection volume setting', 'dilution factor']].describe() - -# %% [markdown] -# ### Meta data stats for top 5 - -# %% -for _instrument, _df_meta_instrument in top_5_meta.groupby(by=thermo_raw_files.cols_instrument): - print('#'* 80, ' - '.join(_instrument), sep='\n') - display(_df_meta_instrument.describe()) - display(_df_meta_instrument['injection volume setting'].value_counts()) - break - -# %% [markdown] -# ## Dump single experiments -# -# in wide format - -# %% -# data = data.stack(feat_name) -data - -# %% -cols = selected_instruments.index.names - -file_formats = {'pkl': 'to_pickle', - 'pickle': 'to_pickle', - 'csv': 'to_csv'} - - -for values in selected_instruments.index: - mask = df_meta[cols] == values - logger.info(f"Samples: {mask.all(axis=1).sum()}") - sample_ids = df_meta.loc[mask.all(axis=1)] - display(sample_ids.sort_index()) - sample_ids = sample_ids.index - # which categorical this might need to be a categorical Index as well? 
- dataset = data.loc[sample_ids] - # dataset.index = dataset.index.remove_unused_levels() - - display(dataset - # .unstack(dataset.index.names[1:]) - .sort_index() - ) - - fname_dataset = vaep.io.get_fname_from_keys(values, - file_ext=f".{FILE_EXT}") - fname_dataset = (FOLDER_DATASETS / - fname_dataset.name.replace('Exactive_Series_slot_#', '')) - files_out[fname_dataset.name] = fname_dataset - logger.info(f'Dump dataset with N = {len(dataset)} to {fname_dataset}') - _to_file_format = getattr(dataset, file_formats[FILE_EXT]) - _to_file_format(fname_dataset) - - # calculate support - counts = dataset.count(axis=1).squeeze() - ## to disk - fname_support = vaep.io.get_fname_from_keys(values, - folder='.', - file_ext="") - fname_support = (FOLDER_DATASETS / - (fname_support.stem + '_support.json').replace('Exactive_Series_slot_#', '')) - files_out[fname_support.name] = fname_support - logger.info(f"Dump support to: {fname_support.as_posix()}") - - counts.to_json(fname_support, indent=4) - - # very slow alternative, but 100% correct - # M = dataset.index.droplevel(SAMPLE_ID).nunique() - N, M = dataset.shape - - # plot support: - fig, ax = plt.subplots() - ax = (counts - .sort_values() # will raise an error with a DataFrame - .reset_index(drop=True) - .plot(rot=45, - ax=ax, - figsize=FIGSIZE, - grid=True, - xlabel='Count of samples ordered by number of features', - title=f'Support of {len(counts):,d} samples features over {M} features ({", ".join(feat_name)})', - )) - vaep.plotting.add_prop_as_second_yaxis(ax, M) - fig.tight_layout() - fname_support = fname_support.with_suffix('.pdf') - files_out[fname_support.name] = fname_support - vaep.plotting.savefig(fig, name=fname_support) - -# %% [markdown] -# ## Last example dumped - -# %% -dataset - -# %% -# add json dump as target file for script for workflows -fname = FOLDER_DATASETS / 'selected_instruments.json' -files_out[fname.name] = fname -selected_instruments.to_json(fname, indent=4) -logger.info(f"Saved: {fname}") - -# 
%% -files_out diff --git a/project/00_5_development_dataset_support.ipynb b/project/00_5_development_dataset_support.ipynb deleted file mode 100644 index 5d921944d..000000000 --- a/project/00_5_development_dataset_support.ipynb +++ /dev/null @@ -1,128 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "ea31410b-bc16-4cf4-9a0b-d48fd463b8ff", - "metadata": {}, - "source": [ - "# Support of dumped data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "22244800-d388-4395-a107-a6c5c2d5038f", - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "import plotly.express as px\n", - "\n", - "import vaep # set formatting defaults" - ] - }, - { - "cell_type": "markdown", - "id": "5fe86eb0-ecda-46ef-9f1b-86fa1152a73d", - "metadata": {}, - "source": [ - "## Parameters" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b8220ff5-8af8-4881-9411-c9164576a9fb", - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "support_json: str = 'data\\dev_datasets\\df_intensities_proteinGroups_long\\Q_Exactive_HF_X_Orbitrap_6070_support.json' # Path to json support file" - ] - }, - { - "cell_type": "markdown", - "id": "ebaf7373-cc81-4cfa-9dea-eb518c059c9a", - "metadata": {}, - "source": [ - "## Completeness of samples" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "82710205-39ff-44da-a4fe-9fbdb489ef4d", - "metadata": {}, - "outputs": [], - "source": [ - "support = pd.read_json(support_json, typ='series').sort_values().to_frame('no. 
of features')\n", - "support.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4656af82", - "metadata": {}, - "outputs": [], - "source": [ - "support.describe(percentiles=np.linspace(0.1,1,10))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e623a270-4044-4b58-a0a5-036985cb5e38", - "metadata": {}, - "outputs": [], - "source": [ - "ax = support.plot(rot=90, figsize=(20,10), legend=False)\n", - "ax.set_ylabel('number of features')\n", - "ax.yaxis.set_major_formatter(\"{x:,.0f}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e60e883a-570c-430f-a7b7-dd4acc1fc4c8", - "metadata": {}, - "outputs": [], - "source": [ - "px.line(support, height=1000)" - ] - }, - { - "cell_type": "markdown", - "id": "dec709da-23c9-48bd-b233-1e0c6b3bf0c8", - "metadata": {}, - "source": [ - "The one with very few identification are mainly fractions of entire samples" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "vaep", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/project/00_5_training_data_exploration.py b/project/00_5_training_data_exploration.py index 95853d29d..ca1de1883 100644 --- a/project/00_5_training_data_exploration.py +++ b/project/00_5_training_data_exploration.py @@ -26,6 +26,7 @@ # %% from __future__ import annotations import json +import logging from pathlib import Path import numpy as np @@ -42,8 +43,9 @@ from vaep.analyzers import analyzers logger = vaep.logging.setup_nb_logger() +logging.getLogger('fontTools').setLevel(logging.WARNING) -matplotlib.rcParams.update({'font.size': 5, +matplotlib.rcParams.update({'font.size': 6, 'figure.figsize': [4.0, 2.0]}) @@ -189,7 
+191,7 @@ def get_dynamic_range(min_max): # %% min_samples_per_feat = int(len(data) * COMPLETENESS_OVER_SAMPLES) print(f"{min_samples_per_feat = }") -mask = data.notna().sum(axis=0) > min_samples_per_feat +mask = data.notna().sum(axis=0) >= min_samples_per_feat print(f"drop = {(~mask).sum()} features") selected = data.loc[:, mask] selected.shape @@ -305,7 +307,7 @@ def get_dynamic_range(min_max): # %%time corr_lower_triangle = analyzers.corr_lower_triangle(data) fig, axes = analyzers.plot_corr_histogram(corr_lower_triangle, bins=40) -fname = FIGUREFOLDER / f'corr_histogram_feat.pdf' +fname = FIGUREFOLDER / 'corr_histogram_feat.pdf' files_out[fname.name] = fname vaep.savefig(fig, name=fname) @@ -317,7 +319,7 @@ def get_dynamic_range(min_max): cv = data.std() / data.mean() # biological coefficient of variation: standard deviation (variation) w.r.t mean ax = cv.hist(bins=30) -fname = FIGUREFOLDER / f'CV_histogram_features.pdf' +fname = FIGUREFOLDER / 'CV_histogram_features.pdf' files_out[fname.name] = fname vaep.savefig(ax.get_figure(), name=fname) @@ -328,7 +330,10 @@ def get_dynamic_range(min_max): # needs to deal with duplicates # notna = data.notna().T.drop_duplicates().T # get index and column names -cg = sns.clustermap(data.notna(), cbar_pos=None) +vaep.plotting.make_large_descriptors(8) +cg = sns.clustermap(data.notna(), + cbar_pos=None, + figsize=(8, 8)) ax = cg.ax_heatmap if PG_SEPARATOR is not None: _new_labels = [l.get_text().split(PG_SEPARATOR)[0] @@ -341,7 +346,8 @@ def get_dynamic_range(min_max): files_out[fname.name] = fname vaep.savefig(cg.fig, name=fname, - pdf=False) + pdf=False, + dpi=600) # %% [markdown] # based on cluster, plot heatmaps of features and samples @@ -351,10 +357,12 @@ def get_dynamic_range(min_max): cg.dendrogram_col.reordered_ind)) == data.shape # %% -vaep.plotting.make_large_descriptors(5) +vaep.plotting.make_large_descriptors(8) +fig, ax = plt.subplots(figsize=(4, 4)) ax = sns.heatmap( data.iloc[cg.dendrogram_row.reordered_ind, 
cg.dendrogram_col.reordered_ind], + ax=ax, ) only_every_x_ticks(ax, x=2) use_first_n_chars_in_labels(ax, x=SAMPLE_FIRST_N_CHARS) @@ -367,7 +375,7 @@ def get_dynamic_range(min_max): ax.set_yticks([]) fname = FIGUREFOLDER / 'heatmap_intensities_ordered_by_missing_pattern.png' files_out[fname.name] = fname -vaep.savefig(ax.get_figure(), name=fname, pdf=False) +vaep.savefig(fig, name=fname, pdf=False, dpi=600) # ax.get_figure().savefig(fname, dpi=300) # %% [markdown] @@ -378,6 +386,9 @@ def get_dynamic_range(min_max): ax = sns.heatmap( analyzers.corr_lower_triangle( data.iloc[:, cg.dendrogram_col.reordered_ind]), + vmin=-1, + vmax=1, + cbar_kws={'shrink': 0.75}, ax=ax, square=True, ) @@ -392,14 +403,19 @@ def get_dynamic_range(min_max): ax.set_yticks([]) fname = FIGUREFOLDER / 'heatmap_feature_correlation.png' files_out[fname.name] = fname -vaep.savefig(fig, name=fname, pdf=False) +vaep.savefig(fig, name=fname, pdf=False, dpi=600) +# %% +lower_corr = analyzers.corr_lower_triangle( + data.T.iloc[:, cg.dendrogram_row.reordered_ind]) # %% fig, ax = plt.subplots(figsize=(4, 4)) ax = sns.heatmap( - analyzers.corr_lower_triangle( - data.T.iloc[:, cg.dendrogram_row.reordered_ind]), + data=lower_corr, ax=ax, + vmin=-1, + vmax=1, + cbar_kws={'shrink': 0.75}, square=True, ) _ = only_every_x_ticks(ax, x=2) @@ -409,9 +425,10 @@ def get_dynamic_range(min_max): ax.set_yticks([]) fname = FIGUREFOLDER / 'heatmap_sample_correlation.png' files_out[fname.name] = fname -vaep.savefig(fig, name=fname, pdf=False) +vaep.savefig(fig, name=fname, pdf=False, dpi=600) # %% +vaep.plotting.make_large_descriptors(12) kwargs = dict() if NO_TICK_LABELS_ON_HEATMAP: kwargs['xticklabels'] = False @@ -446,6 +463,7 @@ def get_dynamic_range(min_max): sample_stats # %% +vaep.plotting.make_large_descriptors(8) fig_ident = sns.relplot( x='SampleID_int', y=COL_NO_IDENTIFIED, data=sample_stats) fig_ident.set_axis_labels('Sample ID', f'Frequency of identified {TYPE}') @@ -491,3 +509,5 @@ def 
get_dynamic_range(min_max): # %% files_out + +# %% diff --git a/project/00_6_0_permute_data.ipynb b/project/00_6_0_permute_data.ipynb index d8b6493db..c16637bc1 100644 --- a/project/00_6_0_permute_data.ipynb +++ b/project/00_6_0_permute_data.ipynb @@ -23,6 +23,7 @@ "import numpy as np\n", "import vaep\n", "import vaep.analyzers.analyzers\n", + "from vaep.utils import create_random_df\n", "\n", "logger = vaep.logging.setup_nb_logger()\n", "logger.info(\"Split data and make diagnostic plots\")" @@ -35,7 +36,6 @@ "metadata": {}, "outputs": [], "source": [ - "from vaep.utils import create_random_df\n", "t = create_random_df(N=10, M=3)\n", "t = t.apply(lambda x: np.arange(len(x)))\n", "t" @@ -77,12 +77,11 @@ }, "outputs": [], "source": [ - "FN_INTENSITIES: str = 'data/dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070.pkl' # Sample (rows) intensiites for features (columns)\n", - "index_col: Union[str, int] = 0 # Can be either a string or position (typical 0 for first column), or a list of these.\n", - "# wide_format: bool = False # intensities in wide format (more memory efficient of csv). Default is long_format (more precise)\n", - "column_names: List[str] = [\"Gene Names\"] # Manuelly set column names (of Index object in columns)\n", - "out_folder: str = '' # Output folder for permuted data, optional. Default is to save with suffix '_permuted' in same folder as input data\n", - "random_seed: int = 42 # Random seed for reproducibility\n", + "FN_INTENSITIES: str = 'data/dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070.pkl' # Sample (rows) intensiites for features (columns)\n", + "index_col: Union[str, int] = 0 # Can be either a string or position (typical 0 for first column), or a list of these.\n", + "column_names: List[str] = [\"Gene Names\"] # Manuelly set column names (of Index object in columns)\n", + "out_folder: str = '' # Output folder for permuted data, optional. 
Default is to save with suffix '_permuted' in same folder as input data\n", + "random_seed: int = 42 # Random seed for reproducibility\n", "file_format: str = 'pkl'" ] }, @@ -149,9 +148,9 @@ "\n", "\n", "FILE_FORMAT_TO_CONSTRUCTOR_IN = {'csv': 'from_csv',\n", - " 'pkl': 'from_pickle',\n", - " 'pickle': 'from_pickle',\n", - " }\n", + " 'pkl': 'from_pickle',\n", + " 'pickle': 'from_pickle',\n", + " }\n", "\n", "FILE_EXT = Path(args.FN_INTENSITIES).suffix[1:]\n", "logger.info(f\"File format (extension): {FILE_EXT} (!specifies data loading function!)\")" @@ -168,10 +167,10 @@ "source": [ "constructor = getattr(\n", " vaep.analyzers.analyzers.AnalyzePeptides,\n", - " FILE_FORMAT_TO_CONSTRUCTOR_IN[FILE_EXT]) #AnalyzePeptides.from_csv \n", + " FILE_FORMAT_TO_CONSTRUCTOR_IN[FILE_EXT]) # AnalyzePeptides.from_csv\n", "analysis = constructor(fname=args.FN_INTENSITIES,\n", - " index_col=args.index_col,\n", - " )" + " index_col=args.index_col,\n", + " )" ] }, { @@ -215,7 +214,7 @@ "\n", "method = getattr(df, FILE_FORMAT_TO_CONSTRUCTOR.get(FILE_EXT))\n", "\n", - "fname = vaep.utils.append_to_filepath(args.FN_INTENSITIES , 'permuted')\n", + "fname = vaep.utils.append_to_filepath(args.FN_INTENSITIES, 'permuted')\n", "method(fname)" ] }, @@ -228,10 +227,10 @@ "source": [ "constructor = getattr(\n", " vaep.analyzers.analyzers.AnalyzePeptides,\n", - " FILE_FORMAT_TO_CONSTRUCTOR_IN[FILE_EXT]) #AnalyzePeptides.from_csv \n", + " FILE_FORMAT_TO_CONSTRUCTOR_IN[FILE_EXT]) # AnalyzePeptides.from_csv\n", "analysis = constructor(fname=args.FN_INTENSITIES,\n", - " index_col=args.index_col,\n", - " )" + " index_col=args.index_col,\n", + " )" ] } ], @@ -257,7 +256,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.8.17" } }, "nbformat": 4, diff --git a/project/00_6_0_permute_data.py b/project/00_6_0_permute_data.py index bc33b0774..9c6612d2e 100644 --- a/project/00_6_0_permute_data.py +++ b/project/00_6_0_permute_data.py @@ 
-9,12 +9,12 @@ import numpy as np import vaep import vaep.analyzers.analyzers +from vaep.utils import create_random_df logger = vaep.logging.setup_nb_logger() logger.info("Split data and make diagnostic plots") # %% -from vaep.utils import create_random_df t = create_random_df(N=10, M=3) t = t.apply(lambda x: np.arange(len(x))) t @@ -30,12 +30,11 @@ # %% tags=["parameters"] -FN_INTENSITIES: str = 'data/dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070.pkl' # Sample (rows) intensiites for features (columns) -index_col: Union[str, int] = 0 # Can be either a string or position (typical 0 for first column), or a list of these. -# wide_format: bool = False # intensities in wide format (more memory efficient of csv). Default is long_format (more precise) -column_names: List[str] = ["Gene Names"] # Manuelly set column names (of Index object in columns) -out_folder: str = '' # Output folder for permuted data, optional. Default is to save with suffix '_permuted' in same folder as input data -random_seed: int = 42 # Random seed for reproducibility +FN_INTENSITIES: str = 'data/dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070.pkl' # Sample (rows) intensiites for features (columns) +index_col: Union[str, int] = 0 # Can be either a string or position (typical 0 for first column), or a list of these. +column_names: List[str] = ["Gene Names"] # Manuelly set column names (of Index object in columns) +out_folder: str = '' # Output folder for permuted data, optional. 
Default is to save with suffix '_permuted' in same folder as input data +random_seed: int = 42 # Random seed for reproducibility file_format: str = 'pkl' # %% @@ -63,9 +62,9 @@ FILE_FORMAT_TO_CONSTRUCTOR_IN = {'csv': 'from_csv', - 'pkl': 'from_pickle', - 'pickle': 'from_pickle', - } + 'pkl': 'from_pickle', + 'pickle': 'from_pickle', + } FILE_EXT = Path(args.FN_INTENSITIES).suffix[1:] logger.info(f"File format (extension): {FILE_EXT} (!specifies data loading function!)") @@ -73,10 +72,10 @@ # %% constructor = getattr( vaep.analyzers.analyzers.AnalyzePeptides, - FILE_FORMAT_TO_CONSTRUCTOR_IN[FILE_EXT]) #AnalyzePeptides.from_csv + FILE_FORMAT_TO_CONSTRUCTOR_IN[FILE_EXT]) # AnalyzePeptides.from_csv analysis = constructor(fname=args.FN_INTENSITIES, - index_col=args.index_col, - ) + index_col=args.index_col, + ) # %% @@ -95,12 +94,12 @@ method = getattr(df, FILE_FORMAT_TO_CONSTRUCTOR.get(FILE_EXT)) -fname = vaep.utils.append_to_filepath(args.FN_INTENSITIES , 'permuted') +fname = vaep.utils.append_to_filepath(args.FN_INTENSITIES, 'permuted') method(fname) # %% constructor = getattr( vaep.analyzers.analyzers.AnalyzePeptides, - FILE_FORMAT_TO_CONSTRUCTOR_IN[FILE_EXT]) #AnalyzePeptides.from_csv + FILE_FORMAT_TO_CONSTRUCTOR_IN[FILE_EXT]) # AnalyzePeptides.from_csv analysis = constructor(fname=args.FN_INTENSITIES, - index_col=args.index_col, - ) + index_col=args.index_col, + ) diff --git a/project/00_6_hela_training_data_exploration.ipynb b/project/00_6_hela_training_data_exploration.ipynb deleted file mode 100644 index ab393060d..000000000 --- a/project/00_6_hela_training_data_exploration.ipynb +++ /dev/null @@ -1,832 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Inspect data using plots\n", - "- spread of intensities between samples\n", - "- spread of intensities within samples\n", - "- missing data plots: violin, box and histogram - both for features and samples\n", - " - optionally: plot proposed cutoffs (on per default)\n", - 
"- correlation analysis: can linear correlation be picked up?\n", - "-\n", - "\n", - "Does not save filtered data, this is done by splitting notebook. Only visualisations." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from __future__ import annotations\n", - "import json\n", - "from pathlib import Path\n", - "\n", - "import numpy as np\n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "\n", - "import matplotlib\n", - "\n", - "import vaep\n", - "from vaep import plotting\n", - "from vaep.pandas import missing_data\n", - "import vaep.data_handling\n", - "from vaep.analyzers import analyzers\n", - "\n", - "logger = vaep.logging.setup_nb_logger()\n", - "\n", - "matplotlib.rcParams.update({'font.size': 5,\n", - " 'figure.figsize': [4.0, 2.0]})\n", - "\n", - "\n", - "def only_every_x_ticks(ax, x=2, axis=None):\n", - " \"\"\"Sparse out ticks on both axis by factor x\"\"\"\n", - " if axis is None:\n", - " ax.set_xticks(ax.get_xticks()[::x])\n", - " ax.set_yticks(ax.get_yticks()[::x])\n", - " else:\n", - " if axis == 0:\n", - " ax.set_xticks(ax.get_xticks()[::x])\n", - " elif axis == 1:\n", - " ax.set_yticks(ax.get_yticks()[::x])\n", - " else:\n", - " raise ValueError(f'axis must be 0 or 1, got {axis}')\n", - " return ax\n", - "\n", - "\n", - "def use_first_n_chars_in_labels(ax, x=2):\n", - " \"\"\"Take first N characters of labels and use them as new labels\"\"\"\n", - " # xaxis\n", - " _new_labels = [l.get_text()[:x]\n", - " for l in ax.get_xticklabels()]\n", - " _ = ax.set_xticklabels(_new_labels)\n", - " # yaxis\n", - " _new_labels = [l.get_text()[:x] for l in ax.get_yticklabels()]\n", - " _ = ax.set_yticklabels(_new_labels)\n", - " return ax\n", - "\n", - "\n", - "def split_xticklabels(ax, PG_SEPARATOR=';'):\n", - " \"\"\"Split labels by PG_SEPARATOR and only use first part\"\"\"\n", - " if PG_SEPARATOR is not None:\n", - " _new_labels = 
[l.get_text().split(PG_SEPARATOR)[0]\n", - " for l in ax.get_xticklabels()]\n", - " _ = ax.set_xticklabels(_new_labels)\n", - " return ax\n", - "\n", - "\n", - "def get_clustermap(data,\n", - " figsize=(8, 8),\n", - " cbar_pos: tuple[float, float, float, float] = (\n", - " 0.02, 0.83, 0.03, 0.15),\n", - " **kwargs):\n", - " from sklearn.impute import SimpleImputer\n", - " from vaep.pandas import _add_indices\n", - " X = SimpleImputer().fit_transform(data)\n", - " X = _add_indices(X, data)\n", - " cg = sns.clustermap(X,\n", - " z_score=0,\n", - " cmap=\"vlag\",\n", - " center=0,\n", - " cbar_pos=cbar_pos,\n", - " figsize=figsize,\n", - " **kwargs\n", - " )\n", - " return cg\n", - "\n", - "\n", - "def get_dynamic_range(min_max):\n", - " dynamic_range = pd.DataFrame(range(*min_max), columns=['x'])\n", - " dynamic_range['$2^x$'] = dynamic_range.x.apply(lambda x: 2**x)\n", - " dynamic_range.set_index('x', inplace=True)\n", - " dynamic_range.index.name = ''\n", - " dynamic_range = dynamic_range.T\n", - " return dynamic_range" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Parameters" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "lines_to_next_cell": 2, - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "FN_INTENSITIES: str = 'data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv'\n", - "FOLDER_EXPERIMENT: str = 'runs/example/data_inspection'\n", - "N_FIRST_ROWS = None # possibility to select N first rows\n", - "LOG_TRANSFORM: bool = True # log transform data\n", - "# list of integers or string denoting the index columns (used for csv)\n", - "INDEX_COL: list = [0]\n", - "COL_INDEX_NAME: str = 'Protein groups' # name of column index, can be None\n", - "LONG_FORMAT: bool = False # if True, the data is expected to be in long format\n", - "# Threshold used later for data filtering (here only for visualisation)\n", - "COMPLETENESS_OVER_SAMPLES = 0.25 # 25% of samples have to have that 
features\n", - "MIN_FEAT_PER_SAMPLE = .4 # 40% of features selected in first step\n", - "# protein group separator, e.g.';' (could also be gene groups)\n", - "PG_SEPARATOR: str = ';'\n", - "SAMPLE_FIRST_N_CHARS: int = 16 # number of characters used for sample names\n", - "# if True, do not use tick on heatmap - only label\n", - "NO_TICK_LABELS_ON_HEATMAP: bool = True" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load and check data\n", - "\n", - "- supported for now: pickle and comma separated\n", - "- transform long to wide data?\n", - "- log transform data using logarithm of two?\n", - "- remove entirely missing columns (features) or rows (samples)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "FOLDER_EXPERIMENT = Path(FOLDER_EXPERIMENT)\n", - "FN_INTENSITIES = Path(FN_INTENSITIES)\n", - "\n", - "FIGUREFOLDER = FOLDER_EXPERIMENT / 'figures'\n", - "FIGUREFOLDER.mkdir(exist_ok=True, parents=True)\n", - "FIGUREFOLDER\n", - "\n", - "files_out = dict()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "lines_to_next_cell": 0 - }, - "outputs": [], - "source": [ - "if FN_INTENSITIES.suffix == '.pkl':\n", - " data = pd.read_pickle(FN_INTENSITIES)\n", - "elif FN_INTENSITIES.suffix == '.csv':\n", - " data = pd.read_csv(FN_INTENSITIES, index_col=INDEX_COL, nrows=N_FIRST_ROWS)\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if LONG_FORMAT:\n", - " data = data.squeeze().unstack()\n", - "if LOG_TRANSFORM:\n", - " data = np.log2(data).astype(float)\n", - "\n", - "\n", - "# drop entrily missing rows or columns\n", - "data = data.dropna(axis=0, how='all').dropna(axis=1, how='all')\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "if len(data.columns.names) > 1:\n", 
- " _levels_dropped = data.columns.names[1:]\n", - " data.columns = data.columns.droplevel(_levels_dropped)\n", - " logger.warning(\"Drop multiindex level, kepp only first. Dropped: \"\n", - " f\"{_levels_dropped}\")\n", - "# allows overwriting of index name, also to None\n", - "data.columns.name = COL_INDEX_NAME" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Calculate cutoffs for visualization and stats" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- filtering based on many other samples?\n", - "- low feature completeness threshold in comparison to other approaches" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "min_samples_per_feat = int(len(data) * COMPLETENESS_OVER_SAMPLES)\n", - "print(f\"{min_samples_per_feat = }\")\n", - "mask = data.notna().sum(axis=0) > min_samples_per_feat\n", - "print(f\"drop = {(~mask).sum()} features\")\n", - "selected = data.loc[:, mask]\n", - "selected.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "min_feat_per_sample = int(selected.shape[-1] * MIN_FEAT_PER_SAMPLE)\n", - "print(f\"{min_feat_per_sample = }\")\n", - "samples_selected = selected.notna().sum(axis=1) >= min_feat_per_sample\n", - "print(f\"drop = {(~samples_selected).sum()} samples\")\n", - "selected = selected[samples_selected]\n", - "selected.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Update records if cutoffs would be used" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "records = dict(inital=missing_data.get_record(data))\n", - "records" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "records.update(\n", - " dict(filtered=missing_data.get_record(selected)))\n", - "records.update({'params':\n", 
- " dict(MIN_FEAT_PER_SAMPLE=float(MIN_FEAT_PER_SAMPLE),\n", - " COMPLETENESS_OVER_SAMPLES=float(\n", - " COMPLETENESS_OVER_SAMPLES),\n", - " min_feat_per_sample=int(min_feat_per_sample),\n", - " min_samples_per_feat=int(min_samples_per_feat),)\n", - " })\n", - "records" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "fname = FOLDER_EXPERIMENT / 'records.json'\n", - "files_out[fname.name] = fname\n", - "with open(fname, 'w') as f:\n", - " json.dump(records, f, indent=4)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Plot basic distribution present-absent pattern of features and samples\n", - "\n", - "### Line plots" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fig = plotting.data.plot_missing_dist_highdim(data,\n", - " min_feat_per_sample=min_feat_per_sample,\n", - " min_samples_per_feat=min_samples_per_feat)\n", - "fname = FIGUREFOLDER / f'dist_all_lineplot_w_cutoffs.pdf'\n", - "files_out[fname.name] = fname\n", - "vaep.savefig(fig, name=fname)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fig = plotting.data.plot_missing_dist_highdim(data)\n", - "fname = FIGUREFOLDER / f'dist_all_lineplot_wo_cutoffs.pdf'\n", - "files_out[fname.name] = fname\n", - "vaep.savefig(fig, name=fname)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f891da5c", - "metadata": {}, - "outputs": [], - "source": [ - "fig = plotting.data.plot_missing_pattern_histogram(data,\n", - " min_feat_per_sample=min_feat_per_sample,\n", - " min_samples_per_feat=min_samples_per_feat)\n", - "fname = FIGUREFOLDER / f'dist_all_histogram_w_cutoffs.pdf'\n", - "files_out[fname.name] = fname\n", - "vaep.savefig(fig, name=fname)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1f38e2d9", - "metadata": 
{}, - "outputs": [], - "source": [ - "fig = plotting.data.plot_missing_pattern_histogram(data)\n", - "fname = FIGUREFOLDER / f'dist_all_histogram_wo_cutoffs.pdf'\n", - "files_out[fname.name] = fname\n", - "vaep.savefig(fig, name=fname)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Boxplots" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fig = plotting.data.plot_missing_dist_boxplots(data)\n", - "fname = FIGUREFOLDER / f'dist_all_boxplots.pdf'\n", - "files_out[fname.name] = fname\n", - "vaep.savefig(fig, name=fname)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Violinplots" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fig = plotting.data.plot_missing_pattern_violinplot(\n", - " data, min_feat_per_sample, min_samples_per_feat)\n", - "fname = FIGUREFOLDER / f'dist_all_violin_plot.pdf'\n", - "files_out[fname.name] = fname\n", - "vaep.savefig(fig, name=fname)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "lines_to_next_cell": 0 - }, - "source": [ - "## Feature medians over prop. 
of missing of feature" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ax = plotting.data.plot_feat_median_over_prop_missing(\n", - " data=data, type='scatter', s=1)\n", - "fname = FIGUREFOLDER / 'intensity_median_vs_prop_missing_scatter'\n", - "files_out[fname.stem] = fname\n", - "vaep.savefig(ax.get_figure(), fname)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "ax = plotting.data.plot_feat_median_over_prop_missing(\n", - " data=data, type='boxplot', s=.8)\n", - "fname = FIGUREFOLDER / 'intensity_median_vs_prop_missing_boxplot'\n", - "files_out[fname.stem] = fname\n", - "vaep.savefig(ax.get_figure(), fname)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Correlation between peptides\n", - "- linear correlation as indicator that there is some variation which could be used by models (or other heuristics)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "%%time\n", - "corr_lower_triangle = analyzers.corr_lower_triangle(data)\n", - "fig, axes = analyzers.plot_corr_histogram(corr_lower_triangle, bins=40)\n", - "fname = FIGUREFOLDER / f'corr_histogram_feat.pdf'\n", - "files_out[fname.name] = fname\n", - "vaep.savefig(fig, name=fname)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Coefficient of variation (CV) of features" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cv = data.std() / data.mean()\n", - "# biological coefficient of variation: standard deviation (variation) w.r.t mean\n", - "ax = cv.hist(bins=30)\n", - "fname = FIGUREFOLDER / f'CV_histogram_features.pdf'\n", - "files_out[fname.name] = fname\n", - "vaep.savefig(ax.get_figure(), name=fname)" - ] - }, - { - "cell_type": 
"markdown", - "metadata": {}, - "source": [ - "## Clustermap and heatmaps of missing values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# needs to deal with duplicates\n", - "# notna = data.notna().T.drop_duplicates().T\n", - "# get index and column names\n", - "cg = sns.clustermap(data.notna(), cbar_pos=None)\n", - "ax = cg.ax_heatmap\n", - "if PG_SEPARATOR is not None:\n", - " _new_labels = [l.get_text().split(PG_SEPARATOR)[0]\n", - " for l in ax.get_xticklabels()]\n", - " _ = ax.set_xticklabels(_new_labels)\n", - "if NO_TICK_LABELS_ON_HEATMAP:\n", - " ax.set_xticks([])\n", - " ax.set_yticks([])\n", - "fname = FIGUREFOLDER / 'clustermap_present_absent_pattern.png'\n", - "files_out[fname.name] = fname\n", - "vaep.savefig(cg.fig,\n", - " name=fname,\n", - " pdf=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "based on cluster, plot heatmaps of features and samples" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "assert (len(cg.dendrogram_row.reordered_ind), len(\n", - " cg.dendrogram_col.reordered_ind)) == data.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vaep.plotting.make_large_descriptors(5)\n", - "ax = sns.heatmap(\n", - " data.iloc[cg.dendrogram_row.reordered_ind,\n", - " cg.dendrogram_col.reordered_ind],\n", - ")\n", - "only_every_x_ticks(ax, x=2)\n", - "use_first_n_chars_in_labels(ax, x=SAMPLE_FIRST_N_CHARS)\n", - "if PG_SEPARATOR is not None:\n", - " _new_labels = [l.get_text().split(PG_SEPARATOR)[0]\n", - " for l in ax.get_xticklabels()]\n", - " _ = ax.set_xticklabels(_new_labels)\n", - "if NO_TICK_LABELS_ON_HEATMAP:\n", - " ax.set_xticks([])\n", - " ax.set_yticks([])\n", - "fname = FIGUREFOLDER / 'heatmap_intensities_ordered_by_missing_pattern.png'\n", - "files_out[fname.name] = fname\n", - 
"vaep.savefig(ax.get_figure(), name=fname, pdf=False)\n", - "# ax.get_figure().savefig(fname, dpi=300)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Heatmap of sample and feature correlation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fig, ax = plt.subplots(figsize=(4, 4))\n", - "ax = sns.heatmap(\n", - " analyzers.corr_lower_triangle(\n", - " data.iloc[:, cg.dendrogram_col.reordered_ind]),\n", - " ax=ax,\n", - " square=True,\n", - ")\n", - "_ = only_every_x_ticks(ax, x=2)\n", - "_ = use_first_n_chars_in_labels(ax, x=SAMPLE_FIRST_N_CHARS)\n", - "if PG_SEPARATOR is not None:\n", - " _new_labels = [l.get_text().split(PG_SEPARATOR)[0]\n", - " for l in ax.get_xticklabels()]\n", - " _ = ax.set_xticklabels(_new_labels)\n", - "if NO_TICK_LABELS_ON_HEATMAP:\n", - " ax.set_xticks([])\n", - " ax.set_yticks([])\n", - "fname = FIGUREFOLDER / 'heatmap_feature_correlation.png'\n", - "files_out[fname.name] = fname\n", - "vaep.savefig(fig, name=fname, pdf=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fig, ax = plt.subplots(figsize=(4, 4))\n", - "ax = sns.heatmap(\n", - " analyzers.corr_lower_triangle(\n", - " data.T.iloc[:, cg.dendrogram_row.reordered_ind]),\n", - " ax=ax,\n", - " square=True,\n", - ")\n", - "_ = only_every_x_ticks(ax, x=2)\n", - "_ = use_first_n_chars_in_labels(ax, x=SAMPLE_FIRST_N_CHARS)\n", - "if NO_TICK_LABELS_ON_HEATMAP:\n", - " ax.set_xticks([])\n", - " ax.set_yticks([])\n", - "fname = FIGUREFOLDER / 'heatmap_sample_correlation.png'\n", - "files_out[fname.name] = fname\n", - "vaep.savefig(fig, name=fname, pdf=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "kwargs = dict()\n", - "if NO_TICK_LABELS_ON_HEATMAP:\n", - " kwargs['xticklabels'] = False\n", - " kwargs['yticklabels'] = False\n", - "cg = 
get_clustermap(data, **kwargs)\n", - "ax = cg.ax_heatmap\n", - "if PG_SEPARATOR is not None:\n", - " _new_labels = [l.get_text().split(PG_SEPARATOR)[0]\n", - " for l in ax.get_xticklabels()]\n", - " _ = ax.set_xticklabels(_new_labels)\n", - "_ = only_every_x_ticks(ax, x=2, axis=0)\n", - "_ = use_first_n_chars_in_labels(ax, x=SAMPLE_FIRST_N_CHARS)\n", - "\n", - "fname = FIGUREFOLDER / 'clustermap_intensities_normalized.png'\n", - "files_out[fname.name] = fname\n", - "cg.fig.savefig(fname, dpi=300) # avoid tight_layout\n", - "# tight_layout makes the cbar a bit ugly\n", - "# vaep.savefig(cg.fig,\n", - "# name=fname,\n", - "# pdf=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Sample stats" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "TYPE = 'feat'\n", - "COL_NO_MISSING, COL_NO_IDENTIFIED = f'no_missing_{TYPE}', f'no_identified_{TYPE}'\n", - "COL_PROP_SAMPLES = 'prop_samples'\n", - "\n", - "sample_stats = vaep.data_handling.compute_stats_missing(\n", - " data.notna(), COL_NO_MISSING, COL_NO_IDENTIFIED)\n", - "sample_stats" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fig_ident = sns.relplot(\n", - " x='SampleID_int', y=COL_NO_IDENTIFIED, data=sample_stats)\n", - "fig_ident.set_axis_labels('Sample ID', f'Frequency of identified {TYPE}')\n", - "fig_ident.fig.suptitle(f'Frequency of identified {TYPE} by sample id', y=1.03)\n", - "vaep.savefig(fig_ident, f'identified_{TYPE}_by_sample', folder=FIGUREFOLDER)\n", - "\n", - "fig_ident_dist = sns.relplot(\n", - " x=COL_PROP_SAMPLES, y=COL_NO_IDENTIFIED, data=sample_stats)\n", - "fig_ident_dist.set_axis_labels(\n", - " 'Proportion of samples (sorted by frequency)', f'Frequency of identified {TYPE}')\n", - "fig_ident_dist.fig.suptitle(\n", - " f'Frequency of identified {TYPE} groups by sample id', y=1.03)\n", - "fname = FIGUREFOLDER / 
f'identified_{TYPE}_ordered.pdf'\n", - "files_out[fname.name] = fname\n", - "vaep.savefig(fig_ident_dist, fname)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "COL_NO_MISSING_PROP = COL_NO_MISSING + '_PROP'\n", - "sample_stats[COL_NO_MISSING_PROP] = sample_stats[COL_NO_MISSING] / \\\n", - " float(data.shape[1])\n", - "sns.set(style=\"white\")\n", - "g = sns.relplot(x='prop_samples', y=COL_NO_MISSING_PROP, data=sample_stats)\n", - "plt.subplots_adjust(top=0.9)\n", - "plt.ylim(0, 1)\n", - "g.set_axis_labels(\n", - " \"Proportion of samples (sorted by frequency)\", \"proportion missing\")\n", - "g.fig.suptitle(f'Proportion of missing {TYPE} ordered')\n", - "\n", - "fname = FIGUREFOLDER / 'proportion_feat_missing.pdf'\n", - "files_out[fname.name] = fname\n", - "vaep.savefig(g, fname)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Reference table intensities (log2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "min_max = int(data.min().min()), int(data.max().max()) + 1\n", - "dynamic_range = None\n", - "if min_max[1] < 100:\n", - " dynamic_range = get_dynamic_range(min_max)\n", - "dynamic_range" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "files_out" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "vaep", - "language": "python", - "name": "vaep" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - }, - "nbdime-conflicts": { - "local_diff": [ - { - "diff": [ - { - "diff": [ - { - "diff": [ - { - "key": 4, - "op": "addrange", - "valuelist": "5" - }, - { - "key": 4, - "length": 1, - "op": 
"removerange" - } - ], - "key": 0, - "op": "patch" - } - ], - "key": "version", - "op": "patch" - } - ], - "key": "language_info", - "op": "patch" - } - ], - "remote_diff": [ - { - "diff": [ - { - "diff": [ - { - "key": 0, - "length": 1, - "op": "removerange" - } - ], - "key": "version", - "op": "patch" - } - ], - "key": "language_info", - "op": "patch" - } - ] - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/project/00_6_hela_training_data_exploration.py b/project/00_6_hela_training_data_exploration.py deleted file mode 100644 index 913a81401..000000000 --- a/project/00_6_hela_training_data_exploration.py +++ /dev/null @@ -1,417 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.14.5 -# kernelspec: -# display_name: vaep -# language: python -# name: vaep -# --- - -# %% [markdown] Collapsed="false" -# # Peptides -# -# Load peptides selected for training - -# %% Collapsed="false" -from datetime import datetime -from functools import partial -from pathlib import Path - -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt - -# from sklearn import preprocessing -# from sklearn.decomposition import PCA -import seaborn as sns - -import vaep -from vaep.data_handling import coverage -from vaep.plotting import _savefig - -import config -from vaep.analyzers import analyzers -from vaep.io.data_objects import PeptideCounter - -pd.options.display.max_columns = 100 -pd.options.display.min_rows = 30 - -# %% [markdown] -# ## Descriptive Statistics (Linear case) -# -# - spread of peptide quantifications between samples -# - spread of quantifications within samples -# - correlation analysis: can linear correlation be picked up? 
-# - -# %% [markdown] -# ### Peptides - -# %% -FN_PEPTIDE_INTENSITIES = Path('data/dev_datasets/df_intensities_proteinGroups_long_2017_2018_2019_2020_N05015_M04547/Q_Exactive_HF_X_Orbitrap_Exactive_Series_slot_#6070.csv') -FIGUREFOLDER = FN_PEPTIDE_INTENSITIES.parent / 'figures' / FN_PEPTIDE_INTENSITIES.stem -FIGUREFOLDER.mkdir(exist_ok=True, parents=True) -FIGUREFOLDER - -# %% -N_FIRST_ROWS = None # possibility to select N first rows -analysis = analyzers.AnalyzePeptides.from_csv(fname=FN_PEPTIDE_INTENSITIES, index_col=[0,1],nrows=N_FIRST_ROWS) -df = analysis.to_wide_format() -analysis.describe_peptides(sample_n=30) - -# %% [markdown] -# ### Peptide frequency: sellect the N most common peptides -# -# - N most common peptides between samples - -# %% -N = 10 - -peptide_counter = PeptideCounter(config.FNAME_C_PEPTIDES) -peptide_counter.counter.most_common(N) - -# %% -counts = analysis.df.count().sort_values(ascending=False) -counts.iloc[:N] - -# %% -analysis.df[counts.iloc[:N].index] - -# %% [markdown] -# ## Correlation between peptides -# - linear correlation as indicator that there is some variation which could be used by models (or other heuristics) - -# %% -sample = analysis.df.sample(n=30, axis=1) -# ToDo func is assigned to df -corr_lower_triangle = analyzers.corr_lower_triangle(sample) -corr_lower_triangle - -# %% -fig, axes = analyzers.plot_corr_histogram(corr_lower_triangle, bins=40) - -# %% [markdown] -# ### Samples - -# %% -analysis.df.sample(30, axis=0).T.describe() - -# %% [markdown] -# ### Peptides (all) - -# %% -stats = analysis.describe_peptides() - -# %% -_ = stats.loc['CV'].hist(figsize=(10, 4)) # biological coefficient of variation: standard deviation (variation) w.r.t mean - -# %% -_ = stats.loc['count'].hist(figsize=(10,4)) - -# %% Collapsed="false" -INDEX_NAME = 'Sample ID' -analysis.df.index.name = INDEX_NAME - -# %% Collapsed="false" -analysis.df - -# %% Collapsed="false" -N_MIN_OBS = analysis.df.shape[0] * 0.7 # here: present in 70% of the 
samples -mask_min_obsevation = analysis.df.notna().sum() >= N_MIN_OBS -mask_min_obsevation.sum() - -# %% [markdown] -# Reference analysis.df as `X` - -# %% -X = analysis.df - -# %% [markdown] Collapsed="false" -# ## Completeness of peptides - -# %% Collapsed="false" -# %time not_missing = vaep.data_handling.get_sorted_not_missing(X) -not_missing.iloc[:, -10:].describe() - -# %% Collapsed="false" -sample_completeness = not_missing.sum(axis=1).sort_values() / X.shape[-1] -sample_completeness - -# %% Collapsed="false" -N_MOST_COMMON_PEPTIDES = 300 -data_to_visualize = not_missing.iloc[:, -N_MOST_COMMON_PEPTIDES:] -data_to_visualize = data_to_visualize.loc[sample_completeness.index] -print(f"Look at missingness pattern of {N_MOST_COMMON_PEPTIDES} most common peptides across sample.\n" - f"Data matrix dimension used for printing: { data_to_visualize.shape}") - - -fig_heatmap_missing, axes_heatmap_missing = plt.subplots( - 1, 1, figsize=(12, 8)) -USE_CBAR = False - -axes_heatmap_missing = sns.heatmap(data_to_visualize, - ax=axes_heatmap_missing, - cbar = USE_CBAR, - ) - -# %% [markdown] -# White patches indicates that a peptide has been measured, black means it was not measured. Some samples (rows) have few of the most common peptides. This suggests to set a minimum of total peptides in a sample, which is common pratice. -# -# > An algorithm should work with the most common peptides and base it's inference capabilities after training on these. 
- -# %% -data_to_visualize.sum(axis=1).nsmallest(20) # Samplest with the fewest measurements out of the seletion - -# %% Collapsed="false" -# # This currently crashes if you want to have a pdf -datetime_now = datetime.now() -_savefig = partial(_savefig, folder=FIGUREFOLDER) - -_savefig(fig_heatmap_missing, - f'peptides_heatmap_missing_{datetime_now:%y%m%d}', pdf=False) - -# %% [markdown] Collapsed="false" -# ## Sample stats - -# %% Collapsed="false" -TYPE = 'peptides' -COL_NO_MISSING, COL_NO_IDENTIFIED = f'no_missing_{TYPE}', f'no_identified_{TYPE}' -COL_PROP_SAMPLES = 'prop_samples' - - -sample_stats = vaep.data_handling.compute_stats_missing(not_missing, COL_NO_MISSING, COL_NO_IDENTIFIED ) -sample_stats - -# %% Collapsed="false" -fig_ident = sns.relplot( - x='SampleID_int', y=COL_NO_IDENTIFIED, data=sample_stats) -fig_ident.set_axis_labels('Sample ID', f'Frequency of identified {TYPE}') -fig_ident.fig.suptitle(f'Frequency of identified {TYPE} by sample id', y=1.03) -_savefig(fig_ident, f'identified_{TYPE}_by_sample', folder=FIGUREFOLDER) - -fig_ident_dist = sns.relplot( - x=COL_PROP_SAMPLES, y=COL_NO_IDENTIFIED, data=sample_stats) -fig_ident_dist.set_axis_labels( - 'Proportion of samples (sorted by frequency)', f'Frequency of identified {TYPE}') -fig_ident_dist.fig.suptitle( - f'Frequency of identified {TYPE} groups by sample id', y=1.03) -_savefig(fig_ident_dist, f'identified_{TYPE}_ordered', folder=FIGUREFOLDER) - -# %% Collapsed="false" -COL_NO_MISSING_PROP = COL_NO_MISSING + '_PROP' -sample_stats[COL_NO_MISSING_PROP] = sample_stats[COL_NO_MISSING] / \ - float(X.shape[1]) - -# from ggplot import * -# ggplot(aes(x='nan_proc'), data = nonnan) + geom_histogram(binwidth = 0.005) #+ ylim(0,0.025) -sns.set(style="darkgrid") -g = sns.relplot(x='prop_samples', y=COL_NO_MISSING_PROP, data=sample_stats) -plt.subplots_adjust(top=0.9) -g.set_axis_labels( - "Proportion of samples (sorted by frequency)", "proportion missing") -g.fig.suptitle(f'Proportion of missing {TYPE} 
ordered') -_savefig(g, "proportion_proteins_missing") - - -# %% [markdown] Collapsed="false" -# ## Look at sequences -# -# Shows mainly that from a 6-7 AA on, peptides sequences are nearly unique. -# -# > Overlapping peptides (from the start or the end) could still be interesting to find - -# %% Collapsed="false" -class SequenceAnalyser(): - - def __init__(self, sequences: pd.Series): - if not isinstance(sequences, pd.Series): - raise ValueError( - "Please provide a pandas.Series, not {}".format(type(sequences))) - self.sequences = sequences - - def calc_counts(self, n_characters): - return self.sequences.str[:n_characters].value_counts() - - def length(self): - return self.sequences.str.len().sort_values() - - -# %% Collapsed="false" -sequences = SequenceAnalyser(X.columns.to_series()) -sequences.length() - -# %% Collapsed="false" -import ipywidgets as w -_ = w.interact(sequences.calc_counts, - n_characters=w.IntSlider(value=4, min=1, max=55)) - -# %% Collapsed="false" -sequences_p4 = sequences.calc_counts(4) -display(sequences_p4.head()) - -# %% Collapsed="false" -sequences_p4.loc[sequences_p4.isin(('CON_', 'REV_'))].sort_index() - -# %% [markdown] Collapsed="false" -# What to do when -# -# -# ``` -# AAAAAAAAAAGAAGGRGSGPGR -# AAAAAAAAAAGAAGGRGSGPGRR -# -# AAAANSGSSLPLFDCPTWAGKPPPGLHLDVVK -# AAAANSGSSLPLFDCPTWAGKPPPGLHLDVVKGDK -# ``` -# -# - -# %% [markdown] Collapsed="false" -# ## Select Training Data - -# %% [markdown] Collapsed="false" -# ### Minumum required sample quality -# First define the minum requirement of a sample to be kept in - -# %% Collapsed="false" -import ipywidgets as w -range_peps = (0, max(sample_stats[COL_NO_IDENTIFIED])) -MIN_DEPTH_SAMPLE = int(range_peps[1] * 0.6) -w_min_depth_sample = w.IntSlider( - value=MIN_DEPTH_SAMPLE, min=0, max=range_peps[1]) -print(f'Minimum {TYPE} per sample observed:') -w_min_depth_sample - -# %% Collapsed="false" -mask_samples = sample_stats[COL_NO_IDENTIFIED] >= w_min_depth_sample.value -print(f"Selected 
{mask_samples.sum()} samples") - -# %% Collapsed="false" -x_50 = coverage(X.loc[mask_samples], coverage_col=0.5, coverage_row=0.2) -# x_50_pca = log_z_zeroone_na(x_50) # there is a huge difference if NA is set to low value or mean!! -x_90 = coverage(X.loc[mask_samples], 0.9, 0.9) - -# %% Collapsed="false" -x_50.shape, x_90.shape - -# %% Collapsed="false" -x_90.sample() - -# %% [markdown] -# Data selection should be done for each experiment, so it is not resaved here - -# %% -#from vaep.io.data_objects import get_fname -# fname = config.FOLDER_DATA / get_fname(*x_90.shape) -# print(fname) -# x_90.to_csv(fname) -# fname = config.FOLDER_DATA / get_fname(*x_50.shape) -# print(fname) -# x_50.to_csv(fname) - -# %% [markdown] Collapsed="false" -# ### Distribution of Intensity values -# - comparing non-transformed to $\log_{10}$ transformed -# - log transformation makes data more normal distributed -# -# > log10 or log2 or ln - -# %% [markdown] -# #### Sample with all peptides - -# %% Collapsed="false" -sample = x_50.sample().iloc[0] -sample_id = sample.name -print("Sample ID:", sample_id) - -# %% Collapsed="false" -import matplotlib - -sns.set(style="darkgrid") - - -def plot_dist_comparison( - sample: pd.Series, figsize=(12, 5), - log=np.log, log_name=None, -) -> matplotlib.figure.Figure: - fig, axes = plt.subplots(1, 2, figsize=figsize) - - sns.histplot(sample, bins=100, ax=axes[0]) - axes[0].set_title("Unnormalized distribution") - - sample_log = log(sample) - sns.histplot(sample_log, bins=100, ax=axes[1]) - if not log_name: - log_name = str(log).split("'")[1] - axes[1].set_title(f"{log_name} normalized distribution") - sample_id = sample.name - _ = fig.suptitle(f"Dynamic Range of measured intensities in sample {sample_id}") - fig.tight_layout(rect=[0, 0.03, 1, 0.95]) - return fig - - -fig = plot_dist_comparison(sample) -_savefig(fig, f"distribution_sample_peptides_{str(sample_id)}_ln") - -# %% -fig = plot_dist_comparison(sample, log=np.log2) -_savefig(fig, 
f"distribution_peptides_sample_{str(sample_id)}_log2") - -# %% -sample_log_stats = np.log2(sample).describe().to_frame('log2') -sample_log_stats['ln'] = np.log (sample).describe() -sample_log_stats - -# %% -print(f"Factor for log2 to ln: {1 / np.log2(np.e) = :.3f}") -c = 1 / np.log2(np.e) - -# %% [markdown] -# If $ log2(x) \sim \mathcal{N}\big(\mu_{log2}, \sigma_{log2}^2 \big) $, then $ ln(x) \sim \mathcal{N}\big(0.693 \cdot \mu_{log2}, 0.693^2 \cdot \sigma_{log2}^2 \big) $. -# -# > Question: Is a wider or narrower distribtion important, or does only be "normal" - -# %% -print(f"mean: {sample_log_stats.loc['mean','log2'] * c = : .3f}") -print(f"std : {sample_log_stats.loc['std' ,'log2'] * c = : .3f}") - -# %% [markdown] -# #### One Peptide, all samples - -# %% Collapsed="false" -from vaep.transform import log -from random import sample -sample = x_50.sample(axis=1).squeeze() -peptide = sample.name - -fig = plot_dist_comparison(sample) -_savefig(fig, f"distribution_peptide_samples_{str(peptide)}_ln") - -# %% [markdown] Collapsed="false" -# ### Reference table intensities (natural logarithm) -# -# 14 to 23 spans a dynamic range of 3 orders of base 10 - -# %% Collapsed="false" -dynamic_range = pd.DataFrame(range(14, 24), columns=['x']) -dynamic_range['$e^x$'] = dynamic_range.x.apply(np.exp) -dynamic_range.set_index('x', inplace=True) -dynamic_range.index.name = '' -dynamic_range.T - -# %% [markdown] Collapsed="false" -# ## Next UP - -# %% [markdown] -# - -# %% [markdown] Collapsed="false" -# ### Find Protein of Peptides -# - check with some reference list of peptides: This is created in `project\FASTA_tryptic_digest.ipynb` - -# %% diff --git a/project/01_0_split_data.ipynb b/project/01_0_split_data.ipynb index f6df4bd16..b4949548c 100644 --- a/project/01_0_split_data.ipynb +++ b/project/01_0_split_data.ipynb @@ -16,10 +16,10 @@ "outputs": [], "source": [ "from pathlib import Path\n", - "\n", + "import logging\n", "from typing import Union, List\n", "\n", - "\n", + 
"from IPython.display import display\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", @@ -28,31 +28,31 @@ "\n", "import vaep\n", "from vaep.io.datasplits import DataSplits\n", - "from vaep.io import thermo_raw_files\n", - "from vaep.sampling import feature_frequency, sample_data\n", + "from vaep.sampling import feature_frequency\n", "\n", "from vaep.analyzers import analyzers\n", - "from vaep.analyzers.analyzers import AnalyzePeptides\n", + "from vaep.analyzers.analyzers import AnalyzePeptides\n", "\n", "logger = vaep.logging.setup_nb_logger()\n", "logger.info(\"Split data and make diagnostic plots\")\n", + "logging.getLogger('fontTools').setLevel(logging.WARNING)\n", + "\n", "\n", - "def add_meta_data(analysis: AnalyzePeptides, df_meta: pd.DataFrame):\n", + "def add_meta_data(df: pd.DataFrame, df_meta: pd.DataFrame):\n", " try:\n", - " analysis.df = analysis.df.loc[df_meta.index]\n", + " df = df.loc[df_meta.index]\n", " except KeyError as e:\n", " logger.warning(e)\n", " logger.warning(\"Ignore missing samples in quantified samples\")\n", - " analysis.df = analysis.df.loc[analysis.df.index.intersection(\n", + " df = df.loc[df.index.intersection(\n", " df_meta.index)]\n", - "\n", - " analysis.df_meta = df_meta\n", - " return analysis\n", + " return df_meta\n", "\n", "\n", "pd.options.display.max_columns = 32\n", "plt.rcParams['figure.figsize'] = [4, 2]\n", - "vaep.plotting.make_large_descriptors(5)\n", + "\n", + "vaep.plotting.make_large_descriptors(6)\n", "\n", "figures = {} # collection of ax or figures\n", "dumps = {} # collection of data dumps" @@ -87,32 +87,26 @@ }, "outputs": [], "source": [ - "# Sample (rows) intensiites for features (columns)\n", - "FN_INTENSITIES: str = 'data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv'\n", - "# Can be either a string or position (typical 0 for first column), or a list of these.\n", - "index_col: Union[str, int] = 0\n", - "# wide_format: bool = False # intensities in wide 
format (more memory efficient of csv). Default is long_format (more precise)\n", - "# Manuelly set column names (of Index object in columns)\n", - "column_names: List[str] = [\"Gene Names\"]\n", - "# Machine parsed metadata from raw file (see workflows/metadata), wide format per sample\n", - "fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv'\n", - "# Minimum number or fraction of feature prevalence across samples to be kept\n", - "feat_prevalence: Union[int, float] = 0.25\n", - "# Minimum number or fraction of total requested features per Sample\n", - "sample_completeness: Union[int, float] = 0.5\n", + "FN_INTENSITIES: str = 'data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv' # Sample (rows), features (columns)\n", + "index_col: Union[str, int] = 0 # Can be either a string or position (default 0 for first column), or a list of these.\n", + "column_names: List[str] = [\"Gene Names\"] # Manuelly set column names (of Index object in columns)\n", + "fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv' # metadata for samples (rows)\n", + "feat_prevalence: Union[int, float] = 0.25 # Minimum number or fraction of feature prevalence across samples to be kept\n", + "sample_completeness: Union[int, float] = 0.5 # Minimum number or fraction of total requested features per Sample\n", "select_N: int = None # only use latest N samples\n", - "sample_N: bool = False # if select_N, sample N randomly instead of using latest?\n", + "sample_N: bool = False # if select_N, sample N randomly instead of using latest N\n", "random_state: int = 42 # random state for reproducibility of splits\n", - "# based on raw file meta data, only take samples with RT > min_RT_time\n", - "min_RT_time: Union[int, float] = None\n", - "# Log transformation of initial data (select one of the existing in numpy)\n", - "logarithm: str = 'log2'\n", - "folder_experiment: str = f'runs/example'\n", - "folder_data: str = '' # specify 
data directory if needed\n", + "min_RT_time: Union[int, float] = None # based on raw file meta data, only take samples with RT > min_RT_time\n", + "logarithm: str = 'log2' # Log transformation of initial data (select one of the existing in numpy)\n", + "folder_experiment: str = 'runs/example' # folder to save figures and data dumps\n", + "folder_data: str = '' # specify special data directory if needed\n", "file_format: str = 'csv' # file format of create splits, default pickle (pkl)\n", "# metadata -> defaults for metadata extracted from machine data, used for plotting\n", "meta_date_col: str = None # date column in meta data\n", - "meta_cat_col: str = None # category column in meta data" + "meta_cat_col: str = None # category column in meta data\n", + "# train, validation and test data splits\n", + "frac_non_train: float = 0.1 # fraction of non training data (validation and test split)\n", + "frac_mnar: float = 0.0 # fraction of missing not at random data, rest: missing completely at random" ] }, { @@ -187,6 +181,7 @@ "metadata": {}, "outputs": [], "source": [ + "# ! 
factor out file reading to a separate module, not class\n", "# AnalyzePeptides.from_csv\n", "constructor = getattr(AnalyzePeptides, FILE_FORMAT_TO_CONSTRUCTOR[FILE_EXT])\n", "analysis = constructor(fname=params.FN_INTENSITIES,\n", @@ -202,7 +197,9 @@ "log_fct = getattr(np, params.logarithm)\n", "analysis.log_transform(log_fct)\n", "logger.info(f\"{analysis = }\")\n", - "analysis.df" + "df = analysis.df\n", + "del analysis.df # free memory\n", + "df" ] }, { @@ -213,8 +210,13 @@ }, "outputs": [], "source": [ - "ax = analysis.df.notna().sum(axis=0).to_frame(\n", - " analysis.df.columns.name).plot.box()\n", + "ax = (df\n", + " .notna()\n", + " .sum(axis=0)\n", + " .to_frame(df.columns.name)\n", + " .plot\n", + " .box()\n", + " )\n", "ax.set_ylabel('number of observation across samples')" ] }, @@ -228,7 +230,7 @@ "dumps[fname.name] = fname.as_posix()\n", "writer = pd.ExcelWriter(fname)\n", "\n", - "notna = analysis.df.notna()\n", + "notna = df.notna()\n", "data_stats_original = pd.concat(\n", " [\n", " notna.sum().describe().rename('feat_stats'),\n", @@ -243,7 +245,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In case there are multiple features for each intensity values (currenlty: peptide sequence and charge), combine the column names to a single str index.\n", + "In case there are multiple features for each intensity values (currenlty: peptide sequence and charge),\n", + "combine the column names to a single str index.\n", "\n", "> The Collaborative Modeling approach will need a single feature column." 
] @@ -258,15 +261,16 @@ " ret = \"_\".join(str(x) for x in seq)\n", " return ret\n", "\n", + "\n", "# ToDo: join multiindex samples indices (pkl dumps)\n", - "# if hasattr(analysis.df.columns, \"levels\"):\n", - "if isinstance(analysis.df.columns, pd.MultiIndex):\n", + "# if hasattr(df.columns, \"levels\"):\n", + "if isinstance(df.columns, pd.MultiIndex):\n", " logger.warning(\"combine MultiIndex columns to one feature column\")\n", - " print(analysis.df.columns[:10].map(join_as_str))\n", - " _new_name = join_as_str(analysis.df.columns.names)\n", - " analysis.df.columns = analysis.df.columns.map(join_as_str)\n", - " analysis.df.columns.name = _new_name\n", - " logger.warning(f\"New name: {analysis.df.columns.names = }\")" + " print(df.columns[:10].map(join_as_str))\n", + " _new_name = join_as_str(df.columns.names)\n", + " df.columns = df.columns.map(join_as_str)\n", + " df.columns.name = _new_name\n", + " logger.warning(f\"New name: {df.columns.names = }\")" ] }, { @@ -287,15 +291,15 @@ "if params.fn_rawfile_metadata:\n", " df_meta = pd.read_csv(params.fn_rawfile_metadata, index_col=0)\n", "else:\n", - " logger.warning(f\"No metadata for samples provided, create placeholder.\")\n", + " logger.warning(\"No metadata for samples provided, create placeholder.\")\n", " if params.meta_date_col:\n", " raise ValueError(\n", " f\"No metadata provided, but data column set: {params.meta_date_col}\")\n", " if params.meta_cat_col:\n", " raise ValueError(\n", " f\"No metadata provided, but data column set: {params.meta_cat_col}\")\n", - " df_meta = pd.DataFrame(index=analysis.df.index)\n", - "df_meta = df_meta.loc[analysis.df.index.to_list()] # index is sample index\n", + " df_meta = pd.DataFrame(index=df.index)\n", + "df_meta = df_meta.loc[df.index.to_list()] # index is sample index\n", "if df_meta.index.name is None:\n", " df_meta.index.name = params.index_col[0]\n", "df_meta" @@ -304,7 +308,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { 
+ "lines_to_next_cell": 2 + }, "outputs": [], "source": [ "if params.meta_date_col:\n", @@ -322,22 +328,7 @@ "metadata": {}, "outputs": [], "source": [ - "if df_meta.columns.isin(thermo_raw_files.cols_instrument).sum() == len(thermo_raw_files.cols_instrument):\n", - " display(df_meta.groupby(thermo_raw_files.cols_instrument)[\n", - " params.meta_date_col].agg(['count', 'min', 'max']))\n", - "else:\n", - " logger.info(\n", - " f\"Instrument column not found: {thermo_raw_files.cols_instrument}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_meta.describe(datetime_is_numeric=True,\n", - " percentiles=np.linspace(0.05, 0.95, 10))" + "df_meta.describe(percentiles=np.linspace(0.05, 0.95, 10))" ] }, { @@ -356,7 +347,8 @@ "if params.min_RT_time:\n", " logger.info(\n", " \"Metadata should have 'MS max RT' entry from ThermoRawFileParser\")\n", - " msg = f\"Minimum RT time maxiumum is set to {params.min_RT_time} minutes (to exclude too short runs, which are potentially fractions).\"\n", + " msg = (f\"Minimum RT time maxiumum is set to {params.min_RT_time} minutes\"\n", + " \" (to exclude too short runs, which are potentially fractions).\")\n", " # can be integrated into query string\n", " mask_RT = df_meta['MS max RT'] >= params.min_RT_time\n", " msg += f\" Total number of samples retained: {int(mask_RT.sum())}\"\n", @@ -364,7 +356,7 @@ " logger.info(msg)\n", " df_meta = df_meta.loc[mask_RT]\n", "else:\n", - " logger.warning(f\"Retention time filtering deactivated.\")" + " logger.warning(\"Retention time filtering deactivated.\")" ] }, { @@ -382,7 +374,7 @@ "metadata": {}, "outputs": [], "source": [ - "meta_stats = df_meta.describe(include='all', datetime_is_numeric=True)\n", + "meta_stats = df_meta.describe(include='all')\n", "meta_stats" ] }, @@ -396,7 +388,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [], "source": [ 
"try:\n", @@ -409,65 +403,13 @@ " display(meta_stats.loc[:, (meta_stats.loc['std'] > 0.1)])" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Optional, if using ThermoRawFileParser: check some columns describing settings\n", - " - software can be updated: `Software Version`\n", - " - `mass resolution` setting for instrument\n", - " - colision type for MS2: `beam-type collision-induced dissocation`\n", - " - missing `dilution factor`\n", - " - omit (uncomment if needed):\n", - " - quite some variation due to `MS max charge`: omit\n", - " - variation by `injection volume setting` and instrument over time\n", - " - 500ng of peptides should be injected, based on concentration of peptides this setting is adjusted to get it" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "meta_raw_settings = [\n", - " 'Thermo Scientific instrument model',\n", - " 'instrument serial number',\n", - " 'Software Version',\n", - " # 'MS max charge',\n", - " 'mass resolution',\n", - " 'beam-type collision-induced dissociation',\n", - " # 'injection volume setting',\n", - " 'dilution factor',\n", - "]\n", - "\n", - "if df_meta.columns.isin(meta_raw_settings).sum() == len(meta_raw_settings):\n", - " display(\n", - " # index gives first example with this combination\n", - " df_meta[meta_raw_settings].drop_duplicates()\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- check for variation in `software Version` and `injection volume setting`\n", - "\n", - "\n", - "Update selection of samples based on metadata (e.g. 
minimal retention time)\n", - "- sort data the same as sorted meta data" - ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "analysis = add_meta_data(analysis, df_meta=df_meta)" + "df_meta = add_meta_data(df, df_meta=df_meta)" ] }, { @@ -483,7 +425,7 @@ "metadata": {}, "outputs": [], "source": [ - "assert analysis.df.index.is_unique, \"Duplicates in index\"" + "assert df.index.is_unique, \"Duplicates in index\"" ] }, { @@ -504,21 +446,23 @@ "outputs": [], "source": [ "if params.select_N is not None:\n", - " params.select_N = min(params.select_N, len(analysis.df_meta))\n", + " params.select_N = min(params.select_N, len(df_meta))\n", " if params.sample_N:\n", - " analysis.df_meta = analysis.df_meta.sample(params.select_N)\n", + " df_meta = df_meta.sample(params.select_N)\n", " else:\n", - " analysis.df_meta = analysis.df_meta.iloc[-params.select_N:]\n", + " df_meta = df_meta.iloc[-params.select_N:]\n", "\n", - " analysis.df = analysis.df.loc[analysis.df_meta.index].dropna(\n", + " df = df.loc[df_meta.index].dropna(\n", " how='all', axis=1)\n", - " ax = analysis.df.T.describe().loc['count'].hist()\n", + " ax = df.T.describe().loc['count'].hist()\n", " _ = ax.set_title('histogram of features for all eligable samples')" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "source": [ "## First Step: Select features by prevalence\n", "- `feat_prevalence` across samples" @@ -530,9 +474,9 @@ "metadata": {}, "outputs": [], "source": [ - "freq_per_feature = analysis.df.notna().sum() # on wide format\n", + "freq_per_feature = df.notna().sum() # on wide format\n", "if isinstance(params.feat_prevalence, float):\n", - " N_samples = len(analysis.df_meta)\n", + " N_samples = len(df)\n", " logger.info(f\"Current number of samples: {N_samples}\")\n", " logger.info(\n", " f\"Feature has to be present in at least {params.feat_prevalence:.2%} of samples\")\n", @@ -546,13 +490,19 @@ "mask = 
freq_per_feature >= params.feat_prevalence\n", "logger.info(f\"Drop {(~mask).sum()} features\")\n", "freq_per_feature = freq_per_feature.loc[mask]\n", - "analysis.df = analysis.df.loc[:, mask]\n", - "analysis.N, analysis.M = analysis.df.shape\n", - "\n", + "df = df.loc[:, mask]\n", + "analysis.N, analysis.M = df.shape\n", "# # potentially create freq based on DataFrame\n", - "analysis.df\n", - "\n", - "notna = analysis.df.notna()\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "notna = df.notna()\n", "data_stats_filtered = pd.concat(\n", " [\n", " notna.sum().describe().rename('feat_stats'),\n", @@ -587,11 +537,11 @@ " msg = f'Fraction of minimum sample completeness over all features specified with: {params.sample_completeness}\\n'\n", " # assumes df in wide format\n", " params.overwrite_entry('sample_completeness', int(\n", - " analysis.df.shape[1] * params.sample_completeness))\n", + " df.shape[1] * params.sample_completeness))\n", " msg += f'This translates to a minimum number of features per sample (to be included): {params.sample_completeness}'\n", " logger.info(msg)\n", "\n", - "sample_counts = analysis.df.notna().sum(axis=1) # if DataFrame\n", + "sample_counts = df.notna().sum(axis=1) # if DataFrame\n", "sample_counts.describe()" ] }, @@ -604,8 +554,8 @@ "mask = sample_counts > params.sample_completeness\n", "msg = f'Drop {len(mask) - mask.sum()} of {len(mask)} initial samples.'\n", "print(msg)\n", - "analysis.df = analysis.df.loc[mask]\n", - "analysis.df = analysis.df.dropna(\n", + "df = df.loc[mask]\n", + "df = df.dropna(\n", " axis=1, how='all') # drop now missing features" ] }, @@ -615,8 +565,8 @@ "metadata": {}, "outputs": [], "source": [ - "params.N, params.M = analysis.df.shape # save data dimensions\n", - "params.used_samples = analysis.df.index.to_list()" + "params.N, params.M = df.shape # save data dimensions\n", + "params.used_samples = df.index.to_list()" ] }, { @@ 
-632,10 +582,11 @@ "metadata": {}, "outputs": [], "source": [ - "ax = analysis.df.notna().sum(axis=1).hist()\n", + "group = 1\n", + "ax = df.notna().sum(axis=1).hist()\n", "ax.set_xlabel('features per eligable sample')\n", "ax.set_ylabel('observations')\n", - "fname = params.out_figures / 'hist_features_per_sample'\n", + "fname = params.out_figures / f'0_{group}_hist_features_per_sample'\n", "figures[fname.stem] = fname\n", "vaep.savefig(ax.get_figure(), fname)" ] @@ -648,13 +599,13 @@ }, "outputs": [], "source": [ - "ax = analysis.df.notna().sum(axis=0).sort_values().plot()\n", - "_new_labels = [l.get_text().split(';')[0] for l in ax.get_xticklabels()]\n", + "ax = df.notna().sum(axis=0).sort_values().plot()\n", + "_new_labels = [l_.get_text().split(';')[0] for l_ in ax.get_xticklabels()]\n", "_ = ax.set_xticklabels(_new_labels, rotation=45,\n", " horizontalalignment='right')\n", "ax.set_xlabel('feature prevalence')\n", "ax.set_ylabel('observations')\n", - "fname = params.out_figures / 'feature_prevalence'\n", + "fname = params.out_figures / f'0_{group}_feature_prevalence'\n", "figures[fname.stem] = fname\n", "vaep.savefig(ax.get_figure(), fname)" ] @@ -672,11 +623,11 @@ "metadata": {}, "outputs": [], "source": [ - "min_max = vaep.plotting.data.min_max(analysis.df.stack())\n", + "min_max = vaep.plotting.data.min_max(df.stack())\n", "ax, bins = vaep.plotting.data.plot_histogram_intensities(\n", - " analysis.df.stack(), min_max=min_max)\n", + " df.stack(), min_max=min_max)\n", "\n", - "fname = params.out_figures / 'intensity_distribution_overall'\n", + "fname = params.out_figures / f'0_{group}_intensity_distribution_overall'\n", "figures[fname.stem] = fname\n", "vaep.savefig(ax.get_figure(), fname)" ] @@ -688,8 +639,8 @@ "outputs": [], "source": [ "ax = vaep.plotting.data.plot_feat_median_over_prop_missing(\n", - " data=analysis.df, type='scatter')\n", - "fname = params.out_figures / 'intensity_median_vs_prop_missing_scatter'\n", + " data=df, type='scatter')\n", + 
"fname = params.out_figures / f'0_{group}_intensity_median_vs_prop_missing_scatter'\n", "figures[fname.stem] = fname\n", "vaep.savefig(ax.get_figure(), fname)" ] @@ -701,8 +652,8 @@ "outputs": [], "source": [ "ax = vaep.plotting.data.plot_feat_median_over_prop_missing(\n", - " data=analysis.df, type='boxplot')\n", - "fname = params.out_figures / 'intensity_median_vs_prop_missing_boxplot'\n", + " data=df, type='boxplot')\n", + "fname = params.out_figures / f'0_{group}_intensity_median_vs_prop_missing_boxplot'\n", "figures[fname.stem] = fname\n", "vaep.savefig(ax.get_figure(), fname)" ] @@ -714,13 +665,6 @@ "### Interactive and Single plots" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Scatter plots need to become interactive." - ] - }, { "cell_type": "code", "execution_count": null, @@ -737,9 +681,10 @@ "outputs": [], "source": [ "K = 2\n", - "analysis.df = analysis.df.astype(float)\n", + "df = df.astype(float)\n", + "analysis.df = df\n", "pcs = analysis.get_PCA(n_components=K) # should be renamed to get_PCs\n", - "pcs = pcs.iloc[:, :K].join(analysis.df_meta).join(sample_counts)\n", + "pcs = pcs.iloc[:, :K].join(df_meta).join(sample_counts)\n", "\n", "pcs_name = pcs.columns[:2]\n", "pcs_index_name = pcs.index.name\n", @@ -753,7 +698,7 @@ "metadata": {}, "outputs": [], "source": [ - "pcs.describe(include='all', datetime_is_numeric=True).T" + "pcs.describe(include='all').T" ] }, { @@ -763,11 +708,11 @@ "outputs": [], "source": [ "if params.meta_cat_col:\n", - " fig, ax = plt.subplots(figsize=(2,2))\n", + " fig, ax = plt.subplots(figsize=(2, 2))\n", " analyzers.seaborn_scatter(\n", " pcs[pcs_name], ax, meta=pcs[params.meta_cat_col], title=f\"by {params.meta_cat_col}\")\n", " fname = (params.out_figures\n", - " / f'pca_sample_by_{\"_\".join(params.meta_cat_col.split())}')\n", + " / f'0_{group}_pca_sample_by_{\"_\".join(params.meta_cat_col.split())}')\n", " figures[fname.stem] = fname\n", " vaep.savefig(fig, fname)" ] @@ -782,7 +727,7 @@ " fig, 
ax = plt.subplots()\n", " analyzers.plot_date_map(\n", " df=pcs[pcs_name], ax=ax, dates=pcs[params.meta_date_col], title=f'by {params.meta_date_col}')\n", - " fname = params.out_figures / 'pca_sample_by_date'\n", + " fname = params.out_figures / f'0_{group}_pca_sample_by_date'\n", " figures[fname.stem] = fname\n", " vaep.savefig(fig, fname)" ] @@ -791,7 +736,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "- software version: Does it make a difference?\n", "- size: number of features in a single sample" ] }, @@ -811,7 +755,7 @@ " size=5,\n", ")\n", "fname = (params.out_figures\n", - " / f'pca_sample_by_{\"_\".join(col_identified_feat.split())}.pdf')\n", + " / f'0_{group}_pca_sample_by_{\"_\".join(col_identified_feat.split())}.pdf')\n", "figures[fname.stem] = fname\n", "vaep.savefig(fig, fname)" ] @@ -830,11 +774,11 @@ " # color=pcs['Software Version'],\n", " color=col_identified_feat,\n", " template='none',\n", - " width=1200, # 4 inches x 300 dpi\n", - " height=600 # 2 inches x 300 dpi\n", + " width=1200, # 4 inches x 300 dpi\n", + " height=600 # 2 inches x 300 dpi\n", ")\n", "fname = (params.out_figures\n", - " / f'pca_sample_by_{\"_\".join(col_identified_feat.split())}_plotly.pdf')\n", + " / f'0_{group}_pca_sample_by_{\"_\".join(col_identified_feat.split())}_plotly.pdf')\n", "figures[fname.stem] = fname\n", "fig.write_image(fname)\n", "fig # stays interactive in html" @@ -853,7 +797,7 @@ "metadata": {}, "outputs": [], "source": [ - "analysis.df.head()" + "df.head()" ] }, { @@ -862,12 +806,12 @@ "metadata": {}, "outputs": [], "source": [ - "df = analysis.df\n", - "df = df.join(df_meta[params.meta_date_col])\n", - "df = df.set_index(params.meta_date_col).sort_index()\n", + "df_w_date = df.join(df_meta[params.meta_date_col])\n", + "df_w_date = df_w_date.set_index(params.meta_date_col).sort_index()\n", "if not params.meta_date_col == 'PlaceholderTime':\n", - " df.to_period('min')\n", - "df = df.T" + " df_w_date.to_period('min')\n", + "df_w_date = 
df_w_date.T\n", + "df_w_date" ] }, { @@ -876,13 +820,18 @@ "metadata": {}, "outputs": [], "source": [ - "ax = df.boxplot(rot=80, figsize=(8, 3), fontsize=5,\n", - " showfliers=False, showcaps=False)\n", + "ax = df_w_date.boxplot(rot=80,\n", + " figsize=(8, 3),\n", + " fontsize=6,\n", + " showfliers=False,\n", + " showcaps=False\n", + " )\n", "_ = vaep.plotting.select_xticks(ax)\n", "fig = ax.get_figure()\n", - "fname = params.out_figures / 'median_boxplot'\n", + "fname = params.out_figures / f'0_{group}_median_boxplot'\n", "figures[fname.stem] = fname\n", - "vaep.savefig(fig, fname)" + "vaep.savefig(fig, fname)\n", + "del df_w_date" ] }, { @@ -898,7 +847,7 @@ "metadata": {}, "outputs": [], "source": [ - "df.stack().describe(percentiles=np.linspace(0.05, 0.95, 10))" + "df.stack().describe(percentiles=np.linspace(0.05, 0.95, 19).round(2))" ] }, { @@ -918,23 +867,23 @@ "source": [ "if not params.meta_date_col == 'PlaceholderTime':\n", " dates = df_meta[params.meta_date_col].sort_values()\n", - " # dates.name = 'date'\n", - " median_sample_intensity = (analysis.df\n", + " median_sample_intensity = (df\n", " .median(axis=1)\n", " .to_frame('median intensity'))\n", " median_sample_intensity = median_sample_intensity.join(dates)\n", "\n", " ax = median_sample_intensity.plot.scatter(x=dates.name, y='median intensity',\n", " rot=90,\n", - " fontsize='large',\n", + " # fontsize=6,\n", " figsize=(8, 2),\n", " s=5,\n", " xticks=vaep.plotting.select_dates(\n", " median_sample_intensity[dates.name])\n", " )\n", " fig = ax.get_figure()\n", - " figures['median_scatter'] = params.out_figures / 'median_scatter'\n", - " vaep.savefig(fig, figures['median_scatter'])" + " fname = params.out_figures / f'0_{group}_median_scatter'\n", + " figures[fname.stem] = fname\n", + " vaep.savefig(fig, fname)" ] }, { @@ -948,40 +897,26 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Feature frequency in data\n", - "\n", - "- higher count, higher probability to be sampled into training 
data\n", - "- missing peptides are sampled both into training as well as into validation dataset\n", - "- everything not in training data is validation data\n", - "\n", - "Based on unmodified training data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "msg = \"Total number of samples in training data split: {}\"\n", - "print(msg.format(len(analysis.df)))" + "## Feature frequency in data" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [], "source": [ - "# # analysis.splits.to_wide_format()\n", - "# assert analysis.splits is splits, \"Sanity check failed.\"" + "msg = \"Total number of samples in data: {}\"\n", + "print(msg.format(len(df)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Recalculate feature frequency after selecting some samples" + "Recalculate feature frequency after selecting samples" ] }, { @@ -990,14 +925,16 @@ "metadata": {}, "outputs": [], "source": [ - "freq_per_feature = feature_frequency(analysis.df)\n", + "freq_per_feature = feature_frequency(df)\n", "freq_per_feature" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [], "source": [ "# freq_per_feature.name = 'Gene names freq' # name it differently?\n", @@ -1010,85 +947,138 @@ "freq_per_feature.to_pickle(fname)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Conserning sampling with frequency weights:\n", - " - larger weight -> higher probablility of being sampled\n", - " - weights need to be alignable to index of original DataFrame before grouping (same index)" - ] - }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Split: Train, validation and test data\n", "\n", - "- test data is in clinical language often denoted as independent validation cohort\n", - "- validation data (for model)" + "Select features as 
described in\n", + "> Lazar, Cosmin, Laurent Gatto, Myriam Ferro, Christophe Bruley, and Thomas Burger. 2016.\n", + "> “Accounting for the Multiple Natures of Missing Values in Label-Free Quantitative\n", + "> Proteomics Data Sets to Compare Imputation Strategies.”\n", + "> Journal of Proteome Research 15 (4): 1116–25.\n", + "\n", + "- select `frac_mnar` based on threshold matrix on quantile of overall frac of data to be used\n", + " for validation and test data split, e.g. 0.1 = quantile(0.1)\n", + "- select frac_mnar from intensities selected using threshold matrix" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [], "source": [ - "analysis.splits = DataSplits(is_wide_format=False)\n", - "splits = analysis.splits\n", - "print(f\"{analysis.splits = }\")\n", - "analysis.splits.__annotations__" + "splits = DataSplits(is_wide_format=False)\n", + "print(f\"{splits = }\")\n", + "splits.__annotations__" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Sample targets (Fake NAs)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Add goldstandard targets for valiation and test data\n", - "- based on same day\n", - "- same instrument" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create some target values by sampling 5% of the validation and test data." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "analysis.to_long_format(inplace=True)\n", - "analysis.df_long" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fake_na, splits.train_X = sample_data(analysis.df_long.squeeze(),\n", - " sample_index_to_drop=0,\n", - " weights=freq_per_feature,\n", - " frac=0.1,\n", - " random_state=params.random_state,)\n", - "assert len(splits.train_X) > len(fake_na)\n", - "splits.val_y = fake_na.sample(frac=0.5, random_state=params.random_state).sort_index()\n", - "splits.test_y = fake_na.loc[fake_na.index.difference(splits.val_y.index)]\n", - "# splits" + "Create some target values by sampling X% of the validation and test data.\n", + "Simulated missing values are not used for validation and testing." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_long = vaep.io.datasplits.long_format(df)\n", + "df_long.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group = 2\n", + "! 
move parameter checks to start of script\n", + "if 0.0 <= params.frac_mnar <= 1.0:\n", + " fig, axes = plt.subplots(1, 2, figsize=(8, 2))\n", + " quantile_frac = df_long.quantile(params.frac_non_train)\n", + " rng = np.random.default_rng(params.random_state)\n", + " threshold = pd.Series(rng.normal(loc=float(quantile_frac),\n", + " scale=float(0.3 * df_long.std()),\n", + " size=len(df_long),\n", + " ),\n", + " index=df_long.index,\n", + " )\n", + " # plot data vs threshold data\n", + " ax = axes[0]\n", + " from functools import partial\n", + " plot_histogram_intensities = partial(vaep.plotting.data.plot_histogram_intensities,\n", + " min_max=min_max,\n", + " alpha=0.8)\n", + " plot_histogram_intensities(\n", + " df_long.squeeze(),\n", + " ax=ax,\n", + " label='observed')\n", + " plot_histogram_intensities(\n", + " threshold,\n", + " ax=ax,\n", + " label='thresholds')\n", + " ax.legend()\n", + " # select MNAR (intensity between randomly sampled threshold)\n", + " mask = df_long.squeeze() < threshold\n", + " ! 
subsample to have exact fraction of MNAR?\n", + " N = len(df_long)\n", + " logger.info(f\"{int(N * params.frac_non_train) = :,d}\")\n", + " N_MNAR = int(params.frac_non_train * params.frac_mnar * N)\n", + " fake_na_mnar = df_long.loc[mask]\n", + " if len(fake_na_mnar) > N_MNAR:\n", + " fake_na_mnar = fake_na_mnar.sample(N_MNAR,\n", + " random_state=params.random_state)\n", + " splits.train_X = df_long.loc[\n", + " df_long.index.difference(\n", + " fake_na_mnar.index)\n", + " ]\n", + " logger.info(f\"{len(fake_na_mnar) = :,d}\")\n", + " N_MCAR = int(N * (1 - params.frac_mnar) * params.frac_non_train)\n", + " fake_na_mcar = splits.train_X.sample(N_MCAR,\n", + " random_state=params.random_state)\n", + " logger.info(f\"{len(fake_na_mcar) = :,d}\")\n", + " splits.train_X = (splits\n", + " .train_X\n", + " .loc[splits\n", + " .train_X\n", + " .index\n", + " .difference(\n", + " fake_na_mcar.index)]\n", + " ).squeeze()\n", + " logger.info(f\"{len(splits.train_X) = :,d}\")\n", + " fake_na = pd.concat([fake_na_mcar, fake_na_mnar]).squeeze()\n", + " logger.info(f\"{len(fake_na) = :,d}\")\n", + " ax = axes[1]\n", + " plot_histogram_intensities(\n", + " fake_na_mnar.squeeze(),\n", + " ax=ax,\n", + " label=f'MNAR ({N_MNAR:,d})',\n", + " color='C2')\n", + " plot_histogram_intensities(\n", + " fake_na_mcar.squeeze(),\n", + " ax=ax,\n", + " color='C3',\n", + " label=f'MCAR ({N_MCAR:,d})')\n", + " ax.legend()\n", + " assert len(fake_na) + len(splits.train_X) == len(df_long)\n", + " fname = params.out_figures / f'0_{group}_mnar_mcar_histograms.pdf'\n", + " figures[fname.stem] = fname\n", + " vaep.savefig(fig, fname)\n", + "else:\n", + " raise ValueError(f\"Invalid MNAR float value (should be betw. 
0 and 1): {params.frac_mnar}\")\n", + "\n", + "splits.val_y = fake_na.sample(frac=0.5, random_state=params.random_state)\n", + "splits.test_y = fake_na.loc[fake_na.index.difference(splits.val_y.index)]" ] }, { @@ -1097,7 +1087,7 @@ "metadata": {}, "outputs": [], "source": [ - "splits.test_y" + "splits.test_y.groupby(level=-1).count().describe()" ] }, { @@ -1115,7 +1105,8 @@ "metadata": {}, "outputs": [], "source": [ - "splits.train_X" + "! add option to retain at least N samples per feature\n", + "splits.train_X.groupby(level=-1).count().describe()" ] }, { @@ -1131,14 +1122,14 @@ "# per feature are allowd.\n", "\n", "diff = (splits\n", - " .val_y\n", - " .index\n", - " .levels[-1]\n", - " .difference(splits\n", - " .train_X\n", - " .index\n", - " .levels[-1]\n", - " ).to_list())\n", + " .val_y\n", + " .index\n", + " .levels[-1]\n", + " .difference(splits\n", + " .train_X\n", + " .index\n", + " .levels[-1]\n", + " ).to_list())\n", "if diff:\n", " to_remove = splits.val_y.loc[pd.IndexSlice[:, diff]]\n", " display(to_remove)\n", @@ -1150,20 +1141,18 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "lines_to_next_cell": 2 - }, + "metadata": {}, "outputs": [], "source": [ "diff = (splits\n", - " .test_y\n", - " .index\n", - " .levels[-1]\n", - " .difference(splits\n", - " .train_X\n", - " .index\n", - " .levels[-1]\n", - " ).to_list())\n", + " .test_y\n", + " .index\n", + " .levels[-1]\n", + " .difference(splits\n", + " .train_X\n", + " .index\n", + " .levels[-1]\n", + " ).to_list())\n", "if diff:\n", " to_remove = splits.test_y.loc[pd.IndexSlice[:, diff]]\n", " display(to_remove)\n", @@ -1172,6 +1161,41 @@ "diff" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Some tools require at least 4 observation in the training data,\n", + "which is a good requirment. 
Due to \"MNAR\" sampling, most measurments\n", + "of a features can end up in the validation or test data.\n", + "\n", + "In that case: Move the validation measurments back to the training data.\n", + "If after this procedure the condition is still not met, a value error is raised." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mask_min_4_measurments = splits.train_X.groupby(level=1).count() < 4\n", + "if mask_min_4_measurments.any():\n", + " idx = mask_min_4_measurments.loc[mask_min_4_measurments].index\n", + " logger.warning(f\"Features with less than 4 measurments in training data: {idx.to_list()}\")\n", + " to_remove = splits.val_y.loc[pd.IndexSlice[:, idx]]\n", + " print(\"To remove from validation data: \")\n", + " display(to_remove)\n", + " splits.train_X = pd.concat([splits.train_X, to_remove])\n", + " splits.val_y = splits.val_y.drop(to_remove.index)\n", + " # check condition again\n", + " mask_min_4_measurments = splits.train_X.groupby(level=1).count() < 4\n", + " if mask_min_4_measurments.any():\n", + " idx = mask_min_4_measurments.loc[mask_min_4_measurments].index\n", + " raise ValueError(\"Some features still have less than 4 measurments in training data\"\n", + " f\" after removing the features from the validation data: {idx.to_list()}\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1223,12 +1247,12 @@ "metadata": {}, "outputs": [], "source": [ - "splits_df = pd.DataFrame(index=analysis.df_long.index)\n", + "splits_df = pd.DataFrame(index=df_long.index)\n", "splits_df['train'] = splits.train_X\n", "splits_df['val'] = splits.val_y\n", "splits_df['test'] = splits.test_y\n", "stats_splits = splits_df.describe()\n", - "stats_splits.to_excel(writer, 'stats_splits', float_format='%.2f')\n", + "# stats_splits.to_excel(writer, 'stats_splits', float_format='%.2f')\n", "stats_splits" ] }, @@ -1256,6 +1280,7 @@ "metadata": {}, "outputs": [], "source": [ + "group = 3\n", "ax = 
(splits\n", " .train_X\n", " .plot\n", @@ -1263,7 +1288,7 @@ " bins=bins,\n", " ax=None,\n", " color='C0',\n", - "))\n", + " ))\n", "_ = (splits\n", " .val_y\n", " .plot\n", @@ -1274,7 +1299,7 @@ " legend=True)\n", " )\n", "ax.legend(_legend[:-1])\n", - "fname = params.out_figures / 'test_over_train_split.pdf'\n", + "fname = params.out_figures / f'0_{group}_test_over_train_split.pdf'\n", "figures[fname.name] = fname\n", "vaep.savefig(ax.get_figure(), fname)" ] @@ -1292,11 +1317,11 @@ " legend=False,\n", " stacked=True,\n", " color=['C0', 'C1', 'C2'],\n", - " )\n", + " )\n", "ax.legend(_legend)\n", "ax.set_xlabel('Intensity bins')\n", "ax.yaxis.set_major_formatter(\"{x:,.0f}\")\n", - "fname = params.out_figures / 'splits_freq_stacked.pdf'\n", + "fname = params.out_figures / f'0_{group}_splits_freq_stacked.pdf'\n", "figures[fname.name] = fname\n", "vaep.savefig(ax.get_figure(), fname)" ] @@ -1312,11 +1337,11 @@ " color=['C1', 'C2'],\n", " legend=False,\n", " stacked=True,\n", - " )\n", + " )\n", "ax.legend(_legend[1:])\n", "ax.set_xlabel('Intensity bins')\n", "ax.yaxis.set_major_formatter(\"{x:,.0f}\")\n", - "fname = params.out_figures / 'val_test_split_freq_stacked_.pdf'\n", + "fname = params.out_figures / f'0_{group}_val_test_split_freq_stacked_.pdf'\n", "figures[fname.name] = fname\n", "vaep.savefig(ax.get_figure(), fname)" ] @@ -1345,7 +1370,7 @@ "source": [ "ax = vaep.plotting.data.plot_feat_median_over_prop_missing(\n", " data=splits.train_X, type='scatter')\n", - "fname = params.out_figures / 'intensity_median_vs_prop_missing_scatter_train'\n", + "fname = params.out_figures / f'0_{group}_intensity_median_vs_prop_missing_scatter_train'\n", "figures[fname.stem] = fname\n", "vaep.savefig(ax.get_figure(), fname)" ] @@ -1358,7 +1383,42 @@ "source": [ "ax = vaep.plotting.data.plot_feat_median_over_prop_missing(\n", " data=splits.train_X, type='boxplot')\n", - "fname = params.out_figures / 'intensity_median_vs_prop_missing_boxplot_train'\n", + "fname = 
params.out_figures / f'0_{group}_intensity_median_vs_prop_missing_boxplot_train'\n", + "figures[fname.stem] = fname\n", + "vaep.savefig(ax.get_figure(), fname)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "medians = (splits\n", + " .train_X\n", + " .median()\n", + " .astype(int)\n", + " ).to_frame('median_floor')\n", + "\n", + "feat_with_median = medians.groupby('median_floor').size().rename('M feat')\n", + "medians = medians.join(feat_with_median, on='median_floor')\n", + "medians = medians.apply(lambda s: \"{:02,d} (N={:3,d})\".format(*s), axis=1)\n", + "\n", + "fig, ax = plt.subplots(figsize=(8, 2))\n", + "s = 1\n", + "s_axes = pd.DataFrame({'medians': medians,\n", + " 'validation split': splits.val_y.notna().sum(),\n", + " 'training split': splits.train_X.notna().sum()}\n", + " ).plot.box(by='medians',\n", + " boxprops=dict(linewidth=s),\n", + " flierprops=dict(markersize=s),\n", + " ax=ax)\n", + "for ax in s_axes:\n", + " _ = ax.set_xticklabels(ax.get_xticklabels(),\n", + " rotation=45,\n", + " horizontalalignment='right')\n", + "\n", + "fname = params.out_figures / f'0_{group}_intensity_median_vs_prop_missing_boxplot_val_train'\n", "figures[fname.stem] = fname\n", "vaep.savefig(ax.get_figure(), fname)" ] diff --git a/project/01_0_split_data.py b/project/01_0_split_data.py index 48b1fa31d..50939fb9c 100644 --- a/project/01_0_split_data.py +++ b/project/01_0_split_data.py @@ -6,7 +6,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -20,10 +20,10 @@ # %% from pathlib import Path - +import logging from typing import Union, List - +from IPython.display import display import numpy as np import pandas as pd import matplotlib.pyplot as plt @@ -32,31 +32,31 @@ import vaep from vaep.io.datasplits import DataSplits -from vaep.io import thermo_raw_files -from 
vaep.sampling import feature_frequency, sample_data +from vaep.sampling import feature_frequency from vaep.analyzers import analyzers -from vaep.analyzers.analyzers import AnalyzePeptides +from vaep.analyzers.analyzers import AnalyzePeptides logger = vaep.logging.setup_nb_logger() logger.info("Split data and make diagnostic plots") +logging.getLogger('fontTools').setLevel(logging.WARNING) -def add_meta_data(analysis: AnalyzePeptides, df_meta: pd.DataFrame): + +def add_meta_data(df: pd.DataFrame, df_meta: pd.DataFrame): try: - analysis.df = analysis.df.loc[df_meta.index] + df = df.loc[df_meta.index] except KeyError as e: logger.warning(e) logger.warning("Ignore missing samples in quantified samples") - analysis.df = analysis.df.loc[analysis.df.index.intersection( + df = df.loc[df.index.intersection( df_meta.index)] - - analysis.df_meta = df_meta - return analysis + return df_meta pd.options.display.max_columns = 32 plt.rcParams['figure.figsize'] = [4, 2] -vaep.plotting.make_large_descriptors(5) + +vaep.plotting.make_large_descriptors(6) figures = {} # collection of ax or figures dumps = {} # collection of data dumps @@ -70,32 +70,26 @@ def add_meta_data(analysis: AnalyzePeptides, df_meta: pd.DataFrame): args = dict(globals()).keys() # %% tags=["parameters"] -# Sample (rows) intensiites for features (columns) -FN_INTENSITIES: str = 'data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv' -# Can be either a string or position (typical 0 for first column), or a list of these. -index_col: Union[str, int] = 0 -# wide_format: bool = False # intensities in wide format (more memory efficient of csv). 
Default is long_format (more precise) -# Manuelly set column names (of Index object in columns) -column_names: List[str] = ["Gene Names"] -# Machine parsed metadata from raw file (see workflows/metadata), wide format per sample -fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv' -# Minimum number or fraction of feature prevalence across samples to be kept -feat_prevalence: Union[int, float] = 0.25 -# Minimum number or fraction of total requested features per Sample -sample_completeness: Union[int, float] = 0.5 +FN_INTENSITIES: str = 'data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv' # Sample (rows), features (columns) +index_col: Union[str, int] = 0 # Can be either a string or position (default 0 for first column), or a list of these. +column_names: List[str] = ["Gene Names"] # Manuelly set column names (of Index object in columns) +fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv' # metadata for samples (rows) +feat_prevalence: Union[int, float] = 0.25 # Minimum number or fraction of feature prevalence across samples to be kept +sample_completeness: Union[int, float] = 0.5 # Minimum number or fraction of total requested features per Sample select_N: int = None # only use latest N samples -sample_N: bool = False # if select_N, sample N randomly instead of using latest? 
+sample_N: bool = False # if select_N, sample N randomly instead of using latest N random_state: int = 42 # random state for reproducibility of splits -# based on raw file meta data, only take samples with RT > min_RT_time -min_RT_time: Union[int, float] = None -# Log transformation of initial data (select one of the existing in numpy) -logarithm: str = 'log2' -folder_experiment: str = f'runs/example' -folder_data: str = '' # specify data directory if needed +min_RT_time: Union[int, float] = None # based on raw file meta data, only take samples with RT > min_RT_time +logarithm: str = 'log2' # Log transformation of initial data (select one of the existing in numpy) +folder_experiment: str = 'runs/example' # folder to save figures and data dumps +folder_data: str = '' # specify special data directory if needed file_format: str = 'csv' # file format of create splits, default pickle (pkl) # metadata -> defaults for metadata extracted from machine data, used for plotting meta_date_col: str = None # date column in meta data meta_cat_col: str = None # category column in meta data +# train, validation and test data splits +frac_non_train: float = 0.1 # fraction of non training data (validation and test split) +frac_mnar: float = 0.0 # fraction of missing not at random data, rest: missing completely at random # %% @@ -132,6 +126,7 @@ def add_meta_data(analysis: AnalyzePeptides, df_meta: pd.DataFrame): f"File format (extension): {FILE_EXT} (!specifies data loading function!)") # %% +# # ! 
factor out file reading to a separate module, not class # AnalyzePeptides.from_csv constructor = getattr(AnalyzePeptides, FILE_FORMAT_TO_CONSTRUCTOR[FILE_EXT]) analysis = constructor(fname=params.FN_INTENSITIES, @@ -147,11 +142,18 @@ def add_meta_data(analysis: AnalyzePeptides, df_meta: pd.DataFrame): log_fct = getattr(np, params.logarithm) analysis.log_transform(log_fct) logger.info(f"{analysis = }") -analysis.df +df = analysis.df +del analysis.df # free memory +df # %% -ax = analysis.df.notna().sum(axis=0).to_frame( - analysis.df.columns.name).plot.box() +ax = (df + .notna() + .sum(axis=0) + .to_frame(df.columns.name) + .plot + .box() + ) ax.set_ylabel('number of observation across samples') @@ -160,7 +162,7 @@ def add_meta_data(analysis: AnalyzePeptides, df_meta: pd.DataFrame): dumps[fname.name] = fname.as_posix() writer = pd.ExcelWriter(fname) -notna = analysis.df.notna() +notna = df.notna() data_stats_original = pd.concat( [ notna.sum().describe().rename('feat_stats'), @@ -172,7 +174,8 @@ def add_meta_data(analysis: AnalyzePeptides, df_meta: pd.DataFrame): # %% [markdown] -# In case there are multiple features for each intensity values (currenlty: peptide sequence and charge), combine the column names to a single str index. +# In case there are multiple features for each intensity values (currenlty: peptide sequence and charge), +# combine the column names to a single str index. # # > The Collaborative Modeling approach will need a single feature column. 
@@ -181,15 +184,16 @@ def join_as_str(seq): ret = "_".join(str(x) for x in seq) return ret + # ToDo: join multiindex samples indices (pkl dumps) -# if hasattr(analysis.df.columns, "levels"): -if isinstance(analysis.df.columns, pd.MultiIndex): +# if hasattr(df.columns, "levels"): +if isinstance(df.columns, pd.MultiIndex): logger.warning("combine MultiIndex columns to one feature column") - print(analysis.df.columns[:10].map(join_as_str)) - _new_name = join_as_str(analysis.df.columns.names) - analysis.df.columns = analysis.df.columns.map(join_as_str) - analysis.df.columns.name = _new_name - logger.warning(f"New name: {analysis.df.columns.names = }") + print(df.columns[:10].map(join_as_str)) + _new_name = join_as_str(df.columns.names) + df.columns = df.columns.map(join_as_str) + df.columns.name = _new_name + logger.warning(f"New name: {df.columns.names = }") # %% [markdown] # ## Machine metadata @@ -200,15 +204,15 @@ def join_as_str(seq): if params.fn_rawfile_metadata: df_meta = pd.read_csv(params.fn_rawfile_metadata, index_col=0) else: - logger.warning(f"No metadata for samples provided, create placeholder.") + logger.warning("No metadata for samples provided, create placeholder.") if params.meta_date_col: raise ValueError( f"No metadata provided, but data column set: {params.meta_date_col}") if params.meta_cat_col: raise ValueError( f"No metadata provided, but data column set: {params.meta_cat_col}") - df_meta = pd.DataFrame(index=analysis.df.index) -df_meta = df_meta.loc[analysis.df.index.to_list()] # index is sample index + df_meta = pd.DataFrame(index=df.index) +df_meta = df_meta.loc[df.index.to_list()] # index is sample index if df_meta.index.name is None: df_meta.index.name = params.index_col[0] df_meta @@ -222,17 +226,9 @@ def join_as_str(seq): df_meta[params.meta_date_col] = range(len(df_meta)) df_meta -# %% -if df_meta.columns.isin(thermo_raw_files.cols_instrument).sum() == len(thermo_raw_files.cols_instrument): - 
display(df_meta.groupby(thermo_raw_files.cols_instrument)[ - params.meta_date_col].agg(['count', 'min', 'max'])) -else: - logger.info( - f"Instrument column not found: {thermo_raw_files.cols_instrument}") # %% -df_meta.describe(datetime_is_numeric=True, - percentiles=np.linspace(0.05, 0.95, 10)) +df_meta.describe(percentiles=np.linspace(0.05, 0.95, 10)) # %% [markdown] # select samples with a minimum retention time @@ -241,7 +237,8 @@ def join_as_str(seq): if params.min_RT_time: logger.info( "Metadata should have 'MS max RT' entry from ThermoRawFileParser") - msg = f"Minimum RT time maxiumum is set to {params.min_RT_time} minutes (to exclude too short runs, which are potentially fractions)." + msg = (f"Minimum RT time maxiumum is set to {params.min_RT_time} minutes" + " (to exclude too short runs, which are potentially fractions).") # can be integrated into query string mask_RT = df_meta['MS max RT'] >= params.min_RT_time msg += f" Total number of samples retained: {int(mask_RT.sum())}" @@ -249,13 +246,13 @@ def join_as_str(seq): logger.info(msg) df_meta = df_meta.loc[mask_RT] else: - logger.warning(f"Retention time filtering deactivated.") + logger.warning("Retention time filtering deactivated.") # %% df_meta = df_meta.sort_values(params.meta_date_col) # %% -meta_stats = df_meta.describe(include='all', datetime_is_numeric=True) +meta_stats = df_meta.describe(include='all') meta_stats # %% [markdown] @@ -271,51 +268,15 @@ def join_as_str(seq): if 'unique' in meta_stats.index: display(meta_stats.loc[:, (meta_stats.loc['std'] > 0.1)]) -# %% [markdown] -# Optional, if using ThermoRawFileParser: check some columns describing settings -# - software can be updated: `Software Version` -# - `mass resolution` setting for instrument -# - colision type for MS2: `beam-type collision-induced dissocation` -# - missing `dilution factor` -# - omit (uncomment if needed): -# - quite some variation due to `MS max charge`: omit -# - variation by `injection volume setting` and 
instrument over time -# - 500ng of peptides should be injected, based on concentration of peptides this setting is adjusted to get it - -# %% -meta_raw_settings = [ - 'Thermo Scientific instrument model', - 'instrument serial number', - 'Software Version', - # 'MS max charge', - 'mass resolution', - 'beam-type collision-induced dissociation', - # 'injection volume setting', - 'dilution factor', -] - -if df_meta.columns.isin(meta_raw_settings).sum() == len(meta_raw_settings): - display( - # index gives first example with this combination - df_meta[meta_raw_settings].drop_duplicates() - ) - - -# %% [markdown] -# - check for variation in `software Version` and `injection volume setting` -# -# -# Update selection of samples based on metadata (e.g. minimal retention time) -# - sort data the same as sorted meta data # %% -analysis = add_meta_data(analysis, df_meta=df_meta) +df_meta = add_meta_data(df, df_meta=df_meta) # %% [markdown] # Ensure unique indices # %% -assert analysis.df.index.is_unique, "Duplicates in index" +assert df.index.is_unique, "Duplicates in index" # %% [markdown] # ## Select a subset of samples if specified (reduce the number of samples) @@ -326,25 +287,26 @@ def join_as_str(seq): # %% if params.select_N is not None: - params.select_N = min(params.select_N, len(analysis.df_meta)) + params.select_N = min(params.select_N, len(df_meta)) if params.sample_N: - analysis.df_meta = analysis.df_meta.sample(params.select_N) + df_meta = df_meta.sample(params.select_N) else: - analysis.df_meta = analysis.df_meta.iloc[-params.select_N:] + df_meta = df_meta.iloc[-params.select_N:] - analysis.df = analysis.df.loc[analysis.df_meta.index].dropna( + df = df.loc[df_meta.index].dropna( how='all', axis=1) - ax = analysis.df.T.describe().loc['count'].hist() + ax = df.T.describe().loc['count'].hist() _ = ax.set_title('histogram of features for all eligable samples') # %% [markdown] # ## First Step: Select features by prevalence # - `feat_prevalence` across samples + # %% 
-freq_per_feature = analysis.df.notna().sum() # on wide format +freq_per_feature = df.notna().sum() # on wide format if isinstance(params.feat_prevalence, float): - N_samples = len(analysis.df_meta) + N_samples = len(df) logger.info(f"Current number of samples: {N_samples}") logger.info( f"Feature has to be present in at least {params.feat_prevalence:.2%} of samples") @@ -358,13 +320,13 @@ def join_as_str(seq): mask = freq_per_feature >= params.feat_prevalence logger.info(f"Drop {(~mask).sum()} features") freq_per_feature = freq_per_feature.loc[mask] -analysis.df = analysis.df.loc[:, mask] -analysis.N, analysis.M = analysis.df.shape - +df = df.loc[:, mask] +analysis.N, analysis.M = df.shape # # potentially create freq based on DataFrame -analysis.df +df -notna = analysis.df.notna() +# %% +notna = df.notna() data_stats_filtered = pd.concat( [ notna.sum().describe().rename('feat_stats'), @@ -385,44 +347,45 @@ def join_as_str(seq): msg = f'Fraction of minimum sample completeness over all features specified with: {params.sample_completeness}\n' # assumes df in wide format params.overwrite_entry('sample_completeness', int( - analysis.df.shape[1] * params.sample_completeness)) + df.shape[1] * params.sample_completeness)) msg += f'This translates to a minimum number of features per sample (to be included): {params.sample_completeness}' logger.info(msg) -sample_counts = analysis.df.notna().sum(axis=1) # if DataFrame +sample_counts = df.notna().sum(axis=1) # if DataFrame sample_counts.describe() # %% mask = sample_counts > params.sample_completeness msg = f'Drop {len(mask) - mask.sum()} of {len(mask)} initial samples.' 
print(msg) -analysis.df = analysis.df.loc[mask] -analysis.df = analysis.df.dropna( +df = df.loc[mask] +df = df.dropna( axis=1, how='all') # drop now missing features # %% -params.N, params.M = analysis.df.shape # save data dimensions -params.used_samples = analysis.df.index.to_list() +params.N, params.M = df.shape # save data dimensions +params.used_samples = df.index.to_list() # %% [markdown] # ### Histogram of features per sample # %% -ax = analysis.df.notna().sum(axis=1).hist() +group = 1 +ax = df.notna().sum(axis=1).hist() ax.set_xlabel('features per eligable sample') ax.set_ylabel('observations') -fname = params.out_figures / 'hist_features_per_sample' +fname = params.out_figures / f'0_{group}_hist_features_per_sample' figures[fname.stem] = fname vaep.savefig(ax.get_figure(), fname) # %% -ax = analysis.df.notna().sum(axis=0).sort_values().plot() -_new_labels = [l.get_text().split(';')[0] for l in ax.get_xticklabels()] +ax = df.notna().sum(axis=0).sort_values().plot() +_new_labels = [l_.get_text().split(';')[0] for l_ in ax.get_xticklabels()] _ = ax.set_xticklabels(_new_labels, rotation=45, horizontalalignment='right') ax.set_xlabel('feature prevalence') ax.set_ylabel('observations') -fname = params.out_figures / 'feature_prevalence' +fname = params.out_figures / f'0_{group}_feature_prevalence' figures[fname.stem] = fname vaep.savefig(ax.get_figure(), fname) @@ -431,42 +394,40 @@ def join_as_str(seq): # ### Number off observations accross feature value # %% -min_max = vaep.plotting.data.min_max(analysis.df.stack()) +min_max = vaep.plotting.data.min_max(df.stack()) ax, bins = vaep.plotting.data.plot_histogram_intensities( - analysis.df.stack(), min_max=min_max) + df.stack(), min_max=min_max) -fname = params.out_figures / 'intensity_distribution_overall' +fname = params.out_figures / f'0_{group}_intensity_distribution_overall' figures[fname.stem] = fname vaep.savefig(ax.get_figure(), fname) # %% ax = vaep.plotting.data.plot_feat_median_over_prop_missing( - 
data=analysis.df, type='scatter') -fname = params.out_figures / 'intensity_median_vs_prop_missing_scatter' + data=df, type='scatter') +fname = params.out_figures / f'0_{group}_intensity_median_vs_prop_missing_scatter' figures[fname.stem] = fname vaep.savefig(ax.get_figure(), fname) # %% ax = vaep.plotting.data.plot_feat_median_over_prop_missing( - data=analysis.df, type='boxplot') -fname = params.out_figures / 'intensity_median_vs_prop_missing_boxplot' + data=df, type='boxplot') +fname = params.out_figures / f'0_{group}_intensity_median_vs_prop_missing_boxplot' figures[fname.stem] = fname vaep.savefig(ax.get_figure(), fname) # %% [markdown] # ### Interactive and Single plots -# %% [markdown] -# Scatter plots need to become interactive. - # %% sample_counts.name = 'identified features' # %% K = 2 -analysis.df = analysis.df.astype(float) +df = df.astype(float) +analysis.df = df pcs = analysis.get_PCA(n_components=K) # should be renamed to get_PCs -pcs = pcs.iloc[:, :K].join(analysis.df_meta).join(sample_counts) +pcs = pcs.iloc[:, :K].join(df_meta).join(sample_counts) pcs_name = pcs.columns[:2] pcs_index_name = pcs.index.name @@ -474,15 +435,15 @@ def join_as_str(seq): pcs # %% -pcs.describe(include='all', datetime_is_numeric=True).T +pcs.describe(include='all').T # %% if params.meta_cat_col: - fig, ax = plt.subplots(figsize=(2,2)) + fig, ax = plt.subplots(figsize=(2, 2)) analyzers.seaborn_scatter( pcs[pcs_name], ax, meta=pcs[params.meta_cat_col], title=f"by {params.meta_cat_col}") fname = (params.out_figures - / f'pca_sample_by_{"_".join(params.meta_cat_col.split())}') + / f'0_{group}_pca_sample_by_{"_".join(params.meta_cat_col.split())}') figures[fname.stem] = fname vaep.savefig(fig, fname) @@ -491,12 +452,11 @@ def join_as_str(seq): fig, ax = plt.subplots() analyzers.plot_date_map( df=pcs[pcs_name], ax=ax, dates=pcs[params.meta_date_col], title=f'by {params.meta_date_col}') - fname = params.out_figures / 'pca_sample_by_date' + fname = params.out_figures / 
f'0_{group}_pca_sample_by_date' figures[fname.stem] = fname vaep.savefig(fig, fname) # %% [markdown] -# - software version: Does it make a difference? # - size: number of features in a single sample # %% @@ -510,7 +470,7 @@ def join_as_str(seq): size=5, ) fname = (params.out_figures - / f'pca_sample_by_{"_".join(col_identified_feat.split())}.pdf') + / f'0_{group}_pca_sample_by_{"_".join(col_identified_feat.split())}.pdf') figures[fname.stem] = fname vaep.savefig(fig, fname) @@ -523,11 +483,11 @@ def join_as_str(seq): # color=pcs['Software Version'], color=col_identified_feat, template='none', - width=1200, # 4 inches x 300 dpi - height=600 # 2 inches x 300 dpi + width=1200, # 4 inches x 300 dpi + height=600 # 2 inches x 300 dpi ) fname = (params.out_figures - / f'pca_sample_by_{"_".join(col_identified_feat.split())}_plotly.pdf') + / f'0_{group}_pca_sample_by_{"_".join(col_identified_feat.split())}_plotly.pdf') figures[fname.stem] = fname fig.write_image(fname) fig # stays interactive in html @@ -536,30 +496,35 @@ def join_as_str(seq): # ## Sample Medians and percentiles # %% -analysis.df.head() +df.head() # %% -df = analysis.df -df = df.join(df_meta[params.meta_date_col]) -df = df.set_index(params.meta_date_col).sort_index() +df_w_date = df.join(df_meta[params.meta_date_col]) +df_w_date = df_w_date.set_index(params.meta_date_col).sort_index() if not params.meta_date_col == 'PlaceholderTime': - df.to_period('min') -df = df.T + df_w_date.to_period('min') +df_w_date = df_w_date.T +df_w_date # %% -ax = df.boxplot(rot=80, figsize=(8, 3), fontsize=5, - showfliers=False, showcaps=False) +ax = df_w_date.boxplot(rot=80, + figsize=(8, 3), + fontsize=6, + showfliers=False, + showcaps=False + ) _ = vaep.plotting.select_xticks(ax) fig = ax.get_figure() -fname = params.out_figures / 'median_boxplot' +fname = params.out_figures / f'0_{group}_median_boxplot' figures[fname.stem] = fname vaep.savefig(fig, fname) +del df_w_date # %% [markdown] # Percentiles of intensities in dataset 
# %% -df.stack().describe(percentiles=np.linspace(0.05, 0.95, 10)) +df.stack().describe(percentiles=np.linspace(0.05, 0.95, 19).round(2)) # %% [markdown] # ### Plot sample median over time @@ -569,49 +534,40 @@ def join_as_str(seq): # %% if not params.meta_date_col == 'PlaceholderTime': dates = df_meta[params.meta_date_col].sort_values() - # dates.name = 'date' - median_sample_intensity = (analysis.df + median_sample_intensity = (df .median(axis=1) .to_frame('median intensity')) median_sample_intensity = median_sample_intensity.join(dates) ax = median_sample_intensity.plot.scatter(x=dates.name, y='median intensity', rot=90, - fontsize='large', + # fontsize=6, figsize=(8, 2), s=5, xticks=vaep.plotting.select_dates( median_sample_intensity[dates.name]) ) fig = ax.get_figure() - figures['median_scatter'] = params.out_figures / 'median_scatter' - vaep.savefig(fig, figures['median_scatter']) + fname = params.out_figures / f'0_{group}_median_scatter' + figures[fname.stem] = fname + vaep.savefig(fig, fname) # %% [markdown] # - the closer the labels are there denser the samples are measured around that time. # %% [markdown] # ## Feature frequency in data -# -# - higher count, higher probability to be sampled into training data -# - missing peptides are sampled both into training as well as into validation dataset -# - everything not in training data is validation data -# -# Based on unmodified training data # %% -msg = "Total number of samples in training data split: {}" -print(msg.format(len(analysis.df))) +msg = "Total number of samples in data: {}" +print(msg.format(len(df))) -# %% -# # analysis.splits.to_wide_format() -# assert analysis.splits is splits, "Sanity check failed." 
# %% [markdown] -# Recalculate feature frequency after selecting some samples +# Recalculate feature frequency after selecting samples # %% -freq_per_feature = feature_frequency(analysis.df) +freq_per_feature = feature_frequency(df) freq_per_feature # %% @@ -624,57 +580,123 @@ def join_as_str(seq): dumps[fname.name] = fname freq_per_feature.to_pickle(fname) -# %% [markdown] -# Conserning sampling with frequency weights: -# - larger weight -> higher probablility of being sampled -# - weights need to be alignable to index of original DataFrame before grouping (same index) # %% [markdown] # ## Split: Train, validation and test data # -# - test data is in clinical language often denoted as independent validation cohort -# - validation data (for model) - -# %% -analysis.splits = DataSplits(is_wide_format=False) -splits = analysis.splits -print(f"{analysis.splits = }") -analysis.splits.__annotations__ - -# %% [markdown] -# ### Sample targets (Fake NAs) - -# %% [markdown] -# Add goldstandard targets for valiation and test data -# - based on same day -# - same instrument - -# %% [markdown] -# Create some target values by sampling 5% of the validation and test data. - -# %% -analysis.to_long_format(inplace=True) -analysis.df_long +# Select features as described in +# > Lazar, Cosmin, Laurent Gatto, Myriam Ferro, Christophe Bruley, and Thomas Burger. 2016. +# > “Accounting for the Multiple Natures of Missing Values in Label-Free Quantitative +# > Proteomics Data Sets to Compare Imputation Strategies.” +# > Journal of Proteome Research 15 (4): 1116–25. +# +# - select `frac_mnar` based on threshold matrix on quantile of overall frac of data to be used +# for validation and test data split, e.g. 0.1 = quantile(0.1) +# - select frac_mnar from intensities selected using threshold matrix + +# %% +splits = DataSplits(is_wide_format=False) +print(f"{splits = }") +splits.__annotations__ + + +# %% [markdown] +# Create some target values by sampling X% of the validation and test data. 
+# Simulated missing values are not used for validation and testing. + +# %% +df_long = vaep.io.datasplits.long_format(df) +df_long.head() + +# %% +group = 2 +# ! move parameter checks to start of script +if 0.0 <= params.frac_mnar <= 1.0: + fig, axes = plt.subplots(1, 2, figsize=(8, 2)) + quantile_frac = df_long.quantile(params.frac_non_train) + rng = np.random.default_rng(params.random_state) + threshold = pd.Series(rng.normal(loc=float(quantile_frac), + scale=float(0.3 * df_long.std()), + size=len(df_long), + ), + index=df_long.index, + ) + # plot data vs threshold data + ax = axes[0] + from functools import partial + plot_histogram_intensities = partial(vaep.plotting.data.plot_histogram_intensities, + min_max=min_max, + alpha=0.8) + plot_histogram_intensities( + df_long.squeeze(), + ax=ax, + label='observed') + plot_histogram_intensities( + threshold, + ax=ax, + label='thresholds') + ax.legend() + # select MNAR (intensity between randomly sampled threshold) + mask = df_long.squeeze() < threshold + # ! subsample to have exact fraction of MNAR? 
+ N = len(df_long) + logger.info(f"{int(N * params.frac_non_train) = :,d}") + N_MNAR = int(params.frac_non_train * params.frac_mnar * N) + fake_na_mnar = df_long.loc[mask] + if len(fake_na_mnar) > N_MNAR: + fake_na_mnar = fake_na_mnar.sample(N_MNAR, + random_state=params.random_state) + splits.train_X = df_long.loc[ + df_long.index.difference( + fake_na_mnar.index) + ] + logger.info(f"{len(fake_na_mnar) = :,d}") + N_MCAR = int(N * (1 - params.frac_mnar) * params.frac_non_train) + fake_na_mcar = splits.train_X.sample(N_MCAR, + random_state=params.random_state) + logger.info(f"{len(fake_na_mcar) = :,d}") + splits.train_X = (splits + .train_X + .loc[splits + .train_X + .index + .difference( + fake_na_mcar.index)] + ).squeeze() + logger.info(f"{len(splits.train_X) = :,d}") + fake_na = pd.concat([fake_na_mcar, fake_na_mnar]).squeeze() + logger.info(f"{len(fake_na) = :,d}") + ax = axes[1] + plot_histogram_intensities( + fake_na_mnar.squeeze(), + ax=ax, + label=f'MNAR ({N_MNAR:,d})', + color='C2') + plot_histogram_intensities( + fake_na_mcar.squeeze(), + ax=ax, + color='C3', + label=f'MCAR ({N_MCAR:,d})') + ax.legend() + assert len(fake_na) + len(splits.train_X) == len(df_long) + fname = params.out_figures / f'0_{group}_mnar_mcar_histograms.pdf' + figures[fname.stem] = fname + vaep.savefig(fig, fname) +else: + raise ValueError(f"Invalid MNAR float value (should be betw. 0 and 1): {params.frac_mnar}") -# %% -fake_na, splits.train_X = sample_data(analysis.df_long.squeeze(), - sample_index_to_drop=0, - weights=freq_per_feature, - frac=0.1, - random_state=params.random_state,) -assert len(splits.train_X) > len(fake_na) -splits.val_y = fake_na.sample(frac=0.5, random_state=params.random_state).sort_index() +splits.val_y = fake_na.sample(frac=0.5, random_state=params.random_state) splits.test_y = fake_na.loc[fake_na.index.difference(splits.val_y.index)] -# splits # %% -splits.test_y +splits.test_y.groupby(level=-1).count().describe() # %% splits.val_y # %% -splits.train_X +# ! 
add option to retain at least N samples per feature +splits.train_X.groupby(level=-1).count().describe() # %% # ToDo check that feature indices and sample indicies overlap @@ -684,14 +706,14 @@ def join_as_str(seq): # per feature are allowd. diff = (splits - .val_y - .index - .levels[-1] - .difference(splits - .train_X - .index - .levels[-1] - ).to_list()) + .val_y + .index + .levels[-1] + .difference(splits + .train_X + .index + .levels[-1] + ).to_list()) if diff: to_remove = splits.val_y.loc[pd.IndexSlice[:, diff]] display(to_remove) @@ -701,14 +723,14 @@ def join_as_str(seq): # %% diff = (splits - .test_y - .index - .levels[-1] - .difference(splits - .train_X - .index - .levels[-1] - ).to_list()) + .test_y + .index + .levels[-1] + .difference(splits + .train_X + .index + .levels[-1] + ).to_list()) if diff: to_remove = splits.test_y.loc[pd.IndexSlice[:, diff]] display(to_remove) @@ -716,6 +738,30 @@ def join_as_str(seq): splits.test_y = splits.test_y.drop(to_remove.index) diff +# %% [markdown] +# Some tools require at least 4 observation in the training data, +# which is a good requirment. Due to "MNAR" sampling, most measurments +# of a features can end up in the validation or test data. +# +# In that case: Move the validation measurments back to the training data. +# If after this procedure the condition is still not met, a value error is raised. 
+ +# %% +mask_min_4_measurments = splits.train_X.groupby(level=1).count() < 4 +if mask_min_4_measurments.any(): + idx = mask_min_4_measurments.loc[mask_min_4_measurments].index + logger.warning(f"Features with less than 4 measurments in training data: {idx.to_list()}") + to_remove = splits.val_y.loc[pd.IndexSlice[:, idx]] + print("To remove from validation data: ") + display(to_remove) + splits.train_X = pd.concat([splits.train_X, to_remove]) + splits.val_y = splits.val_y.drop(to_remove.index) + # check condition again + mask_min_4_measurments = splits.train_X.groupby(level=1).count() < 4 + if mask_min_4_measurments.any(): + idx = mask_min_4_measurments.loc[mask_min_4_measurments].index + raise ValueError("Some features still have less than 4 measurments in training data" + f" after removing the features from the validation data: {idx.to_list()}") # %% [markdown] # ### Save in long format @@ -739,12 +785,12 @@ def join_as_str(seq): # ## plot distribution of splits # %% -splits_df = pd.DataFrame(index=analysis.df_long.index) +splits_df = pd.DataFrame(index=df_long.index) splits_df['train'] = splits.train_X splits_df['val'] = splits.val_y splits_df['test'] = splits.test_y stats_splits = splits_df.describe() -stats_splits.to_excel(writer, 'stats_splits', float_format='%.2f') +# stats_splits.to_excel(writer, 'stats_splits', float_format='%.2f') stats_splits # %% @@ -760,6 +806,7 @@ def join_as_str(seq): print(_legend) # %% +group = 3 ax = (splits .train_X .plot @@ -767,7 +814,7 @@ def join_as_str(seq): bins=bins, ax=None, color='C0', -)) + )) _ = (splits .val_y .plot @@ -778,7 +825,7 @@ def join_as_str(seq): legend=True) ) ax.legend(_legend[:-1]) -fname = params.out_figures / 'test_over_train_split.pdf' +fname = params.out_figures / f'0_{group}_test_over_train_split.pdf' figures[fname.name] = fname vaep.savefig(ax.get_figure(), fname) @@ -790,11 +837,11 @@ def join_as_str(seq): legend=False, stacked=True, color=['C0', 'C1', 'C2'], - ) + ) ax.legend(_legend) 
ax.set_xlabel('Intensity bins') ax.yaxis.set_major_formatter("{x:,.0f}") -fname = params.out_figures / 'splits_freq_stacked.pdf' +fname = params.out_figures / f'0_{group}_splits_freq_stacked.pdf' figures[fname.name] = fname vaep.savefig(ax.get_figure(), fname) @@ -804,11 +851,11 @@ def join_as_str(seq): color=['C1', 'C2'], legend=False, stacked=True, - ) + ) ax.legend(_legend[1:]) ax.set_xlabel('Intensity bins') ax.yaxis.set_major_formatter("{x:,.0f}") -fname = params.out_figures / 'val_test_split_freq_stacked_.pdf' +fname = params.out_figures / f'0_{group}_val_test_split_freq_stacked_.pdf' figures[fname.name] = fname vaep.savefig(ax.get_figure(), fname) @@ -821,14 +868,43 @@ def join_as_str(seq): # %% ax = vaep.plotting.data.plot_feat_median_over_prop_missing( data=splits.train_X, type='scatter') -fname = params.out_figures / 'intensity_median_vs_prop_missing_scatter_train' +fname = params.out_figures / f'0_{group}_intensity_median_vs_prop_missing_scatter_train' figures[fname.stem] = fname vaep.savefig(ax.get_figure(), fname) # %% ax = vaep.plotting.data.plot_feat_median_over_prop_missing( data=splits.train_X, type='boxplot') -fname = params.out_figures / 'intensity_median_vs_prop_missing_boxplot_train' +fname = params.out_figures / f'0_{group}_intensity_median_vs_prop_missing_boxplot_train' +figures[fname.stem] = fname +vaep.savefig(ax.get_figure(), fname) + +# %% +medians = (splits + .train_X + .median() + .astype(int) + ).to_frame('median_floor') + +feat_with_median = medians.groupby('median_floor').size().rename('M feat') +medians = medians.join(feat_with_median, on='median_floor') +medians = medians.apply(lambda s: "{:02,d} (N={:3,d})".format(*s), axis=1) + +fig, ax = plt.subplots(figsize=(8, 2)) +s = 1 +s_axes = pd.DataFrame({'medians': medians, + 'validation split': splits.val_y.notna().sum(), + 'training split': splits.train_X.notna().sum()} + ).plot.box(by='medians', + boxprops=dict(linewidth=s), + flierprops=dict(markersize=s), + ax=ax) +for ax in 
s_axes: + _ = ax.set_xticklabels(ax.get_xticklabels(), + rotation=45, + horizontalalignment='right') + +fname = params.out_figures / f'0_{group}_intensity_median_vs_prop_missing_boxplot_val_train' figures[fname.stem] = fname vaep.savefig(ax.get_figure(), fname) diff --git a/project/01_0_transform_data_to_wide_format.ipynb b/project/01_0_transform_data_to_wide_format.ipynb index f8ccd842c..4d5266a1d 100644 --- a/project/01_0_transform_data_to_wide_format.ipynb +++ b/project/01_0_transform_data_to_wide_format.ipynb @@ -145,7 +145,7 @@ "source": [ "fname = params.data / 'sample_annotation_placeholder.csv'\n", "annotation.to_csv(fname)\n", - "fname " + "fname" ] }, { @@ -164,14 +164,15 @@ "cell_type": "code", "execution_count": null, "id": "ce749fdb", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [], "source": [ "fname = params.data / 'data_wide_sample_cols.csv'\n", - "# fillna('Filtered') \n", + "# fillna('Filtered')\n", "train_data.T.to_csv(fname)\n", - "fname\n", - "\n" + "fname" ] }, { diff --git a/project/01_0_transform_data_to_wide_format.py b/project/01_0_transform_data_to_wide_format.py index 7fe6f7293..b23bf8154 100644 --- a/project/01_0_transform_data_to_wide_format.py +++ b/project/01_0_transform_data_to_wide_format.py @@ -6,7 +6,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -70,18 +70,17 @@ # %% fname = params.data / 'sample_annotation_placeholder.csv' annotation.to_csv(fname) -fname +fname # %% [markdo] # Save with samples in columns # %% fname = params.data / 'data_wide_sample_cols.csv' -# fillna('Filtered') +# fillna('Filtered') train_data.T.to_csv(fname) fname - # %% # 'data_wide_sample_cols.csv' diff --git a/project/01_1_train_CF.ipynb b/project/01_1_train_CF.ipynb index f67d3e710..d0fba2b54 100644 --- a/project/01_1_train_CF.ipynb +++ b/project/01_1_train_CF.ipynb @@ -15,7 +15,10 @@ 
"metadata": {}, "outputs": [], "source": [ + "\n", + "\n", "import logging\n", + "\n", "from pprint import pprint\n", "\n", "from fastai.basics import *\n", @@ -26,22 +29,23 @@ "from fastai.tabular.all import *\n", "from fastai.collab import *\n", "\n", - "# overwriting Recorder callback with custom plot_loss\n", - "from vaep.models import plot_loss, RecorderDump\n", - "from fastai import learner\n", - "learner.Recorder.plot_loss = plot_loss\n", - "# import fastai.callback.hook # Learner.summary\n", - "\n", - "\n", "import vaep\n", "import vaep.model\n", "import vaep.models as models\n", - "from vaep.io import datasplits\n", - "from vaep import sampling\n", - "\n", + "from vaep.models import plot_loss, RecorderDump\n", "\n", "import vaep.nb\n", + "from vaep import sampling\n", + "from vaep.io import datasplits\n", + "\n", "from vaep.logging import setup_logger\n", + "\n", + "# overwriting Recorder callback with custom plot_loss\n", + "from fastai import learner\n", + "learner.Recorder.plot_loss = plot_loss\n", + "# import fastai.callback.hook # Learner.summary\n", + "\n", + "\n", "logger = setup_logger(logger=logging.getLogger('vaep'))\n", "logger.info(\n", " \"Experiment 03 - Analysis of latent spaces and performance comparisions\")\n", @@ -96,7 +100,7 @@ "# model\n", "# Dimensionality of encoding dimension (latent space of model)\n", "latent_dim: int = 10\n", - "# hidden_layers:str = '128_64' # A space separated string of layers, '50 20' for the encoder, reverse will be use for decoder\n", + "# hidden_layers:str = '128_64' # Underscore separated string of layers, '128 64' for the encoder, reversed for decoder\n", "sample_idx_position: int = 0 # position of index which is sample ID\n", "model: str = 'CF' # model name\n", "model_key: str = 'CF' # potentially alternative key for model (grid search)\n", @@ -201,6 +205,17 @@ "data.train_X.sample(5)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3311709", + "metadata": {}, + "outputs": [], + 
"source": [ + "! add check that specified data is available\n", + "# silent error in fastai if e.g. target column is not available" + ] + }, { "cell_type": "markdown", "id": "6045414b", @@ -344,7 +359,7 @@ " target_column='intensity',\n", " model_kwargs=dict(n_factors=args.latent_dim,\n", " y_range=(int(data.train_X.min()),\n", - " int(data.train_X.max())+1)\n", + " int(data.train_X.max()) + 1)\n", " ),\n", " batch_size=args.batch_size)" ] @@ -586,7 +601,9 @@ "source": [ "### Test Datasplit\n", "\n", - "Fake NAs : Artificially created NAs. Some data was sampled and set explicitly to misssing before it was fed to the model for reconstruction." + "Fake NAs : Artificially created NAs. Some data was sampled and set\n", + "explicitly to misssing before it was fed to the model for\n", + "reconstruction." ] }, { diff --git a/project/01_1_train_CF.py b/project/01_1_train_CF.py index 642bcb02a..de18d7276 100644 --- a/project/01_1_train_CF.py +++ b/project/01_1_train_CF.py @@ -17,7 +17,10 @@ # # Collaborative Filtering # %% + + import logging + from pprint import pprint from fastai.basics import * @@ -28,22 +31,23 @@ from fastai.tabular.all import * from fastai.collab import * -# overwriting Recorder callback with custom plot_loss -from vaep.models import plot_loss, RecorderDump -from fastai import learner -learner.Recorder.plot_loss = plot_loss -# import fastai.callback.hook # Learner.summary - - import vaep import vaep.model import vaep.models as models -from vaep.io import datasplits -from vaep import sampling - +from vaep.models import plot_loss, RecorderDump import vaep.nb +from vaep import sampling +from vaep.io import datasplits + from vaep.logging import setup_logger + +# overwriting Recorder callback with custom plot_loss +from fastai import learner +learner.Recorder.plot_loss = plot_loss +# import fastai.callback.hook # Learner.summary + + logger = setup_logger(logger=logging.getLogger('vaep')) logger.info( "Experiment 03 - Analysis of latent spaces and performance 
comparisions") @@ -75,7 +79,7 @@ # model # Dimensionality of encoding dimension (latent space of model) latent_dim: int = 10 -# hidden_layers:str = '128_64' # A space separated string of layers, '50 20' for the encoder, reverse will be use for decoder +# hidden_layers:str = '128_64' # Underscore separated string of layers, '128 64' for the encoder, reversed for decoder sample_idx_position: int = 0 # position of index which is sample ID model: str = 'CF' # model name model_key: str = 'CF' # potentially alternative key for model (grid search) @@ -122,6 +126,10 @@ # %% data.train_X.sample(5) +# %% +# ! add check that specified data is available +# silent error in fastai if e.g. target column is not available + # %% [markdown] # Infer index names from long format @@ -192,7 +200,7 @@ target_column='intensity', model_kwargs=dict(n_factors=args.latent_dim, y_range=(int(data.train_X.min()), - int(data.train_X.max())+1) + int(data.train_X.max()) + 1) ), batch_size=args.batch_size) @@ -317,7 +325,9 @@ # %% [markdown] # ### Test Datasplit # -# Fake NAs : Artificially created NAs. Some data was sampled and set explicitly to misssing before it was fed to the model for reconstruction. +# Fake NAs : Artificially created NAs. Some data was sampled and set +# explicitly to misssing before it was fed to the model for +# reconstruction. 
# %% added_metrics = d_metrics.add_metrics(test_pred_fake_na, 'test_fake_na') diff --git a/project/01_1_train_DAE.ipynb b/project/01_1_train_DAE.ipynb index 01f3397db..f8991e229 100644 --- a/project/01_1_train_DAE.ipynb +++ b/project/01_1_train_DAE.ipynb @@ -24,30 +24,31 @@ "from fastai.callback.all import *\n", "from fastai.torch_basics import *\n", "\n", + "import sklearn\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.impute import SimpleImputer\n", + "\n", + "import vaep\n", + "from vaep import sampling\n", + "from vaep.io import datasplits\n", + "from vaep.models import ae\n", + "import vaep.models as models\n", + "import vaep.model\n", + "from vaep.analyzers import analyzers\n", + "\n", "# overwriting Recorder callback with custom plot_loss\n", "from vaep.models import plot_loss\n", "from fastai import learner\n", + "\n", "learner.Recorder.plot_loss = plot_loss\n", "# import fastai.callback.hook # Learner.summary\n", "\n", - "import sklearn\n", - "from sklearn.impute import SimpleImputer\n", - "from sklearn.preprocessing import StandardScaler\n", - "from sklearn.preprocessing import MinMaxScaler\n", "\n", - "import vaep\n", - "from vaep.analyzers import analyzers\n", - "import vaep.model\n", - "import vaep.models as models\n", - "from vaep.models import ae\n", "# from vaep.models import collab as vaep_collab\n", "# from vaep.io.datasets import DatasetWithTarget\n", "# from vaep.transform import VaepPipeline\n", - "from vaep.io import datasplits\n", "# from vaep.io.dataloaders import get_dls, get_test_dl\n", - "from vaep import sampling\n", "\n", - "import vaep.nb as config\n", "logger = vaep.logging.setup_logger(logging.getLogger('vaep'))\n", "logger.info(\n", " \"Experiment 03 - Analysis of latent spaces and performance comparisions\")\n", @@ -470,7 +471,9 @@ "id": "35704935-c739-48f5-9912-1c1ab1e6c4d3", "metadata": {}, "source": [ - "Adding a `EarlyStoppingCallback` results in an error. 
Potential fix in [PR3509](https://github.com/fastai/fastai/pull/3509) is not yet in current version. Try again later" + "Adding a `EarlyStoppingCallback` results in an error. Potential fix in\n", + "[PR3509](https://github.com/fastai/fastai/pull/3509) is not yet in\n", + "current version. Try again later" ] }, { @@ -776,7 +779,9 @@ "source": [ "### Test Datasplit\n", "\n", - "Fake NAs : Artificially created NAs. Some data was sampled and set explicitly to misssing before it was fed to the model for reconstruction." + "Fake NAs : Artificially created NAs. Some data was sampled and set\n", + "explicitly to misssing before it was fed to the model for\n", + "reconstruction." ] }, { diff --git a/project/01_1_train_DAE.py b/project/01_1_train_DAE.py index f336a1d30..afe60065f 100644 --- a/project/01_1_train_DAE.py +++ b/project/01_1_train_DAE.py @@ -6,7 +6,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -24,30 +24,31 @@ from fastai.callback.all import * from fastai.torch_basics import * +import sklearn +from sklearn.preprocessing import StandardScaler +from sklearn.impute import SimpleImputer + +import vaep +from vaep import sampling +from vaep.io import datasplits +from vaep.models import ae +import vaep.models as models +import vaep.model +from vaep.analyzers import analyzers + # overwriting Recorder callback with custom plot_loss from vaep.models import plot_loss from fastai import learner + learner.Recorder.plot_loss = plot_loss # import fastai.callback.hook # Learner.summary -import sklearn -from sklearn.impute import SimpleImputer -from sklearn.preprocessing import StandardScaler -from sklearn.preprocessing import MinMaxScaler -import vaep -from vaep.analyzers import analyzers -import vaep.model -import vaep.models as models -from vaep.models import ae # from vaep.models import collab as vaep_collab # from vaep.io.datasets import 
DatasetWithTarget # from vaep.transform import VaepPipeline -from vaep.io import datasplits # from vaep.io.dataloaders import get_dls, get_test_dl -from vaep import sampling -import vaep.nb as config logger = vaep.logging.setup_logger(logging.getLogger('vaep')) logger.info( "Experiment 03 - Analysis of latent spaces and performance comparisions") @@ -257,7 +258,9 @@ analysis.learn.show_training_loop() # %% [markdown] -# Adding a `EarlyStoppingCallback` results in an error. Potential fix in [PR3509](https://github.com/fastai/fastai/pull/3509) is not yet in current version. Try again later +# Adding a `EarlyStoppingCallback` results in an error. Potential fix in +# [PR3509](https://github.com/fastai/fastai/pull/3509) is not yet in +# current version. Try again later # %% # learn.summary() @@ -393,7 +396,9 @@ # %% [markdown] # ### Test Datasplit # -# Fake NAs : Artificially created NAs. Some data was sampled and set explicitly to misssing before it was fed to the model for reconstruction. +# Fake NAs : Artificially created NAs. Some data was sampled and set +# explicitly to misssing before it was fed to the model for +# reconstruction. 
# %% added_metrics = d_metrics.add_metrics(test_pred_fake_na, 'test_fake_na') diff --git a/project/01_1_train_KNN.ipynb b/project/01_1_train_KNN.ipynb index 895914e80..5ff0cc428 100644 --- a/project/01_1_train_KNN.ipynb +++ b/project/01_1_train_KNN.ipynb @@ -66,25 +66,26 @@ "outputs": [], "source": [ "# files and folders\n", - "folder_experiment:str = 'runs/example' # Datasplit folder with data for experiment\n", - "folder_data:str = '' # specify data directory if needed\n", - "file_format: str = 'csv' # file format of create splits, default pickle (pkl)\n", - "fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv' # Machine parsed metadata from rawfile workflow\n", + "folder_experiment: str = 'runs/example' # Datasplit folder with data for experiment\n", + "folder_data: str = '' # specify data directory if needed\n", + "file_format: str = 'csv' # file format of create splits, default pickle (pkl)\n", + "# Machine parsed metadata from rawfile workflow\n", + "fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv'\n", "# training\n", - "epochs_max:int = 50 # Maximum number of epochs\n", + "epochs_max: int = 50 # Maximum number of epochs\n", "# early_stopping:bool = True # Wheather to use early stopping or not\n", - "batch_size:int = 64 # Batch size for training (and evaluation)\n", - "cuda:bool = True # Whether to use a GPU for training\n", + "batch_size: int = 64 # Batch size for training (and evaluation)\n", + "cuda: bool = True # Whether to use a GPU for training\n", "# model\n", - "neighbors:int = 3 # number of neigherst neighbors to use\n", - "force_train:bool = True # Force training when saved model could be used. 
Per default re-train model\n", - "sample_idx_position: int = 0 # position of index which is sample ID\n", - "model: str = 'KNN' # model name\n", - "model_key: str = 'KNN' # potentially alternative key for model (grid search)\n", - "save_pred_real_na: bool = True # Save all predictions for missing values\n", + "neighbors: int = 3 # number of neigherst neighbors to use\n", + "force_train: bool = True # Force training when saved model could be used. Per default re-train model\n", + "sample_idx_position: int = 0 # position of index which is sample ID\n", + "model: str = 'KNN' # model name\n", + "model_key: str = 'KNN' # potentially alternative key for model (grid search)\n", + "save_pred_real_na: bool = True # Save all predictions for missing values\n", "# metadata -> defaults for metadata extracted from machine data\n", - "meta_date_col: str = None # date column in meta data\n", - "meta_cat_col: str = None # category column in meta data" + "meta_date_col: str = None # date column in meta data\n", + "meta_cat_col: str = None # category column in meta data" ] }, { @@ -142,7 +143,7 @@ "metadata": {}, "outputs": [], "source": [ - "data = datasplits.DataSplits.from_folder(args.data, file_format=args.file_format) " + "data = datasplits.DataSplits.from_folder(args.data, file_format=args.file_format)" ] }, { @@ -201,7 +202,7 @@ "outputs": [], "source": [ "freq_feat = sampling.frequency_by_index(data.train_X, 0)\n", - "freq_feat.head() # training data" + "freq_feat.head() # training data" ] }, { @@ -217,7 +218,7 @@ "id": "382b887d", "metadata": {}, "source": [ - "The validation fake NA is used to by all models to evaluate training performance. " + "The validation fake NA is used to by all models to evaluate training performance." 
] }, { @@ -316,7 +317,7 @@ "metadata": {}, "outputs": [], "source": [ - "val_pred_fake_na[args.model] = pred\n", + "val_pred_fake_na[args.model_key] = pred\n", "val_pred_fake_na" ] }, @@ -327,7 +328,7 @@ "metadata": {}, "outputs": [], "source": [ - "test_pred_fake_na[args.model] = pred\n", + "test_pred_fake_na[args.model_key] = pred\n", "test_pred_fake_na" ] }, @@ -382,8 +383,8 @@ "source": [ "## Comparisons\n", "\n", - "> Note: The interpolated values have less predictions for comparisons than the ones based on models (CF, DAE, VAE) \n", - "> The comparison is therefore not 100% fair as the interpolated samples will have more common ones (especailly the sparser the data) \n", + "> Note: The interpolated values have less predictions for comparisons than the ones based on models (CF, DAE, VAE)\n", + "> The comparison is therefore not 100% fair as the interpolated samples will have more common ones (especailly the sparser the data)\n", "> Could be changed." ] }, @@ -396,7 +397,7 @@ "\n", "- all measured (identified, observed) peptides in validation data\n", "\n", - "> Does not make to much sense to compare collab and AEs, \n", + "> Does not make to much sense to compare collab and AEs,\n", "> as the setup differs of training and validation data differs" ] }, @@ -437,7 +438,9 @@ "source": [ "### Test Datasplit\n", "\n", - "Fake NAs : Artificially created NAs. Some data was sampled and set explicitly to misssing before it was fed to the model for reconstruction." + "Fake NAs : Artificially created NAs. Some data was sampled and set\n", + "explicitly to misssing before it was fed to the model for\n", + "reconstruction." ] }, { @@ -519,7 +522,7 @@ "metadata": {}, "outputs": [], "source": [ - "figures # switch to fnames?" + "figures # switch to fnames?" 
] }, { @@ -529,8 +532,8 @@ "metadata": {}, "outputs": [], "source": [ - "args.n_params = 1 # the number of neighbors to consider\n", - "args.dump(fname=args.out_models/ f\"model_config_{args.model_key}.yaml\")\n", + "args.n_params = 1 # the number of neighbors to consider\n", + "args.dump(fname=args.out_models / f\"model_config_{args.model_key}.yaml\")\n", "args" ] } diff --git a/project/01_1_train_KNN.py b/project/01_1_train_KNN.py index 5e8528b5c..ddc390066 100644 --- a/project/01_1_train_KNN.py +++ b/project/01_1_train_KNN.py @@ -6,7 +6,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -44,25 +44,26 @@ # %% tags=["parameters"] # files and folders -folder_experiment:str = 'runs/example' # Datasplit folder with data for experiment -folder_data:str = '' # specify data directory if needed -file_format: str = 'csv' # file format of create splits, default pickle (pkl) -fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv' # Machine parsed metadata from rawfile workflow +folder_experiment: str = 'runs/example' # Datasplit folder with data for experiment +folder_data: str = '' # specify data directory if needed +file_format: str = 'csv' # file format of create splits, default pickle (pkl) +# Machine parsed metadata from rawfile workflow +fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv' # training -epochs_max:int = 50 # Maximum number of epochs +epochs_max: int = 50 # Maximum number of epochs # early_stopping:bool = True # Wheather to use early stopping or not -batch_size:int = 64 # Batch size for training (and evaluation) -cuda:bool = True # Whether to use a GPU for training +batch_size: int = 64 # Batch size for training (and evaluation) +cuda: bool = True # Whether to use a GPU for training # model -neighbors:int = 3 # number of neigherst neighbors to use 
-force_train:bool = True # Force training when saved model could be used. Per default re-train model -sample_idx_position: int = 0 # position of index which is sample ID -model: str = 'KNN' # model name -model_key: str = 'KNN' # potentially alternative key for model (grid search) -save_pred_real_na: bool = True # Save all predictions for missing values +neighbors: int = 3 # number of neigherst neighbors to use +force_train: bool = True # Force training when saved model could be used. Per default re-train model +sample_idx_position: int = 0 # position of index which is sample ID +model: str = 'KNN' # model name +model_key: str = 'KNN' # potentially alternative key for model (grid search) +save_pred_real_na: bool = True # Save all predictions for missing values # metadata -> defaults for metadata extracted from machine data -meta_date_col: str = None # date column in meta data -meta_cat_col: str = None # category column in meta data +meta_date_col: str = None # date column in meta data +meta_cat_col: str = None # category column in meta data # %% [markdown] # Some argument transformations @@ -83,7 +84,7 @@ # ## Load data in long format # %% -data = datasplits.DataSplits.from_folder(args.data, file_format=args.file_format) +data = datasplits.DataSplits.from_folder(args.data, file_format=args.file_format) # %% [markdown] # data is loaded in long format @@ -106,13 +107,13 @@ # %% freq_feat = sampling.frequency_by_index(data.train_X, 0) -freq_feat.head() # training data +freq_feat.head() # training data # %% [markdown] # ### Simulated missing values # %% [markdown] -# The validation fake NA is used to by all models to evaluate training performance. +# The validation fake NA is used to by all models to evaluate training performance. 
# %% val_pred_fake_na = data.val_y.to_frame(name='observed') @@ -152,11 +153,11 @@ pred # %% -val_pred_fake_na[args.model] = pred +val_pred_fake_na[args.model_key] = pred val_pred_fake_na # %% -test_pred_fake_na[args.model] = pred +test_pred_fake_na[args.model_key] = pred test_pred_fake_na # %% [markdown] @@ -182,8 +183,8 @@ # %% [markdown] # ## Comparisons # -# > Note: The interpolated values have less predictions for comparisons than the ones based on models (CF, DAE, VAE) -# > The comparison is therefore not 100% fair as the interpolated samples will have more common ones (especailly the sparser the data) +# > Note: The interpolated values have less predictions for comparisons than the ones based on models (CF, DAE, VAE) +# > The comparison is therefore not 100% fair as the interpolated samples will have more common ones (especailly the sparser the data) # > Could be changed. # %% [markdown] @@ -191,7 +192,7 @@ # # - all measured (identified, observed) peptides in validation data # -# > Does not make to much sense to compare collab and AEs, +# > Does not make to much sense to compare collab and AEs, # > as the setup differs of training and validation data differs # %% @@ -208,7 +209,9 @@ # %% [markdown] # ### Test Datasplit # -# Fake NAs : Artificially created NAs. Some data was sampled and set explicitly to misssing before it was fed to the model for reconstruction. +# Fake NAs : Artificially created NAs. Some data was sampled and set +# explicitly to misssing before it was fed to the model for +# reconstruction. # %% added_metrics = d_metrics.add_metrics(test_pred_fake_na, 'test_fake_na') @@ -237,9 +240,9 @@ # ## Config # %% -figures # switch to fnames? +figures # switch to fnames? 
# %% -args.n_params = 1 # the number of neighbors to consider -args.dump(fname=args.out_models/ f"model_config_{args.model_key}.yaml") +args.n_params = 1 # the number of neighbors to consider +args.dump(fname=args.out_models / f"model_config_{args.model_key}.yaml") args diff --git a/project/01_1_train_Median.ipynb b/project/01_1_train_Median.ipynb index ab7e1965c..3c2933e5d 100644 --- a/project/01_1_train_Median.ipynb +++ b/project/01_1_train_Median.ipynb @@ -66,17 +66,17 @@ "outputs": [], "source": [ "# files and folders\n", - "folder_experiment:str = 'runs/example' # Datasplit folder with data for experiment\n", - "file_format: str = 'csv' # file format of create splits, default pickle (pkl)\n", - "fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv' # Machine parsed metadata from rawfile workflow\n", + "folder_experiment: str = 'runs/example' # Datasplit folder with data for experiment\n", + "file_format: str = 'csv' # file format of create splits, default pickle (pkl)\n", + "fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv' # Metadata for samples\n", "# model\n", - "sample_idx_position: int = 0 # position of index which is sample ID\n", - "model_key: str = 'Median' # model key (lower cased version will be used for file names)\n", - "model: str = 'Median' # model name\n", - "save_pred_real_na: bool = True # Save all predictions for real na\n", + "sample_idx_position: int = 0 # position of index which is sample ID\n", + "model_key: str = 'Median' # model key (lower cased version will be used for file names)\n", + "model: str = 'Median' # model name\n", + "save_pred_real_na: bool = True # Save all predictions for real na\n", "# metadata -> defaults for metadata extracted from machine data\n", - "meta_date_col: str = None # date column in meta data\n", - "meta_cat_col: str = None # category column in meta data" + "meta_date_col: str = None # date column in meta data\n", + "meta_cat_col: str 
= None # category column in meta data" ] }, { @@ -146,7 +146,7 @@ "metadata": {}, "outputs": [], "source": [ - "data = datasplits.DataSplits.from_folder(args.data, file_format=args.file_format) " + "data = datasplits.DataSplits.from_folder(args.data, file_format=args.file_format)" ] }, { @@ -172,7 +172,7 @@ "id": "fa7dcd09", "metadata": {}, "source": [ - "Infer index names from long format " + "Infer index names from long format" ] }, { @@ -184,7 +184,7 @@ "source": [ "index_columns = list(data.train_X.index.names)\n", "sample_id = index_columns.pop(args.sample_idx_position)\n", - "if len(index_columns) == 1: \n", + "if len(index_columns) == 1:\n", " index_column = index_columns.pop()\n", " index_columns = None\n", " logger.info(f\"{sample_id = }, single feature: {index_column = }\")\n", @@ -241,7 +241,7 @@ "outputs": [], "source": [ "freq_feat = vaep.io.datasplits.load_freq(args.data)\n", - "freq_feat.head() # training data" + "freq_feat.head() # training data" ] }, { @@ -257,7 +257,7 @@ "id": "23ac9141", "metadata": {}, "source": [ - "The validation fake NA is used to by all models to evaluate training performance. " + "The validation fake NA is used to by all models to evaluate training performance." 
] }, { @@ -275,12 +275,13 @@ "cell_type": "code", "execution_count": null, "id": "68ea1649", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [], "source": [ "test_pred_fake_na = data.test_y.to_frame(name='observed')\n", - "test_pred_fake_na.describe()\n", - "\n" + "test_pred_fake_na.describe()" ] }, { @@ -322,7 +323,7 @@ "metadata": {}, "outputs": [], "source": [ - "# interpolated = vaep.pandas.interpolate(wide_df = data.train_X) \n", + "# interpolated = vaep.pandas.interpolate(wide_df = data.train_X)\n", "# val_pred_fake_na['interpolated'] = interpolated\n", "# test_pred_fake_na['interpolated'] = interpolated\n", "# del interpolated\n", @@ -414,7 +415,7 @@ "metadata": {}, "outputs": [], "source": [ - "feat_freq_val.value_counts().sort_index().head() # require more than one feat?" + "feat_freq_val.value_counts().sort_index().head() # require more than one feat?" ] }, { @@ -429,9 +430,11 @@ "errors_val = errors_val.join(freq_feat).sort_values(by='freq', ascending=True)\n", "\n", "\n", - "errors_val_smoothed = errors_val.copy() #.loc[feat_freq_val > 1]\n", - "errors_val_smoothed[errors_val.columns[:-1]] = errors_val[errors_val.columns[:-1]].rolling(window=200, min_periods=1).mean()\n", - "ax = errors_val_smoothed.plot(x='freq', figsize=(15,10) )\n", + "errors_val_smoothed = errors_val.copy() # .loc[feat_freq_val > 1]\n", + "errors_val_smoothed[errors_val.columns[:-\n", + " 1]] = errors_val[errors_val.columns[:-\n", + " 1]].rolling(window=200, min_periods=1).mean()\n", + "ax = errors_val_smoothed.plot(x='freq', figsize=(15, 10))\n", "# errors_val_smoothed" ] }, @@ -463,8 +466,8 @@ "source": [ "## Comparisons\n", "\n", - "> Note: The interpolated values have less predictions for comparisons than the ones based on models (CF, DAE, VAE) \n", - "> The comparison is therefore not 100% fair as the interpolated samples will have more common ones (especailly the sparser the data) \n", + "> Note: The interpolated values have less predictions for 
comparisons than the ones based on models (CF, DAE, VAE)\n", + "> The comparison is therefore not 100% fair as the interpolated samples will have more common ones (especailly the sparser the data)\n", "> Could be changed." ] }, @@ -477,7 +480,7 @@ "\n", "- all measured (identified, observed) peptides in validation data\n", "\n", - "> Does not make too much sense to compare collab and AEs, \n", + "> Does not make too much sense to compare collab and AEs,\n", "> as the setup differs of training and validation data differs" ] }, @@ -518,7 +521,9 @@ "source": [ "### Test Datasplit\n", "\n", - "Fake NAs : Artificially created NAs. Some data was sampled and set explicitly to misssing before it was fed to the model for reconstruction." + "Fake NAs : Artificially created NAs. Some data was sampled and set\n", + "explicitly to misssing before it was fed to the model for\n", + "reconstruction." ] }, { @@ -597,7 +602,7 @@ "source": [ "# val\n", "fname = args.out_preds / f\"pred_val_{args.model_key}.csv\"\n", - "setattr(args, fname.stem, fname.as_posix()) # add [] assignment?\n", + "setattr(args, fname.stem, fname.as_posix()) # add [] assignment?\n", "val_pred_fake_na.to_csv(fname)\n", "# test\n", "fname = args.out_preds / f\"pred_test_{args.model_key}.csv\"\n", @@ -620,7 +625,7 @@ "metadata": {}, "outputs": [], "source": [ - "figures # switch to fnames?" + "figures # switch to fnames?" 
] }, { @@ -630,7 +635,7 @@ "metadata": {}, "outputs": [], "source": [ - "args.dump(fname=args.out_models/ f\"model_config_{args.model_key}.yaml\")\n", + "args.dump(fname=args.out_models / f\"model_config_{args.model_key}.yaml\")\n", "args" ] } diff --git a/project/01_1_train_Median.py b/project/01_1_train_Median.py index 33d169b61..72a7cf562 100644 --- a/project/01_1_train_Median.py +++ b/project/01_1_train_Median.py @@ -6,7 +6,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -43,17 +43,17 @@ # %% tags=["parameters"] # files and folders -folder_experiment:str = 'runs/example' # Datasplit folder with data for experiment -file_format: str = 'csv' # file format of create splits, default pickle (pkl) -fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv' # Machine parsed metadata from rawfile workflow +folder_experiment: str = 'runs/example' # Datasplit folder with data for experiment +file_format: str = 'csv' # file format of create splits, default pickle (pkl) +fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv' # Metadata for samples # model -sample_idx_position: int = 0 # position of index which is sample ID -model_key: str = 'Median' # model key (lower cased version will be used for file names) -model: str = 'Median' # model name -save_pred_real_na: bool = True # Save all predictions for real na +sample_idx_position: int = 0 # position of index which is sample ID +model_key: str = 'Median' # model key (lower cased version will be used for file names) +model: str = 'Median' # model name +save_pred_real_na: bool = True # Save all predictions for real na # metadata -> defaults for metadata extracted from machine data -meta_date_col: str = None # date column in meta data -meta_cat_col: str = None # category column in meta data +meta_date_col: str = None # date 
column in meta data +meta_cat_col: str = None # category column in meta data # %% [markdown] @@ -79,7 +79,7 @@ # ## Load data in long format # %% -data = datasplits.DataSplits.from_folder(args.data, file_format=args.file_format) +data = datasplits.DataSplits.from_folder(args.data, file_format=args.file_format) # %% [markdown] # data is loaded in long format @@ -88,12 +88,12 @@ data.train_X.sample(5) # %% [markdown] -# Infer index names from long format +# Infer index names from long format # %% index_columns = list(data.train_X.index.names) sample_id = index_columns.pop(args.sample_idx_position) -if len(index_columns) == 1: +if len(index_columns) == 1: index_column = index_columns.pop() index_columns = None logger.info(f"{sample_id = }, single feature: {index_column = }") @@ -126,13 +126,13 @@ # %% freq_feat = vaep.io.datasplits.load_freq(args.data) -freq_feat.head() # training data +freq_feat.head() # training data # %% [markdown] # ### Produce some addional fake samples # %% [markdown] -# The validation fake NA is used to by all models to evaluate training performance. +# The validation fake NA is used to by all models to evaluate training performance. # %% val_pred_fake_na = data.val_y.to_frame(name='observed') @@ -143,7 +143,6 @@ test_pred_fake_na.describe() - # %% [markdown] # ## Data in wide format # @@ -159,7 +158,7 @@ # ### Add interpolation performance # %% -# interpolated = vaep.pandas.interpolate(wide_df = data.train_X) +# interpolated = vaep.pandas.interpolate(wide_df = data.train_X) # val_pred_fake_na['interpolated'] = interpolated # test_pred_fake_na['interpolated'] = interpolated # del interpolated @@ -207,7 +206,7 @@ # freq_feat.to_frame('overall').join(feat_freq_val).plot.scatter(x='overall', y='freq_val') # %% -feat_freq_val.value_counts().sort_index().head() # require more than one feat? +feat_freq_val.value_counts().sort_index().head() # require more than one feat? 
# %% errors_val = val_pred_fake_na.drop('observed', axis=1).sub(val_pred_fake_na['observed'], axis=0) @@ -215,9 +214,11 @@ errors_val = errors_val.join(freq_feat).sort_values(by='freq', ascending=True) -errors_val_smoothed = errors_val.copy() #.loc[feat_freq_val > 1] -errors_val_smoothed[errors_val.columns[:-1]] = errors_val[errors_val.columns[:-1]].rolling(window=200, min_periods=1).mean() -ax = errors_val_smoothed.plot(x='freq', figsize=(15,10) ) +errors_val_smoothed = errors_val.copy() # .loc[feat_freq_val > 1] +errors_val_smoothed[errors_val.columns[:- + 1]] = errors_val[errors_val.columns[:- + 1]].rolling(window=200, min_periods=1).mean() +ax = errors_val_smoothed.plot(x='freq', figsize=(15, 10)) # errors_val_smoothed # %% @@ -230,8 +231,8 @@ # %% [markdown] # ## Comparisons # -# > Note: The interpolated values have less predictions for comparisons than the ones based on models (CF, DAE, VAE) -# > The comparison is therefore not 100% fair as the interpolated samples will have more common ones (especailly the sparser the data) +# > Note: The interpolated values have less predictions for comparisons than the ones based on models (CF, DAE, VAE) +# > The comparison is therefore not 100% fair as the interpolated samples will have more common ones (especailly the sparser the data) # > Could be changed. # %% [markdown] @@ -239,7 +240,7 @@ # # - all measured (identified, observed) peptides in validation data # -# > Does not make too much sense to compare collab and AEs, +# > Does not make too much sense to compare collab and AEs, # > as the setup differs of training and validation data differs # %% @@ -256,7 +257,9 @@ # %% [markdown] # ### Test Datasplit # -# Fake NAs : Artificially created NAs. Some data was sampled and set explicitly to misssing before it was fed to the model for reconstruction. +# Fake NAs : Artificially created NAs. Some data was sampled and set +# explicitly to misssing before it was fed to the model for +# reconstruction. 
# %% added_metrics = d_metrics.add_metrics(test_pred_fake_na, 'test_fake_na') @@ -285,7 +288,7 @@ # %% # val fname = args.out_preds / f"pred_val_{args.model_key}.csv" -setattr(args, fname.stem, fname.as_posix()) # add [] assignment? +setattr(args, fname.stem, fname.as_posix()) # add [] assignment? val_pred_fake_na.to_csv(fname) # test fname = args.out_preds / f"pred_test_{args.model_key}.csv" @@ -296,8 +299,8 @@ # ## Config # %% -figures # switch to fnames? +figures # switch to fnames? # %% -args.dump(fname=args.out_models/ f"model_config_{args.model_key}.yaml") +args.dump(fname=args.out_models / f"model_config_{args.model_key}.yaml") args diff --git a/project/01_1_train_NAGuideR_methods.R b/project/01_1_train_NAGuideR_methods.R index 0ef65fdc2..ce2f91af9 100644 --- a/project/01_1_train_NAGuideR_methods.R +++ b/project/01_1_train_NAGuideR_methods.R @@ -20,30 +20,32 @@ # - BiocManager could be moved to methods who are installed from BioConductor # + vscode={"languageId": "r"} -packages_base_R <- c("BiocManager", "reshape2", "data.table", "readr", "tibble") +packages_base_R <- + c("BiocManager", "reshape2", "data.table", "readr", "tibble") -install_rpackage <- function(pkg){ - # If not installed, install the package - if (!require(pkg, character.only = TRUE)) { - install.packages(pkg) - library(pkg, character.only = TRUE) - } - +install_rpackage <- function(pkg) { + # If not installed, install the package + if (!require(pkg, character.only = TRUE)) { + install.packages(pkg) + library(pkg, character.only = TRUE) + } + } # used in the large imputation function for two packages -install_bioconductor <- function(pkg){ - # If not installed, install the package - if (!require(pkg, character.only = TRUE)) { - BiocManager::install(pkg) - library(pkg, character.only = TRUE) - } - +install_bioconductor <- function(pkg) { + # If not installed, install the package + if (!require(pkg, character.only = TRUE)) { + BiocManager::install(pkg) + library(pkg, character.only = TRUE) + } 
+ } + for (package in packages_base_R) { # Check if the package is already installed - install_rpackage(pkg=package) + install_rpackage(pkg = package) } # - @@ -57,173 +59,289 @@ for (package in packages_base_R) { # - code is only slightly adapted from repo to run here, mainly to install packages on the fly # + vscode={"languageId": "r"} -nafunctions <- function(x,method="zero"){ - df<-df1<-as.data.frame(x) - method<-tolower(method) - if(method=="zero"){ - df[is.na(df)]<-0 - } - else if(method=="minimum"){ - df[is.na(df)]<-min(df1,na.rm = TRUE) - } - else if(method=="colmedian"){ - install_rpackage('e1071') - df<-impute(df1,what ="median") - } - else if(method=="rowmedian"){ - install_rpackage('e1071') - dfx<-impute(t(df1),what ="median") - df<-t(dfx) - } - else if(method=="knn_impute"){ - install_bioconductor('impute') - data_zero1<-impute.knn(as.matrix(df1),k = 10, rowmax = 1, colmax = 1)#rowmax = 0.9, colmax = 0.9 - df<-data_zero1$data - } - else if(method=="seqknn"){ - if(!require(SeqKnn)){ - install.packages("src/R_NAGuideR/SeqKnn_1.0.1.tar.gz", repos = NULL,type="source") - library(SeqKnn) - } - df <- SeqKNN(df1,k = 10) - } - else if(method=="bpca"){ - install_bioconductor('pcaMethods') - data_zero1<-pcaMethods::pca(as.matrix(df1), nPcs = ncol(df1)-1, method = "bpca", maxSteps =100) - df<-completeObs(data_zero1) - } - else if(method=="svdmethod"){ - install_bioconductor('pcaMethods') - data_zero1<-pcaMethods::pca(as.matrix(df1), nPcs = ncol(df1)-1, method = "svdImpute") - df<-completeObs(data_zero1) - } - else if(method=="lls"){ - install_bioconductor('pcaMethods') - data_zero1<-llsImpute(t(df1), k = 10) - df<-t(completeObs(data_zero1)) - } - else if(method=="mle"){ - install_rpackage('norm') - xxm<-as.matrix(df1) - ss <- norm::prelim.norm(xxm) - thx <- norm::em.norm(ss) - norm::rngseed(123) - df <- norm::imp.norm(ss, thx, xxm) - } - else if(method=="qrilc"){ - install_bioconductor("impute") - install_bioconductor("pcaMethods") - 
install_rpackage('imputeLCMD') - xxm<-t(df1) - data_zero1 <- imputeLCMD::impute.QRILC(xxm, tune.sigma = 1)[[1]] - df<-t(data_zero1) - } - else if(method=="mindet"){ - install_bioconductor("impute") - install_bioconductor("pcaMethods") - install_rpackage('imputeLCMD') - xxm<-as.matrix(df1) - df <- imputeLCMD::impute.MinDet(xxm, q = 0.01) - } - else if(method=="minprob"){ - install_bioconductor("impute") - install_bioconductor("pcaMethods") - install_rpackage('imputeLCMD') - xxm<-as.matrix(df1) - df <- imputeLCMD::impute.MinProb(xxm, q = 0.01, tune.sigma = 1) - } - else if(method=="irm"){ - install_rpackage('VIM') - df <- irmi(df1, trace = TRUE,imp_var=FALSE) - rownames(df)<-rownames(df1) - } - else if(method=="impseq"){ - install_rpackage('rrcovNA') - df <- impSeq(df1) - } - else if(method=="impseqrob"){ - install_rpackage('rrcovNA') - data_zero1 <- impSeqRob(df1, alpha=0.9) - df<-data_zero1$x - } - else if(method=="mice-norm"){ - install_rpackage('mice') - minum<-5 - datareadmi<-mice(df1,m=minum,seed = 1234, method ="norm") - newdatareadmi<-0 - for (i in 1:minum) { - newdatareadmi<-complete(datareadmi,action = i)+newdatareadmi - } - df<-newdatareadmi/minum - rownames(df)<-rownames(df1) - } - else if(method=="mice-cart"){ - install_rpackage('mice') - minum<-5 - datareadmi<-mice(df1,m=minum,seed = 1234, method ="cart") - newdatareadmi<-0 - for (i in 1:minum) { - newdatareadmi<-complete(datareadmi,action = i)+newdatareadmi - } - df<-newdatareadmi/minum - rownames(df)<-rownames(df1) - } - else if(method=="trknn"){ - source('src/R_NAGuideR/Imput_funcs.r') - sim_trKNN_wrapper <- function(data) { - result <- data %>% as.matrix %>% t %>% imputeKNN(., k=10, distance='truncation', perc=0) %>% t - return(result) - } - df1x <- sim_trKNN_wrapper(t(df1)) - df<-as.data.frame(t(df1x)) +nafunctions <- function(x, method = "zero") { + df <- df1 <- as.data.frame(x) + method <- tolower(method) + if (method == "zero") { + df[is.na(df)] <- 0 + } + else if (method == "minimum") { + 
df[is.na(df)] <- min(df1, na.rm = TRUE) + } + else if (method == "colmedian") { + install_rpackage('e1071') + df <- impute(df1, what = "median") + } + else if (method == "rowmedian") { + install_rpackage('e1071') + dfx <- impute(t(df1), what = "median") + df <- t(dfx) + } + else if (method == "knn_impute") { + install_bioconductor('impute') + data_zero1 <- + impute.knn(as.matrix(df1), + k = 10, + rowmax = 1, + colmax = 1)#rowmax = 0.9, colmax = 0.9 + df <- data_zero1$data + } + else if (method == "seqknn") { + if (!require(SeqKnn)) { + install.packages("src/R_NAGuideR/SeqKnn_1.0.1.tar.gz", + repos = NULL, + type = "source") + library(SeqKnn) } - else if(method=="rf"){ - install_rpackage("missForest") - data_zero1 <- missForest(t(df1), maxiter =10, - ntree = 20 # input$rfntrees - ,mtry=floor(nrow(df1)^(1/3)),verbose = TRUE) - df<-t(data_zero1$ximp) + df <- SeqKNN(df1, k = 10) + } + else if (method == "bpca") { + install_bioconductor('pcaMethods') + data_zero1 <- + pcaMethods::pca( + as.matrix(df1), + nPcs = ncol(df1) - 1, + method = "bpca", + maxSteps = 100 + ) + df <- completeObs(data_zero1) + } + else if (method == "svdmethod") { + install_bioconductor('pcaMethods') + data_zero1 <- + pcaMethods::pca(as.matrix(df1), + nPcs = ncol(df1) - 1, + method = "svdImpute") + df <- completeObs(data_zero1) + } + else if (method == "lls") { + install_bioconductor('pcaMethods') + data_zero1 <- llsImpute(t(df1), k = 10) + df <- t(completeObs(data_zero1)) + } + else if (method == "mle") { + install_rpackage('norm') + xxm <- as.matrix(df1) + ss <- norm::prelim.norm(xxm) + thx <- norm::em.norm(ss) + norm::rngseed(123) + df <- norm::imp.norm(ss, thx, xxm) + } + else if (method == "qrilc") { + install_bioconductor("impute") + install_bioconductor("pcaMethods") + install_rpackage('imputeLCMD') + xxm <- t(df1) + data_zero1 <- + imputeLCMD::impute.QRILC(xxm, tune.sigma = 1)[[1]] + df <- t(data_zero1) + } + else if (method == "mindet") { + install_bioconductor("impute") + 
install_bioconductor("pcaMethods") + install_rpackage('imputeLCMD') + xxm <- as.matrix(df1) + df <- imputeLCMD::impute.MinDet(xxm, q = 0.01) + } + else if (method == "minprob") { + install_bioconductor("impute") + install_bioconductor("pcaMethods") + install_rpackage('imputeLCMD') + xxm <- as.matrix(df1) + df <- + imputeLCMD::impute.MinProb(xxm, q = 0.01, tune.sigma = 1) + } + else if (method == "irm") { + install_rpackage('VIM') + df <- irmi(df1, trace = TRUE, imp_var = FALSE) + rownames(df) <- rownames(df1) + } + else if (method == "impseq") { + install_rpackage('rrcovNA') + df <- impSeq(df1) + } + else if (method == "impseqrob") { + install_rpackage('rrcovNA') + data_zero1 <- impSeqRob(df1, alpha = 0.9) + df <- data_zero1$x + } + else if (method == "mice-norm") { + install_rpackage('mice') + minum <- 5 + datareadmi <- mice(df1, + m = minum, + seed = 1234, + method = "norm") + newdatareadmi <- 0 + for (i in 1:minum) { + newdatareadmi <- complete(datareadmi, action = i) + newdatareadmi } - else if(method=="pi"){ - width <- 0.3 # input$piwidth - downshift <- 1.8 # input$pidownshift - for(i in 1:ncol(df1)){ - temp <- df1[[i]] - if(sum(is.na(temp))>0){ - temp.sd <- width * sd(temp[!is.na(temp)], na.rm = TRUE) - temp.mean <- mean(temp[!is.na(temp)], na.rm = TRUE) - downshift * sd(temp[!is.na(temp)], na.rm = TRUE) - n.missing <- sum(is.na(temp)) - temp[is.na(temp)] <- rnorm(n.missing, mean = temp.mean, sd = temp.sd) - df[[i]]<-temp - } - } - df + df <- newdatareadmi / minum + rownames(df) <- rownames(df1) + } + else if (method == "mice-cart") { + install_rpackage('mice') + minum <- 5 + datareadmi <- mice(df1, + m = minum, + seed = 1234, + method = "cart") + newdatareadmi <- 0 + for (i in 1:minum) { + newdatareadmi <- complete(datareadmi, action = i) + newdatareadmi } - # else if(method=="grr"){ - # library(DreamAI) - # df<-impute.RegImpute(data=as.matrix(df1), fillmethod = "row_mean", maxiter_RegImpute = 10,conv_nrmse = 1e-03) + df <- newdatareadmi / minum + 
rownames(df) <- rownames(df1) + } + else if (method == "trknn") { + source('src/R_NAGuideR/Imput_funcs.r') + # sim_trKNN_wrapper <- function(data) { + # result <- data %>% as.matrix %>% t %>% imputeKNN(., k=10, distance='truncation', perc=0) %>% t + # return(result) # } - else if(method=="gms"){ - # install.packages('GMSimpute') - if(!require(GMSimpute)){ - install.packages("src/R_NAGuideR/GMSimpute_0.0.1.1.tar.gz", repos = NULL, type="source"); - library(GMSimpute) + # df1x <- sim_trKNN_wrapper(t(df1)) + # df<-as.data.frame(t(df1x)) + df <- + imputeKNN(as.matrix(df), + k = 10, + distance = 'truncation', + perc = 0) + df <- as.data.frame(df) + } + else if (method == "rf") { + install_rpackage("missForest") + data_zero1 <- missForest( + t(df1), + maxiter = 10, + ntree = 20 # input$rfntrees + , + mtry = floor(nrow(df1) ^ (1 / 3)), + verbose = TRUE + ) + df <- t(data_zero1$ximp) + } + else if (method == "pi") { + width <- 0.3 # input$piwidth + downshift <- 1.8 # input$pidownshift + for (i in 1:ncol(df1)) { + temp <- df1[[i]] + if (sum(is.na(temp)) > 0) { + temp.sd <- width * sd(temp[!is.na(temp)], na.rm = TRUE) + temp.mean <- + mean(temp[!is.na(temp)], na.rm = TRUE) - downshift * sd(temp[!is.na(temp)], na.rm = TRUE) + n.missing <- sum(is.na(temp)) + temp[is.na(temp)] <- + rnorm(n.missing, mean = temp.mean, sd = temp.sd) + df[[i]] <- temp } + } + df + } + # else if(method=="grr"){ + # library(DreamAI) + # df<-impute.RegImpute(data=as.matrix(df1), fillmethod = "row_mean", maxiter_RegImpute = 10,conv_nrmse = 1e-03) + # } + else if (method == "gms") { + # install.packages('GMSimpute') + if (!require(GMSimpute)) { + install.packages( + "src/R_NAGuideR/GMSimpute_0.0.1.1.tar.gz", + repos = NULL, + type = "source" + ) - df<-GMS.Lasso(df1,nfolds=3,log.scale=FALSE,TS.Lasso=TRUE) + library(GMSimpute) } - else{ - stop(paste("Unspported methods so far: ", method)) + + df <- GMS.Lasso(df1, + nfolds = 3, + log.scale = FALSE, + TS.Lasso = TRUE) + } + else if (method == "msimpute") { 
+ install_bioconductor("msImpute") + df <- msImpute(as.matrix(df), + method = 'v2') + df <- as.data.frame(df) + } + else if (method == "msimpute_mnar") { + install_bioconductor("msImpute") + df <- + msImpute(as.matrix(df), + method = 'v2-mnar', + group = rep(1, dim(df)[2])) + df <- as.data.frame(df) + } + else if (method == "gsimp") { + options(stringsAsFactors = F) + # dependencies partly for sourced file + + install_bioconductor("impute") + install_bioconductor("pcaMethods") + install_rpackage('imputeLCMD') + install_rpackage("magrittr") + install_rpackage("glmnet") + install_rpackage("abind") + install_rpackage("foreach") + install_rpackage("doParallel") + source('src/R_NAGuideR/GSimp.R') + + # wrapper function with data pre-processing + pre_processing_GS_wrapper <- function(data_raw_log) { + # samples in rows, features in columns # + # Initialization # + data_raw_log_qrilc <- as.data.frame(data_raw_log) %>% + impute.QRILC() %>% extract2(1) + # Centralization and scaling # + data_raw_log_qrilc_sc <- + scale_recover(data_raw_log_qrilc, method = 'scale') + # Data after centralization and scaling # + data_raw_log_qrilc_sc_df <- data_raw_log_qrilc_sc[[1]] + # Parameters for centralization and scaling (for scaling recovery) # + data_raw_log_qrilc_sc_df_param <- data_raw_log_qrilc_sc[[2]] + # NA position # + NA_pos <- which(is.na(data_raw_log), arr.ind = T) + # NA introduced to log-scaled-initialized data # + data_raw_log_sc <- data_raw_log_qrilc_sc_df + data_raw_log_sc[NA_pos] <- NA + # Feed initialized and missing data into GSimp imputation # + result <- + data_raw_log_sc %>% GS_impute( + ., + iters_each = 50, + iters_all = 10, + initial = data_raw_log_qrilc_sc_df, + lo = -Inf, + hi = 'min', + n_cores = 1, + imp_model = 'glmnet_pred' + ) + data_imp_log_sc <- result$data_imp + # Data recovery # + data_imp <- data_imp_log_sc %>% + scale_recover(., method = 'recover', + param_df = data_raw_log_qrilc_sc_df_param) %>% + extract2(1) + return(data_imp) } - 
df<-as.data.frame(df) - df + df <- t(df) # samples in rows, feature in columns + df <- pre_processing_GS_wrapper(df) + df <- t(df) # features in rows, samples in columns + } + else{ + stop(paste("Unspported methods so far: ", method)) + } + df <- as.data.frame(df) + df +} # - - # ## Parameters # -# Choose one of the available methods. Some methods might fail for your dataset for unknown reasons (and the error won't always be easy to under) +# Choose one of the available methods. +# Some methods might fail for your dataset for unknown reasons +# (and the error won't always be easy to understand) # ```method # method = 'ZERO' # method = 'MINIMUM' @@ -247,6 +365,10 @@ nafunctions <- function(x,method="zero"){ # method = 'RF' # method = 'PI' # method = 'GMS' +# method = 'TRKNN', +# method = 'MSIMPUTE' +# method = 'MSIMPUTE_MNAR' +# method = 'GSIMP' # ``` # + tags=["parameters"] vscode={"languageId": "r"} @@ -258,47 +380,61 @@ method = 'KNN_IMPUTE' # ## Dump predictions # + vscode={"languageId": "r"} -df <- utils::read.csv(train_split, row.names=1, header=TRUE, stringsAsFactors = FALSE) +df <- + utils::read.csv( + train_split, + row.names = 1, + header = TRUE, + stringsAsFactors = FALSE + ) df # - # - `data.frame` does not allow abritary column names, but only valid column names... -# - tibbles don't support rownames, and the imputation methods rely on normal `data.frame`s. Save the header row for later use. +# - tibbles don't support rownames, and the imputation methods rely on normal `data.frame`s. +# Save the header row for later use. 
# + vscode={"languageId": "r"} -original_header <- colnames( - readr::read_csv(train_split, n_max=1, col_names=TRUE, skip=0) -) -original_header +original_header <- colnames(readr::read_csv( + train_split, + n_max = 1, + col_names = TRUE, + skip = 0 +)) feat_name <- original_header[1] +original_header[1:5] # - # Uncomment to test certain methods (only for debugging, as at least one method per package is tested using Github Actions) # + vscode={"languageId": "r"} # to_test <- c( - # 'ZERO', - # 'MINIMUM', - # 'COLMEDIAN', - # 'ROWMEDIAN', - # 'KNN_IMPUTE', - # 'SEQKNN', - # 'BPCA', - # 'SVDMETHOD', - # 'LLS', - # 'MLE', - # 'LLS', - # 'QRILC', - # 'MINDET', - # 'MINPROB', - # 'IRM', - # 'IMPSEQ', - # 'IMPSEQROB', - # 'MICE-NORM', - # 'MICE-CART', - # 'RF', - # 'PI', - # 'GMS' # fails to install on Windows +# 'ZERO', +# 'MINIMUM', +# 'COLMEDIAN', +# 'ROWMEDIAN', +# 'KNN_IMPUTE', +# 'SEQKNN', +# 'BPCA', +# 'SVDMETHOD', +# 'LLS', +# 'MLE', +# 'LLS', +# 'QRILC', +# 'MINDET', +# 'MINPROB', +# 'IRM', +# 'IMPSEQ', +# 'IMPSEQROB', +# 'MICE-NORM', +# 'MICE-CART', +# 'RF', +# 'PI', +# 'GMS', # fails to install on Windows +# 'TRKNN', +# 'MSIMPUTE' +# 'MSIMPUTE_MNAR' +# 'GSIMP' # ) # for (method in to_test) { @@ -311,14 +447,11 @@ feat_name <- original_header[1] # + vscode={"languageId": "r"} pred <- nafunctions(df, method) -pred <- tibble::as_tibble( - cbind(rownames(pred), pred) -) +pred <- tibble::as_tibble(cbind(rownames(pred), pred)) names(pred) <- original_header pred - # + vscode={"languageId": "r"} -pred <- reshape2::melt(pred, id.vars=feat_name) +pred <- reshape2::melt(pred, id.vars = feat_name) names(pred) <- c(feat_name, 'Sample ID', method) pred <- pred[reshape2::melt(is.na(df))['value'] == TRUE, ] pred @@ -327,8 +460,10 @@ pred dim(pred) # + vscode={"languageId": "r"} -fname = file.path(folder_experiment, 'preds', paste0('pred_all_', toupper(method), '.csv')) +fname = file.path(folder_experiment, + 'preds', + paste0('pred_all_', toupper(method), '.csv')) fname # + 
vscode={"languageId": "r"} -write_csv(pred, path=fname) +write_csv(pred, path = fname) diff --git a/project/01_1_train_NAGuideR_methods.ipynb b/project/01_1_train_NAGuideR_methods.ipynb index 968614c11..072e207c2 100644 --- a/project/01_1_train_NAGuideR_methods.ipynb +++ b/project/01_1_train_NAGuideR_methods.ipynb @@ -1,7 +1,6 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "id": "afa6aadb-bb6e-4fa2-8c91-b69d6ff9af43", "metadata": {}, @@ -24,35 +23,36 @@ }, "outputs": [], "source": [ - "packages_base_R <- c(\"BiocManager\", \"reshape2\", \"data.table\", \"readr\", \"tibble\")\n", + "packages_base_R <-\n", + " c(\"BiocManager\", \"reshape2\", \"data.table\", \"readr\", \"tibble\")\n", "\n", - "install_rpackage <- function(pkg){\n", - " # If not installed, install the package\n", - " if (!require(pkg, character.only = TRUE)) {\n", - " install.packages(pkg)\n", - " library(pkg, character.only = TRUE)\n", - " }\n", - " \n", + "install_rpackage <- function(pkg) {\n", + " # If not installed, install the package\n", + " if (!require(pkg, character.only = TRUE)) {\n", + " install.packages(pkg)\n", + " library(pkg, character.only = TRUE)\n", + " }\n", + " \n", "}\n", "\n", "# used in the large imputation function for two packages\n", - "install_bioconductor <- function(pkg){\n", - " # If not installed, install the package\n", - " if (!require(pkg, character.only = TRUE)) {\n", - " BiocManager::install(pkg)\n", - " library(pkg, character.only = TRUE)\n", - " }\n", - " \n", + "install_bioconductor <- function(pkg) {\n", + " # If not installed, install the package\n", + " if (!require(pkg, character.only = TRUE)) {\n", + " BiocManager::install(pkg)\n", + " library(pkg, character.only = TRUE)\n", + " }\n", + " \n", "}\n", "\n", + "\n", "for (package in packages_base_R) {\n", " # Check if the package is already installed\n", - " install_rpackage(pkg=package)\n", + " install_rpackage(pkg = package)\n", "}\n" ] }, { - "attachments": {}, "cell_type": "markdown", 
"id": "edec29ae-208a-403a-aa77-82782bccba87", "metadata": {}, @@ -61,7 +61,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "c2e51b96-2f46-42c7-a642-a94c628dec04", "metadata": {}, @@ -78,185 +77,300 @@ "execution_count": null, "id": "f9c48bf7-d31c-4073-895b-e9cf920ff1d3", "metadata": { - "lines_to_next_cell": 2, "vscode": { "languageId": "r" } }, "outputs": [], "source": [ - "nafunctions <- function(x,method=\"zero\"){\n", - " df<-df1<-as.data.frame(x)\n", - " method<-tolower(method)\n", - " if(method==\"zero\"){\n", - " df[is.na(df)]<-0\n", - " }\n", - " else if(method==\"minimum\"){\n", - " df[is.na(df)]<-min(df1,na.rm = TRUE)\n", - " }\n", - " else if(method==\"colmedian\"){\n", - " install_rpackage('e1071')\n", - " df<-impute(df1,what =\"median\")\n", - " }\n", - " else if(method==\"rowmedian\"){\n", - " install_rpackage('e1071')\n", - " dfx<-impute(t(df1),what =\"median\")\n", - " df<-t(dfx)\n", - " }\n", - " else if(method==\"knn_impute\"){\n", - " install_bioconductor('impute')\n", - " data_zero1<-impute.knn(as.matrix(df1),k = 10, rowmax = 1, colmax = 1)#rowmax = 0.9, colmax = 0.9\n", - " df<-data_zero1$data\n", - " }\n", - " else if(method==\"seqknn\"){\n", - " if(!require(SeqKnn)){\n", - " install.packages(\"src/R_NAGuideR/SeqKnn_1.0.1.tar.gz\", repos = NULL,type=\"source\")\n", - " library(SeqKnn)\n", - " }\n", - " df <- SeqKNN(df1,k = 10)\n", - " }\n", - " else if(method==\"bpca\"){\n", - " install_bioconductor('pcaMethods')\n", - " data_zero1<-pcaMethods::pca(as.matrix(df1), nPcs = ncol(df1)-1, method = \"bpca\", maxSteps =100)\n", - " df<-completeObs(data_zero1)\n", - " }\n", - " else if(method==\"svdmethod\"){\n", - " install_bioconductor('pcaMethods')\n", - " data_zero1<-pcaMethods::pca(as.matrix(df1), nPcs = ncol(df1)-1, method = \"svdImpute\")\n", - " df<-completeObs(data_zero1)\n", - " }\n", - " else if(method==\"lls\"){\n", - " install_bioconductor('pcaMethods')\n", - " data_zero1<-llsImpute(t(df1), k = 10)\n", - " 
df<-t(completeObs(data_zero1))\n", - " }\n", - " else if(method==\"mle\"){\n", - " install_rpackage('norm')\n", - " xxm<-as.matrix(df1)\n", - " ss <- norm::prelim.norm(xxm)\n", - " thx <- norm::em.norm(ss)\n", - " norm::rngseed(123)\n", - " df <- norm::imp.norm(ss, thx, xxm)\n", - " }\n", - " else if(method==\"qrilc\"){\n", - " install_bioconductor(\"impute\")\n", - " install_bioconductor(\"pcaMethods\")\n", - " install_rpackage('imputeLCMD')\n", - " xxm<-t(df1)\n", - " data_zero1 <- imputeLCMD::impute.QRILC(xxm, tune.sigma = 1)[[1]]\n", - " df<-t(data_zero1)\n", - " }\n", - " else if(method==\"mindet\"){\n", - " install_bioconductor(\"impute\")\n", - " install_bioconductor(\"pcaMethods\")\n", - " install_rpackage('imputeLCMD')\n", - " xxm<-as.matrix(df1)\n", - " df <- imputeLCMD::impute.MinDet(xxm, q = 0.01)\n", - " }\n", - " else if(method==\"minprob\"){\n", - " install_bioconductor(\"impute\")\n", - " install_bioconductor(\"pcaMethods\")\n", - " install_rpackage('imputeLCMD')\n", - " xxm<-as.matrix(df1)\n", - " df <- imputeLCMD::impute.MinProb(xxm, q = 0.01, tune.sigma = 1)\n", + "nafunctions <- function(x, method = \"zero\") {\n", + " df <- df1 <- as.data.frame(x)\n", + " method <- tolower(method)\n", + " if (method == \"zero\") {\n", + " df[is.na(df)] <- 0\n", + " }\n", + " else if (method == \"minimum\") {\n", + " df[is.na(df)] <- min(df1, na.rm = TRUE)\n", + " }\n", + " else if (method == \"colmedian\") {\n", + " install_rpackage('e1071')\n", + " df <- impute(df1, what = \"median\")\n", + " }\n", + " else if (method == \"rowmedian\") {\n", + " install_rpackage('e1071')\n", + " dfx <- impute(t(df1), what = \"median\")\n", + " df <- t(dfx)\n", + " }\n", + " else if (method == \"knn_impute\") {\n", + " install_bioconductor('impute')\n", + " data_zero1 <-\n", + " impute.knn(as.matrix(df1),\n", + " k = 10,\n", + " rowmax = 1,\n", + " colmax = 1)#rowmax = 0.9, colmax = 0.9\n", + " df <- data_zero1$data\n", + " }\n", + " else if (method == \"seqknn\") {\n", + " if 
(!require(SeqKnn)) {\n", + " install.packages(\"src/R_NAGuideR/SeqKnn_1.0.1.tar.gz\",\n", + " repos = NULL,\n", + " type = \"source\")\n", + " library(SeqKnn)\n", " }\n", - " else if(method==\"irm\"){\n", - " install_rpackage('VIM')\n", - " df <- irmi(df1, trace = TRUE,imp_var=FALSE)\n", - " rownames(df)<-rownames(df1)\n", + " df <- SeqKNN(df1, k = 10)\n", + " }\n", + " else if (method == \"bpca\") {\n", + " install_bioconductor('pcaMethods')\n", + " data_zero1 <-\n", + " pcaMethods::pca(\n", + " as.matrix(df1),\n", + " nPcs = ncol(df1) - 1,\n", + " method = \"bpca\",\n", + " maxSteps = 100\n", + " )\n", + " df <- completeObs(data_zero1)\n", + " }\n", + " else if (method == \"svdmethod\") {\n", + " install_bioconductor('pcaMethods')\n", + " data_zero1 <-\n", + " pcaMethods::pca(as.matrix(df1),\n", + " nPcs = ncol(df1) - 1,\n", + " method = \"svdImpute\")\n", + " df <- completeObs(data_zero1)\n", + " }\n", + " else if (method == \"lls\") {\n", + " install_bioconductor('pcaMethods')\n", + " data_zero1 <- llsImpute(t(df1), k = 10)\n", + " df <- t(completeObs(data_zero1))\n", + " }\n", + " else if (method == \"mle\") {\n", + " install_rpackage('norm')\n", + " xxm <- as.matrix(df1)\n", + " ss <- norm::prelim.norm(xxm)\n", + " thx <- norm::em.norm(ss)\n", + " norm::rngseed(123)\n", + " df <- norm::imp.norm(ss, thx, xxm)\n", + " }\n", + " else if (method == \"qrilc\") {\n", + " install_bioconductor(\"impute\")\n", + " install_bioconductor(\"pcaMethods\")\n", + " install_rpackage('imputeLCMD')\n", + " xxm <- t(df1)\n", + " data_zero1 <-\n", + " imputeLCMD::impute.QRILC(xxm, tune.sigma = 1)[[1]]\n", + " df <- t(data_zero1)\n", + " }\n", + " else if (method == \"mindet\") {\n", + " install_bioconductor(\"impute\")\n", + " install_bioconductor(\"pcaMethods\")\n", + " install_rpackage('imputeLCMD')\n", + " xxm <- as.matrix(df1)\n", + " df <- imputeLCMD::impute.MinDet(xxm, q = 0.01)\n", + " }\n", + " else if (method == \"minprob\") {\n", + " install_bioconductor(\"impute\")\n", 
+ " install_bioconductor(\"pcaMethods\")\n", + " install_rpackage('imputeLCMD')\n", + " xxm <- as.matrix(df1)\n", + " df <-\n", + " imputeLCMD::impute.MinProb(xxm, q = 0.01, tune.sigma = 1)\n", + " }\n", + " else if (method == \"irm\") {\n", + " install_rpackage('VIM')\n", + " df <- irmi(df1, trace = TRUE, imp_var = FALSE)\n", + " rownames(df) <- rownames(df1)\n", + " }\n", + " else if (method == \"impseq\") {\n", + " install_rpackage('rrcovNA')\n", + " df <- impSeq(df1)\n", + " }\n", + " else if (method == \"impseqrob\") {\n", + " install_rpackage('rrcovNA')\n", + " data_zero1 <- impSeqRob(df1, alpha = 0.9)\n", + " df <- data_zero1$x\n", + " }\n", + " else if (method == \"mice-norm\") {\n", + " install_rpackage('mice')\n", + " minum <- 5\n", + " datareadmi <- mice(df1,\n", + " m = minum,\n", + " seed = 1234,\n", + " method = \"norm\")\n", + " newdatareadmi <- 0\n", + " for (i in 1:minum) {\n", + " newdatareadmi <- complete(datareadmi, action = i) + newdatareadmi\n", " }\n", - " else if(method==\"impseq\"){\n", - " install_rpackage('rrcovNA')\n", - " df <- impSeq(df1)\n", + " df <- newdatareadmi / minum\n", + " rownames(df) <- rownames(df1)\n", + " }\n", + " else if (method == \"mice-cart\") {\n", + " install_rpackage('mice')\n", + " minum <- 5\n", + " datareadmi <- mice(df1,\n", + " m = minum,\n", + " seed = 1234,\n", + " method = \"cart\")\n", + " newdatareadmi <- 0\n", + " for (i in 1:minum) {\n", + " newdatareadmi <- complete(datareadmi, action = i) + newdatareadmi\n", " }\n", - " else if(method==\"impseqrob\"){\n", - " install_rpackage('rrcovNA')\n", - " data_zero1 <- impSeqRob(df1, alpha=0.9)\n", - " df<-data_zero1$x\n", - " }\n", - " else if(method==\"mice-norm\"){\n", - " install_rpackage('mice')\n", - " minum<-5\n", - " datareadmi<-mice(df1,m=minum,seed = 1234, method =\"norm\")\n", - " newdatareadmi<-0\n", - " for (i in 1:minum) {\n", - " newdatareadmi<-complete(datareadmi,action = i)+newdatareadmi\n", - " }\n", - " df<-newdatareadmi/minum\n", - " 
rownames(df)<-rownames(df1)\n", - " }\n", - " else if(method==\"mice-cart\"){\n", - " install_rpackage('mice')\n", - " minum<-5\n", - " datareadmi<-mice(df1,m=minum,seed = 1234, method =\"cart\")\n", - " newdatareadmi<-0\n", - " for (i in 1:minum) {\n", - " newdatareadmi<-complete(datareadmi,action = i)+newdatareadmi\n", - " }\n", - " df<-newdatareadmi/minum\n", - " rownames(df)<-rownames(df1)\n", - " }\n", - " else if(method==\"trknn\"){\n", - " source('src/R_NAGuideR/Imput_funcs.r')\n", - " sim_trKNN_wrapper <- function(data) {\n", - " result <- data %>% as.matrix %>% t %>% imputeKNN(., k=10, distance='truncation', perc=0) %>% t\n", - " return(result)\n", - " }\n", - " df1x <- sim_trKNN_wrapper(t(df1))\n", - " df<-as.data.frame(t(df1x))\n", - " }\n", - " else if(method==\"rf\"){\n", - " install_rpackage(\"missForest\")\n", - " data_zero1 <- missForest(t(df1), maxiter =10,\n", - " ntree = 20 # input$rfntrees\n", - " ,mtry=floor(nrow(df1)^(1/3)),verbose = TRUE)\n", - " df<-t(data_zero1$ximp)\n", - " }\n", - " else if(method==\"pi\"){\n", - " width <- 0.3 # input$piwidth\n", - " downshift <- 1.8 # input$pidownshift\n", - " for(i in 1:ncol(df1)){\n", - " temp <- df1[[i]]\n", - " if(sum(is.na(temp))>0){\n", - " temp.sd <- width * sd(temp[!is.na(temp)], na.rm = TRUE)\n", - " temp.mean <- mean(temp[!is.na(temp)], na.rm = TRUE) - downshift * sd(temp[!is.na(temp)], na.rm = TRUE)\n", - " n.missing <- sum(is.na(temp))\n", - " temp[is.na(temp)] <- rnorm(n.missing, mean = temp.mean, sd = temp.sd)\n", - " df[[i]]<-temp\n", - " }\n", - " }\n", - " df\n", - " }\n", - " # else if(method==\"grr\"){\n", - " # library(DreamAI)\n", - " # df<-impute.RegImpute(data=as.matrix(df1), fillmethod = \"row_mean\", maxiter_RegImpute = 10,conv_nrmse = 1e-03)\n", + " df <- newdatareadmi / minum\n", + " rownames(df) <- rownames(df1)\n", + " }\n", + " else if (method == \"trknn\") {\n", + " source('src/R_NAGuideR/Imput_funcs.r')\n", + " # sim_trKNN_wrapper <- function(data) {\n", + " # result <- 
data %>% as.matrix %>% t %>% imputeKNN(., k=10, distance='truncation', perc=0) %>% t\n", + " # return(result)\n", " # }\n", - " else if(method==\"gms\"){\n", - " # install.packages('GMSimpute')\n", - " if(!require(GMSimpute)){\n", - " install.packages(\"src/R_NAGuideR/GMSimpute_0.0.1.1.tar.gz\", repos = NULL, type=\"source\");\n", - " library(GMSimpute)\n", + " # df1x <- sim_trKNN_wrapper(t(df1))\n", + " # df<-as.data.frame(t(df1x))\n", + " df <-\n", + " imputeKNN(as.matrix(df),\n", + " k = 10,\n", + " distance = 'truncation',\n", + " perc = 0)\n", + " df <- as.data.frame(df)\n", + " }\n", + " else if (method == \"rf\") {\n", + " install_rpackage(\"missForest\")\n", + " data_zero1 <- missForest(\n", + " t(df1),\n", + " maxiter = 10,\n", + " ntree = 20 # input$rfntrees\n", + " ,\n", + " mtry = floor(nrow(df1) ^ (1 / 3)),\n", + " verbose = TRUE\n", + " )\n", + " df <- t(data_zero1$ximp)\n", + " }\n", + " else if (method == \"pi\") {\n", + " width <- 0.3 # input$piwidth\n", + " downshift <- 1.8 # input$pidownshift\n", + " for (i in 1:ncol(df1)) {\n", + " temp <- df1[[i]]\n", + " if (sum(is.na(temp)) > 0) {\n", + " temp.sd <- width * sd(temp[!is.na(temp)], na.rm = TRUE)\n", + " temp.mean <-\n", + " mean(temp[!is.na(temp)], na.rm = TRUE) - downshift * sd(temp[!is.na(temp)], na.rm = TRUE)\n", + " n.missing <- sum(is.na(temp))\n", + " temp[is.na(temp)] <-\n", + " rnorm(n.missing, mean = temp.mean, sd = temp.sd)\n", + " df[[i]] <- temp\n", " }\n", + " }\n", + " df\n", + " }\n", + " # else if(method==\"grr\"){\n", + " # library(DreamAI)\n", + " # df<-impute.RegImpute(data=as.matrix(df1), fillmethod = \"row_mean\", maxiter_RegImpute = 10,conv_nrmse = 1e-03)\n", + " # }\n", + " else if (method == \"gms\") {\n", + " # install.packages('GMSimpute')\n", + " if (!require(GMSimpute)) {\n", + " install.packages(\n", + " \"src/R_NAGuideR/GMSimpute_0.0.1.1.tar.gz\",\n", + " repos = NULL,\n", + " type = \"source\"\n", + " )\n", " \n", - " 
df<-GMS.Lasso(df1,nfolds=3,log.scale=FALSE,TS.Lasso=TRUE)\n", + " library(GMSimpute)\n", " }\n", - " else{\n", - " stop(paste(\"Unspported methods so far: \", method))\n", + " \n", + " df <- GMS.Lasso(df1,\n", + " nfolds = 3,\n", + " log.scale = FALSE,\n", + " TS.Lasso = TRUE)\n", + " }\n", + " else if (method == \"msimpute\") {\n", + " install_bioconductor(\"msImpute\")\n", + " df <- msImpute(as.matrix(df),\n", + " method = 'v2')\n", + " df <- as.data.frame(df)\n", + " }\n", + " else if (method == \"msimpute_mnar\") {\n", + " install_bioconductor(\"msImpute\")\n", + " df <-\n", + " msImpute(as.matrix(df),\n", + " method = 'v2-mnar',\n", + " group = rep(1, dim(df)[2]))\n", + " df <- as.data.frame(df)\n", + " }\n", + " else if (method == \"gsimp\") {\n", + " options(stringsAsFactors = F)\n", + " # dependencies parly for sourced file\n", + " \n", + " install_bioconductor(\"impute\")\n", + " install_bioconductor(\"pcaMethods\")\n", + " install_rpackage('imputeLCMD')\n", + " install_rpackage(\"magrittr\")\n", + " install_rpackage(\"glmnet\")\n", + " install_rpackage(\"abind\")\n", + " install_rpackage(\"foreach\")\n", + " install_rpackage(\"doParallel\")\n", + " source('src/R_NAGuideR/GSimp.R')\n", + " \n", + " # wrapper function with data pre-processing\n", + " pre_processing_GS_wrapper <- function(data_raw_log) {\n", + " # samples in rows, features in columns #\n", + " # Initialization #\n", + " data_raw_log_qrilc <- as.data.frame(data_raw_log) %>%\n", + " impute.QRILC() %>% extract2(1)\n", + " # Centralization and scaling #\n", + " data_raw_log_qrilc_sc <-\n", + " scale_recover(data_raw_log_qrilc, method = 'scale')\n", + " # Data after centralization and scaling #\n", + " data_raw_log_qrilc_sc_df <- data_raw_log_qrilc_sc[[1]]\n", + " # Parameters for centralization and scaling (for scaling recovery) #\n", + " data_raw_log_qrilc_sc_df_param <- data_raw_log_qrilc_sc[[2]]\n", + " # NA position #\n", + " NA_pos <- which(is.na(data_raw_log), arr.ind = T)\n", + " # NA 
introduced to log-scaled-initialized data #\n", + " data_raw_log_sc <- data_raw_log_qrilc_sc_df\n", + " data_raw_log_sc[NA_pos] <- NA\n", + " # Feed initialized and missing data into GSimp imputation #\n", + " result <-\n", + " data_raw_log_sc %>% GS_impute(\n", + " .,\n", + " iters_each = 50,\n", + " iters_all = 10,\n", + " initial = data_raw_log_qrilc_sc_df,\n", + " lo = -Inf,\n", + " hi = 'min',\n", + " n_cores = 1,\n", + " imp_model = 'glmnet_pred'\n", + " )\n", + " data_imp_log_sc <- result$data_imp\n", + " # Data recovery #\n", + " data_imp <- data_imp_log_sc %>%\n", + " scale_recover(., method = 'recover',\n", + " param_df = data_raw_log_qrilc_sc_df_param) %>%\n", + " extract2(1)\n", + " return(data_imp)\n", " }\n", - " df<-as.data.frame(df)\n", - " df\n", - " }" + " df <- t(df) # samples in rows, feature in columns\n", + " df <- pre_processing_GS_wrapper(df)\n", + " df <- t(df) # features in rows, samples in columns\n", + " \n", + " }\n", + " else{\n", + " stop(paste(\"Unspported methods so far: \", method))\n", + " }\n", + " df <- as.data.frame(df)\n", + " df\n", + "}" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "7152239b-fad2-4e0f-8b3e-98d943cab264", "metadata": {}, "source": [ "## Parameters\n", "\n", - "Choose one of the available methods. Some methods might fail for your dataset for unknown reasons (and the error won't always be easy to under)\n", + "Choose one of the available methods. 
\n", + "Some methods might fail for your dataset for unknown reasons\n", + "(and the error won't always be easy to understand)\n", "```method\n", "method = 'ZERO'\n", "method = 'MINIMUM'\n", @@ -280,6 +394,10 @@ "method = 'RF'\n", "method = 'PI'\n", "method = 'GMS'\n", + "method = 'TRKNN',\n", + "method = 'MSIMPUTE'\n", + "method = 'MSIMPUTE_MNAR'\n", + "method = 'GSIMP'\n", "```" ] }, @@ -303,7 +421,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "fc7ef882-0cbd-40f7-a77f-cc87f7145171", "metadata": {}, @@ -323,12 +440,17 @@ }, "outputs": [], "source": [ - "df <- utils::read.csv(train_split, row.names=1, header=TRUE, stringsAsFactors = FALSE)\n", + "df <-\n", + " utils::read.csv(\n", + " train_split,\n", + " row.names = 1,\n", + " header = TRUE,\n", + " stringsAsFactors = FALSE\n", + " )\n", "df" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "3ed78a0c-2716-4629-bb15-8e3fd650576a", "metadata": { @@ -336,7 +458,8 @@ }, "source": [ "- `data.frame` does not allow abritary column names, but only valid column names...\n", - "- tibbles don't support rownames, and the imputation methods rely on normal `data.frame`s. Save the header row for later use." + "- tibbles don't support rownames, and the imputation methods rely on normal `data.frame`s.\n", + "Save the header row for later use." 
] }, { @@ -351,11 +474,14 @@ }, "outputs": [], "source": [ - "original_header <- colnames(\n", - " readr::read_csv(train_split, n_max=1, col_names=TRUE, skip=0)\n", - ")\n", - "original_header\n", - "feat_name <- original_header[1]" + "original_header <- colnames(readr::read_csv(\n", + " train_split,\n", + " n_max = 1,\n", + " col_names = TRUE,\n", + " skip = 0\n", + "))\n", + "feat_name <- original_header[1]\n", + "original_header[1:5]" ] }, { @@ -378,28 +504,32 @@ "outputs": [], "source": [ "# to_test <- c(\n", - " # 'ZERO',\n", - " # 'MINIMUM',\n", - " # 'COLMEDIAN',\n", - " # 'ROWMEDIAN',\n", - " # 'KNN_IMPUTE',\n", - " # 'SEQKNN',\n", - " # 'BPCA',\n", - " # 'SVDMETHOD',\n", - " # 'LLS',\n", - " # 'MLE',\n", - " # 'LLS',\n", - " # 'QRILC',\n", - " # 'MINDET',\n", - " # 'MINPROB',\n", - " # 'IRM',\n", - " # 'IMPSEQ',\n", - " # 'IMPSEQROB',\n", - " # 'MICE-NORM',\n", - " # 'MICE-CART',\n", - " # 'RF',\n", - " # 'PI',\n", - " # 'GMS' # fails to install on Windows\n", + "# 'ZERO',\n", + "# 'MINIMUM',\n", + "# 'COLMEDIAN',\n", + "# 'ROWMEDIAN',\n", + "# 'KNN_IMPUTE',\n", + "# 'SEQKNN',\n", + "# 'BPCA',\n", + "# 'SVDMETHOD',\n", + "# 'LLS',\n", + "# 'MLE',\n", + "# 'LLS',\n", + "# 'QRILC',\n", + "# 'MINDET',\n", + "# 'MINPROB',\n", + "# 'IRM',\n", + "# 'IMPSEQ',\n", + "# 'IMPSEQROB',\n", + "# 'MICE-NORM',\n", + "# 'MICE-CART',\n", + "# 'RF',\n", + "# 'PI',\n", + "# 'GMS', # fails to install on Windows\n", + "# 'TRKNN',\n", + "# 'MSIMPUTE'\n", + "# 'MSIMPUTE_MNAR'\n", + "# 'GSIMP'\n", "# )\n", "\n", "# for (method in to_test) {\n", @@ -409,7 +539,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "ff4ff1c2-192e-4a48-b5b6-d80ab989b12e", "metadata": {}, @@ -422,6 +551,7 @@ "execution_count": null, "id": "690d47c2-5666-41f2-b13f-9215334f197c", "metadata": { + "lines_to_next_cell": 0, "tags": [], "vscode": { "languageId": "r" @@ -430,9 +560,7 @@ "outputs": [], "source": [ "pred <- nafunctions(df, method)\n", - "pred <- tibble::as_tibble(\n", - " 
cbind(rownames(pred), pred)\n", - ")\n", + "pred <- tibble::as_tibble(cbind(rownames(pred), pred))\n", "names(pred) <- original_header\n", "pred" ] @@ -449,7 +577,7 @@ }, "outputs": [], "source": [ - "pred <- reshape2::melt(pred, id.vars=feat_name)\n", + "pred <- reshape2::melt(pred, id.vars = feat_name)\n", "names(pred) <- c(feat_name, 'Sample ID', method)\n", "pred <- pred[reshape2::melt(is.na(df))['value'] == TRUE, ]\n", "pred" @@ -480,7 +608,9 @@ }, "outputs": [], "source": [ - "fname = file.path(folder_experiment, 'preds', paste0('pred_all_', toupper(method), '.csv'))\n", + "fname = file.path(folder_experiment,\n", + " 'preds',\n", + " paste0('pred_all_', toupper(method), '.csv'))\n", "fname" ] }, @@ -495,7 +625,7 @@ }, "outputs": [], "source": [ - "write_csv(pred, path=fname)" + "write_csv(pred, path = fname)" ] } ], @@ -514,7 +644,7 @@ "mimetype": "text/x-r-source", "name": "R", "pygments_lexer": "r", - "version": "3.6.3" + "version": "4.1.3" } }, "nbformat": 4, diff --git a/project/01_1_train_RSN.ipynb b/project/01_1_train_RSN.ipynb index cee641bd3..95b208ebf 100644 --- a/project/01_1_train_RSN.ipynb +++ b/project/01_1_train_RSN.ipynb @@ -75,8 +75,8 @@ "# model\n", "sample_idx_position: int = 0 # position of index which is sample ID\n", "# model key (lower cased version will be used for file names)\n", - "axis: int = 1 # impute per row/sample (1) or per column/feat (0). \n", - "completeness = 0.6 # fractio of non missing values for row/sample (axis=0) or column/feat (axis=1)\n", + "axis: int = 1 # impute per row/sample (1) or per column/feat (0).\n", + "completeness = 0.6 # fractio of non missing values for row/sample (axis=0) or column/feat (axis=1)\n", "model_key: str = 'RSN'\n", "model: str = 'RSN' # model name\n", "save_pred_real_na: bool = True # Save all predictions for real na\n", @@ -458,7 +458,9 @@ "source": [ "### Test Datasplit\n", "\n", - "Fake NAs : Artificially created NAs. 
Some data was sampled and set explicitly to misssing before it was fed to the model for reconstruction." + "Fake NAs : Artificially created NAs. Some data was sampled and set\n", + "explicitly to misssing before it was fed to the model for\n", + "reconstruction." ] }, { diff --git a/project/01_1_train_RSN.py b/project/01_1_train_RSN.py index 578b2be15..73643f02a 100644 --- a/project/01_1_train_RSN.py +++ b/project/01_1_train_RSN.py @@ -6,7 +6,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -52,8 +52,8 @@ # model sample_idx_position: int = 0 # position of index which is sample ID # model key (lower cased version will be used for file names) -axis: int = 1 # impute per row/sample (1) or per column/feat (0). -completeness = 0.6 # fractio of non missing values for row/sample (axis=0) or column/feat (axis=1) +axis: int = 1 # impute per row/sample (1) or per column/feat (0). +completeness = 0.6 # fractio of non missing values for row/sample (axis=0) or column/feat (axis=1) model_key: str = 'RSN' model: str = 'RSN' # model name save_pred_real_na: bool = True # Save all predictions for real na @@ -224,7 +224,9 @@ # %% [markdown] # ### Test Datasplit # -# Fake NAs : Artificially created NAs. Some data was sampled and set explicitly to misssing before it was fed to the model for reconstruction. +# Fake NAs : Artificially created NAs. Some data was sampled and set +# explicitly to misssing before it was fed to the model for +# reconstruction. 
# %% added_metrics = d_metrics.add_metrics(test_pred_fake_na, 'test_fake_na') diff --git a/project/01_1_train_VAE.ipynb b/project/01_1_train_VAE.ipynb index a100b8b74..d1346cc9c 100644 --- a/project/01_1_train_VAE.ipynb +++ b/project/01_1_train_VAE.ipynb @@ -17,6 +17,7 @@ }, "outputs": [], "source": [ + "\n", "import logging\n", "\n", "\n", @@ -27,24 +28,27 @@ "\n", "from torch.nn import Sigmoid\n", "\n", - "# overwriting Recorder callback with custom plot_loss\n", - "from vaep.models import plot_loss\n", - "from fastai import learner\n", - "learner.Recorder.plot_loss = plot_loss\n", - "\n", "import pandas as pd\n", + "\n", "import sklearn\n", - "from sklearn.impute import SimpleImputer\n", "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.impute import SimpleImputer\n", "\n", "import vaep\n", - "from vaep.analyzers import analyzers\n", - "import vaep.model\n", - "import vaep.models as models\n", - "from vaep.models import ae\n", + "import vaep.nb\n", "from vaep.io import datasplits\n", + "from vaep.models import ae\n", + "import vaep.models as models\n", + "import vaep.model\n", + "from vaep.analyzers import analyzers\n", + "\n", + "\n", + "# overwriting Recorder callback with custom plot_loss\n", + "from vaep.models import plot_loss\n", + "from fastai import learner\n", + "learner.Recorder.plot_loss = plot_loss\n", + "\n", "\n", - "import vaep.nb\n", "logger = vaep.logging.setup_logger(logging.getLogger('vaep'))\n", "logger.info(\n", " \"Experiment 03 - Analysis of latent spaces and performance comparisions\")\n", @@ -502,7 +506,9 @@ "id": "2231b67e", "metadata": {}, "source": [ - "Adding a `EarlyStoppingCallback` results in an error. Potential fix in [PR3509](https://github.com/fastai/fastai/pull/3509) is not yet in current version. Try again later" + "Adding a `EarlyStoppingCallback` results in an error. Potential fix in\n", + "[PR3509](https://github.com/fastai/fastai/pull/3509) is not yet in\n", + "current version. 
Try again later" ] }, { @@ -882,7 +888,9 @@ "source": [ "### Test Datasplit\n", "\n", - "Fake NAs : Artificially created NAs. Some data was sampled and set explicitly to misssing before it was fed to the model for reconstruction." + "Fake NAs : Artificially created NAs. Some data was sampled and set\n", + "explicitly to misssing before it was fed to the model for\n", + "reconstruction." ] }, { diff --git a/project/01_1_train_VAE.py b/project/01_1_train_VAE.py index 4943c9d0f..6e5dac203 100644 --- a/project/01_1_train_VAE.py +++ b/project/01_1_train_VAE.py @@ -17,6 +17,7 @@ # # Variational Autoencoder # %% + import logging @@ -27,24 +28,27 @@ from torch.nn import Sigmoid -# overwriting Recorder callback with custom plot_loss -from vaep.models import plot_loss -from fastai import learner -learner.Recorder.plot_loss = plot_loss - import pandas as pd + import sklearn -from sklearn.impute import SimpleImputer from sklearn.preprocessing import StandardScaler +from sklearn.impute import SimpleImputer import vaep -from vaep.analyzers import analyzers -import vaep.model -import vaep.models as models -from vaep.models import ae +import vaep.nb from vaep.io import datasplits +from vaep.models import ae +import vaep.models as models +import vaep.model +from vaep.analyzers import analyzers + + +# overwriting Recorder callback with custom plot_loss +from vaep.models import plot_loss +from fastai import learner +learner.Recorder.plot_loss = plot_loss + -import vaep.nb logger = vaep.logging.setup_logger(logging.getLogger('vaep')) logger.info( "Experiment 03 - Analysis of latent spaces and performance comparisions") @@ -264,7 +268,9 @@ analysis.learn.show_training_loop() # %% [markdown] -# Adding a `EarlyStoppingCallback` results in an error. Potential fix in [PR3509](https://github.com/fastai/fastai/pull/3509) is not yet in current version. Try again later +# Adding a `EarlyStoppingCallback` results in an error. 
Potential fix in +# [PR3509](https://github.com/fastai/fastai/pull/3509) is not yet in +# current version. Try again later # %% # learn.summary() @@ -438,7 +444,9 @@ # %% [markdown] # ### Test Datasplit # -# Fake NAs : Artificially created NAs. Some data was sampled and set explicitly to misssing before it was fed to the model for reconstruction. +# Fake NAs : Artificially created NAs. Some data was sampled and set +# explicitly to misssing before it was fed to the model for +# reconstruction. # %% added_metrics = d_metrics.add_metrics(test_pred_fake_na, 'test_fake_na') diff --git a/project/01_1_transfer_NAGuideR_pred.ipynb b/project/01_1_transfer_NAGuideR_pred.ipynb index a9eff84b1..5e86f0fa5 100644 --- a/project/01_1_transfer_NAGuideR_pred.ipynb +++ b/project/01_1_transfer_NAGuideR_pred.ipynb @@ -194,18 +194,20 @@ " .drop(test_pred_fake_na.index))\n", "\n", "for fpath in entire_pred:\n", + " logger.info(f\"Load {fpath = }\")\n", " col_name = fpath.stem.split('_all_')[-1]\n", " pred = pd.read_csv(fpath, index_col=[1, 0])\n", " val_pred_fake_na[col_name] = pred\n", " fname = args.out_preds / f'pred_val_{col_name}.csv'\n", " files_out[fname.name] = fname.as_posix()\n", " val_pred_fake_na[['observed', col_name]].to_csv(fname)\n", + " logger.info(f\"Save {fname = }\")\n", "\n", " test_pred_fake_na[col_name] = pred\n", " fname = args.out_preds / f'pred_test_{col_name}.csv'\n", " files_out[fname.name] = fname.as_posix()\n", " test_pred_fake_na[['observed', col_name]].to_csv(fname)\n", - "\n", + " logger.info(f\"Save {fname = }\")\n", " # hacky, but works:\n", " pred_real_na = (pd.Series(0, index=idx_real_na, name='placeholder')\n", " .to_frame()\n", @@ -215,6 +217,7 @@ " fname = args.out_preds / f'pred_real_na_{col_name}.csv'\n", " files_out[fname.name] = fname.as_posix()\n", " pred_real_na.to_csv(fname)\n", + " logger.info(f\"Save {fname = }\")\n", "\n", "# del pred" ] @@ -255,8 +258,8 @@ "metadata": {}, "outputs": [], "source": [ - "added_metrics = 
d_metrics.add_metrics(val_pred_fake_na, 'valid_fake_na')\n", - "added_metrics" + "added_metrics = d_metrics.add_metrics(val_pred_fake_na.dropna(how='all', axis=1), 'valid_fake_na')\n", + "pd.DataFrame(added_metrics)" ] }, { @@ -274,8 +277,8 @@ "metadata": {}, "outputs": [], "source": [ - "added_metrics = d_metrics.add_metrics(test_pred_fake_na, 'test_fake_na')\n", - "added_metrics" + "added_metrics = d_metrics.add_metrics(test_pred_fake_na.dropna(how='all', axis=1), 'test_fake_na')\n", + "pd.DataFrame(added_metrics)" ] }, { diff --git a/project/01_1_transfer_NAGuideR_pred.py b/project/01_1_transfer_NAGuideR_pred.py index 3b85de686..786db0b50 100644 --- a/project/01_1_transfer_NAGuideR_pred.py +++ b/project/01_1_transfer_NAGuideR_pred.py @@ -6,7 +6,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.2 # kernelspec: # display_name: Python 3 # language: python @@ -98,18 +98,20 @@ .drop(test_pred_fake_na.index)) for fpath in entire_pred: + logger.info(f"Load {fpath = }") col_name = fpath.stem.split('_all_')[-1] pred = pd.read_csv(fpath, index_col=[1, 0]) val_pred_fake_na[col_name] = pred fname = args.out_preds / f'pred_val_{col_name}.csv' files_out[fname.name] = fname.as_posix() val_pred_fake_na[['observed', col_name]].to_csv(fname) + logger.info(f"Save {fname = }") test_pred_fake_na[col_name] = pred fname = args.out_preds / f'pred_test_{col_name}.csv' files_out[fname.name] = fname.as_posix() test_pred_fake_na[['observed', col_name]].to_csv(fname) - + logger.info(f"Save {fname = }") # hacky, but works: pred_real_na = (pd.Series(0, index=idx_real_na, name='placeholder') .to_frame() @@ -119,6 +121,7 @@ fname = args.out_preds / f'pred_real_na_{col_name}.csv' files_out[fname.name] = fname.as_posix() pred_real_na.to_csv(fname) + logger.info(f"Save {fname = }") # del pred # %% @@ -132,15 +135,15 @@ d_metrics = vaep.models.Metrics() # %% -added_metrics = d_metrics.add_metrics(val_pred_fake_na, 
'valid_fake_na') -added_metrics +added_metrics = d_metrics.add_metrics(val_pred_fake_na.dropna(how='all', axis=1), 'valid_fake_na') +pd.DataFrame(added_metrics) # %% [markdown] # ### Test Datasplit # %% -added_metrics = d_metrics.add_metrics(test_pred_fake_na, 'test_fake_na') -added_metrics +added_metrics = d_metrics.add_metrics(test_pred_fake_na.dropna(how='all', axis=1), 'test_fake_na') +pd.DataFrame(added_metrics) # %% metrics_df = vaep.models.get_df_from_nested_dict( diff --git a/project/01_2_performance_plots.ipynb b/project/01_2_performance_plots.ipynb index 76320a9d2..a4870a184 100644 --- a/project/01_2_performance_plots.ipynb +++ b/project/01_2_performance_plots.ipynb @@ -22,17 +22,20 @@ "cell_type": "code", "execution_count": null, "id": "a1e5f978-a0cb-4bb6-98d1-467eda257165", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ + "import logging\n", "import yaml\n", "import random\n", "from pathlib import Path\n", "\n", + "from IPython.display import display\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", - "import seaborn as sns\n", "\n", "import vaep\n", "import vaep.imputation\n", @@ -42,14 +45,15 @@ "from vaep.analyzers import compare_predictions\n", "import vaep.nb\n", "\n", - "pd.options.display.max_rows = 120\n", - "pd.options.display.min_rows = 50\n", + "pd.options.display.max_rows = 30\n", + "pd.options.display.min_rows = 10\n", "pd.options.display.max_colwidth = 100\n", "\n", "plt.rcParams.update({'figure.figsize': (4, 2)})\n", - "vaep.plotting.make_large_descriptors(5)\n", + "vaep.plotting.make_large_descriptors(6)\n", "\n", "logger = vaep.logging.setup_nb_logger()\n", + "logging.getLogger('fontTools').setLevel(logging.WARNING)\n", "\n", "\n", "def load_config_file(fname: Path, first_split='config_') -> dict:\n", @@ -77,7 +81,9 @@ "cell_type": "code", "execution_count": null, "id": "67f5161a", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# 
catch passed parameters\n", @@ -98,7 +104,6 @@ "execution_count": null, "id": "e6e91c6b-20d6-402c-9577-a2bfd8ba592e", "metadata": { - "lines_to_next_cell": 2, "tags": [ "parameters" ] @@ -113,8 +118,10 @@ "# Machine parsed metadata from rawfile workflow\n", "fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv'\n", "models: str = 'Median,CF,DAE,VAE' # picked models to compare (comma separated)\n", + "sel_models: str = '' # user defined comparison (comma separated)\n", "# Restrict plotting to top N methods for imputation based on error of validation data, maximum 10\n", - "plot_to_n: int = 5" + "plot_to_n: int = 5\n", + "feat_name_display: str = None # display name for feature name (e.g. 'protein group')" ] }, { @@ -129,7 +136,9 @@ "cell_type": "code", "execution_count": null, "id": "ec1509e8-6908-43c3-8909-efbb0229c324", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "args = vaep.nb.get_params(args, globals=globals())\n", @@ -140,7 +149,9 @@ "cell_type": "code", "execution_count": null, "id": "19b33594", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "args = vaep.nb.args_from_dict(args)\n", @@ -151,7 +162,9 @@ "cell_type": "code", "execution_count": null, "id": "59081f60", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "figures = {}\n", @@ -163,7 +176,8 @@ "execution_count": null, "id": "c3e124fb", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [] }, "outputs": [], "source": [ @@ -171,14 +185,20 @@ "METRIC = 'MAE'\n", "MIN_FREQ = None\n", "MODELS_PASSED = args.models.split(',')\n", - "MODELS = MODELS_PASSED.copy()" + "MODELS = MODELS_PASSED.copy()\n", + "FEAT_NAME_DISPLAY = args.feat_name_display\n", + "SEL_MODELS = None\n", + "if args.sel_models:\n", + " SEL_MODELS = args.sel_models.split(',')" ] }, { "cell_type": "code", "execution_count": null, "id": "747d5e4a", - "metadata": {}, + "metadata": { + 
"tags": [] + }, "outputs": [], "source": [ "# list(sns.color_palette().as_hex()) # string representation of colors\n", @@ -191,7 +211,9 @@ "cell_type": "code", "execution_count": null, "id": "a4ba2a48-dedc-47a9-b2ea-79936dfc48ef", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "data = datasplits.DataSplits.from_folder(\n", @@ -202,7 +224,9 @@ "cell_type": "code", "execution_count": null, "id": "611a8edf", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "fig, axes = plt.subplots(1, 2, sharey=True)\n", @@ -213,8 +237,8 @@ " title='Test split', size=1)\n", "\n", "fig.suptitle(\"Simulated missing values per sample\", size=8)\n", - "\n", - "fname = args.out_figures / 'fake_na_val_test_splits.png'\n", + "group = 1\n", + "fname = args.out_figures / f'2_{group}_fake_na_val_test_splits.png'\n", "figures[fname.stem] = fname\n", "vaep.savefig(fig, name=fname)" ] @@ -224,20 +248,21 @@ "id": "ffc6d140-f48e-4477-84f3-47a196e0a3d8", "metadata": {}, "source": [ - "## Across data completeness" + "## data completeness across entire data" ] }, { "cell_type": "code", "execution_count": null, "id": "2d043b40-5c74-40cc-a5cf-8d22ac5538a8", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# load frequency of training features...\n", "# needs to be pickle -> index.name needed\n", "freq_feat = vaep.io.datasplits.load_freq(args.data, file='freq_features.json')\n", - "\n", "freq_feat.head() # training data" ] }, @@ -245,45 +270,78 @@ "cell_type": "code", "execution_count": null, "id": "d8f8c3f4-9896-4f0e-8f93-780f90b22573", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "prop = freq_feat / len(data.train_X.index.levels[0])\n", - "prop.to_frame()" + "prop.sort_values().to_frame().plot()" + ] + }, + { + "cell_type": "markdown", + "id": "19e5adfb", + "metadata": {}, + "source": [ + "View training data in wide format" ] }, { "cell_type": "code", "execution_count": null, 
"id": "9a94ad00-78fd-4541-be5d-68391af99bd5", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "data.to_wide_format()\n", "data.train_X" ] }, + { + "cell_type": "markdown", + "id": "21102a1d", + "metadata": {}, + "source": [ + "Number of samples and features:" + ] + }, { "cell_type": "code", "execution_count": null, "id": "526626c0-98c7-4741-abae-b6fc8c218f23", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "N_SAMPLES, M_FEAT = data.train_X.shape\n", "print(f\"N samples: {N_SAMPLES:,d}, M features: {M_FEAT}\")" ] }, + { + "cell_type": "markdown", + "id": "61186a4e", + "metadata": {}, + "source": [ + "Collect outputs in excel file:" + ] + }, { "cell_type": "code", "execution_count": null, "id": "f3e738bd-79e9-4714-af4d-f3d0d2893353", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "fname = args.folder_experiment / '01_2_performance_summary.xlsx'\n", "dumps[fname.stem] = fname\n", - "writer = pd.ExcelWriter(fname)" + "writer = pd.ExcelWriter(fname)\n", + "print(f\"Saving to: {fname}\")" ] }, { @@ -291,7 +349,7 @@ "id": "bbe028c4-190d-4d50-b8a7-d109817d7b98", "metadata": {}, "source": [ - "# Model specifications\n", + "## Model specifications\n", "- used for bar plot annotations" ] }, @@ -299,7 +357,9 @@ "cell_type": "code", "execution_count": null, "id": "91bc1e12-8477-4eda-a4c2-1f132e468616", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# model_key could be used as key from config file\n", @@ -328,21 +388,14 @@ "cell_type": "code", "execution_count": null, "id": "af8c112f-fb4f-4dcd-b729-9c9558715d88", - "metadata": {}, - "outputs": [], - "source": [ - "# index name\n", - "freq_feat.index.name = data.train_X.columns.name" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8088a91f-6aaa-4b9d-b855-332d2bbf5780", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# index name\n", + 
"freq_feat.index.name = data.train_X.columns.name\n", + "# sample index name\n", "sample_index_name = data.train_X.index.name" ] }, @@ -367,7 +420,9 @@ "cell_type": "code", "execution_count": null, "id": "4efc3fe6", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "pred_val = compare_predictions.load_split_prediction_by_modelkey(\n", @@ -375,23 +430,12 @@ " split='val',\n", " model_keys=MODELS_PASSED,\n", " shared_columns=[TARGET_COL])\n", + "SAMPLE_ID, FEAT_NAME = pred_val.index.names\n", + "if not FEAT_NAME_DISPLAY:\n", + " FEAT_NAME_DISPLAY = FEAT_NAME\n", "pred_val[MODELS]" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "d5196bcc", - "metadata": {}, - "outputs": [], - "source": [ - "errors_val = (pred_val\n", - " .drop(TARGET_COL, axis=1)\n", - " .sub(pred_val[TARGET_COL], axis=0)\n", - " [MODELS])\n", - "errors_val.describe() # over all samples, and all features" - ] - }, { "cell_type": "markdown", "id": "ad1e732c-235f-4fd9-95cc-a64a2ec09f6c", @@ -403,11 +447,17 @@ { "cell_type": "code", "execution_count": null, - "id": "47df94e9-2436-4cdd-9c6f-47062bac7bee", - "metadata": {}, + "id": "d5196bcc", + "metadata": { + "tags": [] + }, "outputs": [], "source": [ - "errors_val.abs().describe() # over all samples, and all features" + "errors_val = (pred_val\n", + " .drop(TARGET_COL, axis=1)\n", + " .sub(pred_val[TARGET_COL], axis=0)\n", + " [MODELS])\n", + "errors_val # over all samples and all features" ] }, { @@ -417,14 +467,16 @@ "lines_to_next_cell": 0 }, "source": [ - "## Select top N for plotting and set colors" + "### Select top N for plotting and set colors" ] }, { "cell_type": "code", "execution_count": null, "id": "e94d9dd6-d97d-4e1c-b877-48dc1ae9c7c7", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "ORDER_MODELS = (errors_val\n", @@ -440,48 +492,54 @@ "cell_type": "code", "execution_count": null, "id": "4d6417fc", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": 
[], "source": [ "mae_stats_ordered_val = errors_val.abs().describe()[ORDER_MODELS]\n", "mae_stats_ordered_val.to_excel(writer, sheet_name='mae_stats_ordered_val', float_format='%.5f')\n", - "mae_stats_ordered_val" + "mae_stats_ordered_val.T" ] }, { "cell_type": "markdown", "id": "f5b33f93", - "metadata": { - "lines_to_next_cell": 0 - }, + "metadata": {}, "source": [ - "Hack color order, by assing CF, DAE and VAE unique colors no matter their order\n", - "Could be extended to all supported imputation methods" + "Some model have fixed colors, others are assigned randomly\n", + "\n", + "> Note\n", + ">\n", + "> 1. The order of \"new\" models is important for the color assignment.\n", + "> 2. User defined model keys for the same model with two configuration will yield different colors." ] }, { "cell_type": "code", "execution_count": null, "id": "36e078fb-2268-41dd-a069-4ca3dc5ca6cf", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ - "COLORS_TO_USE = vaep.plotting.defaults.assign_colors(list(k.upper() for k in ORDER_MODELS))" + "COLORS_TO_USE = vaep.plotting.defaults.assign_colors(list(k.upper() for k in ORDER_MODELS))\n", + "vaep.plotting.defaults.ModelColorVisualizer(ORDER_MODELS, COLORS_TO_USE)" ] }, { "cell_type": "code", "execution_count": null, "id": "a2440887-b5f2-45a1-90cd-d15ef9bfa0a7", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ - "# For top_N -> define colors\n", "TOP_N_ORDER = ORDER_MODELS[:args.plot_to_n]\n", - "\n", "TOP_N_COLOR_PALETTE = {model: color for model,\n", " color in zip(TOP_N_ORDER, COLORS_TO_USE)}\n", - "\n", "TOP_N_ORDER" ] }, @@ -497,7 +555,9 @@ "cell_type": "code", "execution_count": null, "id": "3aa7831e-ebf3-4de4-af6c-c4b2a8b00373", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "pred_val_corr = pred_val.corr()\n", @@ -510,7 +570,7 @@ "ax = vaep.plotting.add_height_to_barplot(ax)\n", "ax.set_xticklabels(ax.get_xticklabels(), rotation=45,\n", 
" horizontalalignment='right')\n", - "fname = args.out_figures / 'pred_corr_val_overall.pdf'\n", + "fname = args.out_figures / f'2_{group}_pred_corr_val_overall.pdf'\n", "figures[fname.stem] = fname\n", "vaep.savefig(ax.get_figure(), name=fname)\n", "pred_val_corr" @@ -528,7 +588,9 @@ "cell_type": "code", "execution_count": null, "id": "cea24eb1", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "corr_per_sample_val = (pred_val\n", @@ -537,7 +599,8 @@ " lambda df: df.corr().loc[TARGET_COL]\n", " )[ORDER_MODELS])\n", "\n", - "kwargs = dict(ylim=(0.7, 1), rot=90,\n", + "min_corr = int(corr_per_sample_val.min().min() * 10) / 10\n", + "kwargs = dict(ylim=(min_corr, 1), rot=90,\n", " # boxprops=dict(linewidth=1.5),\n", " flierprops=dict(markersize=3),\n", " # title='Corr. betw. fake NA and model pred. per sample on validation data',\n", @@ -545,11 +608,11 @@ "ax = corr_per_sample_val[TOP_N_ORDER].plot.box(**kwargs)\n", "ax.set_xticklabels(ax.get_xticklabels(), rotation=45,\n", " horizontalalignment='right')\n", - "fname = args.out_figures / 'pred_corr_val_per_sample.pdf'\n", + "fname = args.out_figures / f'2_{group}_pred_corr_val_per_sample.pdf'\n", "figures[fname.stem] = fname\n", "vaep.savefig(ax.get_figure(), name=fname)\n", "\n", - "fname = args.out_figures/'pred_corr_val_per_sample.xlsx'\n", + "fname = args.out_figures / f'2_{group}_pred_corr_val_per_sample.xlsx'\n", "dumps[fname.stem] = fname\n", "with pd.ExcelWriter(fname) as w:\n", " corr_per_sample_val.describe().to_excel(w, sheet_name='summary')\n", @@ -568,7 +631,9 @@ "cell_type": "code", "execution_count": null, "id": "4068d91f-856e-4aa6-9c62-5f1f77a77c4c", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "treshold = vaep.pandas.get_lower_whiskers(\n", @@ -590,26 +655,30 @@ "cell_type": "code", "execution_count": null, "id": "52298acd-73c5-4574-b7fe-8fb6544708cf", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ 
"c_error_min = 4.5\n", "mask = (errors_val[MODELS].abs() > c_error_min).any(axis=1)\n", - "errors_val.loc[mask].sort_index(level=1)" + "errors_val.loc[mask].sort_index(level=1).head()" ] }, { "cell_type": "code", "execution_count": null, "id": "570fc505-ab27-4710-b4c2-adbe72b33898", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "errors_val = errors_val.abs().groupby(\n", " freq_feat.index.name).mean() # absolute error\n", "errors_val = errors_val.join(freq_feat)\n", "errors_val = errors_val.sort_values(by=freq_feat.name, ascending=True)\n", - "errors_val" + "errors_val.head()" ] }, { @@ -624,10 +693,12 @@ "cell_type": "code", "execution_count": null, "id": "ddc98a9f", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ - "errors_val.describe() # mean of means" + "errors_val.describe()[ORDER_MODELS].T # mean of means" ] }, { @@ -635,12 +706,13 @@ "execution_count": null, "id": "af4f0e81-e9af-4763-908d-f7bdf4a4fed7", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [] }, "outputs": [], "source": [ "c_avg_error = 2\n", - "mask = (errors_val[MODELS] >= c_avg_error).any(axis=1)\n", + "mask = (errors_val[TOP_N_ORDER] >= c_avg_error).any(axis=1)\n", "errors_val.loc[mask]" ] }, @@ -657,19 +729,23 @@ "cell_type": "code", "execution_count": null, "id": "df6923c5-e6f7-4a14-aa8e-d55bf66cf817", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "fig, ax = plt.subplots(figsize=(8, 3))\n", - "ax, errors_binned = vaep.plotting.errors.plot_errors_binned(\n", + "ax, errors_binned = vaep.plotting.errors.plot_errors_by_median(\n", " pred_val[\n", - " [TARGET_COL]+TOP_N_ORDER\n", + " [TARGET_COL] + TOP_N_ORDER\n", " ],\n", + " feat_medians=data.train_X.median(),\n", " ax=ax,\n", + " feat_name=FEAT_NAME_DISPLAY,\n", " palette=TOP_N_COLOR_PALETTE,\n", " metric_name=METRIC,)\n", "ax.set_ylabel(f\"Average error ({METRIC})\")\n", - "fname = args.out_figures / 
'errors_binned_by_int_val.pdf'\n", + "fname = args.out_figures / f'2_{group}_errors_binned_by_feat_median_val.pdf'\n", "figures[fname.stem] = fname\n", "vaep.savefig(ax.get_figure(), name=fname)" ] @@ -678,7 +754,9 @@ "cell_type": "code", "execution_count": null, "id": "6122a309-5435-44d2-a6f8-8e9d46b5afae", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "errors_binned.head()\n", @@ -699,7 +777,9 @@ "cell_type": "code", "execution_count": null, "id": "1dc848c6-d39e-4092-9b72-3f6a0e1949e2", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "pred_test = compare_predictions.load_split_prediction_by_modelkey(\n", @@ -708,7 +788,7 @@ " model_keys=MODELS_PASSED,\n", " shared_columns=[TARGET_COL])\n", "pred_test = pred_test.join(freq_feat, on=freq_feat.index.name)\n", - "SAMPLE_ID, FEAT_NAME = pred_test.index.names\n", + "\n", "pred_test" ] }, @@ -724,7 +804,9 @@ "cell_type": "code", "execution_count": null, "id": "8bce941c", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "errors_test_mae = vaep.pandas.calc_errors.get_absolute_error(\n", @@ -738,7 +820,9 @@ "cell_type": "code", "execution_count": null, "id": "ff722dae", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "mae_stats_ordered_test.to_excel(writer, sheet_name='mae_stats_ordered_test', float_format='%.5f')" @@ -748,15 +832,17 @@ "cell_type": "code", "execution_count": null, "id": "629eddae", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "cp_mean_perf = pd.concat([\n", " mae_stats_ordered_val.loc['mean'],\n", " mae_stats_ordered_test.loc['mean'],\n", "],\n", - "axis=1,\n", - "keys=['val', 'test']\n", + " axis=1,\n", + " keys=['val', 'test']\n", ").sort_values(by='val')\n", "cp_mean_perf.to_excel(writer, sheet_name='cp_mean_perf', float_format='%.5f')\n", "cp_mean_perf" @@ -766,7 +852,9 @@ "cell_type": "code", "execution_count": null, "id": "f639cd92", - 
"metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "writer.close()" @@ -779,24 +867,26 @@ "lines_to_next_cell": 0 }, "source": [ - "## Intensity distribution as histogram\n", - "plot top 4 models" + "### Intensity distribution as histogram\n", + "Plot top 4 models predictions for intensities in test data" ] }, { "cell_type": "code", "execution_count": null, "id": "99f7951f", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "min_max = vaep.plotting.data.min_max(pred_test[TARGET_COL])\n", "top_n = 4\n", - "fig, axes = plt.subplots(ncols=4, figsize=(8, 2), sharey=True)\n", + "fig, axes = plt.subplots(ncols=top_n, figsize=(8, 2), sharey=True)\n", "\n", "for model, color, ax in zip(\n", - " ORDER_MODELS[:4],\n", - " COLORS_TO_USE[:4],\n", + " ORDER_MODELS[:top_n],\n", + " COLORS_TO_USE[:top_n],\n", " axes):\n", "\n", " ax, _ = vaep.plotting.data.plot_histogram_intensities(\n", @@ -812,13 +902,13 @@ " ax=ax,\n", " alpha=0.5,\n", " )\n", - " _ = [(l.set_rotation(90))\n", - " for l in ax.get_xticklabels()]\n", + " _ = [(l_.set_rotation(90))\n", + " for l_ in ax.get_xticklabels()]\n", " ax.legend()\n", "\n", "axes[0].set_ylabel('Number of observations')\n", "\n", - "fname = args.out_figures / f'intensity_binned_top_{top_n}_models_test.pdf'\n", + "fname = args.out_figures / f'2_{group}_intensity_binned_top_{top_n}_models_test.pdf'\n", "figures[fname.stem] = fname\n", "vaep.savefig(fig, name=fname)" ] @@ -835,7 +925,9 @@ "cell_type": "code", "execution_count": null, "id": "b42efaec-4556-45e9-a813-66da159e771c", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "pred_test_corr = pred_test.corr()\n", @@ -847,7 +939,7 @@ "ax = vaep.plotting.add_height_to_barplot(ax)\n", "ax.set_xticklabels(ax.get_xticklabels(), rotation=45,\n", " horizontalalignment='right')\n", - "fname = args.out_figures / 'pred_corr_test_overall.pdf'\n", + "fname = args.out_figures / 
f'2_{group}_pred_corr_test_overall.pdf'\n", "figures[fname.stem] = fname\n", "vaep.savefig(ax.get_figure(), name=fname)\n", "pred_test_corr" @@ -865,7 +957,9 @@ "cell_type": "code", "execution_count": null, "id": "ee088a12-ee60-45d1-bf5a-e07b76413c56", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "corr_per_sample_test = (pred_test\n", @@ -886,9 +980,12 @@ "cell_type": "code", "execution_count": null, "id": "825efac2", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ + "# ! add minimum\n", "kwargs = dict(ylim=(0.7, 1), rot=90,\n", " flierprops=dict(markersize=3),\n", " # title='Corr. betw. fake NA and model predictions per sample on test data',\n", @@ -899,7 +996,7 @@ " .box(**kwargs))\n", "ax.set_xticklabels(ax.get_xticklabels(), rotation=45,\n", " horizontalalignment='right')\n", - "fname = args.out_figures / 'pred_corr_test_per_sample.pdf'\n", + "fname = args.out_figures / f'2_{group}_pred_corr_test_per_sample.pdf'\n", "figures[fname.stem] = fname\n", "vaep.savefig(ax.get_figure(), name=fname)\n", "\n", @@ -921,7 +1018,9 @@ "cell_type": "code", "execution_count": null, "id": "77b846e1-00b8-4f61-b5cd-cdc1692787de", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "treshold = vaep.pandas.get_lower_whiskers(\n", @@ -935,7 +1034,9 @@ "cell_type": "code", "execution_count": null, "id": "7bff3764-5063-4399-a182-3ba795fbe99d", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "feature_names = pred_test.index.levels[-1]\n", @@ -948,7 +1049,9 @@ "cell_type": "code", "execution_count": null, "id": "c6145bd0-9b59-490e-9a0e-89475c18663b", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "options = random.sample(set(feature_names), 1)\n", @@ -967,7 +1070,9 @@ "cell_type": "code", "execution_count": null, "id": "6ee92128-4f78-45e9-a607-8e6c4163181a", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": 
[ "corr_per_feat_test = pred_test.groupby(FEAT_NAME).aggregate(\n", @@ -983,7 +1088,9 @@ "cell_type": "code", "execution_count": null, "id": "8e45b324-eaa0-43e4-b28b-b0f839f91955", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "corr_per_feat_test.loc[too_few_obs].dropna(thresh=3, axis=0)" @@ -993,12 +1100,14 @@ "cell_type": "code", "execution_count": null, "id": "4c9a9ecc-526a-41ac-8a4d-d3a389ea6c07", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "kwargs = dict(rot=90,\n", " flierprops=dict(markersize=1),\n", - " ylabel=f'correlation per {FEAT_NAME}')\n", + " ylabel=f'correlation per {FEAT_NAME_DISPLAY}')\n", "ax = (corr_per_feat_test\n", " .loc[~too_few_obs, TOP_N_ORDER]\n", " .plot\n", @@ -1006,7 +1115,7 @@ " )\n", "_ = ax.set_xticklabels(ax.get_xticklabels(), rotation=45,\n", " horizontalalignment='right')\n", - "fname = args.out_figures / 'pred_corr_test_per_feat.pdf'\n", + "fname = args.out_figures / f'2_{group}_pred_corr_test_per_feat.pdf'\n", "figures[fname.stem] = fname\n", "vaep.savefig(ax.get_figure(), name=fname)\n", "dumps[fname.stem] = fname.with_suffix('.xlsx')\n", @@ -1020,7 +1129,9 @@ "cell_type": "code", "execution_count": null, "id": "b38ffdfc-b1b0-4ae0-a47d-5881c534881f", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "feat_count_test = data.test_y.stack().groupby(FEAT_NAME).count()\n", @@ -1033,7 +1144,8 @@ "execution_count": null, "id": "9993d145-8b78-4769-838a-01721900a3c7", "metadata": { - "lines_to_next_cell": 0 + "lines_to_next_cell": 0, + "tags": [] }, "outputs": [], "source": [ @@ -1073,7 +1185,9 @@ "cell_type": "code", "execution_count": null, "id": "829ebc82-587d-47c6-8422-03c610855211", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "metrics = vaep.models.Metrics()\n", @@ -1087,7 +1201,9 @@ "cell_type": "code", "execution_count": null, "id": "f8269d00-9048-4e70-9f39-dab95e103c32", - "metadata": {}, + 
"metadata": { + "tags": [] + }, "outputs": [], "source": [ "n_in_comparison = int(test_metrics.loc['N'].unique()[0])\n", @@ -1098,7 +1214,9 @@ "cell_type": "code", "execution_count": null, "id": "096083d1-bcd2-44a2-94fe-a89b7d204b66", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "_to_plot = test_metrics.loc[METRIC].to_frame().T\n", @@ -1111,13 +1229,18 @@ "execution_count": null, "id": "05a259ef-48bd-4dd0-8dfe-9e2750579383", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [] }, "outputs": [], "source": [ - "text = model_configs[[\"latent_dim\", \"hidden_layers\"]].apply(\n", - " build_text,\n", - " axis=1)\n", + "try:\n", + " text = model_configs[[\"latent_dim\", \"hidden_layers\"]].apply(\n", + " build_text,\n", + " axis=1)\n", + "except KeyError:\n", + " logger.warning(\"No PIMMS models in comparsion. Using empty text\")\n", + " text = pd.Series('', index=model_configs.columns)\n", "\n", "_to_plot.loc[\"text\"] = text\n", "_to_plot = _to_plot.fillna('')\n", @@ -1128,20 +1251,23 @@ "cell_type": "code", "execution_count": null, "id": "d3dd53c0-4068-4eac-a5c3-7aaa608e5f8f", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "fig, ax = plt.subplots(figsize=(4, 2))\n", - "ax = _to_plot.loc[[feature_names.name]].plot.bar(rot=0,\n", - " ylabel=f\"{METRIC} for {feature_names.name} ({n_in_comparison:,} intensities)\",\n", - " # title=f'performance on test data (based on {n_in_comparison:,} measurements)',\n", - " color=COLORS_TO_USE,\n", - " ax=ax,\n", - " width=.8)\n", + "ax = _to_plot.loc[[feature_names.name]].plot.bar(\n", + " rot=0,\n", + " ylabel=f\"{METRIC} for {FEAT_NAME_DISPLAY} ({n_in_comparison:,} intensities)\",\n", + " # title=f'performance on test data (based on {n_in_comparison:,} measurements)',\n", + " color=COLORS_TO_USE,\n", + " ax=ax,\n", + " width=.8)\n", "ax = vaep.plotting.add_height_to_barplot(ax, size=5)\n", "ax = vaep.plotting.add_text_to_barplot(ax, 
_to_plot.loc[\"text\"], size=5)\n", "ax.set_xticklabels([])\n", - "fname = args.out_figures / 'performance_test.pdf'\n", + "fname = args.out_figures / f'2_{group}_performance_test.pdf'\n", "figures[fname.stem] = fname\n", "vaep.savefig(fig, name=fname)" ] @@ -1151,7 +1277,8 @@ "execution_count": null, "id": "ef92551d", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [] }, "outputs": [], "source": [ @@ -1169,29 +1296,33 @@ "id": "d88c21c7", "metadata": {}, "source": [ - "Plot error by median feature intensity" + "### Plot error by median feature intensity" ] }, { "cell_type": "code", "execution_count": null, "id": "588f7bf3", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ + "vaep.plotting.make_large_descriptors(7)\n", "fig, ax = plt.subplots(figsize=(8, 2))\n", "\n", "ax, errors_binned = vaep.plotting.errors.plot_errors_by_median(\n", " pred=pred_test[\n", - " [TARGET_COL]+TOP_N_ORDER\n", + " [TARGET_COL] + TOP_N_ORDER\n", " ],\n", " feat_medians=data.train_X.median(),\n", " ax=ax,\n", + " feat_name=FEAT_NAME_DISPLAY,\n", " metric_name=METRIC,\n", " palette=COLORS_TO_USE\n", ")\n", - "\n", - "fname = args.out_figures / 'errors_binned_by_feat_medians.pdf'\n", + "vaep.plotting.make_large_descriptors(6)\n", + "fname = args.out_figures / f'2_{group}_test_errors_binned_by_feat_medians.pdf'\n", "figures[fname.stem] = fname\n", "vaep.savefig(ax.get_figure(), name=fname)\n", "\n", @@ -1204,7 +1335,9 @@ "cell_type": "code", "execution_count": null, "id": "b13ecd37", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "(errors_binned\n", @@ -1215,6 +1348,110 @@ " .sort_values(by=METRIC))" ] }, + { + "cell_type": "markdown", + "id": "26370a1a", + "metadata": {}, + "source": [ + "### Custom model selection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "712faf9a", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "if SEL_MODELS:\n", + " 
metrics = vaep.models.Metrics()\n", + " test_metrics = metrics.add_metrics(\n", + " pred_test[['observed', *SEL_MODELS]], key='test data')\n", + " test_metrics = pd.DataFrame(test_metrics)[SEL_MODELS]\n", + " test_metrics\n", + "\n", + " n_in_comparison = int(test_metrics.loc['N'].unique()[0])\n", + " n_in_comparison\n", + "\n", + " _to_plot = test_metrics.loc[METRIC].to_frame().T\n", + " _to_plot.index = [feature_names.name]\n", + " _to_plot\n", + "\n", + " try:\n", + " text = model_configs[[\"latent_dim\", \"hidden_layers\"]].apply(\n", + " build_text,\n", + " axis=1)\n", + " except KeyError:\n", + " logger.warning(\"No PIMMS models in comparsion. Using empty text\")\n", + " text = pd.Series('', index=model_configs.columns)\n", + "\n", + " _to_plot.loc[\"text\"] = text\n", + " _to_plot = _to_plot.fillna('')\n", + " _to_plot\n", + "\n", + " fig, ax = plt.subplots(figsize=(4, 2))\n", + " ax = _to_plot.loc[[feature_names.name]].plot.bar(\n", + " rot=0,\n", + " ylabel=f\"{METRIC} for {FEAT_NAME_DISPLAY} ({n_in_comparison:,} intensities)\",\n", + " # title=f'performance on test data (based on {n_in_comparison:,} measurements)',\n", + " color=vaep.plotting.defaults.assign_colors(\n", + " list(k.upper() for k in SEL_MODELS)),\n", + " ax=ax,\n", + " width=.8)\n", + " ax = vaep.plotting.add_height_to_barplot(ax, size=5)\n", + " ax = vaep.plotting.add_text_to_barplot(ax, _to_plot.loc[\"text\"], size=5)\n", + " ax.set_xticklabels([])\n", + " fname = args.out_figures / f'2_{group}_performance_test_sel.pdf'\n", + " figures[fname.stem] = fname\n", + " vaep.savefig(fig, name=fname)\n", + "\n", + " dumps[fname.stem] = fname.with_suffix('.csv')\n", + " _to_plot_long = _to_plot.T\n", + " _to_plot_long = _to_plot_long.rename(\n", + " {feature_names.name: 'metric_value'}, axis=1)\n", + " _to_plot_long['data level'] = feature_names.name\n", + " _to_plot_long = _to_plot_long.set_index('data level', append=True)\n", + " _to_plot_long.to_csv(fname.with_suffix('.csv'))" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "2a578570", + "metadata": {}, + "outputs": [], + "source": [ + "# custom selection\n", + "if SEL_MODELS:\n", + " vaep.plotting.make_large_descriptors(7)\n", + " fig, ax = plt.subplots(figsize=(8, 2))\n", + "\n", + " ax, errors_binned = vaep.plotting.errors.plot_errors_by_median(\n", + " pred=pred_test[\n", + " [TARGET_COL] + SEL_MODELS\n", + " ],\n", + " feat_medians=data.train_X.median(),\n", + " ax=ax,\n", + " metric_name=METRIC,\n", + " feat_name=FEAT_NAME_DISPLAY,\n", + " palette=vaep.plotting.defaults.assign_colors(\n", + " list(k.upper() for k in SEL_MODELS))\n", + " )\n", + " # ax.set_ylim(0, 1.5)\n", + " # for text in ax.legend().get_texts():\n", + " # text.set_fontsize(6)\n", + " fname = args.out_figures / f'2_{group}_test_errors_binned_by_feat_medians_sel.pdf'\n", + " figures[fname.stem] = fname\n", + " vaep.savefig(ax.get_figure(), name=fname)\n", + " dumps[fname.stem] = fname.with_suffix('.csv')\n", + " errors_binned.to_csv(fname.with_suffix('.csv'))\n", + " vaep.plotting.make_large_descriptors(6)\n", + " # ax.xaxis.set_tick_params(rotation=0) # horizontal\n", + " display(errors_binned)" + ] + }, { "cell_type": "markdown", "id": "549236ca-9e89-47aa-905c-c97a45d4dc2b", @@ -1229,19 +1466,21 @@ "cell_type": "code", "execution_count": null, "id": "3339df97-230f-4cbd-b61d-7aef9a7495e8", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "fig, ax = plt.subplots(figsize=(8, 2))\n", "ax, errors_binned = vaep.plotting.errors.plot_errors_binned(\n", " pred_test[\n", - " [TARGET_COL]+TOP_N_ORDER\n", + " [TARGET_COL] + TOP_N_ORDER\n", " ],\n", " ax=ax,\n", " palette=TOP_N_COLOR_PALETTE,\n", " metric_name=METRIC,\n", ")\n", - "fname = args.out_figures / 'errors_binned_by_int_test.pdf'\n", + "fname = args.out_figures / f'2_{group}_test_errors_binned_by_int.pdf'\n", "figures[fname.stem] = fname\n", "vaep.savefig(ax.get_figure(), name=fname)" ] @@ -1250,7 +1489,9 @@ "cell_type": "code", 
"execution_count": null, "id": "095f64eb-1c4f-47ae-9a01-d5b05a795779", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "dumps[fname.stem] = fname.with_suffix('.csv')\n", @@ -1272,7 +1513,9 @@ "cell_type": "code", "execution_count": null, "id": "c8f67ae1-40e9-4c2a-af0a-41e627703518", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "figures" @@ -1282,19 +1525,13 @@ "cell_type": "code", "execution_count": null, "id": "b08b442f", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "dumps" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c2f3ebc5", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -1302,7 +1539,7 @@ "formats": "ipynb,py:percent" }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -1316,7 +1553,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.8.17" }, "toc-autonumbering": true, "vscode": { diff --git a/project/01_2_performance_plots.py b/project/01_2_performance_plots.py index 2060bba1d..59e626918 100644 --- a/project/01_2_performance_plots.py +++ b/project/01_2_performance_plots.py @@ -6,9 +6,9 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: -# display_name: Python 3 +# display_name: Python 3 (ipykernel) # language: python # name: python3 # --- @@ -27,14 +27,15 @@ # - top N based on validation data # %% +import logging import yaml import random from pathlib import Path +from IPython.display import display import matplotlib.pyplot as plt import numpy as np import pandas as pd -import seaborn as sns import vaep import vaep.imputation @@ -44,14 +45,15 @@ from vaep.analyzers import compare_predictions import vaep.nb -pd.options.display.max_rows = 120 -pd.options.display.min_rows = 50 
+pd.options.display.max_rows = 30 +pd.options.display.min_rows = 10 pd.options.display.max_colwidth = 100 plt.rcParams.update({'figure.figsize': (4, 2)}) -vaep.plotting.make_large_descriptors(5) +vaep.plotting.make_large_descriptors(6) logger = vaep.logging.setup_nb_logger() +logging.getLogger('fontTools').setLevel(logging.WARNING) def load_config_file(fname: Path, first_split='config_') -> dict: @@ -92,9 +94,10 @@ def build_text(s): # Machine parsed metadata from rawfile workflow fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv' models: str = 'Median,CF,DAE,VAE' # picked models to compare (comma separated) +sel_models: str = '' # user defined comparison (comma separated) # Restrict plotting to top N methods for imputation based on error of validation data, maximum 10 plot_to_n: int = 5 - +feat_name_display: str = None # display name for feature name (e.g. 'protein group') # %% [markdown] # Some argument transformations @@ -117,6 +120,10 @@ def build_text(s): MIN_FREQ = None MODELS_PASSED = args.models.split(',') MODELS = MODELS_PASSED.copy() +FEAT_NAME_DISPLAY = args.feat_name_display +SEL_MODELS = None +if args.sel_models: + SEL_MODELS = args.sel_models.split(',') # %% @@ -138,40 +145,49 @@ def build_text(s): title='Test split', size=1) fig.suptitle("Simulated missing values per sample", size=8) - -fname = args.out_figures / 'fake_na_val_test_splits.png' +group = 1 +fname = args.out_figures / f'2_{group}_fake_na_val_test_splits.png' figures[fname.stem] = fname vaep.savefig(fig, name=fname) # %% [markdown] -# ## Across data completeness +# ## data completeness across entire data # %% # load frequency of training features... 
# needs to be pickle -> index.name needed freq_feat = vaep.io.datasplits.load_freq(args.data, file='freq_features.json') - freq_feat.head() # training data # %% prop = freq_feat / len(data.train_X.index.levels[0]) -prop.to_frame() +prop.sort_values().to_frame().plot() + +# %% [markdown] +# View training data in wide format # %% data.to_wide_format() data.train_X +# %% [markdown] +# Number of samples and features: + # %% N_SAMPLES, M_FEAT = data.train_X.shape print(f"N samples: {N_SAMPLES:,d}, M features: {M_FEAT}") +# %% [markdown] +# Collect outputs in excel file: + # %% fname = args.folder_experiment / '01_2_performance_summary.xlsx' dumps[fname.stem] = fname writer = pd.ExcelWriter(fname) +print(f"Saving to: {fname}") # %% [markdown] -# # Model specifications +# ## Model specifications # - used for bar plot annotations # %% @@ -194,9 +210,7 @@ def build_text(s): # %% # index name freq_feat.index.name = data.train_X.columns.name - -# %% -# index name +# sample index name sample_index_name = data.train_X.index.name # %% [markdown] @@ -213,23 +227,23 @@ def build_text(s): split='val', model_keys=MODELS_PASSED, shared_columns=[TARGET_COL]) +SAMPLE_ID, FEAT_NAME = pred_val.index.names +if not FEAT_NAME_DISPLAY: + FEAT_NAME_DISPLAY = FEAT_NAME pred_val[MODELS] +# %% [markdown] +# Describe absolute error + # %% errors_val = (pred_val .drop(TARGET_COL, axis=1) .sub(pred_val[TARGET_COL], axis=0) [MODELS]) -errors_val.describe() # over all samples, and all features +errors_val # over all samples and all features # %% [markdown] -# Describe absolute error - -# %% -errors_val.abs().describe() # over all samples, and all features - -# %% [markdown] -# ## Select top N for plotting and set colors +# ### Select top N for plotting and set colors # %% ORDER_MODELS = (errors_val .abs() @@ -242,21 +256,24 @@ def build_text(s): # %% mae_stats_ordered_val = errors_val.abs().describe()[ORDER_MODELS] mae_stats_ordered_val.to_excel(writer, sheet_name='mae_stats_ordered_val', 
float_format='%.5f') -mae_stats_ordered_val +mae_stats_ordered_val.T # %% [markdown] -# Hack color order, by assing CF, DAE and VAE unique colors no matter their order -# Could be extended to all supported imputation methods +# Some model have fixed colors, others are assigned randomly +# +# > Note +# > +# > 1. The order of "new" models is important for the color assignment. +# > 2. User defined model keys for the same model with two configuration will yield different colors. + # %% COLORS_TO_USE = vaep.plotting.defaults.assign_colors(list(k.upper() for k in ORDER_MODELS)) +vaep.plotting.defaults.ModelColorVisualizer(ORDER_MODELS, COLORS_TO_USE) # %% -# For top_N -> define colors TOP_N_ORDER = ORDER_MODELS[:args.plot_to_n] - TOP_N_COLOR_PALETTE = {model: color for model, color in zip(TOP_N_ORDER, COLORS_TO_USE)} - TOP_N_ORDER # %% [markdown] @@ -273,7 +290,7 @@ def build_text(s): ax = vaep.plotting.add_height_to_barplot(ax) ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right') -fname = args.out_figures / 'pred_corr_val_overall.pdf' +fname = args.out_figures / f'2_{group}_pred_corr_val_overall.pdf' figures[fname.stem] = fname vaep.savefig(ax.get_figure(), name=fname) pred_val_corr @@ -288,7 +305,8 @@ def build_text(s): lambda df: df.corr().loc[TARGET_COL] )[ORDER_MODELS]) -kwargs = dict(ylim=(0.7, 1), rot=90, +min_corr = int(corr_per_sample_val.min().min() * 10) / 10 +kwargs = dict(ylim=(min_corr, 1), rot=90, # boxprops=dict(linewidth=1.5), flierprops=dict(markersize=3), # title='Corr. betw. fake NA and model pred. 
per sample on validation data', @@ -296,11 +314,11 @@ def build_text(s): ax = corr_per_sample_val[TOP_N_ORDER].plot.box(**kwargs) ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right') -fname = args.out_figures / 'pred_corr_val_per_sample.pdf' +fname = args.out_figures / f'2_{group}_pred_corr_val_per_sample.pdf' figures[fname.stem] = fname vaep.savefig(ax.get_figure(), name=fname) -fname = args.out_figures/'pred_corr_val_per_sample.xlsx' +fname = args.out_figures / f'2_{group}_pred_corr_val_per_sample.xlsx' dumps[fname.stem] = fname with pd.ExcelWriter(fname) as w: corr_per_sample_val.describe().to_excel(w, sheet_name='summary') @@ -322,24 +340,24 @@ def build_text(s): # %% c_error_min = 4.5 mask = (errors_val[MODELS].abs() > c_error_min).any(axis=1) -errors_val.loc[mask].sort_index(level=1) +errors_val.loc[mask].sort_index(level=1).head() # %% errors_val = errors_val.abs().groupby( freq_feat.index.name).mean() # absolute error errors_val = errors_val.join(freq_feat) errors_val = errors_val.sort_values(by=freq_feat.name, ascending=True) -errors_val +errors_val.head() # %% [markdown] # Some interpolated features are missing # %% -errors_val.describe() # mean of means +errors_val.describe()[ORDER_MODELS].T # mean of means # %% c_avg_error = 2 -mask = (errors_val[MODELS] >= c_avg_error).any(axis=1) +mask = (errors_val[TOP_N_ORDER] >= c_avg_error).any(axis=1) errors_val.loc[mask] @@ -349,15 +367,17 @@ def build_text(s): # %% fig, ax = plt.subplots(figsize=(8, 3)) -ax, errors_binned = vaep.plotting.errors.plot_errors_binned( +ax, errors_binned = vaep.plotting.errors.plot_errors_by_median( pred_val[ - [TARGET_COL]+TOP_N_ORDER + [TARGET_COL] + TOP_N_ORDER ], + feat_medians=data.train_X.median(), ax=ax, + feat_name=FEAT_NAME_DISPLAY, palette=TOP_N_COLOR_PALETTE, metric_name=METRIC,) ax.set_ylabel(f"Average error ({METRIC})") -fname = args.out_figures / 'errors_binned_by_int_val.pdf' +fname = args.out_figures / 
f'2_{group}_errors_binned_by_feat_median_val.pdf' figures[fname.stem] = fname vaep.savefig(ax.get_figure(), name=fname) @@ -377,7 +397,7 @@ def build_text(s): model_keys=MODELS_PASSED, shared_columns=[TARGET_COL]) pred_test = pred_test.join(freq_feat, on=freq_feat.index.name) -SAMPLE_ID, FEAT_NAME = pred_test.index.names + pred_test # %% [markdown] @@ -398,8 +418,8 @@ def build_text(s): mae_stats_ordered_val.loc['mean'], mae_stats_ordered_test.loc['mean'], ], -axis=1, -keys=['val', 'test'] + axis=1, + keys=['val', 'test'] ).sort_values(by='val') cp_mean_perf.to_excel(writer, sheet_name='cp_mean_perf', float_format='%.5f') cp_mean_perf @@ -408,16 +428,16 @@ def build_text(s): writer.close() # %% [markdown] -# ## Intensity distribution as histogram -# plot top 4 models +# ### Intensity distribution as histogram +# Plot top 4 models predictions for intensities in test data # %% min_max = vaep.plotting.data.min_max(pred_test[TARGET_COL]) top_n = 4 -fig, axes = plt.subplots(ncols=4, figsize=(8, 2), sharey=True) +fig, axes = plt.subplots(ncols=top_n, figsize=(8, 2), sharey=True) for model, color, ax in zip( - ORDER_MODELS[:4], - COLORS_TO_USE[:4], + ORDER_MODELS[:top_n], + COLORS_TO_USE[:top_n], axes): ax, _ = vaep.plotting.data.plot_histogram_intensities( @@ -433,13 +453,13 @@ def build_text(s): ax=ax, alpha=0.5, ) - _ = [(l.set_rotation(90)) - for l in ax.get_xticklabels()] + _ = [(l_.set_rotation(90)) + for l_ in ax.get_xticklabels()] ax.legend() axes[0].set_ylabel('Number of observations') -fname = args.out_figures / f'intensity_binned_top_{top_n}_models_test.pdf' +fname = args.out_figures / f'2_{group}_intensity_binned_top_{top_n}_models_test.pdf' figures[fname.stem] = fname vaep.savefig(fig, name=fname) @@ -456,7 +476,7 @@ def build_text(s): ax = vaep.plotting.add_height_to_barplot(ax) ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right') -fname = args.out_figures / 'pred_corr_test_overall.pdf' +fname = args.out_figures / 
f'2_{group}_pred_corr_test_overall.pdf' figures[fname.stem] = fname vaep.savefig(ax.get_figure(), name=fname) pred_test_corr @@ -479,6 +499,7 @@ def build_text(s): corr_per_sample_test.loc[~too_few_obs].describe() # %% +# # ! add minimum kwargs = dict(ylim=(0.7, 1), rot=90, flierprops=dict(markersize=3), # title='Corr. betw. fake NA and model predictions per sample on test data', @@ -489,7 +510,7 @@ def build_text(s): .box(**kwargs)) ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right') -fname = args.out_figures / 'pred_corr_test_per_sample.pdf' +fname = args.out_figures / f'2_{group}_pred_corr_test_per_sample.pdf' figures[fname.stem] = fname vaep.savefig(ax.get_figure(), name=fname) @@ -536,7 +557,7 @@ def build_text(s): # %% kwargs = dict(rot=90, flierprops=dict(markersize=1), - ylabel=f'correlation per {FEAT_NAME}') + ylabel=f'correlation per {FEAT_NAME_DISPLAY}') ax = (corr_per_feat_test .loc[~too_few_obs, TOP_N_ORDER] .plot @@ -544,7 +565,7 @@ def build_text(s): ) _ = ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right') -fname = args.out_figures / 'pred_corr_test_per_feat.pdf' +fname = args.out_figures / f'2_{group}_pred_corr_test_per_feat.pdf' figures[fname.stem] = fname vaep.savefig(ax.get_figure(), name=fname) dumps[fname.stem] = fname.with_suffix('.xlsx') @@ -601,9 +622,13 @@ def highlight_min(s, color, tolerence=0.00001): _to_plot # %% -text = model_configs[["latent_dim", "hidden_layers"]].apply( - build_text, - axis=1) +try: + text = model_configs[["latent_dim", "hidden_layers"]].apply( + build_text, + axis=1) +except KeyError: + logger.warning("No PIMMS models in comparsion. 
Using empty text") + text = pd.Series('', index=model_configs.columns) _to_plot.loc["text"] = text _to_plot = _to_plot.fillna('') @@ -612,16 +637,17 @@ def highlight_min(s, color, tolerence=0.00001): # %% fig, ax = plt.subplots(figsize=(4, 2)) -ax = _to_plot.loc[[feature_names.name]].plot.bar(rot=0, - ylabel=f"{METRIC} for {feature_names.name} ({n_in_comparison:,} intensities)", - # title=f'performance on test data (based on {n_in_comparison:,} measurements)', - color=COLORS_TO_USE, - ax=ax, - width=.8) +ax = _to_plot.loc[[feature_names.name]].plot.bar( + rot=0, + ylabel=f"{METRIC} for {FEAT_NAME_DISPLAY} ({n_in_comparison:,} intensities)", + # title=f'performance on test data (based on {n_in_comparison:,} measurements)', + color=COLORS_TO_USE, + ax=ax, + width=.8) ax = vaep.plotting.add_height_to_barplot(ax, size=5) ax = vaep.plotting.add_text_to_barplot(ax, _to_plot.loc["text"], size=5) ax.set_xticklabels([]) -fname = args.out_figures / 'performance_test.pdf' +fname = args.out_figures / f'2_{group}_performance_test.pdf' figures[fname.stem] = fname vaep.savefig(fig, name=fname) @@ -636,22 +662,24 @@ def highlight_min(s, color, tolerence=0.00001): # %% [markdown] -# Plot error by median feature intensity +# ### Plot error by median feature intensity # %% +vaep.plotting.make_large_descriptors(7) fig, ax = plt.subplots(figsize=(8, 2)) ax, errors_binned = vaep.plotting.errors.plot_errors_by_median( pred=pred_test[ - [TARGET_COL]+TOP_N_ORDER + [TARGET_COL] + TOP_N_ORDER ], feat_medians=data.train_X.median(), ax=ax, + feat_name=FEAT_NAME_DISPLAY, metric_name=METRIC, palette=COLORS_TO_USE ) - -fname = args.out_figures / 'errors_binned_by_feat_medians.pdf' +vaep.plotting.make_large_descriptors(6) +fname = args.out_figures / f'2_{group}_test_errors_binned_by_feat_medians.pdf' figures[fname.stem] = fname vaep.savefig(ax.get_figure(), name=fname) @@ -667,6 +695,90 @@ def highlight_min(s, color, tolerence=0.00001): .loc[ORDER_MODELS[0]] .sort_values(by=METRIC)) +# %% 
[markdown] +# ### Custom model selection + +# %% +if SEL_MODELS: + metrics = vaep.models.Metrics() + test_metrics = metrics.add_metrics( + pred_test[['observed', *SEL_MODELS]], key='test data') + test_metrics = pd.DataFrame(test_metrics)[SEL_MODELS] + test_metrics + + n_in_comparison = int(test_metrics.loc['N'].unique()[0]) + n_in_comparison + + _to_plot = test_metrics.loc[METRIC].to_frame().T + _to_plot.index = [feature_names.name] + _to_plot + + try: + text = model_configs[["latent_dim", "hidden_layers"]].apply( + build_text, + axis=1) + except KeyError: + logger.warning("No PIMMS models in comparsion. Using empty text") + text = pd.Series('', index=model_configs.columns) + + _to_plot.loc["text"] = text + _to_plot = _to_plot.fillna('') + _to_plot + + fig, ax = plt.subplots(figsize=(4, 2)) + ax = _to_plot.loc[[feature_names.name]].plot.bar( + rot=0, + ylabel=f"{METRIC} for {FEAT_NAME_DISPLAY} ({n_in_comparison:,} intensities)", + # title=f'performance on test data (based on {n_in_comparison:,} measurements)', + color=vaep.plotting.defaults.assign_colors( + list(k.upper() for k in SEL_MODELS)), + ax=ax, + width=.8) + ax = vaep.plotting.add_height_to_barplot(ax, size=5) + ax = vaep.plotting.add_text_to_barplot(ax, _to_plot.loc["text"], size=5) + ax.set_xticklabels([]) + fname = args.out_figures / f'2_{group}_performance_test_sel.pdf' + figures[fname.stem] = fname + vaep.savefig(fig, name=fname) + + dumps[fname.stem] = fname.with_suffix('.csv') + _to_plot_long = _to_plot.T + _to_plot_long = _to_plot_long.rename( + {feature_names.name: 'metric_value'}, axis=1) + _to_plot_long['data level'] = feature_names.name + _to_plot_long = _to_plot_long.set_index('data level', append=True) + _to_plot_long.to_csv(fname.with_suffix('.csv')) + + +# %% +# custom selection +if SEL_MODELS: + vaep.plotting.make_large_descriptors(7) + fig, ax = plt.subplots(figsize=(8, 2)) + + ax, errors_binned = vaep.plotting.errors.plot_errors_by_median( + pred=pred_test[ + [TARGET_COL] + SEL_MODELS + 
], + feat_medians=data.train_X.median(), + ax=ax, + metric_name=METRIC, + feat_name=FEAT_NAME_DISPLAY, + palette=vaep.plotting.defaults.assign_colors( + list(k.upper() for k in SEL_MODELS)) + ) + # ax.set_ylim(0, 1.5) + # for text in ax.legend().get_texts(): + # text.set_fontsize(6) + fname = args.out_figures / f'2_{group}_test_errors_binned_by_feat_medians_sel.pdf' + figures[fname.stem] = fname + vaep.savefig(ax.get_figure(), name=fname) + dumps[fname.stem] = fname.with_suffix('.csv') + errors_binned.to_csv(fname.with_suffix('.csv')) + vaep.plotting.make_large_descriptors(6) + # ax.xaxis.set_tick_params(rotation=0) # horizontal + display(errors_binned) + # %% [markdown] # ### Error by non-decimal number of intensity # @@ -676,13 +788,13 @@ def highlight_min(s, color, tolerence=0.00001): fig, ax = plt.subplots(figsize=(8, 2)) ax, errors_binned = vaep.plotting.errors.plot_errors_binned( pred_test[ - [TARGET_COL]+TOP_N_ORDER + [TARGET_COL] + TOP_N_ORDER ], ax=ax, palette=TOP_N_COLOR_PALETTE, metric_name=METRIC, ) -fname = args.out_figures / 'errors_binned_by_int_test.pdf' +fname = args.out_figures / f'2_{group}_test_errors_binned_by_int.pdf' figures[fname.stem] = fname vaep.savefig(ax.get_figure(), name=fname) @@ -698,5 +810,3 @@ def highlight_min(s, color, tolerence=0.00001): # %% dumps - -# %% diff --git a/project/02_1_aggregate_metrics.py.py b/project/02_1_aggregate_metrics.py.py index 3c7945550..ea11f334a 100644 --- a/project/02_1_aggregate_metrics.py.py +++ b/project/02_1_aggregate_metrics.py.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python diff --git a/project/02_1_join_metrics.py.ipynb b/project/02_1_join_metrics.py.ipynb index 387a722f6..561f06525 100644 --- a/project/02_1_join_metrics.py.ipynb +++ b/project/02_1_join_metrics.py.ipynb @@ -38,9 +38,7 @@ "cell_type": "code", "execution_count": 4, "id": "df472356", - 
"metadata": { - "lines_to_next_cell": 1 - }, + "metadata": {}, "outputs": [], "source": [ "filepath_out" @@ -52,7 +50,7 @@ "metadata": {}, "outputs": [], "source": [ - "## Example \n", + "## Example\n", "\n", "- first file" ] @@ -64,6 +62,8 @@ "metadata": {}, "outputs": [], "source": [ + "\n", + "\n", "def process(fpath: str) -> pd.DataFrame:\n", " df = pd.read_csv(fpath, index_col=POS_INDEX_COL, header=list(range(N_HEADER_COLS)))\n", " return df\n", diff --git a/project/02_1_join_metrics.py.py b/project/02_1_join_metrics.py.py index ec2aae537..8b395c187 100644 --- a/project/02_1_join_metrics.py.py +++ b/project/02_1_join_metrics.py.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -30,11 +30,13 @@ filepath_out # %% [markdown] -# ## Example +# ## Example # # - first file # %% + + def process(fpath: str) -> pd.DataFrame: df = pd.read_csv(fpath, index_col=POS_INDEX_COL, header=list(range(N_HEADER_COLS))) return df diff --git a/project/02_2_aggregate_configs.py.ipynb b/project/02_2_aggregate_configs.py.ipynb index 2d36b6b43..cdb9d77fb 100644 --- a/project/02_2_aggregate_configs.py.ipynb +++ b/project/02_2_aggregate_configs.py.ipynb @@ -20,10 +20,12 @@ "source": [ "from pathlib import Path\n", "import pandas as pd\n", - "pd.options.display.max_columns = 30 \n", "\n", - "from vaep.models.collect_dumps import collect_configs\n", "from vaep.logging import setup_nb_logger\n", + "from vaep.models.collect_dumps import collect_configs\n", + "\n", + "pd.options.display.max_columns = 30\n", + "\n", "logger = setup_nb_logger()" ] }, diff --git a/project/02_2_aggregate_configs.py.py b/project/02_2_aggregate_configs.py.py index f820ef03e..dc8ba3a3a 100644 --- a/project/02_2_aggregate_configs.py.py +++ b/project/02_2_aggregate_configs.py.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# 
jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -20,10 +20,12 @@ # %% from pathlib import Path import pandas as pd -pd.options.display.max_columns = 30 -from vaep.models.collect_dumps import collect_configs from vaep.logging import setup_nb_logger +from vaep.models.collect_dumps import collect_configs + +pd.options.display.max_columns = 30 + logger = setup_nb_logger() # %% diff --git a/project/02_2_join_configs.py.ipynb b/project/02_2_join_configs.py.ipynb index 8b51442fb..4d1b871f5 100644 --- a/project/02_2_join_configs.py.ipynb +++ b/project/02_2_join_configs.py.ipynb @@ -103,7 +103,7 @@ ], "metadata": { "kernelspec": { - "display_name": "vaep", + "display_name": "Python 3", "language": "python", "name": "python3" }, diff --git a/project/02_2_join_configs.py.py b/project/02_2_join_configs.py.py index 36a18ccc4..d8381e119 100644 --- a/project/02_2_join_configs.py.py +++ b/project/02_2_join_configs.py.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.0 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python diff --git a/project/02_3_grid_search_analysis.ipynb b/project/02_3_grid_search_analysis.ipynb index b47d75474..440b99b21 100644 --- a/project/02_3_grid_search_analysis.ipynb +++ b/project/02_3_grid_search_analysis.ipynb @@ -21,18 +21,18 @@ "import plotly.express as px\n", "import matplotlib\n", "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", "\n", - "\n", + "import vaep.plotting.plotly as px_vaep\n", + "from vaep.analyzers import compare_predictions\n", + "from vaep import sampling\n", + "from vaep.io import datasplits\n", + "import vaep.utils\n", + "import vaep.pandas\n", + "import vaep.io\n", "import vaep.nb\n", "matplotlib.rcParams['figure.figsize'] = [12.0, 6.0]\n", "\n", - "import vaep.io\n", - "import vaep.pandas\n", - "import vaep.utils\n", - "from vaep.io import datasplits\n", - 
"from vaep import sampling\n", - "from vaep.analyzers import compare_predictions\n", - "import vaep.plotting.plotly as px_vaep\n", "\n", "pd.options.display.max_columns = 45\n", "pd.options.display.max_rows = 100\n", @@ -70,8 +70,8 @@ }, "outputs": [], "source": [ - "metrics_csv:str = \"path/to/all_metrics.csv\" # file path to metrics\n", - "configs_csv:str = \"path/to/all_configs.csv\" # file path to configs (\"meta data\")" + "metrics_csv: str = \"path/to/all_metrics.csv\" # file path to metrics\n", + "configs_csv: str = \"path/to/all_configs.csv\" # file path to configs (\"meta data\")" ] }, { @@ -170,7 +170,7 @@ "source": [ "# ToDo: integrate as parameters\n", "metric_columns = ['MSE', 'MAE']\n", - "model_keys = metrics.stack('model').index.levels[-1].unique().to_list() # not used\n", + "model_keys = metrics.stack('model').index.levels[-1].unique().to_list() # not used\n", "subset = metrics.columns.levels[0][0]\n", "print(f\"{subset = }\")" ] @@ -204,16 +204,16 @@ "meta['hidden_layers'] = (meta\n", " .loc[meta['hidden_layers'].notna(), 'hidden_layers']\n", " .apply(lambda x: tuple(eval(x)))\n", - ")\n", + " )\n", "meta['n_hidden_layers'] = (meta\n", " .loc[meta['hidden_layers'].notna(), 'hidden_layers']\n", " .apply(len)\n", - ")\n", + " )\n", "meta['n_hidden_layers'] = (meta\n", " ['n_hidden_layers']\n", " .fillna(0)\n", " .astype(int)\n", - ")\n", + " )\n", "meta.loc[meta['hidden_layers'].isna(), 'hidden_layers'] = None\n", "meta = meta.set_index('id')\n", "meta" @@ -225,7 +225,8 @@ "id": "b4dd468f-8995-403d-a389-6c4e4e912cd5", "metadata": {}, "source": [ - "Batch size for collab models depends on a factor (as the data in long format has roughly N samples * M features entries)." + "Batch size for collab models depends on a factor (as the data in long\n", + "format has roughly N samples * M features entries)." 
] }, { @@ -271,19 +272,19 @@ "source": [ "# ToDo: To make it cleaner: own config for each model (interpolated and median)\n", "metrics_styled = (metrics\n", - " .set_index(\n", - " pd.MultiIndex\n", - " .from_frame(\n", - " meta\n", - " .loc[metrics.index, ['latent_dim', 'hidden_layers', 'batch_size']]\n", - " # .loc[metrics.index]\n", - " )\n", - " )\n", - " .sort_index()\n", - " .stack('model')\n", - " .drop_duplicates()\n", - " .style.background_gradient(cmap)\n", - ")\n", + " .set_index(\n", + " pd.MultiIndex\n", + " .from_frame(\n", + " meta\n", + " .loc[metrics.index, ['latent_dim', 'hidden_layers', 'batch_size']]\n", + " # .loc[metrics.index]\n", + " )\n", + " )\n", + " .sort_index()\n", + " .stack('model')\n", + " .drop_duplicates()\n", + " .style.background_gradient(cmap)\n", + " )\n", "\n", "metrics = metrics_styled.data\n", "metrics_styled" @@ -354,7 +355,7 @@ "metadata": {}, "outputs": [], "source": [ - "metrics_long = pd.read_csv(path_metrics, index_col=[0], header=[0,1,2])\n", + "metrics_long = pd.read_csv(path_metrics, index_col=[0], header=[0, 1, 2])\n", "# columns_names = ['subset', 'data_split', 'model', 'metric_name']\n", "columns_names = list(metrics_long.columns.names)\n", "metrics_long.sample(5) if len(metrics_long) > 15 else metrics_long" @@ -395,13 +396,13 @@ "outputs": [], "source": [ "metrics_prop = (metrics_long\n", - " .loc[:, pd.IndexSlice[:, :, 'prop']]\n", - " .stack(['data_split', 'model'])\n", - " .reset_index()\n", - " .drop_duplicates()\n", - " .set_index(['id', 'data_split', 'model'])\n", - " .astype(int)\n", - " )\n", + " .loc[:, pd.IndexSlice[:, :, 'prop']]\n", + " .stack(['data_split', 'model'])\n", + " .reset_index()\n", + " .drop_duplicates()\n", + " .set_index(['id', 'data_split', 'model'])\n", + " .astype(int)\n", + " )\n", "metrics_prop" ] }, @@ -427,7 +428,7 @@ " .to_frame('metric_value')\n", " .reset_index('metric_name')\n", " .join(metrics_N)\n", - ")\n", + " )\n", "metrics_long" ] }, @@ -450,7 +451,7 @@ "metrics_long 
= (metrics_long\n", " .reset_index(['data_split'])\n", " .join(meta.set_index('model', append=True))\n", - " ).reset_index('model')\n", + " ).reset_index('model')\n", "# metrics_long.index.name = 'id'\n", "metrics_long.sample(5)" ] @@ -549,7 +550,7 @@ "id": "c4607c64-2e90-4ed6-b337-8e210d7c37de", "metadata": {}, "source": [ - "# Collection of Performance plots \n", + "# Collection of Performance plots\n", "\n", "- specify `labels_dict` for plotly plotting\n", "\n" @@ -607,7 +608,6 @@ }, "outputs": [], "source": [ - "import seaborn as sns\n", "plt.rcParams['figure.figsize'] = (8, 4)\n", "plt.rcParams['lines.linewidth'] = 2\n", "plt.rcParams['lines.markersize'] = 3\n", @@ -620,9 +620,9 @@ " x='n_params',\n", " y='metric_value',\n", " col=\"data_split\",\n", - " col_order = col_order,\n", + " col_order=col_order,\n", " row=\"metric_name\",\n", - " row_order = row_order,\n", + " row_order=row_order,\n", " hue=\"model\",\n", " # style=\"day\",\n", " palette=vaep.plotting.defaults.color_model_mapping,\n", @@ -633,11 +633,11 @@ "fg.fig.get_size_inches()\n", "\n", "(ax_00, ax_01), (ax_10, ax_11) = fg.axes\n", - "ax_00.set_ylabel('MAE')\n", - "ax_10.set_ylabel('MSE')\n", - "_ = ax_00.set_title('validation data')\n", - "_ = ax_01.set_title('test data')\n", - "ax_10.set_xlabel('number of parameters')\n", + "ax_00.set_ylabel(row_order[0])\n", + "ax_10.set_ylabel(row_order[1])\n", + "_ = ax_00.set_title('validation data') # col_order[0]\n", + "_ = ax_01.set_title('test data') # col_order[1]\n", + "ax_10.set_xlabel('number of parameters') # n_params\n", "ax_11.set_xlabel('number of parameters')\n", "ax_10.xaxis.set_major_formatter(\"{x:,.0f}\")\n", "ax_11.xaxis.set_major_formatter(\"{x:,.0f}\")\n", @@ -686,6 +686,7 @@ " yaxis={'title': {'standoff': 6}})\n", " return fig\n", "\n", + "\n", "dataset = \"test_fake_na\"\n", "fig = plot_by_params(dataset)\n", "fname = FOLDER / f\"hyperpar_{dataset}_results_by_parameters.pdf\"\n", @@ -730,8 +731,8 @@ "source": [ "group_by = 
['data_split', 'latent_dim', 'metric_name', 'model']\n", "metrics_long_sel_min = metrics_long.reset_index(\n", - " ).groupby(by=group_by\n", - " ).apply(lambda df: df.sort_values(by='metric_value').iloc[0])\n", + ").groupby(by=group_by\n", + " ).apply(lambda df: df.sort_values(by='metric_value').iloc[0])\n", "metrics_long_sel_min" ] }, @@ -813,7 +814,7 @@ "source": [ "dataset = 'valid_fake_na'\n", "group_by = ['data_split', 'metric_name', 'model', 'latent_dim']\n", - "METRIC = 'MAE' # params.metric\n", + "METRIC = 'MAE' # params.metric\n", "selected = (metrics_long\n", " .reset_index()\n", " .groupby(by=group_by)\n", @@ -864,11 +865,11 @@ "outputs": [], "source": [ "min_latent = (selected\n", - " .loc[METRIC]\n", - " .loc[model_with_latent]\n", - " .groupby(level='latent_dim')\n", - " .agg({'metric_value': 'mean'})\n", - " .sort_values('metric_value')\n", + " .loc[METRIC]\n", + " .loc[model_with_latent]\n", + " .groupby(level='latent_dim')\n", + " .agg({'metric_value': 'mean'})\n", + " .sort_values('metric_value')\n", " )\n", "min_latent" ] @@ -1034,10 +1035,10 @@ " .value_counts()\n", " .sort_index()\n", " .plot(style='.',\n", - " xlabel='number of samples',\n", - " ylabel='observations')\n", - ")\n", - "vaep.savefig(ax.get_figure(), files_out[f'n_obs_error_counts_{dataset}.pdf'])" + " xlabel='number of samples',\n", + " ylabel='observations')\n", + " )\n", + "vaep.savefig(ax.get_figure(), files_out[f'n_obs_error_counts_{dataset}.pdf'])" ] }, { @@ -1117,8 +1118,8 @@ "msg_annotation = f\"(Latend dim: {min_latent}, No. 
of feat: {M_feat}, window_size: {window_size})\"\n", "print(msg_annotation)\n", "\n", - "files_out[f'best_models_ld_{min_latent}_rolling_errors_by_freq'] = (FOLDER /\n", - " f'best_models_ld_{min_latent}_rolling_errors_by_freq')\n", + "files_out[f'best_models_ld_{min_latent}_rolling_errors_by_freq'] = (\n", + " FOLDER / f'best_models_ld_{min_latent}_rolling_errors_by_freq')\n", "vaep.savefig(\n", " ax.get_figure(),\n", " name=files_out[f'best_models_ld_{min_latent}_rolling_errors_by_freq'])" @@ -1165,8 +1166,8 @@ " )\n", "fig = px_vaep.apply_default_layout(fig)\n", "fig.update_layout(legend_title_text='') # remove legend title\n", - "files_out[f'best_models_ld_{min_latent}_errors_by_freq_plotly.html'] = (FOLDER /\n", - " f'best_models_ld_{min_latent}_errors_by_freq_plotly.html')\n", + "files_out[f'best_models_ld_{min_latent}_errors_by_freq_plotly.html'] = (\n", + " FOLDER / f'best_models_ld_{min_latent}_errors_by_freq_plotly.html')\n", "fig.write_html(\n", " files_out[f'best_models_ld_{min_latent}_errors_by_freq_plotly.html'])\n", "fig" @@ -1205,8 +1206,8 @@ " # title='mean error for features averaged for each frequency'\n", " xlim=(FREQ_MIN, freq_feat.max())\n", ")\n", - "files_out[f'best_models_ld_{min_latent}_errors_by_freq_averaged'] = (FOLDER /\n", - " f'best_models_ld_{min_latent}_errors_by_freq_averaged')\n", + "files_out[f'best_models_ld_{min_latent}_errors_by_freq_averaged'] = (\n", + " FOLDER / f'best_models_ld_{min_latent}_errors_by_freq_averaged')\n", "vaep.savefig(\n", " ax.get_figure(),\n", " files_out[f'best_models_ld_{min_latent}_errors_by_freq_averaged'])" diff --git a/project/02_3_grid_search_analysis.py b/project/02_3_grid_search_analysis.py index cd36ae19a..24c0ce374 100644 --- a/project/02_3_grid_search_analysis.py +++ b/project/02_3_grid_search_analysis.py @@ -6,7 +6,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: 
python @@ -22,18 +22,18 @@ import plotly.express as px import matplotlib import matplotlib.pyplot as plt +import seaborn as sns - +import vaep.plotting.plotly as px_vaep +from vaep.analyzers import compare_predictions +from vaep import sampling +from vaep.io import datasplits +import vaep.utils +import vaep.pandas +import vaep.io import vaep.nb matplotlib.rcParams['figure.figsize'] = [12.0, 6.0] -import vaep.io -import vaep.pandas -import vaep.utils -from vaep.io import datasplits -from vaep import sampling -from vaep.analyzers import compare_predictions -import vaep.plotting.plotly as px_vaep pd.options.display.max_columns = 45 pd.options.display.max_rows = 100 @@ -48,8 +48,8 @@ # papermill parameters: # %% tags=["parameters"] -metrics_csv:str = "path/to/all_metrics.csv" # file path to metrics -configs_csv:str = "path/to/all_configs.csv" # file path to configs ("meta data") +metrics_csv: str = "path/to/all_metrics.csv" # file path to metrics +configs_csv: str = "path/to/all_configs.csv" # file path to configs ("meta data") # %% try: @@ -92,7 +92,7 @@ # %% # ToDo: integrate as parameters metric_columns = ['MSE', 'MAE'] -model_keys = metrics.stack('model').index.levels[-1].unique().to_list() # not used +model_keys = metrics.stack('model').index.levels[-1].unique().to_list() # not used subset = metrics.columns.levels[0][0] print(f"{subset = }") @@ -107,22 +107,23 @@ meta['hidden_layers'] = (meta .loc[meta['hidden_layers'].notna(), 'hidden_layers'] .apply(lambda x: tuple(eval(x))) -) + ) meta['n_hidden_layers'] = (meta .loc[meta['hidden_layers'].notna(), 'hidden_layers'] .apply(len) -) + ) meta['n_hidden_layers'] = (meta ['n_hidden_layers'] .fillna(0) .astype(int) -) + ) meta.loc[meta['hidden_layers'].isna(), 'hidden_layers'] = None meta = meta.set_index('id') meta # %% [markdown] -# Batch size for collab models depends on a factor (as the data in long format has roughly N samples * M features entries). 
+# Batch size for collab models depends on a factor (as the data in long +# format has roughly N samples * M features entries). # %% [markdown] # ## Colorcoded metrics @@ -141,19 +142,19 @@ # %% # ToDo: To make it cleaner: own config for each model (interpolated and median) metrics_styled = (metrics - .set_index( - pd.MultiIndex - .from_frame( - meta - .loc[metrics.index, ['latent_dim', 'hidden_layers', 'batch_size']] - # .loc[metrics.index] - ) - ) - .sort_index() - .stack('model') - .drop_duplicates() - .style.background_gradient(cmap) -) + .set_index( + pd.MultiIndex + .from_frame( + meta + .loc[metrics.index, ['latent_dim', 'hidden_layers', 'batch_size']] + # .loc[metrics.index] + ) + ) + .sort_index() + .stack('model') + .drop_duplicates() + .style.background_gradient(cmap) + ) metrics = metrics_styled.data metrics_styled @@ -189,7 +190,7 @@ # Rebuild metrics from dictionary # %% -metrics_long = pd.read_csv(path_metrics, index_col=[0], header=[0,1,2]) +metrics_long = pd.read_csv(path_metrics, index_col=[0], header=[0, 1, 2]) # columns_names = ['subset', 'data_split', 'model', 'metric_name'] columns_names = list(metrics_long.columns.names) metrics_long.sample(5) if len(metrics_long) > 15 else metrics_long @@ -210,13 +211,13 @@ # %% metrics_prop = (metrics_long - .loc[:, pd.IndexSlice[:, :, 'prop']] - .stack(['data_split', 'model']) - .reset_index() - .drop_duplicates() - .set_index(['id', 'data_split', 'model']) - .astype(int) - ) + .loc[:, pd.IndexSlice[:, :, 'prop']] + .stack(['data_split', 'model']) + .reset_index() + .drop_duplicates() + .set_index(['id', 'data_split', 'model']) + .astype(int) + ) metrics_prop # %% [markdown] @@ -229,7 +230,7 @@ .to_frame('metric_value') .reset_index('metric_name') .join(metrics_N) -) + ) metrics_long # %% [markdown] @@ -239,7 +240,7 @@ metrics_long = (metrics_long .reset_index(['data_split']) .join(meta.set_index('model', append=True)) - ).reset_index('model') + ).reset_index('model') # metrics_long.index.name = 'id' 
metrics_long.sample(5) @@ -286,7 +287,7 @@ logger.info(f"Saved metrics in long format: {fname}") # %% [markdown] -# # Collection of Performance plots +# # Collection of Performance plots # # - specify `labels_dict` for plotly plotting # @@ -316,7 +317,6 @@ hover_data['metric_value'] = ':.4f' # %% -import seaborn as sns plt.rcParams['figure.figsize'] = (8, 4) plt.rcParams['lines.linewidth'] = 2 plt.rcParams['lines.markersize'] = 3 @@ -329,9 +329,9 @@ x='n_params', y='metric_value', col="data_split", - col_order = col_order, + col_order=col_order, row="metric_name", - row_order = row_order, + row_order=row_order, hue="model", # style="day", palette=vaep.plotting.defaults.color_model_mapping, @@ -344,9 +344,9 @@ (ax_00, ax_01), (ax_10, ax_11) = fg.axes ax_00.set_ylabel(row_order[0]) ax_10.set_ylabel(row_order[1]) -_ = ax_00.set_title('validation data') # col_order[0] -_ = ax_01.set_title('test data') # col_order[1] -ax_10.set_xlabel('number of parameters') # n_params +_ = ax_00.set_title('validation data') # col_order[0] +_ = ax_01.set_title('test data') # col_order[1] +ax_10.set_xlabel('number of parameters') # n_params ax_11.set_xlabel('number of parameters') ax_10.xaxis.set_major_formatter("{x:,.0f}") ax_11.xaxis.set_major_formatter("{x:,.0f}") @@ -388,6 +388,7 @@ def plot_by_params(data_split: str = '', subset: str = ''): yaxis={'title': {'standoff': 6}}) return fig + dataset = "test_fake_na" fig = plot_by_params(dataset) fname = FOLDER / f"hyperpar_{dataset}_results_by_parameters.pdf" @@ -412,8 +413,8 @@ def plot_by_params(data_split: str = '', subset: str = ''): # %% group_by = ['data_split', 'latent_dim', 'metric_name', 'model'] metrics_long_sel_min = metrics_long.reset_index( - ).groupby(by=group_by - ).apply(lambda df: df.sort_values(by='metric_value').iloc[0]) +).groupby(by=group_by + ).apply(lambda df: df.sort_values(by='metric_value').iloc[0]) metrics_long_sel_min @@ -469,7 +470,7 @@ def get_plotly_figure(dataset: str, x='latent_dim'): # %% dataset = 
'valid_fake_na' group_by = ['data_split', 'metric_name', 'model', 'latent_dim'] -METRIC = 'MAE' # params.metric +METRIC = 'MAE' # params.metric selected = (metrics_long .reset_index() .groupby(by=group_by) @@ -494,11 +495,11 @@ def get_plotly_figure(dataset: str, x='latent_dim'): # %% min_latent = (selected - .loc[METRIC] - .loc[model_with_latent] - .groupby(level='latent_dim') - .agg({'metric_value': 'mean'}) - .sort_values('metric_value') + .loc[METRIC] + .loc[model_with_latent] + .groupby(level='latent_dim') + .agg({'metric_value': 'mean'}) + .sort_values('metric_value') ) min_latent @@ -581,10 +582,10 @@ def get_plotly_figure(dataset: str, x='latent_dim'): .value_counts() .sort_index() .plot(style='.', - xlabel='number of samples', - ylabel='observations') -) -vaep.savefig(ax.get_figure(), files_out[f'n_obs_error_counts_{dataset}.pdf']) + xlabel='number of samples', + ylabel='observations') + ) +vaep.savefig(ax.get_figure(), files_out[f'n_obs_error_counts_{dataset}.pdf']) # %% ax = errors.plot.scatter('freq', 'n_obs') @@ -621,8 +622,8 @@ def get_plotly_figure(dataset: str, x='latent_dim'): msg_annotation = f"(Latend dim: {min_latent}, No. 
of feat: {M_feat}, window_size: {window_size})" print(msg_annotation) -files_out[f'best_models_ld_{min_latent}_rolling_errors_by_freq'] = (FOLDER / - f'best_models_ld_{min_latent}_rolling_errors_by_freq') +files_out[f'best_models_ld_{min_latent}_rolling_errors_by_freq'] = ( + FOLDER / f'best_models_ld_{min_latent}_rolling_errors_by_freq') vaep.savefig( ax.get_figure(), name=files_out[f'best_models_ld_{min_latent}_rolling_errors_by_freq']) @@ -647,8 +648,8 @@ def get_plotly_figure(dataset: str, x='latent_dim'): ) fig = px_vaep.apply_default_layout(fig) fig.update_layout(legend_title_text='') # remove legend title -files_out[f'best_models_ld_{min_latent}_errors_by_freq_plotly.html'] = (FOLDER / - f'best_models_ld_{min_latent}_errors_by_freq_plotly.html') +files_out[f'best_models_ld_{min_latent}_errors_by_freq_plotly.html'] = ( + FOLDER / f'best_models_ld_{min_latent}_errors_by_freq_plotly.html') fig.write_html( files_out[f'best_models_ld_{min_latent}_errors_by_freq_plotly.html']) fig @@ -673,8 +674,8 @@ def get_plotly_figure(dataset: str, x='latent_dim'): # title='mean error for features averaged for each frequency' xlim=(FREQ_MIN, freq_feat.max()) ) -files_out[f'best_models_ld_{min_latent}_errors_by_freq_averaged'] = (FOLDER / - f'best_models_ld_{min_latent}_errors_by_freq_averaged') +files_out[f'best_models_ld_{min_latent}_errors_by_freq_averaged'] = ( + FOLDER / f'best_models_ld_{min_latent}_errors_by_freq_averaged') vaep.savefig( ax.get_figure(), files_out[f'best_models_ld_{min_latent}_errors_by_freq_averaged']) diff --git a/project/02_4_best_models_over_all_data.ipynb b/project/02_4_best_models_over_all_data.ipynb index 17c3f0dde..f252973d2 100644 --- a/project/02_4_best_models_over_all_data.ipynb +++ b/project/02_4_best_models_over_all_data.ipynb @@ -78,7 +78,7 @@ "source": [ "# snakemake.params.folder\n", "try:\n", - " models = snakemake.params.models # snakefile would need to be\n", + " models = snakemake.params.models # snakefile would need to be\n", "except 
AttributeError:\n", " models = ['Median', 'interpolated', 'CF', 'DAE', 'VAE']\n", "models" @@ -155,9 +155,9 @@ "outputs": [], "source": [ "_unique = metrics_long[\"data level\"].unique()\n", - "order_categories['data level'] = [l for l in order_categories['data level'] if l in _unique] #ensure predefined order\n", + "order_categories['data level'] = [l for l in order_categories['data level'] if l in _unique] # ensure predefined order\n", "_unique = metrics_long['model'].unique()\n", - "order_categories['model'] = [m for m in order_categories['model'] if m in _unique] #ensure predefined order\n", + "order_categories['model'] = [m for m in order_categories['model'] if m in _unique] # ensure predefined order\n", "\n", "semi_supervised = [m for m in ['CF', 'DAE', 'VAE'] if m in _unique]\n", "reference = [m for m in ['median', 'interpolated'] if m in _unique]\n", @@ -216,11 +216,11 @@ "source": [ "# select best model of top N with least parameters\n", "sel_on_val = (sel_on_val\n", - " .groupby(by=group_by)\n", - " .apply(\n", - " lambda df: df.sort_values(by='n_params').iloc[0]\n", - " )\n", - " ).loc[\n", + " .groupby(by=group_by)\n", + " .apply(\n", + " lambda df: df.sort_values(by='n_params').iloc[0]\n", + " )\n", + " ).loc[\n", " pd.IndexSlice[dataset, IDX_ORDER[0], 'MAE', IDX_ORDER[1]],\n", " selected_cols]\n", "sel_on_val.to_excel(writer, sheet_name=f'selected')\n", @@ -248,13 +248,13 @@ "idx = sel_on_val.droplevel(level='data_split').index\n", "sel_on_val = sel_on_val.reset_index(['latent_dim', 'hidden_layers', 'id'])\n", "\n", - "test_results = ( metrics_long\n", - " .query('data_split == \"test_fake_na\"')\n", - " .reset_index().set_index(idx.names)\n", - " .loc[idx]\n", - " .reset_index(['latent_dim', 'hidden_layers', 'id'])\n", - " .set_index('data_split', append=True)\n", - ")[selected_cols]\n", + "test_results = (metrics_long\n", + " .query('data_split == \"test_fake_na\"')\n", + " .reset_index().set_index(idx.names)\n", + " .loc[idx]\n", + " 
.reset_index(['latent_dim', 'hidden_layers', 'id'])\n", + " .set_index('data_split', append=True)\n", + " )[selected_cols]\n", "test_results" ] }, @@ -329,7 +329,7 @@ "metadata": {}, "outputs": [], "source": [ - "### Validation data results " + "### Validation data results" ] }, { @@ -344,7 +344,7 @@ "_to_plot = sel_on_val.reset_index(level=['data level', 'model']).loc[[('valid_fake_na', METRIC), ]]\n", "\n", "_to_plot = _to_plot.set_index(['data level', 'model'])[['metric_value', 'text']]\n", - "_to_plot = _to_plot.loc[IDX_ORDER,:]\n", + "_to_plot = _to_plot.loc[IDX_ORDER, :]\n", "_to_plot.index.name = ''\n", "# text = test_results['text'].unstack().loc[IDX_ORDER].unstack()\n", "_to_plot = _to_plot['metric_value'].unstack().loc[IDX_ORDER]\n", @@ -383,7 +383,8 @@ "fname = 'best_models_1_val_plotly'\n", "_to_plot = sel_on_val.reset_index(level=['data level', 'model']).loc[[('valid_fake_na', METRIC), ]]\n", "_to_plot = _to_plot.set_index(['data level', 'model'])\n", - "_to_plot[['metric_value', 'latent_dim', 'hidden_layers', 'text']] = _to_plot[['metric_value', 'latent_dim', 'hidden_layers', 'text']].fillna('-')\n", + "_to_plot[['metric_value', 'latent_dim', 'hidden_layers', 'text']] = _to_plot[[\n", + " 'metric_value', 'latent_dim', 'hidden_layers', 'text']].fillna('-')\n", "\n", "_to_plot = _to_plot.loc[pd.IndexSlice[IDX_ORDER], :]\n", "_to_plot.to_csv(FOLDER / f\"{fname}.csv\")\n", @@ -401,7 +402,7 @@ "fig = px.bar(_to_plot.reset_index(),\n", " x='data level',\n", " y='metric_value',\n", - " hover_data={'N': ':,d', 'n_params': ':,d'}, # format hover data\n", + " hover_data={'N': ':,d', 'n_params': ':,d'}, # format hover data\n", " color='model',\n", " barmode=\"group\",\n", " text='text',\n", @@ -439,8 +440,8 @@ " f' & metric_name == \"{METRIC}\"')\n", "\n", "best_on_average = metrics_long_sel.reset_index(\n", - " ).groupby(by=group_by\n", - " )['metric_value'].mean().sort_values().reset_index(level=group_by[1:])\n", + ").groupby(by=group_by\n", + " 
)['metric_value'].mean().sort_values().reset_index(level=group_by[1:])\n", "best_on_average" ] }, @@ -477,21 +478,21 @@ "data_split = 'test_fake_na'\n", "\n", "metrics_long_sel_test = metrics_long.query(f'data_split == \"{data_split}\"'\n", - " f' & metric_name == \"{METRIC}\"')\n", + " f' & metric_name == \"{METRIC}\"')\n", "\n", "to_plot = (metrics_long_sel_test\n", - " .reset_index().set_index(group_by)\n", - " .loc[best_on_average.index]\n", - " .reset_index().set_index(['model', 'data level'])\n", - " .loc[pd.IndexSlice[order_categories['model'], order_categories['data level']], :])\n", + " .reset_index().set_index(group_by)\n", + " .loc[best_on_average.index]\n", + " .reset_index().set_index(['model', 'data level'])\n", + " .loc[pd.IndexSlice[order_categories['model'], order_categories['data level']], :])\n", "\n", "\n", "to_plot = to_plot.reset_index()\n", "to_plot['model annotated'] = to_plot['model'] + ' - ' + to_plot['text']\n", - "order_model = to_plot['model annotated'].drop_duplicates().to_list() # model name with annotation\n", + "order_model = to_plot['model annotated'].drop_duplicates().to_list() # model name with annotation\n", "\n", "to_plot = to_plot.drop_duplicates(subset=['model', 'data level', 'metric_value'])\n", - "to_plot.to_csv(FOLDER /f\"{fname}.csv\")\n", + "to_plot.to_csv(FOLDER / f\"{fname}.csv\")\n", "to_plot" ] }, @@ -502,7 +503,7 @@ "metadata": {}, "outputs": [], "source": [ - "figsize= (10,8) # None # (10,8)\n", + "figsize = (10, 8) # None # (10,8)\n", "fig, ax = plt.subplots(figsize=figsize)\n", "to_plot.columns.name = ''\n", "ax = (to_plot\n", @@ -517,7 +518,19 @@ " width=.8,\n", " ax=ax,\n", " # colormap=\"Paired\",\n", - " color = ['#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a','#ffff99','#b15928']\n", + " color=[\n", + " '#a6cee3',\n", + " '#1f78b4',\n", + " '#b2df8a',\n", + " '#33a02c',\n", + " '#fb9a99',\n", + " '#e31a1c',\n", + " '#fdbf6f',\n", + " '#ff7f00',\n", + " 
'#cab2d6',\n", + " '#6a3d9a',\n", + " '#ffff99',\n", + " '#b15928']\n", " )\n", " )\n", "ax = vaep.plotting.add_height_to_barplot(ax, size=11)\n", @@ -546,7 +559,7 @@ " x='model',\n", " y='metric_value',\n", " color='data level',\n", - " hover_data={'N': ':,d', 'n_params': ':,d'}, # format hover data\n", + " hover_data={'N': ':,d', 'n_params': ':,d'}, # format hover data\n", " barmode=\"group\",\n", " color_discrete_sequence=px.colors.colorbrewer.Paired,\n", " # color_discrete_sequence=['#a6cee3', '#1f78b4', '#b2df8a'],\n", @@ -583,14 +596,14 @@ " .loc[best_on_average.index].reset_index()\n", " .set_index(['model', 'data level'])\n", " .loc[pd.IndexSlice[order_categories['model'], order_categories['data level']], :]\n", - " )\n", + " )\n", "\n", "to_plot = to_plot.reset_index()\n", "to_plot['model annotated'] = to_plot['model'] + ' - ' + to_plot['text']\n", - "order_model = to_plot['model annotated'].drop_duplicates().to_list() # model name with annotation\n", + "order_model = to_plot['model annotated'].drop_duplicates().to_list() # model name with annotation\n", "\n", "to_plot = to_plot.drop_duplicates(subset=['model', 'data level', 'metric_value'])\n", - "to_plot.to_csv(FOLDER /f\"{fname}.csv\")\n", + "to_plot.to_csv(FOLDER / f\"{fname}.csv\")\n", "to_plot" ] }, @@ -601,7 +614,7 @@ "metadata": {}, "outputs": [], "source": [ - "figsize= (10,8) # None # (10,8)\n", + "figsize = (10, 8) # None # (10,8)\n", "fig, ax = plt.subplots(figsize=figsize)\n", "to_plot.columns.name = ''\n", "ax = (to_plot\n", @@ -616,7 +629,19 @@ " width=.8,\n", " ax=ax,\n", " # colormap=\"Paired\",\n", - " color = ['#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a','#ffff99','#b15928']\n", + " color=[\n", + " '#a6cee3',\n", + " '#1f78b4',\n", + " '#b2df8a',\n", + " '#33a02c',\n", + " '#fb9a99',\n", + " '#e31a1c',\n", + " '#fdbf6f',\n", + " '#ff7f00',\n", + " '#cab2d6',\n", + " '#6a3d9a',\n", + " '#ffff99',\n", + " '#b15928']\n", " )\n", " )\n", 
"ax = vaep.plotting.add_height_to_barplot(ax, size=11)\n", @@ -645,7 +670,7 @@ " x='model',\n", " y='metric_value',\n", " color='data level',\n", - " hover_data={'N': ':,d', 'n_params': ':,d'}, # format hover data\n", + " hover_data={'N': ':,d', 'n_params': ':,d'}, # format hover data\n", " barmode=\"group\",\n", " color_discrete_sequence=px.colors.colorbrewer.Paired,\n", " # color_discrete_sequence=['#a6cee3', '#1f78b4', '#b2df8a'],\n", diff --git a/project/02_4_best_models_over_all_data.py b/project/02_4_best_models_over_all_data.py index ffb4b941f..3aea8ecae 100644 --- a/project/02_4_best_models_over_all_data.py +++ b/project/02_4_best_models_over_all_data.py @@ -6,7 +6,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -59,7 +59,7 @@ # %% # snakemake.params.folder try: - models = snakemake.params.models # snakefile would need to be + models = snakemake.params.models # snakefile would need to be except AttributeError: models = ['Median', 'interpolated', 'CF', 'DAE', 'VAE'] models @@ -96,9 +96,9 @@ # %% _unique = metrics_long["data level"].unique() -order_categories['data level'] = [l for l in order_categories['data level'] if l in _unique] #ensure predefined order +order_categories['data level'] = [l for l in order_categories['data level'] if l in _unique] # ensure predefined order _unique = metrics_long['model'].unique() -order_categories['model'] = [m for m in order_categories['model'] if m in _unique] #ensure predefined order +order_categories['model'] = [m for m in order_categories['model'] if m in _unique] # ensure predefined order semi_supervised = [m for m in ['CF', 'DAE', 'VAE'] if m in _unique] reference = [m for m in ['median', 'interpolated'] if m in _unique] @@ -137,11 +137,11 @@ # %% # select best model of top N with least parameters sel_on_val = (sel_on_val - .groupby(by=group_by) - .apply( - lambda df: 
df.sort_values(by='n_params').iloc[0] - ) - ).loc[ + .groupby(by=group_by) + .apply( + lambda df: df.sort_values(by='n_params').iloc[0] + ) + ).loc[ pd.IndexSlice[dataset, IDX_ORDER[0], 'MAE', IDX_ORDER[1]], selected_cols] sel_on_val.to_excel(writer, sheet_name=f'selected') @@ -156,13 +156,13 @@ idx = sel_on_val.droplevel(level='data_split').index sel_on_val = sel_on_val.reset_index(['latent_dim', 'hidden_layers', 'id']) -test_results = ( metrics_long - .query('data_split == "test_fake_na"') - .reset_index().set_index(idx.names) - .loc[idx] - .reset_index(['latent_dim', 'hidden_layers', 'id']) - .set_index('data_split', append=True) -)[selected_cols] +test_results = (metrics_long + .query('data_split == "test_fake_na"') + .reset_index().set_index(idx.names) + .loc[idx] + .reset_index(['latent_dim', 'hidden_layers', 'id']) + .set_index('data_split', append=True) + )[selected_cols] test_results # %% [markdown] @@ -197,7 +197,7 @@ vaep.savefig(fig, fname, folder=FOLDER) # %% [markdown] -# ### Validation data results +# ### Validation data results # %% fname = 'best_models_1_val_mpl' @@ -205,7 +205,7 @@ _to_plot = sel_on_val.reset_index(level=['data level', 'model']).loc[[('valid_fake_na', METRIC), ]] _to_plot = _to_plot.set_index(['data level', 'model'])[['metric_value', 'text']] -_to_plot = _to_plot.loc[IDX_ORDER,:] +_to_plot = _to_plot.loc[IDX_ORDER, :] _to_plot.index.name = '' # text = test_results['text'].unstack().loc[IDX_ORDER].unstack() _to_plot = _to_plot['metric_value'].unstack().loc[IDX_ORDER] @@ -230,7 +230,8 @@ fname = 'best_models_1_val_plotly' _to_plot = sel_on_val.reset_index(level=['data level', 'model']).loc[[('valid_fake_na', METRIC), ]] _to_plot = _to_plot.set_index(['data level', 'model']) -_to_plot[['metric_value', 'latent_dim', 'hidden_layers', 'text']] = _to_plot[['metric_value', 'latent_dim', 'hidden_layers', 'text']].fillna('-') +_to_plot[['metric_value', 'latent_dim', 'hidden_layers', 'text']] = _to_plot[[ + 'metric_value', 'latent_dim', 
'hidden_layers', 'text']].fillna('-') _to_plot = _to_plot.loc[pd.IndexSlice[IDX_ORDER], :] _to_plot.to_csv(FOLDER / f"{fname}.csv") @@ -241,7 +242,7 @@ fig = px.bar(_to_plot.reset_index(), x='data level', y='metric_value', - hover_data={'N': ':,d', 'n_params': ':,d'}, # format hover data + hover_data={'N': ':,d', 'n_params': ':,d'}, # format hover data color='model', barmode="group", text='text', @@ -266,8 +267,8 @@ f' & metric_name == "{METRIC}"') best_on_average = metrics_long_sel.reset_index( - ).groupby(by=group_by - )['metric_value'].mean().sort_values().reset_index(level=group_by[1:]) +).groupby(by=group_by + )['metric_value'].mean().sort_values().reset_index(level=group_by[1:]) best_on_average # %% @@ -284,25 +285,25 @@ data_split = 'test_fake_na' metrics_long_sel_test = metrics_long.query(f'data_split == "{data_split}"' - f' & metric_name == "{METRIC}"') + f' & metric_name == "{METRIC}"') to_plot = (metrics_long_sel_test - .reset_index().set_index(group_by) - .loc[best_on_average.index] - .reset_index().set_index(['model', 'data level']) - .loc[pd.IndexSlice[order_categories['model'], order_categories['data level']], :]) + .reset_index().set_index(group_by) + .loc[best_on_average.index] + .reset_index().set_index(['model', 'data level']) + .loc[pd.IndexSlice[order_categories['model'], order_categories['data level']], :]) to_plot = to_plot.reset_index() to_plot['model annotated'] = to_plot['model'] + ' - ' + to_plot['text'] -order_model = to_plot['model annotated'].drop_duplicates().to_list() # model name with annotation +order_model = to_plot['model annotated'].drop_duplicates().to_list() # model name with annotation to_plot = to_plot.drop_duplicates(subset=['model', 'data level', 'metric_value']) -to_plot.to_csv(FOLDER /f"{fname}.csv") +to_plot.to_csv(FOLDER / f"{fname}.csv") to_plot # %% -figsize= (10,8) # None # (10,8) +figsize = (10, 8) # None # (10,8) fig, ax = plt.subplots(figsize=figsize) to_plot.columns.name = '' ax = (to_plot @@ -317,7 +318,19 @@ 
width=.8, ax=ax, # colormap="Paired", - color = ['#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a','#ffff99','#b15928'] + color=[ + '#a6cee3', + '#1f78b4', + '#b2df8a', + '#33a02c', + '#fb9a99', + '#e31a1c', + '#fdbf6f', + '#ff7f00', + '#cab2d6', + '#6a3d9a', + '#ffff99', + '#b15928'] ) ) ax = vaep.plotting.add_height_to_barplot(ax, size=11) @@ -333,7 +346,7 @@ x='model', y='metric_value', color='data level', - hover_data={'N': ':,d', 'n_params': ':,d'}, # format hover data + hover_data={'N': ':,d', 'n_params': ':,d'}, # format hover data barmode="group", color_discrete_sequence=px.colors.colorbrewer.Paired, # color_discrete_sequence=['#a6cee3', '#1f78b4', '#b2df8a'], @@ -357,18 +370,18 @@ .loc[best_on_average.index].reset_index() .set_index(['model', 'data level']) .loc[pd.IndexSlice[order_categories['model'], order_categories['data level']], :] - ) + ) to_plot = to_plot.reset_index() to_plot['model annotated'] = to_plot['model'] + ' - ' + to_plot['text'] -order_model = to_plot['model annotated'].drop_duplicates().to_list() # model name with annotation +order_model = to_plot['model annotated'].drop_duplicates().to_list() # model name with annotation to_plot = to_plot.drop_duplicates(subset=['model', 'data level', 'metric_value']) -to_plot.to_csv(FOLDER /f"{fname}.csv") +to_plot.to_csv(FOLDER / f"{fname}.csv") to_plot # %% -figsize= (10,8) # None # (10,8) +figsize = (10, 8) # None # (10,8) fig, ax = plt.subplots(figsize=figsize) to_plot.columns.name = '' ax = (to_plot @@ -383,7 +396,19 @@ width=.8, ax=ax, # colormap="Paired", - color = ['#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a','#ffff99','#b15928'] + color=[ + '#a6cee3', + '#1f78b4', + '#b2df8a', + '#33a02c', + '#fb9a99', + '#e31a1c', + '#fdbf6f', + '#ff7f00', + '#cab2d6', + '#6a3d9a', + '#ffff99', + '#b15928'] ) ) ax = vaep.plotting.add_height_to_barplot(ax, size=11) @@ -399,7 +424,7 @@ x='model', 
y='metric_value', color='data level', - hover_data={'N': ':,d', 'n_params': ':,d'}, # format hover data + hover_data={'N': ':,d', 'n_params': ':,d'}, # format hover data barmode="group", color_discrete_sequence=px.colors.colorbrewer.Paired, # color_discrete_sequence=['#a6cee3', '#1f78b4', '#b2df8a'], diff --git a/project/03_1_best_models_comparison.ipynb b/project/03_1_best_models_comparison.ipynb index aff217903..6a6f6f6fe 100644 --- a/project/03_1_best_models_comparison.ipynb +++ b/project/03_1_best_models_comparison.ipynb @@ -21,7 +21,7 @@ "logger = setup_logger(logger=logging.getLogger('vaep'), level=10)\n", "\n", "plt.rcParams['figure.figsize'] = [4.0, 2.0]\n", - "vaep.plotting.make_large_descriptors(5)" + "vaep.plotting.make_large_descriptors(7)" ] }, { @@ -93,7 +93,10 @@ "min_max_MAE = (selected\n", " .loc[pd.IndexSlice[:, 'MAE', :]]\n", " .groupby('model')\n", - " .agg(['min', 'max']))\n", + " .agg(['min', 'max'])\n", + " .stack()\n", + " .T\n", + " .loc[IDX[0]])\n", "min_max_MAE.to_excel(writer, sheet_name='min_max_MAE')\n", "min_max_MAE" ] @@ -150,8 +153,8 @@ " split,\n", " :, 'MAE']].stack(1)\n", "view_long = (selected.stack()\n", - " .to_frame('MAE')\n", - " .reset_index())\n", + " .to_frame('MAE')\n", + " .reset_index())\n", "view_long" ] }, @@ -182,6 +185,16 @@ "vaep.savefig(fig, FOLDER / \"model_performance_repeated_runs.pdf\")" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "0813889a", + "metadata": {}, + "outputs": [], + "source": [ + "writer.close()" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/project/03_1_best_models_comparison.py b/project/03_1_best_models_comparison.py index 7061b53e2..97c54d8b5 100644 --- a/project/03_1_best_models_comparison.py +++ b/project/03_1_best_models_comparison.py @@ -6,7 +6,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -29,7 +29,7 @@ 
logger = setup_logger(logger=logging.getLogger('vaep'), level=10) plt.rcParams['figure.figsize'] = [4.0, 2.0] -vaep.plotting.make_large_descriptors(5) +vaep.plotting.make_large_descriptors(7) # %% IDX = [['proteinGroups', 'peptides', 'evidence'], @@ -63,7 +63,10 @@ min_max_MAE = (selected .loc[pd.IndexSlice[:, 'MAE', :]] .groupby('model') - .agg(['min', 'max'])) + .agg(['min', 'max']) + .stack() + .T + .loc[IDX[0]]) min_max_MAE.to_excel(writer, sheet_name='min_max_MAE') min_max_MAE @@ -94,8 +97,8 @@ split, :, 'MAE']].stack(1) view_long = (selected.stack() - .to_frame('MAE') - .reset_index()) + .to_frame('MAE') + .reset_index()) view_long # %% @@ -114,3 +117,6 @@ vaep.savefig(fig, FOLDER / "model_performance_repeated_runs.pdf") # %% +writer.close() + +# %% diff --git a/project/03_2_best_models_comparison_fig2.ipynb b/project/03_2_best_models_comparison_fig2.ipynb index ef70b394c..6370691f4 100644 --- a/project/03_2_best_models_comparison_fig2.ipynb +++ b/project/03_2_best_models_comparison_fig2.ipynb @@ -34,11 +34,12 @@ "outputs": [], "source": [ "# parameters\n", - "FOLDER = Path('runs/dev_dataset_large/')\n", + "FOLDER = Path('runs/mnar_mcar/')\n", + "SIZE = 'l'\n", "files_in = {\n", - " 'protein groups': FOLDER / 'proteinGroups/figures/performance_test.csv',\n", - " 'peptides': FOLDER / 'peptides/figures/performance_test.csv',\n", - " 'precursors': FOLDER / 'evidence/figures/performance_test.csv'\n", + " 'protein groups': FOLDER / 'pg_l_25MNAR/figures/2_1_performance_test_sel.csv',\n", + " 'peptides': FOLDER / 'pep_l_25MNAR/figures/2_1_performance_test.csv',\n", + " 'precursors': FOLDER / 'evi_l_25MNAR/figures/2_1_performance_test.csv'\n", "}" ] }, @@ -49,11 +50,12 @@ "metadata": {}, "outputs": [], "source": [ - "FOLDER = Path('runs/dev_dataset_small/')\n", + "FOLDER = Path('runs/mnar_mcar/')\n", + "SIZE = 'm'\n", "files_in = {\n", - " 'protein groups': FOLDER / 'proteinGroups_N50/figures/performance_test.csv',\n", - " 'peptides': FOLDER / 
'peptides_N50/figures/performance_test.csv',\n", - " 'precursors': FOLDER / 'evidence_N50/figures/performance_test.csv'\n", + " 'protein groups': FOLDER / 'pg_m_25MNAR/figures/2_1_performance_test_sel.csv',\n", + " 'peptides': FOLDER / 'pep_m_25MNAR/figures/2_1_performance_test_sel.csv',\n", + " 'precursors': FOLDER / 'evi_m_25MNAR/figures/2_1_performance_test_sel.csv'\n", "}" ] }, @@ -134,13 +136,13 @@ }, "outputs": [], "source": [ - "fname = FOLDER / 'best_models_1_test_mpl.pdf'\n", + "fname = FOLDER / f'best_models_{SIZE}_test_mpl.pdf'\n", "metrics = df['metric_value'].unstack('model')\n", "ORDER_MODELS = metrics.mean().sort_values().index.to_list()\n", "metrics = metrics.loc[ORDER_DATA, ORDER_MODELS]\n", "\n", "plt.rcParams['figure.figsize'] = [4.0, 2.0]\n", - "matplotlib.rcParams.update({'font.size': 5})\n", + "matplotlib.rcParams.update({'font.size': 6})\n", "\n", "ax = (metrics\n", " .plot\n", @@ -149,10 +151,12 @@ " ylabel=f\"{METRIC} (log2 intensities)\",\n", " color=COLORS_TO_USE_MAPPTING,\n", " width=.85,\n", - " fontsize=8\n", + " fontsize=7\n", " ))\n", "\n", - "ax = vaep.plotting.add_height_to_barplot(ax, size=5)\n", + "\n", + "ax = vaep.plotting.add_height_to_barplot(ax, size=6, rotated=True)\n", + "ax.set_ylim((0, 0.75))\n", "ax.legend(fontsize=5, loc='lower right')\n", "text = (\n", " df['text']\n", @@ -161,7 +165,7 @@ " .stack().loc[pd.IndexSlice[ORDER_MODELS, ORDER_DATA]]\n", "\n", ")\n", - "ax = vaep.plotting.add_text_to_barplot(ax, text, size=5)\n", + "ax = vaep.plotting.add_text_to_barplot(ax, text, size=6)\n", "fig = ax.get_figure()\n", "fig.tight_layout()\n", "vaep.savefig(fig, fname)" @@ -230,10 +234,18 @@ "metadata": {}, "outputs": [], "source": [ - "fname = FOLDER / 'performance_summary.xlsx'\n", + "fname = FOLDER / f'performance_summary_{SIZE}.xlsx'\n", "perf.to_excel(fname)\n", "fname.as_posix()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d97a66a0", + "metadata": {}, + "outputs": [], + "source": [] } ], 
"metadata": { diff --git a/project/03_2_best_models_comparison_fig2.py b/project/03_2_best_models_comparison_fig2.py index 1af9ae1a7..d5b69eae1 100644 --- a/project/03_2_best_models_comparison_fig2.py +++ b/project/03_2_best_models_comparison_fig2.py @@ -6,7 +6,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.2 # kernelspec: # display_name: Python 3 # language: python @@ -33,19 +33,21 @@ # %% # parameters -FOLDER = Path('runs/dev_dataset_large/') +FOLDER = Path('runs/mnar_mcar/') +SIZE = 'l' files_in = { - 'protein groups': FOLDER / 'proteinGroups/figures/performance_test.csv', - 'peptides': FOLDER / 'peptides/figures/performance_test.csv', - 'precursors': FOLDER / 'evidence/figures/performance_test.csv' + 'protein groups': FOLDER / 'pg_l_25MNAR/figures/2_1_performance_test_sel.csv', + 'peptides': FOLDER / 'pep_l_25MNAR/figures/2_1_performance_test.csv', + 'precursors': FOLDER / 'evi_l_25MNAR/figures/2_1_performance_test.csv' } # %% -FOLDER = Path('runs/dev_dataset_small/') +FOLDER = Path('runs/mnar_mcar/') +SIZE = 'm' files_in = { - 'protein groups': FOLDER / 'proteinGroups_N50/figures/performance_test.csv', - 'peptides': FOLDER / 'peptides_N50/figures/performance_test.csv', - 'precursors': FOLDER / 'evidence_N50/figures/performance_test.csv' + 'protein groups': FOLDER / 'pg_m_25MNAR/figures/2_1_performance_test_sel.csv', + 'peptides': FOLDER / 'pep_m_25MNAR/figures/2_1_performance_test_sel.csv', + 'precursors': FOLDER / 'evi_m_25MNAR/figures/2_1_performance_test_sel.csv' } # %% @@ -84,13 +86,13 @@ df # %% -fname = FOLDER / 'best_models_1_test_mpl.pdf' +fname = FOLDER / f'best_models_{SIZE}_test_mpl.pdf' metrics = df['metric_value'].unstack('model') ORDER_MODELS = metrics.mean().sort_values().index.to_list() metrics = metrics.loc[ORDER_DATA, ORDER_MODELS] plt.rcParams['figure.figsize'] = [4.0, 2.0] -matplotlib.rcParams.update({'font.size': 5}) +matplotlib.rcParams.update({'font.size': 6}) ax 
= (metrics .plot @@ -99,10 +101,12 @@ ylabel=f"{METRIC} (log2 intensities)", color=COLORS_TO_USE_MAPPTING, width=.85, - fontsize=8 + fontsize=7 )) -ax = vaep.plotting.add_height_to_barplot(ax, size=5) + +ax = vaep.plotting.add_height_to_barplot(ax, size=6, rotated=True) +ax.set_ylim((0, 0.75)) ax.legend(fontsize=5, loc='lower right') text = ( df['text'] @@ -111,7 +115,7 @@ .stack().loc[pd.IndexSlice[ORDER_MODELS, ORDER_DATA]] ) -ax = vaep.plotting.add_text_to_barplot(ax, text, size=5) +ax = vaep.plotting.add_text_to_barplot(ax, text, size=6) fig = ax.get_figure() fig.tight_layout() vaep.savefig(fig, fname) @@ -148,6 +152,8 @@ perf # %% -fname = FOLDER / 'performance_summary.xlsx' +fname = FOLDER / f'performance_summary_{SIZE}.xlsx' perf.to_excel(fname) fname.as_posix() + +# %% diff --git a/project/03_3_combine_experiment_result_tables.ipynb b/project/03_3_combine_experiment_result_tables.ipynb index a0b3a2d69..476023e23 100644 --- a/project/03_3_combine_experiment_result_tables.ipynb +++ b/project/03_3_combine_experiment_result_tables.ipynb @@ -34,7 +34,7 @@ "metadata": {}, "outputs": [], "source": [ - "files = {Path(f).parent.name: f for f in snakemake.input }\n", + "files = {Path(f).parent.name: f for f in snakemake.input}\n", "files" ] }, @@ -47,7 +47,7 @@ "source": [ "table = []\n", "for key, file in files.items():\n", - " df = pd.read_excel(file, sheet_name='cp_mean_perf', index_col=0)\n", + " df = pd.read_excel(file, sheet_name=-1, index_col=0)\n", " df.columns = pd.MultiIndex.from_tuples([(key, x) for x in df.columns])\n", " table.append(df)\n", "\n", @@ -71,10 +71,10 @@ "outputs": [], "source": [ "order = (table\n", - " .loc[:, pd.IndexSlice[:, 'val']]\n", - " .mean(axis=1)\n", - " .sort_values()\n", - ")\n", + " .loc[:, pd.IndexSlice[:, 'val']]\n", + " .mean(axis=1)\n", + " .sort_values()\n", + " )\n", "order" ] }, diff --git a/project/03_3_combine_experiment_result_tables.py b/project/03_3_combine_experiment_result_tables.py index 7ba8472ba..37dd49f26 
100644 --- a/project/03_3_combine_experiment_result_tables.py +++ b/project/03_3_combine_experiment_result_tables.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -23,7 +23,7 @@ # Use parent folder name as key # %% -files = {Path(f).parent.name: f for f in snakemake.input } +files = {Path(f).parent.name: f for f in snakemake.input} files # %% @@ -41,10 +41,10 @@ # %% order = (table - .loc[:, pd.IndexSlice[:, 'val']] - .mean(axis=1) - .sort_values() -) + .loc[:, pd.IndexSlice[:, 'val']] + .mean(axis=1) + .sort_values() + ) order # %% diff --git a/project/03_4_join_tables.py b/project/03_4_join_tables.py index 88e1004f1..404a24bae 100644 --- a/project/03_4_join_tables.py +++ b/project/03_4_join_tables.py @@ -5,34 +5,24 @@ # %% -fname = 'runs/appl_ald_data/plasma/proteinGroups_all/01_2_performance_summary.xlsx' +fname = 'runs/appl_ald_data_2023_11/plasma/proteinGroups/01_2_performance_summary.xlsx' ald_pg_perf = pd.read_excel(fname, sheet_name=-1, index_col=0) -ald_pg_perf.columns = pd.MultiIndex.from_tuples([('ALD data','protein groups', x) for x in ald_pg_perf.columns]) +ald_pg_perf.columns = pd.MultiIndex.from_tuples([('ALD protein groups', x) for x in ald_pg_perf.columns]) ald_pg_perf # %% -files = { - 'small HeLa': 'runs/dev_dataset_small/performance_summary.xlsx', - 'large HeLa': 'runs/dev_dataset_large/performance_summary.xlsx', -} -files - -table = [] -for key, file in files.items(): - df = pd.read_excel(file, index_col=0, header=[0, 1]) - df.columns = pd.MultiIndex.from_tuples([(key, *x) for x in df.columns]) - table.append(df) - +file = 'runs/mnar_mcar/all_results.xlsx' +table = [pd.read_excel(file, index_col=0, header=[0, 1])] table.append(ald_pg_perf) table = pd.concat(table, axis=1) table # %% order = (table - .loc[:, pd.IndexSlice[:, :, 'val']] - .mean(axis=1) - .sort_values() -) + .loc[:, pd.IndexSlice[ 
:, 'val']] + .mean(axis=1) + .sort_values() + ) order # %% @@ -63,10 +53,10 @@ # %% # %% order = (table - .loc[:, pd.IndexSlice[:, 'val']] - .mean(axis=1) - .sort_values() -) + .loc[:, pd.IndexSlice[:, 'val']] + .mean(axis=1) + .sort_values() + ) order # %% diff --git a/project/03_5_join_benchmarks.py b/project/03_5_join_benchmarks.py index 976e3e339..1db7eeb73 100644 --- a/project/03_5_join_benchmarks.py +++ b/project/03_5_join_benchmarks.py @@ -9,14 +9,16 @@ root_folder: str = 'runs/dev_dataset_small' # large -# root_folder: str = 'runs/dev_dataset_large' -# root_folder: str = 'runs/appl_ald_data/plasma' +root_folder: str = 'runs/mnar_mcar' +root_folder: str = 'runs/appl_ald_data_2023_11/plasma' # %% root_folder = Path(root_folder) # %% # find folders in root folder and get files with tsv extension + + def find_tsv_benchmarks(root_folder: Path): """Find snakemake benchmark files in subfolders of root_folder (pimms workflow) @@ -39,6 +41,7 @@ def find_tsv_benchmarks(root_folder: Path): if file.suffix == '.tsv': yield file + files = find_tsv_benchmarks(root_folder) # %% @@ -47,7 +50,7 @@ def find_tsv_benchmarks(root_folder: Path): # files = (x for x in files if x.is_file()) # %% -COL = 'h:m:s' # 's' for seconds +COL = 'h:m:s' # 's' for seconds SPLIT_TERM = '_train_' data = dict() for file in files: @@ -64,7 +67,7 @@ def find_tsv_benchmarks(root_folder: Path): data = (pd .DataFrame(data) .drop('PRED') -) + ) data # %% @@ -78,9 +81,8 @@ def find_tsv_benchmarks(root_folder: Path): # %% runtime_dumps = [ - 'runs/dev_dataset_large/runtimes.xlsx', - 'runs/dev_dataset_small/runtimes.xlsx', - 'runs/appl_ald_data/plasma/runtimes.xlsx' + 'runs/mnar_mcar/runtimes.xlsx', + 'runs/appl_ald_data_2023_11/plasma/runtimes.xlsx' ] runtime_dumps = [pd.read_excel(fname, index_col=0) for fname in runtime_dumps] runtime_dumps = pd.concat(runtime_dumps, axis=1) diff --git a/project/04_1_train_pimms_models.ipynb b/project/04_1_train_pimms_models.ipynb index 21ac7cd0f..5f305f5d6 100644 --- 
a/project/04_1_train_pimms_models.ipynb +++ b/project/04_1_train_pimms_models.ipynb @@ -188,7 +188,7 @@ "id": "a76ba4ce", "metadata": {}, "source": [ - "Let's set up collaborative filtering without a validation or test set, using \n", + "Let's set up collaborative filtering without a validation or test set, using\n", "all the data there is." ] }, @@ -257,10 +257,10 @@ "metadata": {}, "outputs": [], "source": [ - "df_imputed = df_imputed.stack() # long-format\n", + "df_imputed = df_imputed.stack() # long-format\n", "observed = df_imputed.loc[df.index]\n", "imputed = df_imputed.loc[df_imputed.index.difference(df.index)]\n", - "df_imputed = df_imputed.unstack() # back to wide-format\n", + "df_imputed = df_imputed.unstack() # back to wide-format\n", "# some checks\n", "assert len(df) == len(observed)\n", "assert df_imputed.shape[0] * df_imputed.shape[1] == len(imputed) + len(observed)" @@ -273,30 +273,30 @@ "metadata": {}, "outputs": [], "source": [ - "fig, axes = plt.subplots(2, figsize=(8,4))\n", + "fig, axes = plt.subplots(2, figsize=(8, 4))\n", "\n", "min_max = vaep.plotting.data.get_min_max_iterable(\n", " [observed, imputed])\n", "label_template = '{method} (N={n:,d})'\n", "ax, _ = vaep.plotting.data.plot_histogram_intensities(\n", - " observed,\n", - " ax=axes[0],\n", - " min_max=min_max,\n", - " label=label_template.format(method='measured',\n", - " n=len(observed),\n", - " ),\n", - " color='grey',\n", - " alpha=1)\n", + " observed,\n", + " ax=axes[0],\n", + " min_max=min_max,\n", + " label=label_template.format(method='measured',\n", + " n=len(observed),\n", + " ),\n", + " color='grey',\n", + " alpha=1)\n", "_ = ax.legend()\n", "ax, _ = vaep.plotting.data.plot_histogram_intensities(\n", - " imputed,\n", - " ax=axes[1],\n", - " min_max=min_max,\n", - " label=label_template.format(method='CF imputed',\n", - " n=len(imputed),\n", - " ),\n", - " color=color_model_mapping['CF'],\n", - " alpha=1)\n", + " imputed,\n", + " ax=axes[1],\n", + " min_max=min_max,\n", + " 
label=label_template.format(method='CF imputed',\n", + " n=len(imputed),\n", + " ),\n", + " color=color_model_mapping['CF'],\n", + " alpha=1)\n", "_ = ax.legend()" ] }, @@ -330,8 +330,8 @@ "metadata": {}, "source": [ "The AutoEncoder model currently need validation data for training.\n", - "We will use 10% of the training data for validation. \n", - "> Expect this limitation to be dropped in the next release. It will still be recommended \n", + "We will use 10% of the training data for validation.\n", + "> Expect this limitation to be dropped in the next release. It will still be recommended\n", "> to use validation data for early stopping." ] }, @@ -352,7 +352,7 @@ "metadata": {}, "source": [ "We will use the `sampling` module to sample the validation data from the training data.\n", - "Could be split differently by providing another `weights` vector. " + "Could be split differently by providing another `weights` vector." ] }, { @@ -423,7 +423,7 @@ "metadata": {}, "outputs": [], "source": [ - "model_selected = 'VAE' # 'DAE'\n", + "model_selected = 'VAE' # 'DAE'\n", "model = AETransformer(\n", " model=model_selected,\n", " hidden_layers=[512,],\n", @@ -535,8 +535,8 @@ "metadata": {}, "outputs": [], "source": [ - "df = df.stack() # long-format\n", - "df_imputed = df_imputed.stack() # long-format\n", + "df = df.stack() # long-format\n", + "df_imputed = df_imputed.stack() # long-format\n", "observed = df_imputed.loc[df.index]\n", "imputed = df_imputed.loc[df_imputed.index.difference(df.index)]" ] @@ -550,30 +550,30 @@ }, "outputs": [], "source": [ - "fig, axes = plt.subplots(2, figsize=(8,4))\n", + "fig, axes = plt.subplots(2, figsize=(8, 4))\n", "\n", "min_max = vaep.plotting.data.get_min_max_iterable(\n", " [observed, imputed])\n", "label_template = '{method} (N={n:,d})'\n", "ax, _ = vaep.plotting.data.plot_histogram_intensities(\n", - " observed,\n", - " ax=axes[0],\n", - " min_max=min_max,\n", - " label=label_template.format(method='measured',\n", - " 
n=len(observed),\n", - " ),\n", - " color='grey',\n", - " alpha=1)\n", + " observed,\n", + " ax=axes[0],\n", + " min_max=min_max,\n", + " label=label_template.format(method='measured',\n", + " n=len(observed),\n", + " ),\n", + " color='grey',\n", + " alpha=1)\n", "_ = ax.legend()\n", "ax, _ = vaep.plotting.data.plot_histogram_intensities(\n", - " imputed,\n", - " ax=axes[1],\n", - " min_max=min_max,\n", - " label=label_template.format(method=f'{model_selected} imputed',\n", - " n=len(imputed),\n", - " ),\n", - " color=color_model_mapping[model_selected],\n", - " alpha=1)\n", + " imputed,\n", + " ax=axes[1],\n", + " min_max=min_max,\n", + " label=label_template.format(method=f'{model_selected} imputed',\n", + " n=len(imputed),\n", + " ),\n", + " color=color_model_mapping[model_selected],\n", + " alpha=1)\n", "_ = ax.legend()" ] }, diff --git a/project/04_1_train_pimms_models.py b/project/04_1_train_pimms_models.py index fea106857..3ddae8b24 100644 --- a/project/04_1_train_pimms_models.py +++ b/project/04_1_train_pimms_models.py @@ -86,7 +86,7 @@ # # # # CollaborativeFilteringTransformer? # %% [markdown] -# Let's set up collaborative filtering without a validation or test set, using +# Let's set up collaborative filtering without a validation or test set, using # all the data there is. 
# %% @@ -117,39 +117,39 @@ # Let's plot the distribution of the imputed values vs the ones used for training: # %% -df_imputed = df_imputed.stack() # long-format +df_imputed = df_imputed.stack() # long-format observed = df_imputed.loc[df.index] imputed = df_imputed.loc[df_imputed.index.difference(df.index)] -df_imputed = df_imputed.unstack() # back to wide-format +df_imputed = df_imputed.unstack() # back to wide-format # some checks assert len(df) == len(observed) assert df_imputed.shape[0] * df_imputed.shape[1] == len(imputed) + len(observed) # %% -fig, axes = plt.subplots(2, figsize=(8,4)) +fig, axes = plt.subplots(2, figsize=(8, 4)) min_max = vaep.plotting.data.get_min_max_iterable( [observed, imputed]) label_template = '{method} (N={n:,d})' ax, _ = vaep.plotting.data.plot_histogram_intensities( - observed, - ax=axes[0], - min_max=min_max, - label=label_template.format(method='measured', - n=len(observed), - ), - color='grey', - alpha=1) + observed, + ax=axes[0], + min_max=min_max, + label=label_template.format(method='measured', + n=len(observed), + ), + color='grey', + alpha=1) _ = ax.legend() ax, _ = vaep.plotting.data.plot_histogram_intensities( - imputed, - ax=axes[1], - min_max=min_max, - label=label_template.format(method='CF imputed', - n=len(imputed), - ), - color=color_model_mapping['CF'], - alpha=1) + imputed, + ax=axes[1], + min_max=min_max, + label=label_template.format(method='CF imputed', + n=len(imputed), + ), + color=color_model_mapping['CF'], + alpha=1) _ = ax.legend() # %% [markdown] @@ -166,8 +166,8 @@ # %% [markdown] # The AutoEncoder model currently need validation data for training. -# We will use 10% of the training data for validation. -# > Expect this limitation to be dropped in the next release. It will still be recommended +# We will use 10% of the training data for validation. +# > Expect this limitation to be dropped in the next release. It will still be recommended # > to use validation data for early stopping. 
# %% @@ -176,7 +176,7 @@ # %% [markdown] # We will use the `sampling` module to sample the validation data from the training data. -# Could be split differently by providing another `weights` vector. +# Could be split differently by providing another `weights` vector. # %% val_X, train_X = vaep.sampling.sample_data(df.stack(), @@ -204,7 +204,7 @@ # Select either `DAE` or `VAE` model: # %% -model_selected = 'VAE' # 'DAE' +model_selected = 'VAE' # 'DAE' model = AETransformer( model=model_selected, hidden_layers=[512,], @@ -257,36 +257,36 @@ df_imputed = df_imputed.replace(val_X) # %% -df = df.stack() # long-format -df_imputed = df_imputed.stack() # long-format +df = df.stack() # long-format +df_imputed = df_imputed.stack() # long-format observed = df_imputed.loc[df.index] imputed = df_imputed.loc[df_imputed.index.difference(df.index)] # %% -fig, axes = plt.subplots(2, figsize=(8,4)) +fig, axes = plt.subplots(2, figsize=(8, 4)) min_max = vaep.plotting.data.get_min_max_iterable( [observed, imputed]) label_template = '{method} (N={n:,d})' ax, _ = vaep.plotting.data.plot_histogram_intensities( - observed, - ax=axes[0], - min_max=min_max, - label=label_template.format(method='measured', - n=len(observed), - ), - color='grey', - alpha=1) + observed, + ax=axes[0], + min_max=min_max, + label=label_template.format(method='measured', + n=len(observed), + ), + color='grey', + alpha=1) _ = ax.legend() ax, _ = vaep.plotting.data.plot_histogram_intensities( - imputed, - ax=axes[1], - min_max=min_max, - label=label_template.format(method=f'{model_selected} imputed', - n=len(imputed), - ), - color=color_model_mapping[model_selected], - alpha=1) + imputed, + ax=axes[1], + min_max=min_max, + label=label_template.format(method=f'{model_selected} imputed', + n=len(imputed), + ), + color=color_model_mapping[model_selected], + alpha=1) _ = ax.legend() diff --git a/project/10_0_ald_data.ipynb b/project/10_0_ald_data.ipynb index 01027e035..e006f94e4 100644 --- a/project/10_0_ald_data.ipynb 
+++ b/project/10_0_ald_data.ipynb @@ -47,14 +47,14 @@ "print(*(folder_data.iterdir()), sep='\\n')\n", "\n", "fnames = dict(\n", - "plasma_proteinGroups = folder_data / 'Protein_ALDupgrade_Report.csv',\n", - "plasma_aggPeptides = folder_data / 'ald_proteome_spectronaut.tsv',\n", - "liver_proteinGroups = folder_data / 'Protein_20200221_121354_20200218_ALD_LiverTissue_PlateS1_Atlaslib_Report.csv',\n", - "liver_aggPeptides = folder_data / 'Peptide_20220819_100847_20200218_ALD_LiverTissue_PlateS1_Atlaslib_Report.csv',\n", - "annotations = folder_data / 'ald_experiment_annotations.csv',\n", - "clinic = folder_data / 'labtest_integrated_numeric.csv',\n", - "raw_meta = folder_data / 'ald_metadata_rawfiles.csv')\n", - "fnames =vaep.nb.Config.from_dict(fnames) # could be handeled kwargs as in normal dict" + " plasma_proteinGroups=folder_data / 'Protein_ALDupgrade_Report.csv',\n", + " plasma_aggPeptides=folder_data / 'ald_proteome_spectronaut.tsv',\n", + " liver_proteinGroups=folder_data / 'Protein_20200221_121354_20200218_ALD_LiverTissue_PlateS1_Atlaslib_Report.csv',\n", + " liver_aggPeptides=folder_data / 'Peptide_20220819_100847_20200218_ALD_LiverTissue_PlateS1_Atlaslib_Report.csv',\n", + " annotations=folder_data / 'ald_experiment_annotations.csv',\n", + " clinic=folder_data / 'labtest_integrated_numeric.csv',\n", + " raw_meta=folder_data / 'ald_metadata_rawfiles.csv')\n", + "fnames = vaep.nb.Config.from_dict(fnames) # could be handeled kwargs as in normal dict" ] }, { @@ -128,7 +128,7 @@ "metadata": {}, "outputs": [], "source": [ - "annotations['Participant ID'].value_counts().value_counts() # some only have a blood sample, some both" + "annotations['Participant ID'].value_counts().value_counts() # some only have a blood sample, some both" ] }, { @@ -306,7 +306,7 @@ "metadata": {}, "outputs": [], "source": [ - "clinic.loc[idx_overlap_plasma].to_csv(folder_data_out /'ald_metadata_cli.csv')" + "clinic.loc[idx_overlap_plasma].to_csv(folder_data_out / 
'ald_metadata_cli.csv')" ] }, { @@ -456,7 +456,7 @@ "id": "cfe1c458-dc61-4890-b430-6efa7eb89e72", "metadata": {}, "source": [ - "## (Aggregated) Peptide Data " + "## (Aggregated) Peptide Data" ] }, { @@ -603,7 +603,7 @@ "metadata": {}, "outputs": [], "source": [ - "id_mappings = [\"PEP.StrippedSequence\", \"PG.ProteinAccessions\", \"PG.Genes\"]\n", + "id_mappings = [\"PEP.StrippedSequence\", \"PG.ProteinAccessions\", \"PG.Genes\"]\n", "id_mappings = meta[id_mappings].drop_duplicates()\n", "id_mappings.to_csv(folder_data_out / 'ald_plasma_aggPeptides_id_mappings.csv')\n", "id_mappings" @@ -618,25 +618,26 @@ "\n", "taken from [Spectronaut manuel](https://biognosys.com/resources/spectronaut-manual/)\n", "\n", - "feature | description \n", + "feature | description\n", "--- | ---\n", "PEP.IsProteinGroupSpecific | True or False. Tells you whether the peptide only belongs to one Protein Group.\n", "PEP.StrippedSequence | -\n", "PEP.IsProteotypic | -\n", "PEP.PeptidePosition | -\n", - "PG.Cscore | - \n", + "PG.Cscore | -\n", "PG.ProteinAccessions | -\n", - "PG.Genes | - \n", + "PG.Genes | -\n", "PEP.Quantity | The quantitative value for that peptide as defined in the settings.\n", - "EG.PrecursorId | Unique Id for the precursor: [modified sequence] plus [charge] \n", + "EG.PrecursorId | Unique Id for the precursor: [modified sequence] plus [charge]\n", "EG.Qvalue | The q-value (FDR) of the EG.\n", - "EG.TotalQuantity (Settings) | The quantitative value for that EG as defined in the settings. \n", + "EG.TotalQuantity (Settings) | The quantitative value for that EG as defined in the settings.\n", "\n", - "> Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious. \n", + "> Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. 
Here are the most relevant and some which are not too obvious.\n", "\n", - "> Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious. \n", + "> Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious.\n", "\n", - "After discussing with Lili, `PEP.Quantity` is the fitting entity for each unique aggregated Peptide. Duplicated entries are just to drop" + "After discussing with Lili, `PEP.Quantity` is the fitting entity for\n", + "each unique aggregated Peptide. Duplicated entries are just to drop" ] }, { @@ -646,7 +647,7 @@ "metadata": {}, "outputs": [], "source": [ - "sel_cols = ['Sample ID', 'PEP.StrippedSequence', 'PEP.Quantity'] # selected quantity in last position\n", + "sel_cols = ['Sample ID', 'PEP.StrippedSequence', 'PEP.Quantity'] # selected quantity in last position\n", "df = df.reset_index()[sel_cols].drop_duplicates().set_index(sel_cols[:2])\n", "df" ] @@ -691,7 +692,7 @@ "id": "b823acaf-2610-4b0a-91d8-2d6dd6ff4182", "metadata": {}, "source": [ - "- rawfile metadata -> keep " + "- rawfile metadata -> keep" ] }, { @@ -732,7 +733,8 @@ "id": "5dddafbb-edd7-4ef0-9787-3120b24d7f79", "metadata": {}, "source": [ - "For one raw file no metadata could be extracted (`ERROR: Unable to access the RAW file using the native Thermo library.`)" + "For one raw file no metadata could be extracted (`ERROR: Unable to\n", + "access the RAW file using the native Thermo library.`)" ] }, { @@ -875,7 +877,7 @@ "metadata": {}, "outputs": [], "source": [ - "id_mappings = [\"PG.ProteinAccessions\", \"PG.Genes\"]\n", + "id_mappings = [\"PG.ProteinAccessions\", \"PG.Genes\"]\n", "id_mappings = meta[id_mappings].drop_duplicates()\n", "id_mappings.to_csv(folder_data_out / 'ald_plasma_proteinGroups_id_mappings.csv', index=False)\n", "id_mappings" 
@@ -900,7 +902,7 @@ "outputs": [], "source": [ "column_types = ['.'.join(x for x in tup) for tup in list(column_types.unique())]\n", - "column_types # 'PG.Quantity' expected" + "column_types # 'PG.Quantity' expected" ] }, { @@ -929,11 +931,12 @@ "metadata": {}, "outputs": [], "source": [ - "def find_idx_to_drop(df:pd.DataFrame, idx_to_keep:list):\n", - " to_drop = [x for x in df.index.names if not x in idx_to_keep]\n", + "def find_idx_to_drop(df: pd.DataFrame, idx_to_keep: list):\n", + " to_drop = [x for x in df.index.names if x not in idx_to_keep]\n", " logger.info(\"Columnns to drop: {}\".format(\",\".join((str(x) for x in to_drop))))\n", " return to_drop\n", - " \n", + "\n", + "\n", "to_drop = find_idx_to_drop(df, idx_cols)\n", "df = df.reset_index(level=to_drop, drop=True)\n", "df.head()" @@ -1135,7 +1138,7 @@ " 'title': 'protein group measurement distribution'}\n", "\n", "ax = vaep.plotting.plot_counts(des_data.T.sort_values(by='count', ascending=False).reset_index(\n", - "), feat_col_name='count', n_samples=len(df), ax=None, min_feat_prop=.0,**kwargs)\n", + "), feat_col_name='count', n_samples=len(df), ax=None, min_feat_prop=.0, **kwargs)\n", "\n", "fig = ax.get_figure()\n", "fig.tight_layout()\n", @@ -1447,7 +1450,7 @@ }, "outputs": [], "source": [ - "id_mappings = [\"PEP.StrippedSequence\", \"PG.ProteinAccessions\", \"PG.Genes\"]\n", + "id_mappings = [\"PEP.StrippedSequence\", \"PG.ProteinAccessions\", \"PG.Genes\"]\n", "id_mappings = meta[id_mappings].drop_duplicates()\n", "id_mappings.to_csv(folder_data_out / 'ald_liver_aggPeptides_id_mappings.csv')\n", "id_mappings" @@ -1462,25 +1465,26 @@ "\n", "taken from [Spectronaut manuel](https://biognosys.com/resources/spectronaut-manual/)\n", "\n", - "feature | description \n", + "feature | description\n", "--- | ---\n", "PEP.IsProteinGroupSpecific | True or False. 
Tells you whether the peptide only belongs to one Protein Group.\n", "PEP.StrippedSequence | -\n", "PEP.IsProteotypic | -\n", "PEP.PeptidePosition | -\n", - "PG.Cscore | - \n", + "PG.Cscore | -\n", "PG.ProteinAccessions | -\n", - "PG.Genes | - \n", + "PG.Genes | -\n", "PEP.Quantity | The quantitative value for that peptide as defined in the settings.\n", - "EG.PrecursorId | Unique Id for the precursor: [modified sequence] plus [charge] \n", + "EG.PrecursorId | Unique Id for the precursor: [modified sequence] plus [charge]\n", "EG.Qvalue | The q-value (FDR) of the EG.\n", - "EG.TotalQuantity (Settings) | The quantitative value for that EG as defined in the settings. \n", + "EG.TotalQuantity (Settings) | The quantitative value for that EG as defined in the settings.\n", "\n", - "> Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious. \n", + "> Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious.\n", "\n", - "> Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious. \n", + "> Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious.\n", "\n", - "After discussing with Lili, `PEP.Quantity` is the fitting entity for each unique aggregated Peptide. Duplicated entries are just to drop" + "After discussing with Lili, `PEP.Quantity` is the fitting entity for\n", + "each unique aggregated Peptide. 
Duplicated entries are just to drop" ] }, { @@ -1490,7 +1494,7 @@ "metadata": {}, "outputs": [], "source": [ - "sel_cols = ['Sample ID', 'PEP.StrippedSequence', VAR_PEP] # selected quantity in last position\n", + "sel_cols = ['Sample ID', 'PEP.StrippedSequence', VAR_PEP] # selected quantity in last position\n", "df = df.reset_index()[sel_cols].drop_duplicates().set_index(sel_cols[:2]).squeeze()\n", "df" ] @@ -1513,7 +1517,7 @@ "metadata": {}, "source": [ "Select entry with maximum intensity of `duplicated entries`\n", - " \n", + "\n", "> change of variable and many duplicates -> could be PSM table? (close to evidence?)" ] }, @@ -1535,7 +1539,9 @@ "metadata": {}, "outputs": [], "source": [ - "df = vaep.pandas.select_max_by(df=df.reset_index(), grouping_columns=sel_cols[:-1], selection_column=sel_cols[-1]).set_index(sel_cols[:-1])" + "df = vaep.pandas.select_max_by(df=df.reset_index(),\n", + " grouping_columns=sel_cols[:-1],\n", + " selection_column=sel_cols[-1]).set_index(sel_cols[:-1])" ] }, { @@ -1545,7 +1551,7 @@ "metadata": {}, "outputs": [], "source": [ - "assert df.index.duplicated(False).sum() == 0 , \"Still missing values\"" + "assert df.index.duplicated(False).sum() == 0, \"Still missing values\"" ] }, { @@ -1577,7 +1583,7 @@ "id": "529fa0e7-7ad4-4c72-91b1-d37587835ce5", "metadata": {}, "source": [ - "- rawfile metadata -> keep " + "- rawfile metadata -> keep" ] }, { @@ -1614,7 +1620,7 @@ "%%time\n", "# des_data = df.describe() unnecessary computation which take too long\n", "des_data = df.isna().sum().to_frame('count').T\n", - "des_data " + "des_data" ] }, { @@ -1630,7 +1636,8 @@ "id": "44616770-fcc2-4a97-86f4-e0eadc98bb7a", "metadata": {}, "source": [ - "For one raw file no metadata could be extracted (`ERROR: Unable to access the RAW file using the native Thermo library.`)" + "For one raw file no metadata could be extracted (`ERROR: Unable to\n", + "access the RAW file using the native Thermo library.`)" ] }, { @@ -1777,7 +1784,7 @@ "metadata": {}, 
"outputs": [], "source": [ - "id_mappings = [\"PG.ProteinAccessions\", \"PG.Genes\"]\n", + "id_mappings = [\"PG.ProteinAccessions\", \"PG.Genes\"]\n", "id_mappings = meta[id_mappings].drop_duplicates()\n", "id_mappings.to_csv(folder_data_out / 'ald_liver_proteinGroups_id_mappings.csv')\n", "id_mappings" @@ -1802,7 +1809,7 @@ "outputs": [], "source": [ "column_types = ['.'.join(x for x in tup) for tup in list(column_types.unique())]\n", - "column_types # 'PG.Quantity' expected" + "column_types # 'PG.Quantity' expected" ] }, { @@ -1900,7 +1907,7 @@ "metadata": {}, "outputs": [], "source": [ - "sel_cols = ['PG.ProteinAccessions', 'PG.Genes', 'Sample ID', VAR_PG] # last one gives quantity\n", + "sel_cols = ['PG.ProteinAccessions', 'PG.Genes', 'Sample ID', VAR_PG] # last one gives quantity\n", "df = df.reset_index()[sel_cols].drop_duplicates().set_index(sel_cols[:-1])" ] }, diff --git a/project/10_0_ald_data.py b/project/10_0_ald_data.py index f0c54f0d7..1e179b55e 100644 --- a/project/10_0_ald_data.py +++ b/project/10_0_ald_data.py @@ -6,7 +6,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -38,14 +38,14 @@ print(*(folder_data.iterdir()), sep='\n') fnames = dict( -plasma_proteinGroups = folder_data / 'Protein_ALDupgrade_Report.csv', -plasma_aggPeptides = folder_data / 'ald_proteome_spectronaut.tsv', -liver_proteinGroups = folder_data / 'Protein_20200221_121354_20200218_ALD_LiverTissue_PlateS1_Atlaslib_Report.csv', -liver_aggPeptides = folder_data / 'Peptide_20220819_100847_20200218_ALD_LiverTissue_PlateS1_Atlaslib_Report.csv', -annotations = folder_data / 'ald_experiment_annotations.csv', -clinic = folder_data / 'labtest_integrated_numeric.csv', -raw_meta = folder_data / 'ald_metadata_rawfiles.csv') -fnames =vaep.nb.Config.from_dict(fnames) # could be handeled kwargs as in normal dict + plasma_proteinGroups=folder_data / 
'Protein_ALDupgrade_Report.csv', + plasma_aggPeptides=folder_data / 'ald_proteome_spectronaut.tsv', + liver_proteinGroups=folder_data / 'Protein_20200221_121354_20200218_ALD_LiverTissue_PlateS1_Atlaslib_Report.csv', + liver_aggPeptides=folder_data / 'Peptide_20220819_100847_20200218_ALD_LiverTissue_PlateS1_Atlaslib_Report.csv', + annotations=folder_data / 'ald_experiment_annotations.csv', + clinic=folder_data / 'labtest_integrated_numeric.csv', + raw_meta=folder_data / 'ald_metadata_rawfiles.csv') +fnames = vaep.nb.Config.from_dict(fnames) # could be handeled kwargs as in normal dict # %% @@ -77,7 +77,7 @@ annotations # %% -annotations['Participant ID'].value_counts().value_counts() # some only have a blood sample, some both +annotations['Participant ID'].value_counts().value_counts() # some only have a blood sample, some both # %% [markdown] # ### Select ALD subcohort @@ -147,7 +147,7 @@ clinic["kleiner"].value_counts() # %% -clinic.loc[idx_overlap_plasma].to_csv(folder_data_out /'ald_metadata_cli.csv') +clinic.loc[idx_overlap_plasma].to_csv(folder_data_out / 'ald_metadata_cli.csv') # %% [markdown] # ## Rawfile information @@ -212,7 +212,7 @@ # > see section below # %% [markdown] -# ## (Aggregated) Peptide Data +# ## (Aggregated) Peptide Data # %% df = pd.read_table(fnames.plasma_aggPeptides, low_memory=False) @@ -263,7 +263,7 @@ meta.describe(include='all') # %% -id_mappings = ["PEP.StrippedSequence", "PG.ProteinAccessions", "PG.Genes"] +id_mappings = ["PEP.StrippedSequence", "PG.ProteinAccessions", "PG.Genes"] id_mappings = meta[id_mappings].drop_duplicates() id_mappings.to_csv(folder_data_out / 'ald_plasma_aggPeptides_id_mappings.csv') id_mappings @@ -273,28 +273,29 @@ # # taken from [Spectronaut manuel](https://biognosys.com/resources/spectronaut-manual/) # -# feature | description +# feature | description # --- | --- # PEP.IsProteinGroupSpecific | True or False. Tells you whether the peptide only belongs to one Protein Group. 
# PEP.StrippedSequence | - # PEP.IsProteotypic | - # PEP.PeptidePosition | - -# PG.Cscore | - +# PG.Cscore | - # PG.ProteinAccessions | - -# PG.Genes | - +# PG.Genes | - # PEP.Quantity | The quantitative value for that peptide as defined in the settings. -# EG.PrecursorId | Unique Id for the precursor: [modified sequence] plus [charge] +# EG.PrecursorId | Unique Id for the precursor: [modified sequence] plus [charge] # EG.Qvalue | The q-value (FDR) of the EG. -# EG.TotalQuantity (Settings) | The quantitative value for that EG as defined in the settings. +# EG.TotalQuantity (Settings) | The quantitative value for that EG as defined in the settings. # -# > Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious. +# > Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious. # -# > Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious. +# > Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious. # -# After discussing with Lili, `PEP.Quantity` is the fitting entity for each unique aggregated Peptide. Duplicated entries are just to drop +# After discussing with Lili, `PEP.Quantity` is the fitting entity for +# each unique aggregated Peptide. 
Duplicated entries are just to drop # %% -sel_cols = ['Sample ID', 'PEP.StrippedSequence', 'PEP.Quantity'] # selected quantity in last position +sel_cols = ['Sample ID', 'PEP.StrippedSequence', 'PEP.Quantity'] # selected quantity in last position df = df.reset_index()[sel_cols].drop_duplicates().set_index(sel_cols[:2]) df @@ -313,7 +314,7 @@ idx.describe() # %% [markdown] -# - rawfile metadata -> keep +# - rawfile metadata -> keep # %% df = df.set_index(idx) @@ -330,7 +331,8 @@ # ### Check for metadata from rawfile overlap # %% [markdown] -# For one raw file no metadata could be extracted (`ERROR: Unable to access the RAW file using the native Thermo library.`) +# For one raw file no metadata could be extracted (`ERROR: Unable to +# access the RAW file using the native Thermo library.`) # %% idx_diff = df.index.difference(raw_meta.index) @@ -390,7 +392,7 @@ meta.describe(include='all') # %% -id_mappings = ["PG.ProteinAccessions", "PG.Genes"] +id_mappings = ["PG.ProteinAccessions", "PG.Genes"] id_mappings = meta[id_mappings].drop_duplicates() id_mappings.to_csv(folder_data_out / 'ald_plasma_proteinGroups_id_mappings.csv', index=False) id_mappings @@ -401,7 +403,7 @@ # %% column_types = ['.'.join(x for x in tup) for tup in list(column_types.unique())] -column_types # 'PG.Quantity' expected +column_types # 'PG.Quantity' expected # %% df = df.set_index(list(df.columns[:N_FRIST_META])).sort_index(axis=1) @@ -412,11 +414,12 @@ # Drop index columns which are not selected # %% -def find_idx_to_drop(df:pd.DataFrame, idx_to_keep:list): - to_drop = [x for x in df.index.names if not x in idx_to_keep] +def find_idx_to_drop(df: pd.DataFrame, idx_to_keep: list): + to_drop = [x for x in df.index.names if x not in idx_to_keep] logger.info("Columnns to drop: {}".format(",".join((str(x) for x in to_drop)))) return to_drop - + + to_drop = find_idx_to_drop(df, idx_cols) df = df.reset_index(level=to_drop, drop=True) df.head() @@ -503,7 +506,7 @@ def find_idx_to_drop(df:pd.DataFrame, 
idx_to_keep:list): 'title': 'protein group measurement distribution'} ax = vaep.plotting.plot_counts(des_data.T.sort_values(by='count', ascending=False).reset_index( -), feat_col_name='count', n_samples=len(df), ax=None, min_feat_prop=.0,**kwargs) +), feat_col_name='count', n_samples=len(df), ax=None, min_feat_prop=.0, **kwargs) fig = ax.get_figure() fig.tight_layout() @@ -623,7 +626,7 @@ def find_idx_to_drop(df:pd.DataFrame, idx_to_keep:list): meta # %% -id_mappings = ["PEP.StrippedSequence", "PG.ProteinAccessions", "PG.Genes"] +id_mappings = ["PEP.StrippedSequence", "PG.ProteinAccessions", "PG.Genes"] id_mappings = meta[id_mappings].drop_duplicates() id_mappings.to_csv(folder_data_out / 'ald_liver_aggPeptides_id_mappings.csv') id_mappings @@ -634,28 +637,29 @@ def find_idx_to_drop(df:pd.DataFrame, idx_to_keep:list): # # taken from [Spectronaut manuel](https://biognosys.com/resources/spectronaut-manual/) # -# feature | description +# feature | description # --- | --- # PEP.IsProteinGroupSpecific | True or False. Tells you whether the peptide only belongs to one Protein Group. # PEP.StrippedSequence | - # PEP.IsProteotypic | - # PEP.PeptidePosition | - -# PG.Cscore | - +# PG.Cscore | - # PG.ProteinAccessions | - -# PG.Genes | - +# PG.Genes | - # PEP.Quantity | The quantitative value for that peptide as defined in the settings. -# EG.PrecursorId | Unique Id for the precursor: [modified sequence] plus [charge] +# EG.PrecursorId | Unique Id for the precursor: [modified sequence] plus [charge] # EG.Qvalue | The q-value (FDR) of the EG. -# EG.TotalQuantity (Settings) | The quantitative value for that EG as defined in the settings. +# EG.TotalQuantity (Settings) | The quantitative value for that EG as defined in the settings. # -# > Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious. 
+# > Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious. # -# > Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious. +# > Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious. # -# After discussing with Lili, `PEP.Quantity` is the fitting entity for each unique aggregated Peptide. Duplicated entries are just to drop +# After discussing with Lili, `PEP.Quantity` is the fitting entity for +# each unique aggregated Peptide. Duplicated entries are just to drop # %% -sel_cols = ['Sample ID', 'PEP.StrippedSequence', VAR_PEP] # selected quantity in last position +sel_cols = ['Sample ID', 'PEP.StrippedSequence', VAR_PEP] # selected quantity in last position df = df.reset_index()[sel_cols].drop_duplicates().set_index(sel_cols[:2]).squeeze() df @@ -666,7 +670,7 @@ def find_idx_to_drop(df:pd.DataFrame, idx_to_keep:list): # %% [markdown] # Select entry with maximum intensity of `duplicated entries` -# +# # > change of variable and many duplicates -> could be PSM table? (close to evidence?) 
# %% @@ -674,10 +678,12 @@ def find_idx_to_drop(df:pd.DataFrame, idx_to_keep:list): df.loc[mask_idx_duplicated].sort_index() # %% -df = vaep.pandas.select_max_by(df=df.reset_index(), grouping_columns=sel_cols[:-1], selection_column=sel_cols[-1]).set_index(sel_cols[:-1]) +df = vaep.pandas.select_max_by(df=df.reset_index(), + grouping_columns=sel_cols[:-1], + selection_column=sel_cols[-1]).set_index(sel_cols[:-1]) # %% -assert df.index.duplicated(False).sum() == 0 , "Still missing values" +assert df.index.duplicated(False).sum() == 0, "Still missing values" # %% df = df.unstack() @@ -690,7 +696,7 @@ def find_idx_to_drop(df:pd.DataFrame, idx_to_keep:list): idx.describe() # %% [markdown] -# - rawfile metadata -> keep +# - rawfile metadata -> keep # %% df = df.set_index(idx) @@ -706,13 +712,14 @@ def find_idx_to_drop(df:pd.DataFrame, idx_to_keep:list): # %%time # des_data = df.describe() unnecessary computation which take too long des_data = df.isna().sum().to_frame('count').T -des_data +des_data # %% [markdown] # ### Check for metadata from rawfile overlap # %% [markdown] -# For one raw file no metadata could be extracted (`ERROR: Unable to access the RAW file using the native Thermo library.`) +# For one raw file no metadata could be extracted (`ERROR: Unable to +# access the RAW file using the native Thermo library.`) # %% # idx_diff = df.index.difference(raw_meta.index) @@ -773,7 +780,7 @@ def find_idx_to_drop(df:pd.DataFrame, idx_to_keep:list): meta.describe(include='all') # %% -id_mappings = ["PG.ProteinAccessions", "PG.Genes"] +id_mappings = ["PG.ProteinAccessions", "PG.Genes"] id_mappings = meta[id_mappings].drop_duplicates() id_mappings.to_csv(folder_data_out / 'ald_liver_proteinGroups_id_mappings.csv') id_mappings @@ -784,7 +791,7 @@ def find_idx_to_drop(df:pd.DataFrame, idx_to_keep:list): # %% column_types = ['.'.join(x for x in tup) for tup in list(column_types.unique())] -column_types # 'PG.Quantity' expected +column_types # 'PG.Quantity' expected # %% df = 
df.set_index(list(df.columns[:N_FRIST_META])).sort_index(axis=1) @@ -824,7 +831,7 @@ def find_idx_to_drop(df:pd.DataFrame, idx_to_keep:list): df # %% -sel_cols = ['PG.ProteinAccessions', 'PG.Genes', 'Sample ID', VAR_PG] # last one gives quantity +sel_cols = ['PG.ProteinAccessions', 'PG.Genes', 'Sample ID', VAR_PG] # last one gives quantity df = df.reset_index()[sel_cols].drop_duplicates().set_index(sel_cols[:-1]) # %% diff --git a/project/10_0_ald_data_3v3.py b/project/10_0_ald_data_3v3.py new file mode 100644 index 000000000..3d75c14c4 --- /dev/null +++ b/project/10_0_ald_data_3v3.py @@ -0,0 +1,31 @@ +# %% +from pathlib import Path +import pandas as pd + +# %% +FN_INTENSITIES = "data/ALD_study/processed/ald_plasma_proteinGroups.pkl" +fn_clinical_data = "data/ALD_study/processed/ald_metadata_cli.csv" + +FN_INTENSITIES = Path(FN_INTENSITIES) + +# %% +df = pd.read_pickle(FN_INTENSITIES) +df + +# %% +meta = pd.read_csv(fn_clinical_data, index_col=0) +meta + +# %% +sel = pd.concat( + [df.loc[meta['kleiner'] == 0].sample(3), + df.loc[meta['kleiner'] == 4].sample(3), + ]) +sel + +# %% +fname = FN_INTENSITIES.parent / f'{FN_INTENSITIES.stem}_3v3.pkl' +sel.to_pickle(fname) +fname.as_posix() + +# %% diff --git a/project/10_1_ald_diff_analysis.ipynb b/project/10_1_ald_diff_analysis.ipynb index 59b6d06ae..b25564941 100644 --- a/project/10_1_ald_diff_analysis.ipynb +++ b/project/10_1_ald_diff_analysis.ipynb @@ -8,7 +8,7 @@ "\n", "- load missing values predictions\n", "- leave all other values as they were\n", - "- compare missing values predicition by model with baseline method \n", + "- compare missing values predicition by model with baseline method\n", " (default: draw from shifted normal distribution. 
short RSN)" ] }, @@ -18,8 +18,10 @@ "metadata": {}, "outputs": [], "source": [ + "import logging\n", "from pathlib import Path\n", "import matplotlib.pyplot as plt\n", + "from IPython.display import display\n", "\n", "import pandas as pd\n", "\n", @@ -31,7 +33,8 @@ "\n", "import vaep.nb\n", "\n", - "logger = vaep.logging.setup_nb_logger()" + "logger = vaep.logging.setup_nb_logger()\n", + "logging.getLogger('fontTools').setLevel(logging.WARNING)" ] }, { @@ -66,19 +69,19 @@ "folder_experiment = \"runs/appl_ald_data/plasma/proteinGroups\"\n", "folder_data: str = '' # specify data directory if needed\n", "fn_clinical_data = \"data/ALD_study/processed/ald_metadata_cli.csv\"\n", - "fn_qc_samples = '' #'data/ALD_study/processed/qc_plasma_proteinGroups.pkl'\n", + "fn_qc_samples = '' # 'data/ALD_study/processed/qc_plasma_proteinGroups.pkl'\n", "f_annotations = 'data/ALD_study/processed/ald_plasma_proteinGroups_id_mappings.csv'\n", "\n", "\n", "target: str = 'kleiner'\n", - "covar:str = 'age,bmi,gender_num,nas_steatosis_ordinal,abstinent_num'\n", + "covar: str = 'age,bmi,gender_num,nas_steatosis_ordinal,abstinent_num'\n", "\n", "file_format = \"csv\"\n", - "model_key = 'VAE' # model(s) to evaluate\n", - "model = None # default same as model_key, but could be overwritten (edge case)\n", - "value_name='intensity'\n", - "out_folder='diff_analysis'\n", - "template_pred = 'pred_real_na_{}.csv' # fixed, do not change" + "model_key = 'VAE' # model(s) to evaluate\n", + "model = None # default same as model_key, but could be overwritten (edge case)\n", + "value_name = 'intensity'\n", + "out_folder = 'diff_analysis'\n", + "template_pred = 'pred_real_na_{}.csv' # fixed, do not change" ] }, { @@ -180,7 +183,7 @@ "source": [ "df_clinic = pd.read_csv(args.fn_clinical_data, index_col=0)\n", "df_clinic = df_clinic.loc[observed.index.levels[0]]\n", - "cols_clinic = vaep.pandas.get_columns_accessor(df_clinic) # pick Berlin as reference?\n", + "cols_clinic = 
vaep.pandas.get_columns_accessor(df_clinic)\n", "df_clinic[[args.target, *args.covar]].describe()" ] }, @@ -302,7 +305,7 @@ "logger.info(fname)\n", "feat_freq_observed.to_csv(fname)\n", "ax = feat_freq_observed.sort_values().plot(marker='.', rot=90)\n", - "_ = ax.set_xticklabels([l.get_text().split(';')[0] for l in ax.get_xticklabels()])" + "_ = ax.set_xticklabels([l_.get_text().split(';')[0] for l_ in ax.get_xticklabels()])" ] }, { @@ -321,7 +324,7 @@ "DATA_COMPLETENESS = 0.6\n", "# MIN_N_PROTEIN_GROUPS: int = 200\n", "FRAC_PROTEIN_GROUPS: int = 0.622\n", - "CV_QC_SAMPLE: float = 0.4 # Coef. of variation on 13 QC samples\n", + "CV_QC_SAMPLE: float = 0.4 # Coef. of variation on 13 QC samples\n", "\n", "ald_study, cutoffs = vaep.analyzers.diff_analysis.select_raw_data(observed.unstack(\n", "), data_completeness=DATA_COMPLETENESS, frac_protein_groups=FRAC_PROTEIN_GROUPS)\n", @@ -340,14 +343,14 @@ " qc_samples = pd.read_pickle(args.fn_qc_samples)\n", " qc_cv_feat = qc_samples.std() / qc_samples.mean()\n", " qc_cv_feat = qc_cv_feat.rename(qc_samples.columns.name)\n", - " fig, ax = plt.subplots(figsize=(4,7))\n", + " fig, ax = plt.subplots(figsize=(4, 7))\n", " ax = qc_cv_feat.plot.box(ax=ax)\n", " ax.set_ylabel('Coefficient of Variation')\n", " vaep.savefig(fig, name='cv_qc_samples', folder=args.out_figures)\n", " print((qc_cv_feat < CV_QC_SAMPLE).value_counts())\n", " # only to ald_study data\n", " ald_study = ald_study[vaep.analyzers.diff_analysis.select_feat(qc_samples[ald_study.columns])]\n", - " \n", + "\n", "ald_study" ] }, @@ -360,8 +363,8 @@ "outputs": [], "source": [ "fig, axes = vaep.plotting.plot_cutoffs(observed.unstack(),\n", - " feat_completness_over_samples=cutoffs.feat_completness_over_samples,\n", - " min_feat_in_sample=cutoffs.min_feat_in_sample)\n", + " feat_completness_over_samples=cutoffs.feat_completness_over_samples,\n", + " min_feat_in_sample=cutoffs.min_feat_in_sample)\n", "vaep.savefig(fig, name='tresholds_normal_imputation', 
folder=args.out_figures)" ] }, @@ -399,7 +402,7 @@ "outputs": [], "source": [ "fname = args.out_preds / args.template_pred.format(args.model)\n", - "fname " + "fname" ] }, { @@ -462,11 +465,11 @@ "def plot_distributions(observed: pd.Series,\n", " imputation: pd.Series = None,\n", " model_key: str = 'MODEL',\n", - " figsize=(4,3),\n", + " figsize=(4, 3),\n", " sharex=True):\n", " \"\"\"Plots distributions of intensities provided as dictionary of labels to pd.Series.\"\"\"\n", " series_ = [observed, imputation] if imputation is not None else [observed]\n", - " min_bin, max_bin = vaep.plotting.data.get_min_max_iterable(series_)\n", + " min_bin, max_bin = vaep.plotting.data.get_min_max_iterable([observed])\n", "\n", " if imputation is not None:\n", " fig, axes = plt.subplots(len(series_), figsize=figsize, sharex=sharex)\n", @@ -474,8 +477,8 @@ " else:\n", " fig, ax = plt.subplots(1, figsize=figsize, sharex=sharex)\n", "\n", - " bins = range(min_bin, max_bin+1, 1)\n", - " \n", + " bins = range(min_bin, max_bin + 1, 1)\n", + "\n", " label = 'observed measurments'\n", " ax = observed.hist(ax=ax, bins=bins, color='grey')\n", " ax.set_title(f'{label} (N={len(observed):,d})')\n", @@ -483,14 +486,13 @@ " ax.locator_params(axis='y', integer=True)\n", " ax.yaxis.set_major_formatter(\"{x:,.0f}\")\n", "\n", - "\n", " if imputation is not None:\n", " ax = axes[1]\n", " label = f'Missing values imputed using {model_key.upper()}'\n", " color = vaep.plotting.defaults.color_model_mapping.get(model_key, None)\n", " if color is None:\n", " color = f'C{1}'\n", - " ax = imputation.hist(ax=ax,bins=bins, color=color)\n", + " ax = imputation.hist(ax=ax, bins=bins, color=color)\n", " ax.set_title(f'{label} (N={len(imputation):,d})')\n", " ax.set_ylabel('observations')\n", " ax.locator_params(axis='y', integer=True)\n", @@ -498,7 +500,7 @@ " return fig\n", "\n", "\n", - "vaep.plotting.make_large_descriptors(5)\n", + "vaep.plotting.make_large_descriptors(6)\n", "fig = 
plot_distributions(observed,\n", " imputation=pred_real_na,\n", " model_key=args.model_key, figsize=(2.5, 2))\n", @@ -522,7 +524,7 @@ "source": [ "if pred_real_na is not None:\n", " shifts = (vaep.imputation.compute_moments_shift(observed, pred_real_na,\n", - " names=('observed', args.model_key)))\n", + " names=('observed', args.model_key)))\n", " display(pd.DataFrame(shifts).T)" ] }, @@ -543,8 +545,8 @@ " index_level = 0 # per sample\n", " mean_by_sample = pd.DataFrame(\n", " {'observed': vaep.imputation.stats_by_level(observed, index_level=index_level),\n", - " args.model_key: vaep.imputation.stats_by_level(pred_real_na, index_level=index_level)\n", - " })\n", + " args.model_key: vaep.imputation.stats_by_level(pred_real_na, index_level=index_level)\n", + " })\n", " mean_by_sample.loc['mean_shift'] = (mean_by_sample.loc['mean', 'observed'] -\n", " mean_by_sample.loc['mean']).abs() / mean_by_sample.loc['std', 'observed']\n", " mean_by_sample.loc['std shrinkage'] = mean_by_sample.loc['std'] / \\\n", @@ -602,10 +604,10 @@ "outputs": [], "source": [ "scores = vaep.stats.diff_analysis.analyze(df_proteomics=df,\n", - " df_clinic=df_clinic,\n", - " target=args.target,\n", - " covar=args.covar,\n", - " value_name=args.value_name)\n", + " df_clinic=df_clinic,\n", + " target=args.target,\n", + " covar=args.covar,\n", + " value_name=args.value_name)\n", "\n", "scores" ] @@ -623,7 +625,7 @@ " scores = (scores\n", " .join(gene_to_PG)\n", " .set_index(gene_to_PG.columns.to_list(), append=True)\n", - " )\n", + " )\n", "scores" ] }, @@ -648,7 +650,7 @@ }, "outputs": [], "source": [ - "fname = args.out_folder/ 'scores' / f'diff_analysis_scores_{str(args.model_key)}.pkl'\n", + "fname = args.out_folder / 'scores' / f'diff_analysis_scores_{str(args.model_key)}.pkl'\n", "files_out[fname.name] = fname.as_posix()\n", "fname.parent.mkdir(exist_ok=True, parents=True)\n", "scores.to_pickle(fname)\n", diff --git a/project/10_1_ald_diff_analysis.py b/project/10_1_ald_diff_analysis.py index 
cc5c3b6bb..1258de220 100644 --- a/project/10_1_ald_diff_analysis.py +++ b/project/10_1_ald_diff_analysis.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -17,12 +17,14 @@ # # - load missing values predictions # - leave all other values as they were -# - compare missing values predicition by model with baseline method +# - compare missing values predicition by model with baseline method # (default: draw from shifted normal distribution. short RSN) # %% +import logging from pathlib import Path import matplotlib.pyplot as plt +from IPython.display import display import pandas as pd @@ -35,6 +37,7 @@ import vaep.nb logger = vaep.logging.setup_nb_logger() +logging.getLogger('fontTools').setLevel(logging.WARNING) # %% # catch passed parameters @@ -48,19 +51,19 @@ folder_experiment = "runs/appl_ald_data/plasma/proteinGroups" folder_data: str = '' # specify data directory if needed fn_clinical_data = "data/ALD_study/processed/ald_metadata_cli.csv" -fn_qc_samples = '' #'data/ALD_study/processed/qc_plasma_proteinGroups.pkl' +fn_qc_samples = '' # 'data/ALD_study/processed/qc_plasma_proteinGroups.pkl' f_annotations = 'data/ALD_study/processed/ald_plasma_proteinGroups_id_mappings.csv' target: str = 'kleiner' -covar:str = 'age,bmi,gender_num,nas_steatosis_ordinal,abstinent_num' +covar: str = 'age,bmi,gender_num,nas_steatosis_ordinal,abstinent_num' file_format = "csv" -model_key = 'VAE' # model(s) to evaluate -model = None # default same as model_key, but could be overwritten (edge case) -value_name='intensity' -out_folder='diff_analysis' -template_pred = 'pred_real_na_{}.csv' # fixed, do not change +model_key = 'VAE' # model(s) to evaluate +model = None # default same as model_key, but could be overwritten (edge case) +value_name = 'intensity' +out_folder = 'diff_analysis' +template_pred = 'pred_real_na_{}.csv' # fixed, do not 
change # %% @@ -109,7 +112,7 @@ # %% df_clinic = pd.read_csv(args.fn_clinical_data, index_col=0) df_clinic = df_clinic.loc[observed.index.levels[0]] -cols_clinic = vaep.pandas.get_columns_accessor(df_clinic) # pick Berlin as reference? +cols_clinic = vaep.pandas.get_columns_accessor(df_clinic) df_clinic[[args.target, *args.covar]].describe() @@ -174,7 +177,7 @@ logger.info(fname) feat_freq_observed.to_csv(fname) ax = feat_freq_observed.sort_values().plot(marker='.', rot=90) -_ = ax.set_xticklabels([l.get_text().split(';')[0] for l in ax.get_xticklabels()]) +_ = ax.set_xticklabels([l_.get_text().split(';')[0] for l_ in ax.get_xticklabels()]) # %% [markdown] # ## ALD study approach using all measurments @@ -183,7 +186,7 @@ DATA_COMPLETENESS = 0.6 # MIN_N_PROTEIN_GROUPS: int = 200 FRAC_PROTEIN_GROUPS: int = 0.622 -CV_QC_SAMPLE: float = 0.4 # Coef. of variation on 13 QC samples +CV_QC_SAMPLE: float = 0.4 # Coef. of variation on 13 QC samples ald_study, cutoffs = vaep.analyzers.diff_analysis.select_raw_data(observed.unstack( ), data_completeness=DATA_COMPLETENESS, frac_protein_groups=FRAC_PROTEIN_GROUPS) @@ -196,20 +199,20 @@ qc_samples = pd.read_pickle(args.fn_qc_samples) qc_cv_feat = qc_samples.std() / qc_samples.mean() qc_cv_feat = qc_cv_feat.rename(qc_samples.columns.name) - fig, ax = plt.subplots(figsize=(4,7)) + fig, ax = plt.subplots(figsize=(4, 7)) ax = qc_cv_feat.plot.box(ax=ax) ax.set_ylabel('Coefficient of Variation') vaep.savefig(fig, name='cv_qc_samples', folder=args.out_figures) print((qc_cv_feat < CV_QC_SAMPLE).value_counts()) # only to ald_study data ald_study = ald_study[vaep.analyzers.diff_analysis.select_feat(qc_samples[ald_study.columns])] - + ald_study # %% fig, axes = vaep.plotting.plot_cutoffs(observed.unstack(), - feat_completness_over_samples=cutoffs.feat_completness_over_samples, - min_feat_in_sample=cutoffs.min_feat_in_sample) + feat_completness_over_samples=cutoffs.feat_completness_over_samples, + 
min_feat_in_sample=cutoffs.min_feat_in_sample) vaep.savefig(fig, name='tresholds_normal_imputation', folder=args.out_figures) @@ -225,7 +228,7 @@ # %% fname = args.out_preds / args.template_pred.format(args.model) -fname +fname # %% [markdown] # Baseline comparison @@ -262,11 +265,11 @@ def plot_distributions(observed: pd.Series, imputation: pd.Series = None, model_key: str = 'MODEL', - figsize=(4,3), + figsize=(4, 3), sharex=True): """Plots distributions of intensities provided as dictionary of labels to pd.Series.""" series_ = [observed, imputation] if imputation is not None else [observed] - min_bin, max_bin = vaep.plotting.data.get_min_max_iterable(series_) + min_bin, max_bin = vaep.plotting.data.get_min_max_iterable([observed]) if imputation is not None: fig, axes = plt.subplots(len(series_), figsize=figsize, sharex=sharex) @@ -274,8 +277,8 @@ def plot_distributions(observed: pd.Series, else: fig, ax = plt.subplots(1, figsize=figsize, sharex=sharex) - bins = range(min_bin, max_bin+1, 1) - + bins = range(min_bin, max_bin + 1, 1) + label = 'observed measurments' ax = observed.hist(ax=ax, bins=bins, color='grey') ax.set_title(f'{label} (N={len(observed):,d})') @@ -283,14 +286,13 @@ def plot_distributions(observed: pd.Series, ax.locator_params(axis='y', integer=True) ax.yaxis.set_major_formatter("{x:,.0f}") - if imputation is not None: ax = axes[1] label = f'Missing values imputed using {model_key.upper()}' color = vaep.plotting.defaults.color_model_mapping.get(model_key, None) if color is None: color = f'C{1}' - ax = imputation.hist(ax=ax,bins=bins, color=color) + ax = imputation.hist(ax=ax, bins=bins, color=color) ax.set_title(f'{label} (N={len(imputation):,d})') ax.set_ylabel('observations') ax.locator_params(axis='y', integer=True) @@ -298,7 +300,7 @@ def plot_distributions(observed: pd.Series, return fig -vaep.plotting.make_large_descriptors(5) +vaep.plotting.make_large_descriptors(6) fig = plot_distributions(observed, imputation=pred_real_na, 
model_key=args.model_key, figsize=(2.5, 2)) @@ -312,7 +314,7 @@ def plot_distributions(observed: pd.Series, # %% if pred_real_na is not None: shifts = (vaep.imputation.compute_moments_shift(observed, pred_real_na, - names=('observed', args.model_key))) + names=('observed', args.model_key))) display(pd.DataFrame(shifts).T) # %% [markdown] @@ -323,8 +325,8 @@ def plot_distributions(observed: pd.Series, index_level = 0 # per sample mean_by_sample = pd.DataFrame( {'observed': vaep.imputation.stats_by_level(observed, index_level=index_level), - args.model_key: vaep.imputation.stats_by_level(pred_real_na, index_level=index_level) - }) + args.model_key: vaep.imputation.stats_by_level(pred_real_na, index_level=index_level) + }) mean_by_sample.loc['mean_shift'] = (mean_by_sample.loc['mean', 'observed'] - mean_by_sample.loc['mean']).abs() / mean_by_sample.loc['std', 'observed'] mean_by_sample.loc['std shrinkage'] = mean_by_sample.loc['std'] / \ @@ -352,10 +354,10 @@ def plot_distributions(observed: pd.Series, # Targets - Clinical variables # %% scores = vaep.stats.diff_analysis.analyze(df_proteomics=df, - df_clinic=df_clinic, - target=args.target, - covar=args.covar, - value_name=args.value_name) + df_clinic=df_clinic, + target=args.target, + covar=args.covar, + value_name=args.value_name) scores @@ -366,7 +368,7 @@ def plot_distributions(observed: pd.Series, scores = (scores .join(gene_to_PG) .set_index(gene_to_PG.columns.to_list(), append=True) - ) + ) scores # %% @@ -376,7 +378,7 @@ def plot_distributions(observed: pd.Series, # %% -fname = args.out_folder/ 'scores' / f'diff_analysis_scores_{str(args.model_key)}.pkl' +fname = args.out_folder / 'scores' / f'diff_analysis_scores_{str(args.model_key)}.pkl' files_out[fname.name] = fname.as_posix() fname.parent.mkdir(exist_ok=True, parents=True) scores.to_pickle(fname) diff --git a/project/10_2_ald_compare_methods.ipynb b/project/10_2_ald_compare_methods.ipynb index e10ecbc66..68ad9f949 100644 --- 
a/project/10_2_ald_compare_methods.ipynb +++ b/project/10_2_ald_compare_methods.ipynb @@ -28,7 +28,7 @@ "logger = vaep.logging.setup_nb_logger()\n", "\n", "plt.rcParams['figure.figsize'] = (2, 2)\n", - "fontsize= 5\n", + "fontsize = 5\n", "vaep.plotting.make_large_descriptors(fontsize)" ] }, @@ -121,7 +121,7 @@ "source": [ "files_in = {\n", " 'freq_features_observed.csv': args.folder_experiment / 'freq_features_observed.csv',\n", - " }\n", + "}\n", "files_in" ] }, @@ -152,7 +152,7 @@ "source": [ "writer_args = dict(float_format='%.3f')\n", "\n", - "fname = args.out_folder / 'diff_analysis_compare_methods.xlsx'\n", + "fname = args.out_folder / 'diff_analysis_compare_methods.xlsx'\n", "files_out[fname.name] = fname\n", "writer = pd.ExcelWriter(fname)\n", "fname" @@ -163,7 +163,7 @@ "id": "770d1f76-e86f-4ae3-9d7b-ceef9b9e9a22", "metadata": {}, "source": [ - "# Load scores " + "# Load scores" ] }, { @@ -183,7 +183,7 @@ "metadata": {}, "outputs": [], "source": [ - "fname =args.scores_folder / f'diff_analysis_scores_{args.baseline}.pkl'\n", + "fname = args.scores_folder / f'diff_analysis_scores_{args.baseline}.pkl'\n", "scores_baseline = pd.read_pickle(fname)\n", "scores_baseline" ] @@ -259,11 +259,12 @@ "cell_type": "code", "execution_count": null, "id": "53bd5597-221c-4d54-abf2-82956db42594", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [], "source": [ - "scores.describe(include=['bool', 'O'])\n", - "\n" + "scores.describe(include=['bool', 'O'])" ] }, { @@ -305,8 +306,8 @@ " .dropna()\n", " .reset_index(-1, drop=True)\n", " ).join(\n", - " freq_feat, how='left'\n", - " )\n", + " freq_feat, how='left'\n", + ")\n", "scores_common" ] }, @@ -323,7 +324,7 @@ "\n", "annotations = None\n", "for model, model_column in models.items():\n", - " if not annotations is None:\n", + " if annotations is not None:\n", " annotations += ' - '\n", " annotations += annotate_decision(scores_common,\n", " model=model, model_column=model_column)\n", @@ -395,7 
+396,7 @@ "outputs": [], "source": [ "# should it be possible to run not only RSN?\n", - "to_plot['diff_qvalue'] = (to_plot[str(args.baseline)] - to_plot[str(args.model_key)]).abs()\n", + "to_plot['diff_qvalue'] = (to_plot[str(args.baseline)] - to_plot[str(args.model_key)]).abs()\n", "to_plot.loc[mask_different].sort_values('diff_qvalue', ascending=False)" ] }, @@ -430,7 +431,7 @@ "_ = ax.legend(fontsize=fontsize,\n", " title_fontsize=fontsize,\n", " markerscale=0.4,\n", - " title='',\n", + " title='',\n", " )\n", "ax.set_xlabel(f\"qvalue for {x_col}\")\n", "ax.set_ylabel(f\"qvalue for {y_col}\")\n", @@ -466,7 +467,7 @@ " y=to_plot.columns[1],\n", " size='frequency',\n", " s=size,\n", - " sizes=(5,20),\n", + " sizes=(5, 20),\n", " hue='Differential Analysis Comparison')\n", "_ = ax.legend(fontsize=fontsize,\n", " title_fontsize=fontsize,\n", @@ -504,7 +505,7 @@ " .loc[\n", " scores_model_only.index.difference(\n", " scores_common.index),\n", - " args.model_key]\n", + " args.model_key]\n", " .sort_values(by='qvalue', ascending=True)\n", " .join(freq_feat)\n", " )\n", @@ -592,7 +593,7 @@ "metadata": {}, "outputs": [], "source": [ - "feat_name = scores.index.names[0] # first index level is feature name\n", + "feat_name = scores.index.names[0] # first index level is feature name\n", "if args.annotaitons_gene_col in scores.index.names:\n", " logger.info(f\"Found gene annotation in scores index: {scores.index.names}\")\n", "else:\n", @@ -610,13 +611,13 @@ "outputs": [], "source": [ "gene_to_PG = (scores.droplevel(\n", - " list(set(scores.index.names) - {feat_name, args.annotaitons_gene_col})\n", - " )\n", - " .index\n", - " .to_frame()\n", - " .reset_index(drop=True)\n", - " .set_index(args.annotaitons_gene_col)\n", - " )\n", + " list(set(scores.index.names) - {feat_name, args.annotaitons_gene_col})\n", + ")\n", + " .index\n", + " .to_frame()\n", + " .reset_index(drop=True)\n", + " .set_index(args.annotaitons_gene_col)\n", + ")\n", "gene_to_PG.head()" ] }, diff --git 
a/project/10_2_ald_compare_methods.py b/project/10_2_ald_compare_methods.py index 5eac1e1ba..55037a3de 100644 --- a/project/10_2_ald_compare_methods.py +++ b/project/10_2_ald_compare_methods.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -29,7 +29,7 @@ logger = vaep.logging.setup_nb_logger() plt.rcParams['figure.figsize'] = (2, 2) -fontsize= 5 +fontsize = 5 vaep.plotting.make_large_descriptors(fontsize) # %% @@ -76,7 +76,7 @@ # %% files_in = { 'freq_features_observed.csv': args.folder_experiment / 'freq_features_observed.csv', - } +} files_in # %% [markdown] @@ -88,19 +88,19 @@ # %% writer_args = dict(float_format='%.3f') -fname = args.out_folder / 'diff_analysis_compare_methods.xlsx' +fname = args.out_folder / 'diff_analysis_compare_methods.xlsx' files_out[fname.name] = fname writer = pd.ExcelWriter(fname) fname # %% [markdown] -# # Load scores +# # Load scores # %% [x for x in args.scores_folder.iterdir() if 'scores' in str(x)] # %% -fname =args.scores_folder / f'diff_analysis_scores_{args.baseline}.pkl' +fname = args.scores_folder / f'diff_analysis_scores_{args.baseline}.pkl' scores_baseline = pd.read_pickle(fname) scores_baseline @@ -133,7 +133,6 @@ scores.describe(include=['bool', 'O']) - # %% [markdown] # ## Load frequencies of observed features @@ -150,8 +149,8 @@ .dropna() .reset_index(-1, drop=True) ).join( - freq_feat, how='left' - ) + freq_feat, how='left' +) scores_common @@ -162,7 +161,7 @@ def annotate_decision(scores, model, model_column): annotations = None for model, model_column in models.items(): - if not annotations is None: + if annotations is not None: annotations += ' - ' annotations += annotate_decision(scores_common, model=model, model_column=model_column) @@ -199,7 +198,7 @@ def annotate_decision(scores, model, model_column): # %% # should it be possible to run not only RSN? 
-to_plot['diff_qvalue'] = (to_plot[str(args.baseline)] - to_plot[str(args.model_key)]).abs() +to_plot['diff_qvalue'] = (to_plot[str(args.baseline)] - to_plot[str(args.model_key)]).abs() to_plot.loc[mask_different].sort_values('diff_qvalue', ascending=False) # %% [markdown] @@ -222,7 +221,7 @@ def annotate_decision(scores, model, model_column): _ = ax.legend(fontsize=fontsize, title_fontsize=fontsize, markerscale=0.4, - title='', + title='', ) ax.set_xlabel(f"qvalue for {x_col}") ax.set_ylabel(f"qvalue for {y_col}") @@ -246,7 +245,7 @@ def annotate_decision(scores, model, model_column): y=to_plot.columns[1], size='frequency', s=size, - sizes=(5,20), + sizes=(5, 20), hue='Differential Analysis Comparison') _ = ax.legend(fontsize=fontsize, title_fontsize=fontsize, @@ -272,7 +271,7 @@ def annotate_decision(scores, model, model_column): .loc[ scores_model_only.index.difference( scores_common.index), - args.model_key] + args.model_key] .sort_values(by='qvalue', ascending=True) .join(freq_feat) ) @@ -307,7 +306,7 @@ def annotate_decision(scores, model, model_column): # %% # %% -feat_name = scores.index.names[0] # first index level is feature name +feat_name = scores.index.names[0] # first index level is feature name if args.annotaitons_gene_col in scores.index.names: logger.info(f"Found gene annotation in scores index: {scores.index.names}") else: @@ -318,13 +317,13 @@ def annotate_decision(scores, model, model_column): # %% gene_to_PG = (scores.droplevel( - list(set(scores.index.names) - {feat_name, args.annotaitons_gene_col}) - ) - .index - .to_frame() - .reset_index(drop=True) - .set_index(args.annotaitons_gene_col) - ) + list(set(scores.index.names) - {feat_name, args.annotaitons_gene_col}) +) + .index + .to_frame() + .reset_index(drop=True) + .set_index(args.annotaitons_gene_col) +) gene_to_PG.head() # %% diff --git a/project/10_3_ald_ml_new_feat.ipynb b/project/10_3_ald_ml_new_feat.ipynb index dbb096e61..d3d7dd763 100644 --- a/project/10_3_ald_ml_new_feat.ipynb +++ 
b/project/10_3_ald_ml_new_feat.ipynb @@ -40,8 +40,8 @@ "\n", "plt.rcParams['figure.figsize'] = (2.5, 2.5)\n", "plt.rcParams['lines.linewidth'] = 1\n", - "fontsize= 5\n", - "figsize= (2.5, 2.5)\n", + "fontsize = 5\n", + "figsize = (2.5, 2.5)\n", "vaep.plotting.make_large_descriptors(fontsize)\n", "\n", "\n", @@ -89,7 +89,7 @@ "cutoff_target: int = 2 # => for binarization target >= cutoff_target\n", "file_format = \"csv\"\n", "out_folder = 'diff_analysis'\n", - "fn_qc_samples = '' #'data/ALD_study/processed/qc_plasma_proteinGroups.pkl'\n", + "fn_qc_samples = '' # 'data/ALD_study/processed/qc_plasma_proteinGroups.pkl'\n", "\n", "baseline = 'RSN' # default is RSN, as this was used in the original ALD Niu. et. al 2022\n", "template_pred = 'pred_real_na_{}.csv' # fixed, do not change" @@ -272,7 +272,7 @@ "outputs": [], "source": [ "fname = args.out_preds / args.template_pred.format(args.baseline)\n", - "pred_real_na_baseline = load_single_csv_pred_file(fname) #.loc[mask_has_target]\n", + "pred_real_na_baseline = load_single_csv_pred_file(fname) # .loc[mask_has_target]\n", "pred_real_na_baseline" ] }, @@ -291,7 +291,9 @@ "Repeat general approach for\n", " 1. all original ald data: all features justed in original ALD study\n", " 2. all model data: all features available my using the self supervised deep learning model\n", - " 3. newly available feat only: the subset of features available from the self supervised deep learning model which were newly retained using the new approach" + "3. 
newly available feat only: the subset of features available from the\n", + "self supervised deep learning model which were newly retained using the\n", + "new approach" ] }, { diff --git a/project/10_3_ald_ml_new_feat.py b/project/10_3_ald_ml_new_feat.py index fca637f80..5ac862558 100644 --- a/project/10_3_ald_ml_new_feat.py +++ b/project/10_3_ald_ml_new_feat.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -41,8 +41,8 @@ plt.rcParams['figure.figsize'] = (2.5, 2.5) plt.rcParams['lines.linewidth'] = 1 -fontsize= 5 -figsize= (2.5, 2.5) +fontsize = 5 +figsize = (2.5, 2.5) vaep.plotting.make_large_descriptors(fontsize) @@ -66,7 +66,7 @@ cutoff_target: int = 2 # => for binarization target >= cutoff_target file_format = "csv" out_folder = 'diff_analysis' -fn_qc_samples = '' #'data/ALD_study/processed/qc_plasma_proteinGroups.pkl' +fn_qc_samples = '' # 'data/ALD_study/processed/qc_plasma_proteinGroups.pkl' baseline = 'RSN' # default is RSN, as this was used in the original ALD Niu. et. al 2022 template_pred = 'pred_real_na_{}.csv' # fixed, do not change @@ -160,7 +160,7 @@ # %% fname = args.out_preds / args.template_pred.format(args.baseline) -pred_real_na_baseline = load_single_csv_pred_file(fname) #.loc[mask_has_target] +pred_real_na_baseline = load_single_csv_pred_file(fname) # .loc[mask_has_target] pred_real_na_baseline # %% [markdown] @@ -174,7 +174,9 @@ # Repeat general approach for # 1. all original ald data: all features justed in original ALD study # 2. all model data: all features available my using the self supervised deep learning model -# 3. newly available feat only: the subset of features available from the self supervised deep learning model which were newly retained using the new approach +# 3. 
newly available feat only: the subset of features available from the +# self supervised deep learning model which were newly retained using the +# new approach # %% X = pd.concat([data, pred_real_na]).unstack() diff --git a/project/10_4_ald_compare_single_pg.ipynb b/project/10_4_ald_compare_single_pg.ipynb index a5e0612f1..fe947a33b 100644 --- a/project/10_4_ald_compare_single_pg.ipynb +++ b/project/10_4_ald_compare_single_pg.ipynb @@ -20,6 +20,7 @@ "source": [ "from pathlib import Path\n", "\n", + "import logging\n", "import matplotlib.pyplot as plt\n", "import pandas as pd\n", "\n", @@ -32,10 +33,10 @@ "import vaep.imputation\n", "\n", "logger = vaep.logging.setup_nb_logger()\n", - "\n", + "logging.getLogger('fontTools').setLevel(logging.WARNING)\n", "\n", "plt.rcParams['figure.figsize'] = [4, 2.5] # [16.0, 7.0] , [4, 3]\n", - "vaep.plotting.make_large_descriptors(5)" + "vaep.plotting.make_large_descriptors(7)" ] }, { @@ -72,7 +73,7 @@ "source": [ "folder_experiment = 'runs/appl_ald_data/plasma/proteinGroups'\n", "fn_clinical_data = \"data/ALD_study/processed/ald_metadata_cli.csv\"\n", - "make_plots = True # create histograms and swarmplots of diverging results\n", + "make_plots = True # create histograms and swarmplots of diverging results\n", "model_key = 'VAE'\n", "sample_id_col = 'Sample ID'\n", "target = 'kleiner'\n", @@ -81,7 +82,7 @@ "file_format = 'csv'\n", "baseline = 'RSN' # default is RSN, but could be any other trained model\n", "template_pred = 'pred_real_na_{}.csv' # fixed, do not change\n", - "ref_method_score = None # filepath to reference method score" + "ref_method_score = None # filepath to reference method score" ] }, { @@ -164,7 +165,7 @@ "outputs": [], "source": [ "# Reference dump\n", - "if args.ref_method_score: \n", + "if args.ref_method_score:\n", " scores_reference = (pd\n", " .read_pickle(args.ref_method_score)\n", " .rename({'None': 'None (100%)'},\n", @@ -207,7 +208,7 @@ " ).set_index(\n", " ('data', 'frequency'), append=True)\n", 
"qvalues.index.names = qvalues.index.names[:-1] + ['frequency']\n", - "fname = args.out_folder / 'qvalues_target.pkl'\n", + "fname = args.out_folder / 'qvalues_target.pkl'\n", "files_out[fname.name] = fname.as_posix()\n", "qvalues.to_pickle(fname)\n", "qvalues.to_excel(writer, sheet_name='qvalues_all')\n", @@ -227,7 +228,7 @@ " ).set_index(\n", " ('data', 'frequency'), append=True)\n", "pvalues.index.names = pvalues.index.names[:-1] + ['frequency']\n", - "fname = args.out_folder / 'pvalues_target.pkl'\n", + "fname = args.out_folder / 'pvalues_target.pkl'\n", "files_out[fname.name] = fname.as_posix()\n", "pvalues.to_pickle(fname)\n", "pvalues.to_excel(writer, sheet_name='pvalues_all')\n", @@ -244,7 +245,7 @@ "da_target = scores.loc[pd.IndexSlice[:, args.target],\n", " pd.IndexSlice[:, 'rejected']\n", " ].join(freq_feat\n", - " ).set_index(\n", + " ).set_index(\n", " ('data', 'frequency'), append=True)\n", "da_target.index.names = da_target.index.names[:-1] + ['frequency']\n", "fname = args.out_folder / 'equality_rejected_target.pkl'\n", @@ -336,7 +337,22 @@ "(qvalues\n", " .loc[feat_idx_w_diff]\n", " .sort_values(('None', 'qvalue'))\n", - " .to_excel(writer, sheet_name='qvalues_diff'))\n", + " .to_excel(writer, sheet_name='qvalues_diff')\n", + " )\n", + "\n", + "(qvalues\n", + " .loc[feat_idx_w_diff]\n", + " .loc[mask_common] # mask automatically aligned\n", + " .sort_values(('None', 'qvalue'))\n", + " .to_excel(writer, sheet_name='qvalues_diff_common')\n", + " )\n", + "\n", + "(qvalues\n", + " .loc[feat_idx_w_diff]\n", + " .loc[~mask_common] # mask automatically aligned\n", + " .sort_values(('None', 'qvalue'))\n", + " .to_excel(writer, sheet_name='qvalues_diff_new')\n", + " )\n", "writer.close()" ] }, @@ -580,7 +596,9 @@ "cell_type": "code", "execution_count": null, "id": "f813f693", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [], "source": [ "min_y_int, max_y_int = vaep.plotting.data.get_min_max_iterable(\n", @@ -592,66 +610,6 @@ 
"min_max, target_name" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "d03628eb", - "metadata": {}, - "outputs": [], - "source": [ - "for idx in feat_sel:\n", - " fig, ax = plt.subplots()\n", - "\n", - " feat_observed = data[idx].dropna()\n", - "\n", - " label_template = '{method} (N={n:,d}, q={q:.3f})'\n", - " # observed data\n", - " vaep.plotting.data.plot_histogram_intensities(\n", - " feat_observed,\n", - " ax=ax,\n", - " min_max=min_max,\n", - " label=label_template.format(method='measured',\n", - " n=len(feat_observed),\n", - " q=float(qvalues.loc[idx, ('None', 'qvalue')])),\n", - " color='grey',\n", - " alpha=0.6)\n", - "\n", - " # all models\n", - " for i, method in enumerate(model_keys):\n", - " try:\n", - " pred = pred_real_na.loc[pd.IndexSlice[:, idx], method].dropna()\n", - " if len(pred) == 0:\n", - " # in case no values was imputed -> qvalue is as based on measured\n", - " label = label_template.format(method=method,\n", - " n=len(pred),\n", - " q=float(qvalues.loc[idx, ('None', 'qvalue')]\n", - " ))\n", - " else:\n", - " label = label_template.format(method=method,\n", - " n=len(pred),\n", - " q=float(qvalues.loc[idx, (method, 'qvalue')]\n", - " ))\n", - " ax, bins = vaep.plotting.data.plot_histogram_intensities(\n", - " pred,\n", - " ax=ax,\n", - " min_max=min_max,\n", - " label=label,\n", - " color=f'C{i}',\n", - " alpha=0.6)\n", - " except KeyError:\n", - " print(f\"No missing values for {idx}: {method}\")\n", - " continue\n", - " first_pg = idx.split(\";\")[0]\n", - " ax.set_title(\n", - " f'Imputation for protein group {first_pg} with target {target_name} (N= {len(data):,d} samples)')\n", - " ax.set_ylabel('count measurments')\n", - " _ = ax.legend()\n", - " files_out[fname.name] = fname.as_posix()\n", - " vaep.savefig(\n", - " fig, folder / f'{first_pg}_hist.pdf')\n", - " plt.close(fig)" - ] - }, { "cell_type": "markdown", "id": "b9db8a0e", @@ -677,16 +635,25 @@ " fig, ax = plt.subplots()\n", "\n", " # dummy plots, just to 
get the Path objects\n", - " tmp_dot = ax.scatter([1,2],[3,4], marker='X')\n", + " tmp_dot = ax.scatter([1, 2], [3, 4], marker='X')\n", " new_mk, = tmp_dot.get_paths()\n", " tmp_dot.remove()\n", "\n", " feat_observed = data[idx].dropna()\n", - " label_template = '{method} (N={n:,d}, q={q:.3f})'\n", - " key = label_template.format(method='measured',\n", - " n=len(feat_observed),\n", - " q=float(qvalues.loc[idx, ('None', 'qvalue')])\n", - " )\n", + "\n", + " def get_centered_label(method, n, q):\n", + " model_str = f'{method}'\n", + " stats_str = f'(N={n:,d}, q={q:.3f})'\n", + " if len(model_str) > len(stats_str):\n", + " stats_str = f\"{stats_str:<{len(model_str)}}\"\n", + " else:\n", + " model_str = f\"{model_str:<{len(stats_str)}}\"\n", + " return f'{model_str}\\n{stats_str}'\n", + "\n", + " key = get_centered_label(method='observed',\n", + " n=len(feat_observed),\n", + " q=float(qvalues.loc[idx, ('None', 'qvalue')])\n", + " )\n", " to_plot = {key: feat_observed}\n", " for method in model_keys:\n", " try:\n", @@ -694,15 +661,15 @@ " idx], method].dropna().droplevel(-1)\n", " if len(pred) == 0:\n", " # in case no values was imputed -> qvalue is as based on measured\n", - " key = label_template.format(method=method,\n", - " n=len(pred),\n", - " q=float(qvalues.loc[idx, ('None', 'qvalue')]\n", - " ))\n", + " key = get_centered_label(method=method,\n", + " n=len(pred),\n", + " q=float(qvalues.loc[idx, ('None', 'qvalue')]\n", + " ))\n", " elif qvalues.loc[idx, (method, 'qvalue')].notna().all():\n", - " key = label_template.format(method=method,\n", - " n=len(pred),\n", - " q=float(qvalues.loc[idx, (method, 'qvalue')]\n", - " ))\n", + " key = get_centered_label(method=method,\n", + " n=len(pred),\n", + " q=float(qvalues.loc[idx, (method, 'qvalue')]\n", + " ))\n", " elif qvalues.loc[idx, (method, 'qvalue')].isna().all():\n", " logger.info(f\"NA qvalues for {idx}: {method}\")\n", " continue\n", @@ -726,7 +693,7 @@ " order=groups_order,\n", " dodge=True,\n", " 
hue=args.target,\n", - " size=1,\n", + " size=2,\n", " ax=ax)\n", " first_pg = idx.split(\";\")[0]\n", " ax.set_title(\n", @@ -747,7 +714,6 @@ " _ = ax.collections[0].set_paths([new_mk])\n", " _ = ax.collections[1].set_paths([new_mk])\n", "\n", - " # import matplotlib.lines as mlines\n", " label_target_0, label_target_1 = ax.collections[-2].get_label(), ax.collections[-1].get_label()\n", " _ = ax.collections[-2].set_label(f'imputed, {label_target_0}')\n", " _ = ax.collections[-1].set_label(f'imputed, {label_target_1}')\n", diff --git a/project/10_4_ald_compare_single_pg.py b/project/10_4_ald_compare_single_pg.py index 7b3c9279c..8f8fa5da0 100644 --- a/project/10_4_ald_compare_single_pg.py +++ b/project/10_4_ald_compare_single_pg.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -21,6 +21,7 @@ # %% from pathlib import Path +import logging import matplotlib.pyplot as plt import pandas as pd @@ -33,10 +34,10 @@ import vaep.imputation logger = vaep.logging.setup_nb_logger() - +logging.getLogger('fontTools').setLevel(logging.WARNING) plt.rcParams['figure.figsize'] = [4, 2.5] # [16.0, 7.0] , [4, 3] -vaep.plotting.make_large_descriptors(5) +vaep.plotting.make_large_descriptors(7) # %% [markdown] # ## Parameters @@ -49,7 +50,7 @@ # %% tags=["parameters"] folder_experiment = 'runs/appl_ald_data/plasma/proteinGroups' fn_clinical_data = "data/ALD_study/processed/ald_metadata_cli.csv" -make_plots = True # create histograms and swarmplots of diverging results +make_plots = True # create histograms and swarmplots of diverging results model_key = 'VAE' sample_id_col = 'Sample ID' target = 'kleiner' @@ -58,7 +59,7 @@ file_format = 'csv' baseline = 'RSN' # default is RSN, but could be any other trained model template_pred = 'pred_real_na_{}.csv' # fixed, do not change -ref_method_score = None # filepath to reference method score 
+ref_method_score = None # filepath to reference method score # %% @@ -99,7 +100,7 @@ # %% # Reference dump -if args.ref_method_score: +if args.ref_method_score: scores_reference = (pd .read_pickle(args.ref_method_score) .rename({'None': 'None (100%)'}, @@ -123,7 +124,7 @@ ).set_index( ('data', 'frequency'), append=True) qvalues.index.names = qvalues.index.names[:-1] + ['frequency'] -fname = args.out_folder / 'qvalues_target.pkl' +fname = args.out_folder / 'qvalues_target.pkl' files_out[fname.name] = fname.as_posix() qvalues.to_pickle(fname) qvalues.to_excel(writer, sheet_name='qvalues_all') @@ -136,7 +137,7 @@ ).set_index( ('data', 'frequency'), append=True) pvalues.index.names = pvalues.index.names[:-1] + ['frequency'] -fname = args.out_folder / 'pvalues_target.pkl' +fname = args.out_folder / 'pvalues_target.pkl' files_out[fname.name] = fname.as_posix() pvalues.to_pickle(fname) pvalues.to_excel(writer, sheet_name='pvalues_all') @@ -146,7 +147,7 @@ da_target = scores.loc[pd.IndexSlice[:, args.target], pd.IndexSlice[:, 'rejected'] ].join(freq_feat - ).set_index( + ).set_index( ('data', 'frequency'), append=True) da_target.index.names = da_target.index.names[:-1] + ['frequency'] fname = args.out_folder / 'equality_rejected_target.pkl' @@ -190,7 +191,22 @@ (qvalues .loc[feat_idx_w_diff] .sort_values(('None', 'qvalue')) - .to_excel(writer, sheet_name='qvalues_diff')) + .to_excel(writer, sheet_name='qvalues_diff') + ) + +(qvalues + .loc[feat_idx_w_diff] + .loc[mask_common] # mask automatically aligned + .sort_values(('None', 'qvalue')) + .to_excel(writer, sheet_name='qvalues_diff_common') + ) + +(qvalues + .loc[feat_idx_w_diff] + .loc[~mask_common] # mask automatically aligned + .sort_values(('None', 'qvalue')) + .to_excel(writer, sheet_name='qvalues_diff_new') + ) writer.close() # %% [markdown] @@ -313,58 +329,6 @@ min_max, target_name -# %% -for idx in feat_sel: - fig, ax = plt.subplots() - - feat_observed = data[idx].dropna() - - label_template = '{method} 
(N={n:,d}, q={q:.3f})' - # observed data - vaep.plotting.data.plot_histogram_intensities( - feat_observed, - ax=ax, - min_max=min_max, - label=label_template.format(method='measured', - n=len(feat_observed), - q=float(qvalues.loc[idx, ('None', 'qvalue')])), - color='grey', - alpha=0.6) - - # all models - for i, method in enumerate(model_keys): - try: - pred = pred_real_na.loc[pd.IndexSlice[:, idx], method].dropna() - if len(pred) == 0: - # in case no values was imputed -> qvalue is as based on measured - label = label_template.format(method=method, - n=len(pred), - q=float(qvalues.loc[idx, ('None', 'qvalue')] - )) - else: - label = label_template.format(method=method, - n=len(pred), - q=float(qvalues.loc[idx, (method, 'qvalue')] - )) - ax, bins = vaep.plotting.data.plot_histogram_intensities( - pred, - ax=ax, - min_max=min_max, - label=label, - color=f'C{i}', - alpha=0.6) - except KeyError: - print(f"No missing values for {idx}: {method}") - continue - first_pg = idx.split(";")[0] - ax.set_title( - f'Imputation for protein group {first_pg} with target {target_name} (N= {len(data):,d} samples)') - ax.set_ylabel('count measurments') - _ = ax.legend() - files_out[fname.name] = fname.as_posix() - vaep.savefig( - fig, folder / f'{first_pg}_hist.pdf') - plt.close(fig) # %% [markdown] # ## Compare with target annotation @@ -378,16 +342,25 @@ fig, ax = plt.subplots() # dummy plots, just to get the Path objects - tmp_dot = ax.scatter([1,2],[3,4], marker='X') + tmp_dot = ax.scatter([1, 2], [3, 4], marker='X') new_mk, = tmp_dot.get_paths() tmp_dot.remove() feat_observed = data[idx].dropna() - label_template = '{method} (N={n:,d}, q={q:.3f})' - key = label_template.format(method='measured', - n=len(feat_observed), - q=float(qvalues.loc[idx, ('None', 'qvalue')]) - ) + + def get_centered_label(method, n, q): + model_str = f'{method}' + stats_str = f'(N={n:,d}, q={q:.3f})' + if len(model_str) > len(stats_str): + stats_str = f"{stats_str:<{len(model_str)}}" + else: + model_str = 
f"{model_str:<{len(stats_str)}}" + return f'{model_str}\n{stats_str}' + + key = get_centered_label(method='observed', + n=len(feat_observed), + q=float(qvalues.loc[idx, ('None', 'qvalue')]) + ) to_plot = {key: feat_observed} for method in model_keys: try: @@ -395,15 +368,15 @@ idx], method].dropna().droplevel(-1) if len(pred) == 0: # in case no values was imputed -> qvalue is as based on measured - key = label_template.format(method=method, - n=len(pred), - q=float(qvalues.loc[idx, ('None', 'qvalue')] - )) + key = get_centered_label(method=method, + n=len(pred), + q=float(qvalues.loc[idx, ('None', 'qvalue')] + )) elif qvalues.loc[idx, (method, 'qvalue')].notna().all(): - key = label_template.format(method=method, - n=len(pred), - q=float(qvalues.loc[idx, (method, 'qvalue')] - )) + key = get_centered_label(method=method, + n=len(pred), + q=float(qvalues.loc[idx, (method, 'qvalue')] + )) elif qvalues.loc[idx, (method, 'qvalue')].isna().all(): logger.info(f"NA qvalues for {idx}: {method}") continue @@ -427,7 +400,7 @@ order=groups_order, dodge=True, hue=args.target, - size=1, + size=2, ax=ax) first_pg = idx.split(";")[0] ax.set_title( @@ -448,7 +421,6 @@ _ = ax.collections[0].set_paths([new_mk]) _ = ax.collections[1].set_paths([new_mk]) - # import matplotlib.lines as mlines label_target_0, label_target_1 = ax.collections[-2].get_label(), ax.collections[-1].get_label() _ = ax.collections[-2].set_label(f'imputed, {label_target_0}') _ = ax.collections[-1].set_label(f'imputed, {label_target_1}') diff --git a/project/10_5_comp_diff_analysis_repetitions.ipynb b/project/10_5_comp_diff_analysis_repetitions.ipynb index 961dd02c0..0d09578be 100644 --- a/project/10_5_comp_diff_analysis_repetitions.ipynb +++ b/project/10_5_comp_diff_analysis_repetitions.ipynb @@ -27,9 +27,7 @@ "cell_type": "code", "execution_count": null, "id": "8bef6cd3-fef6-4499-85cb-63bd524c9edc", - "metadata": { - "lines_to_next_cell": 1 - }, + "metadata": {}, "outputs": [], "source": [ "files_out = 
dict()\n", @@ -46,15 +44,18 @@ "metadata": {}, "outputs": [], "source": [ - "def _load_pickle(pfath, run:int):\n", + "\n", + "\n", + "def _load_pickle(pfath, run: int):\n", " df = pd.read_pickle(pfath)\n", " df['run'] = f'run{run:02d}'\n", " df = df.set_index('run', append=True)\n", " return df\n", "\n", + "\n", "df_long_qvalues = pd.concat(\n", - " [_load_pickle(f,i) for i,f in enumerate(pickled_qvalues)]\n", - " )\n", + " [_load_pickle(f, i) for i, f in enumerate(pickled_qvalues)]\n", + ")\n", "df_long_qvalues" ] }, @@ -148,7 +149,7 @@ " [~da_target_same]\n", " .index\n", " .get_level_values(0)\n", - ")" + " )" ] }, { @@ -181,7 +182,7 @@ "qvalue_stats = (qvalue_stats\n", " .loc[idx_different]\n", " .sort_values(('None', 'qvalue', 'mean'))\n", - ")\n", + " )\n", "qvalue_stats" ] }, @@ -254,15 +255,15 @@ "source": [ "# pgs included in original ald study\n", "tab_diff_rejec_counts_old = (da_counts\n", - " .loc[mask_pgs_included_in_ald_study]\n", - " .reset_index()\n", - " .groupby(\n", - " by=da_counts.columns.to_list())\n", - " .size()\n", - " .to_frame('N')\n", - ")\n", + " .loc[mask_pgs_included_in_ald_study]\n", + " .reset_index()\n", + " .groupby(\n", + " by=da_counts.columns.to_list())\n", + " .size()\n", + " .to_frame('N')\n", + " )\n", "tab_diff_rejec_counts_old.to_excel(writer,\n", - " sheet_name='tab_diff_rejec_counts_old')\n", + " sheet_name='tab_diff_rejec_counts_old')\n", "tab_diff_rejec_counts_old" ] }, @@ -292,17 +293,16 @@ "source": [ "# new pgs\n", "tab_diff_rejec_counts_new = (da_counts\n", - " .loc[~mask_pgs_included_in_ald_study]\n", - " .reset_index()\n", - " .drop('RSN', axis=1)\n", - " .groupby(\n", - " by=\n", - " [m for m in da_counts.columns if m != 'RSN'])\n", - " .size()\n", - " .to_frame('N')\n", - ")\n", + " .loc[~mask_pgs_included_in_ald_study]\n", + " .reset_index()\n", + " .drop('RSN', axis=1)\n", + " .groupby(\n", + " by=[m for m in da_counts.columns if m != 'RSN'])\n", + " .size()\n", + " .to_frame('N')\n", + " )\n", 
"tab_diff_rejec_counts_new.to_excel(writer,\n", - " sheet_name='tab_diff_rejec_counts_new')\n", + " sheet_name='tab_diff_rejec_counts_new')\n", "tab_diff_rejec_counts_new" ] }, @@ -331,7 +331,7 @@ "outputs": [], "source": [ "mask_new_da_with_imp = mask_new_da_with_imputation = ((~mask_pgs_included_in_ald_study)\n", - " & (da_counts['None'] != 10))\n", + " & (da_counts['None'] != 10))\n", "\n", "tab_new_da_with_imp = vaep.pandas.combine_value_counts(\n", " da_counts\n", diff --git a/project/10_5_comp_diff_analysis_repetitions.py b/project/10_5_comp_diff_analysis_repetitions.py index 65df744be..909c940fd 100644 --- a/project/10_5_comp_diff_analysis_repetitions.py +++ b/project/10_5_comp_diff_analysis_repetitions.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: vaep # language: python @@ -29,15 +29,18 @@ fname # %% -def _load_pickle(pfath, run:int): + + +def _load_pickle(pfath, run: int): df = pd.read_pickle(pfath) df['run'] = f'run{run:02d}' df = df.set_index('run', append=True) return df + df_long_qvalues = pd.concat( - [_load_pickle(f,i) for i,f in enumerate(pickled_qvalues)] - ) + [_load_pickle(f, i) for i, f in enumerate(pickled_qvalues)] +) df_long_qvalues # %% [markdown] @@ -77,7 +80,7 @@ def _load_pickle(pfath, run:int): [~da_target_same] .index .get_level_values(0) -) + ) # %% da_counts = da_counts.loc[idx_different] @@ -90,7 +93,7 @@ def _load_pickle(pfath, run:int): qvalue_stats = (qvalue_stats .loc[idx_different] .sort_values(('None', 'qvalue', 'mean')) -) + ) qvalue_stats # %% [markdown] @@ -123,15 +126,15 @@ def _load_pickle(pfath, run:int): # %% # pgs included in original ald study tab_diff_rejec_counts_old = (da_counts - .loc[mask_pgs_included_in_ald_study] - .reset_index() - .groupby( - by=da_counts.columns.to_list()) - .size() - .to_frame('N') -) + .loc[mask_pgs_included_in_ald_study] + .reset_index() + .groupby( + 
by=da_counts.columns.to_list()) + .size() + .to_frame('N') + ) tab_diff_rejec_counts_old.to_excel(writer, - sheet_name='tab_diff_rejec_counts_old') + sheet_name='tab_diff_rejec_counts_old') tab_diff_rejec_counts_old # %% @@ -147,17 +150,16 @@ def _load_pickle(pfath, run:int): # %% # new pgs tab_diff_rejec_counts_new = (da_counts - .loc[~mask_pgs_included_in_ald_study] - .reset_index() - .drop('RSN', axis=1) - .groupby( - by= - [m for m in da_counts.columns if m != 'RSN']) - .size() - .to_frame('N') -) + .loc[~mask_pgs_included_in_ald_study] + .reset_index() + .drop('RSN', axis=1) + .groupby( + by=[m for m in da_counts.columns if m != 'RSN']) + .size() + .to_frame('N') + ) tab_diff_rejec_counts_new.to_excel(writer, - sheet_name='tab_diff_rejec_counts_new') + sheet_name='tab_diff_rejec_counts_new') tab_diff_rejec_counts_new # %% @@ -172,7 +174,7 @@ def _load_pickle(pfath, run:int): # %% mask_new_da_with_imp = mask_new_da_with_imputation = ((~mask_pgs_included_in_ald_study) - & (da_counts['None'] != 10)) + & (da_counts['None'] != 10)) tab_new_da_with_imp = vaep.pandas.combine_value_counts( da_counts diff --git a/project/10_6_interpret_repeated_ald_da.py b/project/10_6_interpret_repeated_ald_da.py index b9cc384ed..bb685cb21 100644 --- a/project/10_6_interpret_repeated_ald_da.py +++ b/project/10_6_interpret_repeated_ald_da.py @@ -24,7 +24,7 @@ def load_pred_from_run(run_folder: Path, # %% reps_folder = 'runs/appl_ald_data/plasma/proteinGroups/reps' template_pred = 'pred_real_na_{}.csv' # fixed, do not change -model_keys = ['CF', 'DAE', 'KNN', 'Median', 'RSN', 'VAE','rf'] +model_keys = ['CF', 'DAE', 'KNN', 'Median', 'RSN', 'VAE', 'rf'] # %% @@ -52,12 +52,12 @@ def load_pred_from_run(run_folder: Path, for method in model_keys: pred_real_na_cvs[method] = pred_real_na[( method, 'std')] / pred_real_na[(method, 'mean')] - + pred_real_na_cvs.to_excel(writer, float_format='%.3f', sheet_name='CVs') ax = pred_real_na_cvs.plot.hist(bins=15, 
color=vaep.plotting.defaults.assign_colors(model_keys), - alpha=0.5) + alpha=0.5) ax.yaxis.set_major_formatter('{x:,.0f}') ax.set_xlabel(f'Coefficient of variation of imputed intensites (N={len(pred_real_na):,d})') fname = reps_folder / 'pred_real_na_cvs.png' diff --git a/project/10_7_ald_reduced_dataset_plots.ipynb b/project/10_7_ald_reduced_dataset_plots.ipynb index af290974c..65400732d 100644 --- a/project/10_7_ald_reduced_dataset_plots.ipynb +++ b/project/10_7_ald_reduced_dataset_plots.ipynb @@ -21,9 +21,20 @@ "\n", "import vaep\n", "plt.rcParams['figure.figsize'] = [4, 2] # [16.0, 7.0] , [4, 3]\n", - "vaep.plotting.make_large_descriptors(5)\n", + "vaep.plotting.make_large_descriptors(6)\n", + "\n", + "\n", + "NONE_COL_NAME = 'No imputation\\n(None)'\n", + "col_mapper = {'None':\n", + " NONE_COL_NAME}\n", + "# overwrite for now to align with Fig. 3\n", + "ORDER_MODELS = ['DAE', 'VAE', 'TRKNN', 'RF', 'CF', 'Median', 'QRILC', NONE_COL_NAME]\n", + "REF_MODEL = 'None (100%)'\n", + "CUTOFF = 0.05\n", "\n", "COLORS_TO_USE_MAPPTING = vaep.plotting.defaults.color_model_mapping\n", + "COLORS_TO_USE_MAPPTING[NONE_COL_NAME] = COLORS_TO_USE_MAPPTING['None']\n", + "\n", "\n", "def plot_qvalues(df, x: str, y: list, ax=None, cutoff=0.05,\n", " alpha=1.0, style='.', markersize=3):\n", @@ -40,14 +51,15 @@ " linestyles='dashed',\n", " color='grey',\n", " linewidth=1)\n", - " return ax\n", - "\n" + " return ax" ] }, { "cell_type": "markdown", "id": "9d21e1a1-7a46-49d4-8976-bc2031652ee4", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 0 + }, "source": [ "DA analysis" ] @@ -59,7 +71,7 @@ "metadata": {}, "outputs": [], "source": [ - "out_folder = 'runs/appl_ald_data/plasma/proteinGroups_80%_dataset/diff_analysis/kleiner/'\n", + "out_folder = 'runs/appl_ald_data_2023_11/plasma/proteinGroups_80perc_25MNAR/diff_analysis/kleiner/'\n", "out_folder = Path(out_folder)" ] }, @@ -73,43 +85,19 @@ "files_out = dict()\n", "fname = out_folder / 'ald_reduced_dataset_plots.xlsx'\n", 
"files_out[fname.name] = fname.as_posix()\n", - "writer = pd.ExcelWriter(fname)" - ] - }, - { - "cell_type": "markdown", - "id": "e37f0980-80d9-4835-b814-91cc9cac26f9", - "metadata": {}, - "source": [ - "Ordering of model and reference model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1d745bfc", - "metadata": {}, - "outputs": [], - "source": [ - "ORDER_MODELS = pd.read_csv(\n", - " out_folder.parent.parent / 'figures/performance_test.csv',\n", - " index_col=0\n", - ").index.to_list()\n", - "ORDER_MODELS" + "writer = pd.ExcelWriter(fname)\n", + "\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "f4c6a074", - "metadata": {}, + "id": "cd7ab8d5", + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [], - "source": [ - "# overwrite for now to align with Fig. 3\n", - "ORDER_MODELS = ['DAE', 'VAE', 'rf', 'CF', 'KNN', 'Median', 'None']\n", - "REF_MODEL = 'None (100%)'\n", - "CUTOFF = 0.05" - ] + "source": [] }, { "cell_type": "markdown", @@ -126,7 +114,10 @@ "metadata": {}, "outputs": [], "source": [ - "da_target = pd.read_pickle(out_folder / 'equality_rejected_target.pkl')\n", + "da_target = (pd\n", + " .read_pickle(out_folder / 'equality_rejected_target.pkl').\n", + " rename(col_mapper, axis=1)\n", + " )\n", "da_target.describe()" ] }, @@ -134,12 +125,13 @@ "cell_type": "code", "execution_count": null, "id": "e8df4a84", - "metadata": { - "lines_to_next_cell": 2 - }, + "metadata": {}, "outputs": [], "source": [ - "qvalues = pd.read_pickle(out_folder / 'qvalues_target.pkl')\n", + "qvalues = (pd\n", + " .read_pickle(out_folder / 'qvalues_target.pkl')\n", + " .rename(col_mapper, axis=1)\n", + " )\n", "qvalues" ] }, @@ -187,7 +179,7 @@ "source": [ "qvalues_sel = (qvalues\n", " .loc[feat_idx_w_diff]\n", - " .sort_values(('None', 'qvalue')\n", + " .sort_values((NONE_COL_NAME, 'qvalue')\n", " ))" ] }, @@ -218,8 +210,8 @@ "outputs": [], "source": [ "mask_lost_sign = (\n", - " (da_target_sel['None'] == False)\n", - " & 
(da_target_sel[REF_MODEL] == True)\n", + " (da_target_sel[NONE_COL_NAME] == False)\n", + " & (da_target_sel[REF_MODEL])\n", ")\n", "sel = qvalues_sel.loc[mask_lost_sign.squeeze()]\n", "sel.columns = sel.columns.droplevel(-1)\n", @@ -238,15 +230,15 @@ "# 0: FN\n", "# 1: TP\n", "da_target_sel_counts = (da_target_sel[ORDER_MODELS]\n", - " .loc[mask_lost_sign.squeeze()]\n", - " .astype(int)\n", - " .replace(\n", - " {0: 'FN',\n", - " 1: 'TP'}\n", - " ).droplevel(-1, axis=1)\n", + " .loc[mask_lost_sign.squeeze()]\n", + " .astype(int)\n", + " .replace(\n", + " {0: 'FN',\n", + " 1: 'TP'}\n", + ").droplevel(-1, axis=1)\n", ")\n", "da_target_sel_counts = vaep.pandas.combine_value_counts(da_target_sel_counts)\n", - "ax = da_target_sel_counts.T.plot.bar()\n", + "ax = da_target_sel_counts.T.plot.bar(ylabel='count')\n", "ax.locator_params(axis='y', integer=True)\n", "fname = out_folder / 'lost_signal_da_counts.pdf'\n", "files_out[fname.name] = fname.as_posix()\n", @@ -266,7 +258,8 @@ " x=REF_MODEL,\n", " y=ORDER_MODELS,\n", " cutoff=CUTOFF)\n", - "ax.set_xlim(-0.0005, CUTOFF + 0.0005)\n", + "ax.set_xlim(-0.0005, CUTOFF + 0.015)\n", + "ax.legend(loc='upper right')\n", "ax.set_xlabel(\"q-value using 100% of the data without imputation\")\n", "ax.set_ylabel(\"q-value using 80% of the data\")\n", "fname = out_folder / 'lost_signal_qvalues.pdf'\n", @@ -290,7 +283,7 @@ "outputs": [], "source": [ "mask_gained_signal = (\n", - " (da_target_sel['None'] == True)\n", + " (da_target_sel[NONE_COL_NAME])\n", " & (da_target_sel[REF_MODEL] == False)\n", ")\n", "sel = qvalues_sel.loc[mask_gained_signal.squeeze()]\n", @@ -308,15 +301,15 @@ "outputs": [], "source": [ "da_target_sel_counts = (da_target_sel[ORDER_MODELS]\n", - " .loc[mask_gained_signal.squeeze()]\n", - " .astype(int)\n", - " .replace(\n", - " {0: 'TN',\n", - " 1: 'FP'}\n", - " ).droplevel(-1, axis=1)\n", + " .loc[mask_gained_signal.squeeze()]\n", + " .astype(int)\n", + " .replace(\n", + " {0: 'TN',\n", + " 1: 'FP'}\n", + 
").droplevel(-1, axis=1)\n", ")\n", "da_target_sel_counts = vaep.pandas.combine_value_counts(da_target_sel_counts)\n", - "ax = da_target_sel_counts.T.plot.bar()\n", + "ax = da_target_sel_counts.T.plot.bar(ylabel='count')\n", "ax.locator_params(axis='y', integer=True)\n", "fname = out_folder / 'gained_signal_da_counts.pdf'\n", "files_out[fname.name] = fname.as_posix()\n", @@ -333,10 +326,10 @@ "ax = plot_qvalues(sel,\n", " x=REF_MODEL,\n", " y=ORDER_MODELS)\n", - "ax.set_xlim(CUTOFF - 0.01, sel[REF_MODEL].max() + 0.005)\n", + "# ax.set_xlim(CUTOFF - 0.005, sel[REF_MODEL].max() + 0.005)\n", "ax.set_xlabel(\"q-value using 100% of the data without imputation\")\n", - "ax.set_ylabel(\"q-value using 80%\")\n", - "ax.legend(loc='upper center')\n", + "ax.set_ylabel(\"q-value using 80% of the data\")\n", + "ax.legend(loc='upper right')\n", "fname = out_folder / 'gained_signal_qvalues.pdf'\n", "files_out[fname.name] = fname.as_posix()\n", "vaep.savefig(ax.figure, fname)" @@ -364,7 +357,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cd7ab8d5", + "id": "47e6afe8", "metadata": {}, "outputs": [], "source": [] diff --git a/project/10_7_ald_reduced_dataset_plots.py b/project/10_7_ald_reduced_dataset_plots.py index 670f3a057..af0739fec 100644 --- a/project/10_7_ald_reduced_dataset_plots.py +++ b/project/10_7_ald_reduced_dataset_plots.py @@ -8,9 +8,20 @@ import vaep plt.rcParams['figure.figsize'] = [4, 2] # [16.0, 7.0] , [4, 3] -vaep.plotting.make_large_descriptors(5) +vaep.plotting.make_large_descriptors(6) + + +NONE_COL_NAME = 'No imputation\n(None)' +col_mapper = {'None': + NONE_COL_NAME} +# overwrite for now to align with Fig. 
3 +ORDER_MODELS = ['DAE', 'VAE', 'TRKNN', 'RF', 'CF', 'Median', 'QRILC', NONE_COL_NAME] +REF_MODEL = 'None (100%)' +CUTOFF = 0.05 COLORS_TO_USE_MAPPTING = vaep.plotting.defaults.color_model_mapping +COLORS_TO_USE_MAPPTING[NONE_COL_NAME] = COLORS_TO_USE_MAPPTING['None'] + def plot_qvalues(df, x: str, y: list, ax=None, cutoff=0.05, alpha=1.0, style='.', markersize=3): @@ -30,12 +41,10 @@ def plot_qvalues(df, x: str, y: list, ax=None, cutoff=0.05, return ax - # %% [markdown] # DA analysis - # %% -out_folder = 'runs/appl_ald_data/plasma/proteinGroups_80%_dataset/diff_analysis/kleiner/' +out_folder = 'runs/appl_ald_data_2023_11/plasma/proteinGroups_80perc_25MNAR/diff_analysis/kleiner/' out_folder = Path(out_folder) # %% @@ -44,34 +53,28 @@ def plot_qvalues(df, x: str, y: list, ax=None, cutoff=0.05, files_out[fname.name] = fname.as_posix() writer = pd.ExcelWriter(fname) -# %% [markdown] -# Ordering of model and reference model -# %% -ORDER_MODELS = pd.read_csv( - out_folder.parent.parent / 'figures/performance_test.csv', - index_col=0 -).index.to_list() -ORDER_MODELS # %% -# overwrite for now to align with Fig. 3 -ORDER_MODELS = ['DAE', 'VAE', 'rf', 'CF', 'KNN', 'Median', 'None'] -REF_MODEL = 'None (100%)' -CUTOFF = 0.05 + # %% [markdown] # Load dumps # %% -da_target = pd.read_pickle(out_folder / 'equality_rejected_target.pkl') +da_target = (pd + .read_pickle(out_folder / 'equality_rejected_target.pkl'). 
+ rename(col_mapper, axis=1) + ) da_target.describe() # %% -qvalues = pd.read_pickle(out_folder / 'qvalues_target.pkl') +qvalues = (pd + .read_pickle(out_folder / 'qvalues_target.pkl') + .rename(col_mapper, axis=1) + ) qvalues - # %% [markdown] # take only those with different decisions @@ -88,7 +91,7 @@ def plot_qvalues(df, x: str, y: list, ax=None, cutoff=0.05, # %% qvalues_sel = (qvalues .loc[feat_idx_w_diff] - .sort_values(('None', 'qvalue') + .sort_values((NONE_COL_NAME, 'qvalue') )) @@ -101,8 +104,8 @@ def plot_qvalues(df, x: str, y: list, ax=None, cutoff=0.05, # %% mask_lost_sign = ( - (da_target_sel['None'] == False) - & (da_target_sel[REF_MODEL] == True) + (da_target_sel[NONE_COL_NAME] == False) + & (da_target_sel[REF_MODEL]) ) sel = qvalues_sel.loc[mask_lost_sign.squeeze()] sel.columns = sel.columns.droplevel(-1) @@ -114,15 +117,15 @@ def plot_qvalues(df, x: str, y: list, ax=None, cutoff=0.05, # 0: FN # 1: TP da_target_sel_counts = (da_target_sel[ORDER_MODELS] - .loc[mask_lost_sign.squeeze()] - .astype(int) - .replace( - {0: 'FN', - 1: 'TP'} - ).droplevel(-1, axis=1) + .loc[mask_lost_sign.squeeze()] + .astype(int) + .replace( + {0: 'FN', + 1: 'TP'} +).droplevel(-1, axis=1) ) da_target_sel_counts = vaep.pandas.combine_value_counts(da_target_sel_counts) -ax = da_target_sel_counts.T.plot.bar() +ax = da_target_sel_counts.T.plot.bar(ylabel='count') ax.locator_params(axis='y', integer=True) fname = out_folder / 'lost_signal_da_counts.pdf' files_out[fname.name] = fname.as_posix() @@ -133,7 +136,8 @@ def plot_qvalues(df, x: str, y: list, ax=None, cutoff=0.05, x=REF_MODEL, y=ORDER_MODELS, cutoff=CUTOFF) -ax.set_xlim(-0.0005, CUTOFF + 0.0005) +ax.set_xlim(-0.0005, CUTOFF + 0.015) +ax.legend(loc='upper right') ax.set_xlabel("q-value using 100% of the data without imputation") ax.set_ylabel("q-value using 80% of the data") fname = out_folder / 'lost_signal_qvalues.pdf' @@ -146,7 +150,7 @@ def plot_qvalues(df, x: str, y: list, ax=None, cutoff=0.05, # %% 
mask_gained_signal = ( - (da_target_sel['None'] == True) + (da_target_sel[NONE_COL_NAME]) & (da_target_sel[REF_MODEL] == False) ) sel = qvalues_sel.loc[mask_gained_signal.squeeze()] @@ -157,15 +161,15 @@ def plot_qvalues(df, x: str, y: list, ax=None, cutoff=0.05, # %% da_target_sel_counts = (da_target_sel[ORDER_MODELS] - .loc[mask_gained_signal.squeeze()] - .astype(int) - .replace( - {0: 'TN', - 1: 'FP'} - ).droplevel(-1, axis=1) + .loc[mask_gained_signal.squeeze()] + .astype(int) + .replace( + {0: 'TN', + 1: 'FP'} +).droplevel(-1, axis=1) ) da_target_sel_counts = vaep.pandas.combine_value_counts(da_target_sel_counts) -ax = da_target_sel_counts.T.plot.bar() +ax = da_target_sel_counts.T.plot.bar(ylabel='count') ax.locator_params(axis='y', integer=True) fname = out_folder / 'gained_signal_da_counts.pdf' files_out[fname.name] = fname.as_posix() @@ -175,10 +179,10 @@ def plot_qvalues(df, x: str, y: list, ax=None, cutoff=0.05, ax = plot_qvalues(sel, x=REF_MODEL, y=ORDER_MODELS) -ax.set_xlim(CUTOFF - 0.01, sel[REF_MODEL].max() + 0.005) +# ax.set_xlim(CUTOFF - 0.005, sel[REF_MODEL].max() + 0.005) ax.set_xlabel("q-value using 100% of the data without imputation") -ax.set_ylabel("q-value using 80%") -ax.legend(loc='upper center') +ax.set_ylabel("q-value using 80% of the data") +ax.legend(loc='upper right') fname = out_folder / 'gained_signal_qvalues.pdf' files_out[fname.name] = fname.as_posix() vaep.savefig(ax.figure, fname) diff --git a/project/README.md b/project/README.md index 95f54336d..8091caa67 100644 --- a/project/README.md +++ b/project/README.md @@ -70,8 +70,6 @@ papermill 01_0_split_data.ipynb runs/experiment_03/%DATASET%/experiment_03_data ## Notebooks -- erda: Is the longterm storage of the university -> MQ output was processed on a server attached to erda -- hela: dumps from erda processing (raw file names, aggregated `summaries.txt` from MQ, protein groups, peptides and precursor dumps) - run: a single experiment with models attached, see `workflow/Snakefile` 
- grid: only grid search associated, see `workflow/Snakefile_grid.smk` - best: best models repeatedly trained or across datasets, see `workflow/Snakefile_best_repeated_train.smk` and `workflow/Snakefile_best_across_datasets.smk` @@ -79,21 +77,6 @@ papermill 01_0_split_data.ipynb runs/experiment_03/%DATASET%/experiment_03_data tag | notebook | Description --- | --- | --- -Development data related -erda | erda_01_mq_select_runs.ipynb | Aggregate current summary files from MQ runs into table -erda | erda_02_mq_count_features.ipynb | Aggregate information from all eligable MQ runs
Saves processed files used for data selection (Counters used in `erda_03_training_data.ipynb`) -erda | erda_03_training_data.ipynb | Build training data dump (run for each data level) in wide format -erda | erda_04_transpose_data.ipynb | Transpose dataset (row: a sample), separate as erda has memory limits, dump counts and present-absent patterns -erda | erda_12_explore_raw_MQ_data.ipynb | Load a single MQ txt output folder and browse data
dumps large pickle files for training -erda | erda_data_available.ipynb | Plots on theoretically available data based on Counter dictionaries -hela | 00_0_hela_metadata_rawfiles.ipynb | Analyze rawfile metadata and prepare for data selection -hela | 00_1_hela_MQ_summaries.ipynb | Analyzse summaries.txt data from all samples -hela | 00_2_hela_all_raw_files.ipynb | Find duplicate raw files, analyze sizes -hela | 00_3_hela_selected_files_overview.ipynb | Data description based on file size and metaddata of selected files -hela | 00_4_hela_development_dataset_splitting | Splitting data into development datasets of HeLa cell line data (based on wide format input from `erda_03` and `erda_04`) -Single development dataset | -hela | 00_5_hela_development_dataset_support.ipynb | Support of training data samples/feat on selected development data set -hela | 00_6_hela_training_data_exploration.ipynb | Explore a data set for diagnositics
Visualize key metrics Single experiment | run | 01_0_split_data.ipynb | Create train, validation and test data splits run | 01_1_train_.ipynb | Train a single model e.g. (VAE, DAE, CF) @@ -111,13 +94,7 @@ ald | 16_ald_compare_methods.ipynb | DA comparison between methods ald | 16_ald_ml_new_feat.ipynb | ML model comparison ald | 16_ald_compare_single_pg.ipynb | [DEV] Compare imputation for feat between methods (dist plots) Miscancellous notebooks on different topics (partly exploration) | -misc | misc_clustering_proteins.ipynb | \[documentation\] PCA protein analysis from Annelaura w/ initial data
(Executed, only for documentation) -misc | misc_data_exploration_peptides.ipynb | Describe current peptides training data -misc | misc_data_exploration_proteins.ipynb | \[NEEDS UPDATE\] Describe small initial protein training data misc | misc_embeddings.ipynb | FastAI Embeddings -misc | misc_FASTA_data_agg_by_gene.ipynb | Investigate possibility to join proteins by gene -misc | misc_FASTA_tryptic_digest.ipynb | Analyze fasta file used for peptide identification -misc | misc_id_mapper.ipynb | train models per gene, see overlaps in proteins, see coverage | of proteins with observed peptides, align overlapping peptide sequences misc | misc_illustrations.ipynb | Illustrations of certain concepts (e.g. draw from shifted random distribution) misc | misc_json_formats.ipynb | Investigate storring training data as json with correct encoding misc | misc_MaxQuantOutput.ipynb | \[documentation\] Analyze MQ output, show MaxQuantOutput class behaviour @@ -128,74 +105,6 @@ misc | misc_sampling_in_pandas.ipynb | How to sample in pandas # Notebook descriptions (To be completed) -## erda notebooks - -- [ ] determine order and rename accordingly with prefix - -The data is for now processed only using MaxQuant. If the files are processed -by another Software, these notebooks need to be adapted for if they contain `mq` or `MQ`. 
- -### erda_01_mq_select_runs - -- read in all summaries and select eligable runs based on number of identified peptides - -### erda_02_mq_count_features - -- Feature Extraction and Feature counting -- dumps extracted features per group into `FOLDER_PROCESSED` - (separated for type and by year) - -### erda_03_training_data - -- needs to be executed for each data type -- loads a python config file (setting `FeatureCounter` classes and custom functions) - along string configuration variables - -## HeLa notebooks - Training data - - -### `00_0_1_rawfile_renaming.ipynb` - -> internal, documentation only (see pride upload for result) - -- create a new id for each raw file based on the creation date and instrument -- uses metadata -- build lftp commands for pride upload - -### `00_0_hela_metadata_rawfiles.ipynb` - -- group by MS instrument parameters -- create `data/files_per_instrument_nested.yaml` for selection of data by massspectrometer - -### `00_1_hela_MQ_summaries.ipynb` - -- analysze all `summaries.txt` - -### `00_2_hela_all_raw_files.ipynb` - -### `00_3_hela_selected_files_overview.ipynb` - -- created joined metadata file -- overview of metadata of selected files for data descriptor paper - -### `00_4_hela_development_dataset_splitting.ipynb` - -- Create development dataset(s) of common machines, one for each machine -- UMAP **Figure 1b**, statistics of **Figure 1c** -- create datasets for training PIMMS models - -### Training data inspection - -### `00_5_hela_development_dataset_support.ipynb` - -- feature counts for a single development dataset (e.g. 
for a single machine) - -### `00_6_hela_training_data_exploration.ipynb` - -> needs clean-up - -- explore a data set for diagnositics - ## Single experiment run ### `01_0_split_data.ipynb` diff --git a/project/bin/README.md b/project/bin/README.md new file mode 100644 index 000000000..3e84a2791 --- /dev/null +++ b/project/bin/README.md @@ -0,0 +1,18 @@ +# Computerome2 (CR2) scripts + +Cluster exectuion script for CR2 using a torque-pbs queue. + + +## Distributed + +```bash +qsub run_snakemake_cluster.sh -N snakemake_exp -v configfile=config/single_dev_dataset/example/config.yaml,prefix=exp +``` + +## Single node + +```bash +qsub run_snakemake.sh -N grid_exp +``` + + diff --git a/project/bin/create_qsub_commands.py b/project/bin/create_qsub_commands.py new file mode 100755 index 000000000..d5b2445a9 --- /dev/null +++ b/project/bin/create_qsub_commands.py @@ -0,0 +1,39 @@ +# %% +from itertools import product + +# import subprocess +mnar_mcar = [25, 50, 75] +datasets = ["pg_m", "pg_l", "pep_m", "evi_m", "pep_l", "evi_l"] + +for dataset, perc in product(datasets, mnar_mcar): + print(f"# {dataset = } # {perc = }") + cmd = ( + "qsub bin/run_snakemake_cluster.sh" + f" -N sm_{dataset}_{perc}" + f" -v configfile=config/single_dev_dataset/mnar_mcar/{dataset}.yaml,prefix={dataset}_{perc}," + f"frac_mnar={perc/100:.2f}," + f"config_split=runs/mnar_mcar/{dataset}_{perc}MNAR/01_0_split_data.yaml," + f"config_train=runs/mnar_mcar/{dataset}_{perc}MNAR/train_{{model}}.yaml," + f"folder_experiment=runs/mnar_mcar/{dataset}_{perc}MNAR" + ) + print(cmd) + # subprocess.run(cmd, check=True, shell=True, stdout=subprocess.PIPE) + +# %% [markdown] +# Create local command to run on interactive node +print() +print("#" * 80) +print() +# %% +for dataset, perc in product(datasets, mnar_mcar): + cmd = ( + "snakemake -s workflow/Snakefile_v2" + f" --configfile config/single_dev_dataset/mnar_mcar/{dataset}.yaml" + f" --config frac_mnar={perc/100:.2f}" + f" 
config_split=runs/mnar_mcar/{dataset}_{perc}MNAR/01_0_split_data.yaml" + f" config_train=runs/mnar_mcar/{dataset}_{perc}MNAR/train_{{model}}.yaml" + f" folder_experiment=runs/mnar_mcar/{dataset}_{perc}MNAR" + " -c1" + ) + print(cmd) +# %% diff --git a/workflows/maxquant/run_sm_on_cluster.sh b/project/bin/run_snakemake.sh similarity index 60% rename from workflows/maxquant/run_sm_on_cluster.sh rename to project/bin/run_snakemake.sh index acb0cada8..9b7b2d032 100644 --- a/workflows/maxquant/run_sm_on_cluster.sh +++ b/project/bin/run_snakemake.sh @@ -3,16 +3,17 @@ ### Account information #PBS -W group_list=cpr_10006 -A cpr_10006 ### Job name (comment out the next line to get the name of the script used as the job name) -#PBS -N snakemake +#PBS -N sn_grid ### Output files (comment out the next 2 lines to get the job name used instead) -#PBS -e ${PBS_JOBNAME}.${PBS_JOBID}.e -#PBS -o ${PBS_JOBNAME}.${PBS_JOBID}.o +#PBS -e qsub_logs/${PBS_JOBNAME}.${PBS_JOBID}.e +#PBS -o qsub_logs/${PBS_JOBNAME}.${PBS_JOBID}.o ### Email notification: a=aborts, b=begins, e=ends, n=no notifications #PBS -m ae -M henry.webel@cpr.ku.dk ### Number of nodes -#PBS -l nodes=1:ppn=4,mem=8gb +### other: #PBS -l nodes=1:ppn=20:mem:40g +#PBS -l nodes=1:ppn=40 ### Requesting timeformat is ::: -#PBS -l walltime=7:00:00:00 +#PBS -l walltime=1:00:00:00 ### Forward all environment variables ### if authentification is done using pw in the environment #PBS -V @@ -22,11 +23,12 @@ echo Working directory is $PBS_O_WORKDIR cd $PBS_O_WORKDIR +# start_conda +. 
~/setup_conda.sh +conda activate vaep + +# try to influence how many jobs are run in parallel in one job training a model +export MKL_NUM_THREADS=5 + +snakemake --snakefile workflow/Snakefile_grid.smk --rerun-incomplete -f -j 4 -c 20 -snakemake --jobs 59 -k -p --latency-wait 60 --use-envmodules --rerun-incomplete \ ---cluster "qsub -l walltime={resources.walltime},nodes=1:ppn={threads},mem={resources.mem_mb}mb"\ -" -W group_list=cpr_10006 -A cpr_10006 -m f -V "\ -"-e {params.logdir} -o {params.logdir}" \ ---cluster-status "python qsub-status.py" && -echo "done" || -echo "failed" \ No newline at end of file diff --git a/project/bin/run_snakemake_cluster.sh b/project/bin/run_snakemake_cluster.sh new file mode 100644 index 000000000..20d24ff2c --- /dev/null +++ b/project/bin/run_snakemake_cluster.sh @@ -0,0 +1,65 @@ +#!/bin/sh +### Note: No commands may be executed until after the #PBS lines +### Account information +#PBS -W group_list=cpr_10006 -A cpr_10006 +### Job name (comment out the next line to get the name of the script used as the job name) +#PBS -N snakemake +### Output files (comment out the next 2 lines to get the job name used instead) +#PBS -e qsub_logs/${PBS_JOBNAME}.${PBS_JOBID}.e +#PBS -o qsub_logs/${PBS_JOBNAME}.${PBS_JOBID}.o +### Email notification: a=aborts, b=begins, e=ends, n=no notifications +#PBS -m ae -M henry.webel@cpr.ku.dk +### Number of nodes +#PBS -l nodes=1:ppn=2,mem=16gb +### Requesting timeformat is ::: +#PBS -l walltime=1:12:00:00 + + + +# Go to the directory from where the job was submitted (initial directory is $HOME) +echo Working directory is $PBS_O_WORKDIR +cd $PBS_O_WORKDIR + +cd pimms/project # throws an error, but is not consequential. 
+ +# Get the values of the parameters from the environment variables +prefix=${prefix:-""} +configfile=${configfile:-""} + +# Check if the values are empty +if [ -z "$prefix" ]; then + echo "Error: Missing required parameters: prefix" + exit 1 +# Check if the values are empty +elif [ -z "$configfile" ]; then + echo "Error: Missing required parameters: configfile" + exit 1 +else + echo " # found parameters, see above:" + echo prefix: $prefix + echo configfile: $configfile + echo '####################################################################' +fi + +echo folder_experiment $folder_experiment +echo config_split $config_split +echo config_train $config_train + +. ~/setup_conda.sh +conda activate vaep + +snakemake -s workflow/Snakefile_v2 --jobs 10 -k -p -c2 --latency-wait 60 --rerun-incomplete \ +--configfile $configfile \ +--config frac_mnar=$frac_mnar folder_experiment=$folder_experiment config_split=$config_split config_train=$config_train \ +--max-status-checks-per-second 0.1 \ +--max-jobs-per-second 1 \ +--use-conda \ +--default-resources walltime=3600 \ +--rerun-trigger mtime \ +--cluster "qsub -l walltime={resources.walltime},nodes=1:ppn={threads},mem={resources.mem_mb}mb"\ +" -W group_list=cpr_10006 -A cpr_10006 "\ +" -e {params.err} -o {params.out}"\ +" -N ${prefix}.{params.name}" \ +--cluster-status "python workflow/bin/qsub-status_v2.py" && +echo "done" || +echo "failed" diff --git a/project/config/__init__.py b/project/config/__init__.py deleted file mode 100644 index af7a2d97b..000000000 --- a/project/config/__init__.py +++ /dev/null @@ -1,226 +0,0 @@ -# src.config goes here -# import src.config -> import config - -""" -Project config file. - -Different config for different settings. 
- -os to pathlib functionaly, see -https://docs.python.org/3/library/pathlib.html#correspondence-to-tools-in-the-os-module - -""" -import vaep.io -import logging -import yaml -from collections import namedtuple -from pathlib import Path, PurePath, PurePosixPath -from pprint import pformat - -import numpy as np -import pandas -import matplotlib as mpl - - -def mkdir(path=Path): - path.mkdir(exist_ok=True, parents=True) - return path - - -logger = logging.getLogger('vaep') - -############################################################################### -############################################################################### -# project folder specific -FIGUREFOLDER = Path('Figures') -FIGUREFOLDER.mkdir(exist_ok=True) - -FOLDER_DATA = Path('data') -FOLDER_DATA.mkdir(exist_ok=True) - -FOLDER_PROCESSED = FOLDER_DATA / 'processed' -FOLDER_PROCESSED.mkdir(exist_ok=True) - -FOLDER_TRAINING = mkdir(FOLDER_DATA / 'hela_qc_data') - -# (old) Synonyms -PROCESSED_DATA = FOLDER_PROCESSED -PROTEIN_DUMPS = PROCESSED_DATA - -############################################################################### -############################################################################### -# Adapt this part -ON_ERDA = True -#local PC config -FOLDER_MQ_TXT_DATA = None - -FOLDERS_MQ_TXT_DATA = [ - Path('Y:/') / 'mq_out', - FOLDER_DATA / 'mq_out', - Path('/home/jovyan/work/mq_out/'), -] -for folder in FOLDERS_MQ_TXT_DATA[:-1]: - if folder.exists(): - print(f'FOLDER_MQ_TXT_DATA = {folder}') - FOLDER_MQ_TXT_DATA = folder - ON_ERDA = False - break - -if FOLDERS_MQ_TXT_DATA[-1].exists(): - print(f'FOLDER_MQ_TXT_DATA = {folder}') - FOLDER_MQ_TXT_DATA = folder - -if not FOLDER_MQ_TXT_DATA: - print( - 'Not found. 
Check FOLDER_MQ_TXT_DATA entries above: {}'.format( - ", ".join([str(fname) for fname in FOLDERS_MQ_TXT_DATA]) - ) - ) - FOLDER_MQ_TXT_DATA = FOLDERS_MQ_TXT_DATA[1] - FOLDER_MQ_TXT_DATA.mkdir() - ON_ERDA = False - print(f"Created local folder: {FOLDER_MQ_TXT_DATA}") - -if ON_ERDA: - import sys - sys.path.append('/home/jovyan/work/vaep/') - - FOLDER_MQ_TXT_DATA = Path('/home/jovyan/work/mq_out/') - if FOLDER_MQ_TXT_DATA.exists(): - print(f'FOLDER_MQ_TXT_DATA = {FOLDER_MQ_TXT_DATA}') - else: - raise FileNotFoundError(f"Check config for FOLDER_MQ_TXT_DATA") - - FOLDER_RAW_DATA = Path('/home/jovyan/work/share_hela_raw/') - if FOLDER_RAW_DATA.exists(): - print(f'FOLDER_RAW_DATA = {FOLDER_RAW_DATA}') - else: - raise FileNotFoundError( - f"Check config for FOLDER_RAW_DATA: {FOLDER_RAW_DATA}") - -# FOLDER_KEY = None - -FOLDER_KEY = 'txt' - -############################################################################### -############################################################################### -# Files - -FN_ALL_SUMMARIES = FOLDER_PROCESSED / 'all_summaries.json' - -#FN_PEPTIDE_INTENSITIES = Path(FOLDER_DATA) / 'mq_out' / 'peptide_intensities.pkl' -FN_PEPTIDE_STUMP = 'peptide_intensities' -FN_PEPTIDE_INTENSITIES = Path(FOLDER_DATA) / 'peptide_intensities.pkl' - -FN_PROTEIN_TSV = FOLDER_DATA / 'Mann_Hepa_data.tsv' - -PREFIX_IMPUTED = 'hela_imputed_proteins' -PREFIX_META = 'hela_metadata' - -FOLDER_FASTA = Path(FOLDER_DATA) / 'fasta' -FN_FASTA_DB = FOLDER_FASTA / 'fasta_db.json' -FN_ID_MAP = FOLDER_FASTA / 'id_map.json' -FN_PROT_GENE_MAP = FOLDER_FASTA / 'uniprot_protein_gene_map.json' -FN_PEP_TO_PROT = FOLDER_FASTA / 'peptided_to_prot_id.json' -FN_PROTEIN_SUPPORT_MAP = Path(FOLDER_DATA) / 'protein_support.pkl' -FN_PROTEIN_SUPPORT_FREQ = Path(FOLDER_DATA) / 'dict_protein_support_freq.pkl' - -FN_ALL_RAW_FILES = 'all_raw_files_dump.txt' - -# DATA FASTA Config -KEY_FASTA_HEADER = 'meta' -KEY_FASTA_SEQ = 'seq' -KEY_PEPTIDES = 'peptides' -KEY_GENE_NAME = 'gene' 
-KEY_GENE_NAME_FASTA = 'gene_fasta' - -KEYS_FASTA_ENTRY = [KEY_FASTA_HEADER, - KEY_FASTA_SEQ, KEY_PEPTIDES, KEY_GENE_NAME] - -FastaEntry = namedtuple('FastaEntry', KEYS_FASTA_ENTRY) -fasta_entry = FastaEntry(*KEYS_FASTA_ENTRY) - - -FILEPATH_UTILS = 'src/file_utils.py' - -FNAME_C_PEPTIDES = FOLDER_PROCESSED / \ - 'count_all_peptides.json' # aggregated peptides -# evidence peptides (sequence, charge, modification) -FNAME_C_EVIDENCE = FOLDER_PROCESSED / 'count_all_evidences.json' - -FNAME_C_PG = FOLDER_PROCESSED / 'count_all_protein_groups.json' -FNAME_C_GENES = FOLDER_PROCESSED / 'count_all_genes.json' - - -def build_df_fname(df: pandas.DataFrame, stub: str) -> str: - N, M = df.shape - return f'{stub}_N{N:05d}_M{M:05d}' - - -def insert_shape(df: pandas.DataFrame, template: str = "filename{}.txt", shape=None): - if shape is None: - N, M = df.shape - else: - N, M = shape - return template.format(f'_N{N:05d}_M{M:05d}') - -# put to testing -# df_test = pd.DataFrame(np.random.randint(low=-4, high=10, size=(1729, 146))) -# N, M = df_test.shape -# assert build_fname(df=df_test, stub='df_intensities') == f'df_intensities_N{N:05d}_M{M:05d}' - - -############################################################################### -############################################################################### -# configure plotting -# https://matplotlib.org/stable/users/dflt_style_changes.html -mpl.rcParams['figure.figsize'] = [10.0, 8.0] - -# cfg.keys.gene_name -# cfg.paths.processed -# cfg. - - -class Config(): - """Config class with a setter enforcing that config entries cannot - be overwritten. 
- - - Can contain configs, which are itself configs: - keys, paths, - - """ - - def __setattr__(self, entry, value): - """Set if attribute not in instance.""" - if hasattr(self, entry) and getattr(self, entry) != value: - raise AttributeError( - f'{entry} already set to {getattr(self, entry)}') - super().__setattr__(entry, value) - - def __repr__(self): - return pformat(vars(self)) # does not work in Jupyter? - - def overwrite_entry(self, entry, value): - """Explicitly overwrite a given value.""" - super().__setattr__(entry, value) - - def dump(self, fname=None): - if fname is None: - try: - fname = self.out_folder - fname = Path(fname) / 'model_config.yml' - except AttributeError: - raise AttributeError( - 'Specify fname or set "out_folder" attribute.') - d = vaep.io.parse_dict(input_dict=self.__dict__) - with open(fname, 'w') as f: - yaml.dump(d, f) - logger.info(f"Dumped config to: {fname}") - - -if __name__ == '__main__': - cfg = Config() - cfg.test = 'test' - print(cfg.test) - cfg.test = 'raise ValueError' diff --git a/project/config/across_datasets/config.yaml b/project/config/across_datasets/config.yaml index f7a75f630..1e278aa07 100644 --- a/project/config/across_datasets/config.yaml +++ b/project/config/across_datasets/config.yaml @@ -1,3 +1,4 @@ +# # config for Snakefile_best_across_datasets epochs_max: - 100 batch_size: 64 diff --git a/project/config/appl_ald_data/plasma/aggPeptides/train.yaml b/project/config/appl_ald_data/plasma/aggPeptides/train.yaml index 82c32663f..6bffe907c 100644 --- a/project/config/appl_ald_data/plasma/aggPeptides/train.yaml +++ b/project/config/appl_ald_data/plasma/aggPeptides/train.yaml @@ -3,5 +3,5 @@ file_format: pkl latent_dim: 20 hidden_layers: "512" sample_idx_position: 0 -cuda: True +cuda: False save_pred_real_na: True \ No newline at end of file diff --git a/project/config/appl_ald_data/plasma/grid_search/config_grid.yaml b/project/config/appl_ald_data/plasma/grid_search/config_grid.yaml index dcddbc2eb..27737b9f4 
100644 --- a/project/config/appl_ald_data/plasma/grid_search/config_grid.yaml +++ b/project/config/appl_ald_data/plasma/grid_search/config_grid.yaml @@ -29,4 +29,4 @@ levels: config_split: 'config/appl_ald_data/plasma/{level}/split.yaml' config_train: 'placeholder.yaml' name_template: run_LD_{latent_dim}_E_{epochs_max} -cuda: True \ No newline at end of file +cuda: False \ No newline at end of file diff --git a/project/config/appl_ald_data/plasma/proteinGroups/comparison.yaml b/project/config/appl_ald_data/plasma/proteinGroups/comparison.yaml index 17c5ebdd2..dd490ea0c 100644 --- a/project/config/appl_ald_data/plasma/proteinGroups/comparison.yaml +++ b/project/config/appl_ald_data/plasma/proteinGroups/comparison.yaml @@ -1,8 +1,8 @@ -folder_experiment: runs/appl_ald_data/plasma/proteinGroups +folder_experiment: runs/appl_ald_data_2023_11/plasma/proteinGroups out_folder: diff_analysis # subfolder of experiment folder -fn_rawfile_metadata: 'data/ALD_study/processed/raw_meta.csv' +fn_rawfile_metadata: "data/ALD_study/processed/raw_meta.csv" make_plots: True -covar: +covar: kleiner: age,bmi,gender_num,nas_steatosis_ordinal,abstinent_num inflammation: age,bmi,gender_num,nas_steatosis_ordinal,abstinent_num steatosis: age,bmi,gender_num,abstinent_num,kleiner,nas_inflam @@ -19,9 +19,10 @@ annotaitons_gene_col: PG.Genes baseline: RSN ref_method_score: methods: - - Median - - CF - - DAE - - VAE - - rf - - KNN + - Median + - CF + - DAE + - VAE + - QRILC + - TRKNN + - RF diff --git a/project/config/appl_ald_data/plasma/proteinGroups/config.yaml b/project/config/appl_ald_data/plasma/proteinGroups/config.yaml index 24a8a3880..c17dc8e13 100644 --- a/project/config/appl_ald_data/plasma/proteinGroups/config.yaml +++ b/project/config/appl_ald_data/plasma/proteinGroups/config.yaml @@ -1,6 +1,7 @@ +# config for Snakefile_v1 config_split: config/appl_ald_data/plasma/proteinGroups/split.yaml config_train: config/appl_ald_data/plasma/proteinGroups/train_{model}.yaml -folder_experiment: 
runs/appl_ald_data/plasma/proteinGroups +folder_experiment: runs/appl_ald_data_2023_11/plasma/proteinGroups fn_rawfile_metadata: data/ALD_study/processed/raw_meta.csv file_format: pkl models: @@ -11,24 +12,27 @@ models: - VAE - KNN NAGuideR_methods: - - ZERO - - MINIMUM + - BPCA - COLMEDIAN - - ROWMEDIAN + - GSIMP + - IMPSEQ + - IMPSEQROB + - IRM - KNN_IMPUTE - # - SEQKNN # error - - BPCA - - SVDMETHOD - LLS - - MLE - - QRILC + # - MICE-CART # stopped after 24h + # - MICE-NORM # stopped after 24h - MINDET + - MINIMUM - MINPROB - - IRM - # - IMPSEQ # error - - IMPSEQROB - # - MICE-NORM # stopped after 30mins - # - MICE-CART # stopped after 30mins - # - TRKNN # error + - MLE + - MSIMPUTE + - MSIMPUTE_MNAR + - PI + - QRILC - RF - - PI \ No newline at end of file + - ROWMEDIAN + # - SEQKNN # Error in x[od, ismiss, drop = FALSE]: subscript out of bounds + - SVDMETHOD + - TRKNN + - ZERO diff --git a/project/config/appl_ald_data/plasma/proteinGroups/config_reps.yaml b/project/config/appl_ald_data/plasma/proteinGroups/config_reps.yaml new file mode 100644 index 000000000..77ab1b896 --- /dev/null +++ b/project/config/appl_ald_data/plasma/proteinGroups/config_reps.yaml @@ -0,0 +1,17 @@ +# config for Snakefile_v1 +config_split: config/appl_ald_data/plasma/proteinGroups/split.yaml +config_train: config/appl_ald_data/plasma/proteinGroups/train_{model}.yaml +folder_experiment: runs/appl_ald_data_2023_11/plasma/proteinGroups +fn_rawfile_metadata: data/ALD_study/processed/raw_meta.csv +file_format: pkl +models: + - RSN + - Median # maps to median on file system? 
+ - CF + - DAE + - VAE + - KNN +NAGuideR_methods: + - QRILC + - RF + - TRKNN diff --git a/project/config/appl_ald_data/plasma/proteinGroups/split.yaml b/project/config/appl_ald_data/plasma/proteinGroups/split.yaml index d1737df8e..e0e85bcc0 100644 --- a/project/config/appl_ald_data/plasma/proteinGroups/split.yaml +++ b/project/config/appl_ald_data/plasma/proteinGroups/split.yaml @@ -3,4 +3,5 @@ fn_rawfile_metadata: data/ALD_study/processed/raw_meta.csv sample_completeness: 0.5 min_RT_time: 20 column_names: - - PG.ProteinAccessions \ No newline at end of file + - PG.ProteinAccessions +frac_mnar: 0.25 diff --git a/project/config/appl_ald_data/plasma/proteinGroups_80%_dataset/config.yaml b/project/config/appl_ald_data/plasma/proteinGroups_80%_dataset/config.yaml deleted file mode 100644 index 91db84805..000000000 --- a/project/config/appl_ald_data/plasma/proteinGroups_80%_dataset/config.yaml +++ /dev/null @@ -1,23 +0,0 @@ -config_split: config/appl_ald_data/plasma/proteinGroups_80%_dataset/split.yaml -config_train: config/appl_ald_data/plasma/proteinGroups/train_{model}.yaml -folder_experiment: runs/appl_ald_data/plasma/proteinGroups_80%_dataset -fn_rawfile_metadata: 'data/ALD_study/processed/raw_meta.csv' -file_format: pkl -models: - - RSN - - Median # maps to median on file system? 
- - CF - - DAE - - VAE - - KNN -NAGuideR_methods: - # - lls - - knnmethod - # - seqknn # error - - rf - # - impseq # error - - qrilc - - minprob - - mindet - - svdmethod - # - mice-norm # stopped after 8h \ No newline at end of file diff --git a/project/config/appl_ald_data/plasma/proteinGroups_80%_dataset/comparison.yaml b/project/config/appl_ald_data/plasma/proteinGroups_80perc_dataset/comparison.yaml similarity index 68% rename from project/config/appl_ald_data/plasma/proteinGroups_80%_dataset/comparison.yaml rename to project/config/appl_ald_data/plasma/proteinGroups_80perc_dataset/comparison.yaml index d1a4ab35a..785724574 100644 --- a/project/config/appl_ald_data/plasma/proteinGroups_80%_dataset/comparison.yaml +++ b/project/config/appl_ald_data/plasma/proteinGroups_80perc_dataset/comparison.yaml @@ -1,7 +1,7 @@ -folder_experiment: runs/appl_ald_data/plasma/proteinGroups_80%_dataset +folder_experiment: runs/appl_ald_data_2023_11/plasma/proteinGroups_80perc_25MNAR out_folder: diff_analysis # subfolder of experiment folder -fn_rawfile_metadata: 'data/ALD_study/processed/raw_meta.csv' -covar: +fn_rawfile_metadata: "data/ALD_study/processed/raw_meta.csv" +covar: kleiner: age,bmi,gender_num,nas_steatosis_ordinal,abstinent_num inflammation: age,bmi,gender_num,nas_steatosis_ordinal,abstinent_num steatosis: age,bmi,gender_num,abstinent_num,kleiner,nas_inflam @@ -17,13 +17,14 @@ f_annotations: data/ALD_study/processed/ald_plasma_proteinGroups_id_mappings.csv annotaitons_gene_col: PG.Genes baseline: RSN # ! needs analysis full dataset: -ref_method_score: runs/appl_ald_data/plasma/proteinGroups/diff_analysis/kleiner/scores/diff_analysis_scores_None.pkl +ref_method_score: runs/appl_ald_data_2023_11/plasma/proteinGroups/diff_analysis/kleiner/scores/diff_analysis_scores_None.pkl # ! 
# needs to be false if ref method is specified in current setup -make_plots: False +make_plots: False methods: - - Median - - CF - - DAE - - VAE - - rf - - KNN + - Median + - CF + - DAE + - VAE + - QRILC + - TRKNN + - RF diff --git a/project/config/appl_ald_data/plasma/proteinGroups_80perc_dataset/config.yaml b/project/config/appl_ald_data/plasma/proteinGroups_80perc_dataset/config.yaml new file mode 100644 index 000000000..37177c1c2 --- /dev/null +++ b/project/config/appl_ald_data/plasma/proteinGroups_80perc_dataset/config.yaml @@ -0,0 +1,38 @@ +# config for Snakefile_v1 +config_split: config/appl_ald_data/plasma/proteinGroups_80perc_dataset/split.yaml +config_train: config/appl_ald_data/plasma/proteinGroups/train_{model}.yaml +folder_experiment: runs/appl_ald_data_2023_11/plasma/proteinGroups_80perc_25MNAR +fn_rawfile_metadata: "data/ALD_study/processed/raw_meta.csv" +file_format: pkl +models: + - RSN + - Median # maps to median on file system? + - CF + - DAE + - VAE + - KNN +NAGuideR_methods: + - BPCA + - COLMEDIAN + - GSIMP # slow + - IMPSEQ + - IMPSEQROB + - IRM + - KNN_IMPUTE + - LLS + # - MICE-CART # stopped after 24h + # - MICE-NORM # stopped after 24h + - MINDET + - MINIMUM + - MINPROB + - MLE + - MSIMPUTE + - MSIMPUTE_MNAR + - PI + - QRILC + - RF + - ROWMEDIAN + # - SEQKNN # Error in x[od, ismiss, drop = FALSE]: subscript out of bounds + - SVDMETHOD + - TRKNN + - ZERO diff --git a/project/config/appl_ald_data/plasma/proteinGroups_80%_dataset/split.yaml b/project/config/appl_ald_data/plasma/proteinGroups_80perc_dataset/split.yaml similarity index 74% rename from project/config/appl_ald_data/plasma/proteinGroups_80%_dataset/split.yaml rename to project/config/appl_ald_data/plasma/proteinGroups_80perc_dataset/split.yaml index 3d4ef529e..b28fafef9 100644 --- a/project/config/appl_ald_data/plasma/proteinGroups_80%_dataset/split.yaml +++ b/project/config/appl_ald_data/plasma/proteinGroups_80perc_dataset/split.yaml @@ -1,6 +1,7 @@ -FN_INTENSITIES: 
data/ALD_study/processed/ald_plasma_proteinGroups_0.80.pkl +FN_INTENSITIES: data/ALD_study/processed/ald_plasma_proteinGroups_80perc_25MNAR.pkl fn_rawfile_metadata: data/ALD_study/processed/raw_meta.csv sample_completeness: 0.3 min_RT_time: 20 column_names: - - PG.ProteinAccessions \ No newline at end of file + - PG.ProteinAccessions +frac_mnar: 0.25 diff --git a/project/config/counter_fpaths.py b/project/config/counter_fpaths.py deleted file mode 100644 index 5d0218092..000000000 --- a/project/config/counter_fpaths.py +++ /dev/null @@ -1,7 +0,0 @@ -from .defaults import FOLDER_PROCESSED - -FNAME_C_PEPTIDES = FOLDER_PROCESSED / 'count_all_peptides.json' # aggregated peptides -FNAME_C_EVIDENCE = FOLDER_PROCESSED / 'count_all_evidences.json' # evidence peptides (sequence, charge, modification) - -FNAME_C_PG = FOLDER_PROCESSED / 'count_all_protein_groups.json' -FNAME_C_GENES = FOLDER_PROCESSED / 'count_all_genes.json' diff --git a/project/config/defaults.py b/project/config/defaults.py deleted file mode 100644 index 239453cb0..000000000 --- a/project/config/defaults.py +++ /dev/null @@ -1,17 +0,0 @@ -from pathlib import Path - - -def mkdir(path=Path): - path.mkdir(exist_ok=True, parents=True) - return path - - -# project folder specific -FIGUREFOLDER = mkdir(Path('Figures')) -FOLDER_DATA = mkdir(Path('data')) -FOLDER_PROCESSED = mkdir(FOLDER_DATA / 'processed') -FOLDER_TRAINING = mkdir(FOLDER_DATA / 'hela_qc_data') - -# (old) Synonyms -PROCESSED_DATA = FOLDER_PROCESSED -PROTEIN_DUMPS = PROCESSED_DATA \ No newline at end of file diff --git a/project/config/config_grid.yaml b/project/config/grid_search_large_data/config_grid.yaml similarity index 90% rename from project/config/config_grid.yaml rename to project/config/grid_search_large_data/config_grid.yaml index 6d9e4a737..1523a47df 100644 --- a/project/config/config_grid.yaml +++ b/project/config/grid_search_large_data/config_grid.yaml @@ -29,11 +29,11 @@ levels: - proteinGroups - peptides - evidence -config_split: 
'config/single_dev_dataset/{level}/split.yaml' +config_split: config/grid_search_large_data/{level}_split.yaml file_format: csv config_train: 'placeholder.yaml' name_template: run/LD_{latent_dim}_E_{epochs_max} -cuda: True +cuda: False NAGuideR_methods: models: - CF diff --git a/project/config/config_grid_small.yaml b/project/config/grid_search_large_data/config_grid_small.yaml similarity index 88% rename from project/config/config_grid_small.yaml rename to project/config/grid_search_large_data/config_grid_small.yaml index 69d18852b..c29a09dbd 100644 --- a/project/config/config_grid_small.yaml +++ b/project/config/grid_search_large_data/config_grid_small.yaml @@ -16,10 +16,10 @@ levels: - proteinGroups - peptides # - evidence -config_split: 'config/single_dev_dataset/{level}/split.yaml' +config_split: 'config/grid_search_large_data/{level}_split.yaml' file_format: csv config_train: 'placeholder.yaml' -cuda: True +cuda: False NAGuideR_methods: models: - Median diff --git a/project/config/grid_search_large_data/evidence_split.yaml b/project/config/grid_search_large_data/evidence_split.yaml new file mode 100644 index 000000000..e766c0b62 --- /dev/null +++ b/project/config/grid_search_large_data/evidence_split.yaml @@ -0,0 +1,5 @@ +FN_INTENSITIES: data/dev_datasets/df_intensities_evidence_long/Q_Exactive_HF_X_Orbitrap_6070.pkl +fn_rawfile_metadata: data/dev_datasets/df_intensities_evidence_long/metadata.csv +sample_completeness: 0.5 +min_RT_time: 120 +column_names: null \ No newline at end of file diff --git a/project/config/grid_search_large_data/peptides_split.yaml b/project/config/grid_search_large_data/peptides_split.yaml new file mode 100644 index 000000000..732957cb3 --- /dev/null +++ b/project/config/grid_search_large_data/peptides_split.yaml @@ -0,0 +1,5 @@ +FN_INTENSITIES: data/dev_datasets/df_intensities_peptides_long/Q_Exactive_HF_X_Orbitrap_6070.pkl +fn_rawfile_metadata: data/dev_datasets/df_intensities_peptides_long/metadata.csv +sample_completeness: 0.5 
+min_RT_time: 120 +column_names: null \ No newline at end of file diff --git a/project/config/grid_search_large_data/proteinGroups_split.yaml b/project/config/grid_search_large_data/proteinGroups_split.yaml new file mode 100644 index 000000000..675f1b785 --- /dev/null +++ b/project/config/grid_search_large_data/proteinGroups_split.yaml @@ -0,0 +1,3 @@ +FN_INTENSITIES: data/dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070.pkl +sample_completeness: 0.5 +min_RT_time: 120 \ No newline at end of file diff --git a/project/config/grid_search_small_data/config_grid.yaml b/project/config/grid_search_small_data/config_grid.yaml index 67dbeb14f..0ba053433 100644 --- a/project/config/grid_search_small_data/config_grid.yaml +++ b/project/config/grid_search_small_data/config_grid.yaml @@ -32,7 +32,7 @@ levels: config_split: 'config/grid_search_small_data/{level}_split.yaml' config_train: 'placeholder.yaml' name_template: run_LD_{latent_dim}_E_{epochs_max} -cuda: True +cuda: False NAGuideR_methods: models: - CF diff --git a/project/config/knn_comparison/ald_pgs_all/README.md b/project/config/knn_comparison/ald_pgs_all/README.md new file mode 100644 index 000000000..95ea87933 --- /dev/null +++ b/project/config/knn_comparison/ald_pgs_all/README.md @@ -0,0 +1,7 @@ +# KNN comparison + +for ALD protein groups dataset. 
+ +```bash +snakemake -s workflow/Snakefile_v2 --configfile config/knn_comparison/ald_pgs_all/config.yaml -p -c1 -n +``` \ No newline at end of file diff --git a/project/config/knn_comparison/ald_pgs_all/config.yaml b/project/config/knn_comparison/ald_pgs_all/config.yaml new file mode 100644 index 000000000..96c85270a --- /dev/null +++ b/project/config/knn_comparison/ald_pgs_all/config.yaml @@ -0,0 +1,30 @@ +# config for Snakefile_v2 +config_train: runs/knn_comparison/ald_pgs_all/configs_train/train_{model}.yaml +folder_experiment: runs/knn_comparison/ald_pgs_all +fn_rawfile_metadata: data/ALD_study/processed/raw_meta.csv +file_format: pkl +cuda: False +split_data: + FN_INTENSITIES: data/ALD_study/processed/ald_plasma_proteinGroups.pkl + sample_completeness: 0.5 + min_RT_time: 20 + column_names: + - PG.ProteinAccessions +# frac_mnar: 0.0 +models: + - Median: + model: Median + - 3NN: + neighbors: 3 + model: KNN + - 5NN: + neighbors: 5 + model: KNN + - 10NN: + neighbors: 10 + model: KNN + - 15NN: + neighbors: 15 + model: KNN +NAGuideR_methods: + - KNN_IMPUTE diff --git a/project/config/knn_comparison/hela_pgs_large/README.md b/project/config/knn_comparison/hela_pgs_large/README.md new file mode 100644 index 000000000..b22dd0b33 --- /dev/null +++ b/project/config/knn_comparison/hela_pgs_large/README.md @@ -0,0 +1,7 @@ +# KNN comparison + +for large protein groups HeLa dataset. 
+ +```bash +snakemake -s workflow/Snakefile_v2 --configfile config/knn_comparison/hela_pgs_large/config.yaml -p -c1 -n +``` \ No newline at end of file diff --git a/project/config/knn_comparison/hela_pgs_large/config.yaml b/project/config/knn_comparison/hela_pgs_large/config.yaml new file mode 100644 index 000000000..671a9c222 --- /dev/null +++ b/project/config/knn_comparison/hela_pgs_large/config.yaml @@ -0,0 +1,24 @@ +# config for Snakefile_v2 +config_split: config/knn_comparison/hela_pgs_large/split.yaml +config_train: runs/knn_comparison/hela_pgs_large/configs_train/train_{model}.yaml +folder_experiment: runs/knn_comparison/hela_pgs_large +fn_rawfile_metadata: None +file_format: csv +cuda: False +models: + - Median: + model: Median + - 3NN: + neighbors: 3 + model: KNN + - 5NN: + neighbors: 5 + model: KNN + - 10NN: + neighbors: 10 + model: KNN + - 15NN: + neighbors: 15 + model: KNN +NAGuideR_methods: + - KNN_IMPUTE diff --git a/project/config/knn_comparison/hela_pgs_large/inspect_data.yaml b/project/config/knn_comparison/hela_pgs_large/inspect_data.yaml new file mode 100644 index 000000000..abc129e66 --- /dev/null +++ b/project/config/knn_comparison/hela_pgs_large/inspect_data.yaml @@ -0,0 +1,11 @@ +FN_INTENSITIES: data/dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070.pkl +FOLDER_EXPERIMENT: runs/knn_comparison/hela_pgs_large +N_FIRST_ROWS: +LOG_TRANSFORM: True +INDEX_COL: + - 0 + - 1 +LONG_FORMAT: True +COMPLETENESS_OVER_SAMPLES: 0.25 +MIN_FEAT_PER_SAMPLE: 0.4 +PG_SEPARATOR: ; diff --git a/project/config/knn_comparison/hela_pgs_large/split.yaml b/project/config/knn_comparison/hela_pgs_large/split.yaml new file mode 100644 index 000000000..efb1f12d0 --- /dev/null +++ b/project/config/knn_comparison/hela_pgs_large/split.yaml @@ -0,0 +1,3 @@ +FN_INTENSITIES: data/dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070.pkl +sample_completeness: 0.5 +frac_mnar: 0.25 diff --git 
a/project/config/permuted_dataset/config.yaml b/project/config/permuted_dataset/config.yaml index b917f88c4..0b338744f 100644 --- a/project/config/permuted_dataset/config.yaml +++ b/project/config/permuted_dataset/config.yaml @@ -1,17 +1,16 @@ +# config for Snakefile_v1 # fit permuted data to the same model as the original data -config_split: config/permuted_dataset/split.yaml # proteinGroups +config_split: config/permuted_dataset/split.yaml # proteinGroups config_train: config/single_dev_dataset/proteinGroups/train_{model}.yaml folder_experiment: runs/permuted #/proteinGroups fn_rawfile_metadata: # no metadata for permuted data -cuda: True +cuda: False models: - - Median - - CF - - DAE - - VAE - - KNN + - Median + - CF + - DAE + - VAE + - KNN NAGuideR_methods: - - lls - - knnmethod - - rf - # - impseq # fails \ No newline at end of file + - KNN_IMPUTE + # - RF diff --git a/project/config/permuted_dataset/split.yaml b/project/config/permuted_dataset/split.yaml index 2bc14b229..69441fd23 100644 --- a/project/config/permuted_dataset/split.yaml +++ b/project/config/permuted_dataset/split.yaml @@ -1,2 +1,3 @@ FN_INTENSITIES: data/dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070_permuted.pkl sample_completeness: 0.5 +frac_mnar: 0.25 diff --git a/project/config/repeat_best/split.yaml b/project/config/repeat_best/split.yaml index d47b203f9..766aa05c7 100644 --- a/project/config/repeat_best/split.yaml +++ b/project/config/repeat_best/split.yaml @@ -1,13 +1,14 @@ epochs_max: -- 100 + - 100 repeats: 5 folder: "runs/repeat_best_split" levels: -- proteinGroups -- peptides -- evidence + - proteinGroups + - peptides + - evidence fn_rawfile_metadata: data/dev_datasets/df_intensities_{level}_long/metadata.csv -config_split: 'config/single_dev_dataset/{level}/split.yaml' -config_train: 'config/single_dev_dataset/{level}/train_{model}.yaml' -repitition_name: 'repeat' +config_split: "config/single_dev_dataset/{level}/split.yaml" +config_train: 
"config/single_dev_dataset/{level}/train_{model}.yaml" +repitition_name: "repeat" file_format: pkl +cuda: True diff --git a/project/config/repeat_best/train.yaml b/project/config/repeat_best/train.yaml index b8cf4be09..976284b60 100644 --- a/project/config/repeat_best/train.yaml +++ b/project/config/repeat_best/train.yaml @@ -1,13 +1,14 @@ epochs_max: -- 100 + - 100 repeats: 5 folder: "runs/repeat_best_train" levels: -- proteinGroups -- peptides -- evidence + - proteinGroups + - peptides + - evidence fn_rawfile_metadata: data/dev_datasets/df_intensities_{level}_long/metadata.csv -config_split: 'config/single_dev_dataset/{level}/split.yaml' -config_train: 'config/single_dev_dataset/{level}/train_{model}.yaml' -repitition_name: 'repeat' +config_split: "config/single_dev_dataset/{level}/split.yaml" +config_train: "config/single_dev_dataset/{level}/train_{model}.yaml" +repitition_name: "repeat" file_format: pkl +cuda: True diff --git a/project/config/single_dev_dataset/evidence/config.yaml b/project/config/single_dev_dataset/evidence/config.yaml index 031a42614..abda49a9b 100644 --- a/project/config/single_dev_dataset/evidence/config.yaml +++ b/project/config/single_dev_dataset/evidence/config.yaml @@ -1,32 +1,35 @@ +# config for Snakefile_v1 config_split: config/single_dev_dataset/evidence/split.yaml config_train: config/single_dev_dataset/evidence/train_{model}.yaml folder_experiment: runs/dev_dataset_large/evidence fn_rawfile_metadata: data/dev_datasets/df_intensities_evidence_long/metadata.csv +cuda: False models: - - Median - - CF - - DAE - - VAE - - KNN + - Median + - CF + - DAE + - VAE + - KNN NAGuideR_methods: - - ZERO - - MINIMUM - - COLMEDIAN - - ROWMEDIAN - - KNN_IMPUTE - # - SEQKNN # error - # - BPCA # stopped after 30mins - # - SVDMETHOD # stopped after 30mins - # - LLS # stopped after 30mins - # - MLE # error - - QRILC - - MINDET - - MINPROB - # - IRM # stopped after 30mins - # - IMPSEQ # error - # - IMPSEQROB # error - # - MICE-NORM # stopped after 
30mins - # - MICE-CART # stopped after 30mins - # - TRKNN # error - # - RF # stopped after 30mins - - PI + # - BPCA # stopped after 24h + - COLMEDIAN + - IMPSEQ + - IMPSEQROB + # - IRM # stopped after 24h + - KNN_IMPUTE + # - LLS # error + # - MICE-CART # stopped after 24h + # - MICE-NORM # stopped after 24h + - MINDET + - MINIMUM + - MINPROB + - MLE + - MSIMPUTE + - PI + - QRILC + # - RF # stopped after 24h + - ROWMEDIAN + # - SEQKNN # error: Error in x[od, ismiss, drop = FALSE]: subscript out of bounds + - SVDMETHOD + # - TRKNN # stopped after 24h + - ZERO diff --git a/project/config/single_dev_dataset/evidence/split.yaml b/project/config/single_dev_dataset/evidence/split.yaml index e766c0b62..8b85e3ad0 100644 --- a/project/config/single_dev_dataset/evidence/split.yaml +++ b/project/config/single_dev_dataset/evidence/split.yaml @@ -2,4 +2,6 @@ FN_INTENSITIES: data/dev_datasets/df_intensities_evidence_long/Q_Exactive_HF_X_O fn_rawfile_metadata: data/dev_datasets/df_intensities_evidence_long/metadata.csv sample_completeness: 0.5 min_RT_time: 120 -column_names: null \ No newline at end of file +column_names: null +frac_mnar: 0.25 +frac_non_train: 0.1 diff --git a/project/config/single_dev_dataset/evidence/train_CF.yaml b/project/config/single_dev_dataset/evidence/train_CF.yaml index 9324564b6..2068cc158 100644 --- a/project/config/single_dev_dataset/evidence/train_CF.yaml +++ b/project/config/single_dev_dataset/evidence/train_CF.yaml @@ -3,5 +3,5 @@ latent_dim: 50 batch_size: 32768 epochs_max: 100 sample_idx_position: 0 -cuda: True +cuda: False save_pred_real_na: True \ No newline at end of file diff --git a/project/config/single_dev_dataset/evidence/train_DAE.yaml b/project/config/single_dev_dataset/evidence/train_DAE.yaml index 05d68c70b..41ffcb066 100644 --- a/project/config/single_dev_dataset/evidence/train_DAE.yaml +++ b/project/config/single_dev_dataset/evidence/train_DAE.yaml @@ -4,5 +4,5 @@ batch_size: 64 epochs_max: 100 hidden_layers: "512" 
sample_idx_position: 0 -cuda: True +cuda: False save_pred_real_na: True \ No newline at end of file diff --git a/project/config/single_dev_dataset/evidence/train_VAE.yaml b/project/config/single_dev_dataset/evidence/train_VAE.yaml index 87565f179..b3a22c8bb 100644 --- a/project/config/single_dev_dataset/evidence/train_VAE.yaml +++ b/project/config/single_dev_dataset/evidence/train_VAE.yaml @@ -4,5 +4,5 @@ batch_size: 64 epochs_max: 100 hidden_layers: "512" sample_idx_position: 0 -cuda: True +cuda: False save_pred_real_na: True \ No newline at end of file diff --git a/project/config/single_dev_dataset/evidence_N50/config.yaml b/project/config/single_dev_dataset/evidence_N50/config.yaml index cc99984d2..fea06b5df 100644 --- a/project/config/single_dev_dataset/evidence_N50/config.yaml +++ b/project/config/single_dev_dataset/evidence_N50/config.yaml @@ -1,3 +1,4 @@ +# config for Snakefile_v1 config_split: config/single_dev_dataset/evidence_N50/split.yaml config_train: config/single_dev_dataset/evidence_N50/train_{model}.yaml folder_experiment: runs/dev_dataset_small/evidence_N50 @@ -5,29 +6,29 @@ fn_rawfile_metadata: data/dev_datasets/df_intensities_evidence_long/metadata.csv file_format: csv models: - Median - - CF - - DAE - - VAE + - CF # 1min29s + - DAE # 2min36s + - VAE # 4min05s - KNN -NAGuideR_methods: - - ZERO - - MINIMUM + NAGuideR_methods: + - BPCA #27min - COLMEDIAN - - ROWMEDIAN + - IMPSEQ # 1min + - IMPSEQROB + - IRM # 12h00min - KNN_IMPUTE - # - SEQKNN # error - - BPCA - - SVDMETHOD - # - LLS # stopped after 30mins - - MLE - - QRILC # fails for N=10 + # - LLS # error: out of memory + - MICE-CART # 2h39min + - MICE-NORM # 5min - MINDET + - MINIMUM - MINPROB - # - IRM # stopped after 30mins - # - IMPSEQ # error - - IMPSEQROB - - MICE-NORM - # - MICE-CART # stopped after 30mins - # - TRKNN # error - # - RF # stopped after 30mins - - PI \ No newline at end of file + - MLE + - MSIMPUTE + - PI + - QRILC + - RF # 3h44min + - ROWMEDIAN + - SEQKNN # 24min + - 
SVDMETHOD # 1min + - TRKNN # 5h26min \ No newline at end of file diff --git a/project/config/single_dev_dataset/evidence_N50/split.yaml b/project/config/single_dev_dataset/evidence_N50/split.yaml index b72f99608..369b70aa8 100644 --- a/project/config/single_dev_dataset/evidence_N50/split.yaml +++ b/project/config/single_dev_dataset/evidence_N50/split.yaml @@ -4,4 +4,7 @@ min_RT_time: 120 select_N: 50 column_names: null index_col: 0 -meta_date_col: Content Creation Date \ No newline at end of file +meta_date_col: Content Creation Date +frac_mnar: 0.25 +frac_non_train: 0.1 + diff --git a/project/config/single_dev_dataset/evidence_N50/train_CF.yaml b/project/config/single_dev_dataset/evidence_N50/train_CF.yaml index 2ca3c5ac1..802add8cd 100644 --- a/project/config/single_dev_dataset/evidence_N50/train_CF.yaml +++ b/project/config/single_dev_dataset/evidence_N50/train_CF.yaml @@ -4,5 +4,5 @@ latent_dim: 50 batch_size: 4096 epochs_max: 30 sample_idx_position: 0 -cuda: True +cuda: False save_pred_real_na: True \ No newline at end of file diff --git a/project/config/single_dev_dataset/evidence_N50/train_DAE.yaml b/project/config/single_dev_dataset/evidence_N50/train_DAE.yaml index cbbd89129..feefc16e1 100644 --- a/project/config/single_dev_dataset/evidence_N50/train_DAE.yaml +++ b/project/config/single_dev_dataset/evidence_N50/train_DAE.yaml @@ -4,5 +4,5 @@ batch_size: 10 epochs_max: 100 hidden_layers: "256" sample_idx_position: 0 -cuda: True +cuda: False save_pred_real_na: True diff --git a/project/config/single_dev_dataset/evidence_N50/train_VAE.yaml b/project/config/single_dev_dataset/evidence_N50/train_VAE.yaml index 4eba2e370..ea2604123 100644 --- a/project/config/single_dev_dataset/evidence_N50/train_VAE.yaml +++ b/project/config/single_dev_dataset/evidence_N50/train_VAE.yaml @@ -6,5 +6,5 @@ batch_size: 10 epochs_max: 100 hidden_layers: "256" sample_idx_position: 0 -cuda: True +cuda: False save_pred_real_na: True \ No newline at end of file diff --git 
a/project/config/single_dev_dataset/example/config.yaml b/project/config/single_dev_dataset/example/config.yaml index b7ed7f717..649d702b4 100644 --- a/project/config/single_dev_dataset/example/config.yaml +++ b/project/config/single_dev_dataset/example/config.yaml @@ -1,5 +1,6 @@ -config_split: config/single_dev_dataset/proteinGroups_N50/split.yaml -config_train: config/single_dev_dataset/proteinGroups_N50/train_{model}.yaml +# config for Snakefile_v1 +config_split: config/single_dev_dataset/example/split.yaml +config_train: config/single_dev_dataset/example/train_{model}.yaml folder_experiment: runs/example # folder_experiment: runs/dev_dataset_small/proteinGroups_N50 fn_rawfile_metadata: data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv @@ -14,10 +15,13 @@ NAGuideR_methods: - ROWMEDIAN # e1071 - KNN_IMPUTE # impute - SEQKNN # SeqKNN - - RF # missForest + - RF # missForest - ~7mins - IMPSEQ # rrcovNA - QRILC # imputeLCMD - SVDMETHOD # pcaMethods - MICE-NORM # mice - MLE # norm - - IRM # VIM + - IRM # VIM - ~9mins + - TRKNN + - MSIMPUTE_MNAR + - GSIMP # slow even on data with fewer features diff --git a/project/config/single_dev_dataset/example/split.yaml b/project/config/single_dev_dataset/example/split.yaml index 32672a3ba..46565d148 100644 --- a/project/config/single_dev_dataset/example/split.yaml +++ b/project/config/single_dev_dataset/example/split.yaml @@ -1,8 +1,9 @@ -FN_INTENSITIES: data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv +FN_INTENSITIES: data/dev_datasets/HeLa_6070/protein_groups_wide_N50_M454.csv fn_rawfile_metadata: data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv sample_completeness: 0.5 +feat_prevalence: 0.6 min_RT_time: 120 column_names: - Gene Names index_col: 0 -meta_date_col: Content Creation Date \ No newline at end of file +meta_date_col: Content Creation Date diff --git a/project/config/single_dev_dataset/example/train_CF.yaml b/project/config/single_dev_dataset/example/train_CF.yaml index 
30f22595c..751cd8ef9 100644 --- a/project/config/single_dev_dataset/example/train_CF.yaml +++ b/project/config/single_dev_dataset/example/train_CF.yaml @@ -1,8 +1,8 @@ -folder_experiment: runs/example +folder_experiment: runs/example file_format: csv latent_dim: 50 batch_size: 4096 -epochs_max: 20 +epochs_max: 3 sample_idx_position: 0 cuda: False -save_pred_real_na: True \ No newline at end of file +save_pred_real_na: True diff --git a/project/config/single_dev_dataset/example/train_DAE.yaml b/project/config/single_dev_dataset/example/train_DAE.yaml index d01114dad..6fa4e5e3e 100644 --- a/project/config/single_dev_dataset/example/train_DAE.yaml +++ b/project/config/single_dev_dataset/example/train_DAE.yaml @@ -1,8 +1,8 @@ file_format: csv latent_dim: 10 batch_size: 10 -epochs_max: 100 +epochs_max: 5 hidden_layers: "512" sample_idx_position: 0 cuda: False -save_pred_real_na: True \ No newline at end of file +save_pred_real_na: True diff --git a/project/config/single_dev_dataset/example/train_VAE.yaml b/project/config/single_dev_dataset/example/train_VAE.yaml index 993649f5d..72a4e6c52 100644 --- a/project/config/single_dev_dataset/example/train_VAE.yaml +++ b/project/config/single_dev_dataset/example/train_VAE.yaml @@ -1,10 +1,10 @@ # models_training: -folder_experiment: runs/example +folder_experiment: runs/example file_format: csv latent_dim: 25 batch_size: 10 -epochs_max: 50 +epochs_max: 5 hidden_layers: "512_256" sample_idx_position: 0 cuda: False -save_pred_real_na: True \ No newline at end of file +save_pred_real_na: True diff --git a/project/config/single_dev_dataset/mnar_mcar/evi_l.yaml b/project/config/single_dev_dataset/mnar_mcar/evi_l.yaml new file mode 100644 index 000000000..899cd1a59 --- /dev/null +++ b/project/config/single_dev_dataset/mnar_mcar/evi_l.yaml @@ -0,0 +1,76 @@ +# config for Snakefile_v2 +config_split: runs/mnar_mcar/pg_l_50MNAR/01_0_split_data.yaml # ! will be build +config_train: runs/mnar_mcar/pg_l_50MNAR/train_{model}.yaml # ! 
will be build, should say model_key next +folder_experiment: runs/mnar_mcar/pg_l_50MNAR +frac_mnar: 0.5 +fn_rawfile_metadata: data/dev_datasets/df_intensities_peptides_long/metadata.csv +file_format: csv +split_data: + FN_INTENSITIES: data/dev_datasets/df_intensities_evidence_long/Q_Exactive_HF_X_Orbitrap_6070.pkl + sample_completeness: 0.4 + feat_prevalence: 0.25 + min_RT_time: 120 + index_col: 0 + meta_date_col: Content Creation Date + column_names: null +models: + - Median: + model: Median # needs to set at least one parameter + - CF: + model: CF + file_format: csv + latent_dim: 50 + batch_size: 4096 + epochs_max: 30 + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - DAE: + model: DAE + file_format: csv + latent_dim: 50 + batch_size: 10 + epochs_max: 200 + hidden_layers: "512" + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - VAE: + model: VAE + file_format: csv + latent_dim: 25 + batch_size: 10 + epochs_max: 200 + hidden_layers: "512" + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - KNN: + model: KNN + neighbors: 3 + file_format: csv +NAGuideR_methods: + # - BPCA # > 24h, killed + - COLMEDIAN + # - GSIMP # > 24h, killed + - IMPSEQ + - IMPSEQROB + # - IRM # > 24h, killed + - KNN_IMPUTE + # - LLS # Error in svd(X): infinite or missing values in 'x' + # - MICE-CART # > 24h, killed + # - MICE-NORM # > 24h, killed + - MINDET + - MINIMUM + - MINPROB + - MLE + - MSIMPUTE + - MSIMPUTE_MNAR + - PI + - QRILC + # - RF # > 24h, killed + - ROWMEDIAN + # - SEQKNN # error + - SVDMETHOD + # - TRKNN # 24h, killed + - ZERO diff --git a/project/config/single_dev_dataset/mnar_mcar/evi_m.yaml b/project/config/single_dev_dataset/mnar_mcar/evi_m.yaml new file mode 100755 index 000000000..3de82f1dc --- /dev/null +++ b/project/config/single_dev_dataset/mnar_mcar/evi_m.yaml @@ -0,0 +1,78 @@ +# config for Snakefile_v2 +config_split: runs/mnar_mcar/pg_m_50MNAR/01_0_split_data.yaml # ! 
will be build +config_train: runs/mnar_mcar/pg_m_50MNAR/train_{model}.yaml # ! will be build, should say model_key next +folder_experiment: runs/mnar_mcar/pg_m_50MNAR +frac_mnar: 0.5 +fn_rawfile_metadata: data/dev_datasets/df_intensities_peptides_long/metadata.csv +file_format: csv +split_data: + FN_INTENSITIES: data/dev_datasets/df_intensities_evidence_long/Q_Exactive_HF_X_Orbitrap_6070.pkl + sample_completeness: 0.4 + feat_prevalence: 0.25 + select_N: 50 + min_RT_time: 120 + index_col: 0 + meta_date_col: Content Creation Date + column_names: null +models: + - Median: + model: Median # needs to set at least one parameter + - CF: + model: CF + file_format: csv + latent_dim: 50 + batch_size: 4096 + epochs_max: 30 + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - DAE: + model: DAE + file_format: csv + latent_dim: 25 + batch_size: 25 + patience: 50 + epochs_max: 200 + hidden_layers: "256" + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - VAE: + model: VAE + file_format: csv + latent_dim: 10 + batch_size: 25 + epochs_max: 200 + hidden_layers: "256" + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - KNN: + model: KNN + neighbors: 3 + file_format: csv +NAGuideR_methods: + - BPCA + - COLMEDIAN + # - GSIMP # > 24h, killed + - IMPSEQ + - IMPSEQROB + - IRM # error + - KNN_IMPUTE + # - LLS, # error + - MICE-CART # > 24h, killed + - MICE-NORM # > 24h, killed + - MINDET + - MINIMUM + - MINPROB + - MLE + - MSIMPUTE + - MSIMPUTE_MNAR + - PI + - QRILC + - RF # > 24h, killed + - ROWMEDIAN + - SEQKNN # error + - SVDMETHOD + - TRKNN # error + - ZERO diff --git a/project/config/single_dev_dataset/mnar_mcar/pep_l.yaml b/project/config/single_dev_dataset/mnar_mcar/pep_l.yaml new file mode 100644 index 000000000..5bbe75364 --- /dev/null +++ b/project/config/single_dev_dataset/mnar_mcar/pep_l.yaml @@ -0,0 +1,76 @@ +# config for Snakefile_v2 +config_split: runs/mnar_mcar/pep_l_50MNAR/01_0_split_data.yaml # ! 
will be build +config_train: runs/mnar_mcar/pep_l_50MNAR/train_{model}.yaml # ! will be build, should say model_key next +folder_experiment: runs/mnar_mcar/pep_l_50MNAR +frac_mnar: 0.5 +fn_rawfile_metadata: data/dev_datasets/df_intensities_peptides_long/metadata.csv +file_format: csv +split_data: + FN_INTENSITIES: data/dev_datasets/df_intensities_peptides_long/Q_Exactive_HF_X_Orbitrap_6070.pkl + sample_completeness: 0.4 + feat_prevalence: 0.25 + min_RT_time: 120 + index_col: 0 + meta_date_col: Content Creation Date + column_names: null +models: + - Median: + model: Median # needs to set at least one parameter + - CF: + model: CF + file_format: csv + latent_dim: 50 + batch_size: 4096 + epochs_max: 30 + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - DAE: + model: DAE + file_format: csv + latent_dim: 50 + batch_size: 10 + epochs_max: 200 + hidden_layers: "1024" + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - VAE: + model: VAE + file_format: csv + latent_dim: 10 + batch_size: 10 + epochs_max: 200 + hidden_layers: "512" + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - KNN: + model: KNN + neighbors: 3 + file_format: csv +NAGuideR_methods: + # - BPCA # > 24h, killed + - COLMEDIAN + # - GSIMP # > 24h, killed + - IMPSEQ + - IMPSEQROB + # - IRM # > 24h, killed + - KNN_IMPUTE + # - LLS # error + # - MICE-CART # > 24h, killed + # - MICE-NORM # > 24h, killed + - MINDET + - MINIMUM + - MINPROB + - MLE + - MSIMPUTE + - MSIMPUTE_MNAR + - PI + - QRILC + # - RF # > 24h, killed + - ROWMEDIAN + # - SEQKNN # Error in x[od, ismiss, drop = FALSE]: subscript out of bounds + - SVDMETHOD + # - TRKNN # > 24h, killed + - ZERO diff --git a/project/config/single_dev_dataset/mnar_mcar/pep_m.yaml b/project/config/single_dev_dataset/mnar_mcar/pep_m.yaml new file mode 100755 index 000000000..cfdd672fd --- /dev/null +++ b/project/config/single_dev_dataset/mnar_mcar/pep_m.yaml @@ -0,0 +1,78 @@ +# config for Snakefile_v2 +config_split: 
runs/mnar_mcar/pep_m_50MNAR/01_0_split_data.yaml # ! will be build +config_train: runs/mnar_mcar/pep_m_50MNAR/train_{model}.yaml # ! will be build, should say model_key next +folder_experiment: runs/mnar_mcar/pep_m_50MNAR +frac_mnar: 0.5 +fn_rawfile_metadata: data/dev_datasets/df_intensities_peptides_long/metadata.csv +file_format: csv +split_data: + FN_INTENSITIES: data/dev_datasets/df_intensities_peptides_long/Q_Exactive_HF_X_Orbitrap_6070.pkl + sample_completeness: 0.4 + feat_prevalence: 0.25 + select_N: 50 + min_RT_time: 120 + index_col: 0 + meta_date_col: Content Creation Date + column_names: null +models: + - Median: + model: Median # needs to set at least one parameter + - CF: + model: CF + file_format: csv + latent_dim: 50 + batch_size: 4096 + epochs_max: 30 + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - DAE: + model: DAE + file_format: csv + latent_dim: 75 + batch_size: 25 + patience: 50 + epochs_max: 200 + hidden_layers: "256_128" + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - VAE: + model: VAE + file_format: csv + latent_dim: 50 + batch_size: 25 + epochs_max: 200 + hidden_layers: "256" + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - KNN: + model: KNN + neighbors: 3 + file_format: csv +NAGuideR_methods: + - BPCA + - COLMEDIAN + # - GSIMP > 24h, killed + - IMPSEQ + - IMPSEQROB + - IRM + - KNN_IMPUTE + - LLS + - MICE-CART + - MICE-NORM + - MINDET + - MINIMUM + - MINPROB + - MLE + - MSIMPUTE + - MSIMPUTE_MNAR + - PI + - QRILC + - RF + - ROWMEDIAN + - SEQKNN + - SVDMETHOD + - TRKNN + - ZERO diff --git a/project/config/single_dev_dataset/mnar_mcar/pg_l.yaml b/project/config/single_dev_dataset/mnar_mcar/pg_l.yaml new file mode 100755 index 000000000..9cfb5307c --- /dev/null +++ b/project/config/single_dev_dataset/mnar_mcar/pg_l.yaml @@ -0,0 +1,73 @@ +# config for Snakefile_v2 +config_split: runs/mnar_mcar/pg_l_50MNAR/01_0_split_data.yaml # ! 
will be build +config_train: runs/mnar_mcar/pg_l_50MNAR/train_{model}.yaml # ! will be build, should say model_key next +folder_experiment: runs/mnar_mcar/pg_l_50MNAR +frac_mnar: 0.5 +fn_rawfile_metadata: data/dev_datasets/df_intensities_proteinGroups_long/metadata.csv +cuda: False +file_format: csv +split_data: + FN_INTENSITIES: data/dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070.pkl + sample_completeness: 0.4 + feat_prevalence: 0.25 + min_RT_time: 120 + index_col: 0 + meta_date_col: Content Creation Date +models: + - Median: + model: Median + - CF: # 2min + model: CF + latent_dim: 50 + batch_size: 32768 + epochs_max: 100 + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - DAE: # 2min + model: DAE + latent_dim: 25 + batch_size: 64 + epochs_max: 100 + hidden_layers: "512" + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - VAE: # 2min + model: VAE + latent_dim: 25 + batch_size: 64 + epochs_max: 50 + hidden_layers: "512" + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - KNN: + model: KNN + neighbors: 3 + file_format: csv +NAGuideR_methods: + - BPCA #6h41min + - COLMEDIAN + # - GSIMP # stopped after 24h + - IMPSEQ # 1min + - IMPSEQROB + - IRM # 7h52min + - KNN_IMPUTE + - LLS + # - MICE-CART # stopped after 24h + # - MICE-NORM # stopped after 24h + - MINDET + - MINIMUM + - MINPROB + - MLE + - MSIMPUTE + - MSIMPUTE_MNAR + - PI + - QRILC + - RF # 58min + - ROWMEDIAN + # - SEQKNN # Error in x[od, ismiss, drop = FALSE]: subscript out of bounds + - SVDMETHOD # 16min + - TRKNN # 5h38min + - ZERO diff --git a/project/config/single_dev_dataset/mnar_mcar/pg_m.yaml b/project/config/single_dev_dataset/mnar_mcar/pg_m.yaml new file mode 100644 index 000000000..b484276c5 --- /dev/null +++ b/project/config/single_dev_dataset/mnar_mcar/pg_m.yaml @@ -0,0 +1,78 @@ +# config for Snakefile_v2 +config_split: runs/mnar_mcar/pg_m_50MNAR/01_0_split_data.yaml # ! 
will be build +config_train: runs/mnar_mcar/pg_m_50MNAR/train_{model}.yaml # ! will be build, should say model_key next +folder_experiment: runs/mnar_mcar/pg_m_50MNAR +frac_mnar: 0.5 +fn_rawfile_metadata: data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv +file_format: csv +split_data: + FN_INTENSITIES: data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv + sample_completeness: 0.4 + feat_prevalence: 0.25 + min_RT_time: 120 + column_names: + - Gene Names + index_col: 0 + meta_date_col: Content Creation Date +models: + - Median: + model: Median # needs to set at least one parameter + - CF: + model: CF + file_format: csv + latent_dim: 50 + batch_size: 4096 + epochs_max: 20 + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - DAE: + model: DAE + file_format: csv + latent_dim: 10 + batch_size: 25 + patience: 50 + epochs_max: 200 + hidden_layers: "512" + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - VAE: + model: VAE + file_format: csv + latent_dim: 25 + batch_size: 25 + epochs_max: 200 + hidden_layers: "512_256" + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - KNN: + model: KNN + neighbors: 3 + file_format: csv +NAGuideR_methods: + - BPCA + - COLMEDIAN + - GSIMP + - IMPSEQ + - IMPSEQROB + - IRM + - KNN_IMPUTE + - LLS + - MICE-CART + - MICE-NORM + - MINDET + - MINIMUM + - MINPROB + - MLE + - MSIMPUTE + - MSIMPUTE_MNAR + - PI + - QRILC + - RF + - ROWMEDIAN + - SEQKNN + - SVDMETHOD + - TRKNN + - ZERO diff --git a/project/config/single_dev_dataset/peptides/config.yaml b/project/config/single_dev_dataset/peptides/config.yaml index 6a81ae3b5..102e77741 100644 --- a/project/config/single_dev_dataset/peptides/config.yaml +++ b/project/config/single_dev_dataset/peptides/config.yaml @@ -1,32 +1,35 @@ +# config for Snakefile_v1 config_split: config/single_dev_dataset/peptides/split.yaml config_train: config/single_dev_dataset/peptides/train_{model}.yaml folder_experiment: runs/dev_dataset_large/peptides 
fn_rawfile_metadata: data/dev_datasets/df_intensities_peptides_long/metadata.csv +cuda: False models: - - Median # ~ 1:20 min - - CF # ~ 12:07 min - - DAE # 4:36 mins - - VAE # ~ 3:32 min - - KNN # ~ 2:20 min + - Median # ~ 1:20 min + - CF # ~ 12:07 min + - DAE # 4:36 mins + - VAE # ~ 3:32 min + - KNN # ~ 2:20 min NAGuideR_methods: - - ZERO - - MINIMUM - - COLMEDIAN - - ROWMEDIAN - - KNN_IMPUTE - # - SEQKNN # error - # - BPCA # stopped after 30mins - # - SVDMETHOD # stopped after 30mins - # - LLS # stopped after 30mins - # - MLE # error - - QRILC - - MINDET - - MINPROB - # - IRM # stopped after 30mins - # - IMPSEQ # error - # - IMPSEQROB # error - # - MICE-NORM # stopped after 30mins - # - MICE-CART # stopped after 30mins - # - TRKNN # error - # - RF # stopped after 30mins - - PI + # - BPCA # stopped after 24h + - COLMEDIAN + - IMPSEQ + - IMPSEQROB + # - IRM # stopped after 24h + - KNN_IMPUTE + # - LLS # error + # - MICE-CART # stopped after 24h + # - MICE-NORM # stopped after 24h + - MINDET + - MINIMUM + - MINPROB + - MLE + - MSIMPUTE + - PI + - QRILC + # - RF # stopped after 24h + - ROWMEDIAN + # - SEQKNN # error: Error in x[od, ismiss, drop = FALSE]: subscript out of bounds + - SVDMETHOD + # - TRKNN # stopped after 24h + - ZERO diff --git a/project/config/single_dev_dataset/peptides/split.yaml b/project/config/single_dev_dataset/peptides/split.yaml index 732957cb3..5c5c53d38 100644 --- a/project/config/single_dev_dataset/peptides/split.yaml +++ b/project/config/single_dev_dataset/peptides/split.yaml @@ -2,4 +2,6 @@ FN_INTENSITIES: data/dev_datasets/df_intensities_peptides_long/Q_Exactive_HF_X_O fn_rawfile_metadata: data/dev_datasets/df_intensities_peptides_long/metadata.csv sample_completeness: 0.5 min_RT_time: 120 -column_names: null \ No newline at end of file +column_names: null +frac_mnar: 0.25 +frac_non_train: 0.1 diff --git a/project/config/single_dev_dataset/peptides/train_CF.yaml b/project/config/single_dev_dataset/peptides/train_CF.yaml index 
9324564b6..2068cc158 100644 --- a/project/config/single_dev_dataset/peptides/train_CF.yaml +++ b/project/config/single_dev_dataset/peptides/train_CF.yaml @@ -3,5 +3,5 @@ latent_dim: 50 batch_size: 32768 epochs_max: 100 sample_idx_position: 0 -cuda: True +cuda: False save_pred_real_na: True \ No newline at end of file diff --git a/project/config/single_dev_dataset/peptides/train_DAE.yaml b/project/config/single_dev_dataset/peptides/train_DAE.yaml index 8f94a9b07..aa1e0bd06 100644 --- a/project/config/single_dev_dataset/peptides/train_DAE.yaml +++ b/project/config/single_dev_dataset/peptides/train_DAE.yaml @@ -4,5 +4,5 @@ batch_size: 64 epochs_max: 100 hidden_layers: "1024" sample_idx_position: 0 -cuda: True +cuda: False save_pred_real_na: True \ No newline at end of file diff --git a/project/config/single_dev_dataset/peptides/train_VAE.yaml b/project/config/single_dev_dataset/peptides/train_VAE.yaml index 462cf3e3a..dd3a9cca6 100644 --- a/project/config/single_dev_dataset/peptides/train_VAE.yaml +++ b/project/config/single_dev_dataset/peptides/train_VAE.yaml @@ -4,5 +4,5 @@ batch_size: 64 epochs_max: 50 hidden_layers: "512" sample_idx_position: 0 -cuda: True +cuda: False save_pred_real_na: True \ No newline at end of file diff --git a/project/config/single_dev_dataset/peptides_N50/config.yaml b/project/config/single_dev_dataset/peptides_N50/config.yaml index 0f12e6b0c..91888760c 100644 --- a/project/config/single_dev_dataset/peptides_N50/config.yaml +++ b/project/config/single_dev_dataset/peptides_N50/config.yaml @@ -1,3 +1,4 @@ +# config for Snakefile_v1 config_split: config/single_dev_dataset/peptides_N50/split.yaml config_train: config/single_dev_dataset/peptides_N50/train_{model}.yaml folder_experiment: runs/dev_dataset_small/peptides_N50 @@ -6,29 +7,30 @@ cuda: False file_format: csv models: - Median # maps to median on file system? 
- - CF - - DAE - - VAE + - CF # 3min + - DAE # 3min + - VAE # 5min - KNN NAGuideR_methods: - - ZERO - - MINIMUM - - COLMEDIAN - - ROWMEDIAN - - KNN_IMPUTE - # - SEQKNN # error - - BPCA - - SVDMETHOD - # - LLS # stopped after 30mins - - MLE - - QRILC # fails for N=10 - - MINDET - - MINPROB - # - IRM # stopped after 30mins - # - IMPSEQ # error - - IMPSEQROB - - MICE-NORM - # - MICE-CART # stopped after 30mins - # - TRKNN # error - # - RF # stopped after 30mins - - PI \ No newline at end of file + - BPCA #27min + - COLMEDIAN + - IMPSEQ + - IMPSEQROB + - IRM # 8h55min + - KNN_IMPUTE + - LLS # 10h21min + - MICE-CART # 2h20min + - MICE-NORM # 4h25min + - MINDET + - MINIMUM + - MINPROB + - MLE + - MSIMPUTE + - PI + - QRILC + - RF # 5h38min + - ROWMEDIAN + - SEQKNN # 17min + - SVDMETHOD + - TRKNN #4h15min + - ZERO \ No newline at end of file diff --git a/project/config/single_dev_dataset/peptides_N50/split.yaml b/project/config/single_dev_dataset/peptides_N50/split.yaml index 901cc2a96..b6f68b86c 100644 --- a/project/config/single_dev_dataset/peptides_N50/split.yaml +++ b/project/config/single_dev_dataset/peptides_N50/split.yaml @@ -4,4 +4,6 @@ min_RT_time: 120 select_N: 50 column_names: null index_col: 0 -meta_date_col: Content Creation Date \ No newline at end of file +meta_date_col: Content Creation Date +frac_mnar: 0.25 +frac_non_train: 0.1 diff --git a/project/config/single_dev_dataset/peptides_N50/train_CF.yaml b/project/config/single_dev_dataset/peptides_N50/train_CF.yaml index 2ca3c5ac1..802add8cd 100644 --- a/project/config/single_dev_dataset/peptides_N50/train_CF.yaml +++ b/project/config/single_dev_dataset/peptides_N50/train_CF.yaml @@ -4,5 +4,5 @@ latent_dim: 50 batch_size: 4096 epochs_max: 30 sample_idx_position: 0 -cuda: True +cuda: False save_pred_real_na: True \ No newline at end of file diff --git a/project/config/single_dev_dataset/peptides_N50/train_DAE.yaml b/project/config/single_dev_dataset/peptides_N50/train_DAE.yaml index c337dcf74..627750d8b 100644 
--- a/project/config/single_dev_dataset/peptides_N50/train_DAE.yaml +++ b/project/config/single_dev_dataset/peptides_N50/train_DAE.yaml @@ -1,8 +1,8 @@ file_format: csv -latent_dim: 10 +latent_dim: 75 batch_size: 10 epochs_max: 100 -hidden_layers: "512" +hidden_layers: "256_128" sample_idx_position: 0 -cuda: True +cuda: False save_pred_real_na: True \ No newline at end of file diff --git a/project/config/single_dev_dataset/peptides_N50/train_VAE.yaml b/project/config/single_dev_dataset/peptides_N50/train_VAE.yaml index 09d9c3ff7..17b98c7fe 100644 --- a/project/config/single_dev_dataset/peptides_N50/train_VAE.yaml +++ b/project/config/single_dev_dataset/peptides_N50/train_VAE.yaml @@ -1,10 +1,10 @@ # models_training: -folder_experiment: runs/example +folder_experiment: runs/example file_format: csv -latent_dim: 25 +latent_dim: 50 batch_size: 10 epochs_max: 100 -hidden_layers: "512_256" +hidden_layers: "256" sample_idx_position: 0 -cuda: True +cuda: False save_pred_real_na: True \ No newline at end of file diff --git a/project/config/single_dev_dataset/proteinGroups/config.yaml b/project/config/single_dev_dataset/proteinGroups/config.yaml index 3f470f6e6..6b9baadf7 100644 --- a/project/config/single_dev_dataset/proteinGroups/config.yaml +++ b/project/config/single_dev_dataset/proteinGroups/config.yaml @@ -1,33 +1,35 @@ +# config for Snakefile_v1 config_split: config/single_dev_dataset/proteinGroups/split.yaml config_train: config/single_dev_dataset/proteinGroups/train_{model}.yaml folder_experiment: runs/dev_dataset_large/proteinGroups fn_rawfile_metadata: data/dev_datasets/df_intensities_proteinGroups_long/metadata.csv -cuda: True +cuda: False models: - Median - - CF - - DAE - - VAE + - CF # 2min + - DAE # 2min + - VAE # 2min - KNN NAGuideR_methods: - - ZERO - - MINIMUM + - BPCA #6h41min - COLMEDIAN - - ROWMEDIAN + - IMPSEQ # 1min + - IMPSEQROB + - IRM # 7h52min - KNN_IMPUTE - # - SEQKNN # error - # - BPCA # stopped after 30mins - - SVDMETHOD # stopped after 30mins 
- LLS - # - MLE # error - - QRILC + # - MICE-CART # stopped after 24h + # - MICE-NORM # stopped after 24h - MINDET + - MINIMUM - MINPROB - # - IRM # stopped after 30mins - # - IMPSEQ # error - # - IMPSEQROB # error - # - MICE-NORM # stopped after 30mins - # - MICE-CART # stopped after 30mins - # - TRKNN # error - - RF - - PI \ No newline at end of file + - MLE + - MSIMPUTE + - PI + - QRILC + - RF # 58min + - ROWMEDIAN + # - SEQKNN # Error in x[od, ismiss, drop = FALSE]: subscript out of bounds + - SVDMETHOD # 16min + - TRKNN # 5h38min + - ZERO \ No newline at end of file diff --git a/project/config/single_dev_dataset/proteinGroups/split.yaml b/project/config/single_dev_dataset/proteinGroups/split.yaml index 675f1b785..414b9a887 100644 --- a/project/config/single_dev_dataset/proteinGroups/split.yaml +++ b/project/config/single_dev_dataset/proteinGroups/split.yaml @@ -1,3 +1,5 @@ FN_INTENSITIES: data/dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070.pkl sample_completeness: 0.5 -min_RT_time: 120 \ No newline at end of file +min_RT_time: 120 +frac_mnar: 0.25 +frac_non_train: 0.1 diff --git a/project/config/single_dev_dataset/proteinGroups/train_CF.yaml b/project/config/single_dev_dataset/proteinGroups/train_CF.yaml index f879b0f13..2068cc158 100644 --- a/project/config/single_dev_dataset/proteinGroups/train_CF.yaml +++ b/project/config/single_dev_dataset/proteinGroups/train_CF.yaml @@ -1,8 +1,7 @@ file_format: csv -fn_rawfile_metadata: data/dev_datasets/df_intensities_proteinGroups_long/metadata.csv latent_dim: 50 batch_size: 32768 epochs_max: 100 sample_idx_position: 0 -cuda: True +cuda: False save_pred_real_na: True \ No newline at end of file diff --git a/project/config/single_dev_dataset/proteinGroups/train_DAE.yaml b/project/config/single_dev_dataset/proteinGroups/train_DAE.yaml index 86ac35fb9..b3a22c8bb 100644 --- a/project/config/single_dev_dataset/proteinGroups/train_DAE.yaml +++ 
b/project/config/single_dev_dataset/proteinGroups/train_DAE.yaml @@ -1,9 +1,8 @@ file_format: csv -fn_rawfile_metadata: data/dev_datasets/df_intensities_proteinGroups_long/metadata.csv latent_dim: 25 batch_size: 64 epochs_max: 100 hidden_layers: "512" sample_idx_position: 0 -cuda: True +cuda: False save_pred_real_na: True \ No newline at end of file diff --git a/project/config/single_dev_dataset/proteinGroups/train_KNN.yaml b/project/config/single_dev_dataset/proteinGroups/train_KNN.yaml index baa99b732..2d056f335 100644 --- a/project/config/single_dev_dataset/proteinGroups/train_KNN.yaml +++ b/project/config/single_dev_dataset/proteinGroups/train_KNN.yaml @@ -1,3 +1,2 @@ neighbors: 3 -file_format: csv -fn_rawfile_metadata: data/dev_datasets/df_intensities_proteinGroups_long/metadata.csv \ No newline at end of file +file_format: csv \ No newline at end of file diff --git a/project/config/single_dev_dataset/proteinGroups/train_Median.yaml b/project/config/single_dev_dataset/proteinGroups/train_Median.yaml index 340efcd69..745cca2c5 100644 --- a/project/config/single_dev_dataset/proteinGroups/train_Median.yaml +++ b/project/config/single_dev_dataset/proteinGroups/train_Median.yaml @@ -1,2 +1 @@ -file_format: csv -fn_rawfile_metadata: data/dev_datasets/df_intensities_proteinGroups_long/metadata.csv \ No newline at end of file +file_format: csv \ No newline at end of file diff --git a/project/config/single_dev_dataset/proteinGroups/train_VAE.yaml b/project/config/single_dev_dataset/proteinGroups/train_VAE.yaml index 7d3f29462..7caad9dab 100644 --- a/project/config/single_dev_dataset/proteinGroups/train_VAE.yaml +++ b/project/config/single_dev_dataset/proteinGroups/train_VAE.yaml @@ -1,10 +1,9 @@ # models_training: file_format: csv -fn_rawfile_metadata: data/dev_datasets/df_intensities_proteinGroups_long/metadata.csv latent_dim: 25 batch_size: 64 epochs_max: 50 hidden_layers: "512" sample_idx_position: 0 -cuda: True +cuda: False save_pred_real_na: True \ No newline at 
end of file diff --git a/project/config/single_dev_dataset/proteinGroups_N50/config.yaml b/project/config/single_dev_dataset/proteinGroups_N50/config.yaml index 004c4b5aa..6c8335674 100644 --- a/project/config/single_dev_dataset/proteinGroups_N50/config.yaml +++ b/project/config/single_dev_dataset/proteinGroups_N50/config.yaml @@ -1,3 +1,4 @@ +# config for Snakefile_v1 config_split: config/single_dev_dataset/proteinGroups_N50/split.yaml config_train: config/single_dev_dataset/proteinGroups_N50/train_{model}.yaml folder_experiment: runs/dev_dataset_small/proteinGroups_N50 @@ -10,24 +11,25 @@ models: - VAE - KNN NAGuideR_methods: - - ZERO - - MINIMUM - - COLMEDIAN - - ROWMEDIAN - - KNN_IMPUTE - - SEQKNN - - BPCA - - SVDMETHOD - - LLS - - MLE - - QRILC - - MINDET - - MINPROB - - IRM # stopped after 30mins - - IMPSEQ # error - - IMPSEQROB - - MICE-NORM - - MICE-CART # stopped after 30mins - # - TRKNN # error - - RF - - PI + - BPCA + - COLMEDIAN + - IMPSEQ + - IMPSEQROB + - IRM + - KNN_IMPUTE + - LLS + - MICE-CART + - MICE-NORM + - MINDET + - MINIMUM + - MINPROB + - MLE + - MSIMPUTE + - PI + - QRILC + - RF + - ROWMEDIAN + - SEQKNN + - SVDMETHOD + - TRKNN + - ZERO diff --git a/project/config/single_dev_dataset/proteinGroups_N50/config_v2.yaml b/project/config/single_dev_dataset/proteinGroups_N50/config_v2.yaml new file mode 100644 index 000000000..51b501a3b --- /dev/null +++ b/project/config/single_dev_dataset/proteinGroups_N50/config_v2.yaml @@ -0,0 +1,74 @@ +# config for Snakefile_v2 +config_split: config/single_dev_dataset/proteinGroups_N50/split.yaml # ! will be build +config_train: config/single_dev_dataset/proteinGroups_N50/train_{model}.yaml # ! 
will be build +folder_experiment: runs/dev_dataset_small/proteinGroups_N50_Snakefile_v2 +fn_rawfile_metadata: data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv +file_format: csv +split_data: + FN_INTENSITIES: data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv + fn_rawfile_metadata: data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv + sample_completeness: 0.5 + min_RT_time: 120 + column_names: + - Gene Names + index_col: 0 + meta_date_col: Content Creation Date +models: + - Median: + model: Median # needs to set at least one parameter + - CF: + model: CF + file_format: csv + latent_dim: 50 + batch_size: 4096 + epochs_max: 20 + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - DAE: + model: DAE + file_format: csv + latent_dim: 10 + batch_size: 10 + epochs_max: 200 + hidden_layers: "512" + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - VAE: + model: VAE + file_format: csv + latent_dim: 25 + batch_size: 10 + epochs_max: 200 + hidden_layers: "512_256" + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - KNN: + model: KNN + neighbors: 3 + file_format: csv +NAGuideR_methods: + - BPCA + - COLMEDIAN + - IMPSEQ + - IMPSEQROB + - IRM + - KNN_IMPUTE + - LLS + - MICE-CART + - MICE-NORM + - MINDET + - MINIMUM + - MINPROB + - MLE + - MSIMPUTE + - PI + - QRILC + - RF + - ROWMEDIAN + - SEQKNN + - SVDMETHOD + - TRKNN + - ZERO diff --git a/project/config/single_dev_dataset/proteinGroups_N50/split.yaml b/project/config/single_dev_dataset/proteinGroups_N50/split.yaml index 32672a3ba..10287480f 100644 --- a/project/config/single_dev_dataset/proteinGroups_N50/split.yaml +++ b/project/config/single_dev_dataset/proteinGroups_N50/split.yaml @@ -1,8 +1,11 @@ +# Build in Snakemake workflow FN_INTENSITIES: data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv fn_rawfile_metadata: data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv sample_completeness: 0.5 min_RT_time: 120 column_names: - - Gene Names +- Gene 
Names index_col: 0 -meta_date_col: Content Creation Date \ No newline at end of file +meta_date_col: Content Creation Date +frac_mnar: 0.25 +frac_non_train: 0.1 diff --git a/project/config/single_dev_dataset/proteinGroups_N50/train_CF.yaml b/project/config/single_dev_dataset/proteinGroups_N50/train_CF.yaml index 30f22595c..0b18de049 100644 --- a/project/config/single_dev_dataset/proteinGroups_N50/train_CF.yaml +++ b/project/config/single_dev_dataset/proteinGroups_N50/train_CF.yaml @@ -1,8 +1,9 @@ -folder_experiment: runs/example +# Build in Snakemake workflow +model: CF file_format: csv latent_dim: 50 batch_size: 4096 epochs_max: 20 sample_idx_position: 0 -cuda: False -save_pred_real_na: True \ No newline at end of file +cuda: false +save_pred_real_na: true diff --git a/project/config/single_dev_dataset/proteinGroups_N50/train_DAE.yaml b/project/config/single_dev_dataset/proteinGroups_N50/train_DAE.yaml index 7fe65c7ca..f3c6d66b0 100644 --- a/project/config/single_dev_dataset/proteinGroups_N50/train_DAE.yaml +++ b/project/config/single_dev_dataset/proteinGroups_N50/train_DAE.yaml @@ -1,8 +1,10 @@ +# Build in Snakemake workflow +model: DAE file_format: csv latent_dim: 10 batch_size: 10 epochs_max: 200 -hidden_layers: "512" +hidden_layers: '512' sample_idx_position: 0 -cuda: False -save_pred_real_na: True \ No newline at end of file +cuda: false +save_pred_real_na: true diff --git a/project/config/single_dev_dataset/proteinGroups_N50/train_KNN.yaml b/project/config/single_dev_dataset/proteinGroups_N50/train_KNN.yaml index 2d056f335..f5a6dd36e 100644 --- a/project/config/single_dev_dataset/proteinGroups_N50/train_KNN.yaml +++ b/project/config/single_dev_dataset/proteinGroups_N50/train_KNN.yaml @@ -1,2 +1,4 @@ +# Build in Snakemake workflow +model: KNN neighbors: 3 -file_format: csv \ No newline at end of file +file_format: csv diff --git a/project/config/single_dev_dataset/proteinGroups_N50/train_Median.yaml 
b/project/config/single_dev_dataset/proteinGroups_N50/train_Median.yaml index 745cca2c5..61c18b9d7 100644 --- a/project/config/single_dev_dataset/proteinGroups_N50/train_Median.yaml +++ b/project/config/single_dev_dataset/proteinGroups_N50/train_Median.yaml @@ -1 +1,2 @@ -file_format: csv \ No newline at end of file +# Build in Snakemake workflow +model: Median diff --git a/project/config/single_dev_dataset/proteinGroups_N50/train_RSN.yaml b/project/config/single_dev_dataset/proteinGroups_N50/train_RSN.yaml deleted file mode 100644 index dba9e23d8..000000000 --- a/project/config/single_dev_dataset/proteinGroups_N50/train_RSN.yaml +++ /dev/null @@ -1 +0,0 @@ -axis: 0 # per protein groups RSN \ No newline at end of file diff --git a/project/config/single_dev_dataset/proteinGroups_N50/train_VAE.yaml b/project/config/single_dev_dataset/proteinGroups_N50/train_VAE.yaml index bda8a795e..48ed2025d 100644 --- a/project/config/single_dev_dataset/proteinGroups_N50/train_VAE.yaml +++ b/project/config/single_dev_dataset/proteinGroups_N50/train_VAE.yaml @@ -1,10 +1,10 @@ -# models_training: -folder_experiment: runs/example +# Build in Snakemake workflow +model: VAE file_format: csv latent_dim: 25 batch_size: 10 epochs_max: 200 -hidden_layers: "512_256" +hidden_layers: '512_256' sample_idx_position: 0 -cuda: False -save_pred_real_na: True \ No newline at end of file +cuda: false +save_pred_real_na: true diff --git a/project/config/training_data/evidence.py b/project/config/training_data/evidence.py deleted file mode 100644 index e3651f867..000000000 --- a/project/config/training_data/evidence.py +++ /dev/null @@ -1,18 +0,0 @@ -from ..counter_fpaths import FNAME_C_EVIDENCE -from vaep.io import data_objects - -NAME = 'evidence' -BASE_NAME = f"df_intensities_{NAME}_long" - -TYPES_DUMP = {'Sample ID': 'category', - 'Sequence': 'category', - 'Charge': 'category',} - -TYPES_COUNT = {'Charge': int} - -IDX_COLS_LONG = ['Sample ID', 'Sequence', 'Charge'] # in order - -LOAD_DUMP = 
data_objects.load_evidence_dump - -CounterClass = data_objects.EvidenceCounter -FNAME_COUNTER = FNAME_C_EVIDENCE \ No newline at end of file diff --git a/project/config/training_data/peptides.py b/project/config/training_data/peptides.py deleted file mode 100644 index c4e8075a3..000000000 --- a/project/config/training_data/peptides.py +++ /dev/null @@ -1,19 +0,0 @@ -from ..counter_fpaths import FNAME_C_PEPTIDES -from vaep.io import data_objects - -NAME = 'peptides' -BASE_NAME = f"df_intensities_{NAME}_long" - -TYPES_DUMP = {'Sample ID': 'category', - 'Sequence': 'category', - } - -TYPES_COUNT = {} - -IDX_COLS_LONG = ['Sample ID', 'Sequence'] # in order - -LOAD_DUMP = data_objects.load_agg_peptide_dump - -CounterClass = data_objects.PeptideCounter -FNAME_COUNTER = FNAME_C_PEPTIDES - diff --git a/project/config/training_data/proteinGroups.py b/project/config/training_data/proteinGroups.py deleted file mode 100644 index 4158623cf..000000000 --- a/project/config/training_data/proteinGroups.py +++ /dev/null @@ -1,20 +0,0 @@ -from ..counter_fpaths import FNAME_C_GENES # use genes as identifier between samples -from vaep.io import data_objects - -NAME = 'proteinGroups' - -BASE_NAME = f"df_intensities_{NAME}_long" - -TYPES_DUMP = {'Sample ID': 'category', - 'Gene names': 'category', - } - -TYPES_COUNT = {} - -IDX_COLS_LONG = ['Sample ID', 'Gene names'] # in order - -LOAD_DUMP = data_objects.pg_idx_gene_fct - -CounterClass = data_objects.GeneCounter - -FNAME_COUNTER = FNAME_C_GENES diff --git a/project/data/dev_datasets/HeLa_6070/protein_groups_wide_N50_M227.csv b/project/data/dev_datasets/HeLa_6070/protein_groups_wide_N50_M227.csv new file mode 100644 index 000000000..3f9a79c48 --- /dev/null +++ b/project/data/dev_datasets/HeLa_6070/protein_groups_wide_N50_M227.csv @@ -0,0 +1,51 @@ +Sample 
ID,NOP16,PTGES,WAC,CRNKL1,RPL18A,LPGAT1,PTGES3L-AARSD1;AARSD1,PPP1R2;PPP1R2P3,TOE1,FRG1,ALDH1A3,RPL36A;RPL36A-HNRNPH2,FBXO22,YIF1A,CEP55,DNAJC11,RARS2,QKI,DHX9,AGK,LRRC40,LIMD1,ALDOA,SOAT1,DIAPH1,CTH,SPRYD4,CD3EAP,NUP37,ARMC1,ATP2A2,EPB41L2,ATIC,CHORDC1,IQGAP1,ERMP1,LIMA1,C8orf82,COPG1,MDH2,C9orf78,TMA16,RPS10,H3F3B;H3F3A,XPA,MDC1,SORBS2,RCC2,RPL7A,DYNC1H1,LASP1,BOP1,ARMCX3,S100A10,NUP98,MRFAP1,BCAS2,NUDCD3,APEX1,S100A13,TEX10,RPS5,C15orf40,PIGK,COL14A1,EGFR,GGCT,ATXN10,SARS,KIF22,RABEPK,COG5,PGRMC2,RPS14,SDCBP,ZNF622,ST13;ST13P5;ST13P4,COMMD4,GPAA1,DNAJC10,RAD21,POLE4,AHSA1,SNAP29,HIST1H4A,POLR3E,NUDT9,MRPL3,DIMT1,MXRA7,UFL1,MARCH5,PBDC1,UBE3A,RPL29,SUMO2,PPIH,TIMELESS,ACOT7,TAMM41,SLC25A12,CNN3,PSMB5,SERPINB6,RPL27,MMTAG2,NOL6,RAD23B,LAMP1,ITPRIP,TCEB1,SLC35A4,COPE,PSMC6,NDUFA6,SSFA2,CEP170,HNRNPU,TP53RK,ZNF593,GAPDH,PPP2CB,PPP1CB,USP5,PON2,TNPO2,C2orf49,EWSR1,IDI1,NCAPD2,DNAAF5,SLC16A1,GTF3C3,NUCB2;Nucb2,IDH2,UBLCP1,INCENP,RAP2C,SCO2,SUN1,TPX2,LARS2,BZW1,MRPS36,OCIAD2,ATAD2,DPH2,CHMP1A,MYL6B,PITHD1,TMX1,TRMT61A,TKT,RRP36,APRT,FAHD1,CARKD,NCKAP1,NLN,MAPKAPK2,WDR55,SEC23A,EDC4,SLC7A1,LYRM2,CHTOP,XAB2,METTL5,CCDC59,EXOSC9,FERMT2,DHCR7,DCTPP1,HMGN4,FTO,RPAP3,URB2,VPS35,CDC73,DHX37,NARS,STX12,DCTN5,ZNF512,U2AF1,RPL3,RPAP1,FH,FKBP9,EIF2B5,ESF1,FHOD1,H2AFX;HIST1H2AA,TIMMDC1,PPP2R2A,BST2,NDUFS4,SAE1,MED1,MYCBP2,DIAPH3,EIF3J,AAMP,GCLC,PQBP1,CSTF3,NACC1,PARP4,UNC13D,ADSL,RPS17,ECHS1,LSM2,GPX4,OTUD6B,SEL1L,MACROD1,RRAS2,ACTR2,TOMM34,RAB5A,ROR2,XPOT,KPNA2,SBNO1,AUP1,TMSB4X 
+2019_12_18_14_35_Q-Exactive-HF-X-Orbitrap_6070,316880000.0,218380000.0,42243000.0,137700000.0,3703500000.0,,274590000.0,165980000.0,123170000.0,47520000.0,,,232390000.0,,26628000.0,106600000.0,24513000.0,131740000.0,12530000000.0,283670000.0,245040000.0,66667000.0,26334000000.0,1810500000.0,1604400000.0,57433000.0,44031000.0,105410000.0,262160000.0,106410000.0,3306700000.0,1352400000.0,4352000000.0,3007000000.0,8299800000.0,69639000.0,630990000.0,11403000.0,2104700000.0,8193700000.0,397820000.0,148580000.0,6289900000.0,,13925000.0,185210000.0,7592400.0,1554800000.0,9229200000.0,9173700000.0,2002500000.0,516460000.0,,621120000.0,954820000.0,92938000.0,496410000.0,149270000.0,3389000000.0,542700000.0,505810000.0,9974800000.0,27364000.0,25643000.0,,241660000.0,692400000.0,180070000.0,807330000.0,90161000.0,154720000.0,94128000.0,410570000.0,4921000000.0,67315000.0,105230000.0,,71748000.0,23842000.0,,599160000.0,6877800.0,3479000000.0,37935000.0,42397000000.0,8037700.0,143990000.0,119560000.0,516040000.0,62578000.0,267020000.0,100640000.0,,54243000.0,1169800000.0,,435010000.0,156280000.0,549790000.0,32705000.0,51633000.0,,1352400000.0,1526700000.0,4853100000.0,61582000.0,354990000.0,1699100000.0,1156500000.0,44309000.0,2216200000.0,92837000.0,983020000.0,2266800000.0,176340000.0,,313480000.0,15967000000.0,128230000.0,25706000.0,103290000000.0,948890000.0,554960000.0,1626600000.0,283590000.0,26645000.0,25338000.0,462160000.0,273240000.0,1480000000.0,664960000.0,781540000.0,23490000.0,,53830000.0,65577000.0,80455000.0,162560000.0,97655000.0,104960000.0,507360000.0,121700000.0,1116800000.0,49590000.0,,546130000.0,93225000.0,86827000.0,,12408000.0,611940000.0,37012000.0,30803000000.0,,530090000.0,65414000.0,,467730000.0,521550000.0,92944000.0,38427000.0,520620000.0,1008500000.0,104190000.0,,101260000.0,212840000.0,29694000.0,52581000.0,97713000.0,282840000.0,179040000.0,495650000.0,,359740000.0,570490000.0,301760000.0,3164500000.0,748840000.0,185940000.0,1937300000.0,10101
0000.0,56744000.0,75991000.0,1384700000.0,13244000000.0,13171000.0,3381700000.0,278260000.0,170400000.0,243010000.0,77605000.0,,11287000.0,855450000.0,117570000.0,,600020000.0,63279000.0,,211540000.0,2394800000.0,114990000.0,630570000.0,289960000.0,209520000.0,115240000.0,28610000.0,18353000.0,794340000.0,6689500000.0,1297900000.0,263830000.0,30534000.0,86873000.0,18851000.0,306580000.0,88299000.0,819850000.0,867670000.0,76409000.0,10751000.0,1452000000.0,3109100000.0,306800000.0,242540000.0, +2019_12_19_19_48_Q-Exactive-HF-X-Orbitrap_6070,87856000.0,277430000.0,31772000.0,122270000.0,3820000000.0,,138890000.0,352970000.0,102380000.0,81775000.0,,891420000.0,178870000.0,,33813000.0,257470000.0,,115700000.0,9545100000.0,93645000.0,113490000.0,66898000.0,25799000000.0,1606900000.0,1322200000.0,101360000.0,29349000.0,73541000.0,333110000.0,92340000.0,2696300000.0,1128400000.0,3031100000.0,2850700000.0,6717200000.0,46996000.0,530430000.0,,970470000.0,11414000000.0,300800000.0,69004000.0,6116000000.0,,,105720000.0,,2091900000.0,7805400000.0,6541300000.0,1276000000.0,315520000.0,,1089600000.0,610000000.0,67607000.0,444670000.0,66830000.0,1981600000.0,304580000.0,320570000.0,10190000000.0,70964000.0,28319000.0,,336240000.0,532870000.0,271060000.0,835560000.0,40160000.0,158380000.0,92813000.0,468970000.0,4130400000.0,88243000.0,119870000.0,1858900000.0,44979000.0,,31977000.0,237900000.0,11794000.0,2636200000.0,37577000.0,52897000000.0,13648000.0,49774000.0,142710000.0,267890000.0,44183000.0,190210000.0,15236000.0,,32433000.0,1189000000.0,,276470000.0,45420000.0,356930000.0,72967000.0,64857000.0,,1269400000.0,1622500000.0,3764900000.0,27759000.0,173140000.0,997060000.0,885060000.0,60026000.0,1329800000.0,141780000.0,679540000.0,1778100000.0,52447000.0,,119970000.0,11732000000.0,,15474000.0,107710000000.0,12020000.0,334390000.0,1116400000.0,470340000.0,,58392000.0,437800000.0,411380000.0,1296200000.0,383820000.0,607790000.0,54000000.0,155950000.0,26097000.0,74560000.0,10847000
0.0,91499000.0,57806000.0,,309070000.0,62863000.0,1295600000.0,90030000.0,,304390000.0,18895000.0,137020000.0,,118060000.0,368670000.0,27985000.0,23137000000.0,44410000.0,562050000.0,150190000.0,,364590000.0,462060000.0,,178490000.0,416460000.0,690440000.0,23534000.0,26543000.0,100630000.0,209110000.0,,12492000.0,120750000.0,147910000.0,228890000.0,479290000.0,,307210000.0,587010000.0,108790000.0,2529700000.0,541460000.0,188130000.0,1669800000.0,85403000.0,23174000.0,29553000.0,1068000000.0,10915000000.0,,3825800000.0,166710000.0,193160000.0,217070000.0,75108000.0,,25740000.0,373750000.0,150180000.0,,479030000.0,24812000.0,,140770000.0,1998600000.0,80960000.0,373600000.0,55254000.0,205010000.0,140390000.0,85323000.0,,549570000.0,4114500000.0,626300000.0,113500000.0,30118000.0,75605000.0,13033000.0,222470000.0,47210000.0,836560000.0,856220000.0,,,852920000.0,2697200000.0,81395000.0,190730000.0, +2019_12_20_14_15_Q-Exactive-HF-X-Orbitrap_6070,308010000.0,245000000.0,15610000.0,164430000.0,5294600000.0,,210960000.0,432630000.0,145180000.0,108340000.0,,653910000.0,221190000.0,12817000.0,21449000.0,293500000.0,,87933000.0,13855000000.0,221820000.0,387800000.0,132850000.0,37574000000.0,2514900000.0,2322800000.0,85675000.0,,127240000.0,232490000.0,179190000.0,3635900000.0,1388800000.0,5293900000.0,4344700000.0,11187000000.0,156830000.0,745320000.0,,2512200000.0,16128000000.0,424660000.0,35391000.0,477720000.0,,,139940000.0,,2513900000.0,9961300000.0,11271000000.0,2236400000.0,323430000.0,,1557700000.0,1836000000.0,98440000.0,764800000.0,199820000.0,4517400000.0,723540000.0,519780000.0,14699000000.0,74927000.0,44352000.0,,864900000.0,593280000.0,441520000.0,1389400000.0,62692000.0,155560000.0,127910000.0,694430000.0,5682500000.0,104660000.0,292280000.0,,272980000.0,9448400.0,99146000.0,357070000.0,24125000.0,2293900000.0,49334000.0,65561000000.0,42065000.0,125000000.0,161270000.0,530530000.0,72854000.0,325420000.0,124150000.0,,42345000.0,1522400000.0,,403760000.0,140120000.
0,565530000.0,122470000.0,43922000.0,,1665500000.0,2289000000.0,4575400000.0,41283000.0,426040000.0,1335400000.0,1218700000.0,98472000.0,2125300000.0,191650000.0,912750000.0,2656700000.0,121510000.0,,310000000.0,18557000000.0,84590000.0,46103000.0,130830000000.0,20963000.0,3979900000.0,1689600000.0,262760000.0,,12794000.0,629270000.0,714250000.0,2103900000.0,1044000000.0,880100000.0,117840000.0,192780000.0,84163000.0,67663000.0,199160000.0,91380000.0,64543000.0,61045000.0,693320000.0,181440000.0,1789600000.0,149440000.0,,767380000.0,62475000.0,202130000.0,,251260000.0,720520000.0,37117000.0,31773000000.0,73355000.0,939300000.0,191950000.0,,742470000.0,866130000.0,,172140000.0,915220000.0,1019400000.0,,21397000.0,117480000.0,405740000.0,39130000.0,52366000.0,254740000.0,261610000.0,346410000.0,737640000.0,237690000.0,573970000.0,930310000.0,290620000.0,4113600000.0,868470000.0,214090000.0,5593600000.0,116680000.0,58903000.0,50428000.0,2439200000.0,15545000000.0,9012900.0,5484400000.0,92837000.0,299050000.0,290020000.0,138500000.0,,31988000.0,1378200000.0,355070000.0,120910000.0,815740000.0,42582000.0,,237960000.0,3345300000.0,143640000.0,581560000.0,108350000.0,292380000.0,263250000.0,170920000.0,,790230000.0,7968300000.0,1614800000.0,287490000.0,47213000.0,111900000.0,81471000.0,466860000.0,121200000.0,1112200000.0,1338400000.0,,,1655300000.0,4295100000.0,204120000.0,331920000.0,1216600000.0 
+2019_12_27_12_29_Q-Exactive-HF-X-Orbitrap_6070,253490000.0,215110000.0,70180000.0,131990000.0,3691000000.0,,132810000.0,150750000.0,98495000.0,43371000.0,,,73315000.0,28792000.0,,154770000.0,,119890000.0,8027700000.0,228350000.0,1558700000.0,53163000.0,21600000000.0,1440800000.0,1144200000.0,51114000.0,25102000.0,72448000.0,171730000.0,85617000.0,2439800000.0,1098600000.0,2946800000.0,2531600000.0,5422100000.0,32904000.0,416330000.0,,1237500000.0,10307000000.0,282410000.0,70963000.0,281740000.0,,,79585000.0,,1845000000.0,6934100000.0,6469800000.0,1311600000.0,249130000.0,,1052700000.0,528870000.0,33525000.0,477380000.0,85851000.0,1998800000.0,227590000.0,312700000.0,8589500000.0,50401000.0,46226000.0,,186100000.0,624750000.0,236630000.0,620010000.0,53278000.0,80966000.0,52733000.0,121100000.0,3135600000.0,79165000.0,173790000.0,2330800000.0,13597000.0,,42346000.0,238870000.0,12666000.0,1979400000.0,16500000.0,41173000000.0,20192000.0,46626000.0,80601000.0,327870000.0,81483000.0,166080000.0,29444000.0,,46459000.0,1127200000.0,,409560000.0,18579000.0,234140000.0,108540000.0,73933000.0,,987970000.0,1280000000.0,2911900000.0,33017000.0,249260000.0,831940000.0,932620000.0,48150000.0,1608200000.0,130050000.0,445080000.0,1784200000.0,,,149180000.0,9598700000.0,,24077000.0,88307000000.0,721900000.0,333140000.0,795100000.0,362270000.0,,51889000.0,393980000.0,388040000.0,967410000.0,463980000.0,638860000.0,31499000.0,87729000.0,24328000.0,50597000.0,138630000.0,38789000.0,48149000.0,,218410000.0,66245000.0,797740000.0,116550000.0,,240200000.0,32631000.0,90780000.0,,128010000.0,508440000.0,,23665000000.0,29470000.0,403400000.0,95995000.0,,483590000.0,418460000.0,13698000.0,143880000.0,244950000.0,666790000.0,18584000.0,,106020000.0,139380000.0,,41991000.0,111750000.0,164610000.0,314670000.0,405020000.0,39900000.0,150190000.0,388860000.0,152520000.0,2065500000.0,567070000.0,153530000.0,1401100000.0,86436000.0,,22542000.0,1143800000.0,8187800000.0,27313000.0,3296900000.0,100960
000.0,150860000.0,100090000.0,68547000.0,,25243000.0,675240000.0,,14287000.0,432270000.0,61763000.0,,203640000.0,1953900000.0,80158000.0,474240000.0,128520000.0,106500000.0,144620000.0,47380000.0,17526000.0,411610000.0,4234000000.0,810160000.0,,29946000.0,38582000.0,28573000.0,215780000.0,39806000.0,756840000.0,500870000.0,,,691770000.0,2488100000.0,,144700000.0,886350000.0 +2019_12_29_15_06_Q-Exactive-HF-X-Orbitrap_6070,472860000.0,,88868000.0,56674000.0,9081200000.0,20411000.0,401790000.0,273330000.0,224910000.0,392740000.0,382940000.0,,152230000.0,,47647000.0,369750000.0,,196330000.0,11000000000.0,180410000.0,439590000.0,20241000.0,60643000000.0,223720000.0,1100600000.0,65662000.0,110070000.0,152340000.0,198830000.0,57398000.0,4147800000.0,1477300000.0,8346700000.0,2553700000.0,14428000000.0,52972000.0,2882300000.0,,3225200000.0,12124000000.0,454370000.0,96204000.0,,,20811000.0,101380000.0,328700000.0,5466600000.0,12999000000.0,15237000000.0,4563600000.0,639760000.0,159830000.0,7987900000.0,1338900000.0,,491210000.0,169580000.0,3742200000.0,1114700000.0,265330000.0,9551500000.0,,76252000.0,27482000.0,888960000.0,862310000.0,3243400000.0,2004800000.0,52984000.0,58978000.0,69839000.0,357020000.0,8898100000.0,173770000.0,643610000.0,,207620000.0,,414430000.0,537970000.0,60879000.0,2374300000.0,36388000.0,132810000000.0,46880000.0,155960000.0,378310000.0,244090000.0,29129000.0,207910000.0,33560000.0,210420000.0,22356000.0,7210000000.0,2539400000.0,709220000.0,7962400.0,814590000.0,,37102000.0,702570000.0,2585100000.0,1761500000.0,5910800000.0,,336430000.0,773070000.0,2126700000.0,104990000.0,1842400000.0,292340000.0,1430200000.0,2038600000.0,276100000.0,174190000.0,,28432000000.0,206370000.0,,144070000000.0,24103000.0,4082200000.0,2544800000.0,851980000.0,22715000.0,103960000.0,892770000.0,1805300000.0,2127000000.0,521610000.0,557860000.0,109470000.0,99954000.0,341590000.0,339030000.0,61101000.0,,55908000.0,109190000.0,424300000.0,406980000.0,4294200000.0,115920000.0
,161870000.0,253800000.0,17528000.0,214250000.0,42328000.0,112150000.0,881180000.0,21874000.0,94428000000.0,136840000.0,1091500000.0,290100000.0,49245000.0,1130000000.0,814740000.0,31373000.0,22890000.0,1473700000.0,505040000.0,56561000.0,23775000.0,286380000.0,327950000.0,116890000.0,,63135000.0,3112900000.0,274140000.0,471790000.0,228720000.0,548240000.0,194560000.0,,1913700000.0,410950000.0,82016000.0,3318900000.0,,142500000.0,191350000.0,715760000.0,20911000000.0,6558300.0,3203300000.0,919960000.0,203580000.0,,73898000.0,,,875790000.0,882000000.0,,1848200000.0,89853000.0,54084000.0,299320000.0,2170100000.0,132520000.0,116190000.0,195380000.0,382390000.0,72813000.0,27394000.0,74925000.0,1691900000.0,7366900000.0,3111100000.0,957840000.0,12603000.0,104600000.0,80960000.0,387360000.0,42019000.0,3051400000.0,1098500000.0,,,821780000.0,3693000000.0,49519000.0,104010000.0,23303000000.0 +2019_12_29_18_18_Q-Exactive-HF-X-Orbitrap_6070,368040000.0,,70719000.0,90370000.0,12378000000.0,,398220000.0,612040000.0,141220000.0,381760000.0,319140000.0,4745100000.0,165340000.0,213390000.0,64692000.0,484480000.0,17174000.0,429980000.0,12636000000.0,325590000.0,497100000.0,33576000.0,63093000000.0,250160000.0,1821900000.0,67057000.0,79366000.0,150020000.0,243210000.0,95039000.0,5013100000.0,1827500000.0,9741600000.0,3322400000.0,16694000000.0,64646000.0,3331700000.0,23039000.0,3924100000.0,13086000000.0,348620000.0,71738000.0,,,30203000.0,198830000.0,338460000.0,5903100000.0,17954000000.0,18310000000.0,5033300000.0,833280000.0,141810000.0,8927400000.0,1559500000.0,,586960000.0,300600000.0,3580200000.0,1318400000.0,325070000.0,8083700000.0,,215510000.0,5683800.0,1115300000.0,833530000.0,3285200000.0,2484000000.0,137920000.0,67991000.0,84879000.0,461790000.0,9912900000.0,200060000.0,634480000.0,,201970000.0,63685000.0,660920000.0,724830000.0,37824000.0,2302600000.0,33247000.0,132590000000.0,19321000.0,188570000.0,455100000.0,392420000.0,60868000.0,231020000.0,57163000.0,326850000.0,3
7175000.0,8189800000.0,1850400000.0,1099600000.0,10469000.0,908800000.0,75454000.0,105370000.0,1043700000.0,2665000000.0,1906800000.0,6689900000.0,,405800000.0,1909900000.0,2590100000.0,116920000.0,3811200000.0,321060000.0,1653800000.0,2084000000.0,514930000.0,133900000.0,253330000.0,31258000000.0,256230000.0,,172120000000.0,,4869300000.0,2926600000.0,1266000000.0,,,1100800000.0,2049000000.0,2063300000.0,576260000.0,829370000.0,248120000.0,375830000.0,422250000.0,394130000.0,47358000.0,,78204000.0,99926000.0,523200000.0,480940000.0,4980000000.0,100450000.0,239250000.0,225540000.0,95196000.0,315990000.0,48205000.0,170400000.0,1011600000.0,95450000.0,105810000000.0,55397000.0,1248600000.0,427640000.0,61015000.0,1322500000.0,937780000.0,149200000.0,74143000.0,1666900000.0,590080000.0,56385000.0,50218000.0,212260000.0,419500000.0,81938000.0,,401930000.0,3457600000.0,408520000.0,701320000.0,139140000.0,383670000.0,344900000.0,,2942900000.0,664220000.0,98604000.0,4044800000.0,59213000.0,127270000.0,205520000.0,1864400000.0,26338000000.0,8410600.0,3611400000.0,783640000.0,355640000.0,46679000.0,44065000.0,,,1082700000.0,1500900000.0,340320000.0,1748400000.0,36389000.0,107360000.0,492970000.0,2714000000.0,241830000.0,106170000.0,94225000.0,631860000.0,50487000.0,239810000.0,98216000.0,2550800000.0,8630200000.0,2771500000.0,830170000.0,,185770000.0,156950000.0,499860000.0,101350000.0,3346400000.0,1129100000.0,,,822710000.0,4171900000.0,102690000.0,101020000.0,24464000000.0 
+2020_01_02_17_38_Q-Exactive-HF-X-Orbitrap_6070,48680000.0,,14975000.0,,4585500000.0,9570900.0,75969000.0,78343000.0,22333000.0,20516000.0,77264000.0,,30439000.0,106410000.0,,84952000.0,,92255000.0,4101900000.0,78195000.0,101960000.0,,29261000000.0,46745000.0,360890000.0,,,14673000.0,90490000.0,16842000.0,1196000000.0,291390000.0,2926600000.0,678680000.0,4998700000.0,,602840000.0,,1286800000.0,5034400000.0,14973000.0,,,38960000.0,,15791000.0,51025000.0,1688900000.0,6915300000.0,4566600000.0,1613700000.0,135240000.0,,1284700000.0,311270000.0,10285000.0,63125000.0,,1477600000.0,486370000.0,32143000.0,4953100000.0,,16030000.0,18553000.0,91300000.0,291530000.0,1088400000.0,352860000.0,,,,326630000.0,3324300000.0,,103130000.0,,34960000.0,,82841000.0,139580000.0,,626920000.0,21122000.0,82002000000.0,,16005000.0,40482000.0,71900000.0,,31977000.0,,41642000.0,,1530500000.0,,151120000.0,,159640000.0,,,162160000.0,587160000.0,542210000.0,2090000000.0,,114410000.0,688910000.0,940310000.0,51950000.0,1502700000.0,107090000.0,385540000.0,590250000.0,162110000.0,,,11103000000.0,33022000.0,,70711000000.0,27037000.0,446490000.0,680370000.0,358520000.0,,,109470000.0,480420000.0,522490000.0,140460000.0,146170000.0,34178000.0,47292000.0,45759000.0,49251000.0,,,,12596000.0,28697000.0,20473000.0,1834500000.0,29131000.0,52134000.0,35967000.0,31505000.0,37093000.0,,,258950000.0,,51801000000.0,,861030000.0,73304000.0,,153460000.0,154710000.0,,,227660000.0,136820000.0,9142800.0,,54311000.0,41034000.0,,,38375000.0,670890000.0,48297000.0,52017000.0,236350000.0,75416000.0,70761000.0,,727910000.0,72377000.0,,746660000.0,20562000.0,,20612000.0,426720000.0,8624500000.0,,665090000.0,165440000.0,52629000.0,,,,9994800.0,224800000.0,530140000.0,,772700000.0,,,10447000.0,551390000.0,,19866000.0,52730000.0,59534000.0,,,,767440000.0,2940000000.0,1166600000.0,275630000.0,,14108000.0,,112570000.0,,863910000.0,370870000.0,,,218370000.0,1555500000.0,,,3534200000.0 
+2020_01_03_11_17_Q-Exactive-HF-X-Orbitrap_6070,66619000.0,,,55405000.0,7191900000.0,3630200.0,105950000.0,151530000.0,20786000.0,,108290000.0,405260000.0,81460000.0,284910000.0,,61982000.0,,137110000.0,6601300000.0,46287000.0,202690000.0,,60145000000.0,66675000.0,909910000.0,,85102000.0,32648000.0,44977000.0,29309000.0,3111000000.0,510530000.0,4716100000.0,1259700000.0,9344400000.0,,1036800000.0,,1058400000.0,12093000000.0,181400000.0,,,44248000.0,,48100000.0,104180000.0,1969700000.0,10389000000.0,8395500000.0,2615300000.0,157190000.0,,3685300000.0,565530000.0,20205000.0,88465000.0,,2395500000.0,636720000.0,125220000.0,6343700000.0,,67488000.0,,278270000.0,488890000.0,1887700000.0,1687600000.0,,,,247610000.0,5263500000.0,,201770000.0,,32899000.0,,144490000.0,439800000.0,,1460200000.0,,324880000000.0,,85224000.0,,130830000.0,18965000.0,49594000.0,15799000.0,152670000.0,,3376800000.0,,228170000.0,,518070000.0,,47635000.0,242250000.0,1244900000.0,1750300000.0,4285900000.0,,177060000.0,655750000.0,1654800000.0,57933000.0,2746100000.0,157500000.0,631010000.0,836450000.0,337410000.0,,,17127000000.0,150760000.0,,155710000000.0,,1909500000.0,1478100000.0,398580000.0,,,197390000.0,754170000.0,1078200000.0,91532000.0,291390000.0,35949000.0,,100810000.0,113630000.0,,,,85219000.0,49205000.0,70833000.0,3213000000.0,52367000.0,10690000.0,17006000.0,,,27718000.0,,753470000.0,,78753000000.0,,558290000.0,141010000.0,,158170000.0,285910000.0,,,656650000.0,87504000.0,,,121460000.0,18782000.0,37474000.0,,106420000.0,1233300000.0,20219000.0,254330000.0,,142550000.0,187650000.0,,724530000.0,179910000.0,,2796800000.0,,47193000.0,18199000.0,,14005000000.0,,624280000.0,187450000.0,75148000.0,,,,,413870000.0,240560000.0,267480000.0,834750000.0,22066000.0,,253880000.0,674710000.0,40607000.0,,,110010000.0,,,,1604300000.0,3197900000.0,1387600000.0,837690000.0,6691800.0,30183000.0,,90010000.0,,1103900000.0,413030000.0,6843600.0,14683000.0,591770000.0,2317500000.0,,86736000.0,8635000000.0 
+2020_01_03_16_58_Q-Exactive-HF-X-Orbitrap_6070,254760000.0,,44656000.0,26138000.0,8382000000.0,106410000.0,262340000.0,417110000.0,97686000.0,269440000.0,292960000.0,,80436000.0,311970000.0,21273000.0,370330000.0,,207780000.0,10288000000.0,270650000.0,228140000.0,,69862000000.0,216660000.0,894800000.0,22145000.0,,141740000.0,240400000.0,,3134600000.0,1403500000.0,6732200000.0,2759100000.0,10531000000.0,,2110700000.0,,2524400000.0,9024600000.0,294750000.0,,,,,210070000.0,195110000.0,4120200000.0,14939000000.0,12269000000.0,4267300000.0,515420000.0,179860000.0,5029200000.0,839720000.0,27897000.0,367390000.0,162690000.0,2526000000.0,1018700000.0,98526000.0,10650000000.0,41109000.0,86332000.0,55923000.0,481160000.0,481000000.0,2225500000.0,1555900000.0,32479000.0,40941000.0,67732000.0,331220000.0,7193200000.0,79483000.0,381730000.0,,56720000.0,,261210000.0,421810000.0,,2132200000.0,124430000.0,119920000000.0,,138960000.0,181750000.0,199860000.0,,70917000.0,,95943000.0,,8939600000.0,,831560000.0,,455780000.0,50611000.0,58816000.0,477030000.0,2392500000.0,882510000.0,5112300000.0,,224790000.0,462530000.0,1891200000.0,76655000.0,761630000.0,197970000.0,1072700000.0,1504000000.0,280740000.0,83675000.0,31262000.0,22827000000.0,150060000.0,,140280000000.0,1007700000.0,1128400000.0,2114200000.0,666680000.0,23213000.0,,558890000.0,1531700000.0,1230000000.0,254290000.0,462420000.0,118720000.0,211220000.0,250400000.0,146550000.0,,,,119840000.0,437020000.0,238660000.0,4485500000.0,150120000.0,66426000.0,117350000.0,46444000.0,334330000.0,25020000.0,67987000.0,680820000.0,8638000.0,100040000000.0,,1133600000.0,206910000.0,12655000.0,1055400000.0,659910000.0,,24331000.0,847700000.0,457950000.0,,43759000.0,320930000.0,337690000.0,,,263010000.0,2492000000.0,191070000.0,453410000.0,343490000.0,147530000.0,224230000.0,,1827100000.0,341350000.0,89109000.0,2060600000.0,122410000.0,,36900000.0,538180000.0,26760000000.0,7231000.0,2600100000.0,326650000.0,259500000.0,,35578000.0,,,484700000
.0,1146300000.0,63434000.0,2065300000.0,80102000.0,20502000.0,269500000.0,1428600000.0,147440000.0,84489000.0,183520000.0,386210000.0,42698000.0,9454700.0,17366000.0,1329200000.0,9278700000.0,2087800000.0,536130000.0,9815900.0,24307000.0,104220000.0,394740000.0,25981000.0,2472800000.0,663050000.0,930250000.0,26020000.0,398860000.0,2693000000.0,9996900.0,42293000.0,791370000.0 +2020_01_03_20_10_Q-Exactive-HF-X-Orbitrap_6070,153430000.0,,63874000.0,36957000.0,10509000000.0,12560000.0,334030000.0,334750000.0,106350000.0,280300000.0,285660000.0,,50914000.0,263700000.0,,362740000.0,,120650000.0,9847400000.0,215740000.0,300170000.0,,68788000000.0,119290000.0,986780000.0,20219000.0,70804000.0,132260000.0,250220000.0,60728000.0,3579700000.0,1487700000.0,6980800000.0,2840700000.0,9538900000.0,,2561800000.0,10191000.0,2447900000.0,9163200000.0,242790000.0,72693000.0,,,,168240000.0,165020000.0,3999100000.0,17140000000.0,11797000000.0,3430300000.0,596050000.0,101680000.0,8648800000.0,908750000.0,52008000.0,181970000.0,226330000.0,3245800000.0,1059000000.0,105710000.0,11522000000.0,21885000.0,120130000.0,25261000.0,609180000.0,509740000.0,2558500000.0,1701600000.0,42712000.0,19717000.0,88388000.0,532260000.0,6505200000.0,94528000.0,446460000.0,,35611000.0,44675000.0,389170000.0,408500000.0,52717000.0,1475700000.0,26941000.0,104010000000.0,,144640000.0,190310000.0,364830000.0,46834000.0,57983000.0,38070000.0,86116000.0,39546000.0,9259700000.0,,506130000.0,,922540000.0,48572000.0,10332000.0,619280000.0,2856500000.0,1092400000.0,4423400000.0,,356570000.0,1691400000.0,1689300000.0,98362000.0,1802300000.0,339010000.0,1012200000.0,1388400000.0,705590000.0,48572000.0,636470000.0,27415000000.0,191930000.0,,151850000000.0,64209000.0,1021300000.0,2117800000.0,823330000.0,,103600000.0,621350000.0,1513400000.0,1300600000.0,375030000.0,742820000.0,111580000.0,281680000.0,133370000.0,263010000.0,,,,37461000.0,393090000.0,276730000.0,5342100000.0,95791000.0,83096000.0,286020000.0,50980000.0,18
2720000.0,,66352000.0,807660000.0,,99426000000.0,104590000.0,1140600000.0,167930000.0,13797000.0,654200000.0,772290000.0,,,1330100000.0,245200000.0,,24980000.0,344830000.0,386280000.0,46833000.0,22278000.0,296350000.0,2918600000.0,281620000.0,400010000.0,299150000.0,214210000.0,188760000.0,23444000.0,2276700000.0,413130000.0,54915000.0,2493100000.0,54906000.0,39360000.0,86412000.0,1532100000.0,27627000000.0,,3131600000.0,309070000.0,295020000.0,,49096000.0,,,462890000.0,1072500000.0,,1558400000.0,66151000.0,35432000.0,283120000.0,960850000.0,223050000.0,38258000.0,170230000.0,361870000.0,72510000.0,19247000.0,17584000.0,1936700000.0,10190000000.0,1255700000.0,470360000.0,,66716000.0,117930000.0,470060000.0,33633000.0,1823100000.0,745110000.0,190910000.0,,519780000.0,3077900000.0,,53360000.0,20002000000.0 +2020_01_04_04_23_Q-Exactive-HF-X-Orbitrap_6070,145600000.0,,105420000.0,106410000.0,13867000000.0,23311000.0,474050000.0,512710000.0,147720000.0,357750000.0,378020000.0,1043800000.0,90349000.0,167280000.0,54206000.0,544170000.0,,326390000.0,9799000000.0,298990000.0,402410000.0,,82793000000.0,239500000.0,1690300000.0,41626000.0,81844000.0,129780000.0,256520000.0,123050000.0,3778000000.0,1901300000.0,8153000000.0,3128600000.0,14159000000.0,,2262400000.0,8696300.0,3204600000.0,12287000000.0,367180000.0,124080000.0,,,34502000.0,166030000.0,250500000.0,4662600000.0,20588000000.0,17400000000.0,4911500000.0,585100000.0,224100000.0,8609100000.0,1312300000.0,64725000.0,511060000.0,120920000.0,3158900000.0,1485800000.0,179110000.0,11670000000.0,43228000.0,226050000.0,87108000.0,788210000.0,1029500000.0,3379100000.0,1468800000.0,127160000.0,60028000.0,153500000.0,485820000.0,11555000000.0,56208000.0,605600000.0,,119520000.0,,557120000.0,489820000.0,,1551100000.0,83899000.0,187730000000.0,,109160000.0,219090000.0,597910000.0,36929000.0,136760000.0,61619000.0,132550000.0,40544000.0,10316000000.0,3785700000.0,868110000.0,,1142200000.0,64221000.0,223590000.0,754740000.0,214760000
0.0,1721400000.0,5804500000.0,,345110000.0,1831100000.0,2048700000.0,120520000.0,1983500000.0,392240000.0,1325300000.0,1891100000.0,534580000.0,164330000.0,694390000.0,32217000000.0,160100000.0,,196470000000.0,1582100000.0,4707200000.0,2640400000.0,1099200000.0,15209000.0,27145000.0,907750000.0,2079800000.0,1688800000.0,409120000.0,654810000.0,142390000.0,666320000.0,206200000.0,436190000.0,,,,71085000.0,429620000.0,253240000.0,5750000000.0,152000000.0,173830000.0,333750000.0,25509000.0,231650000.0,31927000.0,86026000.0,1108700000.0,101710000.0,119480000000.0,88187000.0,1387700000.0,191460000.0,16528000.0,1249500000.0,1115400000.0,,49978000.0,2014200000.0,526150000.0,,60510000.0,303810000.0,361420000.0,81618000.0,,354690000.0,2573100000.0,315010000.0,393960000.0,601870000.0,498690000.0,220750000.0,,3743300000.0,429410000.0,125620000.0,3255300000.0,123810000.0,50678000.0,55904000.0,307520000.0,28218000000.0,,2937100000.0,733080000.0,353420000.0,,75426000.0,,,602090000.0,1745600000.0,,1778100000.0,131660000.0,35686000.0,258540000.0,2871100000.0,252470000.0,103810000.0,229070000.0,465700000.0,45600000.0,137590000.0,37452000.0,2186500000.0,8166100000.0,1988300000.0,1009500000.0,42173000.0,91082000.0,121190000.0,398000000.0,57716000.0,4053100000.0,897080000.0,,67364000.0,713920000.0,3581300000.0,41172000.0,142100000.0,28189000000.0 
+2020_01_04_10_03_Q-Exactive-HF-X-Orbitrap_6070,,,,,4976000000.0,,46084000.0,,5327200.0,209770000.0,37379000.0,,,97159000.0,,17899000.0,,83625000.0,1289700000.0,40630000.0,,,19144000000.0,,221560000.0,,,18229000.0,23383000.0,,391980000.0,265950000.0,1147900000.0,312290000.0,1897600000.0,,342140000.0,,247830000.0,3212000000.0,47485000.0,,,,,18789000.0,16646000.0,1037900000.0,2515900000.0,2096300000.0,1376100000.0,14049000.0,12440000.0,310630000.0,212960000.0,,14572000.0,,1369900000.0,143400000.0,10110000.0,1632600000.0,,6107200.0,,109430000.0,142310000.0,423590000.0,205860000.0,,,,58847000.0,1590500000.0,,59202000.0,398880000.0,,,135560000.0,41299000.0,,412460000.0,,38844000000.0,,11109000.0,,53499000.0,,,,10139000.0,,2766700000.0,880530000.0,296160000.0,,220810000.0,,9811400.0,64461000.0,311490000.0,147640000.0,1564600000.0,,44822000.0,410370000.0,429110000.0,,904140000.0,44138000.0,49709000.0,304070000.0,34404000.0,,,5052100000.0,,,53286000000.0,,176870000.0,238800000.0,45552000.0,,,77246000.0,290510000.0,275180000.0,60167000.0,49587000.0,,94989000.0,,19499000.0,,,,,25086000.0,,1195700000.0,,5590500.0,12158000.0,,57402000.0,,,83061000.0,,34597000000.0,,93188000.0,24111000.0,,139100000.0,88199000.0,,,217330000.0,29312000.0,,15332000.0,40639000.0,63681000.0,,,22802000.0,228860000.0,10939000.0,106520000.0,75776000.0,8941200.0,,,347980000.0,57308000.0,,323140000.0,19946000.0,,,,7790600000.0,,646310000.0,13492000.0,128010000.0,,,,,86164000.0,170530000.0,,190970000.0,,,21338000.0,783020000.0,57621000.0,11147000.0,5409600.0,56297000.0,7786300.0,,,245700000.0,3174300000.0,239160000.0,48197000.0,,,24479000.0,39926000.0,,307440000.0,57807000.0,,7393900.0,27723000.0,476050000.0,,,15013000000.0 
+2020_01_04_14_59_Q-Exactive-HF-X-Orbitrap_6070,,,,,3663800000.0,,16918000.0,300770000.0,19883000.0,12938000.0,57246000.0,,,71226000.0,,,,71042000.0,3383100000.0,42605000.0,50844000.0,,25184000000.0,221930000.0,196220000.0,,19361000.0,31180000.0,92077000.0,17276000.0,1130000000.0,502520000.0,2272600000.0,633850000.0,3854700000.0,,660370000.0,,959450000.0,3455000000.0,1439100000.0,,,,9384800.0,,31799000.0,1399200000.0,6907100000.0,3329100000.0,1478500000.0,51086000.0,95206000.0,1349300000.0,331580000.0,11746000.0,22194000.0,,1254200000.0,373220000.0,63323000.0,3478900000.0,,,,121640000.0,444510000.0,950070000.0,331920000.0,,,,54468000.0,3853800000.0,,120970000.0,,16066000.0,,118080000.0,125310000.0,,616780000.0,,69416000000.0,,83519000.0,47585000.0,42557000.0,,52972000.0,19262000.0,129410000.0,,2879700000.0,,33295000.0,,90154000.0,,,214030000.0,969940000.0,421000000.0,1578200000.0,,28558000.0,635210000.0,1055700000.0,17631000.0,1044500000.0,51421000.0,21149000.0,480180000.0,105450000.0,19644000.0,,11602000000.0,37479000.0,,40467000000.0,,1041900000.0,896910000.0,209520000.0,,,237540000.0,226010000.0,214210000.0,25707000.0,247470000.0,64867000.0,50600000.0,45592000.0,,,,,,42166000.0,123600000.0,2436400000.0,36240000.0,34445000.0,45712000.0,,76543000.0,,,323150000.0,,45988000000.0,,307740000.0,47858000.0,,71696000.0,88370000.0,,,450890000.0,43520000.0,,,80257000.0,33132000.0,,,70113000.0,678630000.0,50221000.0,90815000.0,,90162000.0,74023000.0,,784620000.0,51724000.0,9358300.0,583670000.0,,,57974000.0,,9926600000.0,,705740000.0,99178000.0,22718000.0,,4911500.0,,,,501660000.0,,309740000.0,,,28091000.0,976630000.0,28518000.0,,57246000.0,67461000.0,27843000.0,,,458530000.0,2741400000.0,610600000.0,182050000.0,,15085000.0,,42562000.0,,1013600000.0,282040000.0,18584000.0,,187420000.0,1322800000.0,,,5929800000.0 
+2020_01_06_20_17_Q-Exactive-HF-X-Orbitrap_6070,475080000.0,,149480000.0,,18149000000.0,73972000.0,434650000.0,279420000.0,157280000.0,226300000.0,331780000.0,,302090000.0,186220000.0,,599680000.0,,551570000.0,19192000000.0,267420000.0,679270000.0,,82977000000.0,215860000.0,1771100000.0,,77334000.0,170750000.0,328070000.0,,6119700000.0,2063300000.0,13270000000.0,3370200000.0,17896000000.0,,3486700000.0,40669000.0,4483800000.0,19125000000.0,5232600000.0,155490000.0,7222900000.0,,31236000.0,221450000.0,352570000.0,8212000000.0,23814000000.0,22082000000.0,3241500000.0,832510000.0,131910000.0,9474800000.0,1719000000.0,64243000.0,652800000.0,304750000.0,3398800000.0,1042100000.0,360260000.0,11457000000.0,39847000.0,227830000.0,37628000.0,923120000.0,873900000.0,4011400000.0,2558500000.0,89278000.0,,136100000.0,686640000.0,14207000000.0,258300000.0,669660000.0,,145320000.0,,203640000.0,682510000.0,,3192700000.0,93883000.0,174180000000.0,,236030000.0,416540000.0,634150000.0,,281320000.0,64779000.0,456290000.0,,4837400000.0,2634800000.0,1269000000.0,18954000.0,1010200000.0,85731000.0,201180000.0,1093300000.0,3914900000.0,3390300000.0,8126500000.0,,541610000.0,1602100000.0,2212400000.0,247690000.0,2844900000.0,521970000.0,1650400000.0,2034800000.0,1037400000.0,120780000.0,297370000.0,38050000000.0,130980000.0,,216340000000.0,,2187600000.0,4339100000.0,1678000000.0,51235000.0,,957460000.0,2107600000.0,2893700000.0,829340000.0,917060000.0,271940000.0,770470000.0,647490000.0,542790000.0,66564000.0,,110210000.0,138110000.0,412800000.0,265770000.0,9202300000.0,182570000.0,236990000.0,544490000.0,44483000.0,227490000.0,69403000.0,196050000.0,1045600000.0,118730000.0,144230000000.0,93374000.0,2026800000.0,355330000.0,25836000.0,1202000000.0,867550000.0,57121000.0,40295000.0,2175900000.0,516170000.0,,51057000.0,294480000.0,256970000.0,109050000.0,,336450000.0,4334700000.0,282300000.0,320970000.0,490880000.0,628110000.0,472660000.0,,4278200000.0,825110000.0,125760000.0,4245500000.0,1
02030000.0,112860000.0,367010000.0,2287100000.0,21519000000.0,,3300300000.0,1179700000.0,529880000.0,54741000.0,146130000.0,,,831560000.0,2238300000.0,108270000.0,2581900000.0,138440000.0,225630000.0,315740000.0,2900500000.0,446180000.0,287850000.0,40125000.0,617470000.0,117680000.0,108560000.0,72430000.0,3075900000.0,12763000000.0,3815200000.0,1687600000.0,84927000.0,245550000.0,25662000.0,585670000.0,52316000.0,3545900000.0,1075300000.0,288320000.0,100050000.0,1319500000.0,6213100000.0,87840000.0,159980000.0,11295000000.0 +2020_01_08_16_43_Q-Exactive-HF-X-Orbitrap_6070,111550000.0,,22502000.0,,6993100000.0,,116990000.0,257260000.0,53797000.0,16190000.0,105750000.0,,,116980000.0,,199020000.0,,83357000.0,6340500000.0,71701000.0,138970000.0,,35358000000.0,315720000.0,492130000.0,,,65918000.0,76934000.0,37219000.0,1583100000.0,749750000.0,4205400000.0,906590000.0,5697500000.0,,1215500000.0,,1235000000.0,5789900000.0,17841000.0,,,905940000.0,,,87384000.0,2230000000.0,7915500000.0,5748600000.0,1461100000.0,197850000.0,182270000.0,2271200000.0,606550000.0,25016000.0,148570000.0,,1687100000.0,603670000.0,62981000.0,4428800000.0,,41507000.0,5691800.0,255320000.0,352060000.0,1456700000.0,552040000.0,,18593000.0,,285670000.0,5396900000.0,61145000.0,161690000.0,,60100000.0,,66979000.0,255980000.0,,911290000.0,4995800.0,94155000000.0,,106160000.0,60188000.0,102010000.0,,25013000.0,,90899000.0,,1895000000.0,1907100000.0,252330000.0,,443200000.0,,37614000.0,332200000.0,1930400000.0,745640000.0,3136400000.0,,188340000.0,848380000.0,1074900000.0,49772000.0,767240000.0,163010000.0,690760000.0,848760000.0,212330000.0,,37695000.0,17103000000.0,49374000.0,,83227000000.0,,601760000.0,1535800000.0,397240000.0,,,460850000.0,608860000.0,864570000.0,77318000.0,480240000.0,42799000.0,97464000.0,99908000.0,118750000.0,,,,38979000.0,312740000.0,150910000.0,3486100000.0,,75848000.0,92407000.0,21758000.0,,,,411480000.0,25799000.0,63174000000.0,,1205800000.0,87991000.0,,227890000.0,223770000.0,,
,623490000.0,86823000.0,18140000.0,,179170000.0,68936000.0,,,130730000.0,1224600000.0,49802000.0,153510000.0,185280000.0,230120000.0,92411000.0,,1171000000.0,58633000.0,32613000.0,1120600000.0,49690000.0,36816000.0,75352000.0,382080000.0,15575000000.0,,875610000.0,130900000.0,59754000.0,,,,,,678340000.0,59467000.0,1157600000.0,,,93000000.0,962600000.0,42671000.0,,121880000.0,176510000.0,28434000.0,33546000.0,29740000.0,999700000.0,3924000000.0,1279600000.0,374040000.0,,67340000.0,16396000.0,155990000.0,25446000.0,1215300000.0,643820000.0,,,246530000.0,1919100000.0,,51133000.0,10364000000.0 +2020_01_09_11_07_Q-Exactive-HF-X-Orbitrap_6070,285920000.0,,52084000.0,46667000.0,7403500000.0,,300990000.0,,120520000.0,71859000.0,195240000.0,,51116000.0,82907000.0,20581000.0,180650000.0,,220070000.0,6948000000.0,82227000.0,179160000.0,,50500000000.0,70231000.0,909130000.0,17474000.0,41798000.0,123130000.0,212040000.0,13832000.0,2954100000.0,897470000.0,5225000000.0,1917300000.0,9147700000.0,,1272900000.0,,2357000000.0,8706100000.0,124660000.0,,,,,25887000.0,113620000.0,2792100000.0,9795400000.0,9600000000.0,2486800000.0,236190000.0,83692000.0,3006000000.0,693330000.0,62701000.0,254480000.0,62145000.0,2786000000.0,721980000.0,69540000.0,7228500000.0,,192610000.0,13949000.0,367560000.0,581000000.0,1631800000.0,1020600000.0,27698000.0,,,813770000.0,5461500000.0,,468900000.0,,50200000.0,,183850000.0,295750000.0,,1511600000.0,44090000.0,97039000000.0,,144640000.0,205620000.0,282760000.0,,115960000.0,,173770000.0,,2878800000.0,1098800000.0,495350000.0,,580130000.0,,123840000.0,545880000.0,2316500000.0,1082500000.0,2401900000.0,,177720000.0,1562300000.0,1019000000.0,53944000.0,2002300000.0,248620000.0,873050000.0,1491200000.0,125970000.0,92284000.0,,17387000000.0,56911000.0,,120850000000.0,1621000000.0,1087800000.0,1484800000.0,360640000.0,7895800.0,,710320000.0,1037900000.0,1093100000.0,299340000.0,280260000.0,149550000.0,315790000.0,105830000.0,164530000.0,,,,46241000.0,410920000.
0,165390000.0,2949100000.0,78075000.0,95034000.0,103810000.0,18775000.0,173270000.0,,17708000.0,428540000.0,13866000.0,73106000000.0,,1157000000.0,104830000.0,,497770000.0,399010000.0,41700000.0,24358000.0,688990000.0,241860000.0,34818000.0,,195240000.0,85947000.0,25502000.0,,193640000.0,1813300000.0,65739000.0,304850000.0,231730000.0,105150000.0,66848000.0,8814100.0,1770400000.0,323560000.0,41779000.0,1934000000.0,15475000.0,28259000.0,60321000.0,1418000000.0,17565000000.0,5037500.0,1463700000.0,309790000.0,98692000.0,,79578000.0,,,537840000.0,686200000.0,105490000.0,1220900000.0,36996000.0,17815000.0,210360000.0,1493900000.0,123240000.0,37092000.0,,423580000.0,20752000.0,,21374000.0,1464100000.0,6083300000.0,2004600000.0,326010000.0,,70278000.0,,195430000.0,14076000.0,2017600000.0,535770000.0,72680000.0,10256000.0,487810000.0,2371400000.0,,33256000.0,13877000000.0 +2020_01_15_13_56_Q-Exactive-HF-X-Orbitrap_6070,126250000.0,325410000.0,18992000.0,26485000.0,2986200000.0,,124020000.0,158520000.0,31774000.0,15155000.0,,,54722000.0,69378000.0,,143560000.0,,63426000.0,5469100000.0,26647000.0,118710000.0,10136000.0,16610000000.0,458860000.0,634240000.0,45564000.0,7000300.0,40703000.0,141130000.0,13318000.0,1418100000.0,326010000.0,1304200000.0,874300000.0,3402600000.0,57710000.0,236880000.0,,693650000.0,6177300000.0,161840000.0,24873000.0,420810000.0,,,105200000.0,16109000.0,797280000.0,4822600000.0,3440800000.0,1073500000.0,223250000.0,,457120000.0,395130000.0,20563000.0,188930000.0,83549000.0,1044100000.0,284240000.0,133160000.0,5048900000.0,,,,37155000.0,339500000.0,109740000.0,449800000.0,32708000.0,47169000.0,54771000.0,43121000.0,2323100000.0,71775000.0,49580000.0,981010000.0,12679000.0,,,163300000.0,16596000.0,953420000.0,13611000.0,34805000000.0,9370100.0,27863000.0,21891000.0,188740000.0,,110250000.0,17727000.0,,11019000.0,1232900000.0,,146110000.0,13017000.0,256120000.0,17527000.0,,,495010000.0,764240000.0,2034500000.0,35834000.0,172420000.0,605240000.0,787460
000.0,22425000.0,729190000.0,33904000.0,357240000.0,1241400000.0,35860000.0,,23734000.0,6310700000.0,,,41813000000.0,558610000.0,274910000.0,612320000.0,145870000.0,2284900.0,,199920000.0,115080000.0,493880000.0,245640000.0,35642000.0,,,,40094000.0,49304000.0,33999000.0,,,409520000.0,99682000.0,267540000.0,75471000.0,,128150000.0,48869000.0,40183000.0,,55531000.0,337220000.0,,14485000000.0,,343390000.0,10744000.0,,113190000.0,252380000.0,,27997000.0,121300000.0,309770000.0,3885800.0,15224000.0,44367000.0,58912000.0,3941000.0,8737800.0,93086000.0,149170000.0,242300000.0,229810000.0,198660000.0,89415000.0,231050000.0,68709000.0,1283300000.0,290000000.0,30300000.0,912230000.0,52465000.0,6748000.0,,707470000.0,7821900000.0,,2061100000.0,129450000.0,30753000.0,72618000.0,33446000.0,,,348950000.0,35378000.0,,342940000.0,42401000.0,,23772000.0,1165800000.0,33135000.0,211710000.0,142430000.0,71902000.0,45051000.0,30615000.0,2849500.0,285560000.0,3326200000.0,432460000.0,44389000.0,,87530000.0,9566500.0,109890000.0,,427000000.0,415820000.0,4083500.0,,520550000.0,1414100000.0,48176000.0,124350000.0, 
+2020_01_20_15_10_Q-Exactive-HF-X-Orbitrap_6070,927030000.0,,232580000.0,173290000.0,16965000000.0,171600000.0,376880000.0,,529230000.0,571620000.0,520220000.0,1133700000.0,295660000.0,343390000.0,135030000.0,920840000.0,33899000.0,563290000.0,20322000000.0,410750000.0,648830000.0,78373000.0,106720000000.0,479270000.0,2068500000.0,59237000.0,82027000.0,232250000.0,471810000.0,173250000.0,7230900000.0,2179800000.0,14644000000.0,4618200000.0,24592000000.0,,4137100000.0,24968000.0,5198300000.0,19655000000.0,497220000.0,179820000.0,15404000000.0,,63182000.0,515060000.0,393520000.0,8583500000.0,30174000000.0,28531000000.0,6570200000.0,1318200000.0,313150000.0,11485000000.0,2502400000.0,224300000.0,861960000.0,163930000.0,5938200000.0,1455800000.0,455250000.0,21867000000.0,94150000.0,273650000.0,88056000.0,1688700000.0,1471700000.0,5737700000.0,3445300000.0,135230000.0,214750000.0,123910000.0,952910000.0,14597000000.0,249190000.0,1070300000.0,,304360000.0,85572000.0,773810000.0,875740000.0,102500000.0,3500200000.0,197030000.0,233730000000.0,34471000.0,278140000.0,392090000.0,656090000.0,,274430000.0,93349000.0,469070000.0,66567000.0,10800000000.0,4634800000.0,1228300000.0,,1515500000.0,155390000.0,197850000.0,1681000000.0,4757200000.0,3116300000.0,8271300000.0,69546000.0,723250000.0,2576000000.0,2444700000.0,274890000.0,3614400000.0,760350000.0,3029700000.0,3917100000.0,582240000.0,401850000.0,328920000.0,45003000000.0,429950000.0,,292670000000.0,2067600000.0,8424000000.0,5434700000.0,1734900000.0,53423000.0,63689000.0,1644700000.0,2722300000.0,3973700000.0,1089700000.0,1402000000.0,335990000.0,631140000.0,624940000.0,551610000.0,,,53377000.0,198440000.0,1320600000.0,732590000.0,8297400000.0,445090000.0,292190000.0,309700000.0,9618300.0,313040000.0,106440000.0,264780000.0,1356900000.0,157640000.0,193120000000.0,200800000.0,2573900000.0,580920000.0,,1548900000.0,1110300000.0,122700000.0,59336000.0,2251200000.0,588620000.0,102360000.0,98081000.0,591330000.0,769290000.0,8206
6000.0,56557000.0,444990000.0,6475400000.0,374430000.0,764410000.0,106470000.0,1120600000.0,620660000.0,64107000.0,4276300000.0,1008300000.0,118380000.0,6397200000.0,93959000.0,240230000.0,346910000.0,1377900000.0,36008000000.0,,4936600000.0,845960000.0,498440000.0,80611000.0,256140000.0,,,1513700000.0,1167700000.0,275790000.0,3787000000.0,182910000.0,107310000.0,797780000.0,2759800000.0,588670000.0,195120000.0,149290000.0,1083300000.0,122770000.0,97393000.0,125180000.0,3286800000.0,13145000000.0,3263800000.0,1541800000.0,28392000.0,37736000.0,299780000.0,965540000.0,279490000.0,5334700000.0,1846800000.0,554100000.0,,1072800000.0,6073300000.0,92191000.0,234860000.0,29844000000.0 +2020_02_05_20_55_Q-Exactive-HF-X-Orbitrap_6070,570460000.0,,126720000.0,50917000.0,13848000000.0,71651000.0,444550000.0,507910000.0,256830000.0,385960000.0,1017800000.0,3151100000.0,304260000.0,194220000.0,37138000.0,581210000.0,,523700000.0,14674000000.0,421320000.0,682800000.0,,102460000000.0,449260000.0,2038700000.0,,122750000.0,235780000.0,447980000.0,103900000.0,5502600000.0,2053400000.0,17031000000.0,4622400000.0,21156000000.0,35717000.0,3303700000.0,27789000.0,4948700000.0,21724000000.0,457850000.0,159240000.0,,,49929000.0,437820000.0,336310000.0,6932900000.0,26289000000.0,22125000000.0,3537400000.0,788410000.0,107000000.0,11422000000.0,1635600000.0,156260000.0,565070000.0,106980000.0,5109400000.0,1045100000.0,299250000.0,14349000000.0,,103780000.0,83666000.0,1306100000.0,893640000.0,4058500000.0,2775600000.0,67303000.0,89354000.0,168510000.0,806360000.0,13979000000.0,287880000.0,791370000.0,,152060000.0,,588040000.0,544890000.0,44497000.0,2891800000.0,139680000.0,260060000000.0,,170240000.0,221200000.0,748570000.0,,214800000.0,72830000.0,381650000.0,,6407200000.0,4146500000.0,1396900000.0,,1040000000.0,182980000.0,268290000.0,1159100000.0,3401700000.0,1901600000.0,8565000000.0,,453320000.0,903250000.0,3132600000.0,194220000.0,4122200000.0,517490000.0,2271100000.0,2067100000.0,605470
000.0,259170000.0,183010000.0,41688000000.0,114300000.0,,246430000000.0,,6703600000.0,4094900000.0,2587500000.0,15927000.0,79748000.0,1238500000.0,2399900000.0,3335200000.0,851400000.0,720030000.0,208490000.0,630220000.0,252250000.0,425760000.0,33571000.0,,,100370000.0,749950000.0,235810000.0,8582500000.0,81963000.0,126210000.0,454750000.0,77143000.0,213130000.0,51172000.0,120980000.0,1340400000.0,72599000.0,152980000000.0,,1873400000.0,724060000.0,37016000.0,1552800000.0,1232400000.0,,,2019200000.0,732140000.0,,42800000.0,421990000.0,364850000.0,26650000.0,19474000.0,628150000.0,4211700000.0,273650000.0,803090000.0,103560000.0,739690000.0,477220000.0,40767000.0,3318500000.0,853840000.0,184680000.0,4943800000.0,92200000.0,264450000.0,126080000.0,1575600000.0,37737000000.0,,3300400000.0,915790000.0,515690000.0,34247000.0,146670000.0,,,1070700000.0,1890800000.0,203720000.0,2361500000.0,138520000.0,171980000.0,625050000.0,3483900000.0,405230000.0,211140000.0,348250000.0,733490000.0,24081000.0,128790000.0,87798000.0,3182700000.0,11740000000.0,4547800000.0,1722200000.0,92394000.0,153980000.0,73590000.0,575090000.0,185470000.0,4262900000.0,1168400000.0,,43065000.0,1211900000.0,5530800000.0,,175770000.0,24984000000.0 
+2020_02_10_15_41_Q-Exactive-HF-X-Orbitrap_6070,352960000.0,649450000.0,247620000.0,477200000.0,9029200000.0,19059000.0,,710020000.0,300300000.0,96348000.0,,,798010000.0,,143420000.0,380220000.0,53947000.0,363600000.0,23944000000.0,408430000.0,683620000.0,286440000.0,58597000000.0,4298600000.0,4458200000.0,395270000.0,237190000.0,319630000.0,866410000.0,250270000.0,8275800000.0,3104800000.0,10475000000.0,6973700000.0,19965000000.0,676900000.0,1726400000.0,21307000.0,6213900000.0,30935000000.0,824370000.0,324000000.0,1239700000.0,,33690000.0,579230000.0,145100000.0,4699000000.0,16942000000.0,19996000000.0,3405800000.0,1200700000.0,69128000.0,743930000.0,2388800000.0,254870000.0,1005600000.0,555770000.0,8393800000.0,692700000.0,1053500000.0,23861000000.0,56735000.0,217540000.0,,1524200000.0,1485700000.0,939080000.0,2006300000.0,341810000.0,495040000.0,370650000.0,416540000.0,8315000000.0,267140000.0,333300000.0,,236480000.0,28804000.0,92050000.0,1028600000.0,28947000.0,3291300000.0,72045000.0,75203000000.0,28683000.0,260460000.0,242300000.0,1342900000.0,178010000.0,763770000.0,494930000.0,,208220000.0,1926000000.0,,1501000000.0,419970000.0,1622000000.0,197770000.0,115390000.0,,3233600000.0,3874700000.0,9563600000.0,209060000.0,822400000.0,3074700000.0,3031100000.0,234440000.0,5421900000.0,314900000.0,1721500000.0,5595000000.0,231960000.0,39689000.0,345410000.0,37389000000.0,165070000.0,174730000.0,217570000000.0,2638500000.0,6809600000.0,4161900000.0,1065700000.0,26891000.0,169500000.0,1323600000.0,946850000.0,3287100000.0,1445700000.0,1138600000.0,59590000.0,,187670000.0,1143500000.0,275770000.0,212870000.0,148640000.0,177060000.0,1449900000.0,534830000.0,2334000000.0,107540000.0,44396000.0,1255000000.0,180090000.0,145900000.0,,286310000.0,1278900000.0,74743000.0,67331000000.0,222380000.0,1615900000.0,537620000.0,,1176800000.0,1814800000.0,213080000.0,492510000.0,1777800000.0,2457800000.0,,33186000.0,331920000.0,546360000.0,79473000.0,134610000.0,525660000.0,88586000
0.0,455470000.0,1406500000.0,,624670000.0,1499800000.0,651440000.0,7834600000.0,2357600000.0,710960000.0,4039400000.0,308930000.0,66930000.0,132410000.0,3602500000.0,31603000000.0,63735000.0,8844000000.0,576540000.0,631180000.0,767800000.0,487710000.0,,28667000.0,2066200000.0,350990000.0,200770000.0,1519900000.0,264830000.0,29046000.0,664540000.0,4864400000.0,183860000.0,1597700000.0,197270000.0,455580000.0,331310000.0,332430000.0,89448000.0,1502100000.0,11620000000.0,3062400000.0,808340000.0,119230000.0,445040000.0,117100000.0,567650000.0,178610000.0,2386400000.0,2748900000.0,259700000.0,9209100.0,3314500000.0,9997300000.0,439490000.0,795340000.0, +2020_02_11_10_35_Q-Exactive-HF-X-Orbitrap_6070,516840000.0,557710000.0,203690000.0,271620000.0,3676500000.0,,465820000.0,801040000.0,360180000.0,128670000.0,,,558900000.0,43671000.0,129030000.0,566240000.0,70866000.0,271700000.0,21521000000.0,272950000.0,590830000.0,284000000.0,53062000000.0,3888200000.0,3456500000.0,156790000.0,145860000.0,292620000.0,761330000.0,220590000.0,6726300000.0,2531200000.0,9408200000.0,6199600000.0,15792000000.0,335920000.0,1625900000.0,16670000.0,5234500000.0,24100000000.0,730440000.0,191340000.0,9003300000.0,,,659470000.0,103310000.0,2821500000.0,15400000000.0,17899000000.0,2704400000.0,1387000000.0,71552000.0,1521500000.0,1853400000.0,267660000.0,832200000.0,397920000.0,5695300000.0,579080000.0,798280000.0,14129000000.0,69706000.0,96766000.0,,1307700000.0,1431600000.0,453560000.0,1932800000.0,161960000.0,466310000.0,322810000.0,299220000.0,7326300000.0,183980000.0,255610000.0,,215490000.0,52194000.0,59103000.0,1142200000.0,29144000.0,4823500000.0,119680000.0,62867000000.0,59298000.0,286620000.0,394250000.0,935510000.0,187720000.0,655060000.0,339300000.0,,95759000.0,3902800000.0,,1109700000.0,496160000.0,1168000000.0,176390000.0,56223000.0,,2536100000.0,2888500000.0,8721700000.0,171860000.0,725160000.0,2533300000.0,2471000000.0,190640000.0,4462500000.0,186440000.0,1976400000.0,4575200000.0,
216240000.0,17893000.0,509630000.0,30313000000.0,85640000.0,171600000.0,200950000000.0,2315600000.0,858660000.0,3642800000.0,1018700000.0,,55806000.0,1033200000.0,527690000.0,2724900000.0,1446900000.0,966750000.0,94102000.0,,154940000.0,160450000.0,133650000.0,157690000.0,90179000.0,184190000.0,1171600000.0,510650000.0,2647300000.0,263910000.0,19944000.0,1271500000.0,167810000.0,44671000.0,,352720000.0,1296800000.0,92686000.0,49694000000.0,178190000.0,1565100000.0,424050000.0,12340000.0,679830000.0,1273100000.0,314950000.0,372080000.0,1247800000.0,1832700000.0,55656000.0,38483000.0,302780000.0,539790000.0,67978000.0,132510000.0,453860000.0,617980000.0,399540000.0,1160200000.0,97326000.0,457000000.0,1258300000.0,461660000.0,6232500000.0,1766800000.0,683900000.0,3584100000.0,260500000.0,58751000.0,223170000.0,3013000000.0,22317000000.0,136450000.0,7270900000.0,622240000.0,399710000.0,711800000.0,387160000.0,,69277000.0,1757200000.0,275430000.0,185480000.0,1028700000.0,339820000.0,,577350000.0,5522700000.0,159720000.0,1196800000.0,259550000.0,452080000.0,326200000.0,260370000.0,87896000.0,1320000000.0,9818100000.0,2154700000.0,795330000.0,142300000.0,402160000.0,105090000.0,365600000.0,207510000.0,2498500000.0,1894700000.0,333290000.0,12236000.0,2651400000.0,7921100000.0,392990000.0,679550000.0, 
+2020_02_12_05_06_Q-Exactive-HF-X-Orbitrap_6070,450780000.0,682520000.0,227000000.0,500360000.0,7111400000.0,18583000.0,514830000.0,779080000.0,368470000.0,106720000.0,,,673600000.0,87162000.0,22328000.0,262680000.0,86396000.0,285650000.0,28105000000.0,373420000.0,865860000.0,313450000.0,57939000000.0,4393200000.0,4287800000.0,220080000.0,79998000.0,601130000.0,831210000.0,171750000.0,8675000000.0,3325400000.0,11154000000.0,7391900000.0,18513000000.0,353490000.0,1651600000.0,22612000.0,6443800000.0,32768000000.0,726910000.0,323620000.0,12312000000.0,,,677250000.0,108360000.0,3942700000.0,18973000000.0,21148000000.0,3682700000.0,1566600000.0,39166000.0,784850000.0,2283100000.0,174510000.0,1037300000.0,518630000.0,6042000000.0,685720000.0,969360000.0,21179000000.0,79769000.0,152440000.0,,649370000.0,1187300000.0,677220000.0,2155700000.0,477590000.0,289860000.0,448280000.0,606370000.0,9936000000.0,146550000.0,434650000.0,,223990000.0,,69943000.0,1157000000.0,42709000.0,3213700000.0,125050000.0,82437000000.0,73055000.0,257590000.0,243710000.0,1124300000.0,136210000.0,712580000.0,426590000.0,,130900000.0,3372200000.0,,1386800000.0,457470000.0,1374100000.0,260560000.0,19077000.0,,3019500000.0,3294600000.0,10903000000.0,158400000.0,621280000.0,3091000000.0,3120600000.0,142510000.0,4449700000.0,216340000.0,2175200000.0,6560000000.0,181190000.0,47592000.0,625750000.0,33012000000.0,123320000.0,188080000.0,223000000000.0,2215400000.0,7011100000.0,3682300000.0,1150300000.0,33063000.0,83182000.0,1359400000.0,792690000.0,3479600000.0,1852100000.0,2226800000.0,114840000.0,,147250000.0,213860000.0,174370000.0,193620000.0,146040000.0,198110000.0,1719100000.0,537710000.0,3314700000.0,406270000.0,29929000.0,1377000000.0,229410000.0,160880000.0,,416160000.0,992030000.0,223050000.0,58860000000.0,212320000.0,1555500000.0,483470000.0,26275000.0,1172000000.0,1791000000.0,257640000.0,440950000.0,2025800000.0,2649900000.0,116260000.0,,468510000.0,632170000.0,129680000.0,102410000.0,592070000
.0,664490000.0,511090000.0,1435700000.0,354190000.0,599060000.0,1542700000.0,751790000.0,8030100000.0,2139700000.0,587830000.0,2449900000.0,298970000.0,68659000.0,175870000.0,2698500000.0,26027000000.0,68662000.0,8501900000.0,393850000.0,497380000.0,784660000.0,530000000.0,47301000000.0,,,379560000.0,198130000.0,1412600000.0,238260000.0,,588980000.0,5317200000.0,170810000.0,1650600000.0,276820000.0,627660000.0,459480000.0,299650000.0,71065000.0,1668800000.0,10396000000.0,2513200000.0,831670000.0,149350000.0,531130000.0,90726000.0,665130000.0,186200000.0,2837500000.0,2164500000.0,300370000.0,13873000.0,3463700000.0,9590200000.0,296930000.0,922710000.0, +2020_02_13_00_26_Q-Exactive-HF-X-Orbitrap_6070,1041500000.0,902880000.0,270090000.0,290030000.0,9503100000.0,53256000.0,393800000.0,771330000.0,475310000.0,204790000.0,,,498960000.0,123780000.0,99065000.0,330560000.0,97155000.0,403060000.0,30369000000.0,521050000.0,1160000000.0,292150000.0,64983000000.0,4299200000.0,4267100000.0,332930000.0,210960000.0,662420000.0,698720000.0,254110000.0,10249000000.0,3683600000.0,10855000000.0,8426500000.0,18170000000.0,250020000.0,1754600000.0,66713000.0,6108900000.0,35781000000.0,756200000.0,212800000.0,12472000000.0,,24667000.0,844900000.0,98563000.0,4555500000.0,17667000000.0,23057000000.0,5375300000.0,1513100000.0,37861000.0,796630000.0,2023800000.0,130330000.0,1016500000.0,474920000.0,7590100000.0,861850000.0,1123200000.0,20688000000.0,94792000.0,114610000.0,,599910000.0,1554100000.0,792460000.0,2457900000.0,289600000.0,352400000.0,334780000.0,1403400000.0,10714000000.0,159800000.0,475370000.0,,227240000.0,114680000.0,101560000.0,1000800000.0,,5990700000.0,24047000.0,91950000000.0,62232000.0,332890000.0,403220000.0,1306900000.0,143080000.0,825880000.0,423620000.0,,121580000.0,3076900000.0,,1223700000.0,486870000.0,1942500000.0,373200000.0,177280000.0,,3003000000.0,4010200000.0,12248000000.0,200850000.0,1187000000.0,2755500000.0,3045100000.0,227620000.0,4951100000.0,307500000.0,
2333200000.0,6555700000.0,,47282000.0,638400000.0,30255000000.0,329560000.0,227080000.0,204600000000.0,2215600000.0,1235900000.0,3393000000.0,1210000000.0,70738000.0,73448000.0,913540000.0,1100700000.0,3403400000.0,1579300000.0,2058700000.0,194940000.0,463060000.0,170490000.0,1178600000.0,317780000.0,285110000.0,160610000.0,256190000.0,1712700000.0,513760000.0,3452200000.0,340730000.0,52873000.0,1493200000.0,70366000.0,28695000.0,,386340000.0,1276800000.0,205940000.0,62731000000.0,160570000.0,2220200000.0,473030000.0,21658000.0,1140500000.0,1762500000.0,54372000.0,516730000.0,1508100000.0,2498100000.0,106950000.0,64163000.0,258650000.0,799340000.0,45702000.0,42986000.0,328300000.0,744980000.0,819020000.0,1463300000.0,151610000.0,668940000.0,1470500000.0,682130000.0,7549700000.0,3005800000.0,586080000.0,4112100000.0,238110000.0,66589000.0,63438000.0,2420400000.0,26783000000.0,95652000.0,8692900000.0,730310000.0,580050000.0,506770000.0,509890000.0,44546000000.0,43818000.0,,535980000.0,401950000.0,1297600000.0,366840000.0,37699000.0,477020000.0,7275500000.0,90458000.0,1578000000.0,261850000.0,529160000.0,352780000.0,368040000.0,85839000.0,2068800000.0,9203800000.0,2740800000.0,792020000.0,87657000.0,442900000.0,176700000.0,827140000.0,119350000.0,2629900000.0,2563300000.0,,,3188700000.0,10287000000.0,461560000.0,913120000.0, 
+2020_02_13_03_11_Q-Exactive-HF-X-Orbitrap_6070,1468600000.0,934150000.0,295680000.0,259220000.0,11459000000.0,74892000.0,753790000.0,826130000.0,401990000.0,172630000.0,,,618570000.0,129930000.0,61961000.0,320560000.0,61405000.0,658080000.0,29370000000.0,447250000.0,1084900000.0,429430000.0,67090000000.0,4791800000.0,4082300000.0,482180000.0,158730000.0,651920000.0,763800000.0,209150000.0,9448000000.0,3899600000.0,11854000000.0,8734200000.0,18469000000.0,285660000.0,2483300000.0,48674000.0,7223400000.0,36982000000.0,896970000.0,204550000.0,,,24552000.0,898510000.0,74508000.0,4975700000.0,18757000000.0,25922000000.0,4247300000.0,1248600000.0,76238000.0,836580000.0,2603100000.0,206200000.0,437540000.0,855840000.0,8419700000.0,1132500000.0,949560000.0,21292000000.0,,121070000.0,,917590000.0,1599400000.0,856570000.0,2433400000.0,429680000.0,366900000.0,348230000.0,1504800000.0,12921000000.0,167910000.0,731840000.0,,236340000.0,79862000.0,152290000.0,1124300000.0,124330000.0,6196700000.0,111650000.0,90899000000.0,29077000.0,292230000.0,468070000.0,1668000000.0,150910000.0,885440000.0,405900000.0,,126140000.0,2933700000.0,,1478700000.0,439920000.0,1934300000.0,409180000.0,182170000.0,,3308100000.0,4522300000.0,11246000000.0,215810000.0,1095000000.0,3589800000.0,3874600000.0,177310000.0,6364700000.0,280000000.0,1716000000.0,6884700000.0,,,526740000.0,34348000000.0,283310000.0,121810000.0,195970000000.0,2208700000.0,1213500000.0,4698600000.0,1077000000.0,70748000.0,244750000.0,982280000.0,1146500000.0,4017800000.0,1482900000.0,2292400000.0,157390000.0,550620000.0,240340000.0,170340000.0,319630000.0,342180000.0,141490000.0,590630000.0,1736300000.0,543360000.0,3803700000.0,460030000.0,,1557300000.0,198840000.0,37496000.0,,521260000.0,1408800000.0,100170000.0,73145000000.0,109000000.0,2409500000.0,486980000.0,24537000.0,977500000.0,1776100000.0,79993000.0,525360000.0,1613200000.0,2752200000.0,128660000.0,,521560000.0,754030000.0,,62388000.0,664090000.0,896300000.0,822690000.0
,1438000000.0,,643840000.0,1562700000.0,833840000.0,8073300000.0,2550300000.0,479830000.0,4638100000.0,239280000.0,49448000.0,202990000.0,2701800000.0,33141000000.0,231260000.0,11072000000.0,711680000.0,683090000.0,561210000.0,550700000.0,46757000000.0,123700000.0,1788300000.0,698250000.0,66786000.0,1375500000.0,368360000.0,37906000.0,503740000.0,6978300000.0,113800000.0,1875100000.0,229740000.0,526680000.0,456500000.0,442620000.0,207400000.0,2173800000.0,10852000000.0,2651400000.0,678710000.0,80471000.0,566790000.0,198940000.0,360870000.0,147430000.0,2662700000.0,2807500000.0,,,3072100000.0,9918100000.0,462810000.0,1415300000.0, +2020_02_17_13_55_Q-Exactive-HF-X-Orbitrap_6070,364000000.0,667470000.0,100410000.0,191080000.0,6007800000.0,,364240000.0,494930000.0,233010000.0,79527000.0,,,110580000.0,81507000.0,,233430000.0,31914000.0,290860000.0,15842000000.0,172270000.0,289220000.0,218580000.0,33250000000.0,1786600000.0,2353700000.0,146170000.0,,262680000.0,537510000.0,28223000.0,4584300000.0,1347800000.0,5178800000.0,3335000000.0,10161000000.0,172530000.0,962840000.0,22397000.0,3366100000.0,15105000000.0,253340000.0,149850000.0,,,,558710000.0,51598000.0,2280600000.0,12867000000.0,11168000000.0,1672100000.0,1020200000.0,,1489700000.0,1276700000.0,127710000.0,146600000.0,158420000.0,3510900000.0,1157900000.0,425430000.0,10574000000.0,40009000.0,139800000.0,,392940000.0,1351200000.0,469700000.0,1225700000.0,164140000.0,138430000.0,186910000.0,398200000.0,6215200000.0,142970000.0,302890000.0,2714900000.0,56659000.0,27604000.0,43746000.0,540960000.0,64316000.0,2472800000.0,91748000.0,66760000000.0,37910000.0,91322000.0,102520000.0,420240000.0,196430000.0,669440000.0,96380000.0,,110090000.0,2485500000.0,,509110000.0,110170000.0,629790000.0,55194000.0,29724000.0,,1872100000.0,2062100000.0,6008800000.0,152870000.0,357780000.0,2172700000.0,1381000000.0,37951000.0,2902500000.0,189990000.0,736580000.0,3246500000.0,100890000.0,43295000.0,283550000.0,18593000000.0,158460000.0,10
2360000.0,126380000000.0,107160000.0,495890000.0,2414900000.0,345890000.0,,48030000.0,834490000.0,674480000.0,1709400000.0,856490000.0,1067800000.0,110980000.0,,61864000.0,197050000.0,203440000.0,184140000.0,111770000.0,94233000.0,844550000.0,421820000.0,1720000000.0,242260000.0,69108000.0,700960000.0,139100000.0,98188000.0,,273750000.0,1095100000.0,,39680000000.0,121190000.0,337760000.0,290920000.0,,497610000.0,985180000.0,203360000.0,57585000.0,562370000.0,977360000.0,,,157520000.0,350840000.0,16972000.0,83788000.0,204700000.0,481090000.0,269090000.0,709710000.0,64597000.0,330200000.0,837290000.0,394390000.0,3911200000.0,1363500000.0,268270000.0,2276000000.0,148030000.0,70231000.0,98767000.0,1748700000.0,15677000000.0,25322000.0,4890400000.0,296670000.0,340790000.0,274110000.0,231210000.0,,31300000.0,1007700000.0,260580000.0,124000000.0,426230000.0,141880000.0,,262120000.0,3801400000.0,112530000.0,862170000.0,489020000.0,193870000.0,294470000.0,181550000.0,74049000.0,985040000.0,4490600000.0,1016600000.0,415910000.0,54194000.0,236280000.0,35435000.0,392250000.0,129880000.0,1770100000.0,3997200000.0,,,1573700000.0,5197500000.0,231530000.0,443610000.0, 
+2020_02_18_01_25_Q-Exactive-HF-X-Orbitrap_6070,395730000.0,470640000.0,55705000.0,90078000.0,6544800000.0,11219000.0,156630000.0,321380000.0,169570000.0,51004000.0,,,162600000.0,77042000.0,23294000.0,235520000.0,12607000.0,237440000.0,12932000000.0,50739000.0,209880000.0,107420000.0,33673000000.0,1477600000.0,1569000000.0,87977000.0,79464000.0,305830000.0,385210000.0,99234000.0,4629700000.0,1538700000.0,4505900000.0,3414400000.0,8510700000.0,184670000.0,984460000.0,10268000.0,2071400000.0,12122000000.0,2548700000.0,133470000.0,8536600000.0,,,325890000.0,70444000.0,2404400000.0,11815000000.0,10228000000.0,2270500000.0,746810000.0,36460000.0,672440000.0,1202500000.0,132080000.0,283280000.0,156850000.0,3388400000.0,657230000.0,325670000.0,7006200000.0,72887000.0,54039000.0,,318350000.0,1245400000.0,679300000.0,795010000.0,137360000.0,89905000.0,184770000.0,356600000.0,6363300000.0,97904000.0,223650000.0,3295200000.0,77535000.0,12068000.0,39103000.0,459520000.0,27280000.0,1846500000.0,30887000.0,68589000000.0,65196000.0,110370000.0,52926000.0,365650000.0,153800000.0,321290000.0,169500000.0,,57286000.0,2204700000.0,,612740000.0,45519000.0,717550000.0,56765000.0,60975000.0,,1264400000.0,2047900000.0,5909100000.0,117650000.0,318820000.0,1551900000.0,1883600000.0,57233000.0,2438000000.0,96642000.0,502470000.0,2310700000.0,86819000.0,17010000.0,418820000.0,19670000000.0,95041000.0,58943000.0,112570000000.0,24882000.0,215480000.0,1934500000.0,588730000.0,,89549000.0,639250000.0,615390000.0,1352700000.0,624750000.0,1120900000.0,77616000.0,,72322000.0,164470000.0,134060000.0,178880000.0,76217000.0,94352000.0,641300000.0,302920000.0,2255300000.0,117840000.0,66794000.0,404980000.0,45285000.0,206920000.0,,172940000.0,858190000.0,9997400.0,48650000000.0,73403000.0,491760000.0,172550000.0,,333740000.0,765240000.0,59824000.0,72339000.0,986290000.0,1062800000.0,108270000.0,,249430000.0,426000000.0,,68541000.0,266330000.0,428760000.0,190170000.0,475130000.0,,439820000.0,858910000.0,24
5200000.0,3620200000.0,1284300000.0,326560000.0,2072300000.0,104860000.0,38953000.0,54976000.0,863720000.0,13900000000.0,64008000.0,3811800000.0,203130000.0,243440000.0,173460000.0,216230000.0,60003000.0,14656000.0,729090000.0,310040000.0,81696000.0,420030000.0,165460000.0,,144810000.0,3812500000.0,59677000.0,470860000.0,459790000.0,248610000.0,195670000.0,79547000.0,31468000.0,565920000.0,6101700000.0,1353900000.0,197040000.0,77014000.0,284210000.0,,387490000.0,8028900.0,1310200000.0,1437500000.0,,5806400.0,920040000.0,4568400000.0,89585000.0,370030000.0, +2020_02_18_18_55_Q-Exactive-HF-X-Orbitrap_6070,150040000.0,427090000.0,75026000.0,469400000.0,4240600000.0,,366120000.0,377700000.0,220260000.0,74749000.0,,720530000.0,315230000.0,,40253000.0,298960000.0,7153800.0,156220000.0,14616000000.0,304320000.0,185980000.0,80885000.0,38552000000.0,1537300000.0,1662600000.0,124530000.0,80184000.0,190790000.0,415430000.0,67911000.0,2942800000.0,1162600000.0,5194400000.0,2912700000.0,11052000000.0,120700000.0,739190000.0,,2196500000.0,13661000000.0,318890000.0,172600000.0,,,,233640000.0,18736000.0,2734400000.0,9338300000.0,11160000000.0,1316400000.0,765210000.0,23843000.0,1248500000.0,1045900000.0,82020000.0,717780000.0,116850000.0,2886900000.0,647060000.0,530150000.0,8857900000.0,25490000.0,110630000.0,,448630000.0,541030000.0,369970000.0,1002400000.0,154920000.0,198750000.0,249540000.0,1202600000.0,6459600000.0,158980000.0,259870000.0,2819800000.0,73638000.0,,,541850000.0,53064000.0,2498100000.0,29103000.0,62649000000.0,9553500.0,257350000.0,214570000.0,593330000.0,,410570000.0,150050000.0,,51342000.0,824800000.0,,410820000.0,124290000.0,1019200000.0,54179000.0,66272000.0,,1446700000.0,2524800000.0,6546300000.0,85165000.0,431880000.0,1344000000.0,1412100000.0,104640000.0,1644000000.0,135380000.0,974140000.0,2200400000.0,94993000.0,15723000.0,255360000.0,22132000000.0,79592000.0,81574000.0,107500000000.0,1096000000.0,3278300000.0,1587900000.0,692330000.0,,81493000.0,83359000
0.0,321530000.0,2058600000.0,793540000.0,660610000.0,187970000.0,46109000.0,63345000.0,136250000.0,124900000.0,89041000.0,11968000.0,54465000.0,660960000.0,384020000.0,1416200000.0,97257000.0,19883000.0,659200000.0,79733000.0,93126000.0,,237100000.0,848210000.0,72448000.0,41106000000.0,15379000.0,1001000000.0,238630000.0,,609450000.0,957390000.0,188460000.0,68775000.0,552810000.0,1072900000.0,33698000.0,22122000.0,174110000.0,174430000.0,43706000.0,88369000.0,335110000.0,462050000.0,421720000.0,608100000.0,,307310000.0,855960000.0,341840000.0,3720500000.0,841350000.0,275070000.0,2261500000.0,49882000.0,17728000.0,73029000.0,979100000.0,13400000000.0,14945000.0,4407900000.0,308260000.0,273540000.0,238680000.0,108720000.0,14025000.0,14410000.0,,363080000.0,142310000.0,422690000.0,127330000.0,,186950000.0,2464400000.0,74272000.0,846620000.0,180170000.0,352820000.0,232870000.0,192650000.0,43267000.0,490260000.0,4487800000.0,1368700000.0,267270000.0,59815000.0,294840000.0,,478840000.0,77570000.0,1593600000.0,1582300000.0,129360000.0,5174200.0,1412600000.0,5227100000.0,268240000.0,275830000.0, 
+2020_02_28_12_27_Q-Exactive-HF-X-Orbitrap_6070,393350000.0,317600000.0,39159000.0,124950000.0,5263000000.0,,227750000.0,266560000.0,151900000.0,104580000.0,,,111260000.0,9816800.0,67781000.0,110580000.0,23119000.0,211120000.0,8519700000.0,270290000.0,232440000.0,26287000.0,27783000000.0,1669600000.0,1558200000.0,123210000.0,22127000.0,195990000.0,307230000.0,78724000.0,3682800000.0,1127100000.0,3911600000.0,2979800000.0,7046000000.0,89480000.0,359790000.0,16017000.0,1599100000.0,9651500000.0,453220000.0,80724000.0,8109500000.0,,16683000.0,189450000.0,11407000.0,1619600000.0,7665100000.0,8234600000.0,1830700000.0,418980000.0,27219000.0,301310000.0,899450000.0,58173000.0,511920000.0,146860000.0,2273000000.0,242900000.0,309890000.0,9590700000.0,44497000.0,55022000.0,,303080000.0,365780000.0,230290000.0,774060000.0,130870000.0,122500000.0,163120000.0,145160000.0,3949700000.0,93879000.0,92954000.0,,19614000.0,24742000.0,38087000.0,542680000.0,,1433600000.0,30443000.0,42203000000.0,7631500.0,130850000.0,142850000.0,317110000.0,51786000.0,413970000.0,95848000.0,,147940000.0,2503000000.0,,371300000.0,108900000.0,491650000.0,76338000.0,47006000.0,,1586200000.0,1726900000.0,3946400000.0,52652000.0,330010000.0,698110000.0,1047200000.0,1422400000.0,2021200000.0,150030000.0,587380000.0,2133400000.0,142180000.0,7590800.0,185630000.0,13723000000.0,190090000.0,38798000.0,75465000000.0,878380000.0,323780000.0,1407300000.0,345210000.0,12578000.0,49170000.0,428380000.0,340190000.0,1437400000.0,584830000.0,397200000.0,60769000.0,141300000.0,19808000.0,88389000.0,117660000.0,113080000.0,21889000.0,31115000.0,795940000.0,260570000.0,1761000000.0,94199000.0,,478610000.0,56123000.0,162920000.0,,244080000.0,429180000.0,77216000.0,27634000000.0,,701980000.0,131280000.0,6702700.0,358560000.0,583340000.0,13511000.0,245000000.0,415620000.0,792820000.0,50209000.0,16994000.0,156720000.0,393540000.0,33453000.0,29665000.0,65632000.0,277280000.0,255930000.0,479260000.0,69473000.0,291890000.0,574990
000.0,206920000.0,2398800000.0,665340000.0,223020000.0,1510300000.0,120030000.0,12080000.0,61002000.0,1483600000.0,13058000000.0,26368000.0,3118400000.0,191030000.0,178520000.0,273420000.0,106890000.0,,23202000.0,876670000.0,168320000.0,,544680000.0,73211000.0,13353000.0,196040000.0,2538600000.0,100490000.0,569120000.0,207080000.0,195810000.0,175500000.0,145850000.0,37781000.0,482740000.0,4724500000.0,1123000000.0,183280000.0,49010000.0,169490000.0,34343000.0,178450000.0,100870000.0,1063600000.0,1074600000.0,109100000.0,15816000.0,1006200000.0,2807500000.0,152980000.0,277400000.0,1293500000.0 +2020_03_01_23_00_Q-Exactive-HF-X-Orbitrap_6070,254000000.0,160870000.0,12829000.0,82890000.0,3964800000.0,,206280000.0,229560000.0,92106000.0,44135000.0,,,123100000.0,7514200.0,24990000.0,118950000.0,14267000.0,81948000.0,7683400000.0,147320000.0,169560000.0,46054000.0,22858000000.0,844180000.0,1182200000.0,61914000.0,48975000.0,152420000.0,260900000.0,75693000.0,3373200000.0,1222900000.0,2751800000.0,2053800000.0,5044200000.0,93653000.0,491810000.0,,1321000000.0,7425200000.0,170770000.0,,524610000.0,,,124620000.0,10374000.0,1361100000.0,4897300000.0,6132600000.0,1183600000.0,262960000.0,11781000.0,205410000.0,639440000.0,42992000.0,320510000.0,98878000.0,1753800000.0,414050000.0,221830000.0,4943900000.0,24318000.0,24986000.0,,213210000.0,493850000.0,159520000.0,503510000.0,64754000.0,137040000.0,92111000.0,165380000.0,3434500000.0,77301000.0,172770000.0,,39385000.0,10261000.0,39551000.0,306050000.0,,1485900000.0,24440000.0,50712000000.0,30999000.0,136490000.0,119540000.0,625680000.0,41688000.0,239590000.0,71911000.0,,62514000.0,2036200000.0,,361880000.0,60070000.0,223480000.0,65660000.0,44999000.0,,984630000.0,1185400000.0,3048500000.0,39361000.0,165250000.0,919650000.0,878780000.0,48727000.0,1649900000.0,49000000.0,478490000.0,1377100000.0,73937000.0,,123430000.0,10024000000.0,34010000.0,19308000.0,77565000000.0,39409000.0,2051300000.0,1265000000.0,372070000.0,18643000.0,101
79000.0,388180000.0,284290000.0,953270000.0,364730000.0,606640000.0,53958000.0,157780000.0,38352000.0,80149000.0,51216000.0,19894000.0,,218560000.0,555790000.0,135440000.0,1002000000.0,76008000.0,,246080000.0,48933000.0,52054000.0,,92027000.0,354270000.0,65043000.0,23956000000.0,47473000.0,494620000.0,86458000.0,4941400.0,313860000.0,428900000.0,17360000.0,40068000.0,328880000.0,562400000.0,76307000.0,12360000.0,181830000.0,246050000.0,,20766000.0,117480000.0,272700000.0,168760000.0,306340000.0,,241950000.0,633430000.0,54803000.0,1948800000.0,312750000.0,102020000.0,1874100000.0,45966000.0,13324000.0,28004000.0,913950000.0,9204500000.0,3733200.0,3533400000.0,158780000.0,115510000.0,129500000.0,60808000.0,,7347100.0,416590000.0,158350000.0,67285000.0,335420000.0,35030000.0,,155010000.0,1783200000.0,30638000.0,467270000.0,71001000.0,92886000.0,132960000.0,77110000.0,33625000.0,284570000.0,2839800000.0,804650000.0,176630000.0,25163000.0,197390000.0,31923000.0,150480000.0,52581000.0,775140000.0,1021000000.0,27330000.0,,778570000.0,1583400000.0,81597000.0,139570000.0, 
+2020_03_06_16_22_Q-Exactive-HF-X-Orbitrap_6070,69046000.0,,15836000.0,,3232500000.0,46193000.0,60117000.0,88369000.0,18852000.0,86020000.0,95904000.0,423210000.0,38740000.0,44822000.0,,76412000.0,,11856000.0,3738900000.0,86473000.0,95468000.0,,22287000000.0,,467820000.0,,,13451000.0,31013000.0,24185000.0,1097000000.0,386150000.0,2520600000.0,818580000.0,4232300000.0,,671680000.0,,933250000.0,5132300000.0,134820000.0,49721000.0,,,8183900.0,,10786000.0,1319200000.0,5170500000.0,4210200000.0,1214300000.0,64766000.0,49129000.0,1864700000.0,347960000.0,15477000.0,127190000.0,34830000.0,1338300000.0,334570000.0,49472000.0,3357200000.0,18927000.0,35395000.0,,320070000.0,178390000.0,1183900000.0,435560000.0,16887000.0,,,43573000.0,2360100000.0,14041000.0,267620000.0,,21454000.0,5162000.0,125210000.0,159630000.0,,520270000.0,31030000.0,57883000000.0,,86781000.0,82409000.0,108480000.0,,49560000.0,22549000.0,47727000.0,,2851500000.0,1016500000.0,249460000.0,,196880000.0,4333600.0,5465700.0,204310000.0,710450000.0,508600000.0,1537700000.0,,59439000.0,199760000.0,608630000.0,22815000.0,840260000.0,39604000.0,484940000.0,598980000.0,188270000.0,25990000.0,13176000.0,7439400000.0,37050000.0,,59611000000.0,22940000.0,1557200000.0,775700000.0,213920000.0,1913300.0,,251470000.0,513220000.0,591140000.0,91457000.0,201490000.0,37445000.0,75997000.0,73890000.0,108240000.0,,,,8822600.0,143590000.0,344190000.0,2525900000.0,14661000.0,45924000.0,46659000.0,3796900.0,64538000.0,,8922100.0,250290000.0,8295100.0,31645000000.0,,406070000.0,62480000.0,,282640000.0,165000000.0,,,343260000.0,56562000.0,38291000.0,10628000.0,96195000.0,84355000.0,20808000.0,3766900.0,31788000.0,723590000.0,88885000.0,138380000.0,105370000.0,116100000.0,67043000.0,,1186800000.0,98956000.0,8177600.0,912190000.0,29330000.0,30566000.0,62198000.0,810960000.0,8062500000.0,,796190000.0,223920000.0,84717000.0,,,,,213460000.0,320750000.0,,816210000.0,,6966000.0,90157000.0,557500000.0,58393000.0,29610000.0,44392000.0,125850
000.0,14237000.0,,,498590000.0,2802000000.0,901440000.0,277990000.0,,38343000.0,4467700.0,123050000.0,7484000.0,778720000.0,271310000.0,33226000.0,,237130000.0,957130000.0,4076200.0,25235000.0,8977500000.0 +2020_03_07_18_15_Q-Exactive-HF-X-Orbitrap_6070,115810000.0,,37601000.0,33966000.0,7614200000.0,133580000.0,229630000.0,96411000.0,168290000.0,269040000.0,235370000.0,,114180000.0,105920000.0,16303000.0,238750000.0,13480000.0,200480000.0,7978400000.0,187840000.0,406640000.0,,41904000000.0,179570000.0,987720000.0,31285000.0,,60022000.0,233650000.0,54902000.0,2747000000.0,844420000.0,5762000000.0,1711900000.0,9128700000.0,16551000.0,1514400000.0,,2219800000.0,9325700000.0,197860000.0,75107000.0,,227800000.0,,465370000.0,81507000.0,2722900000.0,11415000000.0,10427000000.0,2417500000.0,290040000.0,196130000.0,5048000000.0,967850000.0,34290000.0,490770000.0,105630000.0,2362400000.0,994780000.0,120200000.0,8805600000.0,31962000.0,82662000.0,27635000.0,545850000.0,441560000.0,2433500000.0,1335900000.0,57724000.0,,,305980000.0,5901700000.0,95388000.0,431130000.0,,65834000.0,11785000.0,320350000.0,354530000.0,,1109700000.0,15665000.0,97411000000.0,18435000.0,138690000.0,287940000.0,292470000.0,38072000.0,128750000.0,35070000.0,143110000.0,52352000.0,3916900000.0,,502390000.0,,465580000.0,23338000.0,34234000.0,290700000.0,1293400000.0,1076500000.0,3494900000.0,,234050000.0,613030000.0,1114800000.0,52075000.0,1528200000.0,186100000.0,1106900000.0,1453700000.0,476780000.0,55155000.0,50490000.0,20582000000.0,119180000.0,,134230000000.0,,866760000.0,1710500000.0,666480000.0,4377800.0,,543990000.0,1013400000.0,1157000000.0,276070000.0,514130000.0,103350000.0,265690000.0,164770000.0,262280000.0,,,,58906000.0,577910000.0,235820000.0,4332100000.0,43364000.0,157200000.0,189970000.0,,168530000.0,51380000.0,54014000.0,561080000.0,21661000.0,69200000000.0,,1170100000.0,172160000.0,37380000.0,782370000.0,454160000.0,23754000.0,,1028500000.0,246580000.0,45213000.0,17879000.0,162990000.0,
168880000.0,,16758000.0,223970000.0,2131900000.0,232130000.0,338650000.0,271110000.0,364710000.0,143450000.0,,1477300000.0,273400000.0,,2154000000.0,80150000.0,48207000.0,95575000.0,2789700000.0,21213000000.0,,1562300000.0,460550000.0,272520000.0,12376000.0,,,,690540000.0,580630000.0,,1469800000.0,,30506000.0,284940000.0,1329200000.0,112080000.0,92096000.0,10766000.0,398820000.0,74719000.0,23619000.0,36817000.0,1238400000.0,7376600000.0,1980900000.0,603210000.0,10358000.0,156570000.0,45056000.0,242510000.0,31654000.0,1706500000.0,447090000.0,,,535710000.0,2019400000.0,38743000.0,60502000.0,20664000000.0 +2020_03_11_11_25_Q-Exactive-HF-X-Orbitrap_6070,503520000.0,340780000.0,78635000.0,164410000.0,5800300000.0,,262960000.0,319910000.0,240030000.0,78560000.0,,,219780000.0,15993000.0,23526000.0,358490000.0,7539800.0,120030000.0,12309000000.0,132310000.0,150390000.0,124390000.0,29590000000.0,1517500000.0,1851500000.0,160390000.0,85294000.0,185050000.0,312870000.0,123620000.0,3858400000.0,1502200000.0,4907500000.0,3080800000.0,9977700000.0,86945000.0,1093400000.0,26155000.0,3124800000.0,11222000000.0,470020000.0,131140000.0,4505200000.0,,,336300000.0,73669000.0,2514000000.0,9318900000.0,10692000000.0,1791700000.0,589980000.0,31801000.0,926260000.0,1172000000.0,88879000.0,561700000.0,190570000.0,2525400000.0,612560000.0,438570000.0,8413400000.0,62068000.0,106990000.0,,309890000.0,554450000.0,321770000.0,967950000.0,150180000.0,186140000.0,238590000.0,193750000.0,5254700000.0,155820000.0,249350000.0,2317900000.0,76822000.0,,42342000.0,551190000.0,,1419800000.0,53094000.0,57352000000.0,83483000.0,217760000.0,220140000.0,366360000.0,,605630000.0,175970000.0,,77031000.0,3363600000.0,,582960000.0,133060000.0,696810000.0,92512000.0,37551000.0,,1542200000.0,2045800000.0,5760600000.0,102140000.0,650400000.0,1374800000.0,1532400000.0,187480000.0,1817400000.0,190680000.0,870080000.0,2728500000.0,75778000.0,7472000.0,289430000.0,17411000000.0,52203000.0,15837000.0,137060000000.0,,32
84000000.0,1818200000.0,523460000.0,,78337000.0,175590000.0,519860000.0,1776200000.0,992800000.0,787110000.0,132340000.0,,51623000.0,132280000.0,175100000.0,118260000.0,35614000.0,46581000.0,1216200000.0,321880000.0,1228100000.0,56973000.0,,627170000.0,125400000.0,67717000.0,,291900000.0,778600000.0,75479000.0,34966000000.0,112530000.0,431510000.0,226080000.0,6819400.0,576620000.0,704940000.0,29165000.0,245310000.0,545780000.0,1076900000.0,91021000.0,,74646000.0,389740000.0,57763000.0,63945000.0,197040000.0,437060000.0,428080000.0,591120000.0,,327930000.0,760010000.0,296460000.0,3698600000.0,895170000.0,189610000.0,2298800000.0,185400000.0,25930000.0,85100000.0,1664200000.0,13867000000.0,53701000.0,3824000000.0,292150000.0,243510000.0,283120000.0,100000000.0,,,897880000.0,238910000.0,26906000.0,284050000.0,150960000.0,,114110000.0,2156100000.0,58049000.0,631110000.0,173810000.0,246550000.0,180830000.0,296880000.0,49123000.0,654430000.0,6075000000.0,909440000.0,346620000.0,68808000.0,288950000.0,79938000.0,355610000.0,34562000.0,1131400000.0,1434100000.0,104800000.0,,1474100000.0,4537700000.0,124290000.0,413620000.0, 
+2020_05_04_11_39_Q-Exactive-HF-X-Orbitrap_6070,329950000.0,,28663000.0,55023000.0,3013600000.0,,62015000.0,207210000.0,94613000.0,17355000.0,,,80365000.0,47265000.0,25970000.0,107070000.0,,70096000.0,5487300000.0,79429000.0,86162000.0,61651000.0,17890000000.0,719990000.0,823280000.0,47201000.0,21540000.0,67186000.0,156220000.0,40991000.0,2122100000.0,5010400000.0,2008900000.0,1518400000.0,3251300000.0,75918000.0,495580000.0,3215300.0,1108500000.0,5866600000.0,205250000.0,77331000.0,4981600000.0,,,136240000.0,22949000.0,785990000.0,4768200000.0,4093600000.0,1592600000.0,204730000.0,4683100.0,157860000.0,545780000.0,44577000.0,343030000.0,75899000.0,1532700000.0,104520000.0,192660000.0,5610600000.0,21709000.0,20871000.0,,144750000.0,368910000.0,131030000.0,362110000.0,34113000.0,80111000.0,45346000.0,118040000.0,2234900000.0,48321000.0,76314000.0,1151000000.0,24830000.0,6362800.0,23890000.0,183560000.0,2553200.0,1039800000.0,27094000.0,37138000000.0,27117000.0,63863000.0,94778000.0,136040000.0,18613000.0,212070000.0,59424000.0,,16623000.0,809210000.0,,169440000.0,49663000.0,255230000.0,43979000.0,23781000.0,,947200000.0,914350000.0,2157000000.0,63885000.0,203100000.0,777920000.0,773320000.0,25440000.0,828600000.0,80937000.0,269510000.0,1077000000.0,63695000.0,,48002000.0,8169900000.0,25506000.0,16706000.0,49587000000.0,12429000.0,1424400000.0,714770000.0,146200000.0,,5033700.0,291770000.0,249650000.0,596000000.0,258020000.0,325580000.0,6247200.0,75911000.0,,32579000.0,54650000.0,22535000.0,57064000.0,33852000.0,228200000.0,124980000.0,383290000.0,103220000.0,6506700.0,214720000.0,27370000.0,81791000.0,,80899000.0,309160000.0,29338000.0,16505000000.0,51635000.0,423620000.0,65038000.0,2156700.0,447160000.0,311410000.0,19279000.0,67223000.0,293950000.0,346140000.0,51441000.0,,91513000.0,148930000.0,34040000.0,27016000.0,106940000.0,204180000.0,252170000.0,326140000.0,,157410000.0,322500000.0,90932000.0,1563000000.0,371010000.0,58395000.0,503830000.0,55953000.0,3535400.0
,37430000.0,681200000.0,7098800000.0,3711200.0,2752100000.0,224570000.0,67282000.0,179260000.0,56222000.0,,,346940000.0,61331000.0,,190640000.0,26394000.0,,51399000.0,1325700000.0,58461000.0,303890000.0,133150000.0,68649000.0,96703000.0,50121000.0,,262690000.0,2729000000.0,348270000.0,122490000.0,,98032000.0,,173750000.0,16415000.0,542430000.0,701340000.0,12276000.0,,419020000.0,1614800000.0,84082000.0,219380000.0, +2020_05_12_15_13_Q-Exactive-HF-X-Orbitrap_6070,225910000.0,140530000.0,50009000.0,44500000.0,3038500000.0,2950800.0,92824000.0,110190000.0,67126000.0,29835000.0,,288210000.0,57569000.0,8970200.0,6116800.0,90726000.0,,77820000.0,5504400000.0,40882000.0,81816000.0,51240000.0,16591000000.0,595730000.0,566750000.0,46741000.0,18343000.0,84145000.0,127430000.0,66070000.0,1686700000.0,428810000.0,2134700000.0,1625300000.0,3722100000.0,52478000.0,351300000.0,14574000.0,779870000.0,4748400000.0,131610000.0,58508000.0,147280000.0,,7880300.0,90333000.0,13465000.0,1164000000.0,4913300000.0,4326000000.0,1190800000.0,284020000.0,11352000.0,224750000.0,310800000.0,17473000.0,338340000.0,98436000.0,899080000.0,162650000.0,159390000.0,5350200000.0,11959000.0,24085000.0,,156940000.0,387630000.0,115170000.0,450070000.0,39547000.0,51489000.0,89361000.0,93472000.0,2450400000.0,58699000.0,148950000.0,1485400000.0,30117000.0,10882000.0,29272000.0,206740000.0,,742490000.0,25911000.0,33250000000.0,28549000.0,57735000.0,54417000.0,164820000.0,,177480000.0,52009000.0,,12620000.0,727470000.0,,176980000.0,49630000.0,334320000.0,50565000.0,8755700.0,,688220000.0,941260000.0,2110800000.0,40184000.0,123050000.0,702900000.0,645020000.0,11415000.0,1106700000.0,19020000.0,225690000.0,1146300000.0,63415000.0,7778500.0,75819000.0,8371300000.0,18088000.0,,53306000000.0,,225550000.0,851410000.0,430790000.0,,,317180000.0,98256000.0,637150000.0,270350000.0,124950000.0,8006200.0,87325000.0,13036000.0,67009000.0,77357000.0,31599000.0,81639000.0,21883000.0,205910000.0,159200000.0,544300000.0,10512
0000.0,1732100.0,213870000.0,16954000.0,79798000.0,,89147000.0,339330000.0,30427000.0,16954000000.0,,360820000.0,67991000.0,2681300.0,227840000.0,264270000.0,47204000.0,101340000.0,260140000.0,445530000.0,46541000.0,7886000.0,81636000.0,242450000.0,17909000.0,37712000.0,125790000.0,510100000.0,227640000.0,263910000.0,57074000.0,132840000.0,314740000.0,95930000.0,1387000000.0,308220000.0,66597000.0,865580000.0,63892000.0,4745700.0,21445000.0,791120000.0,7413600000.0,19096000.0,1569600000.0,170770000.0,56321000.0,89691000.0,45253000.0,,5491400.0,,97237000.0,3738700.0,154020000.0,26861000.0,,75658000.0,1203000000.0,55363000.0,219620000.0,64743000.0,110570000.0,85628000.0,45731000.0,15484000.0,192770000.0,2207300000.0,433510000.0,94699000.0,8715900.0,89704000.0,8257500.0,184200000.0,6837300.0,661030000.0,537620000.0,17079000.0,6615200.0,574630000.0,2033400000.0,98667000.0,214890000.0, +2020_05_12_18_10_Q-Exactive-HF-X-Orbitrap_6070,195040000.0,100680000.0,33609000.0,65033000.0,1710800000.0,4788300.0,84650000.0,65379000.0,82616000.0,28798000.0,,,66065000.0,,17074000.0,92943000.0,,75497000.0,4814500000.0,10796000.0,92451000.0,61057000.0,13574000000.0,348120000.0,669920000.0,40262000.0,28920000.0,41551000.0,124060000.0,38831000.0,1586300000.0,438550000.0,1634100000.0,1483700000.0,2967200000.0,31722000.0,254400000.0,8929700.0,953410000.0,3229800000.0,139470000.0,42210000.0,94993000.0,,6372800.0,91132000.0,,885040000.0,3681800000.0,3547800000.0,662800000.0,263420000.0,9532200.0,132150000.0,239070000.0,42753000.0,286700000.0,71963000.0,1241900000.0,117390000.0,101870000.0,3770800000.0,18175000.0,19005000.0,,131120000.0,349090000.0,87354000.0,364640000.0,21190000.0,78223000.0,51759000.0,85985000.0,2046400000.0,59213000.0,83249000.0,1218900000.0,14551000.0,9624300.0,10797000.0,221900000.0,,781530000.0,23969000.0,20978000000.0,9403700.0,41768000.0,39502000.0,149650000.0,,150590000.0,45054000.0,,19031000.0,979850000.0,,202640000.0,56262000.0,268050000.0,67313000.0,21282000.0,,541
610000.0,652940000.0,1653500000.0,28910000.0,115960000.0,587700000.0,570120000.0,19644000.0,717670000.0,68806000.0,263360000.0,904410000.0,69530000.0,4118700.0,47894000.0,6054100000.0,19297000.0,20931000.0,33337000000.0,,909010000.0,695460000.0,142610000.0,7843000.0,16115000.0,258800000.0,192990000.0,569250000.0,253930000.0,308430000.0,16373000.0,44389000.0,,45649000.0,44767000.0,49013000.0,39409000.0,5228600.0,154190000.0,131240000.0,393180000.0,110540000.0,,191370000.0,3904600.0,27874000.0,,65198000.0,292070000.0,17844000.0,12751000000.0,27067000.0,276630000.0,57492000.0,,173180000.0,303060000.0,12139000.0,46000000.0,224260000.0,350390000.0,25468000.0,,71260000.0,89813000.0,4481100.0,14846000.0,110530000.0,192540000.0,84532000.0,225670000.0,,171720000.0,237040000.0,71682000.0,1166600000.0,281770000.0,29350000.0,713060000.0,44249000.0,4685600.0,17251000.0,633430000.0,5042200000.0,3992400.0,1552500000.0,141010000.0,24642000.0,91235000.0,17785000.0,,,274670000.0,34415000.0,3675900.0,111000000.0,26018000.0,2379600.0,63631000.0,1191200000.0,40795000.0,247310000.0,133920000.0,67046000.0,76604000.0,47639000.0,8272800.0,198080000.0,2021700000.0,280170000.0,132960000.0,,67011000.0,5903700.0,124310000.0,14797000.0,701700000.0,579430000.0,35896000.0,,531300000.0,1484200000.0,78766000.0,126800000.0, 
+2020_05_12_21_07_Q-Exactive-HF-X-Orbitrap_6070,195050000.0,93935000.0,15720000.0,45623000.0,2070400000.0,,62364000.0,155320000.0,75798000.0,23780000.0,,255580000.0,56046000.0,40843000.0,18417000.0,71326000.0,,60992000.0,4691100000.0,26713000.0,62375000.0,47514000.0,13667000000.0,747600000.0,655930000.0,35982000.0,19403000.0,38575000.0,125170000.0,43391000.0,1657500000.0,3336100000.0,1629200000.0,1337900000.0,3106200000.0,34949000.0,343950000.0,3456800.0,679230000.0,3949900000.0,65604000.0,17417000.0,139930000.0,,13759000.0,67509000.0,11537000.0,898030000.0,3927200000.0,3828600000.0,1132400000.0,380880000.0,10514000.0,307160000.0,351660000.0,46142000.0,281520000.0,76293000.0,1301500000.0,183640000.0,128160000.0,4319700000.0,10283000.0,27667000.0,,131390000.0,365400000.0,100470000.0,308130000.0,29526000.0,57168000.0,72476000.0,93122000.0,2023900000.0,38703000.0,95247000.0,1228000000.0,15514000.0,10958000.0,13305000.0,206090000.0,,565580000.0,8099200.0,27570000000.0,14943000.0,52888000.0,58015000.0,121470000.0,35055000.0,205660000.0,45720000.0,,11000000.0,493620000.0,,217560000.0,22309000.0,261250000.0,47535000.0,37214000.0,,505290000.0,693290000.0,1735300000.0,35711000.0,115670000.0,520770000.0,525510000.0,3709300.0,833670000.0,16199000.0,236740000.0,766380000.0,36943000.0,,86180000.0,6929800000.0,19456000.0,7803200.0,34653000000.0,16077000.0,180670000.0,744150000.0,367190000.0,,12004000.0,241590000.0,149730000.0,540870000.0,246530000.0,184190000.0,10207000.0,69015000.0,,32133000.0,54614000.0,56078000.0,65909000.0,18455000.0,194180000.0,101910000.0,405900000.0,88287000.0,7467600.0,188030000.0,47453000.0,54427000.0,,71358000.0,302340000.0,20927000.0,12275000000.0,34651000.0,302430000.0,51582000.0,,214710000.0,191990000.0,25397000.0,29032000.0,239270000.0,324410000.0,22800000.0,13112000.0,79938000.0,145120000.0,16631000.0,37320000.0,113260000.0,417440000.0,113720000.0,241520000.0,,165030000.0,291580000.0,50550000.0,1210800000.0,238400000.0,37056000.0,949950000.0,374650
00.0,22522000.0,19995000.0,696800000.0,6114300000.0,,1464200000.0,110050000.0,37568000.0,146650000.0,45446000.0,,15248000.0,427410000.0,100790000.0,4487700.0,130120000.0,32243000.0,,94308000.0,1003800000.0,24293000.0,225010000.0,105290000.0,95269000.0,100060000.0,19581000.0,4540600.0,208890000.0,2331100000.0,317020000.0,97617000.0,4306200.0,85896000.0,,161460000.0,27725000.0,583130000.0,694700000.0,30359000.0,,533980000.0,1599600000.0,80729000.0,111520000.0, +2020_05_14_14_46_Q-Exactive-HF-X-Orbitrap_6070,210700000.0,129710000.0,31399000.0,56876000.0,3581200000.0,,113750000.0,173940000.0,64519000.0,16566000.0,,,112910000.0,6785600.0,5338100.0,120910000.0,,93429000.0,6212500000.0,394180000.0,99575000.0,32436000.0,20792000000.0,696610000.0,851240000.0,61788000.0,34254000.0,49806000.0,165450000.0,37323000.0,2413800000.0,776630000.0,2615500000.0,1742700000.0,4447800000.0,31150000.0,340410000.0,4142500.0,922060000.0,6382000000.0,149240000.0,38459000.0,,,,117150000.0,24105000.0,1045800000.0,4868600000.0,4971300000.0,1324700000.0,392830000.0,7914700.0,615800000.0,493230000.0,53611000.0,287000000.0,80308000.0,1733600000.0,436240000.0,184430000.0,6261100000.0,,50578000.0,,149350000.0,505510000.0,195070000.0,415020000.0,54757000.0,107510000.0,42001000.0,92204000.0,2683800000.0,35836000.0,92824000.0,,21921000.0,7544000.0,11072000.0,190750000.0,13653000.0,971290000.0,23087000.0,33146000000.0,15647000.0,66502000.0,70518000.0,267450000.0,28871000.0,159800000.0,59928000.0,,11093000.0,1436800000.0,,163550000.0,44346000.0,343350000.0,28975000.0,,,992990000.0,990740000.0,2281000000.0,33683000.0,154420000.0,983650000.0,920320000.0,24115000.0,1120900000.0,40288000.0,309710000.0,1230300000.0,,4332100.0,51113000.0,8203400000.0,29793000.0,29362000.0,54130000000.0,759010000.0,276620000.0,1126700000.0,178560000.0,,17980000.0,234550000.0,305280000.0,652760000.0,315400000.0,239530000.0,8935900.0,,8002000.0,310120000.0,68624000.0,20179000.0,18511000.0,20527000.0,510940000.0,151320000.0,8750300
00.0,37188000.0,,1348500000.0,18037000.0,54915000.0,,119180000.0,363380000.0,9183700.0,17859000000.0,6373400.0,313840000.0,75969000.0,,269320000.0,357140000.0,56222000.0,28570000.0,242220000.0,341750000.0,37140000.0,,87795000.0,145570000.0,4650800.0,22214000.0,117110000.0,192740000.0,141260000.0,220590000.0,,133080000.0,363310000.0,144220000.0,1197500000.0,614810000.0,40893000.0,1230800000.0,49934000.0,4874000.0,26885000.0,846420000.0,8368500000.0,30819000.0,3330300000.0,178810000.0,69777000.0,179950000.0,50699000.0,,,311360000.0,53993000.0,42907000.0,334680000.0,45184000.0,,85190000.0,2360500000.0,29655000.0,311310000.0,112050000.0,109580000.0,67163000.0,64012000.0,,305060000.0,3440600000.0,600650000.0,168580000.0,17529000.0,90426000.0,28164000.0,195890000.0,,664770000.0,491890000.0,26426000.0,,626520000.0,2028500000.0,67007000.0,202010000.0, +2020_05_14_17_28_Q-Exactive-HF-X-Orbitrap_6070,105420000.0,381990000.0,23271000.0,45897000.0,2954000000.0,,108000000.0,233760000.0,72823000.0,45952000.0,,,77657000.0,8168400.0,,165010000.0,,46029000.0,6250600000.0,18803000.0,128270000.0,53724000.0,18422000000.0,850300000.0,722260000.0,49479000.0,46287000.0,99612000.0,160620000.0,34327000.0,2585800000.0,741720000.0,2542300000.0,1761300000.0,4864300000.0,42316000.0,364980000.0,,1269100000.0,8725200000.0,53724000.0,49863000.0,,,,120200000.0,18375000.0,1160900000.0,5060400000.0,5090900000.0,954940000.0,388890000.0,10690000.0,615740000.0,423810000.0,46617000.0,283580000.0,105090000.0,1618600000.0,479940000.0,209990000.0,6034400000.0,,34702000.0,,169220000.0,407260000.0,140160000.0,321490000.0,29283000.0,72655000.0,327020000.0,121950000.0,2768000000.0,98360000.0,115760000.0,1476500000.0,,5643100.0,,133040000.0,14294000.0,1073900000.0,26383000.0,30848000000.0,12453000.0,66522000.0,112950000.0,189400000.0,10336000.0,161900000.0,76047000.0,,20189000.0,1607400000.0,,272780000.0,62576000.0,358490000.0,71108000.0,7507500.0,,891870000.0,1096900000.0,2477300000.0,41409000.0,123790000.0,811
630000.0,764210000.0,50275000.0,691130000.0,47414000.0,324180000.0,1559100000.0,,,88885000.0,9544800000.0,23208000.0,15776000.0,58324000000.0,811460000.0,286170000.0,886510000.0,218210000.0,,6304900.0,257970000.0,169780000.0,658270000.0,403810000.0,257550000.0,,75668000.0,11894000.0,58524000.0,37267000.0,19718000.0,,14787000.0,619730000.0,131650000.0,702650000.0,42170000.0,,1230800000.0,37478000.0,51706000.0,,108540000.0,313320000.0,16712000.0,16692000000.0,22082000.0,250580000.0,76048000.0,,291960000.0,386190000.0,42965000.0,45640000.0,297920000.0,430470000.0,33828000.0,,77006000.0,126130000.0,35913000.0,72939000.0,110750000.0,212530000.0,247540000.0,230130000.0,,171750000.0,313920000.0,251310000.0,1537700000.0,529700000.0,81272000.0,1392700000.0,41692000.0,,34303000.0,919610000.0,8737500000.0,,2994000000.0,186770000.0,89698000.0,154410000.0,62482000.0,,,402450000.0,58742000.0,32715000.0,385290000.0,52702000.0,,94050000.0,1472400000.0,41119000.0,320380000.0,212010000.0,127760000.0,57768000.0,118970000.0,,277280000.0,3654500000.0,724440000.0,151760000.0,23328000.0,105100000.0,,169050000.0,13422000.0,534450000.0,1211000000.0,9083500.0,2072800.0,766050000.0,2395400000.0,85028000.0,151630000.0,761430000.0 
+2020_05_14_20_11_Q-Exactive-HF-X-Orbitrap_6070,201910000.0,340090000.0,17824000.0,65642000.0,2925700000.0,,132750000.0,248790000.0,52687000.0,51681000.0,,,123410000.0,63201000.0,,100900000.0,,121430000.0,6848100000.0,43865000.0,593430000.0,45287000.0,19662000000.0,924290000.0,1033800000.0,16397000.0,43459000.0,136770000.0,168990000.0,33920000.0,2452300000.0,760720000.0,3214200000.0,1924400000.0,5163100000.0,55989000.0,336150000.0,7659800.0,1296700000.0,8243500000.0,99542000.0,87227000.0,,,,140530000.0,20201000.0,1249600000.0,5480200000.0,6069400000.0,1291900000.0,283460000.0,12874000.0,780860000.0,567620000.0,43006000.0,242630000.0,132480000.0,1680100000.0,460750000.0,214030000.0,7381500000.0,,52379000.0,,198570000.0,469790000.0,146880000.0,591740000.0,52293000.0,96050000.0,335460000.0,166210000.0,3125900000.0,82406000.0,112610000.0,,25488000.0,26990000.0,11878000.0,232340000.0,20527000.0,1208100000.0,29252000.0,32733000000.0,16903000.0,85705000.0,75968000.0,330870000.0,72775000.0,153430000.0,97628000.0,,57002000.0,901730000.0,,379230000.0,84282000.0,463560000.0,41368000.0,18919000.0,,1130800000.0,1195700000.0,2897600000.0,33878000.0,154760000.0,921900000.0,1037100000.0,35585000.0,1176100000.0,49923000.0,418680000.0,1571200000.0,,,103570000.0,10765000000.0,22843000.0,21435000.0,63945000000.0,608330000.0,231890000.0,1135600000.0,197370000.0,,31289000.0,343740000.0,256580000.0,789660000.0,348540000.0,264460000.0,8309000.0,79545000.0,26651000.0,79070000.0,39583000.0,53863000.0,,18150000.0,744290000.0,151940000.0,868440000.0,93431000.0,6161400.0,1364700000.0,45399000.0,,,101180000.0,411260000.0,14959000.0,21634000000.0,15321000.0,403430000.0,126660000.0,,249320000.0,466550000.0,44069000.0,40401000.0,375220000.0,510610000.0,21302000.0,,136640000.0,101810000.0,21539000.0,15094000.0,142800000.0,277940000.0,346630000.0,312740000.0,,198430000.0,367020000.0,128010000.0,1770000000.0,470870000.0,124700000.0,1386000000.0,68918000.0,8219700.0,44511000.0,1050000000.0,7921600000.0
,,2949100000.0,138390000.0,90031000.0,210700000.0,38266000.0,,,456160000.0,142610000.0,43232000.0,427560000.0,60993000.0,39385000.0,106870000.0,1900500000.0,36610000.0,356990000.0,180290000.0,108810000.0,87537000.0,85719000.0,5182900.0,282830000.0,3274900000.0,833160000.0,118210000.0,12698000.0,88432000.0,17869000.0,236590000.0,,830940000.0,536380000.0,29234000.0,,800090000.0,2660400000.0,115320000.0,203540000.0,780340000.0 +2020_05_15_10_30_Q-Exactive-HF-X-Orbitrap_6070,582600000.0,416710000.0,30552000.0,117120000.0,6197900000.0,,168520000.0,392780000.0,111660000.0,58166000.0,,,191210000.0,15719000.0,38397000.0,181410000.0,5872000.0,145050000.0,11668000000.0,146300000.0,246870000.0,146360000.0,32630000000.0,1915500000.0,1746400000.0,80166000.0,65802000.0,111550000.0,302610000.0,99519000.0,4197100000.0,1234000000.0,3629700000.0,3812200000.0,7501300000.0,127920000.0,815680000.0,5942100.0,1821500000.0,13071000000.0,461070000.0,35091000.0,593730000.0,,,222980000.0,29812000.0,1294600000.0,8525000000.0,8781000000.0,2712300000.0,452670000.0,23450000.0,636970000.0,1178600000.0,50668000.0,596040000.0,212110000.0,2599500000.0,491320000.0,467530000.0,7689200000.0,,91058000.0,,221410000.0,489530000.0,242650000.0,853350000.0,81613000.0,92659000.0,140040000.0,89622000.0,4552000000.0,45395000.0,325450000.0,,150660000.0,,101520000.0,368360000.0,34367000.0,1265500000.0,32932000.0,49930000000.0,43353000.0,184870000.0,88694000.0,453700000.0,139890000.0,469670000.0,77386000.0,,64116000.0,1395900000.0,3088000000.0,596100000.0,113310000.0,500290000.0,82852000.0,21169000.0,,1551500000.0,1568600000.0,3405300000.0,132620000.0,221470000.0,1515300000.0,1661300000.0,60225000.0,2086900000.0,74010000.0,391470000.0,3262500000.0,59470000.0,,250550000.0,12895000000.0,55353000.0,10338000.0,90691000000.0,1162600000.0,437880000.0,1261400000.0,419480000.0,15510000.0,45152000.0,541040000.0,581170000.0,1100900000.0,565220000.0,463630000.0,12651000.0,194270000.0,28474000.0,149080000.0,80229000.0,,6870700
0.0,39195000.0,1115100000.0,250430000.0,1235600000.0,192250000.0,,466130000.0,41507000.0,107740000.0,,227710000.0,545230000.0,26080000.0,30607000000.0,58072000.0,769120000.0,144320000.0,,537730000.0,693130000.0,100020000.0,249450000.0,445580000.0,885390000.0,79854000.0,58993000.0,67756000.0,372230000.0,39068000.0,,227880000.0,285020000.0,241800000.0,532260000.0,,174380000.0,781650000.0,236070000.0,2880100000.0,968700000.0,190840000.0,1629200000.0,107090000.0,22542000.0,79600000.0,950020000.0,15848000000.0,,4154200000.0,278290000.0,210640000.0,331880000.0,237030000.0,,,1062500000.0,223780000.0,84210000.0,642340000.0,119130000.0,4404000.0,143220000.0,3232900000.0,51476000.0,556510000.0,268610000.0,261040000.0,238150000.0,223210000.0,24335000.0,702330000.0,7043500000.0,715510000.0,149260000.0,84527000.0,145700000.0,109880000.0,224050000.0,54878000.0,906460000.0,1087400000.0,75817000.0,,1228300000.0,3312700000.0,222630000.0,235390000.0, +2020_05_20_12_33_Q-Exactive-HF-X-Orbitrap_6070,360590000.0,137000000.0,26866000.0,50586000.0,4375800000.0,,71038000.0,334000000.0,31481000.0,79784000.0,,,178500000.0,84431000.0,,209180000.0,13718000.0,85967000.0,8450300000.0,66006000.0,130320000.0,8419900.0,19294000000.0,1182300000.0,1081900000.0,38565000.0,8408200.0,33747000.0,173630000.0,71288000.0,2080900000.0,573720000.0,2169700000.0,2007600000.0,4125200000.0,53767000.0,706720000.0,34739000.0,1573800000.0,7948300000.0,271220000.0,28040000.0,4590400000.0,,,107430000.0,11324000.0,1072200000.0,5058000000.0,5879700000.0,2096600000.0,220480000.0,10854000.0,978200000.0,452210000.0,40397000.0,293450000.0,145260000.0,1648900000.0,218870000.0,205690000.0,9343000000.0,,44617000.0,,133850000.0,476860000.0,152470000.0,429780000.0,67377000.0,46727000.0,32538000.0,91680000.0,3125000000.0,,170080000.0,2139400000.0,33772000.0,,26554000.0,278900000.0,,1242200000.0,36666000.0,45182000000.0,,48363000.0,98683000.0,338370000.0,27319000.0,256570000.0,71336000.0,,,1140800000.0,,365400000.0,41656000.0,3192
30000.0,39949000.0,14567000.0,,704360000.0,1203500000.0,2643000000.0,37622000.0,125800000.0,833620000.0,964110000.0,51467000.0,1906200000.0,44014000.0,273040000.0,1552200000.0,118770000.0,10438000.0,159000000.0,10828000000.0,14559000.0,22636000.0,59466000000.0,1113500000.0,224760000.0,1131000000.0,588690000.0,,,252290000.0,322110000.0,840030000.0,339150000.0,401820000.0,,138040000.0,,38351000.0,58149000.0,88449000.0,33389000.0,20247000.0,534520000.0,162870000.0,1425700000.0,104770000.0,,243720000.0,23983000.0,47400000.0,,83087000.0,446710000.0,25626000.0,21133000000.0,25857000.0,592220000.0,193960000.0,,390060000.0,336540000.0,64484000.0,19519000.0,407620000.0,813790000.0,72152000.0,,96295000.0,119100000.0,24770000.0,19424000.0,123290000.0,244640000.0,213520000.0,491950000.0,,141280000.0,481550000.0,94211000.0,1881700000.0,698960000.0,90442000.0,1412000000.0,57683000.0,,,1305200000.0,8253100000.0,,4311300000.0,190280000.0,107860000.0,268840000.0,13817000.0,14066000000.0,5794400.0,393610000.0,111300000.0,6722200.0,327590000.0,30643000.0,29239000.0,82211000.0,2618900000.0,56065000.0,227930000.0,116790000.0,95772000.0,53342000.0,129670000.0,,317390000.0,1887100000.0,630740000.0,125160000.0,33659000.0,175920000.0,10891000.0,154070000.0,18349000.0,584600000.0,966530000.0,45874000.0,,790720000.0,2323100000.0,78795000.0,132320000.0, 
+2020_05_20_15_35_Q-Exactive-HF-X-Orbitrap_6070,340940000.0,270900000.0,57264000.0,77686000.0,3713400000.0,,86760000.0,174620000.0,81286000.0,139480000.0,,402330000.0,126140000.0,62124000.0,,283010000.0,29132000.0,142390000.0,8966900000.0,144180000.0,324840000.0,58002000.0,28727000000.0,859470000.0,1330900000.0,43455000.0,66945000.0,48609000.0,102190000.0,92276000.0,2872800000.0,1443200000.0,3487900000.0,3253600000.0,6588200000.0,94769000.0,583170000.0,5568800.0,1422300000.0,12472000000.0,362550000.0,65544000.0,,,,188960000.0,20524000.0,1134300000.0,6915900000.0,7589800000.0,2037100000.0,465460000.0,24786000.0,445510000.0,634510000.0,92648000.0,403950000.0,139200000.0,2214200000.0,589870000.0,317520000.0,7485200000.0,42058000.0,58144000.0,,239060000.0,891370000.0,307140000.0,921130000.0,74916000.0,133810000.0,101180000.0,97862000.0,3542700000.0,16694000.0,155510000.0,2933200000.0,112850000.0,34238000.0,57957000.0,234780000.0,21141000.0,2651400000.0,35192000.0,38619000000.0,16300000.0,41827000.0,70954000.0,420090000.0,47691000.0,245970000.0,87073000.0,,49335000.0,2187400000.0,,252060000.0,131080000.0,450320000.0,41174000.0,,,1097400000.0,1470300000.0,3594100000.0,68766000.0,261990000.0,1287700000.0,1449900000.0,89004000.0,1634600000.0,116280000.0,431410000.0,1291900000.0,105610000.0,15625000.0,187370000.0,13661000000.0,33162000.0,18117000.0,93400000000.0,,2273700000.0,1428900000.0,381280000.0,,,370520000.0,269160000.0,1379900000.0,459110000.0,679800000.0,62697000.0,134740000.0,,55936000.0,119210000.0,136160000.0,41956000.0,126420000.0,709130000.0,107960000.0,1515900000.0,45425000.0,16212000.0,405380000.0,28567000.0,101980000.0,6578500.0,146110000.0,458260000.0,82969000.0,23393000000.0,83637000.0,744350000.0,104150000.0,,377480000.0,461120000.0,103950000.0,74421000.0,526980000.0,906890000.0,,49099000.0,132730000.0,242230000.0,21743000.0,29006000.0,140970000.0,168400000.0,319080000.0,445370000.0,,298330000.0,526480000.0,194380000.0,2623500000.0,693630000.0,190400000.0,
1683000000.0,28943000.0,18156000.0,32254000.0,,9051200000.0,63369000.0,4542400000.0,158370000.0,145560000.0,327720000.0,61657000.0,26246000000.0,28022000.0,659270000.0,226120000.0,41713000.0,374860000.0,43865000.0,,218830000.0,2016000000.0,70841000.0,393680000.0,190100000.0,138130000.0,124030000.0,254800000.0,148540000.0,473140000.0,5772600000.0,805220000.0,196030000.0,40800000.0,110680000.0,22402000.0,178590000.0,27864000.0,669830000.0,955900000.0,34669000.0,,965180000.0,3573300000.0,128340000.0,336480000.0, +2020_05_22_14_57_Q-Exactive-HF-X-Orbitrap_6070,169260000.0,184990000.0,74627000.0,87377000.0,4241900000.0,,205340000.0,335500000.0,132680000.0,62444000.0,,,263530000.0,,,151210000.0,5087000.0,196460000.0,10331000000.0,108610000.0,162810000.0,59376000.0,32944000000.0,966770000.0,1121200000.0,87790000.0,46291000.0,116990000.0,150320000.0,73448000.0,4109500000.0,826270000.0,4482000000.0,2335800000.0,6795600000.0,53806000.0,445800000.0,,1725000000.0,10300000000.0,123920000.0,138500000.0,,,,134730000.0,25947000.0,1136200000.0,7607600000.0,6793500000.0,985650000.0,416310000.0,6377400.0,985120000.0,735120000.0,63852000.0,366510000.0,162210000.0,2792000000.0,763450000.0,235700000.0,5776900000.0,32263000.0,31884000.0,,271740000.0,453020000.0,313820000.0,633720000.0,103830000.0,93268000.0,114570000.0,200910000.0,6887400000.0,101150000.0,31358000.0,2050800000.0,38436000.0,42347000.0,,207380000.0,36625000.0,1655500000.0,42981000.0,44176000000.0,24307000.0,54728000.0,119700000.0,457690000.0,19653000.0,276140000.0,45172000.0,,46171000.0,2846800000.0,,530640000.0,28051000.0,630170000.0,71078000.0,38608000.0,,1193100000.0,1791600000.0,4134800000.0,37650000.0,168030000.0,1448500000.0,532290000.0,40693000.0,1668900000.0,62147000.0,506820000.0,1968000000.0,41097000.0,,153690000.0,16525000000.0,34386000.0,62865000.0,77097000000.0,,356840000.0,952920000.0,366650000.0,,,262480000.0,201940000.0,1120600000.0,542040000.0,583190000.0,53022000.0,67471000.0,12130000.0,104790000.0,4765100
0.0,135760000.0,,36842000.0,345480000.0,197790000.0,942060000.0,,,2466400000.0,76626000.0,73071000.0,,103250000.0,637540000.0,27090000.0,33845000000.0,,725090000.0,155030000.0,20310000.0,293470000.0,544350000.0,58083000.0,160880000.0,331300000.0,747840000.0,69659000.0,,,69822000.0,43297000.0,,175930000.0,225720000.0,421780000.0,331120000.0,,185370000.0,429550000.0,188760000.0,2320800000.0,824830000.0,249860000.0,1324600000.0,46749000.0,,35192000.0,213230000.0,8675900000.0,,3000000000.0,255680000.0,71206000.0,254950000.0,74144000.0,,6592900.0,,99111000.0,259490000.0,353100000.0,105360000.0,11075000.0,91262000.0,2040000000.0,60776000.0,529370000.0,224300000.0,229830000.0,135180000.0,144980000.0,,495720000.0,3206100000.0,1238100000.0,259960000.0,32162000.0,94588000.0,18545000.0,187980000.0,25235000.0,856280000.0,1088000000.0,89141000.0,,1348300000.0,3578900000.0,123370000.0,255880000.0, +2020_05_22_17_43_Q-Exactive-HF-X-Orbitrap_6070,454540000.0,339690000.0,40461000.0,125840000.0,5351300000.0,8699000.0,178640000.0,230050000.0,101870000.0,53640000.0,,,277870000.0,28772000.0,,238740000.0,,188210000.0,10363000000.0,125150000.0,184010000.0,61930000.0,30697000000.0,1499200000.0,1570600000.0,113550000.0,33272000.0,102330000.0,355660000.0,106310000.0,4055400000.0,1299600000.0,3649400000.0,3204900000.0,7498300000.0,88132000.0,717320000.0,,1999400000.0,10209000000.0,284390000.0,68531000.0,,,,194780000.0,,1549300000.0,9334900000.0,8894900000.0,1681100000.0,443780000.0,,1033100000.0,807220000.0,66741000.0,542640000.0,161460000.0,2737600000.0,565390000.0,289900000.0,8738800000.0,,58166000.0,,564040000.0,619590000.0,306000000.0,910190000.0,92915000.0,130530000.0,75141000.0,141900000.0,5067600000.0,112500000.0,201010000.0,2471100000.0,12989000.0,22537000.0,33675000.0,304930000.0,14060000.0,1758400000.0,31467000.0,40136000000.0,39488000.0,133060000.0,368180000.0,490440000.0,57743000.0,335050000.0,120750000.0,,58426000.0,2486300000.0,,453220000.0,72844000.0,787050000.0,65410000.0,2072
2000.0,,1035700000.0,1813800000.0,4733000000.0,64877000.0,377640000.0,1348900000.0,1268100000.0,109630000.0,1648000000.0,75778000.0,559450000.0,2045600000.0,,,243580000.0,15157000000.0,12529000.0,37332000.0,86502000000.0,,2912600000.0,1495600000.0,407130000.0,,,405690000.0,296040000.0,1481900000.0,633220000.0,813100000.0,28173000.0,144800000.0,34278000.0,356610000.0,86166000.0,65819000.0,60951000.0,38428000.0,425820000.0,189240000.0,1656400000.0,128990000.0,,520890000.0,76299000.0,203710000.0,,121260000.0,659720000.0,63477000.0,29193000000.0,35236000.0,734100000.0,104620000.0,16800000.0,461620000.0,605930000.0,145050000.0,170390000.0,622530000.0,741290000.0,22584000.0,19836000.0,103680000.0,188850000.0,19448000.0,,280270000.0,318030000.0,272260000.0,585550000.0,,355720000.0,611420000.0,191130000.0,2379200000.0,904840000.0,234350000.0,1901500000.0,96314000.0,27867000.0,60138000.0,1350300000.0,13055000000.0,12521000.0,4300600000.0,327190000.0,183820000.0,315690000.0,149130000.0,30710000000.0,44398000.0,,275660000.0,21061000.0,516520000.0,114680000.0,,130290000.0,2325300000.0,87039000.0,591360000.0,115100000.0,263630000.0,192380000.0,153270000.0,11820000.0,327210000.0,5585800000.0,641150000.0,134910000.0,34150000.0,191140000.0,34056000.0,229030000.0,15638000.0,1026000000.0,953800000.0,,,1284500000.0,3326300000.0,97477000.0,246850000.0, 
+2020_05_26_14_20_Q-Exactive-HF-X-Orbitrap_6070,523720000.0,468420000.0,59944000.0,78329000.0,6624300000.0,,417370000.0,483970000.0,243450000.0,103920000.0,,,364970000.0,36235000.0,,289780000.0,,121840000.0,19743000000.0,77723000.0,261440000.0,229130000.0,45390000000.0,1915700000.0,2187400000.0,65955000.0,29328000.0,208260000.0,311150000.0,51453000.0,5985500000.0,1700500000.0,5921600000.0,3640900000.0,11803000000.0,166320000.0,3392200000.0,12641000.0,4175200000.0,24005000000.0,538140000.0,137530000.0,,,,454940000.0,26111000.0,2387100000.0,14862000000.0,14094000000.0,3014900000.0,869410000.0,,480830000.0,1158700000.0,86181000.0,589290000.0,152380000.0,3075200000.0,1324500000.0,420860000.0,19372000000.0,,135310000.0,,222910000.0,1550300000.0,634470000.0,1256200000.0,156590000.0,147960000.0,67170000.0,597940000.0,7580800000.0,183730000.0,300980000.0,,67439000.0,,47176000.0,486380000.0,34402000.0,2586900000.0,115940000.0,92561000000.0,35467000.0,154080000.0,90392000.0,506750000.0,52819000.0,627530000.0,169770000.0,,56725000.0,4694800000.0,,567160000.0,120320000.0,1023800000.0,156870000.0,50725000.0,,2273400000.0,3077700000.0,4607600000.0,191920000.0,523540000.0,2258900000.0,1970400000.0,82416000.0,2374700000.0,140070000.0,852050000.0,4231700000.0,86706000.0,17548000.0,251940000.0,26953000000.0,54493000.0,25395000.0,159580000000.0,,3521400000.0,2088600000.0,391550000.0,,27316000.0,650640000.0,729520000.0,1421600000.0,930500000.0,839400000.0,47302000.0,129630000.0,40591000.0,184800000.0,211170000.0,243710000.0,,52882000.0,1829100000.0,324690000.0,2610300000.0,305530000.0,,4298000000.0,89678000.0,196310000.0,,104110000.0,1261400000.0,49300000.0,50402000000.0,94647000.0,1593600000.0,295340000.0,,853960000.0,862480000.0,199200000.0,16432000.0,965410000.0,1557100000.0,,36554000.0,255880000.0,388910000.0,,58568000.0,392100000.0,481180000.0,893600000.0,916530000.0,,508950000.0,839820000.0,202270000.0,4932400000.0,1472500000.0,197900000.0,2636200000.0,244810000.0,54868000.0,1380
20000.0,1283100000.0,19468000000.0,62570000.0,7129600000.0,354300000.0,212430000.0,247330000.0,147640000.0,56014000000.0,55036000.0,1612600000.0,77832000.0,69398000.0,983640000.0,98923000.0,,197820000.0,3149100000.0,38653000.0,774350000.0,405010000.0,478560000.0,372520000.0,120430000.0,,877410000.0,6858300000.0,1827900000.0,357900000.0,25238000.0,238530000.0,54183000.0,510410000.0,18946000.0,1807400000.0,2431200000.0,155490000.0,,1951700000.0,6647300000.0,211410000.0,540610000.0,2463200000.0 +2020_05_27_13_57_Q-Exactive-HF-X-Orbitrap_6070,1019000000.0,949030000.0,316810000.0,364680000.0,10686000000.0,63112000.0,853670000.0,1228500000.0,470120000.0,281780000.0,,,727200000.0,146030000.0,,784800000.0,115550000.0,658060000.0,33508000000.0,393460000.0,807430000.0,352400000.0,83324000000.0,6411600000.0,5812000000.0,336700000.0,203960000.0,400880000.0,733310000.0,384890000.0,10038000000.0,3758600000.0,13984000000.0,11829000000.0,23673000000.0,397450000.0,2752800000.0,122660000.0,8163700000.0,35321000000.0,1135100000.0,341950000.0,2323600000.0,,25574000.0,862240000.0,44097000.0,5740400000.0,25224000000.0,28252000000.0,5878700000.0,1476800000.0,19899000.0,1298600000.0,3119500000.0,309410000.0,1657400000.0,711770000.0,10382000000.0,1410100000.0,1310300000.0,27997000000.0,147090000.0,272430000.0,,1120600000.0,1746700000.0,1278700000.0,3365500000.0,396550000.0,430060000.0,478660000.0,523440000.0,15408000000.0,270320000.0,926060000.0,,299560000.0,31736000.0,219750000.0,1415000000.0,69582000.0,4452300000.0,185230000.0,107330000000.0,238440000.0,385170000.0,608120000.0,1677400000.0,197340000.0,1021700000.0,149710000.0,,378730000.0,5971000000.0,6373500000.0,1551000000.0,203990000.0,2272000000.0,321400000.0,106410000.0,,3940200000.0,4982800000.0,12796000000.0,200530000.0,917330000.0,3472100000.0,4164300000.0,134710000.0,4277900000.0,359380000.0,2520400000.0,9336200000.0,164160000.0,206840000.0,903030000.0,49385000000.0,184370000.0,138280000.0,251120000000.0,3285300000.0,10664000000.
0,5706500000.0,1640900000.0,41251000.0,,1328200000.0,1239600000.0,4565500000.0,2503200000.0,3114600000.0,143810000.0,489850000.0,188030000.0,308910000.0,433810000.0,399560000.0,197230000.0,319010000.0,2834300000.0,813940000.0,3826300000.0,371640000.0,74686000.0,1911900000.0,196170000.0,507590000.0,,652560000.0,1781800000.0,94849000.0,70227000000.0,193020000.0,1806500000.0,694410000.0,46050000.0,1886300000.0,2821000000.0,150470000.0,610430000.0,2739200000.0,3469700000.0,152690000.0,102920000.0,244470000.0,1075200000.0,84438000.0,196080000.0,741060000.0,1321200000.0,773950000.0,1563900000.0,,1442600000.0,1735500000.0,1133300000.0,9230700000.0,3181400000.0,914070000.0,3814300000.0,395190000.0,,327750000.0,5192200000.0,34831000000.0,105190000.0,12209000000.0,602900000.0,529490000.0,1466400000.0,702280000.0,,92812000.0,3330800000.0,449840000.0,366920000.0,2266900000.0,362600000.0,82386000.0,725730000.0,7029300000.0,268180000.0,1754200000.0,717320000.0,882690000.0,641400000.0,688740000.0,166790000.0,2217100000.0,18861000000.0,3693900000.0,820620000.0,78884000.0,800530000.0,193000000.0,734490000.0,127730000.0,3639900000.0,3592600000.0,,,4435800000.0,14556000000.0,,811370000.0, 
+2020_05_28_04_06_Q-Exactive-HF-X-Orbitrap_6070,1507800000.0,570540000.0,264310000.0,476130000.0,14862000000.0,63117000.0,1216900000.0,1377700000.0,519730000.0,410430000.0,,,1458700000.0,179160000.0,,679240000.0,,805880000.0,43488000000.0,734840000.0,1025000000.0,417130000.0,97880000000.0,7256600000.0,6383500000.0,288360000.0,,474180000.0,741910000.0,486070000.0,12889000000.0,4316700000.0,17174000000.0,11522000000.0,27240000000.0,405290000.0,2409900000.0,123840000.0,10796000000.0,45876000000.0,1668200000.0,408910000.0,2658400000.0,,,1406100000.0,164780000.0,7013300000.0,30386000000.0,32378000000.0,7124800000.0,2350200000.0,116580000.0,3063300000.0,3769100000.0,250420000.0,1658200000.0,780670000.0,12827000000.0,1992800000.0,3162000000.0,30848000000.0,162800000.0,370090000.0,,1054800000.0,2684400000.0,1421900000.0,4173300000.0,498970000.0,437660000.0,4156700000.0,731790000.0,17371000000.0,479100000.0,565010000.0,,341150000.0,251620000.0,150990000.0,1195600000.0,185440000.0,7427400000.0,112380000.0,131690000000.0,236640000.0,573080000.0,285150000.0,1574000000.0,226620000.0,1213900000.0,289310000.0,,829730000.0,3723100000.0,12845000000.0,1462200000.0,590100000.0,2609100000.0,262500000.0,214960000.0,,4680300000.0,5026300000.0,16071000000.0,312250000.0,1376200000.0,5198100000.0,4820000000.0,223090000.0,7011800000.0,381850000.0,2586700000.0,10547000000.0,167690000.0,98290000.0,865200000.0,61312000000.0,305800000.0,199680000.0,314580000000.0,58647000.0,12062000000.0,6131700000.0,2291400000.0,37397000.0,,1258100000.0,1980500000.0,4815000000.0,3774400000.0,3251500000.0,143610000.0,721600000.0,177100000.0,469850000.0,345640000.0,552520000.0,271250000.0,263000000.0,4416000000.0,833690000.0,4759800000.0,212340000.0,107850000.0,7211300000.0,429720000.0,81160000.0,75914000.0,951870000.0,2646100000.0,350690000.0,93467000000.0,187190000.0,2612000000.0,789870000.0,75245000.0,1877600000.0,4078200000.0,216890000.0,499180000.0,2215300000.0,3706200000.0,172750000.0,50708000.0,543920000.0
,1158800000.0,97121000.0,134410000.0,807140000.0,1567900000.0,902190000.0,2132900000.0,,2213300000.0,2326100000.0,887830000.0,11467000000.0,3746800000.0,1151800000.0,5910500000.0,586790000.0,102980000.0,263490000.0,2494700000.0,33627000000.0,232720000.0,12946000000.0,1302500000.0,661230000.0,1192900000.0,712800000.0,84861000000.0,145560000.0,2491000000.0,357160000.0,406680000.0,2883300000.0,528600000.0,52787000.0,972780000.0,9602100000.0,203560000.0,2056900000.0,808770000.0,1126300000.0,524000000.0,778980000.0,339660000.0,2489300000.0,21105000000.0,4731400000.0,1012300000.0,102540000.0,1130800000.0,204690000.0,913580000.0,66807000.0,4211300000.0,5002300000.0,,48645000.0,5759300000.0,15124000000.0,508050000.0,1053200000.0,4089000000.0 +2020_06_01_10_22_Q-Exactive-HF-X-Orbitrap_6070,,,,17836000.0,10572000000.0,,199240000.0,34746000.0,99474000.0,156480000.0,119800000.0,,,108500000.0,20522000.0,220060000.0,,,5038900000.0,21121000.0,225930000.0,,55890000000.0,92586000.0,1083700000.0,,,,110370000.0,,1504300000.0,839470000.0,7579300000.0,1146900000.0,9507900000.0,,769910000.0,22056000.0,2012700000.0,9265800000.0,188980000.0,,,168870000.0,,25760000.0,24172000.0,2080500000.0,12311000000.0,7094300000.0,2538300000.0,35706000.0,17511000.0,4442900000.0,316340000.0,,106420000.0,82895000.0,3157300000.0,342340000.0,191550000.0,11440000000.0,,55566000.0,14101000.0,320900000.0,286840000.0,2087600000.0,1136800000.0,,26832000.0,,127920000.0,4076800000.0,,219870000.0,2855600000.0,60635000.0,,318940000.0,169310000.0,,975950000.0,28494000.0,133570000000.0,,,23302000.0,93731000.0,20400000.0,41936000.0,,45213000.0,,4292200000.0,2193700000.0,159780000.0,,256160000.0,,21995000.0,374450000.0,743300000.0,1287100000.0,1091500000.0,,100020000.0,837820000.0,1484900000.0,109860000.0,1346600000.0,,667390000.0,811260000.0,442460000.0,18193000.0,109520000.0,20606000000.0,,,146000000000.0,69009000.0,2737700000.0,1777400000.0,307110000.0,,,299140000.0,1047800000.0,1338600000.0,295020000.0,527140000.0,12
0140000.0,105230000.0,132570000.0,80978000.0,,,,27366000.0,155280000.0,68486000.0,3712000000.0,46604000.0,151690000.0,43021000.0,,35227000.0,15968000.0,38069000.0,827510000.0,47698000.0,89029000000.0,,1065100000.0,204950000.0,,418300000.0,547990000.0,,20509000.0,579520000.0,108340000.0,70205000.0,,160010000.0,81327000.0,,,112660000.0,1011200000.0,214170000.0,68197000.0,307220000.0,123280000.0,232540000.0,7605400.0,1620500000.0,307580000.0,61589000.0,914960000.0,28309000.0,35997000.0,79649000.0,,14000000000.0,,1061500000.0,197900000.0,31127000.0,34161000.0,205860000.0,1444600000.0,,202890000.0,1083400000.0,,922180000.0,11287000.0,73979000.0,147030000.0,1983500000.0,334740000.0,62519000.0,,46885000.0,32036000.0,,,1191800000.0,5609300000.0,1730600000.0,547050000.0,,62691000.0,64152000.0,242930000.0,58256000.0,1110900000.0,645740000.0,,,300380000.0,4160400000.0,,71290000.0,8443600000.0 +2020_06_01_15_41_Q-Exactive-HF-X-Orbitrap_6070,254420000.0,,104380000.0,19899000.0,15447000000.0,50842000.0,443570000.0,256320000.0,,,435750000.0,2979100000.0,,342820000.0,,204990000.0,,229960000.0,13234000000.0,388420000.0,687740000.0,,91447000000.0,137010000.0,1107700000.0,,71906000.0,29181000.0,226790000.0,116840000.0,4221200000.0,577430000.0,11947000000.0,1571100000.0,16846000000.0,,1743300000.0,,3331800000.0,21713000000.0,344950000.0,,,319700000.0,,,74024000.0,5525300000.0,24892000000.0,15814000000.0,3677700000.0,276130000.0,54614000.0,7656000000.0,552540000.0,32190000.0,231940000.0,,3931000000.0,1703500000.0,515810000.0,16008000000.0,,151860000.0,22471000.0,401220000.0,443730000.0,4946300000.0,3016100000.0,,30389000.0,70199000.0,1124800000.0,8993600000.0,84948000.0,532650000.0,,,,436480000.0,402330000.0,,2340800000.0,,174920000000.0,,119990000.0,198680000.0,148480000.0,30222000.0,,93136000.0,152270000.0,,8029600000.0,,613070000.0,,668360000.0,,249440000.0,1134400000.0,2677200000.0,2989100000.0,6662300000.0,,289250000.0,2119600000.0,2408400000.0,127320000.0,1124500000.0,,1999700000.
0,2498800000.0,709030000.0,102200000.0,,37747000000.0,,,256870000000.0,,7718900000.0,3682300000.0,1624700000.0,,,255520000.0,1334000000.0,2474900000.0,895640000.0,1383800000.0,74830000.0,312210000.0,324900000.0,356400000.0,,,,19302000.0,451100000.0,147360000.0,3520900000.0,112510000.0,320670000.0,314190000.0,184300000.0,139900000.0,,37017000.0,1818600000.0,64943000.0,162210000000.0,,901020000.0,381830000.0,,980500000.0,932820000.0,226990000.0,,751260000.0,462530000.0,152950000.0,,237470000.0,169550000.0,,35626000.0,203220000.0,2196500000.0,496260000.0,90437000.0,76447000.0,332890000.0,217480000.0,,2915500000.0,759840000.0,215930000.0,2419400000.0,26689000.0,157800000.0,88998000.0,3519300000.0,30647000000.0,,1915700000.0,528380000.0,202750000.0,53035000.0,201400000.0,2844900000.0,,767640000.0,1314400000.0,,2175300000.0,32211000.0,264870000.0,366150000.0,2550400000.0,262550000.0,93648000.0,,228220000.0,55059000.0,67988000.0,72113000.0,2164900000.0,10665000000.0,3667800000.0,1083600000.0,,108100000.0,51431000.0,316450000.0,21808000.0,2909900000.0,1214500000.0,220760000.0,,621960000.0,8016300000.0,,,15962000000.0 
+2020_06_02_09_41_Q-Exactive-HF-X-Orbitrap_6070,410960000.0,470610000.0,105770000.0,178810000.0,7460000000.0,,435480000.0,451120000.0,227090000.0,123530000.0,,,426780000.0,,,316620000.0,,118120000.0,20763000000.0,145580000.0,399480000.0,66115000.0,54795000000.0,2354200000.0,2643100000.0,141560000.0,185530000.0,312700000.0,557050000.0,135690000.0,5924400000.0,2094600000.0,7501200000.0,3965800000.0,13928000000.0,173810000.0,1190200000.0,,2920200000.0,17913000000.0,537420000.0,,10973000000.0,,,411490000.0,93824000.0,2146800000.0,14785000000.0,14258000000.0,2086000000.0,858930000.0,,631150000.0,1337000000.0,149980000.0,721090000.0,265890000.0,4596700000.0,1278900000.0,435860000.0,13410000000.0,27760000.0,87124000.0,,458900000.0,858060000.0,704180000.0,1415700000.0,187540000.0,279350000.0,1706800000.0,233930000.0,7674400000.0,329900000.0,190330000.0,4005500000.0,116980000.0,15495000.0,57531000.0,784270000.0,32068000.0,3696900000.0,129810000.0,84037000000.0,,211740000.0,99351000.0,717940000.0,59564000.0,609790000.0,,,74848000.0,791610000.0,,486620000.0,155990000.0,990450000.0,165240000.0,25773000.0,,2266300000.0,3332700000.0,6299700000.0,135160000.0,619720000.0,1992100000.0,2399700000.0,175590000.0,2662300000.0,121930000.0,1183700000.0,3869400000.0,55012000.0,,171790000.0,25740000000.0,103080000.0,67035000.0,193730000000.0,68197000.0,760220000.0,2296000000.0,619610000.0,,154170000.0,733490000.0,672620000.0,2436800000.0,1219500000.0,523120000.0,18541000.0,283780000.0,41076000.0,185930000.0,80195000.0,192050000.0,68340000.0,61410000.0,1476800000.0,272240000.0,1637400000.0,62025000.0,32572000.0,685070000.0,41503000.0,121840000.0,,272370000.0,1028400000.0,60642000.0,58256000000.0,93905000.0,1276000000.0,152090000.0,13482000.0,741870000.0,1153200000.0,225480000.0,168330000.0,716300000.0,1637000000.0,63893000.0,37966000.0,148200000.0,256970000.0,63575000.0,,385570000.0,672410000.0,575820000.0,689340000.0,,506290000.0,1102100000.0,239780000.0,4075400000.0,1205900000.0,581420000.
0,2167300000.0,212030000.0,38825000.0,42891000.0,1494300000.0,17199000000.0,30596000.0,5622000000.0,537920000.0,342180000.0,248910000.0,297790000.0,1140100000.0,47740000.0,1109100000.0,318980000.0,40020000.0,1087300000.0,172680000.0,,212850000.0,3391900000.0,129080000.0,719610000.0,332810000.0,290970000.0,195110000.0,517040000.0,,889360000.0,11908000000.0,2391500000.0,484240000.0,69790000.0,345580000.0,73200000.0,429250000.0,32705000.0,1369300000.0,1753700000.0,211680000.0,,1950400000.0,6681700000.0,585320000.0,574460000.0, diff --git a/project/data/dev_datasets/HeLa_6070/protein_groups_wide_N50_M454.csv b/project/data/dev_datasets/HeLa_6070/protein_groups_wide_N50_M454.csv new file mode 100644 index 000000000..627406322 --- /dev/null +++ b/project/data/dev_datasets/HeLa_6070/protein_groups_wide_N50_M454.csv @@ -0,0 +1,51 @@ +Sample ID,COX6A1,GTF2A2,ALDOA,TRAPPC3,AXL,EYA3,HIST2H2AC;HIST2H2AA3,IGF2BP3,CRTAP,RNASEH2C,FCF1,DSCR3,INTS9,PRPF6,ARL6IP4,TMEM30A,COL14A1,EXOSC7,ARMCX3,MARCH5,KTN1,PPP1R13L,ARGLU1,NAT10,NRBP1,AKR1A1,TRMT2A,SNRPF,VWA9,GNAI1,AK3,TXNL4A,PRPF8,BGN,CDKN2AIP,AP2A2,FASN,LRSAM1,MCM3,SERF2,KIF21A,HIGD2A,APEX1,CUL4A,NPLOC4,NCAPD2,RANBP9,CCBL2,RAB6A,VPS33A,SAP30BP,UBA52;UBC;UBB;RPS27A,RRP1B,AP2M1,TMEM245,ELP4,AURKA,AQR,LIG3,RPL27A,CDK9,THYN1,CASK,CLPP,MRPS28,FN3KRP,STRN4,ATP1A1,ALDH2,PPIF,KIF4A,ANO6,RPTOR,ATP6V1H,NELFCD;TH1L,CDCA7L,COASY,DDX52,ARAF,CKAP4,PITPNA,FAM91A1,RDH13,SCRN2,PRKCDBP,INTS7,MGMT,COQ9,SLC38A2,SLC7A1,CGGBP1,LLGL1,MFN1,SRPK2,MBD2,AP2S1,RBM17,DYNLL2,CRK,ALDH16A1,YBX1,SRM,ACTL6A,LRP1,SNF8,RPLP1,UBP1,WDR18,AFF4,MRPL32,HMGB2,HMGB3,Em:AP000351.3;GSTT2;GSTT2B,SMOC1,UTRN,PPP4C,RFX5,UBA6,SARNP,WNK1,RFC3,NQO2,NT5DC2,DDX17,SLC25A19,SLC16A7,YTHDF1,CNDP2,EIF4EBP2,EXOSC10,WBP2,ECT2,PPP4R2,TRUB1,TMEM65,ARHGAP35,THOC6,DKFZp686J1372;TPM3,TSNAX;DISC1,GNPAT,MSH3,PPP1R14B,PRMT7,SF3B2,RPAP1,MTFP1,MRPL19,UFSP2,DCTN3,MALT1,STAG2,TSC22D1,KIAA1143,S100A16,ATP5J,PMPCA,NUP153,CCDC12,YWHAH,MCU,HDLBP,UHRF2,NIFK,POLR2J3;POLR2J;POLR2J2,ASB6,RPS6KA3,ISOC1,UBE2Z,MRPL39
,NKRF,SCRN1,B4GALT1,WBSCR16,TMEM192,ANKZF1,SREK1IP1,RPL38,ITPR1,SEC23A,EPB41L1,C9orf64,PITHD1,PPP1R10,BYSL,COMMD5,CSTB,SETD3,TPR,COTL1,GPHN,NAE1,POLR2C,TMEM261,PSMD5,CENPH,EXOC1,GOT1,LUC7L3,PAPSS1,SMAP;C11orf58,DYNLRB1,KIAA1279,SEPT2,PRRC2B,SLC25A5,HOOK3,CHCHD4,C5orf51,FNTA,HSPG2,SURF6,GLE1,ELAVL1,PTGFRN,BRAT1,TUBG1;TUBG2,GSTM1,HSP90AB2P,USP5,GGA2,NEK7,RRP12,ADAM10,NR2C2AP,CFL1,PROCR,POLE4,OCRL,ATF7,SGSH,C16orf62,OXSR1,MRPS25,PAXBP1,RTN1,BCAT1,LRRC16A,PTRH2,CASP2,LARP4B,UBE4A,RPP38,WDR36,TRIP12,NUP107,DNM1,POLA1,VCP,GRSF1,TUFM,PROSC,KNOP1,ATF7IP,DCTN4,RNF114,NUDT9,P4HA1,NONO,NSMCE2,PIGS,NUP62,VPS13C,NEGR1,IPO7,DNAJB1,GMPR2,PAAF1,CHRAC1,NDUFAF4,CLIC1,LAMTOR1,WDR61,PGK1,PLS3,GBF1,MPP6,TNPO1,ACTR3,MTOR,VPS11,LIMCH1,CHAMP1,EIF3C;EIF3CL,CARM1,MTHFD2,ELOF1,JUNB,EPN1,DBN1,SNX3,PSMD8,HEATR6,TXN,GPN3,RPAP3,HSPH1,ZNF706,AHSA1,ABCB6,SUMO3,SF3A1,INPP1,TWISTNB,NANS,RPRD1A,MFAP1,DDX19A;DDX19B,PABPC1;PABPC3,MRPS9,ERAP1,CASC4,DTD1,CCDC58,TMED5,ATP13A1,CRTC3,DNAJC10,GAR1,SYNCRIP,ACAD9,RAB1B;RAB1C,GTF2I,COPS4,TRIM28,TMSB4X,NVL,CD99,PIH1D1,MAGT1,GCDH,EIF1B,PLEKHA2,SLC35A4,IPO11,AAR2,CCT3,GRPEL1,SYNGR2,ABHD11,COPG1,CSE1L,PPP2CA;PPP2CB,SLC7A5,TRMT10C,VPS29,COX6C,CTSD,MRE11A,DENR,ACTN1,SNRNP70,FARP1,RBM34,PTPN1,SH3BGRL3,NPEPPS,FAM195B,ARPC1A,NACC1,TRMT5,PXN,VTA1,ZC3H7B,ERP29,MRPS26,PRMT3,SEPT7,PDCD5,NPM3,RCN2,SMARCA1,ITGB1,GNG12,RTCA,SPG21,RAB35,MRPL51,GABARAPL2,PPAN;PPAN-P2RY11,EMC1,ATP5SL,LIPA,DNAJA4,LSM12,MRPL12,CAPZA1,RPS5,NIT2,NIPSNAP3A,DHX8,RPL18A,TNFAIP8,FHL3,MRPS35,PRPF38B,DPM3,UTP3,FKBP14,LSG1,FLOT2,EDC4,WDR33,ZNF326,KDM1A,SUCLG2,NRD1,DUSP12,SNAP29,GEMIN5,PPP3R1,IWS1,TIMM9,DNAJB4,DERL2,RPS2,SNX2,BPI,SMG8,DDAH2,BDH2,FHL1,POLR3A,LCMT1,POLR1A,ATIC,AVEN,TIMM23;TIMM23B,FLYWCH2,INF2,FAM21A;FAM21C,POR,SDF2L1,RPS7,PRKAA1,HSPA4,NDUFAF3,PSMD14,HN1L,FDXR,PTMS,TRAF2,CCDC50,PARVA,EIF4G1,MED4,DDX46,PODXL,RBM22,YIF1B,CSNK1A1,YY1,VWA8,IKBKB,CHMP1A 
+2019_12_18_14_35_Q-Exactive-HF-X-Orbitrap_6070,,,26334000000.0,221750000.0,,,,1144400000.0,385150000.0,68173000.0,60790000.0,,79071000.0,1484700000.0,52205000.0,105920000.0,,467030000.0,,100640000.0,2185500000.0,,125610000.0,2027600000.0,276210000.0,943060000.0,17604000.0,1060000000.0,82020000.0,21166000.0,84744000.0,64074000.0,4929300000.0,,141580000.0,182420000.0,27789000000.0,190830000.0,3159300000.0,,530620000.0,183330000.0,3389000000.0,928040000.0,431780000.0,1480000000.0,93783000.0,,,92406000.0,70097000.0,,827630000.0,462180000.0,17610000.0,38120000.0,56965000.0,181260000.0,430840000.0,4121600000.0,116440000.0,189160000.0,7352400.0,224970000.0,114360000.0,13363000.0,,4492000000.0,,663110000.0,148920000.0,12799000.0,38190000.0,132860000.0,216050000.0,,1400200000.0,278920000.0,9964800.0,2768600000.0,339280000.0,207350000.0,283660000.0,,,117080000.0,115610000.0,301030000.0,580420000.0,104190000.0,19964000.0,,132660000.0,231900000.0,31194000.0,84635000.0,254910000.0,33217000.0,268690000.0,90541000.0,5567800000.0,1920800000.0,677270000.0,,31860000.0,5298900000.0,,321070000.0,,57241000.0,490230000.0,99739000.0,,,,157840000.0,,279780000.0,1302800000.0,218170000.0,350940000.0,671270000.0,54131000.0,2756100000.0,111020000.0,,31864000.0,821880000.0,,617540000.0,,112580000.0,179750000.0,38637000.0,23485000.0,,247130000.0,8341600000.0,,14757000.0,44890000.0,312660000.0,93890000.0,2747000000.0,13171000.0,,192110000.0,36644000.0,14188000.0,,579530000.0,,125350000.0,122060000.0,382260000.0,1626700000.0,294490000.0,230250000.0,818890000.0,427270000.0,2111400000.0,,502040000.0,129440000.0,35344000.0,208710000.0,291540000.0,179620000.0,344880000.0,378680000.0,125520000.0,20713000.0,86122000.0,,34397000.0,,526750000.0,222710000.0,520620000.0,,90622000.0,12408000.0,65200000.0,727830000.0,,3678200000.0,70791000.0,3476600000.0,1744500000.0,85923000.0,234520000.0,88669000.0,98157000.0,1339900000.0,34628000.0,,1831600000.0,650750000.0,615110000.0,156080000.0,312840000.0,273110000.0,
863560000.0,15057000.0,876150000.0,8712400.0,172310000.0,96523000.0,95068000.0,,326240000.0,111400000.0,2171900000.0,,289090000.0,685770000.0,,1617600000.0,1626600000.0,38034000.0,52534000.0,2162300000.0,109160000.0,42565000.0,11331000000.0,106720000.0,6877800.0,27780000.0,42155000.0,,,799620000.0,193880000.0,85313000.0,,,48206000.0,403990000.0,52727000.0,209860000.0,252070000.0,,1080400000.0,86757000.0,514310000.0,,136570000.0,10310000000.0,1479600000.0,10372000000.0,202380000.0,268430000.0,25882000.0,142650000.0,270160000.0,143990000.0,745050000.0,4287800000.0,185700000.0,214700000.0,724620000.0,,,4601700000.0,1212400000.0,,65495000.0,114720000.0,423050000.0,4299600000.0,575210000.0,340880000.0,17028000000.0,5655900000.0,679540000.0,203340000.0,3239900000.0,879960000.0,283360000.0,53465000.0,300860000.0,188570000.0,2472200000.0,213530000.0,643410000.0,43435000.0,,131230000.0,480020000.0,786620000.0,348060000.0,47282000.0,16340000000.0,21708000.0,570490000.0,8015200000.0,,3479000000.0,23265000.0,1529900000.0,2149300000.0,21005000.0,29487000.0,662490000.0,102670000.0,147250000.0,,,330350000.0,69806000.0,32472000.0,53678000.0,209520000.0,252970000.0,203950000.0,25913000.0,,191630000.0,8304000000.0,390910000.0,,2243800000.0,589610000.0,5356800000.0,,181990000.0,76059000.0,141330000.0,,43935000.0,,,92837000.0,446890000.0,159370000.0,12399000000.0,677760000.0,25285000.0,55043000.0,2104700000.0,9770700000.0,,1272300000.0,543670000.0,1048300000.0,733370000.0,1788800000.0,198480000.0,250740000.0,1918400000.0,1631500000.0,,191910000.0,443930000.0,254800000.0,2936700000.0,,26182000.0,115240000.0,39474000.0,127290000.0,286620000.0,5061300.0,1874400000.0,64267000.0,309810000.0,784620000.0,2530100000.0,1646600000.0,424620000.0,,433780000.0,291400000.0,105140000.0,134350000.0,147640000.0,43519000.0,165670000.0,,653610000.0,,46199000.0,144810000.0,,1061900000.0,1076800000.0,9974800000.0,236750000.0,27791000.0,135280000.0,3703500000.0,,49433000.0,190160000.0,18180000.0,106060000.0
,99003000.0,,225930000.0,323250000.0,1008500000.0,47671000.0,453280000.0,296050000.0,575790000.0,1176400000.0,180840000.0,37935000.0,2067900000.0,75756000.0,101840000.0,140510000.0,319900000.0,,8173100000.0,319530000.0,,,,,20405000.0,110760000.0,34667000.0,355350000.0,4352000000.0,,,492470000.0,406360000.0,,2293100000.0,165950000.0,5462300000.0,155530000.0,8756500000.0,17358000.0,691890000.0,586910000.0,56763000.0,938680000.0,197980000.0,89162000.0,29172000.0,7329000000.0,62928000.0,2495800000.0,368600000.0,23691000.0,17144000.0,,,62607000.0,,86827000.0 +2019_12_19_19_48_Q-Exactive-HF-X-Orbitrap_6070,,18895000.0,25799000000.0,212240000.0,,7106500.0,,851160000.0,307160000.0,,10275000.0,,78610000.0,1061900000.0,58924000.0,51937000.0,,293740000.0,,15236000.0,1610100000.0,,,1806000000.0,201490000.0,1125700000.0,85676000.0,685910000.0,139540000.0,33583000.0,44735000.0,,4389900000.0,,,81286000.0,21593000000.0,109620000.0,3231800000.0,64888000.0,365470000.0,,1981600000.0,702990000.0,407390000.0,1296200000.0,107660000.0,76180000.0,,58014000.0,25491000.0,,627900000.0,215720000.0,14620000.0,7923700.0,,258510000.0,246620000.0,4191400000.0,123290000.0,176260000.0,,125430000.0,99765000.0,19069000.0,,3407500000.0,,642300000.0,55773000.0,,,28994000.0,151860000.0,,1066800000.0,133300000.0,11477000.0,2302700000.0,202860000.0,95056000.0,133920000.0,,,73293000.0,45697000.0,99981000.0,151890000.0,23534000.0,89442000.0,,39750000.0,119280000.0,,45901000.0,186630000.0,23543000.0,61948000.0,46133000.0,4981900000.0,1357300000.0,867130000.0,,13738000.0,4214900000.0,,176850000.0,35347000.0,36035000.0,236420000.0,187370000.0,,,126790000.0,84625000.0,,230400000.0,785050000.0,102960000.0,587430000.0,667780000.0,98577000.0,1696200000.0,148400000.0,19318000.0,,847560000.0,27711000.0,402440000.0,112590000.0,63925000.0,134790000.0,101780000.0,32324000.0,,80741000.0,,34809000.0,,,,77274000.0,2531000000.0,,,162330000.0,10847000.0,19522000.0,,296770000.0,,78497000.0,54029000.0,619090000.0,393900000.0,1
05030000.0,190240000.0,547080000.0,217580000.0,1696200000.0,,345940000.0,77764000.0,,103520000.0,286140000.0,143430000.0,223490000.0,460410000.0,97341000.0,7079000.0,,,,32392000.0,688000000.0,71685000.0,416460000.0,,41625000.0,118060000.0,197110000.0,514580000.0,,2747500000.0,77397000.0,3675600000.0,1579100000.0,43782000.0,246660000.0,162290000.0,,1376700000.0,16447000.0,22706000.0,1952100000.0,639360000.0,95372000.0,,293020000.0,266570000.0,681040000.0,,8910800000.0,,86411000.0,,82767000.0,,265790000.0,76580000.0,1582700000.0,,202100000.0,617870000.0,,1559200000.0,1116400000.0,31010000.0,121950000.0,820530000.0,63901000.0,,10486000000.0,247200000.0,11794000.0,,,,,679430000.0,248980000.0,136870000.0,,,52239000.0,296570000.0,27587000.0,54200000.0,104990000.0,37202000.0,683450000.0,65906000.0,367650000.0,,380980000.0,8372400000.0,718490000.0,9445700000.0,37307000.0,170240000.0,18889000.0,20468000.0,300950000.0,49774000.0,578590000.0,3715000000.0,60033000.0,127780000.0,304580000.0,,,2685400000.0,1482000000.0,,62481000.0,93776000.0,303900000.0,3432000000.0,420100000.0,444750000.0,16192000000.0,5427800000.0,453310000.0,159580000.0,3152700000.0,1358700000.0,121030000.0,35764000.0,131880000.0,156990000.0,2305900000.0,432330000.0,482370000.0,43066000.0,,86293000.0,406390000.0,530620000.0,295110000.0,7799200.0,14762000000.0,21721000.0,587010000.0,8834700000.0,,2636200000.0,14355000.0,,1797900000.0,14876000.0,89476000.0,512530000.0,102420000.0,164530000.0,,6872300000.0,182030000.0,21381000.0,24774000.0,,122420000.0,244700000.0,127080000.0,,31977000.0,271740000.0,5239500000.0,363320000.0,534860000.0,1619000000.0,514610000.0,4536200000.0,,45655000.0,35976000.0,189700000.0,,18261000.0,560540000.0,,141780000.0,337070000.0,155320000.0,9868600000.0,812420000.0,17314000.0,,970470000.0,7940700000.0,,825600000.0,281450000.0,742940000.0,250730000.0,1372200000.0,205150000.0,168370000.0,1667200000.0,1383100000.0,,188950000.0,337910000.0,180080000.0,2200400000.0,,,140390000.0,19930000.0,6
3526000.0,171170000.0,,1753200000.0,64440000.0,478730000.0,777760000.0,1416500000.0,1082800000.0,461270000.0,,248020000.0,234560000.0,116760000.0,160550000.0,171830000.0,,200350000.0,,456490000.0,9093700.0,,371540000.0,355270000.0,660440000.0,1257000000.0,10190000000.0,284290000.0,,70695000.0,3820000000.0,,22272000.0,133810000.0,54372000.0,67146000.0,94044000.0,23693000.0,193800000.0,317720000.0,690440000.0,12891000.0,352490000.0,317350000.0,308550000.0,1245100000.0,158840000.0,37577000.0,1628000000.0,42496000.0,92446000.0,94608000.0,266720000.0,,5687200000.0,599260000.0,,,,15850000.0,,15120000.0,,284370000.0,3031100000.0,,,459310000.0,248860000.0,87344000.0,1480300000.0,201540000.0,3689900000.0,199630000.0,8510000000.0,70664000.0,566900000.0,461440000.0,9539800.0,697660000.0,106090000.0,134640000.0,9434300.0,7405100000.0,,1991600000.0,392400000.0,52280000.0,,,,,,137020000.0 +2019_12_20_14_15_Q-Exactive-HF-X-Orbitrap_6070,,,37574000000.0,219620000.0,,10807000.0,,1179000000.0,603980000.0,73130000.0,24540000.0,,145590000.0,1582800000.0,122650000.0,74671000.0,,525480000.0,,124150000.0,2832500000.0,81121000.0,244290000.0,2451900000.0,468920000.0,1932700000.0,84573000.0,972450000.0,209560000.0,49938000.0,49279000.0,74214000.0,6663200000.0,,,156290000.0,33997000000.0,128720000.0,5264100000.0,160950000.0,702570000.0,48997000.0,4517400000.0,1454700000.0,582730000.0,2103900000.0,152270000.0,184420000.0,1012500000.0,279930000.0,77059000.0,31324000.0,1122300000.0,445060000.0,50622000.0,30115000.0,41793000.0,421740000.0,506880000.0,5328600000.0,71763000.0,213570000.0,41180000.0,204090000.0,,34521000.0,,5097700000.0,,1082100000.0,198790000.0,,35980000.0,230410000.0,271080000.0,8859000.0,1512500000.0,171590000.0,90496000.0,3804900000.0,283850000.0,230250000.0,356550000.0,,,110360000.0,54701000.0,234170000.0,231560000.0,,35889000.0,18390000.0,83811000.0,255540000.0,44427000.0,87004000.0,779570000.0,35246000.0,276240000.0,89094000.0,5389900000.0,2248600000.0,1387100000.0,,36440000.
0,6548300000.0,,338610000.0,7773200.0,12266000.0,433850000.0,407780000.0,,,,67643000.0,,522220000.0,972350000.0,355330000.0,914720000.0,838830000.0,121520000.0,3237000000.0,152250000.0,125570000.0,47388000.0,1021800000.0,17757000.0,793530000.0,,71424000.0,193550000.0,246510000.0,51000000.0,27745000.0,165910000.0,,133970000.0,,13868000.0,,77832000.0,4100800000.0,9012900.0,13763000.0,315170000.0,16503000.0,50727000.0,14572000.0,589340000.0,,122450000.0,108520000.0,860430000.0,718650000.0,418350000.0,218530000.0,1208200000.0,428640000.0,2513400000.0,,533030000.0,60865000.0,46728000.0,198540000.0,379330000.0,201260000.0,305710000.0,646590000.0,173370000.0,,148650000.0,38752000.0,66071000.0,53245000.0,942820000.0,118000000.0,915220000.0,,136230000.0,251260000.0,83201000.0,880190000.0,85969000.0,3814800000.0,128720000.0,4705400000.0,2303300000.0,67089000.0,338660000.0,355480000.0,,2016300000.0,28031000.0,18209000.0,2098000000.0,1018000000.0,658910000.0,625610000.0,215430000.0,541580000.0,1575600000.0,26923000.0,1749200000.0,27711000.0,183440000.0,93874000.0,114370000.0,,303660000.0,13625000.0,2181000000.0,,356670000.0,556150000.0,,2327600000.0,1689600000.0,68125000.0,115180000.0,1789700000.0,88722000.0,44243000.0,18268000000.0,110780000.0,24125000.0,21149000.0,,,,805400000.0,309370000.0,259910000.0,,,143010000.0,349620000.0,112020000.0,141000000.0,194990000.0,57279000.0,1585900000.0,132680000.0,436110000.0,,382120000.0,13261000000.0,1550000000.0,14782000000.0,264130000.0,268360000.0,,72947000.0,419760000.0,125000000.0,758300000.0,4840100000.0,54401000.0,66289000.0,661580000.0,,,5118800000.0,2613900000.0,,53745000.0,119670000.0,408010000.0,5250700000.0,642060000.0,486820000.0,20813000000.0,8103700000.0,777100000.0,295630000.0,4491700000.0,1762600000.0,1224300000.0,34370000.0,371130000.0,165410000.0,3846300000.0,301750000.0,784970000.0,68408000.0,28924000.0,110290000.0,439780000.0,1390200000.0,745630000.0,29188000.0,16046000000.0,42397000.0,930310000.0,11328000000.0,2222900
0.0,2293900000.0,121170000.0,,2724000000.0,35097000.0,75716000.0,938990000.0,138130000.0,540280000.0,,,415350000.0,,30786000.0,60269000.0,287780000.0,348150000.0,410340000.0,17363000.0,99146000.0,445470000.0,9791500000.0,503330000.0,779930000.0,2499900000.0,1036000000.0,7051000000.0,1216600000.0,173870000.0,117710000.0,456430000.0,,32859000.0,944410000.0,,191650000.0,439940000.0,174350000.0,16764000000.0,1426100000.0,,69261000.0,2512200000.0,12612000000.0,,959830000.0,536660000.0,926200000.0,282360000.0,1977700000.0,517460000.0,168570000.0,2556900000.0,1562400000.0,,366430000.0,732170000.0,530800000.0,3741600000.0,,31122000.0,263250000.0,58119000.0,119080000.0,292960000.0,,2456200000.0,41320000.0,425080000.0,1475400000.0,3172000000.0,2472300000.0,352800000.0,,675270000.0,284980000.0,242630000.0,307500000.0,212170000.0,,178810000.0,,1062500000.0,14035000.0,,510730000.0,136820000.0,,1572500000.0,14699000000.0,494780000.0,31168000.0,186920000.0,5294600000.0,,62688000.0,303420000.0,63097000.0,237380000.0,152930000.0,54094000.0,472570000.0,567090000.0,1019400000.0,122900000.0,620220000.0,530330000.0,747080000.0,2035400000.0,267760000.0,49334000.0,2746800000.0,51894000.0,171330000.0,,496100000.0,,7954500000.0,764090000.0,,25302000.0,,21088000.0,16363000.0,114730000.0,,387930000.0,5293900000.0,32998000.0,,931610000.0,465970000.0,107530000.0,2521600000.0,167390000.0,6625100000.0,230410000.0,12939000000.0,108320000.0,1032800000.0,646510000.0,12506000.0,863670000.0,51787000.0,71959000.0,152210000.0,9460400000.0,48333000.0,3229400000.0,382560000.0,169700000.0,,428090000.0,,,,202130000.0 
+2019_12_27_12_29_Q-Exactive-HF-X-Orbitrap_6070,,,21600000000.0,168920000.0,,10708000.0,,675300000.0,358290000.0,,8862700.0,,68743000.0,943780000.0,25976000.0,10056000.0,,263320000.0,,29444000.0,1380000000.0,30511000.0,,1126200000.0,240850000.0,848490000.0,57977000.0,623480000.0,144550000.0,23748000.0,5922200.0,98885000.0,3489500000.0,,9441700.0,44792000.0,20047000000.0,49293000.0,3041200000.0,78455000.0,309300000.0,11667000.0,1998800000.0,782460000.0,316010000.0,967410000.0,64362000.0,86046000.0,590300000.0,58481000.0,51405000.0,,552190000.0,373360000.0,14755000.0,,13007000.0,333050000.0,321800000.0,3295500000.0,81163000.0,155400000.0,,151510000.0,89858000.0,31072000.0,,2632400000.0,,466150000.0,58425000.0,,15024000.0,68988000.0,144240000.0,,1084100000.0,78558000.0,54295000.0,2379900000.0,210500000.0,91463000.0,172070000.0,,,66783000.0,,140350000.0,157770000.0,18584000.0,34422000.0,,12348000.0,46083000.0,,,442470000.0,10714000.0,131870000.0,44599000.0,3724100000.0,1393900000.0,688330000.0,,13825000.0,3769800000.0,,314410000.0,,,241180000.0,194700000.0,,,,58621000.0,,232290000.0,818560000.0,170920000.0,537560000.0,456940000.0,59202000.0,1535200000.0,53567000.0,11390000.0,27278000.0,504440000.0,16527000.0,356480000.0,43036000.0,43108000.0,135270000.0,83542000.0,26158000.0,11542000.0,83713000.0,,,,13414000.0,,61298000.0,2371400000.0,27313000.0,,124490000.0,,,,259180000.0,,92193000.0,58719000.0,453420000.0,624270000.0,247750000.0,181780000.0,630530000.0,282030000.0,1406000000.0,,389790000.0,43342000.0,64743000.0,107000000.0,209980000.0,136570000.0,137280000.0,398060000.0,137890000.0,30806000.0,,,17893000.0,,641600000.0,66951000.0,244950000.0,,69863000.0,128010000.0,86941000.0,503040000.0,47677000.0,2300300000.0,63519000.0,2784500000.0,1091500000.0,33949000.0,215410000.0,167690000.0,,813320000.0,,22141000.0,1495600000.0,540160000.0,236600000.0,,101510000.0,231990000.0,802540000.0,,10325000000.0,,115890000.0,96874000.0,48342000.0,,138600000.0,8902600.0,1831700000.0,,1775
40000.0,449560000.0,,1242000000.0,795100000.0,,68873000.0,1050400000.0,94638000.0,,9701600000.0,226660000.0,12666000.0,,,,,439070000.0,149630000.0,112900000.0,651280000.0,,49724000.0,282220000.0,16495000.0,71429000.0,60389000.0,,757530000.0,50587000.0,220380000.0,,142650000.0,6755100000.0,768020000.0,8230100000.0,103280000.0,190650000.0,18541000.0,81463000.0,280770000.0,46626000.0,426580000.0,3143400000.0,61513000.0,105450000.0,302620000.0,,,2830200000.0,1241200000.0,10373000.0,40832000.0,69435000.0,357630000.0,3212400000.0,321370000.0,220490000.0,12557000000.0,4138200000.0,458810000.0,185400000.0,2573200000.0,863600000.0,107300000.0,,183460000.0,104160000.0,2097900000.0,279690000.0,415480000.0,33269000.0,5847600.0,23208000.0,381540000.0,803870000.0,264290000.0,9206800.0,6841400000.0,33659000.0,388860000.0,6724000000.0,,1979400000.0,17333000.0,,1335900000.0,,13313000.0,242470000.0,,207440000.0,,,185100000.0,,18919000.0,19374000.0,134080000.0,201670000.0,91558000.0,28168000.0,42346000.0,210860000.0,4927100000.0,249410000.0,334080000.0,1551900000.0,580940000.0,4450700000.0,886350000.0,99259000.0,44684000.0,41819000.0,,19693000.0,10538000.0,,130050000.0,234440000.0,,10849000000.0,643450000.0,15507000.0,28972000.0,1237500000.0,16151000000.0,,618090000.0,175970000.0,640880000.0,320570000.0,1481600000.0,253170000.0,35090000.0,1070800000.0,958340000.0,,186540000.0,311210000.0,202240000.0,1416000000.0,,,144620000.0,22096000.0,55356000.0,154720000.0,,1419900000.0,44040000.0,214030000.0,612080000.0,1768300000.0,1180300000.0,610080000.0,,387920000.0,146020000.0,59719000.0,101950000.0,110330000.0,,72346000.0,,508750000.0,7963300.0,,360410000.0,65397000.0,,1158100000.0,8589500000.0,222520000.0,,83135000.0,3691000000.0,,,177840000.0,54776000.0,101290000.0,53776000.0,45304000.0,168390000.0,391710000.0,666790000.0,15823000.0,295570000.0,181140000.0,413050000.0,1163700000.0,150620000.0,16500000.0,1666400000.0,50429000.0,148700000.0,80814000.0,268680000.0,,4496600000.0,387890000.0,,,
,,,38959000.0,13805000.0,134190000.0,2946800000.0,,87427000.0,525470000.0,268450000.0,63110000.0,1480100000.0,144000000.0,3797400000.0,111440000.0,7118100000.0,47609000.0,549010000.0,398760000.0,17239000.0,556740000.0,65526000.0,56141000.0,49162000.0,5774200000.0,67555000.0,1466200000.0,168170000.0,67976000.0,,,,,,90780000.0 +2019_12_29_15_06_Q-Exactive-HF-X-Orbitrap_6070,83858000.0,,60643000000.0,249130000.0,41863000.0,,44597000.0,1600200000.0,1155800000.0,,17399000.0,,64304000.0,1994100000.0,190650000.0,57645000.0,27482000.0,384310000.0,159830000.0,33560000.0,5161700000.0,210570000.0,338740000.0,1708700000.0,99922000.0,1986600000.0,12935000.0,1169200000.0,46625000.0,312490000.0,620830000.0,131330000.0,8924000000.0,280740000.0,310820000.0,195480000.0,39300000000.0,82297000.0,4211900000.0,489920000.0,43720000.0,208860000.0,3742200000.0,1563300000.0,551490000.0,2127000000.0,77220000.0,142560000.0,,57675000.0,150470000.0,,539760000.0,1411800000.0,,,94465000.0,666690000.0,165960000.0,6026600000.0,115890000.0,291240000.0,157320000.0,465730000.0,280330000.0,416120000.0,23030000.0,4663700000.0,616640000.0,1028600000.0,,30731000.0,35123000.0,71783000.0,64618000.0,165200000.0,149600000.0,156520000.0,21390000.0,4884700000.0,435520000.0,44620000.0,34336000.0,,110030000.0,54551000.0,38415000.0,20404000.0,52016000.0,56561000.0,111970000.0,68780000.0,,90738000.0,68940000.0,234490000.0,239360000.0,66361000.0,480330000.0,160610000.0,5838300000.0,1896800000.0,1202900000.0,210080000.0,116670000.0,6658900000.0,,291810000.0,,126540000.0,3952700000.0,869250000.0,94459000.0,93881000.0,119250000.0,218660000.0,,1384600000.0,2201400000.0,501050000.0,2059800000.0,180950000.0,279570000.0,3683100000.0,40325000.0,35605000.0,,1064100000.0,43005000.0,967240000.0,161000000.0,,317510000.0,,30016000.0,103900000.0,185450000.0,27218000000.0,546500000.0,52989000.0,32034000.0,253240000.0,69481000.0,4269700000.0,6558300.0,,313200000.0,111370000.0,303120000.0,28303000.0,823170000.0,,267330000.0,587630000
.0,1070500000.0,886770000.0,515940000.0,46449000.0,1644100000.0,189060000.0,4102800000.0,,486070000.0,119720000.0,20176000.0,232430000.0,152220000.0,422700000.0,409530000.0,210980000.0,1004900000.0,1006000000.0,,,79489000.0,76479000.0,2204500000.0,309670000.0,1473700000.0,422940000.0,208200000.0,112150000.0,117780000.0,545540000.0,35629000.0,3231200000.0,114620000.0,4862700000.0,2705100000.0,282820000.0,380640000.0,233600000.0,69059000.0,1801600000.0,68354000.0,33454000.0,1592100000.0,1220400000.0,1827000000.0,,758760000.0,300050000.0,2954100000.0,27283000.0,4798600000.0,,175540000.0,148710000.0,113180000.0,,112700000.0,30303000.0,3452500000.0,329230000.0,67030000.0,716300000.0,2118400000.0,2579900000.0,2544800000.0,,222380000.0,1110200000.0,59416000.0,87502000.0,55180000000.0,64635000.0,60879000.0,45996000.0,,,135270000.0,1104800000.0,381190000.0,,,640690000.0,,495300000.0,31438000.0,63124000.0,19262000.0,43170000.0,695700000.0,408940000.0,1539000000.0,616270000.0,477770000.0,17735000000.0,725280000.0,12209000000.0,104140000.0,,51937000.0,111140000.0,196360000.0,155960000.0,1173600000.0,14208000000.0,,270520000.0,720450000.0,,,4719000000.0,1635800000.0,53417000.0,52976000.0,143510000.0,93723000.0,6285200000.0,408500000.0,711970000.0,54883000000.0,8959000000.0,268580000.0,57835000.0,5272500000.0,2125400000.0,415410000.0,,742960000.0,370270000.0,3728900000.0,577480000.0,217540000.0,,38063000.0,199890000.0,2742900000.0,318760000.0,799180000.0,17391000.0,18228000000.0,47473000.0,194560000.0,5586800000.0,,2374300000.0,,179370000.0,3930100000.0,12987000.0,66535000.0,715920000.0,204580000.0,400870000.0,20021000.0,,349060000.0,,,89850000.0,312720000.0,132570000.0,226990000.0,,414430000.0,343430000.0,6719800000.0,553940000.0,,3516000000.0,2268900000.0,11338000000.0,23303000000.0,36682000.0,144750000.0,60922000.0,84881000.0,328320000.0,849540000.0,126540000.0,292340000.0,333100000.0,96217000.0,14918000000.0,852200000.0,36018000.0,,3225200000.0,7944700000.0,,313650000.0,89027
0000.0,1302800000.0,1098000000.0,1422900000.0,772560000.0,132860000.0,9482900000.0,2727600000.0,209190000.0,103080000.0,446240000.0,2085000000.0,6376700000.0,,304790000.0,72813000.0,52122000.0,302630000.0,526370000.0,47869000.0,3343400000.0,158590000.0,90708000.0,4714700000.0,2081400000.0,310720000.0,206870000.0,331460000.0,6015000000.0,1139500000.0,33945000.0,,347770000.0,69281000.0,236040000.0,309830000.0,757810000.0,,74815000.0,,399820000.0,,4602100000.0,9551500000.0,667030000.0,16258000.0,139830000.0,9081200000.0,68782000.0,172340000.0,173640000.0,126030000.0,,78253000.0,203620000.0,156210000.0,843210000.0,505040000.0,118280000.0,611860000.0,540720000.0,1098600000.0,1345000000.0,114010000.0,36388000.0,1351600000.0,323820000.0,206560000.0,243410000.0,295620000.0,,15200000000.0,516030000.0,76403000.0,,103560000.0,166150000.0,1215600000.0,,74841000.0,197650000.0,8346700000.0,,271280000.0,360200000.0,353820000.0,,1351500000.0,156490000.0,12912000000.0,542900000.0,11527000000.0,,1075400000.0,771910000.0,161680000.0,7088500000.0,60722000.0,81898000.0,396440000.0,6842700000.0,22252000.0,2376300000.0,1198300000.0,297470000.0,34130000.0,536240000.0,218600000.0,18987000.0,79412000.0,214250000.0 
+2019_12_29_18_18_Q-Exactive-HF-X-Orbitrap_6070,192680000.0,,63093000000.0,214030000.0,100020000.0,7181400.0,69400000.0,2347900000.0,1239100000.0,25611000.0,45509000.0,28360000.0,90707000.0,2150700000.0,138250000.0,,5683800.0,602040000.0,141810000.0,57163000.0,5637800000.0,160630000.0,400490000.0,2342500000.0,256450000.0,2348300000.0,,1416300000.0,58032000.0,419460000.0,511470000.0,45902000.0,10202000000.0,348320000.0,482220000.0,308740000.0,48391000000.0,127290000.0,5977600000.0,458820000.0,41246000.0,175410000.0,3580200000.0,3274100000.0,746940000.0,2063300000.0,,154030000.0,,98614000.0,237230000.0,,674410000.0,1805100000.0,,48800000.0,159050000.0,728070000.0,158580000.0,7651800000.0,193510000.0,282480000.0,188360000.0,546800000.0,421830000.0,319140000.0,76270000.0,5353800000.0,749460000.0,1245500000.0,,62701000.0,36333000.0,141220000.0,101910000.0,24887000.0,464480000.0,78234000.0,,5284900000.0,438370000.0,62093000.0,120010000.0,,173150000.0,50112000.0,29309000.0,,66103000.0,56385000.0,85957000.0,,,136560000.0,89191000.0,315010000.0,231770000.0,73773000.0,596680000.0,113600000.0,6437000000.0,2767000000.0,1656700000.0,257120000.0,68449000.0,5280700000.0,,275110000.0,57954000.0,145310000.0,6861300000.0,1089300000.0,74135000.0,165970000.0,1164300000.0,336330000.0,,1404800000.0,2551500000.0,528620000.0,2526800000.0,335020000.0,304380000.0,4429800000.0,,40021000.0,61579000.0,1339400000.0,,1174400000.0,169530000.0,22201000.0,291790000.0,42341000.0,,312590000.0,189830000.0,,503330000.0,47709000.0,,314670000.0,105110000.0,5367700000.0,8410600.0,20042000.0,269240000.0,148380000.0,193600000.0,20802000.0,856380000.0,,266730000.0,781530000.0,1258300000.0,1346200000.0,698660000.0,62171000.0,2690500000.0,158760000.0,5333900000.0,15676000.0,628580000.0,161430000.0,,,128480000.0,518230000.0,555780000.0,419380000.0,859960000.0,843550000.0,,,45339000.0,67151000.0,2701700000.0,298560000.0,1666900000.0,360890000.0,222810000.0,170400000.0,138140000.0,636560000.0,,3675200000.0,2281000
00.0,6546300000.0,2484800000.0,498080000.0,479020000.0,345100000.0,57636000.0,2177300000.0,78302000.0,148070000.0,2228200000.0,1144300000.0,1873400000.0,,886240000.0,374330000.0,3624800000.0,17544000.0,20140000000.0,,281020000.0,364530000.0,177540000.0,,233050000.0,38201000.0,4895400000.0,318750000.0,117710000.0,1075200000.0,2812900000.0,2775700000.0,2926600000.0,98952000.0,360530000.0,1250200000.0,42191000.0,112400000.0,63651000000.0,,37824000.0,68847000.0,,,215820000.0,1098800000.0,393310000.0,,,706860000.0,33344000.0,638190000.0,44046000.0,82112000.0,35924000.0,38417000.0,1026400000.0,564620000.0,1274000000.0,887690000.0,608700000.0,20076000000.0,1057200000.0,16199000000.0,190040000.0,48826000.0,40930000.0,155920000.0,321750000.0,188570000.0,1443800000.0,16611000000.0,,203820000.0,1080000000.0,,94036000.0,5643000000.0,2138500000.0,15651000.0,122500000.0,189080000.0,139940000.0,8245700000.0,481220000.0,767030000.0,69083000000.0,9510600000.0,479400000.0,133300000.0,7666700000.0,3185800000.0,414840000.0,,896470000.0,368050000.0,4726400000.0,690930000.0,190160000.0,62727000.0,21713000.0,274230000.0,2964300000.0,349130000.0,1417000000.0,9763300.0,20441000000.0,34310000.0,344900000.0,7656100000.0,68892000.0,2302600000.0,,1976400000.0,4246600000.0,14699000.0,62976000.0,1161900000.0,211920000.0,322820000.0,45375000.0,,376830000.0,52454000.0,,160340000.0,298190000.0,148780000.0,159570000.0,77795000.0,660920000.0,455180000.0,6521200000.0,897640000.0,,3531700000.0,1884100000.0,15211000000.0,24464000000.0,73820000.0,274340000.0,128630000.0,251540000.0,517590000.0,941450000.0,54838000.0,321060000.0,545800000.0,126710000.0,18197000000.0,997160000.0,,,3924100000.0,10718000000.0,1514200000.0,3060100000.0,1174200000.0,1363700000.0,1030500000.0,1688100000.0,964870000.0,184320000.0,11080000000.0,2301400000.0,349260000.0,74333000.0,530120000.0,2013300000.0,7287100000.0,,2723800000.0,50487000.0,99353000.0,279320000.0,462780000.0,,3898700000.0,191910000.0,173900000.0,5041800000.0,2718
400000.0,699180000.0,365120000.0,412030000.0,6644300000.0,1351600000.0,179610000.0,,399010000.0,29575000.0,243100000.0,,832420000.0,,86780000.0,323010000.0,596330000.0,120100000.0,5827600000.0,8083700000.0,1135000000.0,,150480000.0,12378000000.0,152490000.0,261370000.0,217870000.0,188090000.0,,232230000.0,177610000.0,179030000.0,799360000.0,590080000.0,120990000.0,702500000.0,549110000.0,1217600000.0,1541600000.0,82087000.0,33247000.0,1681800000.0,378290000.0,305610000.0,306690000.0,385540000.0,,19517000000.0,708300000.0,,19015000.0,172350000.0,148560000.0,1695200000.0,,112200000.0,1431900000.0,9741600000.0,25694000.0,322950000.0,312940000.0,481420000.0,,1296500000.0,129360000.0,16093000000.0,217550000.0,12221000000.0,16413000.0,737510000.0,869300000.0,200290000.0,7220300000.0,90607000.0,195480000.0,437920000.0,8758600000.0,,3166500000.0,1453400000.0,480780000.0,,705720000.0,337660000.0,27467000.0,25029000.0,315990000.0 +2020_01_02_17_38_Q-Exactive-HF-X-Orbitrap_6070,45563000.0,,29261000000.0,,,4992300.0,77811000.0,537050000.0,349750000.0,,,,,304870000.0,,,18553000.0,58003000.0,,,1317200000.0,27618000.0,166870000.0,497300000.0,43407000.0,356540000.0,,388030000.0,26457000.0,39762000.0,165530000.0,,2439000000.0,74539000.0,57140000.0,52565000.0,16596000000.0,6814300.0,1807900000.0,,16153000.0,17149000.0,1477600000.0,321310000.0,119860000.0,522490000.0,11885000.0,,,,29358000.0,220840000.0,107370000.0,363870000.0,,,,118180000.0,31867000.0,1403700000.0,,,,71133000.0,98515000.0,79002000.0,,1418200000.0,179370000.0,225460000.0,,,,38533000.0,20058000.0,19411000.0,107580000.0,6265300.0,,1452500000.0,94667000.0,,12738000.0,,23890000.0,9298400.0,,79838000.0,173040000.0,9142800.0,,,,,29127000.0,53634000.0,74455000.0,,44962000.0,58698000.0,1669500000.0,640420000.0,185770000.0,63460000.0,,2684000000.0,,189800000.0,,9237300.0,1186800000.0,196610000.0,,,,84029000.0,,458360000.0,695500000.0,128180000.0,216980000.0,60205000.0,41033000.0,915150000.0,,,85210000.0,352910000.0,,301670000.
0,,,28346000.0,,,17257000.0,56695000.0,2469400000.0,221500000.0,9500300.0,,39310000.0,,766340000.0,,,70703000.0,,,12020000.0,34371000.0,37820000.0,,256940000.0,366940000.0,219160000.0,97595000.0,,950860000.0,93382000.0,1939200000.0,,79331000.0,,,,41724000.0,14640000.0,35559000.0,67653000.0,136310000.0,347480000.0,,,34551000.0,,292370000.0,73496000.0,227660000.0,41728000.0,43804000.0,,9340700.0,86709000.0,,955020000.0,59094000.0,1218500000.0,840840000.0,49290000.0,56271000.0,49448000.0,,764210000.0,,,526420000.0,174990000.0,348230000.0,,69275000.0,24730000.0,1428300000.0,,8088000000.0,14969000.0,34492000.0,91608000.0,46003000.0,,,,1733600000.0,,25667000.0,216600000.0,738280000.0,1169800000.0,680370000.0,,54399000.0,229900000.0,,39916000.0,26820000000.0,21924000.0,,,,,15295000.0,286030000.0,23702000.0,,,96236000.0,,299220000.0,,,,,92451000.0,65397000.0,194860000.0,20007000.0,80974000.0,8048700000.0,112020000.0,4535000000.0,,,,24911000.0,36132000.0,16005000.0,305290000.0,6355100000.0,,47385000.0,108870000.0,,13598000.0,1773400000.0,393890000.0,25780000.0,,25055000.0,21877000.0,2147700000.0,140290000.0,126100000.0,19185000000.0,2192800000.0,27906000.0,,1826900000.0,879980000.0,615450000.0,,208850000.0,97490000.0,1119000000.0,179910000.0,64022000.0,,,23119000.0,733950000.0,247120000.0,297220000.0,5454000.0,7058700000.0,,70761000.0,1685600000.0,39383000.0,626920000.0,,,1488300000.0,22722000.0,,139500000.0,123300000.0,,,,31123000.0,,,,95904000.0,,2080200000.0,,82841000.0,163580000.0,4421000000.0,145010000.0,,795330000.0,698200000.0,4410000000.0,3534200000.0,,36713000.0,,,69099000.0,,,107090000.0,103260000.0,,5557200000.0,303080000.0,,,1286800000.0,2822300000.0,,2272200000.0,214870000.0,301150000.0,646950000.0,447210000.0,104410000.0,42545000.0,3004500000.0,518510000.0,47769000.0,,84834000.0,530390000.0,1965300000.0,,132340000.0,,,,221290000.0,,960350000.0,,,1125000000.0,975770000.0,169610000.0,,52421000.0,1708400000.0,230220000.0,30987000.0,,54570000.0,,9584000.0,,19665000
0.0,,,,238870000.0,,814410000.0,4953100000.0,100400000.0,,59617000.0,4585500000.0,,53211000.0,30099000.0,20471000.0,,34614000.0,17055000.0,,240930000.0,136820000.0,,149690000.0,47505000.0,246280000.0,319350000.0,76962000.0,21122000.0,253010000.0,135460000.0,22358000.0,76376000.0,71839000.0,,10900000000.0,32649000.0,,,,21875000.0,198620000.0,2849600.0,,19402000.0,2926600000.0,,,50457000.0,80396000.0,,268020000.0,,7323500000.0,,2975000000.0,,230860000.0,173820000.0,10624000.0,3215300000.0,,,118850000.0,1671400000.0,,499520000.0,222610000.0,22562000.0,,136330000.0,,,,37093000.0 +2020_01_03_11_17_Q-Exactive-HF-X-Orbitrap_6070,,,60145000000.0,78036000.0,17016000.0,,98240000.0,975200000.0,366580000.0,,,,,259370000.0,,,,215110000.0,,15799000.0,2341900000.0,65552000.0,,398210000.0,27977000.0,1285200000.0,,1177200000.0,48616000.0,19118000.0,484130000.0,19338000.0,3656100000.0,113960000.0,50377000.0,109970000.0,28186000000.0,,2594300000.0,185160000.0,,6099200.0,2395500000.0,1276400000.0,160030000.0,1078200000.0,,,,,,374860000.0,41609000.0,358580000.0,,,43192000.0,49211000.0,40832000.0,2577100000.0,69225000.0,,,36746000.0,,,31846000.0,2288800000.0,141330000.0,569220000.0,,,14305000.0,30770000.0,46158000.0,,44799000.0,59479000.0,,2121400000.0,137520000.0,75475000.0,12028000.0,,137640000.0,,,44594000.0,223650000.0,,,,,19197000.0,,64971000.0,125410000.0,,461030000.0,,3270300000.0,1187600000.0,703710000.0,53912000.0,33944000.0,3523600000.0,,,,,1919100000.0,355390000.0,28727000.0,,,60065000.0,,299010000.0,616870000.0,140540000.0,317270000.0,175400000.0,91736000.0,1483200000.0,,31695000.0,199050000.0,491010000.0,,240650000.0,25852000.0,,118510000.0,27793000.0,,6427100.0,76659000.0,11681000000.0,99603000.0,,,76009000.0,31417000.0,1879900000.0,,,,36794000.0,113670000.0,,118590000.0,69269000.0,52201000.0,273760000.0,1273300000.0,592080000.0,179580000.0,,1567200000.0,209040000.0,1681000000.0,,277600000.0,,,203150000.0,184480000.0,84877000.0,49505000.0,35450000.0,369600000.0,526130000.0,
32562000.0,,,50841000.0,1765200000.0,46906000.0,656650000.0,51096000.0,131060000.0,,80016000.0,81020000.0,24092000.0,1625300000.0,74183000.0,2003200000.0,1141600000.0,91345000.0,99536000.0,61578000.0,,1051900000.0,,15904000.0,1250300000.0,561330000.0,738840000.0,,,91620000.0,3200400000.0,,3335900000.0,,29777000.0,273900000.0,18249000.0,,18489000.0,,2959200000.0,,34592000.0,510800000.0,,4011300000.0,1478100000.0,,60996000.0,239930000.0,,,36117000000.0,,,,,,,321010000.0,27888000.0,,,439650000.0,,173210000.0,,,,,273310000.0,119400000.0,379220000.0,53340000.0,112600000.0,15202000000.0,98898000.0,9889700000.0,,,,43901000.0,56489000.0,85224000.0,409180000.0,7145700000.0,,31401000.0,436580000.0,,45660000.0,2784400000.0,476380000.0,,,147800000.0,,3252000000.0,30768000.0,151590000.0,53435000000.0,7198000000.0,47111000.0,,2990000000.0,1140500000.0,112430000.0,,223450000.0,108610000.0,2454200000.0,155580000.0,104240000.0,,,59229000.0,1406300000.0,549540000.0,310760000.0,,10630000000.0,,187650000.0,3359000000.0,105300000.0,1460200000.0,,990630000.0,1917500000.0,,,413510000.0,,130530000.0,778250000.0,8881600000.0,87496000.0,,,,132950000.0,13835000.0,21183000.0,,144490000.0,,5073000000.0,213610000.0,,1235200000.0,1547300000.0,4767100000.0,8635000000.0,,97610000.0,98975000.0,10705000.0,175170000.0,,,157500000.0,122100000.0,30155000.0,13830000000.0,776490000.0,,,1058400000.0,5070000000.0,,4839100000.0,870290000.0,527050000.0,973720000.0,485190000.0,774130000.0,149370000.0,4697300000.0,1093300000.0,60895000.0,19853000.0,347500000.0,1419100000.0,4302500000.0,,99794000.0,,15154000.0,,26014000.0,,2823000000.0,,,2718300000.0,2194500000.0,392890000.0,,56533000.0,3766100000.0,938830000.0,84676000.0,,,,2083900.0,,201730000.0,,45849000.0,230690000.0,326430000.0,,3654300000.0,6343700000.0,198490000.0,,,7191900000.0,,140690000.0,75596000.0,2009500.0,,113260000.0,,,309370000.0,87504000.0,12107000.0,19985000.0,88747000.0,418370000.0,420620000.0,,,750980000.0,107750000.0,117170000.0,178400000.0,
138000000.0,,11549000000.0,252810000.0,,,,39234000.0,330510000.0,,,57102000.0,4716100000.0,,,92681000.0,156940000.0,65807000.0,619210000.0,,7338100000.0,146920000.0,5733700000.0,,319720000.0,378800000.0,32748000.0,2806300000.0,,21262000.0,236660000.0,2924700000.0,,856050000.0,505330000.0,123130000.0,,149130000.0,,,, +2020_01_03_16_58_Q-Exactive-HF-X-Orbitrap_6070,,58359000.0,69862000000.0,,51597000.0,,106030000.0,993940000.0,534070000.0,16621000.0,52447000.0,50300000.0,,1229300000.0,77315000.0,,55923000.0,184990000.0,179860000.0,,4040000000.0,163650000.0,687080000.0,1227900000.0,194270000.0,1352700000.0,28993000.0,1362100000.0,42458000.0,227550000.0,422750000.0,128690000.0,6146700000.0,54763000.0,217020000.0,235430000.0,34776000000.0,32895000.0,3733500000.0,424260000.0,,180280000.0,2526000000.0,2598900000.0,287820000.0,1230000000.0,49530000.0,109190000.0,,39389000.0,68587000.0,,1778800000.0,1037600000.0,,51376000.0,79296000.0,247870000.0,140680000.0,5789900000.0,123650000.0,150960000.0,88163000.0,504830000.0,248170000.0,245200000.0,28720000.0,4438000000.0,256460000.0,1088300000.0,,23701000.0,,83126000.0,103570000.0,,215300000.0,60976000.0,,3161600000.0,290260000.0,22877000.0,41837000.0,43289000.0,203640000.0,31470000.0,,7043500.0,,,,22661000.0,,64448000.0,170100000.0,193680000.0,261670000.0,,228770000.0,41477000.0,3908400000.0,1820400000.0,861110000.0,241240000.0,54631000.0,4564000000.0,,260970000.0,,92366000.0,7962600000.0,724840000.0,45722000.0,70034000.0,88454000.0,97750000.0,,813290000.0,1813500000.0,210540000.0,714330000.0,162530000.0,168490000.0,2314300000.0,17755000.0,58122000.0,86197000.0,825660000.0,,599920000.0,8154400.0,22722000.0,172910000.0,,,103670000.0,215430000.0,,418150000.0,,,225870000.0,117510000.0,3426400000.0,7231000.0,,125270000.0,78851000.0,181290000.0,19134000.0,736580000.0,160330000.0,163550000.0,555310000.0,652190000.0,886450000.0,572110000.0,,1103500000.0,,2986100000.0,,365290000.0,50432000.0,,,112520000.0,451660000.0,299260000.0,165100000
.0,443450000.0,738010000.0,24614000.0,,76621000.0,148370000.0,1069400000.0,193680000.0,847700000.0,255560000.0,84325000.0,67987000.0,184960000.0,694620000.0,,1933600000.0,191390000.0,3582600000.0,2071400000.0,200120000.0,268870000.0,135050000.0,59684000.0,1340200000.0,88766000.0,68039000.0,1123200000.0,846750000.0,1263000000.0,,466960000.0,240020000.0,2461500000.0,81424000.0,2876300000.0,18851000.0,220200000.0,204640000.0,29541000.0,,78148000.0,15670000.0,3990000000.0,250710000.0,94678000.0,798550000.0,1915500000.0,2188300000.0,2114200000.0,,124540000.0,1218100000.0,27125000.0,81855000.0,29474000000.0,110620000.0,,69500000.0,,,62587000.0,933290000.0,55606000.0,,,467640000.0,24565000.0,572670000.0,23064000.0,131290000.0,31268000.0,,394810000.0,203870000.0,1153700000.0,555320000.0,236460000.0,21613000000.0,162540000.0,12189000000.0,167040000.0,,,70123000.0,81195000.0,138960000.0,756310000.0,20216000000.0,,212570000.0,401140000.0,,,4480300000.0,1125500000.0,21204000.0,30181000.0,84121000.0,72300000.0,5642200000.0,219310000.0,311710000.0,44059000000.0,7229800000.0,103760000.0,,3885400000.0,1237700000.0,384720000.0,,576380000.0,287900000.0,3339900000.0,289110000.0,227970000.0,50434000.0,,178310000.0,1855200000.0,116640000.0,785940000.0,60946000.0,14252000000.0,,224230000.0,3776800000.0,59419000.0,2132200000.0,,,2441900000.0,45535000.0,71278000.0,686660000.0,263360000.0,292930000.0,1725500000.0,,192530000.0,35702000.0,10439000.0,62914000.0,237280000.0,66811000.0,151370000.0,80578000.0,261210000.0,271880000.0,8849600000.0,387060000.0,,2515700000.0,1488800000.0,11435000000.0,791370000.0,27185000.0,37026000.0,67025000.0,162090000.0,210840000.0,9555600.0,67583000.0,197970000.0,299150000.0,65786000.0,12669000000.0,988510000.0,,,2524400000.0,7009300000.0,,4322200000.0,635550000.0,901930000.0,664270000.0,825920000.0,585650000.0,,5366800000.0,1944800000.0,169560000.0,27865000.0,175990000.0,2074800000.0,5207300000.0,,60642000.0,42698000.0,29463000.0,158640000.0,242380000.0,3447200
0.0,1570800000.0,176330000.0,98050000.0,4141900000.0,1875800000.0,361820000.0,219810000.0,253350000.0,3758700000.0,1090100000.0,126170000.0,39642000.0,234780000.0,,82299000.0,,418980000.0,,60308000.0,,534880000.0,,2266400000.0,10650000000.0,633600000.0,,135150000.0,8382000000.0,,185480000.0,64093000.0,105100000.0,,149620000.0,,180530000.0,279100000.0,457950000.0,127440000.0,83682000.0,442480000.0,753160000.0,972990000.0,119530000.0,124430000.0,1303500000.0,148250000.0,85636000.0,181720000.0,265650000.0,,14482000000.0,268570000.0,,,57352000.0,60528000.0,659630000.0,20086000.0,70418000.0,199440000.0,6732200000.0,21818000.0,205450000.0,209640000.0,278110000.0,133160000.0,762600000.0,140570000.0,16565000000.0,135500000.0,8381100000.0,,786710000.0,382840000.0,78884000.0,5909600000.0,65319000.0,58625000.0,,4863700000.0,23070000.0,1649200000.0,1763300000.0,233440000.0,69928000.0,,,33041000.0,14512000.0,334330000.0 +2020_01_03_20_10_Q-Exactive-HF-X-Orbitrap_6070,,,68788000000.0,111690000.0,48492000.0,,83047000000.0,1893800000.0,512440000.0,11561000.0,,51662000.0,,1596800000.0,168710000.0,,25261000.0,297280000.0,101680000.0,38070000.0,3840600000.0,61401000.0,424450000.0,1520100000.0,240830000.0,1566500000.0,47300000.0,1581600000.0,60783000.0,34191000.0,353020000.0,124670000.0,5694500000.0,134020000.0,334220000.0,112480000.0,36437000000.0,28900000.0,4311700000.0,350610000.0,,133130000.0,3245800000.0,2688300000.0,405080000.0,1300600000.0,,53442000.0,,13006000.0,18617000.0,670880000.0,435530000.0,1183300000.0,,76917000.0,66280000.0,284930000.0,110980000.0,6675500000.0,35746000.0,113300000.0,123330000.0,454530000.0,,349890000.0,237580000.0,4290500000.0,231980000.0,1397000000.0,312460000.0,52711000.0,11767000.0,126520000.0,100390000.0,17231000.0,449300000.0,39823000.0,,4072200000.0,220780000.0,17941000.0,63419000.0,,234610000.0,,,66571000.0,33625000.0,,,22314000.0,,91316000.0,,216430000.0,267010000.0,,307260000.0,81561000.0,6407900000.0,1643000000.0,1076200000.0,312720000.0,51553
000.0,3543800000.0,12052000.0,270910000.0,,113700000.0,4556200000.0,820040000.0,52867000.0,33319000.0,153860000.0,99563000.0,,729090000.0,1915800000.0,441290000.0,476320000.0,129020000.0,266000000.0,1812400000.0,39725000.0,17689000.0,146590000.0,755350000.0,,661830000.0,10807000.0,13560000.0,253400000.0,28026000.0,,60428000.0,145580000.0,,557930000.0,48079000.0,,200870000.0,95464000.0,3989700000.0,,,119670000.0,87164000.0,236000000.0,33088000.0,623860000.0,,182940000.0,358630000.0,636320000.0,921610000.0,490010000.0,,1676700000.0,42195000.0,2672800000.0,83730000.0,320030000.0,40354000.0,,161390000.0,131130000.0,442620000.0,346310000.0,210220000.0,739100000.0,891610000.0,24841000.0,,62705000.0,,2250600000.0,219770000.0,1330100000.0,368520000.0,45921000.0,66352000.0,109940000.0,478370000.0,,2160100000.0,138450000.0,3651300000.0,2662400000.0,144380000.0,178070000.0,149300000.0,26588000.0,1204600000.0,49275000.0,94950000.0,1425400000.0,1260800000.0,1383800000.0,,515100000.0,144210000.0,2943700000.0,61564000.0,2623600000.0,,225840000.0,153520000.0,53474000.0,,74042000.0,12639000.0,3746800000.0,285300000.0,71627000.0,602210000.0,,1586500000.0,2117800000.0,,222210000.0,1321600000.0,36142000.0,95177000.0,31711000000.0,63051000.0,52717000.0,,,,33177000.0,822290000.0,57120000.0,,,273330000.0,60894000.0,487650000.0,,108190000.0,15855000.0,,437710000.0,217470000.0,859800000.0,567460000.0,318880000.0,18802000000.0,424490000.0,10758000000.0,73648000.0,,,75548000.0,81174000.0,144640000.0,692270000.0,19250000000.0,,229020000.0,456810000.0,,39386000.0,3136600000.0,1254700000.0,,1668000000.0,138680000.0,74890000.0,4541900000.0,293520000.0,362040000.0,56938000000.0,7472600000.0,248090000.0,90796000.0,3666700000.0,1494500000.0,276050000.0,18037000.0,816050000.0,459800000.0,3103300000.0,130660000.0,98653000.0,65955000.0,,162540000.0,1862000000.0,645820000.0,789900000.0,4891800.0,14875000000.0,,188760000.0,4739500000.0,65851000.0,1475700000.0,,,2840600000.0,34612000.0,106200000.0,5287800
00.0,165230000.0,210010000.0,1646200000.0,,212500000.0,14823000.0,,71302000.0,133300000.0,40708000.0,111540000.0,51423000.0,389170000.0,250450000.0,5086900000.0,394260000.0,,2426000000.0,1280900000.0,11588000000.0,20002000000.0,23319000.0,157470000.0,87955000.0,84773000.0,172700000.0,,112830000.0,339010000.0,201200000.0,124930000.0,12293000000.0,1292400000.0,,,2447900000.0,6287200000.0,,3778800000.0,772310000.0,1196500000.0,1017700000.0,1085200000.0,632480000.0,16019000.0,5861000000.0,1551100000.0,168080000.0,92260000.0,340990000.0,2388600000.0,3807600000.0,,55657000.0,72510000.0,42528000.0,233260000.0,259050000.0,,2000900000.0,167240000.0,223300000.0,3514700000.0,1846500000.0,353530000.0,198920000.0,284640000.0,3823700000.0,736560000.0,107080000.0,49314000.0,169570000.0,99085000.0,92403000.0,,479890000.0,,69641000.0,88578000.0,401380000.0,,3331700000.0,11522000000.0,706990000.0,,120550000.0,10509000000.0,,267400000.0,59052000.0,57815000.0,,17590000.0,244380000.0,28937000.0,322170000.0,245200000.0,248790000.0,234780000.0,463340000.0,934700000.0,929820000.0,152680000.0,26941000.0,1347500000.0,204070000.0,86056000.0,258810000.0,293050000.0,,16558000000.0,475880000.0,,11651000.0,46248000.0,,826020000.0,62789000.0,65430000.0,317880000.0,6980800000.0,,289350000.0,261780000.0,264870000.0,99540000.0,829590000.0,131800000.0,11541000000.0,,9130900000.0,,611540000.0,629940000.0,51474000.0,7284800000.0,79663000.0,220980000.0,238230000.0,5736300000.0,37430000.0,1533800000.0,1534200000.0,222060000.0,49903000.0,492920000.0,121250000.0,29144000.0,28693000.0,182720000.0 
+2020_01_04_04_23_Q-Exactive-HF-X-Orbitrap_6070,133940000.0,,82793000000.0,88958000.0,75654000.0,,689840000.0,1565100000.0,1183400000.0,,102020000.0,43047000.0,77420000.0,1410800000.0,191500000.0,,87108000.0,356980000.0,224100000.0,61619000.0,5641500000.0,157290000.0,372000000.0,1671700000.0,199280000.0,2244300000.0,,1315800000.0,75226000.0,184060000.0,621480000.0,182820000.0,8291600000.0,197740000.0,289680000.0,287760000.0,47634000000.0,73371000.0,4016000000.0,301290000.0,97315000.0,61043000.0,3158900000.0,2867700000.0,718170000.0,1688800000.0,63227000.0,151590000.0,,59619000.0,138460000.0,,571540000.0,1719800000.0,,28804000.0,61844000.0,541440000.0,207330000.0,6468900000.0,92395000.0,200360000.0,185490000.0,145530000.0,158070000.0,506640000.0,65780000.0,5680300000.0,404410000.0,1155100000.0,,25127000.0,,166860000.0,112020000.0,25196000.0,436130000.0,97238000.0,52740000.0,5084500000.0,490100000.0,82439000.0,116480000.0,,265710000.0,151790000.0,,68357000.0,42967000.0,,36183000.0,47339000.0,,80102000.0,50953000.0,285860000.0,285710000.0,,622260000.0,128760000.0,7275200000.0,2057600000.0,1125600000.0,501420000.0,156320000.0,3929700000.0,,311000000.0,,145720000.0,6186400000.0,1589500000.0,35600000.0,109710000.0,206630000.0,152460000.0,,1548700000.0,2081100000.0,555110000.0,2651700000.0,210920000.0,213560000.0,3425900000.0,46373000.0,41850000.0,99423000.0,968690000.0,,871980000.0,7669700.0,,195100000.0,23149000.0,46166000.0,87879000.0,185500000.0,,599430000.0,,7624500.0,540670000.0,207540000.0,4127800000.0,,,140000000.0,96986000.0,177340000.0,55749000.0,1011800000.0,,336250000.0,468190000.0,610380000.0,1164800000.0,723670000.0,21184000.0,2226000000.0,,4259700000.0,176980000.0,480520000.0,81271000.0,,223450000.0,206100000.0,505850000.0,409190000.0,119870000.0,959410000.0,938450000.0,24992000.0,66720000.0,149180000.0,,2504500000.0,194830000.0,2014200000.0,269430000.0,245140000.0,86026000.0,52678000.0,612430000.0,34562000.0,3682000000.0,149110000.0,5218800000.0,2551100000.
0,259100000.0,282140000.0,217220000.0,55767000.0,1676600000.0,51886000.0,79129000.0,1846300000.0,941660000.0,1876100000.0,,,289950000.0,3320300000.0,631230000.0,21437000000.0,,130710000.0,343830000.0,98969000.0,,179970000.0,21966000.0,4017700000.0,346150000.0,109010000.0,769310000.0,2363400000.0,3166800000.0,2640400000.0,105860000.0,346300000.0,1426700000.0,40205000.0,81361000.0,59389000000.0,173140000.0,,18193000.0,,25788000.0,68107000.0,887620000.0,161820000.0,,251030000.0,350020000.0,68127000.0,741280000.0,424770000.0,98590000.0,39402000.0,58143000.0,738560000.0,320220000.0,893140000.0,508190000.0,512090000.0,25298000000.0,601220000.0,15719000000.0,46983000.0,,16749000.0,220330000.0,168220000.0,109160000.0,1274400000.0,21110000000.0,,260090000.0,473330000.0,,36247000.0,4210500000.0,1847900000.0,25755000.0,1770400000.0,37911000.0,143530000.0,7235700000.0,467240000.0,623080000.0,66298000000.0,9751400000.0,233410000.0,122670000.0,6202300000.0,3077600000.0,428600000.0,,931840000.0,432330000.0,5559200000.0,343520000.0,260540000.0,,,229800000.0,2507700000.0,670330000.0,1172300000.0,12044000.0,23330000000.0,,220750000.0,5424100000.0,26321000.0,1551100000.0,,244200000.0,3738100000.0,37350000.0,77145000.0,878840000.0,281050000.0,302400000.0,7411000.0,16615000000.0,389400000.0,44989000.0,,59266000.0,127350000.0,114290000.0,185130000.0,158110000.0,557120000.0,296690000.0,5694400000.0,615770000.0,,3713800000.0,1442800000.0,14062000000.0,28189000000.0,,196830000.0,134440000.0,215710000.0,279530000.0,9495800.0,199140000.0,392240000.0,443440000.0,238270000.0,17505000000.0,1292600000.0,,,3204600000.0,7258800000.0,,3284500000.0,976810000.0,1184600000.0,947280000.0,944950000.0,1059400000.0,174710000.0,8568900000.0,2936700000.0,275700000.0,192950000.0,186770000.0,2495900000.0,5969000000.0,49509000.0,2072900000.0,45600000.0,93356000.0,266750000.0,332210000.0,,2841500000.0,163700000.0,223760000.0,4248500000.0,2776000000.0,443580000.0,479500000.0,512420000.0,5655300000.0,997190000.0,8
9059000.0,,391160000.0,106370000.0,148520000.0,,652440000.0,,119430000.0,,953260000.0,34061000.0,3886600000.0,11670000000.0,567460000.0,73089000.0,305450000.0,13867000000.0,,240890000.0,104390000.0,191870000.0,,183180000.0,120890000.0,117600000.0,590030000.0,526150000.0,,343770000.0,478380000.0,1345500000.0,1125200000.0,117050000.0,83899000.0,1568700000.0,257000000.0,144380000.0,105470000.0,279800000.0,,23350000000.0,605740000.0,97364000.0,,81672000.0,124710000.0,1251700000.0,34789000.0,145420000.0,252820000.0,8153000000.0,14765000.0,250110000.0,286920000.0,680700000.0,86106000.0,1229800000.0,182320000.0,14797000000.0,97217000.0,12476000000.0,66616000.0,609610000.0,834280000.0,102660000.0,6793700000.0,144410000.0,45137000.0,543930000.0,6795200000.0,39271000.0,4859000000.0,1070800000.0,329610000.0,88740000.0,731570000.0,,17515000.0,,231650000.0 +2020_01_04_10_03_Q-Exactive-HF-X-Orbitrap_6070,,,19144000000.0,,,,14775000000.0,194180000.0,34873000.0,,,4853800.0,,181280000.0,74915000.0,,,94865000.0,12440000.0,,940270000.0,6076200.0,228330000.0,226450000.0,8651000.0,164690000.0,,147410000.0,3583700.0,,,,1286000000.0,6841400.0,60463000.0,60633000.0,8060500000.0,,804820000.0,312920000.0,,15757000.0,1369900000.0,91362000.0,65693000.0,275180000.0,6141900.0,,,,,,20777000.0,124500000.0,,,,25448000.0,,765090000.0,,39445000.0,,27934000.0,,,46149000.0,797610000.0,18815000.0,632770000.0,35792000.0,,,,,,149300000.0,,,761410000.0,74424000.0,,,,43413000.0,,,,,,,,,,,,106620000.0,,23109000.0,,921560000.0,251760000.0,145320000.0,9033200.0,7695400.0,408950000.0,,34225000.0,,,1261000000.0,64893000.0,,5920100.0,,,,223800000.0,395840000.0,,74683000.0,,,235160000.0,,,,88545000.0,,158770000.0,,,,,,,,,36450000.0,,,79502000.0,,711080000.0,,,20666000.0,,17606000.0,12833000.0,16593000.0,25323000.0,26501000.0,229820000.0,93686000.0,42019000.0,43227000.0,,222190000.0,,538830000.0,,17272000.0,,,54047000.0,10113000.0,,92389000.0,77869000.0,67936000.0,225950000.0,,,,59930000.0,479480000.0,,217330000.0,
23888000.0,63753000.0,,41961000.0,9767800.0,,547850000.0,,968090000.0,526060000.0,69781000.0,70931000.0,18898000.0,,203310000.0,,,158060000.0,93325000.0,348740000.0,,62871000.0,,267810000.0,35399000.0,871020000.0,,27449000.0,15922000.0,,,,,994300000.0,,,,,439620000.0,238800000.0,,16386000.0,191410000.0,,,20394000000.0,9992400.0,,,,,,157540000.0,36124000.0,,,38587000.0,,43725000.0,,,,,70489000.0,8643000.0,258430000.0,170730000.0,,4767800000.0,94373000.0,2651000000.0,,,,2446200.0,94716000.0,11109000.0,247670000.0,5238900000.0,,,41766000.0,,,523170000.0,211380000.0,,,,,977150000.0,35840000.0,79165000.0,15363000000.0,1549000000.0,,,1069500000.0,384210000.0,159010000.0,,20251000.0,77111000.0,877400000.0,23347000.0,63451000.0,,,6878300.0,538910000.0,95925000.0,64990000.0,,4441300000.0,,,983890000.0,,412460000.0,,19115000.0,837750000.0,,,189980000.0,14640000.0,,319090000.0,,41132000.0,,,,90760000.0,,31721000.0,,135560000.0,24770000.0,1826300000.0,29448000.0,,384810000.0,134270000.0,2833400000.0,15013000000.0,,47351000.0,9564100.0,8871500.0,28111000.0,,,44138000.0,43604000.0,,3058900000.0,152220000.0,,,247830000.0,9524800000.0,,2751500000.0,62436000.0,56521000.0,485040000.0,118390000.0,18448000.0,32447000.0,1185300000.0,803150000.0,5204300.0,,60212000.0,863170000.0,463990000.0,,128900000.0,7786300.0,,,35406000.0,,484540000.0,5680400.0,,893440000.0,365810000.0,78456000.0,109500000.0,22769000.0,825600000.0,483940000.0,,,75888000.0,,,,90918000.0,,,,8336700.0,,344590000.0,1632600000.0,132000000.0,,,4976000000.0,,104590000.0,,,,2436300.0,39078000.0,,67818000.0,29312000.0,100270000.0,70939000.0,18182000.0,200480000.0,127940000.0,,,179860000.0,24838000.0,3639600.0,30686000.0,41314000.0,,4849800000.0,46553000.0,,,24328000.0,13539000.0,109680000.0,,,,1147900000.0,,,,36038000.0,,43724000.0,45129000.0,5115700000.0,28134000.0,1675200000.0,,59743000.0,187370000.0,,3373100000.0,3791200.0,,27079000.0,1072500000.0,,509790000.0,234720000.0,,,,,,,57402000.0 
+2020_01_04_14_59_Q-Exactive-HF-X-Orbitrap_6070,,,25184000000.0,58631000.0,,,135860000.0,500960000.0,263980000.0,,,,,266380000.0,,,,121280000.0,95206000.0,19262000.0,1677900000.0,53589000.0,,588070000.0,30900000.0,505680000.0,,335470000.0,9287700.0,186800000.0,76296000.0,13891000.0,2016400000.0,57091000.0,50339000.0,49850000.0,14645000000.0,,1337700000.0,,,18209000.0,1254200000.0,736800000.0,101330000.0,214210000.0,17035000.0,42855000.0,,,,141340000.0,170850000.0,390770000.0,,,25932000.0,14402000.0,14898000.0,2043000000.0,21957000.0,,44714000.0,40811000.0,,118250000.0,,1839300000.0,131360000.0,451860000.0,33114000.0,,,54022000.0,30286000.0,,50963000.0,,,1217200000.0,111240000.0,,13084000.0,,58196000.0,2809700.0,,,,,,,,27363000.0,19782000.0,,172480000.0,,102520000.0,114640000.0,1912700000.0,400270000.0,246340000.0,34225000.0,15860000.0,552550000.0,,60122000.0,,53480000.0,1421700000.0,222520000.0,,43308000.0,,,,142450000.0,959960000.0,107860000.0,253440000.0,13356000.0,19798000.0,1330900000.0,,15950000.0,,277030000.0,,110040000.0,,,,,,,54646000.0,2668800000.0,99926000.0,,8715200.0,107230000.0,20222000.0,740760000.0,,,50484000.0,15300000.0,61337000.0,,52974000.0,69086000.0,51818000.0,88903000.0,405690000.0,186570000.0,141910000.0,,815600000.0,,1133700000.0,,78387000.0,,,,32071000.0,83385000.0,48128000.0,63383000.0,283610000.0,211720000.0,,,10517000.0,30807000.0,1238200000.0,15990000.0,450890000.0,38272000.0,37851000.0,,6770300.0,6846400.0,16425000.0,1086600000.0,,1187100000.0,1241500000.0,50936000.0,89255000.0,,,440600000.0,,,434600000.0,350640000.0,392790000.0,,122840000.0,,958640000.0,,8352500000.0,,124530000.0,16693000.0,,,70618000.0,,1275500000.0,,,160730000.0,,109300000.0,896910000.0,,18636000.0,393100000.0,,42877000.0,23589000000.0,29109000.0,,,,,18206000.0,219140000.0,49998000.0,,,39942000.0,,328810000.0,,,15589000.0,25949000.0,152690000.0,54719000.0,164980000.0,72478000.0,67240000.0,6257000000.0,97171000.0,4568800000.0,30065000.0,,,24336000.0,56595000.0,8351900
0.0,981890000.0,6762400000.0,,63110000.0,95659000.0,,,535440000.0,367080000.0,,51945000.0,56529000.0,21414000.0,2114700000.0,78266000.0,136600000.0,17506000000.0,2788100000.0,27036000.0,,1882900000.0,323970000.0,85586000.0,,190040000.0,24534000.0,1082500000.0,95794000.0,75014000.0,,,,723440000.0,360140000.0,386300000.0,,7840600000.0,,74023000.0,1578200000.0,,616780000.0,,50843000.0,1285700000.0,,,152750000.0,33854000.0,84115000.0,,4753300000.0,49929000.0,,,,35513000.0,11194000.0,71981000.0,,118080000.0,49238000.0,1550000000.0,129270000.0,371340000.0,941070000.0,535050000.0,2984000000.0,5929800000.0,11809000.0,34070000.0,10656000.0,,,,25525000.0,51421000.0,50587000.0,,4071600000.0,318580000.0,,,959450000.0,2805000000.0,,2411700000.0,218660000.0,370080000.0,300710000.0,368860000.0,142890000.0,,1952200000.0,935490000.0,26712000.0,18228000.0,29466000.0,450190000.0,1562200000.0,,121110000.0,27843000.0,22922000.0,,78119000.0,,923430000.0,42542000.0,,1040000000.0,522110000.0,142980000.0,,47213000.0,1497500000.0,320550000.0,6099000.0,36517000.0,31962000.0,,4656000.0,,149780000.0,,,,202000000.0,,1471300000.0,3478900000.0,74966000.0,,23433000.0,3663800000.0,,84110000.0,8405900.0,43528000.0,,81419000.0,26322000.0,,245790000.0,43520000.0,,75604000.0,58468000.0,379220000.0,418090000.0,23422000.0,,331500000.0,18848000.0,40910000.0,,76126000.0,,8550500000.0,195310000.0,,,,57943000.0,247180000.0,,,71925000.0,2272600000.0,,,,75905000.0,,421210000.0,,5345100000.0,50907000.0,3218900000.0,,276500000.0,434960000.0,49682000.0,1386900000.0,,22761000.0,125800000.0,1747100000.0,,457120000.0,628680000.0,65125000.0,,168280000.0,,1633800.0,,76543000.0 
+2020_01_06_20_17_Q-Exactive-HF-X-Orbitrap_6070,54498000.0,,82977000000.0,226650000.0,114660000.0,,1248800000.0,2918100000.0,1127500000.0,,,33621000.0,91382000.0,2562400000.0,90632000.0,,37628000.0,709350000.0,131910000.0,64779000.0,6569000000.0,173520000.0,572190000.0,2761900000.0,428320000.0,2445100000.0,69241000.0,1866700000.0,78344000.0,299990000.0,705560000.0,205800000.0,10895000000.0,418950000.0,269580000.0,349610000.0,66453000000.0,266590000.0,7328000000.0,428190000.0,119650000.0,219070000.0,3398800000.0,4032400000.0,1040000000.0,2893700000.0,,221700000.0,,189340000.0,107700000.0,1082900000.0,889250000.0,2225800000.0,,,59514000.0,554610000.0,215800000.0,7255200000.0,101130000.0,64562000.0,297610000.0,309460000.0,216870000.0,210570000.0,,6783300000.0,572950000.0,1532300000.0,1761500000.0,28140000.0,58558000.0,124070000.0,259330000.0,64366000.0,435020000.0,451950000.0,,5504400000.0,632060000.0,97350000.0,65283000.0,,256330000.0,53023000.0,77323000.0,63278000.0,50266000.0,,124050000.0,52202000.0,18736000.0,189730000.0,60430000.0,351810000.0,455330000.0,,558430000.0,220850000.0,6462700000.0,2024300000.0,1617200000.0,729340000.0,67618000.0,4547700000.0,133560000.0,762810000.0,,171710000.0,6088300000.0,929180000.0,98619000.0,,192370000.0,454870000.0,,1471600000.0,2128200000.0,527180000.0,1164600000.0,257350000.0,317220000.0,4081000000.0,,34171000.0,194200000.0,1508500000.0,,1659100000.0,186640000.0,54190000.0,125590000.0,38984000.0,,208420000.0,334690000.0,,1012800000.0,42651000.0,34457000.0,858940000.0,103620000.0,5115000000.0,,,226360000.0,116570000.0,190610000.0,40723000.0,1050700000.0,110480000.0,159920000.0,744030000.0,945300000.0,1746200000.0,651990000.0,41508000.0,2703500000.0,187570000.0,5573200000.0,115840000.0,645800000.0,106040000.0,29655000.0,317270000.0,199330000.0,288830000.0,653160000.0,364550000.0,1120900000.0,1726500000.0,114320000.0,,143570000.0,,3024900000.0,640900000.0,2175900000.0,413870000.0,302620000.0,196050000.0,161490000.0,616510000.0,5135
8000.0,4691500000.0,259670000.0,6636200000.0,3710200000.0,365120000.0,364890000.0,476010000.0,,3142300000.0,82135000.0,168670000.0,2671200000.0,1678700000.0,2312500000.0,,982940000.0,528770000.0,4717200000.0,131590000.0,5623000000.0,72515000.0,581320000.0,511890000.0,106530000.0,,217850000.0,42384000.0,5223100000.0,760210000.0,246190000.0,1363700000.0,,3683500000.0,4339100000.0,,698880000.0,1447300000.0,,101430000.0,70541000000.0,,,170250000.0,,34692000.0,206280000.0,1184800000.0,261980000.0,27967000.0,,668750000.0,154670000.0,584890000.0,55060000.0,109810000.0,,,1481200000.0,352760000.0,847170000.0,673210000.0,685510000.0,33486000000.0,887980000.0,20301000000.0,169950000.0,88615000.0,,232190000.0,228530000.0,236030000.0,1532900000.0,26076000000.0,,336660000.0,709380000.0,49768000.0,58323000.0,6738300000.0,3993700000.0,200420000.0,128610000.0,121540000.0,198190000.0,9499800000.0,381740000.0,711890000.0,75166000000.0,11929000000.0,624310000.0,236260000.0,7597000000.0,4183600000.0,690700000.0,,1048800000.0,509200000.0,5955200000.0,695410000.0,384510000.0,,,178570000.0,3018400000.0,569760000.0,1605400000.0,58511000.0,22859000000.0,,472660000.0,6863400000.0,106280000.0,3192700000.0,,92123000.0,5558400000.0,50088000.0,,1112500000.0,632860000.0,387760000.0,2585500000.0,,536730000.0,32498000.0,,,391200000.0,158640000.0,311710000.0,,203640000.0,1032100000.0,8915900000.0,1208100000.0,,4413100000.0,3134800000.0,14409000000.0,11295000000.0,74428000.0,132730000.0,158910000.0,149360000.0,525170000.0,21011000.0,197960000.0,521970000.0,728650000.0,301350000.0,23889000000.0,1638300000.0,,,4483800000.0,12497000000.0,4307900000.0,5616200000.0,1207800000.0,1457500000.0,2074700000.0,1628600000.0,1177300000.0,429300000.0,11301000000.0,3379900000.0,425550000.0,141710000.0,750130000.0,2684100000.0,8116600000.0,56812000.0,366930000.0,117680000.0,106270000.0,405320000.0,711440000.0,97216000.0,3742300000.0,189820000.0,285240000.0,5289300000.0,3422300000.0,677610000.0,67253000.0,516140000.0,8
482500000.0,1129600000.0,239260000.0,47789000.0,220580000.0,,342300000.0,503420000.0,880790000.0,,158590000.0,,522220000.0,,5683500000.0,11457000000.0,701670000.0,57247000.0,392780000.0,18149000000.0,58718000.0,303640000.0,157920000.0,51960000.0,118570000.0,76768000.0,70742000.0,89420000.0,922220000.0,516170000.0,106980000.0,659310000.0,1003100000.0,1584600000.0,1994000000.0,209400000.0,93883000.0,1598600000.0,631710000.0,96051000.0,326840000.0,470020000.0,,28105000000.0,900580000.0,257680000.0,,121910000.0,44980000.0,938900000.0,141670000.0,145170000.0,506100000.0,13270000000.0,,73858000.0,449300000.0,820200000.0,,1821600000.0,124970000.0,24661000000.0,179000000.0,14684000000.0,,918150000.0,962670000.0,340520000.0,6598100000.0,255380000.0,194970000.0,439010000.0,9369100000.0,,3718900000.0,1754200000.0,156240000.0,,921130000.0,276180000.0,65409000.0,54524000.0,227490000.0 +2020_01_08_16_43_Q-Exactive-HF-X-Orbitrap_6070,,,35358000000.0,69835000.0,47902000.0,,159220000.0,737820000.0,363940000.0,,,31616000.0,,592610000.0,,,5691800.0,217790000.0,182270000.0,,2201300000.0,71848000.0,176140000.0,849730000.0,49317000.0,703640000.0,,602030000.0,44595000.0,338800000.0,71867000.0,11508000.0,3957900000.0,68872000.0,153530000.0,199210000.0,22652000000.0,9220400.0,2143300000.0,226700000.0,13723000.0,76263000.0,1687100000.0,1290100000.0,218730000.0,864570000.0,,38947000.0,,,,111950000.0,237940000.0,526600000.0,,22673000.0,,182400000.0,61757000.0,3975000000.0,25026000.0,77214000.0,58310000.0,57706000.0,,212750000.0,32363000.0,2157100000.0,202020000.0,1017300000.0,,,,55579000.0,75777000.0,11617000.0,181530000.0,37743000.0,,1830100000.0,257350000.0,20116000.0,30582000.0,,157940000.0,16476000.0,,37786000.0,284540000.0,18140000.0,,,9800900.0,18378000.0,54380000.0,7324600.0,124060000.0,,138130000.0,94497000.0,2833600000.0,919170000.0,589650000.0,74098000.0,24489000.0,2282300000.0,,116520000.0,,,1873900000.0,421650000.0,17137000.0,,,77802000.0,,514820000.0,1131700000.0,170610000.0,21775
0000.0,90430000.0,80216000.0,1615400000.0,,,,368430000.0,,354820000.0,,,28114000.0,,,5898000.0,83271000.0,10974000000.0,177970000.0,,,159450000.0,,1764500000.0,,,151980000.0,,55350000.0,,251470000.0,66824000.0,,202180000.0,592230000.0,338330000.0,262660000.0,84326000.0,1019000000.0,,1808100000.0,,161000000.0,9081800.0,13395000.0,82959000.0,59356000.0,135900000.0,349320000.0,91989000.0,315540000.0,508940000.0,33544000.0,,,45008000.0,1163200000.0,83754000.0,623490000.0,79190000.0,189280000.0,,30304000.0,202080000.0,,2124800000.0,42465000.0,1692900000.0,1314800000.0,76783000.0,126620000.0,64743000.0,12853000.0,823550000.0,29174000.0,,582140000.0,244500000.0,605300000.0,,65015000.0,90103000.0,1616000000.0,,2031400000.0,,50011000.0,73364000.0,,,18073000.0,22221000.0,2440600000.0,102440000.0,49508000.0,255930000.0,1146400000.0,1232700000.0,1535800000.0,,138170000.0,415530000.0,,38561000.0,36577000000.0,,,24478000.0,,,49966000.0,479780000.0,116570000.0,,,153400000.0,,389070000.0,,,,,247250000.0,44956000.0,4167200000.0,58822000.0,120350000.0,9832200000.0,257330000.0,5583600000.0,47832000.0,,,49662000.0,,106160000.0,453440000.0,8829600000.0,,100190000.0,310490000.0,,30724000.0,2691500000.0,578010000.0,,,45158000.0,54721000.0,3168600000.0,174580000.0,210130000.0,32514000000.0,3843700000.0,143700000.0,,2305600000.0,1321700000.0,185320000.0,,282060000.0,233880000.0,1487300000.0,162120000.0,104460000.0,,,29135000.0,969040000.0,675500000.0,393280000.0,,12090000000.0,,92411000.0,2045400000.0,42378000.0,911290000.0,,,1670700000.0,17793000.0,,288870000.0,80590000.0,76007000.0,831910000.0,,50442000.0,12156000.0,,,47113000.0,11425000.0,3464700000.0,12590000.0,66979000.0,245940000.0,2517100000.0,282490000.0,,1654500000.0,850300000.0,5559200000.0,10364000000.0,15001000.0,115280000.0,,177810000.0,100120000.0,6660500.0,31377000.0,163010000.0,187180000.0,49917000.0,6768000000.0,651430000.0,13097000.0,23659000.0,1235000000.0,4477200000.0,,1943000000.0,431740000.0,412050000.0,1027600000.0,50
8090000.0,183590000.0,205800000.0,3494700000.0,683530000.0,52271000.0,37646000.0,182700000.0,454580000.0,2284600000.0,,34413000.0,28434000.0,44788000.0,103980000.0,155230000.0,,1032900000.0,119840000.0,87677000.0,1696700000.0,1047900000.0,231990000.0,218340000.0,158480000.0,2643800000.0,294610000.0,56638000.0,20585000.0,153290000.0,,29657000.0,76408000.0,278070000.0,,,,454980000.0,1243300000.0,2414600000.0,4428800000.0,342810000.0,,22366000.0,6993100000.0,,106880000.0,38702000.0,,,53211000.0,112950000.0,35243000.0,292030000.0,86823000.0,45985000.0,132520000.0,20370000.0,546600000.0,850170000.0,101060000.0,4995800.0,531100000.0,186300000.0,37148000.0,134800000.0,113690000.0,,8382600000.0,366710000.0,,,,61830000.0,295450000.0,,,128150000.0,4205400000.0,,38804000.0,138810000.0,123430000.0,,289450000.0,,8450800000.0,,5284800000.0,,323760000.0,342000000.0,101200000.0,3181300000.0,,22672000.0,226580000.0,3122500000.0,,1019100000.0,632020000.0,48404000.0,,168110000.0,,,, +2020_01_09_11_07_Q-Exactive-HF-X-Orbitrap_6070,,,50500000000.0,89269000.0,71612000.0,,232810000.0,908760000.0,553740000.0,,,,27750000.0,989060000.0,28417000.0,,13949000.0,167440000.0,83692000.0,,2859700000.0,77984000.0,364940000.0,1293600000.0,89669000.0,1533600000.0,,863720000.0,34892000.0,204280000.0,369410000.0,109610000.0,5270000000.0,146170000.0,185910000.0,105710000.0,25931000000.0,13330000.0,2608300000.0,339830000.0,74522000.0,68287000.0,2786000000.0,1570400000.0,404440000.0,1093100000.0,29727000.0,33358000.0,1226600000.0,,,1585400000.0,161880000.0,1058800000.0,,21604000.0,,206520000.0,104600000.0,3199600000.0,96610000.0,96920000.0,80130000.0,141140000.0,38922000.0,253200000.0,30583000.0,3581000000.0,197400000.0,286310000.0,,,13916000.0,60430000.0,87966000.0,13866000.0,144970000.0,43489000.0,,2492700000.0,313590000.0,42110000.0,36369000.0,,130750000.0,21859000.0,18426000.0,45302000.0,,34818000.0,85016000.0,,,15821000.0,155310000.0,70567000.0,212700000.0,43957000.0,261640000.0,38803000.0,1663500000.
0,1673300000.0,728830000.0,209860000.0,31341000.0,4362000000.0,,227710000.0,,,3259600000.0,354060000.0,39871000.0,,112560000.0,67811000.0,,859130000.0,1282100000.0,173010000.0,408150000.0,131020000.0,121910000.0,2312900000.0,,,,605250000.0,,588040000.0,53942000.0,,102680000.0,,,92734000.0,202820000.0,,193090000.0,33325000.0,14720000.0,74688000.0,,2169200000.0,5037500.0,,59929000.0,56459000.0,146580000.0,,380700000.0,,98029000.0,278050000.0,643010000.0,504440000.0,258800000.0,,2468400000.0,132770000.0,2892500000.0,,232110000.0,96717000.0,,,85188000.0,241090000.0,251500000.0,110940000.0,630220000.0,769350000.0,,,36536000.0,,1979700000.0,111410000.0,688990000.0,134580000.0,152220000.0,17708000.0,103350000.0,207990000.0,,1705300000.0,151010000.0,2916300000.0,1577700000.0,117090000.0,191100000.0,227090000.0,40749000.0,1025000000.0,55499000.0,65586000.0,1509000000.0,676180000.0,1090000000.0,,439570000.0,180690000.0,2856600000.0,22211000.0,2489200000.0,,49538000.0,240660000.0,100900000.0,,26861000.0,,1686000000.0,158770000.0,72477000.0,,,1330700000.0,1484800000.0,,170710000.0,470720000.0,,30917000.0,32282000000.0,,,,,,127050000.0,376640000.0,189620000.0,12355000.0,181730000.0,384630000.0,11526000.0,352840000.0,14152000.0,75939000.0,15148000.0,,407830000.0,144910000.0,550080000.0,170970000.0,354860000.0,14281000000.0,306340000.0,8787100000.0,95722000.0,,,128120000.0,,144640000.0,566260000.0,9253900000.0,,204680000.0,586670000.0,,47235000.0,2793000000.0,824690000.0,,46108000.0,41842000.0,30031000.0,4310700000.0,247470000.0,398350000.0,36356000000.0,4873400000.0,187040000.0,,3521900000.0,2146000000.0,282870000.0,,476780000.0,196470000.0,2595100000.0,363670000.0,134570000.0,,,112660000.0,1489200000.0,135370000.0,250760000.0,14710000.0,12358000000.0,31721000.0,66848000.0,3472900000.0,49056000.0,1511600000.0,3343900000.0,,2234000000.0,37952000.0,,390490000.0,95775000.0,197220000.0,24603000.0,8538200000.0,125590000.0,19012000.0,,,73605000.0,46013000.0,118990000.0,,183850000.0,306
110000.0,4573800000.0,403240000.0,5173600000.0,2318500000.0,978900000.0,6210400000.0,13877000000.0,,86454000.0,36510000.0,171580000.0,207890000.0,9640500.0,79970000.0,248620000.0,202410000.0,66467000.0,11376000000.0,994140000.0,,,2357000000.0,5796100000.0,,2164800000.0,567450000.0,533450000.0,796170000.0,768190000.0,400760000.0,18984000.0,6478000000.0,916020000.0,59275000.0,28593000.0,156780000.0,926330000.0,3795600000.0,,229520000.0,20752000.0,22436000.0,158910000.0,442930000.0,,2139700000.0,8714400.0,104920000.0,2652500000.0,1072500000.0,382820000.0,196650000.0,202840000.0,3166000000.0,436620000.0,77923000.0,51356000.0,143480000.0,69555000.0,66768000.0,216200000.0,257440000.0,,,,261380000.0,2025500000.0,2419800000.0,7228500000.0,400440000.0,,89769000.0,7403500000.0,,91453000.0,47106000.0,143020000.0,,24732000.0,47514000.0,,384440000.0,241860000.0,86772000.0,262060000.0,202210000.0,500360000.0,812250000.0,103500000.0,44090000.0,657310000.0,314100000.0,122450000.0,64010000.0,375210000.0,,14916000000.0,341710000.0,,,71365000.0,45618000.0,418610000.0,4742600.0,39464000.0,269770000.0,5225000000.0,16590000.0,,146320000.0,372940000.0,,632950000.0,,9634700000.0,,7269100000.0,,496120000.0,380140000.0,68917000.0,3419800000.0,51740000.0,47955000.0,219380000.0,4044700000.0,,1938000000.0,1327800000.0,51288000.0,,,31508000.0,40381000.0,19093000.0,173270000.0 
+2020_01_15_13_56_Q-Exactive-HF-X-Orbitrap_6070,,,16610000000.0,128810000.0,,7535200.0,,240530000.0,276730000.0,30679000.0,,,9779700.0,360360000.0,54193000.0,,,167470000.0,,17727000.0,769330000.0,10392000.0,107720000.0,616510000.0,108540000.0,384030000.0,42537000.0,160940000.0,51418000.0,24859000.0,17301000.0,32164000.0,2507300000.0,,,77868000.0,10621000000.0,40290000.0,1328800000.0,79503000.0,158960000.0,84113000.0,1044100000.0,526790000.0,220640000.0,493880000.0,28886000.0,43187000.0,,13082000.0,21098000.0,69757000.0,259800000.0,178290000.0,,,76316000.0,125840000.0,178870000.0,1545800000.0,44202000.0,54573000.0,,,,,,1333800000.0,,446910000.0,,,3246600.0,24844000.0,91271000.0,,547880000.0,84860000.0,53623000.0,1065700000.0,110610000.0,36764000.0,115980000.0,,,,26877000.0,108150000.0,78075000.0,3885800.0,,3423000.0,34095000.0,114520000.0,14269000.0,,143090000.0,,27857000.0,45758000.0,2429700000.0,824210000.0,330780000.0,,17828000.0,2573000000.0,,187150000.0,,7103100.0,83077000.0,89628000.0,,,,32649000.0,,182290000.0,590830000.0,80101000.0,240290000.0,328810000.0,43028000.0,1126100000.0,23494000.0,,51830000.0,246190000.0,,194680000.0,82088000.0,11731000.0,32276000.0,47916000.0,,,75196000.0,,6562700.0,,,,25752000.0,1353700000.0,,,46670000.0,,80138000.0,3503000.0,235340000.0,,91192000.0,73613000.0,217730000.0,477250000.0,84563000.0,84597000.0,295510000.0,138690000.0,955860000.0,,163390000.0,29916000.0,,240560000.0,53479000.0,83526000.0,42566000.0,287710000.0,73755000.0,4301100.0,6426300.0,,14919000.0,,171200000.0,37954000.0,121300000.0,,68134000.0,55531000.0,34677000.0,268410000.0,8548300.0,1231500000.0,,1554700000.0,954770000.0,32522000.0,98582000.0,70235000.0,28886000.0,418950000.0,,,982960000.0,285380000.0,113430000.0,,107280000.0,132730000.0,230400000.0,,8408800000.0,24810000.0,75644000.0,52724000.0,46024000.0,,149740000.0,10417000.0,996920000.0,,145150000.0,305160000.0,,409560000.0,612320000.0,11037000.0,50980000.0,791070000.0,32576000.0,18018000.0,8200400000.0,47
514000.0,16596000.0,4553500.0,,,,315620000.0,99717000.0,13609000.0,,,59006000.0,183810000.0,,56542000.0,73568000.0,6601400.0,360890000.0,25035000.0,144160000.0,,60100000.0,3459600000.0,501240000.0,5684100000.0,66225000.0,85696000.0,,30512000.0,171920000.0,27863000.0,253440000.0,2731900000.0,12382000.0,99244000.0,139810000.0,,,1589100000.0,1268600000.0,,,23122000.0,118800000.0,1491300000.0,144940000.0,87027000.0,7962600000.0,2546600000.0,166140000.0,81143000.0,1285900000.0,771150000.0,91832000.0,17585000.0,96574000.0,52630000.0,1161400000.0,217690000.0,352600000.0,,,49114000.0,167200000.0,590810000.0,109230000.0,7430600.0,9307100000.0,5366400.0,231050000.0,4531800000.0,,953420000.0,3711700.0,31898000.0,777270000.0,,8318100.0,402790000.0,30972000.0,174910000.0,,4137000000.0,87272000.0,,6572200.0,12269000.0,96970000.0,69205000.0,65308000.0,12459000.0,,111260000.0,3905100000.0,123790000.0,1591100000.0,909050000.0,205920000.0,3065200000.0,,43700000.0,46485000.0,61341000.0,,18000000.0,7606300.0,,33904000.0,103300000.0,80040000.0,6344000000.0,522280000.0,43915000.0,,693650000.0,3969800000.0,,540940000.0,212530000.0,304030000.0,168240000.0,647150000.0,97272000.0,17044000.0,722820000.0,1039900000.0,,97563000.0,113910000.0,165780000.0,1300600000.0,,,45051000.0,5129000.0,74014000.0,93039000.0,,1028800000.0,61421000.0,91937000.0,478730000.0,1203200000.0,482320000.0,154770000.0,,145320000.0,93078000.0,44465000.0,43781000.0,141140000.0,,,,311040000.0,,,244060000.0,21452000.0,,815530000.0,5048900000.0,126960000.0,,37639000.0,2986200000.0,,,62851000.0,30934000.0,83691000.0,,,88495000.0,136940000.0,309770000.0,14416000.0,231730000.0,119430000.0,187340000.0,513550000.0,63440000.0,13611000.0,608720000.0,37556000.0,38838000.0,,88781000.0,,3913400000.0,191410000.0,,2430900.0,,,,20443000.0,35860000.0,184260000.0,1304200000.0,5802100.0,,282770000.0,119480000.0,105450000.0,921750000.0,63274000.0,2818100000.0,51392000.0,4384400000.0,10862000.0,182660000.0,302020000.0,35423000.0,396310000.0,
39577000.0,23881000.0,,3374400000.0,11057000.0,995660000.0,,19689000.0,5789300.0,,,,,40183000.0 +2020_01_20_15_10_Q-Exactive-HF-X-Orbitrap_6070,350360000.0,,106720000000.0,194540000.0,234650000.0,,1362300000.0,3426300000.0,1782300000.0,58810000.0,96315000.0,103580000.0,150890000.0,3266000000.0,151360000.0,54347000.0,88056000.0,713020000.0,313150000.0,93349000.0,7639100000.0,382720000.0,897030000.0,2565100000.0,252030000.0,3646700000.0,98760000.0,1986000000.0,114360000.0,745620000.0,1099900000.0,150720000.0,14999000000.0,483880000.0,714350000.0,202960000.0,68747000000.0,124030000.0,6555000000.0,588720000.0,327710000.0,860700000.0,5938200000.0,2604000000.0,1436900000.0,3973700000.0,,458350000.0,,48184000.0,199420000.0,170120000.0,3604200000.0,2123700000.0,,74007000.0,109100000.0,2004500000.0,331090000.0,8505400000.0,294860000.0,267260000.0,319590000.0,278770000.0,160850000.0,693450000.0,137640000.0,5822400000.0,1033000000.0,2040000000.0,774000000.0,54220000.0,88888000.0,249570000.0,333520000.0,147190000.0,727640000.0,202190000.0,109530000.0,6658200000.0,785780000.0,206100000.0,103770000.0,40290000.0,481160000.0,116980000.0,,228870000.0,68678000.0,102360000.0,193640000.0,88877000.0,12042000.0,257680000.0,170560000.0,278570000.0,519810000.0,63776000.0,780370000.0,220410000.0,8175400000.0,3783900000.0,2179500000.0,827770000.0,94686000.0,2419800000.0,,407870000.0,,200390000.0,6601100000.0,1080900000.0,,221290000.0,2044100000.0,512210000.0,21837000.0,1681900000.0,3170800000.0,747810000.0,1497800000.0,665560000.0,397800000.0,5518800000.0,26596000.0,114820000.0,323930000.0,2084500000.0,,2183200000.0,207350000.0,,498460000.0,25076000.0,37619000.0,251260000.0,620980000.0,,1015400000.0,,81224000.0,1485000000.0,178540000.0,7041900000.0,,,246360000.0,200710000.0,251830000.0,49764000.0,1101900000.0,,368450000.0,1126000000.0,1721200000.0,2326200000.0,1206000000.0,115220000.0,2987600000.0,96511000.0,10099000000.0,187090000.0,926440000.0,256180000.0,74516000.0,365110000.0,297290000.0
,860080000.0,773750000.0,275850000.0,1525600000.0,2272200000.0,227030000.0,,227610000.0,141240000.0,3673200000.0,478860000.0,2251200000.0,532930000.0,307780000.0,264780000.0,253210000.0,1083000000.0,,4060000000.0,204560000.0,7215100000.0,3580200000.0,342270000.0,879030000.0,422810000.0,94416000.0,3612900000.0,119550000.0,228600000.0,3444900000.0,1009700000.0,2656200000.0,,1434100000.0,587790000.0,6589600000.0,155980000.0,9693000000.0,,291310000.0,651570000.0,376320000.0,,173810000.0,,6098400000.0,822240000.0,294300000.0,1341000000.0,6062100000.0,10177000000.0,5434700000.0,258190000.0,703190000.0,1331400000.0,96744000.0,218210000.0,100860000000.0,100900000.0,102500000.0,104340000.0,,,247120000.0,2028400000.0,647440000.0,54424000.0,,409460000.0,64918000.0,1355400000.0,88066000.0,201100000.0,32212000.0,127380000.0,1286700000.0,756370000.0,1328400000.0,985310000.0,1307900000.0,37498000000.0,917730000.0,17075000000.0,191180000.0,74706000.0,42159000.0,402430000.0,232530000.0,278140000.0,2720200000.0,29993000000.0,,518530000.0,1435000000.0,,118990000.0,9380300000.0,2382100000.0,147510000.0,184230000.0,307250000.0,273570000.0,9967800000.0,841600000.0,1181400000.0,87268000000.0,14146000000.0,569580000.0,140650000.0,9896500000.0,6317200000.0,768330000.0,,1426500000.0,456070000.0,6828900000.0,966110000.0,650170000.0,67566000.0,75675000.0,554770000.0,4631500000.0,924170000.0,1295500000.0,14712000.0,34077000000.0,44888000.0,620660000.0,11044000000.0,135960000.0,3500200000.0,83786000.0,230370000.0,6616400000.0,14209000.0,90632000.0,1434700000.0,367900000.0,641420000.0,3284900000.0,,692440000.0,87169000.0,,609830000.0,204670000.0,173000000.0,433790000.0,97391000.0,773810000.0,858620000.0,11229000000.0,1554500000.0,,5448600000.0,2936900000.0,18638000000.0,29844000000.0,75779000.0,330780000.0,250380000.0,181470000.0,1131700000.0,1298300000.0,194730000.0,760350000.0,1004300000.0,310620000.0,24685000000.0,1134000000.0,,51750000.0,5198300000.0,11201000000.0,,3802000000.0,1616000000.0,1
594800000.0,2276200000.0,2787500000.0,1231200000.0,526270000.0,14189000000.0,3927500000.0,276290000.0,241550000.0,1009700000.0,2493300000.0,9080200000.0,,4339100000.0,122770000.0,207210000.0,426190000.0,1398500000.0,,4501100000.0,464800000.0,563140000.0,5832300000.0,2609600000.0,863190000.0,360960000.0,738430000.0,9114400000.0,1689700000.0,264500000.0,170370000.0,459850000.0,288440000.0,401340000.0,,1002300000.0,,235300000.0,,895280000.0,,8935200000.0,21867000000.0,1574400000.0,113930000.0,150890000.0,16965000000.0,,555480000.0,344830000.0,257300000.0,16716000.0,257420000.0,212890000.0,277210000.0,1155400000.0,588620000.0,74278000.0,932930000.0,822020000.0,1531200000.0,2122300000.0,192950000.0,197030000.0,2821400000.0,343840000.0,259790000.0,475450000.0,1158800000.0,55096000.0,32889000000.0,1468600000.0,,,138660000.0,247980000.0,2108700000.0,114050000.0,265620000.0,881430000.0,14644000000.0,,487350000.0,630910000.0,866270000.0,,3023700000.0,179120000.0,20944000000.0,474640000.0,18128000000.0,,1275500000.0,1167800000.0,333630000.0,5736000000.0,258460000.0,158770000.0,572460000.0,10934000000.0,32338000.0,4576700000.0,1879400000.0,661420000.0,109990000.0,349540000.0,433690000.0,111790000.0,589790000.0,313040000.0 
+2020_02_05_20_55_Q-Exactive-HF-X-Orbitrap_6070,311950000.0,62610000.0,102460000000.0,211140000.0,173540000.0,,22349000.0,3312300000.0,1881500000.0,,36886000.0,69047000.0,28614000.0,2169200000.0,145200000.0,,83666000.0,352080000.0,107000000.0,72830000.0,6011300000.0,202160000.0,228620000.0,2273900000.0,368710000.0,2848600000.0,27344000.0,2075100000.0,155630000.0,637390000.0,949200000.0,188040000.0,11463000000.0,594860000.0,472060000.0,315050000.0,63591000000.0,60760000.0,5896200000.0,493470000.0,65427000.0,676180000.0,5109400000.0,3364000000.0,551710000.0,3335200000.0,151680000.0,262190000.0,2757300000.0,,132800000.0,88911000.0,3604100000.0,2109700000.0,,90391000.0,149910000.0,1054700000.0,420540000.0,8930600000.0,169320000.0,247760000.0,184660000.0,239590000.0,,538600000.0,80729000.0,5930400000.0,965420000.0,1468400000.0,,61773000.0,29462000.0,169290000.0,300640000.0,58365000.0,499260000.0,196120000.0,,5594400000.0,701780000.0,101230000.0,67629000.0,26267000.0,533180000.0,93153000.0,,31718000.0,541960000.0,,92679000.0,38325000.0,88911000.0,603010000.0,128010000.0,353480000.0,239920000.0,60488000.0,707380000.0,142990000.0,7265600000.0,3266600000.0,1689600000.0,689140000.0,71088000.0,10258000000.0,,376000000.0,,162430000.0,4946600000.0,1797500000.0,77945000.0,95943000.0,74659000.0,166410000.0,48250000.0,1540300000.0,3536600000.0,1015300000.0,1280700000.0,332690000.0,314750000.0,4155200000.0,,47248000.0,264630000.0,1627600000.0,,1829500000.0,136090000.0,49069000.0,379470000.0,42589000.0,50932000.0,137600000.0,447790000.0,,926780000.0,36669000.0,12973000.0,457650000.0,59611000.0,6243100000.0,,,302150000.0,843560000.0,220800000.0,104820000.0,1032400000.0,417440000.0,299700000.0,743540000.0,1336600000.0,1485200000.0,1089100000.0,174670000.0,2097000000.0,548490000.0,6141500000.0,185400000.0,854250000.0,162780000.0,33903000.0,365130000.0,319730000.0,775270000.0,432680000.0,178090000.0,861220000.0,1762800000.0,62100000.0,,46544000.0,,3637300000.0,499100000.0,2019200000.0,51
7500000.0,334780000.0,120980000.0,183850000.0,516320000.0,,5256200000.0,337450000.0,4933300000.0,1694600000.0,263670000.0,665090000.0,309140000.0,114440000.0,3126500000.0,108920000.0,120470000.0,2793200000.0,1484300000.0,1695700000.0,,1115500000.0,375460000.0,5044000000.0,133830000.0,23957000000.0,,349370000.0,368290000.0,183940000.0,336680000.0,70576000.0,45106000.0,5361000000.0,462760000.0,189300000.0,1212400000.0,2975000000.0,8159000000.0,4094900000.0,,498220000.0,1442900000.0,44697000.0,139360000.0,78517000000.0,205150000.0,44497000.0,112130000.0,31460000.0,,173760000.0,1535500000.0,251170000.0,25299000.0,,667240000.0,132330000.0,923650000.0,83633000.0,87830000.0,121980000.0,85884000.0,1131100000.0,388020000.0,2246300000.0,1610500000.0,779970000.0,33288000000.0,855350000.0,19438000000.0,316440000.0,,74561000.0,315090000.0,162580000.0,170240000.0,1283600000.0,21789000000.0,,272830000.0,964460000.0,13237000.0,,7069500000.0,1918400000.0,65017000.0,135890000.0,318980000.0,138370000.0,11826000000.0,738410000.0,990670000.0,86480000000.0,12310000000.0,505860000.0,118550000.0,8425200000.0,4418800000.0,1927000000.0,37450000.0,948810000.0,454450000.0,4794700000.0,649490000.0,398050000.0,86172000.0,94902000.0,344930000.0,3975900000.0,872310000.0,985320000.0,36503000.0,30557000000.0,62454000.0,477220000.0,7805900000.0,226790000.0,2891800000.0,18757000.0,206080000.0,4438600000.0,78382000.0,192110000.0,1406400000.0,473610000.0,457370000.0,,22085000000.0,678930000.0,20911000.0,,54017000.0,313630000.0,38212000.0,357460000.0,105730000.0,588040000.0,935790000.0,10034000000.0,1083000000.0,,4384900000.0,2817000000.0,17041000000.0,24984000000.0,164870000.0,215410000.0,222560000.0,169260000.0,494190000.0,63223000.0,175550000.0,517490000.0,633670000.0,141610000.0,25989000000.0,1370400000.0,,26829000.0,4948700000.0,53255000000.0,,4030600000.0,1266500000.0,1339400000.0,1480800000.0,1953900000.0,654210000.0,287670000.0,11362000000.0,2751700000.0,369440000.0,34104000.0,628380000.0,2814900
000.0,8957200000.0,,123440000.0,24081000.0,250050000.0,294130000.0,904220000.0,20081000.0,3832200000.0,196770000.0,262880000.0,5499800000.0,3793800000.0,1047700000.0,316560000.0,614170000.0,7644100000.0,590500000.0,232760000.0,80170000.0,455940000.0,,175240000.0,,1238300000.0,,100350000.0,,296510000.0,,6109700000.0,14349000000.0,1210600000.0,27102000.0,233310000.0,13848000000.0,,416990000.0,311800000.0,203980000.0,,119550000.0,172400000.0,186870000.0,973950000.0,732140000.0,89290000.0,726540000.0,570860000.0,1418300000.0,2057000000.0,242390000.0,139680000.0,1976800000.0,609540000.0,167940000.0,551920000.0,521610000.0,,20472000000.0,726740000.0,74650000.0,119540000.0,68938000.0,126640000.0,931180000.0,73653000.0,173060000.0,496790000.0,17031000000.0,12654000.0,,453020000.0,836630000.0,,2012100000.0,,16766000000.0,319640000.0,16931000000.0,,989990000.0,1268400000.0,217860000.0,6455500000.0,232740000.0,95020000.0,518620000.0,8242700000.0,27394000.0,3375900000.0,1572800000.0,507850000.0,,,178500000.0,131120000.0,190490000.0,213130000.0 
+2020_02_10_15_41_Q-Exactive-HF-X-Orbitrap_6070,,34589000.0,58597000000.0,718590000.0,,,,2156700000.0,1150400000.0,188750000.0,130090000.0,,194760000.0,2584200000.0,304930000.0,211150000.0,,994800000.0,69128000.0,494930000.0,4617800000.0,65211000.0,249600000.0,4754100000.0,667630000.0,2843300000.0,180840000.0,1701500000.0,164330000.0,147060000.0,79582000.0,173530000.0,12123000000.0,,,478740000.0,55270000000.0,291050000.0,9308800000.0,199310000.0,1564900000.0,719880000.0,8393800000.0,2472600000.0,1551500000.0,3287100000.0,388220000.0,220750000.0,2522400000.0,292830000.0,276290000.0,,2361600000.0,963060000.0,96818000.0,50403000.0,495690000.0,1071800000.0,1222700000.0,5842600000.0,145040000.0,317370000.0,,595100000.0,332200000.0,126280000.0,134640000.0,7418600000.0,,1589800000.0,301700000.0,173240000.0,,477680000.0,816650000.0,19478000.0,4335900000.0,694980000.0,201150000.0,6846000000.0,491610000.0,667480000.0,542220000.0,,,395490000.0,70257000.0,325910000.0,272620000.0,,84273000.0,23815000.0,225850000.0,428990000.0,25751000.0,64593000.0,770010000.0,66540000.0,475810000.0,708160000.0,9783100000.0,3830200000.0,2225200000.0,,106840000.0,16359000000.0,46326000.0,514140000.0,,102740000.0,719080000.0,417240000.0,,,21956000.0,226220000.0,15932000.0,1099200000.0,2375700000.0,449040000.0,1382600000.0,1693900000.0,289930000.0,5305200000.0,285470000.0,155140000.0,166330000.0,2398600000.0,27475000.0,1564800000.0,320680000.0,154220000.0,503580000.0,407810000.0,79186000.0,40500000.0,564550000.0,24193000000.0,443740000.0,99889000.0,147420000.0,342280000.0,322250000.0,7186200000.0,63735000.0,,233530000.0,13940000.0,71534000.0,81967000.0,797550000.0,100060000.0,205300000.0,156500000.0,1381400000.0,2668800000.0,713950000.0,459480000.0,2021600000.0,750660000.0,5223900000.0,,1465200000.0,185110000.0,133870000.0,1532400000.0,458690000.0,604800000.0,434110000.0,795700000.0,373390000.0,361590000.0,168510000.0,29450000.0,132090000.0,108980000.0,2286800000.0,627630000.0,1777800000.0,,22745000
0.0,286310000.0,326640000.0,1927100000.0,117070000.0,7737500000.0,262480000.0,8365900000.0,4119400000.0,354580000.0,902580000.0,440610000.0,,3034100000.0,70149000.0,130630000.0,4970700000.0,1857400000.0,473150000.0,,706000000.0,916950000.0,2749500000.0,181230000.0,24025000000.0,67434000.0,73122000.0,122680000.0,306430000.0,,358930000.0,236010000.0,4125300000.0,,1378700000.0,1357700000.0,,3797900000.0,4161900000.0,105150000.0,444530000.0,2954700000.0,521490000.0,62526000.0,31326000000.0,263730000.0,28947000.0,110570000.0,132820000.0,,,1447700000.0,420180000.0,241100000.0,,,112260000.0,700030000.0,215030000.0,647590000.0,730050000.0,174690000.0,2960300000.0,303330000.0,799640000.0,,825480000.0,25575000000.0,2589000000.0,23028000000.0,251310000.0,568040000.0,,464420000.0,617550000.0,260460000.0,1694600000.0,10930000000.0,226710000.0,318270000.0,1432400000.0,111740000.0,,10112000000.0,3555200000.0,,201960000.0,83822000.0,1240600000.0,11747000000.0,1139700000.0,985260000.0,37823000000.0,13627000000.0,1364100000.0,477640000.0,6596100000.0,3586000000.0,679440000.0,266960000.0,543670000.0,269260000.0,4655100000.0,736060000.0,2089200000.0,153760000.0,75024000.0,528180000.0,1210100000.0,1750300000.0,1200600000.0,145500000.0,25435000000.0,122890000.0,1499800000.0,19694000000.0,25352000.0,3291300000.0,180260000.0,2848500000.0,3786000000.0,,266810000.0,1822200000.0,246700000.0,528690000.0,,,703150000.0,114690000.0,81615000.0,115640000.0,293400000.0,486330000.0,516400000.0,,92050000.0,930890000.0,15124000000.0,1314100000.0,1049000000.0,4490000000.0,1948700000.0,13683000000.0,,804030000.0,164560000.0,690990000.0,60878000.0,179850000.0,2132000000.0,,314900000.0,1084700000.0,511150000.0,32608000000.0,1862800000.0,45887000.0,101330000.0,6213900000.0,44343000000.0,,2111500000.0,1016200000.0,1204600000.0,1014500000.0,3961700000.0,523290000.0,312440000.0,3697700000.0,3561500000.0,,827930000.0,800770000.0,858590000.0,8031000000.0,,127700000.0,331310000.0,157130000.0,387660000.0,955570000
.0,74924000.0,3601200000.0,106800000.0,746040000.0,2226900000.0,3128400000.0,4608400000.0,810690000.0,,1473700000.0,360100000.0,507540000.0,481370000.0,580300000.0,,344480000.0,,2245800000.0,,101370000.0,619760000.0,337920000.0,,3578200000.0,23861000000.0,906860000.0,31658000.0,507650000.0,9029200000.0,,47776000.0,773380000.0,92409000.0,419510000.0,466970000.0,58240000.0,484680000.0,1246900000.0,2457800000.0,89484000.0,1064200000.0,807270000.0,1727300000.0,3478700000.0,405440000.0,72045000.0,3997000000.0,346950000.0,369550000.0,335390000.0,675150000.0,53022000.0,20246000000.0,1541700000.0,,28555000.0,,46901000.0,,419760000.0,134240000.0,654190000.0,10475000000.0,25491000.0,,1369000000.0,1064900000.0,,4334800000.0,389610000.0,13010000000.0,355520000.0,22346000000.0,180720000.0,1677500000.0,1323800000.0,225650000.0,1234800000.0,195000000.0,174900000.0,284520000.0,17693000000.0,122540000.0,5218000000.0,625960000.0,61393000.0,,,,262570000.0,13871000.0,145900000.0 +2020_02_11_10_35_Q-Exactive-HF-X-Orbitrap_6070,,30944000.0,53062000000.0,383700000.0,,33229000.0,,1747300000.0,903190000.0,167270000.0,97290000.0,,227580000.0,2550900000.0,194060000.0,105190000.0,,729830000.0,71552000.0,339300000.0,4011600000.0,90888000.0,176080000.0,4490900000.0,711230000.0,2020600000.0,183320000.0,1499300000.0,142480000.0,129970000.0,162360000.0,154410000.0,11200000000.0,,,188850000.0,47081000000.0,204300000.0,8238000000.0,98994000.0,1203000000.0,434450000.0,5695300000.0,1648800000.0,1089000000.0,2724900000.0,407160000.0,195510000.0,1558400000.0,173510000.0,181280000.0,,1940700000.0,857740000.0,54173000.0,25443000.0,161120000.0,893720000.0,812470000.0,6259400000.0,160380000.0,349600000.0,53614000.0,275520000.0,420690000.0,139440000.0,56231000.0,6901400000.0,,1336300000.0,,110520000.0,,426650000.0,574610000.0,24527000.0,3436300000.0,476790000.0,209730000.0,4832000000.0,557680000.0,276250000.0,363950000.0,,,241480000.0,140680000.0,325400000.0,307750000.0,55656000.0,65220000.0,,184970000.0,4638
90000.0,99064000.0,125270000.0,736140000.0,95367000.0,403330000.0,273120000.0,13171000000.0,3275300000.0,1947000000.0,,82986000.0,6798000000.0,,634380000.0,,90216000.0,596590000.0,413910000.0,,,10287000.0,306360000.0,,831090000.0,2057100000.0,472540000.0,1078700000.0,1506800000.0,191650000.0,4241200000.0,346460000.0,109270000.0,150750000.0,1907500000.0,13180000.0,1662200000.0,205260000.0,124880000.0,390430000.0,214180000.0,25163000.0,43791000.0,435530000.0,17602000000.0,184440000.0,75389000.0,114470000.0,905540000.0,341560000.0,5374700000.0,136450000.0,20572000.0,446960000.0,18487000.0,44927000.0,46046000.0,661090000.0,154640000.0,333090000.0,290980000.0,429260000.0,2255000000.0,537500000.0,351430000.0,1736900000.0,853950000.0,4023300000.0,,1415500000.0,163680000.0,235420000.0,494920000.0,223110000.0,546500000.0,531110000.0,1024500000.0,221890000.0,186320000.0,139990000.0,,83383000.0,71901000.0,1782600000.0,453000000.0,1247800000.0,,238730000.0,352720000.0,332910000.0,1599800000.0,37356000.0,4757000000.0,223100000.0,7330900000.0,3399500000.0,199860000.0,654250000.0,479470000.0,158910000.0,1975600000.0,65523000.0,100360000.0,4080200000.0,1845600000.0,525060000.0,,899770000.0,629560000.0,2552200000.0,151810000.0,20434000000.0,141380000.0,35199000.0,97359000.0,248110000.0,,658000000.0,164750000.0,4399700000.0,,777460000.0,1164700000.0,,3306100000.0,3642800000.0,89639000.0,495080000.0,3284800000.0,566670000.0,118410000.0,29709000000.0,303440000.0,29144000.0,131460000.0,98651000.0,,,1039700000.0,467110000.0,200960000.0,473710000.0,,299110000.0,550480000.0,265620000.0,418740000.0,630230000.0,125110000.0,2221500000.0,309080000.0,810910000.0,,426440000.0,19785000000.0,2190300000.0,23008000000.0,172770000.0,466420000.0,,350850000.0,642440000.0,286620000.0,1264600000.0,11037000000.0,143490000.0,245340000.0,1351500000.0,78809000.0,,8718500000.0,3703800000.0,,196210000.0,283370000.0,888530000.0,10341000000.0,865030000.0,721850000.0,29978000000.0,10854000000.0,1671500000.0,28980
0000.0,6286600000.0,2293100000.0,726280000.0,201930000.0,586140000.0,360140000.0,5337000000.0,764710000.0,1957500000.0,148320000.0,77464000.0,454790000.0,947680000.0,1412200000.0,995660000.0,124220000.0,22484000000.0,133480000.0,1258300000.0,16261000000.0,27375000.0,4823500000.0,222940000.0,2119700000.0,4121400000.0,,136500000.0,1635600000.0,212890000.0,495550000.0,,14186000000.0,1005200000.0,163240000.0,214830000.0,96726000.0,270460000.0,352400000.0,520010000.0,37099000.0,59103000.0,670130000.0,19762000000.0,746250000.0,1007600000.0,4387900000.0,995580000.0,10615000000.0,,848430000.0,197040000.0,479640000.0,,224430000.0,1473300000.0,,186440000.0,698780000.0,430640000.0,29325000000.0,1093800000.0,,112130000.0,5234500000.0,18105000000.0,,1999400000.0,1028700000.0,974890000.0,,3324500000.0,587390000.0,362160000.0,3094700000.0,3732700000.0,,606150000.0,829740000.0,496110000.0,6567000000.0,,,326200000.0,75834000.0,361360000.0,678840000.0,14468000.0,1923900000.0,247740000.0,698410000.0,1766900000.0,3225400000.0,3281300000.0,646520000.0,,735200000.0,581360000.0,399710000.0,287530000.0,320680000.0,,233910000.0,,1933100000.0,,56467000.0,192420000.0,493900000.0,,3484100000.0,14129000000.0,734960000.0,26232000.0,465010000.0,3676500000.0,,177710000.0,478720000.0,108100000.0,240770000.0,338710000.0,22997000.0,640900000.0,923640000.0,1832700000.0,106350000.0,981940000.0,953900000.0,1306600000.0,2934000000.0,512750000.0,119680000.0,4431100000.0,271550000.0,256110000.0,348850000.0,512560000.0,,13045000000.0,1436700000.0,,118650000.0,,78078000.0,,340320000.0,223970000.0,806440000.0,9408200000.0,69125000.0,,1014600000.0,1287500000.0,418710000.0,3710700000.0,368170000.0,10844000000.0,409030000.0,21068000000.0,200160000.0,1066900000.0,1215700000.0,202620000.0,1069100000.0,263230000.0,137560000.0,211710000.0,15593000000.0,95935000.0,5089600000.0,582550000.0,62376000.0,,,,199650000.0,13543000.0,44671000.0 
+2020_02_12_05_06_Q-Exactive-HF-X-Orbitrap_6070,,,57939000000.0,338960000.0,,76125000.0,,2722100000.0,1072600000.0,175110000.0,60125000.0,,258030000.0,2692600000.0,331060000.0,90704000.0,,1124800000.0,39166000.0,426590000.0,4212000000.0,101380000.0,837790000.0,4330200000.0,868110000.0,2570600000.0,122550000.0,1829800000.0,287430000.0,181820000.0,102840000.0,188600000.0,12977000000.0,,61497000.0,691460000.0,54090000000.0,338740000.0,10373000000.0,353640000.0,1325100000.0,489080000.0,6042000000.0,2083400000.0,1490300000.0,3479600000.0,337270000.0,220920000.0,,671390000.0,83511000.0,,2600700000.0,1159700000.0,23363000.0,33530000.0,325920000.0,1027200000.0,1078900000.0,7210700000.0,262100000.0,419270000.0,68571000.0,578920000.0,310240000.0,,190240000.0,6955200000.0,,2065200000.0,434040000.0,100170000.0,25221000.0,446420000.0,687210000.0,,4202200000.0,702020000.0,135070000.0,6543600000.0,300270000.0,486760000.0,758360000.0,,,225100000.0,92536000.0,360900000.0,237710000.0,116260000.0,129020000.0,20157000.0,340710000.0,349230000.0,45821000.0,62634000.0,1165300000.0,76297000.0,367100000.0,335720000.0,10067000000.0,3552100000.0,2017000000.0,,128930000.0,15243000000.0,,820440000.0,,52607000.0,355600000.0,572680000.0,,,1651600000.0,101750000.0,49753000.0,1515500000.0,2142900000.0,677280000.0,1391800000.0,1487700000.0,152920000.0,5790000000.0,395860000.0,168730000.0,240610000.0,1769400000.0,131130000.0,1597800000.0,256150000.0,198970000.0,547800000.0,279470000.0,76951000.0,105930000.0,668500000.0,27535000000.0,,57064000.0,91950000.0,401690000.0,306940000.0,6666000000.0,68662000.0,,309570000.0,104100000.0,133580000.0,57940000.0,991070000.0,169700000.0,273030000.0,396150000.0,1049200000.0,2852000000.0,378940000.0,683720000.0,1551800000.0,864350000.0,4956600000.0,,1626600000.0,,152430000.0,309990000.0,792130000.0,661460000.0,563840000.0,835360000.0,338330000.0,,323400000.0,19690000.0,154570000.0,115860000.0,2177300000.0,470740000.0,2025800000.0,,214210000.0,416160000.0,251840000.0
,1723600000.0,100080000.0,7563900000.0,328060000.0,9625700000.0,3999200000.0,489840000.0,683540000.0,552720000.0,17111000.0,2537700000.0,146840000.0,60922000.0,4959900000.0,2211800000.0,887460000.0,,1113800000.0,627720000.0,2235100000.0,113320000.0,3333900000.0,142360000.0,91454000.0,124800000.0,199710000.0,,663370000.0,208560000.0,5509900000.0,,1099100000.0,897610000.0,,3056800000.0,3682300000.0,177110000.0,339810000.0,2968600000.0,314620000.0,203640000.0,37771000000.0,414960000.0,42709000.0,50194000.0,105250000.0,,,1460600000.0,457990000.0,255850000.0,,,371230000.0,785760000.0,147660000.0,806120000.0,752600000.0,131330000.0,2673100000.0,440570000.0,1016700000.0,,766620000.0,22121000000.0,2924300000.0,28024000000.0,274300000.0,568430000.0,,271020000.0,827020000.0,257590000.0,1317000000.0,10165000000.0,134500000.0,400930000.0,1409700000.0,123250000.0,,10014000000.0,4382400000.0,,189000000.0,68578000.0,1166000000.0,11325000000.0,955160000.0,791270000.0,40566000000.0,14241000000.0,1533600000.0,492380000.0,7659800000.0,3542600000.0,810320000.0,223280000.0,826320000.0,424940000.0,4867700000.0,617390000.0,2391900000.0,,95767000.0,488630000.0,1643800000.0,2972100000.0,1185200000.0,127450000.0,27848000000.0,139650000.0,1542700000.0,21880000000.0,34588000.0,3213700000.0,169710000.0,2551300000.0,4531300000.0,,278040000.0,1317900000.0,213370000.0,487990000.0,,,1378000000.0,186490000.0,59668000.0,163120000.0,696410000.0,458070000.0,432130000.0,158080000.0,69943000.0,861960000.0,23615000000.0,1169400000.0,1050500000.0,5013300000.0,1332800000.0,12624000000.0,,1143500000.0,250360000.0,428570000.0,,292910000.0,2129300000.0,,216340000.0,1061400000.0,453020000.0,35484000000.0,1009100000.0,123010000.0,34840000.0,6443800000.0,21451000000.0,,2486300000.0,1268900000.0,1516300000.0,526420000.0,4552900000.0,937550000.0,231730000.0,4523200000.0,3740900000.0,,710370000.0,922290000.0,710330000.0,7031600000.0,,44610000.0,459480000.0,141020000.0,321790000.0,804390000.0,18096000.0,3610700000.0,
110520000.0,541170000.0,2428500000.0,3835300000.0,3048800000.0,758090000.0,,1145200000.0,832220000.0,451630000.0,396440000.0,645180000.0,,172740000.0,,2103800000.0,,,426530000.0,150510000.0,,3546700000.0,21179000000.0,602260000.0,31531000.0,411580000.0,7111400000.0,,86381000.0,545690000.0,111200000.0,358770000.0,290380000.0,94995000.0,470420000.0,1176300000.0,2649900000.0,119090000.0,978340000.0,698700000.0,1698500000.0,3731200000.0,455780000.0,125050000.0,5576100000.0,261290000.0,373330000.0,358260000.0,686960000.0,36564000.0,16737000000.0,1267300000.0,,82170000.0,,32136000.0,45755000.0,258310000.0,179330000.0,884370000.0,11154000000.0,,,1156900000.0,1134400000.0,380210000.0,5025500000.0,458430000.0,11244000000.0,393790000.0,23410000000.0,152570000.0,2119100000.0,1390500000.0,157530000.0,1807100000.0,270790000.0,290780000.0,253280000.0,18126000000.0,169950000.0,5230900000.0,703690000.0,286660000.0,,,,251840000.0,12658000.0,160880000.0 +2020_02_13_00_26_Q-Exactive-HF-X-Orbitrap_6070,,,64983000000.0,472240000.0,,,,2537700000.0,1403400000.0,163550000.0,133200000.0,,364170000.0,3086700000.0,238630000.0,307690000.0,,919980000.0,37861000.0,423620000.0,4152800000.0,94567000.0,393980000.0,4285000000.0,925000000.0,2786900000.0,161160000.0,1538400000.0,116900000.0,167760000.0,131680000.0,134720000.0,14057000000.0,,,483470000.0,59447000000.0,323510000.0,11521000000.0,238010000.0,1314500000.0,725200000.0,7590100000.0,2355500000.0,1308100000.0,3403400000.0,305370000.0,382780000.0,,928590000.0,148130000.0,,3359800000.0,1590600000.0,60507000.0,138140000.0,371500000.0,1313700000.0,857230000.0,8014000000.0,259600000.0,377690000.0,,533750000.0,169950000.0,37456000.0,194600000.0,8719000000.0,,2622100000.0,311840000.0,162970000.0,57906000.0,422990000.0,875690000.0,,4388200000.0,597850000.0,193420000.0,6501600000.0,434740000.0,474180000.0,596160000.0,,,414360000.0,162110000.0,179660000.0,171180000.0,106950000.0,125840000.0,52898000.0,288170000.0,429820000.0,,27620000.0,771510000.0,6260
3000.0,364080000.0,274790000.0,11030000000.0,4201100000.0,2455400000.0,,89960000.0,14442000000.0,,664410000.0,,58098000.0,690830000.0,464760000.0,,,2132500000.0,169080000.0,,1159000000.0,2429300000.0,442430000.0,1608300000.0,1740700000.0,193690000.0,6007900000.0,512980000.0,61891000.0,206990000.0,2517800000.0,35411000.0,1937700000.0,283730000.0,193790000.0,432890000.0,125940000.0,115320000.0,120400000.0,394730000.0,29755000000.0,,71546000.0,3053400000.0,,219890000.0,6808700000.0,95652000.0,,331680000.0,109940000.0,321370000.0,60864000.0,1102000000.0,136490000.0,316720000.0,313660000.0,548200000.0,2023800000.0,688650000.0,290640000.0,1884000000.0,974460000.0,5087700000.0,,1697900000.0,129570000.0,311940000.0,633110000.0,491690000.0,530850000.0,412670000.0,1297000000.0,274750000.0,112110000.0,106620000.0,63214000.0,107530000.0,154880000.0,2645100000.0,494590000.0,1508100000.0,,130700000.0,386340000.0,301860000.0,1821600000.0,115810000.0,6567900000.0,374750000.0,8492100000.0,4265300000.0,349630000.0,805470000.0,639200000.0,236060000.0,3159200000.0,100790000.0,87025000.0,4073500000.0,2663700000.0,960180000.0,,690490000.0,465280000.0,2585900000.0,219010000.0,26630000000.0,199670000.0,103670000.0,331400000.0,297440000.0,127490000.0,875650000.0,193920000.0,5650000000.0,,1043900000.0,1881000000.0,,3680700000.0,3393000000.0,129350000.0,469440000.0,3828000000.0,455210000.0,74455000.0,28502000000.0,358810000.0,,131320000.0,85759000.0,,,1744500000.0,490130000.0,280880000.0,544320000.0,,311930000.0,917030000.0,317500000.0,411830000.0,772370000.0,136100000.0,2509900000.0,460780000.0,1312600000.0,,588980000.0,23811000000.0,2712800000.0,28470000000.0,301940000.0,669750000.0,,264560000.0,882310000.0,332890000.0,1535800000.0,10036000000.0,121670000.0,505620000.0,1277500000.0,91759000.0,12460000.0,9800000000.0,4977800000.0,,209670000.0,424740000.0,1113100000.0,13626000000.0,823990000.0,1132700000.0,42092000000.0,14468000000.0,2577500000.0,375640000.0,7049300000.0,3255700000.0,87383000
0.0,230270000.0,634460000.0,528760000.0,6205000000.0,915340000.0,2210100000.0,157480000.0,68266000.0,600440000.0,1308500000.0,2880200000.0,636290000.0,131450000.0,30588000000.0,46477000.0,1470500000.0,21472000000.0,,5990700000.0,134840000.0,378950000.0,5282200000.0,,248770000.0,1403400000.0,265680000.0,711320000.0,,,1065000000.0,60518000.0,90794000.0,239450000.0,783400000.0,482420000.0,587970000.0,127930000.0,101560000.0,918020000.0,16690000000.0,1287300000.0,1513800000.0,4523700000.0,1816600000.0,13559000000.0,,818740000.0,167980000.0,487980000.0,,328600000.0,2564200000.0,,307500000.0,965900000.0,250150000.0,38355000000.0,1132500000.0,,,6108900000.0,25166000000.0,,2477100000.0,1163100000.0,1966700000.0,1089900000.0,3879900000.0,727820000.0,306220000.0,4296500000.0,4576800000.0,,784110000.0,1112600000.0,1163400000.0,7970700000.0,,,352780000.0,191250000.0,691000000.0,830850000.0,24422000.0,3246000000.0,221060000.0,752680000.0,2297800000.0,4610400000.0,3404300000.0,684650000.0,,1300400000.0,614700000.0,321190000.0,445710000.0,727500000.0,21224000.0,187780000.0,,1837800000.0,23611000.0,112040000.0,1595100000.0,572390000.0,,4135800000.0,20688000000.0,728390000.0,,417000000.0,9503100000.0,,187470000.0,676950000.0,136160000.0,254210000.0,407340000.0,,338740000.0,1028700000.0,2498100000.0,178880000.0,1017400000.0,1009000000.0,777280000.0,3968400000.0,600110000.0,24047000.0,5758800000.0,292470000.0,330880000.0,504250000.0,693210000.0,,17464000000.0,1896900000.0,,158460000.0,18384000.0,52328000.0,66436000.0,264110000.0,116450000.0,786270000.0,10855000000.0,77351000.0,,1427300000.0,1065200000.0,554810000.0,5632700000.0,700850000.0,14179000000.0,282300000.0,23748000000.0,158290000.0,2150900000.0,1589300000.0,222220000.0,857230000.0,410410000.0,146950000.0,217440000.0,19294000000.0,115500000.0,5084200000.0,921470000.0,213270000.0,56332000.0,,,236370000.0,17131000.0,28695000.0 
+2020_02_13_03_11_Q-Exactive-HF-X-Orbitrap_6070,,,67090000000.0,607260000.0,,,,2468300000.0,1375100000.0,137540000.0,143400000.0,,405140000.0,3202900000.0,293920000.0,288920000.0,,983600000.0,76238000.0,405900000.0,4896700000.0,132570000.0,1131700000.0,4864300000.0,994220000.0,2701800000.0,510660000.0,1721100000.0,174550000.0,212990000.0,,213240000.0,14513000000.0,,26727000.0,332750000.0,65575000000.0,361230000.0,10005000000.0,587800000.0,1580600000.0,603630000.0,8419700000.0,2515500000.0,1306100000.0,4017800000.0,387960000.0,278090000.0,2598300000.0,908100000.0,82926000.0,,3153500000.0,1804700000.0,72178000.0,152080000.0,282020000.0,1264100000.0,1144800000.0,8790400000.0,193580000.0,365050000.0,47402000.0,494040000.0,105080000.0,52281000.0,195370000.0,10166000000.0,,3003500000.0,479740000.0,106750000.0,101860000.0,409580000.0,722790000.0,37581000.0,3813000000.0,755440000.0,72127000.0,7121200000.0,782110000.0,609560000.0,800740000.0,,63592000.0,372550000.0,180760000.0,454000000.0,360890000.0,128660000.0,,63478000.0,283490000.0,476980000.0,,37133000.0,1012400000.0,55725000.0,349050000.0,351850000.0,12504000000.0,4011900000.0,2425600000.0,,135230000.0,12980000000.0,91364000.0,771550000.0,,59584000.0,954550000.0,810670000.0,,,86793000.0,219650000.0,,1117200000.0,2374300000.0,718610000.0,1829700000.0,2335800000.0,210260000.0,7113800000.0,436890000.0,72885000.0,,2388400000.0,,1899900000.0,337980000.0,151770000.0,628740000.0,294790000.0,115050000.0,,453870000.0,30347000000.0,,,118190000.0,,250580000.0,6005700000.0,231260000.0,,292430000.0,138960000.0,101500000.0,58904000.0,1319900000.0,166470000.0,321160000.0,379710000.0,942010000.0,2291100000.0,630230000.0,389570000.0,2308500000.0,1076000000.0,5357900000.0,16198000.0,1986600000.0,188760000.0,261720000.0,708270000.0,579810000.0,697160000.0,717020000.0,1535400000.0,401190000.0,,36277000.0,38779000.0,99191000.0,159550000.0,2670300000.0,488370000.0,1613200000.0,,118260000.0,521260000.0,373550000.0,2418300000.0,134730000.0,71
26600000.0,279760000.0,9050300000.0,3791600000.0,459950000.0,659860000.0,858000000.0,160620000.0,3805800000.0,144130000.0,45275000.0,5636900000.0,2009100000.0,1014200000.0,,1276500000.0,750320000.0,2573800000.0,128550000.0,31105000000.0,95545000.0,111520000.0,279950000.0,302820000.0,,593350000.0,259950000.0,5274500000.0,,989420000.0,1842700000.0,,3470200000.0,4698600000.0,28951000.0,364530000.0,4167800000.0,665410000.0,222010000.0,31527000000.0,357940000.0,124330000.0,161670000.0,93390000.0,,,1811800000.0,440590000.0,264710000.0,,,344860000.0,1033800000.0,287880000.0,739840000.0,708830000.0,124820000.0,2919400000.0,671260000.0,2422100000.0,,599110000.0,29266000000.0,3140100000.0,29466000000.0,480150000.0,733540000.0,54178000.0,192970000.0,1116500000.0,292230000.0,1708200000.0,10488000000.0,272830000.0,590620000.0,1751400000.0,100250000.0,,10564000000.0,4265300000.0,,225220000.0,43627000.0,1297700000.0,13855000000.0,920440000.0,1202900000.0,41832000000.0,16606000000.0,2336300000.0,520230000.0,8865500000.0,3722600000.0,942130000.0,278780000.0,734980000.0,548390000.0,6716200000.0,821350000.0,1741900000.0,144160000.0,36352000.0,542820000.0,1327400000.0,2115000000.0,908190000.0,152870000.0,31324000000.0,97915000.0,1562700000.0,23460000000.0,,6196700000.0,203340000.0,3676200000.0,5370100000.0,,238130000.0,1614000000.0,289070000.0,861470000.0,,,985140000.0,179240000.0,83610000.0,242050000.0,362030000.0,622710000.0,593320000.0,43743000.0,152290000.0,974940000.0,18748000000.0,1539500000.0,1067700000.0,5235700000.0,1710500000.0,13299000000.0,,726170000.0,223820000.0,397110000.0,,273400000.0,2806000000.0,,280000000.0,1184900000.0,269520000.0,38558000000.0,3106500000.0,,,7223400000.0,51371000000.0,,2516300000.0,1380700000.0,2458000000.0,1065500000.0,4593700000.0,657450000.0,810420000.0,4483800000.0,5310900000.0,,743430000.0,705480000.0,1290600000.0,7273300000.0,,,456500000.0,318050000.0,399440000.0,1141300000.0,15172000.0,5173200000.0,168460000.0,885230000.0,2463000000.0,465950
0000.0,4223900000.0,605650000.0,,1506800000.0,778770000.0,378000000.0,478440000.0,710420000.0,156590000.0,221050000.0,,2203800000.0,59944000.0,,259880000.0,457400000.0,,4374500000.0,21292000000.0,811270000.0,,495020000.0,11459000000.0,,201130000.0,615750000.0,147200000.0,,540870000.0,58328000.0,445270000.0,1388700000.0,2752200000.0,199800000.0,791360000.0,1101300000.0,1162100000.0,4772300000.0,431410000.0,111650000.0,5004500000.0,125290000.0,398680000.0,179780000.0,770180000.0,,22653000000.0,2168400000.0,,121410000.0,17654000.0,62138000.0,49240000.0,303550000.0,49965000.0,985110000.0,11854000000.0,80778000.0,,1734700000.0,1469700000.0,677200000.0,5213800000.0,654640000.0,12259000000.0,354580000.0,25825000000.0,153190000.0,2165300000.0,1801900000.0,350060000.0,828820000.0,227900000.0,241080000.0,45290000.0,22527000000.0,141190000.0,6136400000.0,799860000.0,473780000.0,46869000.0,,846620000.0,172480000.0,14830000.0,37496000.0 +2020_02_17_13_55_Q-Exactive-HF-X-Orbitrap_6070,,,33250000000.0,197270000.0,,22099000.0,,898980000.0,717540000.0,180550000.0,53316000.0,,45790000.0,2051900000.0,126280000.0,234280000.0,,473960000.0,,96380000.0,2376200000.0,79288000.0,,2583700000.0,292730000.0,936610000.0,165720000.0,909880000.0,125090000.0,87746000.0,110360000.0,70943000.0,6666100000.0,,,203730000.0,27111000000.0,154120000.0,4755900000.0,322100000.0,726010000.0,248820000.0,3510900000.0,1175300000.0,683300000.0,1709400000.0,217530000.0,232130000.0,1566500000.0,113620000.0,288130000.0,,1365000000.0,296430000.0,59097000.0,21097000.0,224150000.0,1099900000.0,440990000.0,4540600000.0,118490000.0,271240000.0,44879000.0,426310000.0,153260000.0,29403000.0,48219000.0,4952900000.0,,941000000.0,271320000.0,28066000.0,113420000.0,201190000.0,219990000.0,,1809000000.0,228590000.0,151780000.0,3503400000.0,286950000.0,203250000.0,246120000.0,,,179740000.0,112020000.0,34448000.0,212460000.0,,39359000.0,28675000.0,165440000.0,227360000.0,58657000.0,73383000.0,481020000.0,39252000.0,134720000.0,23
1850000.0,7599500000.0,1841500000.0,923240000.0,,38597000.0,4625900000.0,44990000.0,506400000.0,,90614000.0,586430000.0,232150000.0,,,,98156000.0,,501350000.0,2200300000.0,293880000.0,261660000.0,1007000000.0,94645000.0,3455200000.0,107370000.0,112840000.0,,1153300000.0,,781590000.0,257350000.0,68950000.0,176660000.0,204020000.0,48856000.0,,258960000.0,14265000000.0,,40741000.0,85996000.0,241120000.0,102600000.0,2823900000.0,25322000.0,10865000.0,267680000.0,42427000.0,67895000.0,7670000.0,335340000.0,100070000.0,262750000.0,105380000.0,534400000.0,1023600000.0,457650000.0,126670000.0,1350600000.0,261500000.0,3112000000.0,,684660000.0,194640000.0,32287000.0,330900000.0,175740000.0,346480000.0,321110000.0,357660000.0,314470000.0,37157000.0,95091000.0,,28553000.0,91572000.0,1389600000.0,537300000.0,562370000.0,,117090000.0,273750000.0,144940000.0,907920000.0,33794000.0,3808600000.0,133720000.0,4126200000.0,3683700000.0,235010000.0,374750000.0,253130000.0,155290000.0,1403300000.0,,29669000.0,2034600000.0,705440000.0,400010000.0,640370000.0,224930000.0,424610000.0,1107900000.0,113210000.0,1497900000.0,49614000.0,355210000.0,90142000.0,87339000.0,,452300000.0,139750000.0,2559800000.0,,506230000.0,586570000.0,,1317100000.0,2414900000.0,64195000.0,215930000.0,2295800000.0,156850000.0,45371000.0,19660000000.0,166210000.0,64316000.0,62967000.0,8414400.0,,,842630000.0,221680000.0,99166000.0,,,91596000.0,626460000.0,73218000.0,425730000.0,200830000.0,103960000.0,1369800000.0,135480000.0,456280000.0,,429780000.0,13428000000.0,1388900000.0,13770000000.0,260380000.0,287420000.0,8726400.0,190130000.0,514140000.0,91322000.0,769360000.0,5112300000.0,22240000.0,290900000.0,345310000.0,42792000.0,,3825700000.0,2427300000.0,,87343000.0,195550000.0,335970000.0,6455000000.0,298120000.0,343110000.0,20331000000.0,8019000000.0,688500000.0,265730000.0,3302100000.0,2154600000.0,342180000.0,101170000.0,308420000.0,162670000.0,2780300000.0,1816300000.0,1058800000.0,,,224410000.0,642800000.0,962
050000.0,620230000.0,11827000.0,17946000000.0,52107000.0,837290000.0,12784000000.0,,2472800000.0,78420000.0,69021000.0,2943700000.0,,234150000.0,869830000.0,131040000.0,415460000.0,,12075000000.0,291850000.0,160930000.0,67141000.0,50303000.0,227280000.0,438860000.0,242010000.0,62568000.0,43746000.0,596150000.0,7283200000.0,365470000.0,2475200000.0,2536500000.0,651750000.0,7909700000.0,,270650000.0,131480000.0,215980000.0,,164840000.0,44610000.0,,189990000.0,635870000.0,78366000.0,16318000000.0,1286600000.0,92756000.0,59235000.0,3366100000.0,12082000000.0,,2505600000.0,712710000.0,1068100000.0,675760000.0,2724800000.0,298790000.0,392500000.0,1671600000.0,2519500000.0,,410770000.0,345940000.0,672060000.0,4533200000.0,,,294470000.0,110550000.0,215880000.0,383250000.0,22220000.0,3651700000.0,93818000.0,468400000.0,1526500000.0,3857400000.0,1008000000.0,306860000.0,,346420000.0,571240000.0,127030000.0,159300000.0,311240000.0,,334480000.0,436440000.0,780210000.0,,36873000.0,408660000.0,95205000.0,,1888100000.0,10574000000.0,604050000.0,9355200.0,204570000.0,6007800000.0,,,196350000.0,122170000.0,164170000.0,187400000.0,,244590000.0,626770000.0,977360000.0,165200000.0,634420000.0,488410000.0,910350000.0,1586700000.0,186670000.0,91748000.0,2346100000.0,362190000.0,203790000.0,66762000.0,504320000.0,,9885700000.0,606300000.0,,83513000.0,11722000.0,26017000.0,,114170000.0,76167000.0,341930000.0,5178800000.0,59377000.0,,947520000.0,545790000.0,,1634800000.0,198590000.0,4459200000.0,119880000.0,12157000000.0,141210000.0,625800000.0,1104600000.0,177530000.0,1760800000.0,128400000.0,137950000.0,158410000.0,9049700000.0,11424000.0,3748500000.0,619560000.0,,57551000.0,279570000.0,,64708000.0,,98188000.0 
+2020_02_18_01_25_Q-Exactive-HF-X-Orbitrap_6070,,,33673000000.0,55309000.0,,28283000.0,,990810000.0,629120000.0,44760000.0,67128000.0,,40202000.0,1183700000.0,206950000.0,194510000.0,,497860000.0,36460000.0,169500000.0,2456300000.0,84651000.0,193100000.0,1878800000.0,282800000.0,954100000.0,93034000.0,1151700000.0,100820000.0,120690000.0,94362000.0,,6035000000.0,,58865000.0,156810000.0,24420000000.0,101650000.0,4120300000.0,203100000.0,501710000.0,140060000.0,3388400000.0,955210000.0,407190000.0,1352700000.0,136650000.0,78504000.0,1127300000.0,568380000.0,70108000.0,,1195100000.0,315730000.0,53364000.0,83636000.0,53321000.0,364800000.0,312020000.0,4448100000.0,126980000.0,108700000.0,,230380000.0,123140000.0,,18740000.0,5599500000.0,15833000.0,928120000.0,244800000.0,44210000.0,38540000.0,226830000.0,310550000.0,,1255000000.0,288020000.0,36359000.0,3261100000.0,257130000.0,121100000.0,291150000.0,,,87191000.0,127640000.0,179090000.0,54969000.0,108270000.0,57429000.0,31577000.0,86331000.0,161820000.0,53993000.0,,451400000.0,,113730000.0,145090000.0,7108100000.0,1510000000.0,957320000.0,,75886000.0,7569300000.0,13117000.0,602050000.0,35080000.0,29577000.0,497170000.0,318860000.0,,,21933000.0,66480000.0,,332380000.0,2202300000.0,286210000.0,412830000.0,653380000.0,82978000.0,3031900000.0,101320000.0,36341000.0,26950000.0,771580000.0,46018000.0,635460000.0,110870000.0,29851000.0,235230000.0,184180000.0,36465000.0,7178200.0,201300000.0,,,39967000.0,49050000.0,329560000.0,32588000.0,3417500000.0,64008000.0,,74257000.0,40294000.0,52904000.0,31203000.0,309390000.0,114360000.0,209900000.0,249700000.0,762260000.0,802820000.0,508220000.0,62317000.0,1384000000.0,198460000.0,2924800000.0,,759850000.0,,,261550000.0,454930000.0,253230000.0,249440000.0,768160000.0,171960000.0,,33878000.0,,43545000.0,102210000.0,1434100000.0,233040000.0,986290000.0,,,172940000.0,202640000.0,796820000.0,138430000.0,2670900000.0,85696000.0,4698300000.0,2232700000.0,171830000.0,133190000.0,222860000.0,
200760000.0,1028900000.0,89147000.0,42336000.0,2186100000.0,813600000.0,212470000.0,217960000.0,,277110000.0,1342300000.0,66267000.0,1508600000.0,13705000.0,304590000.0,81560000.0,28064000.0,,183380000.0,102590000.0,2416400000.0,,369460000.0,533910000.0,,759950000.0,1934500000.0,82301000.0,129010000.0,1555000000.0,89197000.0,31213000.0,23159000000.0,91052000.0,27280000.0,9248100.0,13419000.0,,,654040000.0,167480000.0,,776910000.0,,110360000.0,444770000.0,35037000.0,85602000.0,106840000.0,99408000.0,1255600000.0,106630000.0,352370000.0,,274280000.0,11584000000.0,898690000.0,11728000000.0,114830000.0,208360000.0,20289000.0,146930000.0,414860000.0,110370000.0,888930000.0,6710000000.0,61332000.0,258970000.0,207260000.0,32020000.0,,2804900000.0,1873500000.0,46673000.0,30465000.0,182460000.0,446300000.0,5102900000.0,441230000.0,211560000.0,18020000000.0,6118600000.0,609090000.0,138720000.0,3410700000.0,1939400000.0,216750000.0,60098000.0,461370000.0,190370000.0,2228600000.0,638470000.0,621950000.0,46452000.0,31129000.0,191040000.0,722030000.0,1804600000.0,685130000.0,11041000.0,21129000000.0,71411000.0,858910000.0,10277000000.0,,1846500000.0,61208000.0,1618100000.0,2642900000.0,25557000.0,94768000.0,813400000.0,198750000.0,278890000.0,,10407000000.0,329740000.0,26330000.0,25102000.0,103510000.0,93741000.0,209720000.0,250390000.0,41141000.0,39103000.0,424300000.0,7420200000.0,453190000.0,419880000.0,2714800000.0,605630000.0,6675600000.0,,348290000.0,138600000.0,201730000.0,72852000.0,53426000.0,30705000.0,,96642000.0,383140000.0,16418000.0,14045000000.0,873140000.0,,17697000.0,2071400000.0,10652000000.0,,2130900000.0,569800000.0,1145000000.0,484530000.0,2081300000.0,182200000.0,204910000.0,1988300000.0,2899100000.0,,325350000.0,518270000.0,294980000.0,3693900000.0,,42744000.0,195670000.0,50369000.0,81653000.0,489480000.0,22551000.0,2346300000.0,73144000.0,380180000.0,1280500000.0,2995100000.0,1154700000.0,555090000.0,,571830000.0,486560000.0,35153000.0,212030000.0,31990000
0.0,,74090000.0,,619610000.0,15021000.0,49179000.0,79376000.0,107230000.0,,1857300000.0,7006200000.0,458420000.0,,117120000.0,6544800000.0,,30857000.0,92044000.0,30321000.0,199490000.0,179240000.0,41323000.0,289260000.0,408420000.0,1062800000.0,100210000.0,1177200000.0,407810000.0,835400000.0,1361800000.0,110390000.0,30887000.0,2169300000.0,142870000.0,105830000.0,,289880000.0,,11516000000.0,836820000.0,,21266000.0,,21401000.0,29153000.0,79914000.0,13951000.0,715290000.0,4505900000.0,,,1123000000.0,452510000.0,108670000.0,2100200000.0,229080000.0,5233100000.0,714960000.0,13402000000.0,185880000.0,719050000.0,1188300000.0,89373000.0,1412900000.0,164170000.0,61201000.0,120260000.0,9068100000.0,,3731600000.0,707440000.0,46840000.0,27168000.0,,100640000.0,,,206920000.0 +2020_02_18_18_55_Q-Exactive-HF-X-Orbitrap_6070,,,38552000000.0,405180000.0,,,,1026100000.0,454380000.0,106280000.0,94657000.0,,68494000.0,1087700000.0,202380000.0,,,446910000.0,23843000.0,150050000.0,1944300000.0,40716000.0,,2499500000.0,347370000.0,1241700000.0,125900000.0,345980000.0,237420000.0,40040000.0,27716000.0,59630000.0,5760600000.0,,12085000.0,263900000.0,32063000000.0,89546000.0,4182100000.0,,750970000.0,258380000.0,2886900000.0,1030500000.0,697250000.0,2058600000.0,222240000.0,152920000.0,1668200000.0,137330000.0,186930000.0,,1082500000.0,518440000.0,13549000.0,,231950000.0,569950000.0,556160000.0,5388500000.0,173460000.0,202570000.0,42742000.0,243670000.0,143330000.0,36874000.0,,4152400000.0,,897780000.0,246700000.0,51446000.0,30991000.0,55918000.0,305200000.0,,2046000000.0,130560000.0,117210000.0,3299200000.0,278000000.0,169890000.0,213070000.0,,,771760000.0,70155000.0,73692000.0,387700000.0,33698000.0,28464000.0,24754000.0,147550000.0,95144000.0,70163000.0,102910000.0,321070000.0,848920000.0,259470000.0,40785000.0,3649200000.0,2018900000.0,1115300000.0,,61388000.0,6919600000.0,17117000.0,395200000.0,,60072000.0,362750000.0,381530000.0,,,8700200.0,35084000.0,,576930000.0,1613500000.0,25280
0000.0,773900000.0,821710000.0,140380000.0,3439000000.0,86446000.0,,21230000.0,1369500000.0,,663590000.0,264210000.0,105060000.0,103450000.0,173760000.0,58242000.0,9195400.0,268400000.0,,,27382000.0,32486000.0,451270000.0,83956000.0,3290600000.0,14945000.0,,181050000.0,49465000.0,142600000.0,10214000.0,301320000.0,50796000.0,87290000.0,86701000.0,803900000.0,1028800000.0,299480000.0,121240000.0,843900000.0,420740000.0,3151000000.0,9794900.0,434610000.0,204090000.0,98028000.0,220250000.0,327470000.0,236630000.0,413380000.0,750240000.0,208070000.0,42137000.0,138090000.0,,78134000.0,,1518600000.0,157620000.0,552810000.0,,154350000.0,237100000.0,74646000.0,974690000.0,36311000.0,4010300000.0,169500000.0,3594200000.0,2407500000.0,245740000.0,356490000.0,222220000.0,77228000.0,1572500000.0,53501000.0,57934000.0,3120000000.0,743380000.0,477430000.0,,455850000.0,489640000.0,770950000.0,,16016000000.0,21511000.0,137440000.0,53823000.0,129270000.0,,124630000.0,83850000.0,2835500000.0,,344090000.0,851400000.0,,1695700000.0,1587900000.0,39517000.0,206420000.0,2066600000.0,78658000.0,57678000.0,18182000000.0,128790000.0,53064000.0,81893000.0,18083000.0,,,689050000.0,179370000.0,126030000.0,272250000.0,,183910000.0,352330000.0,134140000.0,305860000.0,346480000.0,34282000.0,1406800000.0,143100000.0,410730000.0,,212790000.0,9509900000.0,1171300000.0,11882000000.0,284630000.0,182680000.0,13708000.0,215260000.0,362430000.0,257350000.0,1120200000.0,4430400000.0,,189290000.0,622460000.0,18776000.0,,4191200000.0,1078700000.0,17602000.0,39416000.0,323200000.0,399550000.0,5123700000.0,670010000.0,544460000.0,23729000000.0,7108900000.0,743400000.0,291660000.0,3804600000.0,2286200000.0,371910000.0,45266000.0,348660000.0,205130000.0,2852800000.0,620110000.0,553000000.0,67876000.0,42746000.0,168810000.0,506400000.0,670910000.0,425320000.0,30105000.0,19862000000.0,27461000.0,855960000.0,8897800000.0,,2498100000.0,60363000.0,,2585800000.0,21307000.0,88405000.0,740290000.0,171620000.0,252400000.
0,,,270520000.0,26898000.0,72600000.0,34521000.0,193680000.0,254710000.0,254890000.0,42145000.0,,520540000.0,5378800000.0,438860000.0,301850000.0,2862700000.0,659410000.0,5184100000.0,,279870000.0,23663000.0,284220000.0,72271000.0,118540000.0,1061900000.0,,135380000.0,441880000.0,171400000.0,15585000000.0,824790000.0,55617000.0,83935000.0,2196500000.0,30254000000.0,,688170000.0,685830000.0,900580000.0,761070000.0,2464600000.0,379770000.0,279790000.0,2073800000.0,1500700000.0,,245970000.0,376940000.0,247940000.0,4634100000.0,,49069000.0,232870000.0,48755000.0,65133000.0,308790000.0,44640000.0,2923000000.0,148660000.0,321490000.0,994230000.0,2455100000.0,1990500000.0,170740000.0,,485490000.0,368240000.0,139870000.0,229920000.0,151770000.0,,148650000.0,,996420000.0,,46712000.0,442720000.0,53375000.0,,1870200000.0,8857900000.0,594700000.0,,224730000.0,4240600000.0,,,254140000.0,38855000.0,84837000.0,74403000.0,47116000.0,296270000.0,466370000.0,1072900000.0,16581000.0,731820000.0,487560000.0,504180000.0,1754700000.0,311180000.0,29103000.0,2075800000.0,172920000.0,177500000.0,53991000.0,637060000.0,,9883800000.0,949410000.0,,72858000.0,,,,143840000.0,,479590000.0,5194400000.0,36475000.0,,640850000.0,455220000.0,110110000.0,1953200000.0,132670000.0,5827900000.0,190700000.0,10729000000.0,118770000.0,611360000.0,811490000.0,158250000.0,469660000.0,192480000.0,,185220000.0,7146500000.0,29265000.0,2976300000.0,208610000.0,146220000.0,20561000.0,,,60018000.0,,93126000.0 
+2020_02_28_12_27_Q-Exactive-HF-X-Orbitrap_6070,,,27783000000.0,258140000.0,,13444000.0,,842130000.0,302240000.0,54047000.0,32974000.0,,23629000.0,1525600000.0,94651000.0,83648000.0,,405720000.0,27219000.0,95848000.0,1998100000.0,36108000.0,209600000.0,2156100000.0,226640000.0,917410000.0,63872000.0,462280000.0,68932000.0,98304000.0,46768000.0,58598000.0,4921700000.0,,,209410000.0,22553000000.0,59262000.0,4079500000.0,104900000.0,529270000.0,106780000.0,2273000000.0,780850000.0,452990000.0,1437400000.0,106200000.0,118890000.0,,270880000.0,91650000.0,,702000000.0,310430000.0,14652000.0,62392000.0,143300000.0,360740000.0,363090000.0,3806200000.0,98445000.0,179910000.0,35166000.0,182410000.0,258550000.0,61576000.0,,4020100000.0,,1308200000.0,140500000.0,40038000.0,29098000.0,82506000.0,246060000.0,,1267500000.0,208570000.0,70392000.0,2400700000.0,341200000.0,228500000.0,255610000.0,,,74567000.0,86666000.0,141860000.0,119640000.0,50209000.0,56303000.0,53820000.0,99504000.0,86438000.0,39708000.0,61378000.0,357810000.0,38307000.0,157590000.0,104370000.0,5216700000.0,1393100000.0,910970000.0,,22510000.0,2578700000.0,11463000.0,383350000.0,,68867000.0,310380000.0,212160000.0,,,28881000.0,103030000.0,,321940000.0,555660000.0,210270000.0,442170000.0,549320000.0,78250000.0,1962900000.0,109570000.0,,46895000.0,881350000.0,,538620000.0,45498000.0,60343000.0,158340000.0,81553000.0,32808000.0,,232760000.0,,84210000.0,55366000.0,43752000.0,,134120000.0,2190600000.0,26368000.0,7649300.0,227860000.0,21245000.0,111060000.0,25019000.0,521550000.0,,23612000.0,106420000.0,547750000.0,865200000.0,338990000.0,188310000.0,730620000.0,251580000.0,2058200000.0,,544620000.0,191850000.0,56364000.0,471210000.0,221060000.0,173270000.0,118980000.0,291170000.0,106160000.0,12222000.0,109380000.0,21245000.0,48692000.0,10054000.0,560980000.0,117900000.0,415620000.0,,135500000.0,244080000.0,114330000.0,610410000.0,,3779100000.0,67959000.0,3078400000.0,1275500000.0,121200000.0,346050000.0,147110000.0,13
639000.0,1247900000.0,36717000.0,32061000.0,1783400000.0,943920000.0,354790000.0,573360000.0,266950000.0,279840000.0,744310000.0,24165000.0,10115000000.0,50119000.0,154210000.0,99585000.0,73233000.0,,226240000.0,17018000.0,2105000000.0,,274270000.0,569910000.0,,618850000.0,1407300000.0,11350000.0,80738000.0,1584200000.0,159660000.0,102230000.0,16104000000.0,21582000.0,,11097000.0,13295000.0,,,587780000.0,156190000.0,88220000.0,258690000.0,,94794000.0,172750000.0,90949000.0,119170000.0,138890000.0,24494000.0,936030000.0,185730000.0,392440000.0,,224710000.0,8439700000.0,964600000.0,8987800000.0,81422000.0,276460000.0,6663100.0,103430000.0,490880000.0,130850000.0,718230000.0,4460900000.0,78655000.0,161830000.0,409420000.0,10265000.0,,2592800000.0,1935400000.0,,47429000.0,109370000.0,373620000.0,3294100000.0,438210000.0,460670000.0,14982000000.0,4776000000.0,629550000.0,162600000.0,2261200000.0,980110000.0,166590000.0,37151000.0,265730000.0,134690000.0,2542000000.0,227420000.0,438650000.0,,80089000.0,111760000.0,570650000.0,1196300000.0,394670000.0,24744000.0,9260400000.0,75343000.0,574990000.0,8359300000.0,,1433600000.0,53249000.0,109330000.0,1637100000.0,12179000.0,74108000.0,947880000.0,116650000.0,322050000.0,,,330510000.0,33023000.0,32763000.0,31815000.0,183770000.0,82795000.0,226190000.0,28410000.0,38087000.0,260550000.0,8515800000.0,318390000.0,524450000.0,1931900000.0,588890000.0,6056700000.0,1293500000.0,203990000.0,49265000.0,92650000.0,36214000.0,132620000.0,20815000.0,,150030000.0,329540000.0,159390000.0,12229000000.0,771210000.0,62666000.0,39890000.0,1599100000.0,9011500000.0,,1106400000.0,310310000.0,950460000.0,297490000.0,1620500000.0,309420000.0,173640000.0,1764600000.0,1813200000.0,,250310000.0,411620000.0,363440000.0,2191100000.0,,19451000.0,175500000.0,51097000.0,63190000.0,270620000.0,5798800.0,1171200000.0,514100000.0,247060000.0,861040000.0,2642100000.0,1012300000.0,533430000.0,,276910000.0,261750000.0,73519000.0,186590000.0,322030000.0,48973000.0
,41131000.0,,678880000.0,,28107000.0,239860000.0,271010000.0,,1471500000.0,9590700000.0,343840000.0,18136000.0,180350000.0,5263000000.0,159700000.0,51186000.0,143070000.0,37632000.0,209780000.0,153790000.0,29257000.0,218400000.0,236930000.0,792820000.0,52895000.0,487460000.0,347630000.0,543910000.0,1002300000.0,175690000.0,30443000.0,1884600000.0,96809000.0,90284000.0,142390000.0,266240000.0,19594000.0,9243500000.0,682110000.0,,30618000.0,6182100.0,,22787000.0,100160000.0,52265000.0,429660000.0,3911600000.0,23308000.0,,378520000.0,474860000.0,111660000.0,1684800000.0,192090000.0,4749300000.0,,9883400000.0,144960000.0,487120000.0,345770000.0,96508000.0,603440000.0,115390000.0,126910000.0,77249000.0,7187300000.0,52421000.0,3021600000.0,450120000.0,116580000.0,,,224480000.0,34193000.0,,162920000.0 +2020_03_01_23_00_Q-Exactive-HF-X-Orbitrap_6070,,,22858000000.0,110320000.0,,12123000.0,,530010000.0,321440000.0,47612000.0,44800000.0,,36145000.0,996450000.0,49919000.0,76941000.0,,176140000.0,11781000.0,71911000.0,1304500000.0,32846000.0,124530000.0,1153900000.0,161720000.0,605740000.0,33491000.0,592780000.0,105070000.0,91843000.0,47812000.0,37641000.0,3613200000.0,,5067500.0,125450000.0,15768000000.0,58911000.0,2566300000.0,84802000.0,416280000.0,91302000.0,1753800000.0,737630000.0,388400000.0,953270000.0,90365000.0,59048000.0,607090000.0,254480000.0,84073000.0,,618830000.0,250880000.0,8923800.0,35659000.0,49389000.0,274250000.0,268450000.0,3218000000.0,43402000.0,163740000.0,25491000.0,79540000.0,37374000.0,35798000.0,40753000.0,2590100000.0,,799240000.0,,22096000.0,,108010000.0,172530000.0,,983580000.0,85444000.0,90203000.0,2041500000.0,119480000.0,88004000.0,151840000.0,7702400.0,,63981000.0,29014000.0,191330000.0,101350000.0,76307000.0,21489000.0,28509000.0,74724000.0,111900000.0,5785100.0,64362000.0,140410000.0,41729000.0,118460000.0,64205000.0,4289900000.0,921610000.0,717610000.0,,23187000.0,2111200000.0,,324750000.0,,33813000.0,170770000.0,209860000.0,,,2980400.0,97
193000.0,,268000000.0,603270000.0,137650000.0,295820000.0,421510000.0,77142000.0,1526900000.0,94255000.0,,24001000.0,596880000.0,,506850000.0,76689000.0,24244000.0,122060000.0,72925000.0,36807000.0,14182000.0,90360000.0,,15151000.0,32728000.0,30194000.0,,48320000.0,1865600000.0,3733200.0,,169060000.0,13629000.0,36664000.0,14670000.0,401700000.0,,,56821000.0,286970000.0,509590000.0,151060000.0,43869000.0,743410000.0,271050000.0,1552300000.0,,274670000.0,52172000.0,23529000.0,139050000.0,201100000.0,109510000.0,39198000.0,246690000.0,151980000.0,29152000.0,45950000.0,,55364000.0,,571900000.0,89033000.0,328880000.0,,628730000.0,92027000.0,62497000.0,358740000.0,313740000.0,2801900000.0,65500000.0,2696400000.0,1585900000.0,137330000.0,237030000.0,146360000.0,,770500000.0,7800400.0,4690600.0,1128700000.0,607940000.0,305620000.0,453430000.0,291540000.0,166520000.0,670380000.0,20565000.0,7146200000.0,13058000.0,105440000.0,65050000.0,41509000.0,,164840000.0,17190000.0,922370000.0,,190240000.0,353880000.0,,414290000.0,1265000000.0,,74527000.0,1177800000.0,122900000.0,16040000.0,10431000000.0,59181000.0,,7729800.0,13892000.0,,,493290000.0,128780000.0,52743000.0,,,71241000.0,213350000.0,39814000.0,92856000.0,139360000.0,23239000.0,678780000.0,48649000.0,383810000.0,,166300000.0,6361500000.0,570470000.0,6972200000.0,100760000.0,88460000.0,,94150000.0,468850000.0,136490000.0,436060000.0,4516800000.0,76846000.0,38908000.0,366670000.0,3916300.0,,2210800000.0,1341800000.0,,22784000.0,52045000.0,184230000.0,3179800000.0,296500000.0,362180000.0,12262000000.0,4061700000.0,401090000.0,118790000.0,2273100000.0,910620000.0,320450000.0,36072000.0,188160000.0,77528000.0,2044500000.0,270960000.0,382200000.0,,27352000.0,109370000.0,369610000.0,1338100000.0,426780000.0,40324000.0,10994000000.0,57959000.0,633430000.0,5094600000.0,13242000.0,1485900000.0,30332000.0,170230000.0,1633700000.0,7778600.0,61179000.0,647150000.0,72856000.0,247480000.0,,5230500000.0,293970000.0,31500000.0,68390000.0,5
6941000.0,175000000.0,58563000.0,175700000.0,38941000.0,39551000.0,112010000.0,4950300000.0,350850000.0,,1157400000.0,330330000.0,4105800000.0,,200290000.0,66828000.0,57649000.0,26211000.0,115860000.0,17256000.0,,49000000.0,295810000.0,123370000.0,10761000000.0,412960000.0,,35371000.0,1321000000.0,6802700000.0,,1135200000.0,268510000.0,634530000.0,335290000.0,1498600000.0,283990000.0,161220000.0,920240000.0,1312600000.0,,130690000.0,321820000.0,281030000.0,1823800000.0,,4474400.0,132960000.0,31166000.0,59041000.0,153640000.0,5956600.0,1642800000.0,9174200.0,224630000.0,786020000.0,1476700000.0,911870000.0,397100000.0,,264800000.0,188680000.0,64741000.0,114770000.0,183580000.0,35185000.0,136530000.0,,340950000.0,,,90600000.0,58133000.0,13204000.0,1011300000.0,4943900000.0,225090000.0,,94581000.0,3964800000.0,,40831000.0,106030000.0,18480000.0,3091000000.0,37084000.0,97980000.0,135770000.0,324740000.0,562400000.0,41942000.0,396130000.0,315100000.0,391810000.0,828850000.0,150450000.0,24440000.0,1641400000.0,122950000.0,60417000.0,83773000.0,286970000.0,6973500.0,6229100000.0,550060000.0,,4877400.0,,,,67877000.0,19032000.0,298330000.0,2751800000.0,13629000.0,,458020000.0,307180000.0,,1926800000.0,93566000.0,3724800000.0,130630000.0,7614700000.0,110820000.0,520620000.0,397640000.0,86495000.0,,78695000.0,80259000.0,58083000.0,5344300000.0,13502000.0,2126400000.0,202280000.0,101580000.0,8761800.0,,137660000.0,25272000.0,5906600.0,52054000.0 
+2020_03_06_16_22_Q-Exactive-HF-X-Orbitrap_6070,49169000.0,,22287000000.0,28008000.0,,,4378900000.0,641890000.0,261200000.0,,,,,488830000.0,62155000.0,,,142330000.0,49129000.0,22549000.0,1751600000.0,30768000.0,129950000.0,447620000.0,41992000.0,552320000.0,,288690000.0,5139300.0,84595000.0,98537000.0,27193000.0,2888500000.0,48040000.0,54628000.0,18778000.0,14838000000.0,11629000.0,1202900000.0,139830000.0,16967000.0,24484000.0,1338300000.0,961750000.0,145430000.0,591140000.0,,15892000.0,,,,859760000.0,76129000.0,356300000.0,,,,66744000.0,41242000.0,1630100000.0,24485000.0,46652000.0,7507200.0,115910000.0,9912400.0,116080000.0,,1343000000.0,142920000.0,276280000.0,,,,5630200.0,34752000.0,6718700.0,30130000.0,19798000.0,29471000.0,990660000.0,146700000.0,37572000.0,9535300.0,,73473000.0,,13492000.0,,7966700.0,38291000.0,31416000.0,,,22699000.0,62597000.0,62181000.0,94142000.0,15782000.0,87982000.0,55353000.0,1094600000.0,500250000.0,401270000.0,95171000.0,23717000.0,445510000.0,,85738000.0,,18541000.0,1295300000.0,165540000.0,10648000.0,,6910000.0,66480000.0,,297500000.0,713910000.0,88943000.0,340490000.0,43070000.0,37936000.0,961610000.0,5994600.0,11226000.0,,467330000.0,,395520000.0,17530000.0,,59164000.0,,,21064000.0,45713000.0,,124490000.0,9753300.0,5735900.0,120250000.0,,1094000000.0,,,71736000.0,,37099000.0,7308400.0,219310000.0,,64407000.0,165470000.0,188580000.0,214440000.0,100480000.0,,932280000.0,57610000.0,1394900000.0,,113080000.0,20330000.0,,,30303000.0,131570000.0,103220000.0,59231000.0,189390000.0,251240000.0,9489100.0,,,,523680000.0,106730000.0,343260000.0,37762000.0,97135000.0,8922100.0,32105000.0,145340000.0,7899300.0,730990000.0,48856000.0,1466700000.0,798910000.0,87653000.0,109530000.0,77407000.0,8031600.0,629500000.0,38413000.0,4589400.0,342740000.0,201160000.0,597480000.0,,188450000.0,76737000.0,1080900000.0,19866000.0,1386200000.0,,24091000.0,42955000.0,17299000.0,,80316000.0,,846180000.0,48836000.0,4796500.0,198980000.0,458000000.0,897390000.0
,775700000.0,,102350000.0,252910000.0,10365000.0,31375000.0,24047000000.0,49662000.0,,24106000.0,,,24208000.0,332320000.0,60107000.0,,,118390000.0,42110000.0,122820000.0,,5056300.0,11912000.0,,211550000.0,69203000.0,223480000.0,378660000.0,141720000.0,6142500000.0,138940000.0,4204700000.0,23684000.0,,,42369000.0,153710000.0,86781000.0,360320000.0,6267900000.0,,52108000.0,159670000.0,,25493000.0,1589600000.0,253850000.0,11698000.0,16931000.0,18041000.0,5257500.0,1913000000.0,43957000.0,169440000.0,18012000000.0,3042400000.0,135730000.0,4833200.0,1894100000.0,966110000.0,85612000.0,,170310000.0,87487000.0,1568000000.0,175180000.0,44127000.0,,5296100.0,45643000.0,751550000.0,96186000.0,279630000.0,3492700.0,7074100000.0,,67043000.0,1549800000.0,,520270000.0,,,1030900000.0,22557000.0,,191170000.0,54750000.0,96440000.0,419890000.0,,90175000.0,,,,77301000.0,34507000.0,46839000.0,,125210000.0,89972000.0,3582900000.0,154170000.0,2160600000.0,883910000.0,677750000.0,3628600000.0,8977500000.0,,82666000.0,17265000.0,37662000.0,188900000.0,6468400.0,,39604000.0,83802000.0,21899000.0,4518600000.0,370900000.0,,6648100.0,933250000.0,2487700000.0,,1052000000.0,230090000.0,152560000.0,371430000.0,341460000.0,221070000.0,59051000.0,3566200000.0,714010000.0,38346000.0,25295000.0,131800000.0,930450000.0,1726900000.0,,64012000.0,14237000.0,,49939000.0,161480000.0,,854150000.0,31116000.0,48549000.0,1299400000.0,691920000.0,148280000.0,107300000.0,194160000.0,1320600000.0,271040000.0,12949000.0,,99444000.0,4988700.0,31546000.0,,245580000.0,,33190000.0,,205400000.0,,1299200000.0,3357200000.0,214360000.0,5801100.0,21627000.0,3232500000.0,,50846000.0,14389000.0,26296000.0,,32638000.0,75127000.0,17754000.0,134730000.0,56562000.0,32144000.0,125350000.0,23969000.0,244230000.0,338690000.0,11042000.0,31030000.0,356110000.0,125380000.0,64008000.0,104720000.0,51051000.0,,5997600000.0,174640000.0,,,11732000.0,49281000.0,303440000.0,5282200.0,14185000.0,39645000.0,2520600000.0,,67515000.0,89639000.0,
33828000.0,,302760000.0,12410000.0,4791200000.0,32941000.0,3290800000.0,7227200.0,305030000.0,159520000.0,28166000.0,1269400000.0,197280000.0,44608000.0,95189000.0,1978500000.0,12440000.0,767360000.0,451280000.0,80948000.0,,,94978000.0,,10702000.0,64538000.0 +2020_03_07_18_15_Q-Exactive-HF-X-Orbitrap_6070,15364000.0,,41904000000.0,22750000.0,,,46626000000.0,1451100000.0,331220000.0,18538000.0,25457000.0,16254000.0,,1127300000.0,,,27635000.0,193250000.0,196130000.0,35070000.0,4305500000.0,50085000.0,571240000.0,1120200000.0,76373000.0,1309000000.0,30877000.0,843500000.0,79688000.0,159870000.0,191010000.0,133610000.0,6068500000.0,238890000.0,129480000.0,178280000.0,27701000000.0,34870000.0,3579600000.0,244190000.0,88433000.0,106140000.0,2362400000.0,1985700000.0,311550000.0,1157000000.0,38080000.0,40901000.0,,238900000.0,39460000.0,3667600000.0,295380000.0,983540000.0,,30761000.0,76051000.0,264140000.0,148570000.0,3638000000.0,51269000.0,71319000.0,49686000.0,192420000.0,35104000.0,234470000.0,28206000.0,3046100000.0,312800000.0,498870000.0,,,,101500000.0,145970000.0,17865000.0,345240000.0,68138000.0,,2190900000.0,318050000.0,64892000.0,50881000.0,,219220000.0,65271000.0,24845000.0,60899000.0,19590000.0,45213000.0,76961000.0,8868700.0,,59629000.0,39114000.0,85836000.0,267180000.0,19774000.0,207720000.0,87252000.0,3532800000.0,970990000.0,615510000.0,188200000.0,21405000.0,701230000.0,20270000.0,158490000.0,,34840000.0,2651300000.0,639370000.0,30542000.0,40582000.0,56103000.0,102410000.0,,748610000.0,835910000.0,189830000.0,611240000.0,127190000.0,122890000.0,2292100000.0,,175400000.0,96607000.0,561480000.0,,772180000.0,54887000.0,,94230000.0,27786000.0,,81481000.0,113590000.0,28161000000.0,462290000.0,7147000.0,23164000.0,290100000.0,,2779100000.0,,,141830000.0,52875000.0,171340000.0,,493400000.0,,103980000.0,273580000.0,504990000.0,791110000.0,354810000.0,14923000.0,1764100000.0,90693000.0,3279400000.0,66633000.0,328870000.0,64961000.0,,145360000.0,73437000.0,3414400
00.0,312470000.0,286860000.0,594990000.0,924050000.0,36678000.0,,68758000.0,110110000.0,1674900000.0,303760000.0,1028500000.0,211740000.0,287110000.0,54014000.0,74925000.0,256160000.0,18303000.0,2025600000.0,120300000.0,4287500000.0,1790100000.0,227220000.0,233740000.0,140690000.0,,1281000000.0,62576000.0,34248000.0,1075000000.0,543410000.0,829540000.0,,427850000.0,203130000.0,1883100000.0,,3966500000.0,,132860000.0,221160000.0,76048000.0,,151950000.0,11051000.0,3187200000.0,204850000.0,55321000.0,468340000.0,1065600000.0,1496000000.0,1710500000.0,,237940000.0,603920000.0,30704000.0,79783000.0,42697000000.0,134930000.0,,44223000.0,,13452000.0,73363000.0,740030000.0,99487000.0,,,324210000.0,23910000.0,372860000.0,20770000.0,84467000.0,35824000.0,34192000.0,543810000.0,260730000.0,631130000.0,500070000.0,404830000.0,12699000000.0,261210000.0,9716900000.0,76667000.0,,,94410000.0,157020000.0,138690000.0,837010000.0,14455000000.0,,177810000.0,423660000.0,34247000.0,50525000.0,2700300000.0,1319100000.0,51906000.0,27926000.0,140630000.0,44321000.0,3858500000.0,265830000.0,436120000.0,35925000000.0,5760300000.0,331740000.0,39354000.0,3420500000.0,1908600000.0,297920000.0,,674300000.0,208990000.0,3746900000.0,230700000.0,262900000.0,,,108320000.0,1363900000.0,420240000.0,542690000.0,11388000.0,13457000000.0,,143450000.0,3064700000.0,11064000.0,1109700000.0,11450000.0,2123600000.0,2576200000.0,47472000.0,,416650000.0,95073000.0,228520000.0,1189300000.0,12584000000.0,192510000.0,18316000.0,,14717000.0,246140000.0,61503000.0,88751000.0,,320350000.0,224330000.0,8881800000.0,421790000.0,,2431500000.0,1076300000.0,7528100000.0,20664000000.0,38855000.0,70576000.0,5068200.0,62421000.0,507130000.0,35241000.0,,186100000.0,447980000.0,123670000.0,12060000000.0,810030000.0,19166000.0,33634000.0,2219800000.0,5871400000.0,3189900000.0,3211000000.0,493180000.0,388760000.0,730140000.0,569140000.0,523240000.0,118710000.0,8713200000.0,2137000000.0,138880000.0,59192000.0,366740000.0,1487900000
.0,3767100000.0,14389000.0,65869000.0,74719000.0,32089000.0,88942000.0,485680000.0,45641000.0,2422800000.0,,251330000.0,3027200000.0,1740800000.0,327650000.0,326370000.0,440950000.0,2973100000.0,528150000.0,86159000.0,60983000.0,248070000.0,49904000.0,83227000.0,,450610000.0,,36426000.0,,489020000.0,26214000.0,1993200000.0,8805600000.0,466640000.0,,29091000.0,7614200000.0,,165230000.0,68872000.0,171730000.0,,124980000.0,187190000.0,92599000.0,379430000.0,246580000.0,,269010000.0,289130000.0,424450000.0,998570000.0,98175000.0,15665000.0,788990000.0,275780000.0,195260000.0,178300000.0,131110000.0,,8974600000.0,717270000.0,63819000.0,,136540000.0,59731000.0,585960000.0,3526600.0,119400000.0,187350000.0,5762000000.0,,,128980000.0,162390000.0,16992000.0,868370000.0,128310000.0,10958000000.0,85450000.0,8383000000.0,30174000.0,535020000.0,675860000.0,15867000.0,3767000000.0,25523000.0,95015000.0,298950000.0,4283800000.0,23191000.0,2055400000.0,704900000.0,295000000.0,,,173500000.0,,39235000.0,168530000.0 
+2020_03_11_11_25_Q-Exactive-HF-X-Orbitrap_6070,,,29590000000.0,324970000.0,,19034000.0,,1002800000.0,487230000.0,72803000.0,56164000.0,,103980000.0,1430900000.0,154850000.0,,,582840000.0,31801000.0,175970000.0,2508400000.0,59969000.0,190620000.0,2617300000.0,365590000.0,1437300000.0,106900000.0,628490000.0,194080000.0,84881000.0,98841000.0,80241000.0,7041100000.0,,,245690000.0,26745000000.0,56338000.0,3402100000.0,60975000.0,520830000.0,214800000.0,2525400000.0,1110200000.0,380470000.0,1776200000.0,168690000.0,108440000.0,937910000.0,106670000.0,148680000.0,,1022600000.0,628820000.0,21396000.0,29404000.0,65985000.0,568360000.0,457220000.0,4114000000.0,43475000.0,299030000.0,,284860000.0,135710000.0,,25201000.0,4280800000.0,,1199300000.0,211020000.0,52848000.0,25518000.0,101420000.0,268690000.0,,1878900000.0,263720000.0,217600000.0,3436300000.0,176550000.0,140690000.0,219650000.0,,,487840000.0,119840000.0,156640000.0,608230000.0,91021000.0,48069000.0,22350000.0,106980000.0,135640000.0,52488000.0,17602000.0,582810000.0,27035000.0,206200000.0,78142000.0,4852300000.0,1727800000.0,994850000.0,,35903000.0,2570700000.0,13716000.0,335570000.0,,44950000.0,479710000.0,136330000.0,,,29455000.0,100750000.0,,493840000.0,1397300000.0,214370000.0,525900000.0,826410000.0,92954000.0,2867500000.0,184990000.0,,69046000.0,688580000.0,,788050000.0,99123000.0,46223000.0,186840000.0,181650000.0,51555000.0,12598000.0,161940000.0,10887000000.0,117220000.0,59172000.0,34584000.0,,69574000.0,3017100000.0,53701000.0,,171570000.0,40782000.0,160340000.0,10806000.0,404680000.0,,131480000.0,58011000.0,627620000.0,1152600000.0,370410000.0,108720000.0,940470000.0,420630000.0,2827800000.0,6906200.0,662680000.0,136980000.0,113930000.0,233780000.0,182430000.0,325950000.0,197800000.0,628440000.0,139810000.0,45634000.0,86357000.0,12746000.0,50237000.0,,1031200000.0,129070000.0,545780000.0,,108660000.0,291900000.0,90605000.0,898320000.0,52102000.0,3349400000.0,73435000.0,3945000000.0,2583900000.0,12465000
0.0,348760000.0,211660000.0,111140000.0,1257000000.0,22720000.0,73263000.0,2187500000.0,1162100000.0,503280000.0,,362770000.0,360340000.0,1014100000.0,29910000.0,10815000000.0,79736000.0,181170000.0,93996000.0,85524000.0,,301780000.0,97503000.0,2842400000.0,,368000000.0,662030000.0,,1997200000.0,1818200000.0,40905000.0,125690000.0,1845400000.0,185390000.0,39803000.0,12501000000.0,121550000.0,,27140000.0,,,,1068800000.0,185010000.0,172860000.0,,,172080000.0,358300000.0,64816000.0,235610000.0,308160000.0,68546000.0,1123500000.0,120480000.0,758420000.0,,262940000.0,8790900000.0,1314300000.0,12039000000.0,274080000.0,182120000.0,11830000.0,209050000.0,341150000.0,217760000.0,792700000.0,5034100000.0,44486000.0,305190000.0,677820000.0,42122000.0,,4106000000.0,1744200000.0,10152000.0,68344000.0,223740000.0,455920000.0,4905700000.0,387130000.0,664980000.0,19067000000.0,7085000000.0,609140000.0,234410000.0,4029200000.0,1681000000.0,260880000.0,75148000.0,269100000.0,234900000.0,2947700000.0,682250000.0,833740000.0,60467000.0,,151770000.0,670740000.0,905730000.0,601550000.0,29601000.0,12364000000.0,23426000.0,760010000.0,8425200000.0,,1419800000.0,95950000.0,39182000.0,2222700000.0,,75337000.0,949780000.0,271260000.0,344660000.0,,10918000000.0,391020000.0,69587000.0,40943000.0,79607000.0,272560000.0,292590000.0,309780000.0,104770000.0,42342000.0,433820000.0,10822000000.0,476700000.0,,2319500000.0,618010000.0,6730700000.0,,295940000.0,51895000.0,230560000.0,13278000.0,76289000.0,20579000.0,,190680000.0,396940000.0,192800000.0,16565000000.0,786160000.0,42563000.0,57581000.0,3124800000.0,11013000000.0,,1298700000.0,480330000.0,1509100000.0,232040000.0,1687300000.0,445820000.0,234550000.0,2248500000.0,2985100000.0,,278230000.0,333840000.0,262620000.0,3661500000.0,,,180830000.0,86316000.0,153080000.0,368930000.0,,2602900000.0,106310000.0,287160000.0,1038000000.0,1891100000.0,1600000000.0,394110000.0,,607300000.0,72268000.0,101340000.0,115640000.0,276940000.0,,156560000.0,18539000
0.0,773600000.0,272870000.0,45854000.0,145670000.0,202480000.0,,2052900000.0,8413400000.0,444020000.0,,179360000.0,5800300000.0,,,219620000.0,75092000.0,158490000.0,118560000.0,36629000.0,179590000.0,534180000.0,1076900000.0,51649000.0,539870000.0,411950000.0,666950000.0,1643900000.0,236630000.0,53094000.0,2857500000.0,93612000.0,109270000.0,141250000.0,442550000.0,,10776000000.0,552660000.0,,53102000.0,,30445000.0,8857600.0,110400000.0,23830000.0,325370000.0,4907500000.0,17135000.0,,379530000.0,466860000.0,150620000.0,2441500000.0,309390000.0,5009700000.0,224970000.0,10960000000.0,182040000.0,1046700000.0,685870000.0,217120000.0,375850000.0,124900000.0,66334000.0,100440000.0,8250200000.0,12288000.0,3056800000.0,231790000.0,166110000.0,19885000.0,,441860000.0,108630000.0,,67717000.0 +2020_05_04_11_39_Q-Exactive-HF-X-Orbitrap_6070,,,17890000000.0,103260000.0,,6228100.0,,310920000.0,295900000.0,16157000.0,13878000.0,,6079100.0,719960000.0,30899000.0,50947000.0,,150340000.0,4683100.0,59424000.0,985540000.0,6662600.0,143970000.0,999580000.0,92158000.0,455020000.0,23517000.0,348330000.0,51970000.0,64436000.0,408070000.0,28656000.0,2688100000.0,,,60479000.0,11688000000.0,76474000.0,1726600000.0,136110000.0,252270000.0,86289000.0,1532700000.0,379370000.0,201810000.0,596000000.0,67762000.0,52089000.0,518330000.0,11560000.0,57068000.0,,414010000.0,216080000.0,8650400.0,23444000.0,67247000.0,153900000.0,248640000.0,1799400000.0,29101000.0,124310000.0,,146240000.0,131210000.0,,,2311100000.0,,317740000.0,77490000.0,9977600.0,,86542000.0,117670000.0,,525940000.0,107510000.0,46981000.0,1392800000.0,82044000.0,64854000.0,140380000.0,,,47070000.0,52367000.0,94252000.0,50593000.0,51441000.0,20656000.0,8194800.0,62340000.0,65564000.0,7060300.0,21812000.0,220620000.0,12040000.0,110360000.0,39319000.0,3084200000.0,837610000.0,383430000.0,,8636700.0,1530700000.0,4207700.0,254360000.0,,34755000.0,146430000.0,90670000.0,,,,48067000.0,,183560000.0,662870000.0,62730000.0,146230000.0,3018800
00.0,13343000.0,1135600000.0,91469000.0,,23461000.0,324460000.0,23352000.0,270040000.0,110170000.0,10652000.0,52351000.0,70122000.0,17953000.0,,96469000.0,6268100000.0,32341000.0,23897000.0,5249800.0,72143000.0,74956000.0,1278300000.0,3711200.0,,114820000.0,,328310000.0,2348600.0,254800000.0,,96409000.0,69087000.0,183920000.0,321120000.0,130220000.0,34071000.0,326620000.0,181750000.0,1100000000.0,,296930000.0,55361000.0,17886000.0,76461000.0,101420000.0,98898000.0,128000000.0,306910000.0,129780000.0,37226000.0,30089000.0,5065700.0,11996000.0,,363580000.0,77316000.0,293950000.0,,64902000.0,80899000.0,39459000.0,468370000.0,10810000.0,1705800000.0,41203000.0,2101600000.0,830350000.0,31707000.0,223360000.0,96738000.0,10946000.0,459190000.0,8599300.0,8029000.0,792610000.0,412540000.0,191360000.0,,158410000.0,132670000.0,407960000.0,,5493900000.0,53578000.0,86552000.0,22023000.0,52555000.0,,106990000.0,22092000.0,968340000.0,,178850000.0,273950000.0,,457900000.0,714770000.0,29676000.0,43148000.0,789550000.0,94025000.0,35540000.0,5927900000.0,24935000.0,2553200.0,12236000.0,4192700.0,,,437080000.0,39770000.0,24430000.0,,,5813900.0,56649000.0,31637000.0,132750000.0,101800000.0,24570000.0,414380000.0,17634000.0,256610000.0,,89196000.0,3936800000.0,707390000.0,5114300000.0,51277000.0,55154000.0,7029500.0,65553000.0,271280000.0,63863000.0,287660000.0,2787300000.0,25844000.0,67955000.0,288480000.0,10089000.0,,1322200000.0,1215500000.0,,21459000.0,41592000.0,172150000.0,1876300000.0,164740000.0,190990000.0,8922300000.0,3251700000.0,271170000.0,156550000.0,1249500000.0,683040000.0,61546000.0,27288000.0,99829000.0,46290000.0,1346700000.0,180220000.0,358750000.0,24182000.0,4996300.0,60402000.0,331460000.0,817950000.0,244620000.0,6155000.0,8111200000.0,21133000.0,322500000.0,4548600000.0,,1039800000.0,8527900.0,102010000.0,910970000.0,,30269000.0,405780000.0,37276000.0,72656000.0,,4088300000.0,150830000.0,17371000.0,5000800.0,11634000.0,76613000.0,99380000.0,81316000.0,13988000.0,2
3890000.0,99434000.0,4802400000.0,162480000.0,,1030400000.0,280660000.0,4107100000.0,,115240000.0,60984000.0,45133000.0,,51018000.0,,,80937000.0,232270000.0,39519000.0,6503400000.0,526470000.0,58292000.0,17831000.0,1108500000.0,10790000000.0,,634340000.0,249190000.0,377200000.0,171760000.0,923380000.0,108740000.0,89590000.0,1016700000.0,1066700000.0,,71286000.0,208330000.0,174400000.0,1237300000.0,,12553000.0,96703000.0,16662000.0,43968000.0,92343000.0,,1234600000.0,85605000.0,119650000.0,701070000.0,1989900000.0,437960000.0,348640000.0,,186730000.0,146440000.0,107220000.0,67337000.0,215430000.0,19063000.0,121130000.0,,303290000.0,167040000.0,16989000.0,64633000.0,103510000.0,,757010000.0,5610600000.0,194100000.0,4153800.0,62789000.0,3013600000.0,,23976000.0,54571000.0,3784600.0,102590000.0,104500000.0,14768000.0,253280000.0,351260000.0,346140000.0,25462000.0,239260000.0,117150000.0,263120000.0,479230000.0,59178000.0,27094000.0,1139000000.0,33871000.0,17727000.0,78026000.0,82749000.0,6723900.0,4490400000.0,295670000.0,,27053000.0,,,,93127000.0,25539000.0,241890000.0,2008900000.0,10272000.0,,417220000.0,251750000.0,36718000.0,1001000000.0,135930000.0,2859500000.0,46105000.0,5301400000.0,48869000.0,214130000.0,322720000.0,41305000.0,,100320000.0,26283000.0,45753000.0,4865300000.0,4377200.0,1350000000.0,263190000.0,50265000.0,5262300.0,,,14718000.0,,81791000.0 
+2020_05_12_15_13_Q-Exactive-HF-X-Orbitrap_6070,,,16591000000.0,96716000.0,,,,428460000.0,272290000.0,15842000.0,34481000.0,,28803000.0,644800000.0,38851000.0,20525000.0,,98248000.0,11352000.0,52009000.0,909670000.0,11837000.0,29404000.0,801720000.0,108980000.0,559350000.0,33245000.0,226680000.0,44929000.0,38334000.0,35712000.0,33487000.0,2635200000.0,,,100590000.0,12768000000.0,35481000.0,1978200000.0,,277570000.0,50772000.0,899080000.0,716560000.0,205900000.0,637150000.0,73619000.0,68091000.0,521250000.0,57128000.0,31495000.0,,388700000.0,262600000.0,13960000.0,12670000.0,35334000.0,191510000.0,225280000.0,1761400000.0,32163000.0,142720000.0,9876500.0,102210000.0,19440000.0,10314000.0,12170000.0,1743500000.0,,629060000.0,126110000.0,6785400.0,,64400000.0,136010000.0,,606220000.0,97822000.0,,1251700000.0,161370000.0,77773000.0,88114000.0,,,37536000.0,65652000.0,88943000.0,212320000.0,46541000.0,,3418900.0,41500000.0,30918000.0,25483000.0,11714000.0,283970000.0,,69358000.0,65933000.0,1980700000.0,707920000.0,381200000.0,,11228000.0,2008700000.0,4231500.0,142290000.0,,7731400.0,240130000.0,97550000.0,,,2470900.0,55123000.0,,277970000.0,548300000.0,67450000.0,238140000.0,312190000.0,62901000.0,1040000000.0,71047000.0,14390000.0,20666000.0,361590000.0,9821400.0,328160000.0,93602000.0,10464000.0,72979000.0,69750000.0,10967000.0,,108720000.0,,25138000.0,35932000.0,12297000.0,134960000.0,64122000.0,1106400000.0,19096000.0,,119170000.0,4897800.0,13514000.0,,141440000.0,16206000.0,74136000.0,85320000.0,182200000.0,425420000.0,190330000.0,31938000.0,358200000.0,155990000.0,1028600000.0,,262870000.0,50369000.0,39940000.0,137080000.0,138310000.0,101290000.0,138770000.0,293780000.0,55237000.0,10619000.0,67044000.0,11359000.0,16746000.0,,437930000.0,49927000.0,260140000.0,,76744000.0,89147000.0,32561000.0,224390000.0,23634000.0,1802700000.0,42419000.0,1682700000.0,826270000.0,44522000.0,163180000.0,99680000.0,6973500.0,532360000.0,3833400.0,,891590000.0,441280000.0,163390000.0,,
178200000.0,119050000.0,424530000.0,13397000.0,5590900000.0,21364000.0,67997000.0,50825000.0,50221000.0,,72227000.0,15609000.0,1023500000.0,,190400000.0,268060000.0,,543590000.0,851410000.0,15167000.0,121600000.0,718780000.0,113410000.0,11333000.0,6096300000.0,42051000.0,,18292000.0,,,,315390000.0,66540000.0,39868000.0,,,46669000.0,58340000.0,31159000.0,118430000.0,123800000.0,21085000.0,685780000.0,62358000.0,205930000.0,,141430000.0,4405100000.0,555820000.0,6080200000.0,113550000.0,115350000.0,5028100.0,58211000.0,362320000.0,57735000.0,326640000.0,2745500000.0,38487000.0,73634000.0,161970000.0,132960000.0,,1599900000.0,942130000.0,,13833000.0,47909000.0,192630000.0,2040500000.0,133800000.0,347740000.0,9105900000.0,2984200000.0,313280000.0,80398000.0,1557300000.0,719560000.0,105650000.0,34094000.0,82826000.0,99644000.0,1247300000.0,197740000.0,309960000.0,27005000.0,,84435000.0,512850000.0,743790000.0,307240000.0,,7944200000.0,19103000.0,314740000.0,4335300000.0,,742490000.0,27297000.0,620670000.0,1130700000.0,5483200.0,25723000.0,329770000.0,19160000.0,96243000.0,,,157350000.0,31930000.0,23493000.0,13982000.0,119140000.0,383880000.0,118200000.0,52849000.0,29272000.0,126260000.0,4578400000.0,199490000.0,,974240000.0,301910000.0,2395700000.0,,84193000.0,11424000.0,84982000.0,46550000.0,42717000.0,335550000.0,,19020000.0,186540000.0,95778000.0,6437400000.0,280600000.0,17309000.0,7716400.0,779870000.0,4302800000.0,656150000.0,344920000.0,175480000.0,420430000.0,164980000.0,827000000.0,157880000.0,169940000.0,1057900000.0,992170000.0,,70869000.0,167520000.0,264990000.0,1319700000.0,,6207500.0,85628000.0,21433000.0,55892000.0,176450000.0,,1179000000.0,57531000.0,95368000.0,493030000.0,1396900000.0,532350000.0,125990000.0,,238710000.0,138950000.0,110680000.0,69783000.0,168400000.0,,76798000.0,,266730000.0,8874700.0,18475000.0,251500000.0,158920000.0,,411350000.0,5350200000.0,239990000.0,11961000.0,54468000.0,3038500000.0,,36278000.0,67568000.0,41771000.0,71304000.0,6960
7000.0,14512000.0,104240000.0,172760000.0,445530000.0,67560000.0,215600000.0,154360000.0,278700000.0,545180000.0,82311000.0,25911000.0,1024400000.0,32351000.0,45562000.0,17884000.0,95088000.0,,5145700000.0,205610000.0,,14538000.0,,7077000.0,10145000.0,76581000.0,16879000.0,182590000.0,2134700000.0,,,198480000.0,173640000.0,61938000.0,1084100000.0,129550000.0,2326200000.0,73656000.0,4703200000.0,22950000.0,195720000.0,159160000.0,11202000.0,182600000.0,48162000.0,50845000.0,36287000.0,3665500000.0,27555000.0,1508000000.0,97197000.0,89777000.0,,,,11409000.0,,79798000.0 +2020_05_12_18_10_Q-Exactive-HF-X-Orbitrap_6070,,,13574000000.0,81945000.0,,8947100.0,,349500000.0,201790000.0,13148000.0,10139000.0,,16720000.0,571550000.0,23115000.0,,,74087000.0,9532200.0,45054000.0,613130000.0,10547000.0,,676630000.0,72783000.0,452190000.0,39405000.0,170520000.0,55252000.0,62915000.0,13771000.0,29088000.0,2235600000.0,,,47922000.0,9915400000.0,17708000.0,1546800000.0,58439000.0,234800000.0,55253000.0,1241900000.0,333580000.0,160530000.0,569250000.0,62461000.0,68911000.0,431730000.0,33864000.0,43275000.0,,326010000.0,186600000.0,,33228000.0,53821000.0,145000000.0,168210000.0,1726800000.0,23720000.0,85780000.0,2775600.0,103430000.0,124610000.0,,13915000.0,1625800000.0,,343800000.0,60807000.0,12115000.0,14481000.0,46794000.0,125470000.0,,460450000.0,80830000.0,26682000.0,1025800000.0,55282000.0,86754000.0,84597000.0,,,61030000.0,53499000.0,18267000.0,155210000.0,25468000.0,9903400.0,3117500.0,21778000.0,21293000.0,26788000.0,18050000.0,181120000.0,11831000.0,72949000.0,34778000.0,2132400000.0,449020000.0,313250000.0,,20835000.0,1957000000.0,,147570000.0,,23590000.0,121450000.0,125770000.0,,,6445100.0,48327000.0,,108780000.0,258260000.0,74604000.0,200950000.0,252390000.0,33274000.0,1064600000.0,39472000.0,,26359000.0,263190000.0,4020800.0,290770000.0,61484000.0,,69328000.0,55473000.0,12110000.0,,71400000.0,3766200000.0,27506000.0,9788000.0,16274000.0,119590000.0,73177000.0,1029100000.0,
3992400.0,4914300.0,38772000.0,8361500.0,10995000.0,,149770000.0,,31966000.0,33511000.0,159080000.0,337950000.0,129630000.0,41805000.0,315230000.0,93989000.0,955850000.0,,248400000.0,40577000.0,22011000.0,88054000.0,82719000.0,58294000.0,70461000.0,122100000.0,30124000.0,9399000.0,44487000.0,,9163200.0,125800000.0,328990000.0,79461000.0,224260000.0,,37494000.0,65198000.0,61936000.0,196100000.0,10220000.0,1422600000.0,38186000.0,1513100000.0,873500000.0,31061000.0,91842000.0,75898000.0,31023000.0,468220000.0,3625900.0,,705370000.0,451690000.0,203200000.0,,184450000.0,158620000.0,355510000.0,10592000.0,5161200000.0,10654000.0,71952000.0,55242000.0,57950000.0,,94929000.0,27111000.0,916780000.0,,132020000.0,222230000.0,,410340000.0,695460000.0,9767100.0,73644000.0,655400000.0,98813000.0,32079000.0,4692800000.0,22612000.0,,21567000.0,,,,854680000.0,39122000.0,35273000.0,99637000.0,,44797000.0,85486000.0,20587000.0,72008000.0,116010000.0,18554000.0,436360000.0,21142000.0,181220000.0,,88071000.0,3910200000.0,465890000.0,4335100000.0,55545000.0,83932000.0,4620400.0,63183000.0,110640000.0,41768000.0,276750000.0,1994100000.0,25881000.0,54433000.0,250960000.0,3907000.0,,1282700000.0,731500000.0,3682200.0,3558000.0,35967000.0,129260000.0,1728500000.0,228270000.0,281510000.0,6548100000.0,2577300000.0,255110000.0,45596000.0,1327200000.0,497650000.0,75224000.0,16075000.0,95788000.0,31884000.0,1086500000.0,157700000.0,346340000.0,22549000.0,5218700.0,60412000.0,210930000.0,332790000.0,177120000.0,8921700.0,4688200000.0,11047000.0,237040000.0,4177700000.0,,781530000.0,30543000.0,547230000.0,1091400000.0,5121000.0,22410000.0,392190000.0,41547000.0,73055000.0,,,88248000.0,37794000.0,21375000.0,19377000.0,46486000.0,57982000.0,91010000.0,23855000.0,10797000.0,96507000.0,4468100000.0,110170000.0,,803040000.0,172110000.0,1766800000.0,,57413000.0,22603000.0,44331000.0,3179100.0,48857000.0,10990000.0,,68806000.0,168870000.0,52867000.0,5561300000.0,202800000.0,15159000.0,10983000.0,95341000
0.0,9764700000.0,364840000.0,493470000.0,128760000.0,487950000.0,247100000.0,597090000.0,85238000.0,30569000.0,626140000.0,530440000.0,,89775000.0,169660000.0,150800000.0,1295200000.0,,8805100.0,76604000.0,15211000.0,27025000.0,189680000.0,2050100.0,994700000.0,40731000.0,69034000.0,486740000.0,981310000.0,544070000.0,163390000.0,,209300000.0,137310000.0,46176000.0,86113000.0,44706000.0,9296700.0,56024000.0,,218520000.0,,12576000.0,186840000.0,94458000.0,,431780000.0,3770800000.0,184510000.0,12582000.0,65050000.0,1710800000.0,,8698600.0,71677000.0,,68992000.0,63830000.0,41014000.0,94485000.0,296820000.0,350390000.0,17120000.0,234720000.0,179710000.0,217500000.0,409590000.0,75092000.0,23969000.0,788210000.0,25606000.0,42835000.0,60795000.0,91136000.0,8498100.0,3961900000.0,187910000.0,,16784000.0,,4154200.0,4125100.0,134900000.0,18192000.0,137880000.0,1634100000.0,9644600.0,,277290000.0,197470000.0,43665000.0,809260000.0,110450000.0,1738400000.0,58905000.0,3037100000.0,15789000.0,219720000.0,224170000.0,39507000.0,,72754000.0,32789000.0,39297000.0,2462000000.0,3712900.0,1256300000.0,89459000.0,69273000.0,,,55893000.0,12344000.0,,27874000.0 
+2020_05_12_21_07_Q-Exactive-HF-X-Orbitrap_6070,,,13667000000.0,130070000.0,,,,417370000.0,252400000.0,14019000.0,15123000.0,,26969000.0,478130000.0,30497000.0,32554000.0,,150930000.0,10514000.0,45720000.0,853580000.0,,113640000.0,823300000.0,62257000.0,465740000.0,21671000.0,188090000.0,20349000.0,31086000.0,186510000.0,37392000.0,2340700000.0,,,78207000.0,9970500000.0,43252000.0,1597500000.0,,213640000.0,44619000.0,1301500000.0,553850000.0,177370000.0,540870000.0,73804000.0,56739000.0,505770000.0,26871000.0,51547000.0,,579590000.0,228290000.0,20208000.0,21876000.0,40846000.0,204150000.0,221130000.0,1421500000.0,56320000.0,131380000.0,,77373000.0,66939000.0,,9010100.0,1787100000.0,,530380000.0,65455000.0,11475000.0,4639000.0,61983000.0,105670000.0,,460330000.0,60593000.0,19366000.0,984900000.0,185040000.0,97945000.0,111570000.0,,,51662000.0,46040000.0,39307000.0,160900000.0,22800000.0,6661200.0,13621000.0,23580000.0,42739000.0,25311000.0,35650000.0,197950000.0,325030000.0,86418000.0,51493000.0,3812400000.0,711680000.0,307430000.0,,8520900.0,1476500000.0,4343200.0,141080000.0,,7516500.0,114400000.0,89018000.0,,,6422700.0,37155000.0,,188670000.0,492710000.0,148400000.0,173020000.0,213660000.0,31701000.0,2652500000.0,54893000.0,,11568000.0,334200000.0,7684800.0,279040000.0,80120000.0,7347800.0,96412000.0,36013000.0,25648000.0,,85573000.0,5919800000.0,20889000.0,14095000.0,8912600.0,120640000.0,50483000.0,1164400000.0,,3823800.0,91221000.0,,7732700.0,5587500.0,135690000.0,21647000.0,49975000.0,77482000.0,161020000.0,428050000.0,125710000.0,46763000.0,421020000.0,123010000.0,993950000.0,,291980000.0,39044000.0,19167000.0,97049000.0,118510000.0,80616000.0,69880000.0,200630000.0,67946000.0,,43039000.0,6895900.0,18414000.0,32333000.0,343950000.0,63708000.0,239270000.0,,51219000.0,71358000.0,19343000.0,275790000.0,8958100.0,1460200000.0,28595000.0,1585300000.0,691420000.0,56725000.0,152950000.0,130540000.0,11433000.0,435970000.0,2943400.0,5459200.0,799020000.0,517100000.0,1
97700000.0,,157930000.0,118120000.0,329380000.0,5203100.0,5617500000.0,48264000.0,62164000.0,67134000.0,39744000.0,,92612000.0,15447000.0,785520000.0,,180400000.0,207360000.0,,445520000.0,744150000.0,13116000.0,78016000.0,695140000.0,90953000.0,21589000.0,5013500000.0,25185000.0,,10202000.0,5842600.0,,,218110000.0,116370000.0,37026000.0,423100000.0,,15935000.0,85973000.0,27103000.0,96714000.0,101700000.0,,417540000.0,24405000.0,244300000.0,,86308000.0,3815400000.0,489620000.0,4615000000.0,89121000.0,83356000.0,2164300.0,69480000.0,251480000.0,52888000.0,267950000.0,2160800000.0,41010000.0,69210000.0,198870000.0,104050000.0,,1241800000.0,827220000.0,5770500.0,18508000.0,41010000.0,142530000.0,1865900000.0,173640000.0,253310000.0,7395000000.0,2876500000.0,262460000.0,21389000.0,1317000000.0,622820000.0,61245000.0,22542000.0,62445000.0,120370000.0,1231500000.0,148030000.0,233240000.0,26827000.0,,81494000.0,410420000.0,543540000.0,193090000.0,5507100.0,6245400000.0,20489000.0,291580000.0,3661300000.0,,565580000.0,28821000.0,90669000.0,1026600000.0,5237700.0,20928000.0,347330000.0,35115000.0,69899000.0,,3863700000.0,142730000.0,15439000.0,19438000.0,36424000.0,85049000.0,322760000.0,102860000.0,31174000.0,13305000.0,99050000.0,4163300000.0,180520000.0,,761270000.0,239190000.0,2912600000.0,,89110000.0,45193000.0,79711000.0,18328000.0,48627000.0,8344600.0,,16199000.0,130610000.0,50707000.0,5408600000.0,254950000.0,18858000.0,5956600.0,679230000.0,3637800000.0,,362430000.0,159960000.0,316580000.0,163890000.0,678350000.0,159400000.0,17116000.0,853810000.0,1045000000.0,,77634000.0,190260000.0,253760000.0,1185400000.0,,10777000.0,100060000.0,24976000.0,44627000.0,170170000.0,,1037500000.0,40894000.0,55282000.0,480710000.0,1418800000.0,417280000.0,289390000.0,,201550000.0,65350000.0,37362000.0,79630000.0,128450000.0,4655400.0,50046000.0,,246520000.0,11235000.0,14241000.0,178410000.0,72884000.0,,565120000.0,4319700000.0,185340000.0,7947500.0,86534000.0,2070400000.0,,23335000.0,7
7834000.0,16820000.0,74121000.0,75925000.0,37118000.0,120850000.0,144920000.0,324410000.0,35808000.0,199070000.0,162800000.0,194970000.0,492660000.0,55248000.0,8099200.0,965750000.0,38605000.0,54219000.0,18623000.0,51276000.0,6320400.0,3598100000.0,180190000.0,,27666000.0,,3681000.0,4592500.0,32144000.0,22662000.0,150790000.0,1629200000.0,7170400.0,,166560000.0,206050000.0,48303000.0,714340000.0,111290000.0,1972300000.0,37114000.0,4976900000.0,51939000.0,153500000.0,272850000.0,45332000.0,151830000.0,45344000.0,22028000.0,35678000.0,3839900000.0,12815000.0,1450400000.0,165110000.0,99792000.0,6617600.0,,,9642400.0,,54427000.0 +2020_05_14_14_46_Q-Exactive-HF-X-Orbitrap_6070,,,20792000000.0,52399000.0,,,,518990000.0,314360000.0,26689000.0,27916000.0,,38771000.0,780360000.0,61030000.0,14420000.0,,171340000.0,7914700.0,59928000.0,852580000.0,2942000.0,101430000.0,1240300000.0,118740000.0,532210000.0,33596000.0,414360000.0,47416000.0,44258000.0,62555000.0,58456000.0,2939900000.0,,,96869000.0,13505000000.0,28977000.0,2107300000.0,,162590000.0,82157000.0,1733600000.0,505740000.0,253180000.0,652760000.0,44210000.0,92758000.0,,54046000.0,25218000.0,,491300000.0,215380000.0,,7698100.0,48935000.0,162610000.0,195550000.0,1742700000.0,33581000.0,77584000.0,2653700.0,163540000.0,105830000.0,7289800.0,,2242900000.0,,406690000.0,153730000.0,15665000.0,,79468000.0,319700000.0,3181600.0,591700000.0,112780000.0,116100000.0,1132200000.0,117100000.0,73886000.0,137670000.0,,,66887000.0,36491000.0,37179000.0,260150000.0,37140000.0,,7211000.0,45510000.0,74415000.0,7565200.0,35868000.0,108340000.0,11841000.0,84780000.0,51668000.0,2914700000.0,985960000.0,453010000.0,,14612000.0,2629800000.0,,224420000.0,,11948000.0,182830000.0,40521000.0,,,,46477000.0,,226090000.0,657720000.0,125550000.0,183850000.0,259280000.0,18964000.0,1228900000.0,65759000.0,,,381090000.0,,316860000.0,30686000.0,25706000.0,69061000.0,81794000.0,26710000.0,,94133000.0,,,,8895900.0,,60810000.0,1296300000.0,30819000.0,48809
00.0,124390000.0,,49243000.0,,242420000.0,,32735000.0,105820000.0,369980000.0,463170000.0,156810000.0,6423900.0,519580000.0,161410000.0,1583200000.0,,336210000.0,194560000.0,19068000.0,116280000.0,108180000.0,114380000.0,111050000.0,374910000.0,83817000.0,41725000.0,58588000.0,,29082000.0,,473580000.0,46062000.0,242220000.0,,77915000.0,119180000.0,36923000.0,338410000.0,,1815500000.0,67831000.0,2021800000.0,784780000.0,86198000.0,140930000.0,141640000.0,47737000.0,633770000.0,28404000.0,14048000.0,1016000000.0,424420000.0,185700000.0,,,212920000.0,423680000.0,20504000.0,8294100000.0,13500000.0,94545000.0,53842000.0,26346000.0,,78131000.0,13405000.0,1113300000.0,,136550000.0,206510000.0,,607200000.0,1126700000.0,19757000.0,57897000.0,1056400000.0,123210000.0,45505000.0,8157900000.0,57434000.0,13653000.0,28461000.0,5334900.0,,,373320000.0,99826000.0,21156000.0,,,47404000.0,157850000.0,13899000.0,154460000.0,105000000.0,12441000.0,399290000.0,46780000.0,174840000.0,,190330000.0,5352900000.0,574730000.0,6470300000.0,79135000.0,131240000.0,,106210000.0,266990000.0,66502000.0,243210000.0,2402000000.0,9584400.0,88560000.0,265900000.0,,,1417800000.0,1031300000.0,,40309000.0,48733000.0,164550000.0,2358100000.0,172830000.0,249320000.0,9623600000.0,3209200000.0,310230000.0,97519000.0,1473400000.0,793000000.0,98518000.0,6580400.0,139250000.0,61247000.0,1090200000.0,224200000.0,370100000.0,24372000.0,,50678000.0,560970000.0,1197500000.0,187470000.0,4944200.0,9426900000.0,8965600.0,363310000.0,5034600000.0,,971290000.0,9582700.0,55930000.0,1223800000.0,,40863000.0,524050000.0,90242000.0,72285000.0,,5800600000.0,230900000.0,18645000.0,11669000.0,35760000.0,11818000.0,72690000.0,81305000.0,50292000.0,11072000.0,147700000.0,2811500000.0,222270000.0,152580000.0,1272000000.0,245970000.0,4791700000.0,,121860000.0,15134000.0,72840000.0,9383800.0,39376000.0,8463600.0,,40288000.0,283210000.0,119960000.0,8304700000.0,645750000.0,,8388400.0,922060000.0,13356000000.0,,791000000.0,237480000.0
,455240000.0,232710000.0,1051900000.0,159670000.0,116620000.0,894600000.0,1120100000.0,,60514000.0,202180000.0,165880000.0,1442200000.0,,,67163000.0,4336900.0,73977000.0,134310000.0,,1117700000.0,35557000.0,133690000.0,542500000.0,1826100000.0,667200000.0,228900000.0,,330720000.0,207390000.0,54031000.0,42491000.0,208160000.0,,63948000.0,36623000.0,380120000.0,,,171950000.0,90756000.0,,638800000.0,6261100000.0,235130000.0,4082700.0,60462000.0,3581200000.0,,,75122000.0,39182000.0,99790000.0,31427000.0,14407000.0,94047000.0,192800000.0,341750000.0,,165140000.0,134870000.0,390450000.0,611260000.0,65768000.0,23087000.0,994650000.0,62436000.0,39080000.0,86168000.0,107260000.0,6738300.0,5297600000.0,391360000.0,,23989000.0,,,,91925000.0,33312000.0,203830000.0,2615500000.0,,,476780000.0,226930000.0,49096000.0,987100000.0,112750000.0,3751000000.0,56904000.0,5605500000.0,50177000.0,299250000.0,435110000.0,35715000.0,203530000.0,64936000.0,33429000.0,49161000.0,5120800000.0,10111000.0,1421900000.0,263020000.0,66148000.0,7762000.0,116760000.0,,24033000.0,,54915000.0 
+2020_05_14_17_28_Q-Exactive-HF-X-Orbitrap_6070,,,18422000000.0,162190000.0,,,,334350000.0,409160000.0,19515000.0,24517000.0,,32519000.0,808230000.0,62685000.0,32973000.0,,282520000.0,10690000.0,76047000.0,900260000.0,6563400.0,,1254600000.0,138890000.0,597030000.0,12535000.0,463050000.0,52321000.0,29243000.0,31863000.0,30233000.0,3319300000.0,,,55084000.0,14553000000.0,19237000.0,2168600000.0,,192810000.0,98892000.0,1618600000.0,582370000.0,215030000.0,658270000.0,42354000.0,93969000.0,,45805000.0,6471400.0,,413750000.0,308980000.0,13570000.0,11625000.0,45213000.0,195540000.0,254130000.0,2153500000.0,,120010000.0,3604400.0,166480000.0,23123000.0,,,2352600000.0,,890710000.0,88572000.0,9910100.0,,96288000.0,195610000.0,4053500.0,554470000.0,78609000.0,64115000.0,1023200000.0,216220000.0,90153000.0,176830000.0,,,36994000.0,28522000.0,77464000.0,306160000.0,33828000.0,8561500.0,,40054000.0,72346000.0,14533000.0,48950000.0,189280000.0,10138000.0,91120000.0,48223000.0,3186300000.0,1026300000.0,453880000.0,,,1061800000.0,,260800000.0,,11076000.0,249760000.0,134890000.0,,,,49770000.0,,193800000.0,771140000.0,73764000.0,104340000.0,291590000.0,55242000.0,1291400000.0,45899000.0,,10919000.0,298900000.0,,360720000.0,96354000.0,29903000.0,61682000.0,72556000.0,19428000.0,,85111000.0,6679200000.0,,4917300.0,15154000.0,,68731000.0,1406800000.0,,7062400.0,157370000.0,11499000.0,77759000.0,5439900.0,259990000.0,,40888000.0,110810000.0,395010000.0,406240000.0,168670000.0,123680000.0,620440000.0,189190000.0,1580900000.0,,285090000.0,75925000.0,27185000.0,126070000.0,152400000.0,121160000.0,102210000.0,327390000.0,138190000.0,39671000.0,59417000.0,,7620400.0,,560370000.0,50074000.0,297920000.0,,36041000.0,108540000.0,57510000.0,377970000.0,,1847700000.0,77745000.0,1976600000.0,935580000.0,71971000.0,162980000.0,147370000.0,55698000.0,618360000.0,21340000.0,,1022200000.0,284890000.0,203440000.0,,199670000.0,210820000.0,452900000.0,15477000.0,8052700000.0,19531000.0,86092000.0,47904000
.0,37370000.0,,97230000.0,22462000.0,1079900000.0,,150690000.0,239870000.0,,880610000.0,886510000.0,,81342000.0,1203300000.0,108630000.0,36819000.0,10330000000.0,38175000.0,14294000.0,5289800.0,,,,393680000.0,107910000.0,63427000.0,,,34917000.0,161020000.0,27110000.0,173330000.0,142590000.0,16696000.0,398860000.0,46540000.0,272330000.0,,154830000.0,5735600000.0,774010000.0,7369500000.0,38544000.0,82887000.0,,84053000.0,232810000.0,66522000.0,287400000.0,2647500000.0,24355000.0,94971000.0,338890000.0,,,1770200000.0,654030000.0,,21775000.0,56951000.0,182720000.0,2702700000.0,211490000.0,269610000.0,10830000000.0,3636900000.0,292350000.0,91652000.0,1635100000.0,824840000.0,88687000.0,13490000.0,139540000.0,24847000.0,1059300000.0,249940000.0,260500000.0,27060000.0,,98295000.0,512310000.0,1175300000.0,305320000.0,11761000.0,9215100000.0,10202000.0,313920000.0,4782700000.0,,1073900000.0,22009000.0,152380000.0,1400400000.0,,50025000.0,480000000.0,99948000.0,116750000.0,,5424100000.0,255250000.0,7659400.0,,22254000.0,12642000.0,145260000.0,108310000.0,60706000.0,,122260000.0,3460800000.0,229880000.0,197100000.0,1338000000.0,293820000.0,4179500000.0,761430000.0,140700000.0,23740000.0,93750000.0,,36129000.0,8919600.0,,47414000.0,228060000.0,89455000.0,8256700000.0,455250000.0,,,1269100000.0,5598100000.0,,614920000.0,200560000.0,473870000.0,350570000.0,1168200000.0,166430000.0,119180000.0,966070000.0,1157400000.0,,122960000.0,184490000.0,352290000.0,1781400000.0,,,57768000.0,22595000.0,56340000.0,147980000.0,2896600.0,1265300000.0,23598000.0,127070000.0,636460000.0,1868500000.0,501230000.0,270850000.0,,229530000.0,225840000.0,71193000.0,96763000.0,95561000.0,,50213000.0,,419340000.0,,,274890000.0,155490000.0,,604740000.0,6034400000.0,224810000.0,14227000.0,83604000.0,2954000000.0,,,81317000.0,39980000.0,106100000.0,44610000.0,39572000.0,139690000.0,224540000.0,430470000.0,25806000.0,175060000.0,170730000.0,200150000.0,571970000.0,111650000.0,26383000.0,1149900000.0,90898000.0
,28973000.0,31528000.0,244730000.0,,6429900000.0,341550000.0,,21379000.0,,7660700.0,,97827000.0,32322000.0,178400000.0,2542300000.0,,,485660000.0,235800000.0,,1065900000.0,154300000.0,3257300000.0,88934000.0,5799000000.0,224580000.0,360680000.0,361700000.0,35507000.0,185170000.0,18894000.0,16325000.0,,5104200000.0,11427000.0,1547600000.0,132240000.0,50055000.0,7846800.0,,,18788000.0,,51706000.0 +2020_05_14_20_11_Q-Exactive-HF-X-Orbitrap_6070,,,19662000000.0,179880000.0,,,,493790000.0,443280000.0,24200000.0,31478000.0,,39348000.0,922120000.0,48780000.0,,,239070000.0,12874000.0,97628000.0,982190000.0,20551000.0,60452000.0,1334600000.0,188540000.0,697360000.0,39429000.0,414170000.0,63756000.0,34180000.0,56029000.0,59977000.0,3520400000.0,,,99474000.0,15991000000.0,59900000.0,2645400000.0,101940000.0,254850000.0,102580000.0,1680100000.0,541440000.0,220130000.0,789660000.0,78004000.0,98091000.0,,67831000.0,23910000.0,,480510000.0,224320000.0,26617000.0,11431000.0,72154000.0,238460000.0,272290000.0,2578900000.0,120360000.0,154540000.0,24595000.0,222110000.0,72768000.0,,,2742100000.0,,720120000.0,94231000.0,10267000.0,,84720000.0,188170000.0,,408070000.0,115260000.0,53180000.0,1500700000.0,107060000.0,98559000.0,213600000.0,,,524300000.0,10825000.0,26264000.0,382220000.0,21302000.0,35905000.0,11831000.0,49977000.0,105340000.0,24394000.0,47462000.0,223640000.0,11253000.0,112320000.0,43785000.0,3232400000.0,1041300000.0,536290000.0,,,2933000000.0,,313940000.0,,,269480000.0,151640000.0,,,,57837000.0,,238280000.0,734370000.0,104990000.0,146980000.0,294500000.0,31342000.0,1488100000.0,54401000.0,,10271000.0,518240000.0,,442820000.0,79893000.0,31609000.0,19543000.0,94835000.0,,,125190000.0,8390900000.0,,,13958000.0,,66364000.0,1498000000.0,,,104040000.0,31936000.0,57709000.0,5426500.0,273820000.0,,70724000.0,96929000.0,193800000.0,595740000.0,260280000.0,51448000.0,495500000.0,259580000.0,1880800000.0,11499000.0,352950000.0,77015000.0,55321000.0,191750000.0,134530000.0,137090000
.0,202910000.0,234120000.0,94627000.0,40425000.0,49808000.0,,17550000.0,14878000.0,764340000.0,65672000.0,375220000.0,,113530000.0,101180000.0,41699000.0,383040000.0,19572000.0,2005800000.0,48671000.0,1970800000.0,973440000.0,72756000.0,162100000.0,151470000.0,65836000.0,736220000.0,,,1323200000.0,562460000.0,169680000.0,,150820000.0,202560000.0,482960000.0,,8050400000.0,12801000.0,124520000.0,69546000.0,18601000.0,,101180000.0,18937000.0,1188600000.0,,193990000.0,329360000.0,,985130000.0,1135600000.0,,102770000.0,1430900000.0,95852000.0,52032000.0,10215000000.0,,20527000.0,27430000.0,,,,348110000.0,136500000.0,39556000.0,,,50682000.0,173790000.0,30248000.0,115660000.0,163620000.0,25174000.0,531850000.0,33816000.0,211340000.0,,95397000.0,6646000000.0,979490000.0,7469200000.0,85582000.0,155890000.0,,61974000.0,355560000.0,85705000.0,313250000.0,2839100000.0,37053000.0,161100000.0,362440000.0,11345000.0,,1795700000.0,1159400000.0,,24488000.0,86058000.0,188360000.0,2746100000.0,243170000.0,295450000.0,13054000000.0,3973600000.0,291530000.0,106430000.0,1988900000.0,1006800000.0,102700000.0,22709000.0,153740000.0,82311000.0,1444200000.0,250480000.0,446210000.0,28279000.0,,103350000.0,497280000.0,1082300000.0,189720000.0,12008000.0,10109000000.0,20026000.0,367020000.0,4531700000.0,,1208100000.0,10900000.0,67877000.0,1473500000.0,,47591000.0,573100000.0,109460000.0,123740000.0,9548000.0,,196130000.0,36291000.0,15580000.0,25690000.0,86731000.0,140340000.0,91693000.0,,11878000.0,129620000.0,3260200000.0,214740000.0,275310000.0,1386700000.0,327270000.0,4269900000.0,780340000.0,155050000.0,71907000.0,143300000.0,,34168000.0,,,49923000.0,185580000.0,70268000.0,9731200000.0,597550000.0,48151000.0,27484000.0,1296700000.0,6014300000.0,,788790000.0,205940000.0,809420000.0,298750000.0,1036700000.0,215480000.0,141480000.0,982990000.0,865750000.0,,145890000.0,191880000.0,347760000.0,1862200000.0,,,87537000.0,24474000.0,61581000.0,185180000.0,,1329000000.0,20000000.0,173190000.0,479940
000.0,2098800000.0,904170000.0,264850000.0,,259950000.0,187280000.0,92950000.0,71642000.0,165910000.0,,72571000.0,,503210000.0,,,381350000.0,196090000.0,,560350000.0,7381500000.0,246250000.0,,132870000.0,2925700000.0,,,105510000.0,22790000.0,83407000.0,76361000.0,24715000.0,137090000.0,247060000.0,510610000.0,35430000.0,248610000.0,180780000.0,296300000.0,678250000.0,87031000.0,29252000.0,1349500000.0,97409000.0,46396000.0,,284750000.0,,7275700000.0,462620000.0,,27825000.0,,,11429000.0,24445000.0,30770000.0,207910000.0,3214200000.0,,,534860000.0,246850000.0,,1178200000.0,126430000.0,4316500000.0,74564000.0,6409300000.0,96468000.0,276030000.0,385220000.0,69841000.0,340460000.0,66817000.0,63016000.0,77244000.0,6400400000.0,17700000.0,1590400000.0,164490000.0,93237000.0,11043000.0,,,50093000.0,, +2020_05_15_10_30_Q-Exactive-HF-X-Orbitrap_6070,,,32630000000.0,102070000.0,,,,813920000.0,506240000.0,49695000.0,76356000.0,,44507000.0,1710700000.0,197740000.0,145570000.0,,484430000.0,23450000.0,77386000.0,2938800000.0,15667000.0,494930000.0,1979800000.0,261050000.0,939170000.0,26524000.0,770950000.0,214230000.0,82900000.0,68970000.0,31774000.0,5202100000.0,,32785000.0,147540000.0,22599000000.0,85477000.0,4193300000.0,95499000.0,413160000.0,57182000.0,2599500000.0,1150400000.0,351810000.0,1100900000.0,108890000.0,50295000.0,849630000.0,74231000.0,73403000.0,196790000.0,674340000.0,375440000.0,15188000.0,44747000.0,38281000.0,396860000.0,586030000.0,2717600000.0,120990000.0,223620000.0,,135620000.0,590790000.0,33420000.0,40047000.0,4006100000.0,,1077400000.0,223260000.0,25882000.0,12868000.0,177510000.0,269370000.0,,1088500000.0,355970000.0,123480000.0,2488100000.0,244870000.0,223950000.0,129730000.0,,,121750000.0,111290000.0,133430000.0,103910000.0,79854000.0,,5674300.0,73143000.0,209850000.0,35747000.0,67639000.0,218940000.0,13158000.0,153240000.0,105420000.0,8340400000.0,1356300000.0,954430000.0,,23783000.0,3178900000.0,19376000.0,554620000.0,,15841000.0,447960000.0,185600
000.0,,,57172000.0,115750000.0,,409090000.0,995960000.0,363900000.0,522790000.0,317380000.0,61723000.0,2152200000.0,73305000.0,259940000.0,55647000.0,803610000.0,24665000.0,532210000.0,59166000.0,10175000.0,292340000.0,147040000.0,25435000.0,,172830000.0,11993000000.0,58844000.0,55015000.0,57144000.0,196240000.0,83454000.0,2720700000.0,,14808000.0,256500000.0,9070800.0,93495000.0,10119000.0,477410000.0,,222160000.0,189260000.0,472250000.0,1024800000.0,613070000.0,236390000.0,820030000.0,208860000.0,2555800000.0,,595610000.0,107930000.0,42849000.0,186760000.0,381230000.0,152380000.0,237150000.0,741460000.0,181760000.0,41212000.0,124870000.0,6322300.0,41949000.0,79922000.0,866710000.0,399500000.0,445580000.0,,171020000.0,227710000.0,135040000.0,767960000.0,,2136200000.0,164010000.0,3943700000.0,2052300000.0,79864000.0,421310000.0,260350000.0,72502000.0,1056100000.0,77702000.0,42286000.0,1577900000.0,852260000.0,364410000.0,462210000.0,519650000.0,276670000.0,1083400000.0,,14963000000.0,41036000.0,156520000.0,88462000.0,95328000.0,,345330000.0,45778000.0,2414600000.0,,312270000.0,537610000.0,,1063200000.0,1261400000.0,213450000.0,59927000.0,1714500000.0,200610000.0,38643000.0,15842000000.0,70082000.0,34367000.0,20347000.0,,,,602400000.0,170920000.0,93968000.0,295600000.0,,127550000.0,314440000.0,73984000.0,263330000.0,195190000.0,26303000.0,1118000000.0,53909000.0,431420000.0,,676740000.0,9620300000.0,890200000.0,10217000000.0,242730000.0,239340000.0,19423000.0,112160000.0,328130000.0,184870000.0,485260000.0,6685200000.0,30296000.0,189690000.0,440980000.0,6356200.0,,2709800000.0,1507700000.0,38038000.0,36416000.0,83080000.0,201420000.0,4242300000.0,417520000.0,241420000.0,14246000000.0,6918400000.0,466780000.0,165140000.0,2823700000.0,1341100000.0,255910000.0,73149000.0,229620000.0,85829000.0,2748200000.0,415500000.0,463660000.0,40439000.0,35483000.0,86094000.0,698810000.0,1411200000.0,273130000.0,17731000.0,14263000000.0,27034000.0,781650000.0,9910400000.0,,1265500000
.0,96980000.0,82954000.0,2238800000.0,,98929000.0,661660000.0,119860000.0,637090000.0,,8546000000.0,207500000.0,22271000.0,41551000.0,29257000.0,123170000.0,273110000.0,121510000.0,24049000.0,101520000.0,298140000.0,6200600000.0,317060000.0,3180000000.0,2380200000.0,352830000.0,6019200000.0,,296370000.0,93770000.0,64474000.0,18145000.0,60029000.0,15889000.0,,74010000.0,318520000.0,30397000.0,13083000000.0,685440000.0,144840000.0,18193000.0,1821500000.0,9909600000.0,,770450000.0,441360000.0,1083100000.0,784980000.0,1499500000.0,279520000.0,246820000.0,1515700000.0,2314600000.0,,263340000.0,541130000.0,723950000.0,3415100000.0,,22095000.0,238150000.0,88479000.0,114090000.0,111400000.0,11209000.0,2116100000.0,59554000.0,256550000.0,1108800000.0,2827800000.0,1047400000.0,412310000.0,,391660000.0,470370000.0,96804000.0,165920000.0,328840000.0,63269000.0,112920000.0,,808920000.0,16872000.0,35098000.0,102460000.0,348960000.0,,1512300000.0,7689200000.0,327550000.0,18134000.0,154600000.0,6197900000.0,,36737000.0,52253000.0,49350000.0,203340000.0,137360000.0,29243000.0,416370000.0,387200000.0,885390000.0,106380000.0,459500000.0,203690000.0,434860000.0,992320000.0,85699000.0,32932000.0,1921100000.0,105440000.0,91334000.0,82288000.0,250230000.0,,8195300000.0,775960000.0,,42291000.0,,,,95209000.0,79755000.0,391710000.0,3629700000.0,11289000.0,,764430000.0,455110000.0,251030000.0,1747300000.0,301590000.0,4250700000.0,88713000.0,9790200000.0,99014000.0,697010000.0,477100000.0,83109000.0,1223800000.0,27852000.0,87279000.0,65113000.0,8402500000.0,12919000.0,2521900000.0,484820000.0,130320000.0,19314000.0,349130000.0,,29580000.0,,107740000.0 
+2020_05_20_12_33_Q-Exactive-HF-X-Orbitrap_6070,,,19294000000.0,113070000.0,,8599600.0,,591470000.0,414570000.0,,53064000.0,,,785560000.0,18148000.0,36530000.0,,185880000.0,10854000.0,71336000.0,1573500000.0,,,1523000000.0,143550000.0,770670000.0,37480000.0,441150000.0,86074000.0,30008000.0,28747000.0,33660000.0,3545800000.0,,12586000.0,61056000.0,15826000000.0,86612000.0,3346700000.0,137520000.0,201200000.0,26181000.0,1648900000.0,673930000.0,156810000.0,840030000.0,57051000.0,46801000.0,,26393000.0,,,481000000.0,294410000.0,,10251000.0,25526000.0,232930000.0,316570000.0,2773100000.0,63764000.0,137340000.0,,171120000.0,734940000.0,,4712100.0,2394500000.0,,1176400000.0,99599000.0,,6249400.0,50896000.0,157640000.0,6679200.0,902520000.0,155790000.0,82751000.0,1708500000.0,160190000.0,78203000.0,81709000.0,,,630320000.0,26857000.0,19621000.0,44099000.0,72152000.0,20271000.0,22387000.0,36673000.0,97374000.0,,,366870000.0,8328800.0,118710000.0,37629000.0,4532000000.0,1124300000.0,658980000.0,,12331000.0,1493200000.0,,190910000.0,16687000.0,,303260000.0,111010000.0,,,,55414000.0,,465880000.0,717470000.0,137390000.0,377900000.0,230020000.0,49558000.0,1198200000.0,,35222000.0,352560000.0,502760000.0,44478000.0,339540000.0,,35971000.0,50678000.0,85319000.0,48988000.0,,96544000.0,11236000000.0,8983000.0,,,,44606000.0,1777800000.0,,,260300000.0,5440100.0,38280000.0,5172500.0,492080000.0,25089000.0,55340000.0,91234000.0,570140000.0,847290000.0,222630000.0,100320000.0,662640000.0,238130000.0,1358800000.0,,328740000.0,227540000.0,,220880000.0,94138000.0,122130000.0,125150000.0,282810000.0,133020000.0,69302000.0,34514000.0,11983000.0,10011000.0,,611890000.0,105470000.0,407620000.0,,21633000.0,83087000.0,42802000.0,536410000.0,22580000.0,1619700000.0,42732000.0,3854800000.0,1155300000.0,34022000.0,388620000.0,148820000.0,,601040000.0,34708000.0,,804200000.0,501240000.0,235640000.0,,828390000.0,250180000.0,692950000.0,,1139900000.0,15300000.0,102740000.0,79152000.0,30149000.0,,35366
0000.0,42428000.0,2092700000.0,,96856000.0,395160000.0,,569980000.0,1131000000.0,,44542000.0,1736200000.0,5512600.0,12584000.0,9456900000.0,45605000.0,,10381000.0,,,,397260000.0,164920000.0,90095000.0,485640000.0,,68383000.0,205070000.0,40754000.0,202960000.0,121920000.0,14498000.0,872520000.0,62559000.0,368540000.0,,86955000.0,7018200000.0,436110000.0,7735400000.0,71171000.0,74393000.0,6315000.0,17714000.0,558850000.0,48363000.0,529260000.0,4503700000.0,59845000.0,110660000.0,321240000.0,24949000.0,,2361400000.0,1033500000.0,,68624000.0,67544000.0,135640000.0,2702300000.0,262190000.0,302860000.0,13109000000.0,3963100000.0,218680000.0,130320000.0,1642700000.0,716520000.0,104370000.0,17403000.0,123200000.0,70861000.0,1874800000.0,222140000.0,435790000.0,,,14969000.0,287220000.0,1515800000.0,306860000.0,,12286000000.0,,481550000.0,5428600000.0,,1242200000.0,,185780000.0,1583600000.0,,36205000.0,493390000.0,69908000.0,330510000.0,,4903100000.0,169800000.0,34007000.0,23774000.0,43910000.0,139510000.0,147330000.0,99482000.0,,26554000.0,205210000.0,5186000000.0,492770000.0,577580000.0,1428400000.0,560740000.0,4599000000.0,,80895000.0,38345000.0,59714000.0,,34377000.0,11925000.0,,44014000.0,187360000.0,62321000.0,9673600000.0,879470000.0,,,1573800000.0,6265700000.0,,326580000.0,385400000.0,688650000.0,,1135800000.0,213960000.0,,1584700000.0,884690000.0,,64939000.0,247820000.0,280920000.0,1421600000.0,,,53342000.0,44846000.0,42516000.0,46700000.0,,992120000.0,119300000.0,149980000.0,752900000.0,1281800000.0,770320000.0,104840000.0,,178130000.0,315820000.0,62564000.0,99251000.0,244680000.0,,110190000.0,,417050000.0,,,63392000.0,30259000.0,,1060200000.0,9343000000.0,226830000.0,,34007000.0,4375800000.0,,,74825000.0,5645800.0,102880000.0,65727000.0,68376000.0,167710000.0,305910000.0,813790000.0,,236680000.0,183250000.0,371010000.0,1010800000.0,118960000.0,36666000.0,1369400000.0,41539000.0,45084000.0,62173000.0,198950000.0,,5559600000.0,396230000.0,,,,21250000.0,,71218000.0,21
912000.0,180320000.0,2169700000.0,,,431020000.0,152330000.0,20820000.0,1237800000.0,124360000.0,4067100000.0,58240000.0,6963000000.0,36315000.0,527920000.0,320670000.0,17924000.0,1090100000.0,101860000.0,128070000.0,,6024400000.0,10908000.0,1920000000.0,168490000.0,51931000.0,10048000.0,,,9842300.0,,47400000.0 +2020_05_20_15_35_Q-Exactive-HF-X-Orbitrap_6070,,,28727000000.0,284320000.0,,,17752000.0,941560000.0,450840000.0,21865000.0,78271000.0,,12277000.0,938740000.0,9626200.0,44294000.0,,342410000.0,24786000.0,87073000.0,1804700000.0,17789000.0,413610000.0,1971400000.0,255590000.0,790520000.0,38563000.0,711510000.0,181640000.0,42717000.0,27424000.0,37150000.0,4438700000.0,,,140220000.0,20481000000.0,115360000.0,3517000000.0,120910000.0,324450000.0,76184000.0,2214200000.0,698260000.0,373630000.0,1379900000.0,75776000.0,118150000.0,,58141000.0,65710000.0,112290000.0,735990000.0,396660000.0,8374400.0,50707000.0,89185000.0,306350000.0,487700000.0,2610600000.0,98509000.0,220550000.0,,202930000.0,202520000.0,,12350000.0,3159500000.0,,892650000.0,152390000.0,33904000.0,37523000.0,103370000.0,237060000.0,,1067800000.0,183310000.0,39277000.0,2574200000.0,369650000.0,96696000.0,192700000.0,,,69505000.0,124600000.0,141360000.0,44475000.0,,13660000.0,5439100.0,54909000.0,88475000.0,15733000.0,26810000.0,339250000.0,17233000.0,141020000.0,52600000.0,4267300000.0,1211800000.0,718960000.0,,25451000.0,2356600000.0,,269460000.0,24279000.0,34879000.0,413420000.0,152660000.0,,,,89243000.0,15074000.0,384670000.0,924910000.0,224280000.0,245360000.0,596120000.0,44273000.0,1698300000.0,123380000.0,,40231000.0,728120000.0,40778000.0,373860000.0,,49000000.0,71600000.0,160680000.0,32057000.0,,183970000.0,13438000000.0,29944000.0,26485000.0,31607000.0,90522000.0,66924000.0,3428500000.0,63369000.0,5853400.0,90683000.0,23714000.0,36119000.0,13459000.0,410300000.0,,51144000.0,133280000.0,424140000.0,892940000.0,403380000.0,188500000.0,554110000.0,282580000.0,1978600000.0,,399960000.0,62374000.0,
80157000.0,577640000.0,123170000.0,114170000.0,302220000.0,917040000.0,151150000.0,,51498000.0,,43898000.0,131920000.0,849220000.0,117860000.0,526980000.0,,147300000.0,146110000.0,116740000.0,622680000.0,15367000.0,2732200000.0,75391000.0,4430000000.0,1741500000.0,94869000.0,307510000.0,180730000.0,,633890000.0,36088000.0,23680000.0,1181200000.0,694840000.0,268740000.0,,338520000.0,301320000.0,1042700000.0,,11747000000.0,50485000.0,15193000.0,90268000.0,51445000.0,,254810000.0,66735000.0,1815700000.0,,234850000.0,598870000.0,,653250000.0,1428900000.0,78910000.0,86222000.0,1290900000.0,63628000.0,51570000.0,8327400000.0,49639000.0,21141000.0,47701000.0,,,,586480000.0,160710000.0,53824000.0,,,53462000.0,179360000.0,69146000.0,206720000.0,175760000.0,,886820000.0,62691000.0,445730000.0,,138980000.0,7525400000.0,846110000.0,9269600000.0,50062000.0,247540000.0,,58950000.0,296870000.0,41827000.0,521290000.0,5746700000.0,74835000.0,124340000.0,426060000.0,15797000.0,,2886000000.0,1769500000.0,,66696000.0,69277000.0,244270000.0,3495800000.0,325810000.0,422100000.0,12650000000.0,4593500000.0,290230000.0,149940000.0,2454600000.0,936140000.0,224040000.0,33281000.0,184600000.0,121240000.0,2416400000.0,309670000.0,520090000.0,31692000.0,,61958000.0,548080000.0,924910000.0,531590000.0,,13007000000.0,11955000.0,526480000.0,7080500000.0,,2651400000.0,52315000.0,94839000.0,1796400000.0,,14137000.0,792800000.0,109010000.0,274600000.0,,,310500000.0,71021000.0,23610000.0,81164000.0,144780000.0,112130000.0,116630000.0,44157000.0,57957000.0,342110000.0,6645700000.0,372400000.0,1920700000.0,1735800000.0,449770000.0,4390900000.0,,164050000.0,62925000.0,71143000.0,24835000.0,109650000.0,18266000.0,,116280000.0,227350000.0,111030000.0,10936000000.0,1045600000.0,41555000.0,21413000.0,1422300000.0,7209400000.0,,1070100000.0,293620000.0,725870000.0,461940000.0,1377200000.0,240090000.0,88412000.0,1298500000.0,850270000.0,,173040000.0,746910000.0,463090000.0,2547700000.0,,416090000.0,124030000.0,
110990000.0,41588000.0,159020000.0,,1204600000.0,8261900.0,252260000.0,890050000.0,1812400000.0,1167600000.0,384670000.0,,277190000.0,252870000.0,69942000.0,93585000.0,140300000.0,43984000.0,29637000.0,153020000.0,522820000.0,,,129160000.0,17471000.0,,1204100000.0,7485200000.0,250430000.0,,74852000.0,3713400000.0,,26345000.0,104340000.0,107920000.0,147720000.0,88908000.0,58654000.0,232820000.0,366940000.0,906890000.0,36259000.0,329180000.0,409700000.0,429940000.0,1186800000.0,65844000.0,35192000.0,1984900000.0,86274000.0,98984000.0,105440000.0,288890000.0,,8374600000.0,526570000.0,,16497000.0,,15503000.0,,44704000.0,31787000.0,206790000.0,3487900000.0,,,318310000.0,355050000.0,156770000.0,1575400000.0,291440000.0,4147700000.0,150560000.0,7924200000.0,70416000.0,466000000.0,509940000.0,55221000.0,867120000.0,75206000.0,74947000.0,63644000.0,7360200000.0,54522000.0,2800500000.0,253390000.0,93828000.0,14196000.0,,,25599000.0,,101980000.0 +2020_05_22_14_57_Q-Exactive-HF-X-Orbitrap_6070,,,32944000000.0,296440000.0,,22298000.0,9004300.0,824760000.0,401240000.0,17158000.0,49187000.0,,22071000.0,1141100000.0,88049000.0,94839000.0,,329810000.0,6377400.0,45172000.0,1226400000.0,25109000.0,,1727700000.0,265380000.0,762340000.0,20567000.0,760820000.0,120330000.0,17169000.0,25780000.0,74133000.0,4324400000.0,,9961000.0,119680000.0,22701000000.0,7664900.0,2811400000.0,,446140000.0,99606000.0,2792000000.0,801290000.0,265630000.0,1120600000.0,66863000.0,103120000.0,,443190000.0,94955000.0,17645000.0,737970000.0,460920000.0,,28331000.0,38988000.0,376880000.0,314910000.0,3922800000.0,145000000.0,130990000.0,,146510000.0,45706000.0,56087000.0,8290300.0,3145900000.0,,348370000.0,91735000.0,19583000.0,106570000.0,56908000.0,128630000.0,,675480000.0,163710000.0,80980000.0,2088400000.0,252730000.0,94310000.0,392860000.0,,,482440000.0,65256000.0,84504000.0,50411000.0,69659000.0,63449000.0,10096000.0,149940000.0,164890000.0,43997000.0,34186000.0,294700000.0,15235000.0,83878000.0,66627000.0,
3493400000.0,992340000.0,632650000.0,,29975000.0,3055300000.0,,119650000.0,,,284960000.0,120760000.0,,,30109000.0,110750000.0,,280800000.0,1229000000.0,150740000.0,336180000.0,584770000.0,396580000.0,1979600000.0,83827000.0,,35466000.0,730810000.0,,648890000.0,41985000.0,20491000.0,106960000.0,76905000.0,33183000.0,,158670000.0,9547000000.0,81262000.0,32496000.0,27808000.0,234990000.0,65048000.0,2126300000.0,,9922500.0,105650000.0,26140000.0,140020000.0,9300200.0,325660000.0,,24611000.0,129190000.0,440430000.0,898930000.0,195090000.0,127710000.0,624970000.0,225240000.0,2477700000.0,,244650000.0,100930000.0,32040000.0,,261630000.0,101580000.0,128320000.0,374310000.0,91136000.0,,59838000.0,7200300.0,28473000.0,,995560000.0,94790000.0,331300000.0,,106550000.0,103250000.0,73314000.0,712410000.0,27697000.0,3218800000.0,88609000.0,2692500000.0,2292600000.0,68718000.0,205960000.0,214780000.0,,1033400000.0,27454000.0,,1388100000.0,613100000.0,208190000.0,,479570000.0,129290000.0,555540000.0,45458000.0,15003000000.0,,186960000.0,12808000.0,87328000.0,,48447000.0,22623000.0,1496700000.0,,273830000.0,383710000.0,,895890000.0,952920000.0,21184000.0,108780000.0,1332900000.0,88688000.0,54519000.0,11016000000.0,100030000.0,36625000.0,45037000.0,,,,508400000.0,97018000.0,50759000.0,,,81567000.0,206300000.0,85764000.0,161150000.0,144890000.0,,851390000.0,62156000.0,270090000.0,,140150000.0,8763300000.0,808740000.0,9049000000.0,146510000.0,120390000.0,,87742000.0,140120000.0,54728000.0,759610000.0,3084400000.0,35014000.0,137830000.0,328280000.0,14469000.0,5304300.0,3686800000.0,1146900000.0,,32081000.0,126810000.0,171250000.0,3693700000.0,425910000.0,347520000.0,15664000000.0,4924100000.0,368190000.0,137920000.0,2357400000.0,1087500000.0,189460000.0,24266000.0,196640000.0,125200000.0,2278800000.0,414030000.0,449050000.0,58803000.0,6925000.0,87681000.0,302220000.0,235280000.0,276360000.0,34634000.0,13888000000.0,15126000.0,429550000.0,6385900000.0,,1655500000.0,18412000.0,,1292600000.
0,,58717000.0,372220000.0,132900000.0,91726000.0,,7179700000.0,292660000.0,35630000.0,27768000.0,26894000.0,163630000.0,57489000.0,172450000.0,29952000.0,,209310000.0,4007400000.0,370960000.0,2036800000.0,1945100000.0,400430000.0,3230000000.0,,178770000.0,35541000.0,75622000.0,,53681000.0,20982000.0,,62147000.0,215040000.0,54237000.0,10133000000.0,820490000.0,,35046000.0,1725000000.0,20494000000.0,465710000.0,2290000000.0,316530000.0,1278300000.0,498540000.0,1104600000.0,231450000.0,73969000.0,1381500000.0,1445800000.0,,166740000.0,262970000.0,,3380300000.0,,,135180000.0,68211000.0,63551000.0,274920000.0,,1681300000.0,,353180000.0,557790000.0,1468600000.0,1611000000.0,190290000.0,,333730000.0,208060000.0,77274000.0,47131000.0,136520000.0,9866300.0,113270000.0,,662360000.0,,27812000.0,192440000.0,25654000.0,,1549200000.0,5776900000.0,279810000.0,9069500.0,155100000.0,4241900000.0,,,65227000.0,35302000.0,168630000.0,46861000.0,,144360000.0,412200000.0,747840000.0,24988000.0,469000000.0,209840000.0,472910000.0,970070000.0,159770000.0,42981000.0,1187800000.0,137680000.0,61678000.0,,395820000.0,,5660700000.0,515880000.0,,,4642500.0,,,93089000.0,58520000.0,336680000.0,4482000000.0,,,672940000.0,377080000.0,244870000.0,1591600000.0,115280000.0,4231900000.0,106540000.0,6859100000.0,28673000.0,337210000.0,474750000.0,59681000.0,22597000.0,29131000.0,13911000.0,72421000.0,6142600000.0,50337000.0,3247700000.0,54275000.0,89485000.0,13327000.0,,100460000.0,125180000.0,,73071000.0 
+2020_05_22_17_43_Q-Exactive-HF-X-Orbitrap_6070,,,30697000000.0,172150000.0,,17624000.0,3268700.0,788540000.0,444560000.0,34973000.0,29019000.0,,14543000.0,1689700000.0,104560000.0,77878000.0,,351050000.0,,120750000.0,1878700000.0,21160000.0,311590000.0,1609900000.0,262580000.0,959760000.0,39228000.0,338500000.0,134960000.0,70728000.0,58770000.0,56219000.0,5607500000.0,,,144430000.0,24279000000.0,54682000.0,4097500000.0,,441740000.0,110230000.0,2737600000.0,1143100000.0,269700000.0,1481900000.0,154470000.0,98278000.0,700000000.0,62725000.0,36279000.0,63351000.0,942180000.0,490260000.0,,52224000.0,29711000.0,405550000.0,583660000.0,3224600000.0,161990000.0,179340000.0,,176140000.0,211640000.0,61468000.0,8523400.0,4841100000.0,,707150000.0,,65003000.0,18784000.0,168140000.0,253750000.0,,923570000.0,164220000.0,46766000.0,2685100000.0,439850000.0,143300000.0,164740000.0,,,295980000.0,12864000.0,135290000.0,102560000.0,22584000.0,33136000.0,,95590000.0,255880000.0,25013000.0,44238000.0,520000000.0,20114000.0,205850000.0,96722000.0,4784100000.0,1063800000.0,919960000.0,,22336000.0,3507600000.0,4890500.0,459420000.0,,14051000.0,395960000.0,279220000.0,,,20437000.0,98163000.0,,352420000.0,1103600000.0,222830000.0,454340000.0,485450000.0,107750000.0,7107800000.0,59763000.0,,25270000.0,848780000.0,18929000.0,566150000.0,,27816000.0,149310000.0,105240000.0,34343000.0,,175250000.0,,71783000.0,24059000.0,56909000.0,243520000.0,62399000.0,2568100000.0,12521000.0,8885500.0,178210000.0,8941600.0,82391000.0,19157000.0,412130000.0,,129760000.0,138050000.0,190320000.0,975470000.0,312960000.0,181300000.0,752220000.0,305050000.0,2421000000.0,,562790000.0,88359000.0,45859000.0,568320000.0,292490000.0,174910000.0,142540000.0,898370000.0,215930000.0,39065000.0,,,26825000.0,55774000.0,1085300000.0,139490000.0,622530000.0,,168380000.0,121260000.0,111970000.0,895630000.0,29967000.0,1974200000.0,123760000.0,3133400000.0,2005500000.0,34598000.0,267550000.0,314190000.0,,1121200000.0,27142000.0,
50365000.0,1382300000.0,554460000.0,211500000.0,142540000.0,435510000.0,221110000.0,674940000.0,,1482800000.0,73424000.0,220150000.0,79512000.0,90904000.0,,235340000.0,62270000.0,2146200000.0,,246740000.0,516580000.0,,985930000.0,1495600000.0,51624000.0,102010000.0,1274200000.0,191230000.0,59724000.0,15306000000.0,99312000.0,14060000.0,16596000.0,10459000.0,,,692680000.0,153980000.0,73776000.0,321120000.0,,60421000.0,230850000.0,48318000.0,192790000.0,231320000.0,,735040000.0,72856000.0,516130000.0,,152050000.0,8616700000.0,1082300000.0,11141000000.0,142270000.0,163990000.0,,110120000.0,340220000.0,133060000.0,475870000.0,4740900000.0,42557000.0,189910000.0,500840000.0,19640000.0,,3279900000.0,1556100000.0,,73005000.0,137860000.0,430610000.0,3455100000.0,440340000.0,445800000.0,12742000000.0,5488000000.0,530250000.0,218860000.0,2816900000.0,1180400000.0,172970000.0,71227000.0,264730000.0,188380000.0,1944600000.0,707730000.0,629280000.0,44382000.0,31991000.0,56030000.0,422050000.0,1451800000.0,366180000.0,26776000.0,14127000000.0,17096000.0,611420000.0,8998200000.0,,1758400000.0,27021000.0,1526100000.0,1913600000.0,,35614000.0,783870000.0,85018000.0,216860000.0,,8588400000.0,289130000.0,43220000.0,51872000.0,17800000.0,250700000.0,224640000.0,201090000.0,20964000.0,33675000.0,200180000.0,8410000000.0,348080000.0,2206200000.0,2497400000.0,636980000.0,5884300000.0,,207290000.0,90578000.0,77694000.0,22586000.0,89776000.0,18720000.0,,75778000.0,372360000.0,48666000.0,12345000000.0,983460000.0,18116000.0,,1999400000.0,9247800000.0,,1140300000.0,363920000.0,803510000.0,,1376900000.0,288900000.0,369070000.0,1636000000.0,1935900000.0,,199840000.0,338360000.0,312800000.0,3730000000.0,,22254000.0,192380000.0,60718000.0,125600000.0,166430000.0,,2348200000.0,47752000.0,307630000.0,1040300000.0,2503900000.0,1121400000.0,355000000.0,,429310000.0,244900000.0,45610000.0,150480000.0,314310000.0,58943000.0,44740000.0,,974800000.0,43172000.0,,121780000.0,294410000.0,,1419200000.0,87388
00000.0,249430000.0,11704000.0,135410000.0,5351300000.0,,17537000.0,160700000.0,55454000.0,135290000.0,86376000.0,,220830000.0,395470000.0,741290000.0,36477000.0,526500000.0,262310000.0,560030000.0,1168500000.0,232590000.0,31467000.0,1983300000.0,108270000.0,77504000.0,93190000.0,382350000.0,,7876300000.0,402190000.0,,28242000.0,13392000.0,,,416840000.0,16702000.0,420190000.0,3649400000.0,26221000.0,,556040000.0,449000000.0,,1735400000.0,218490000.0,4792400000.0,155570000.0,9214300000.0,117120000.0,351550000.0,504090000.0,116300000.0,1030600000.0,142770000.0,44507000.0,87016000.0,8609700000.0,21521000.0,2531800000.0,168320000.0,56058000.0,22182000.0,,,45924000.0,,203710000.0 +2020_05_26_14_20_Q-Exactive-HF-X-Orbitrap_6070,,,45390000000.0,313940000.0,,,,1487500000.0,935980000.0,100130000.0,45306000.0,,106170000.0,1481300000.0,163790000.0,,,482710000.0,,169770000.0,2572500000.0,,158630000.0,3138700000.0,394760000.0,1393500000.0,88939000.0,1100900000.0,139840000.0,89269000.0,101400000.0,,7143800000.0,,,171730000.0,36880000000.0,116390000.0,6990300000.0,199960000.0,671970000.0,188930000.0,3075200000.0,1221500000.0,601130000.0,1421600000.0,247840000.0,193990000.0,,569630000.0,,,1370500000.0,739470000.0,39236000.0,74763000.0,167820000.0,333500000.0,682030000.0,6859700000.0,215980000.0,216610000.0,47304000.0,301640000.0,,,33821000.0,7407700000.0,,1297900000.0,113800000.0,,20236000.0,186220000.0,445820000.0,17951000.0,2236400000.0,312590000.0,151620000.0,4313800000.0,344480000.0,193350000.0,420510000.0,,,136690000.0,160820000.0,156850000.0,29748000.0,,65396000.0,19124000.0,217560000.0,187020000.0,,,369480000.0,20723000.0,177170000.0,126580000.0,8153400000.0,2134200000.0,1671600000.0,,69907000.0,11088000000.0,18794000.0,316320000.0,10189000.0,,608440000.0,563320000.0,,,,59208000.0,,522020000.0,2488000000.0,403360000.0,549680000.0,866570000.0,149180000.0,3973200000.0,198130000.0,47921000.0,21773000.0,1223000000.0,,517530000.0,113960000.0,106900000.0,364970000.0,200870000.0,96
698000.0,,311400000.0,19413000000.0,,71333000.0,108850000.0,148900000.0,67513000.0,3035100000.0,62570000.0,,380190000.0,34798000.0,1316600000.0,,527840000.0,88060000.0,134860000.0,155820000.0,905150000.0,964420000.0,708450000.0,87981000.0,1499800000.0,430630000.0,3602300000.0,,768550000.0,178180000.0,17845000.0,384870000.0,384720000.0,278770000.0,225000000.0,1383000000.0,377110000.0,76359000.0,160320000.0,13126000.0,57343000.0,57035000.0,1772300000.0,380970000.0,965410000.0,,128540000.0,104110000.0,174340000.0,1114200000.0,55940000.0,4428900000.0,74866000.0,5248000000.0,3604300000.0,172980000.0,416610000.0,472020000.0,27974000.0,2058200000.0,19044000.0,70397000.0,3158400000.0,1034900000.0,464990000.0,,916740000.0,,1391700000.0,,26340000000.0,190160000.0,240530000.0,96300000.0,125170000.0,,550520000.0,54379000.0,3623400000.0,,469050000.0,561170000.0,,1289800000.0,2088600000.0,50799000.0,85459000.0,1851400000.0,302540000.0,107900000.0,20506000000.0,255550000.0,34402000.0,18972000.0,20735000.0,,,1370000000.0,186240000.0,45130000.0,,,64631000.0,553680000.0,113230000.0,432480000.0,356170000.0,,1840700000.0,103180000.0,344580000.0,,361190000.0,13421000000.0,2002300000.0,18133000000.0,291180000.0,323710000.0,,227170000.0,691130000.0,154080000.0,736460000.0,7798300000.0,66540000.0,386430000.0,283940000.0,40959000.0,,4407900000.0,2481400000.0,31316000.0,63446000.0,239390000.0,610440000.0,9167900000.0,687730000.0,512320000.0,32607000000.0,8413300000.0,645030000.0,162770000.0,4924400000.0,2367100000.0,225040000.0,98413000.0,340930000.0,189920000.0,2817100000.0,668680000.0,1064500000.0,69205000.0,16714000.0,292580000.0,639950000.0,1844800000.0,655260000.0,36312000.0,22273000000.0,,839820000.0,15134000000.0,,2586900000.0,103920000.0,2948700000.0,3537500000.0,,104920000.0,654050000.0,144290000.0,278590000.0,,14574000000.0,231860000.0,77923000.0,33236000.0,76271000.0,231610000.0,393560000.0,238600000.0,62186000.0,47176000.0,592870000.0,13192000000.0,483460000.0,2988700000.0,344820
0000.0,871590000.0,8960000000.0,2463200000.0,481200000.0,111690000.0,49833000.0,66785000.0,98779000.0,44703000.0,,140070000.0,522950000.0,161040000.0,21698000000.0,1397200000.0,39558000.0,47173000.0,4175200000.0,17873000000.0,,2079900000.0,554900000.0,1508800000.0,918340000.0,2773500000.0,492720000.0,730830000.0,1833600000.0,3755900000.0,,367560000.0,188970000.0,530680000.0,5174400000.0,,,372520000.0,75962000.0,171460000.0,530690000.0,,4354800000.0,194890000.0,424220000.0,1750600000.0,4173300000.0,2044900000.0,399680000.0,,903160000.0,669390000.0,194870000.0,145150000.0,384220000.0,,292380000.0,,942900000.0,28752000.0,46033000.0,373100000.0,218870000.0,,1930000000.0,19372000000.0,679430000.0,,194610000.0,6624300000.0,,72904000.0,213550000.0,66927000.0,255900000.0,149750000.0,,206800000.0,648120000.0,1557100000.0,96728000.0,781440000.0,456910000.0,662360000.0,1746700000.0,104600000.0,115940000.0,3739000000.0,,240370000.0,171710000.0,321350000.0,,14519000000.0,645240000.0,,,,,19105000.0,138020000.0,25102000.0,575050000.0,5921600000.0,55698000.0,,1098300000.0,636430000.0,,2322900000.0,302160000.0,7803100000.0,175670000.0,15857000000.0,109580000.0,827970000.0,1160800000.0,140940000.0,83079000.0,90975000.0,46037000.0,108180000.0,14982000000.0,,5382000000.0,405480000.0,113690000.0,57102000.0,,,29029000.0,,196310000.0 
+2020_05_27_13_57_Q-Exactive-HF-X-Orbitrap_6070,,,83324000000.0,241760000.0,,,,3149700000.0,1572500000.0,219930000.0,197580000.0,,137790000.0,4550500000.0,209140000.0,168530000.0,,554690000.0,19899000.0,149710000.0,6816700000.0,38311000.0,567780000.0,6635100000.0,1192900000.0,3380000000.0,228680000.0,2007500000.0,868290000.0,138060000.0,181790000.0,79142000.0,18072000000.0,,28674000.0,674590000.0,80447000000.0,308080000.0,12632000000.0,471250000.0,2018100000.0,508370000.0,10382000000.0,2987900000.0,1317200000.0,4565500000.0,392640000.0,138770000.0,2452800000.0,259220000.0,149840000.0,174080000.0,3190100000.0,1583500000.0,,171220000.0,323380000.0,1508300000.0,2154400000.0,7809500000.0,373860000.0,575040000.0,120710000.0,831210000.0,615800000.0,128180000.0,137010000.0,10449000000.0,,3419800000.0,680730000.0,,130170000.0,796820000.0,894810000.0,,3617600000.0,757290000.0,252720000.0,8751500000.0,864300000.0,766630000.0,483940000.0,,15742000.0,479430000.0,152740000.0,455480000.0,459630000.0,152690000.0,172980000.0,64284000.0,324450000.0,598980000.0,110930000.0,,1021700000.0,1281800000.0,635630000.0,265490000.0,14746000000.0,4722200000.0,3420100000.0,,88564000.0,16500000000.0,,1176700000.0,,150190000.0,1471900000.0,944500000.0,,,133330000.0,445910000.0,,1281500000.0,3812000000.0,766360000.0,1187100000.0,2032300000.0,198880000.0,5393000000.0,507690000.0,73303000.0,69469000.0,2977900000.0,125780000.0,1911400000.0,,189210000.0,789290000.0,418500000.0,132860000.0,34579000.0,854770000.0,28844000000.0,,265700000.0,357410000.0,1119700000.0,236320000.0,7752900000.0,105190000.0,,731460000.0,160930000.0,312390000.0,45537000.0,1345900000.0,,468430000.0,695450000.0,1100000000.0,3322400000.0,1572400000.0,653150000.0,3369500000.0,1453000000.0,7173000000.0,,1421300000.0,446570000.0,387080000.0,802840000.0,960260000.0,570900000.0,928360000.0,1574300000.0,628420000.0,51250000.0,134750000.0,26104000.0,199530000.0,146740000.0,2497200000.0,762390000.0,2739200000.0,,286430000.0,652560000.0,64
6210000.0,2751300000.0,171900000.0,6856700000.0,433890000.0,13541000000.0,4719200000.0,626200000.0,1405200000.0,395030000.0,202170000.0,4476000000.0,126620000.0,,6610000000.0,3124200000.0,577680000.0,376840000.0,1237400000.0,790190000.0,3685400000.0,104880000.0,3689000000.0,101710000.0,135470000.0,534030000.0,396060000.0,,816540000.0,271740000.0,7735800000.0,,1327400000.0,2180900000.0,,2380300000.0,5706500000.0,209820000.0,452040000.0,5457900000.0,547530000.0,200760000.0,38519000000.0,361380000.0,69582000.0,196360000.0,18208000.0,,,1444800000.0,618340000.0,333890000.0,,,271190000.0,1094700000.0,249820000.0,701690000.0,518110000.0,57276000.0,3654800000.0,473440000.0,1669000000.0,,640790000.0,34257000000.0,3309000000.0,36121000000.0,175690000.0,651970000.0,39804000.0,365230000.0,823840000.0,385170000.0,2361800000.0,14195000000.0,372330000.0,1192900000.0,1077600000.0,152200000.0,,13547000000.0,5137100000.0,44187000.0,148070000.0,360830000.0,1462200000.0,16394000000.0,1364000000.0,1834600000.0,53913000000.0,16521000000.0,1507700000.0,477360000.0,11646000000.0,5014100000.0,897950000.0,197210000.0,1069500000.0,801920000.0,8450100000.0,1627500000.0,2452300000.0,119960000.0,28106000.0,638640000.0,1732800000.0,3921800000.0,1970600000.0,167550000.0,33074000000.0,146160000.0,1735500000.0,27131000000.0,85154000.0,4452300000.0,160550000.0,153000000.0,6117800000.0,,353040000.0,2115400000.0,305920000.0,612760000.0,,25198000000.0,1005900000.0,235640000.0,153910000.0,127200000.0,666630000.0,393180000.0,964400000.0,165790000.0,219750000.0,311400000.0,25687000000.0,1710100000.0,,6919500000.0,1999300000.0,21099000000.0,,716820000.0,184260000.0,499330000.0,123200000.0,218720000.0,119780000.0,27620000.0,359380000.0,1936000000.0,329210000.0,37683000000.0,2953200000.0,81533000.0,175360000.0,8163700000.0,25852000000.0,,3513500000.0,1101200000.0,2371200000.0,1414900000.0,4809400000.0,855500000.0,717550000.0,6514300000.0,6331200000.0,,844020000.0,913790000.0,1554900000.0,10932000000.0,,,64140
0000.0,389640000.0,757020000.0,381070000.0,,5926800000.0,229370000.0,647420000.0,3413300000.0,7242200000.0,6437800000.0,1279300000.0,,1347600000.0,825080000.0,390890000.0,550460000.0,882850000.0,164190000.0,364320000.0,,2270500000.0,32725000.0,76118000.0,549610000.0,293810000.0,,4953700000.0,27997000000.0,1337000000.0,23256000.0,590090000.0,10686000000.0,,138660000.0,531860000.0,258780000.0,433270000.0,431230000.0,181470000.0,961690000.0,1383700000.0,3469700000.0,276390000.0,1920200000.0,1317200000.0,1476200000.0,4421100000.0,902920000.0,185230000.0,7142600000.0,333750000.0,588820000.0,511240000.0,727730000.0,,22000000000.0,1898200000.0,,44575000.0,51738000.0,,26380000.0,405360000.0,170220000.0,973120000.0,13984000000.0,99042000.0,378100000.0,1067100000.0,1638100000.0,733150000.0,5237800000.0,692430000.0,21468000000.0,432500000.0,30568000000.0,346090000.0,1896000000.0,1784700000.0,409680000.0,2647400000.0,388520000.0,178930000.0,395760000.0,27648000000.0,194340000.0,8300200000.0,762720000.0,746600000.0,32824000.0,,638360000.0,263670000.0,27060000.0,507590000.0 
+2020_05_28_04_06_Q-Exactive-HF-X-Orbitrap_6070,,,97880000000.0,95044000.0,,,9774300.0,3554900000.0,1586400000.0,253890000.0,233510000.0,,170990000.0,4999600000.0,537550000.0,399670000.0,,1271500000.0,116580000.0,289310000.0,7881800000.0,90765000.0,1203000000.0,6504800000.0,1475500000.0,3983600000.0,195300000.0,2472400000.0,782280000.0,284330000.0,236290000.0,,23599000000.0,,70305000.0,784200000.0,99267000000.0,225750000.0,16965000000.0,349480000.0,2425400000.0,1120800000.0,12827000000.0,4144300000.0,1595100000.0,4815000000.0,295690000.0,171030000.0,3002400000.0,379640000.0,398300000.0,198880000.0,3983200000.0,1706700000.0,49442000.0,168570000.0,366750000.0,1768700000.0,2062400000.0,10556000000.0,671720000.0,1008900000.0,116350000.0,862970000.0,1024100000.0,107750000.0,,14704000000.0,,4172500000.0,,242520000.0,193880000.0,760050000.0,1740700000.0,36976000.0,4833900000.0,1238400000.0,233960000.0,9327300000.0,1131000000.0,607470000.0,664580000.0,,56546000.0,581670000.0,355810000.0,627430000.0,824300000.0,172750000.0,199610000.0,53767000.0,563760000.0,536750000.0,107870000.0,252790000.0,1270200000.0,,980150000.0,358070000.0,17186000000.0,4341100000.0,4431200000.0,,99839000.0,18230000000.0,,1077100000.0,19933000.0,74352000.0,1996800000.0,1124100000.0,,,128360000.0,543910000.0,68535000.0,1804500000.0,5278900000.0,1415600000.0,1418200000.0,2259900000.0,271170000.0,21083000000.0,596860000.0,72058000.0,342940000.0,3381800000.0,92104000.0,2404300000.0,82828000.0,206380000.0,1022900000.0,333790000.0,283760000.0,73763000.0,1011800000.0,43369000000.0,,330550000.0,411540000.0,464860000.0,474950000.0,10440000000.0,232720000.0,26957000.0,808810000.0,146200000.0,538630000.0,33314000.0,1815000000.0,365150000.0,460010000.0,312550000.0,1362000000.0,3893900000.0,1475000000.0,831160000.0,2991200000.0,1013100000.0,8702000000.0,,2295700000.0,497040000.0,477110000.0,1082800000.0,773430000.0,750920000.0,1008600000.0,1667500000.0,833620000.0,74973000.0,181150000.0,77421000.0,232440000.0,9601
2000.0,3326600000.0,1166500000.0,2215300000.0,,371260000.0,951870000.0,399330000.0,2235800000.0,324890000.0,8769600000.0,387560000.0,16574000000.0,6444200000.0,731330000.0,1670000000.0,442820000.0,30396000.0,5010800000.0,273750000.0,56080000.0,7925900000.0,3288300000.0,1325200000.0,,1948800000.0,927020000.0,2992300000.0,32015000.0,43765000000.0,217450000.0,168490000.0,542760000.0,583040000.0,,1223000000.0,416410000.0,8123600000.0,,1638700000.0,2642800000.0,,2956900000.0,6131700000.0,96807000.0,383590000.0,6871800000.0,660960000.0,465640000.0,53132000000.0,418310000.0,185440000.0,112840000.0,200330000.0,,,3137700000.0,621540000.0,579380000.0,3174000000.0,,312390000.0,1304700000.0,353530000.0,997680000.0,927200000.0,,3294500000.0,594940000.0,4072800000.0,,1232400000.0,47141000000.0,4702200000.0,42909000000.0,771850000.0,925010000.0,50306000.0,431930000.0,761780000.0,573080000.0,2228700000.0,17376000000.0,303890000.0,1537800000.0,1160100000.0,145620000.0,,15798000000.0,7772200000.0,47599000.0,300460000.0,638710000.0,1550700000.0,14722000000.0,2162100000.0,1650200000.0,54364000000.0,21212000000.0,2389300000.0,756540000.0,12306000000.0,5501300000.0,1727400000.0,287230000.0,1297200000.0,554110000.0,8110700000.0,2517600000.0,2210700000.0,176210000.0,99650000.0,675270000.0,2384600000.0,3968200000.0,2214100000.0,107490000.0,46260000000.0,159280000.0,2326100000.0,35157000000.0,,7427400000.0,213460000.0,592280000.0,7942800000.0,,504400000.0,2981800000.0,424740000.0,1116100000.0,,30710000000.0,894580000.0,209210000.0,152780000.0,349090000.0,918700000.0,1060400000.0,702710000.0,66811000.0,150990000.0,1351300000.0,37856000000.0,2014900000.0,6798100000.0,7583900000.0,2629900000.0,23788000000.0,4089000000.0,999040000.0,403690000.0,654000000.0,192400000.0,324100000.0,164880000.0,,381850000.0,1568100000.0,272730000.0,54730000000.0,3153500000.0,,273730000.0,10796000000.0,38074000000.0,,4810200000.0,1218000000.0,4204300000.0,964340000.0,6773200000.0,1182000000.0,1019200000.0,7639200000
.0,6947900000.0,,1073300000.0,1222400000.0,1940600000.0,12503000000.0,,47162000.0,524000000.0,416430000.0,524370000.0,797150000.0,,6782800000.0,324760000.0,787380000.0,3910300000.0,8292200000.0,6949200000.0,1261600000.0,,806370000.0,192060000.0,596100000.0,468290000.0,1139800000.0,,552110000.0,,3000000000.0,45350000.0,150660000.0,637830000.0,470370000.0,,6429400000.0,30848000000.0,1258300000.0,96334000.0,851670000.0,14862000000.0,,109160000.0,833730000.0,210600000.0,590260000.0,729180000.0,69893000.0,1007300000.0,1363200000.0,3706200000.0,278860000.0,2120200000.0,1128200000.0,2056200000.0,5274400000.0,1013600000.0,112380000.0,8295900000.0,547520000.0,528440000.0,260160000.0,1394800000.0,,20144000000.0,2254600000.0,,155160000.0,,,26649000.0,711420000.0,247040000.0,1974100000.0,17174000000.0,30641000.0,,2632000000.0,2542100000.0,1164700000.0,7673800000.0,905280000.0,22805000000.0,584380000.0,37686000000.0,389370000.0,2098100000.0,1598000000.0,344050000.0,2949200000.0,455320000.0,364610000.0,420620000.0,30031000000.0,124140000.0,10592000000.0,1504700000.0,625740000.0,39678000.0,,1400600000.0,350050000.0,,81160000.0 
+2020_06_01_10_22_Q-Exactive-HF-X-Orbitrap_6070,,,55890000000.0,66026000.0,,,136330000.0,1348100000.0,333940000.0,,,,,774230000.0,15708000.0,,14101000.0,223000000.0,17511000.0,,2738800000.0,33226000.0,265190000.0,1074000000.0,105470000.0,1038700000.0,,1419900000.0,101680000.0,149450000.0,201680000.0,,3226700000.0,46769000.0,256830000.0,70726000.0,26526000000.0,,2926400000.0,256470000.0,,221660000.0,3157300000.0,1436500000.0,447650000.0,1338600000.0,,13418000.0,,,23442000.0,73896000.0,248730000.0,1152900000.0,,26541000.0,9956800.0,192110000.0,150070000.0,1806000000.0,,,10494000.0,,,168760000.0,21014000.0,2450700000.0,579230000.0,392560000.0,,24083000.0,6160100.0,89911000.0,67495000.0,,145090000.0,78597000.0,11916000.0,2710000000.0,242730000.0,17036000.0,33516000.0,,406860000.0,38678000.0,,,,70205000.0,16756000.0,8879300.0,50488000.0,108430000.0,,24833000.0,169780000.0,,73379000.0,120240000.0,4216700000.0,978530000.0,,74870000.0,,6776500000.0,,321310000.0,,37883000.0,1722900000.0,,58164000.0,54581000.0,51071000.0,290560000.0,9367900.0,508110000.0,1069900000.0,109470000.0,289470000.0,72917000.0,165860000.0,1369600000.0,,39359000.0,129480000.0,697060000.0,,302960000.0,,44198000.0,93360000.0,15622000.0,,22107000.0,139290000.0,,355490000.0,,,,,2113400000.0,,,42548000.0,,105980000.0,,234350000.0,86918000.0,66350000.0,526930000.0,33086000.0,345890000.0,284960000.0,10174000.0,1879200000.0,18745000.0,2071400000.0,54665000.0,258190000.0,33332000.0,,,107370000.0,113240000.0,147700000.0,149250000.0,480610000.0,622070000.0,37678000.0,,57058000.0,67140000.0,2266000000.0,10759000.0,579520000.0,,7364100.0,38069000.0,87959000.0,154820000.0,,730270000.0,63078000.0,2011300000.0,2249300000.0,12072000.0,340160000.0,213220000.0,,1412700000.0,60432000.0,,674710000.0,758230000.0,374700000.0,,615360000.0,36316000.0,2107000000.0,,4793700000.0,,,245180000.0,83886000.0,,89246000.0,,2524700000.0,82512000.0,35134000.0,,1504500000.0,2406900000.0,1777400000.0,13530000.0,,641680000.0,,93636000.0,487
86000000.0,89755000.0,,55161000.0,,,38704000.0,1159100000.0,154210000.0,,,452070000.0,,537160000.0,13544000.0,,,,325880000.0,160020000.0,200950000.0,253720000.0,216480000.0,15458000000.0,223840000.0,8788500000.0,,,13790000.0,59369000.0,31543000.0,,639690000.0,10075000000.0,,149390000.0,489730000.0,,,4400100000.0,847190000.0,42061000.0,,249040000.0,72017000.0,4738600000.0,212800000.0,301520000.0,47335000000.0,4022100000.0,133350000.0,51544000.0,3334900000.0,1160000000.0,155570000.0,,155240000.0,75034000.0,2566100000.0,200550000.0,78491000.0,,,127890000.0,1805700000.0,892320000.0,664220000.0,17718000.0,12492000000.0,,232540000.0,3300700000.0,,975950000.0,,30277000.0,1823400000.0,59907000.0,,349990000.0,150680000.0,127350000.0,743800000.0,10079000000.0,75540000.0,,,20424000.0,,84682000.0,,,318940000.0,317440000.0,5059900000.0,247910000.0,,1246700000.0,605220000.0,9363400000.0,8443600000.0,,108050000.0,,32696000.0,60065000.0,56470000.0,50902000.0,,268120000.0,,11433000000.0,343410000.0,,,2012700000.0,5600500000.0,,2945700000.0,372320000.0,496220000.0,933480000.0,549400000.0,302800000.0,109320000.0,5145400000.0,1248500000.0,217080000.0,65650000.0,67745000.0,200580000.0,3878700000.0,,36956000.0,32036000.0,31526000.0,107250000.0,443910000.0,,2545400000.0,56806000.0,67039000.0,3310800000.0,1683200000.0,450970000.0,98504000.0,130670000.0,2960500000.0,1052700000.0,137210000.0,,241830000.0,,45750000.0,,291560000.0,25862000.0,,,469130000.0,2046600000.0,3024700000.0,11440000000.0,382140000.0,,46063000.0,10572000000.0,,118990000.0,48306000.0,90164000.0,,77368000.0,49055000.0,,257920000.0,108340000.0,,35836000.0,247030000.0,502560000.0,410450000.0,126280000.0,28494000.0,468510000.0,,,120000000.0,140270000.0,,18368000000.0,243280000.0,,,17227000.0,45668000.0,248170000.0,59534000.0,,96251000.0,7579300000.0,,143470000.0,154910000.0,117480000.0,,516370000.0,,11749000000.0,93313000.0,7579200000.0,,535680000.0,357460000.0,33023000.0,4538900000.0,70898000.0,31142000.0,,3462500000.0,,9870
00000.0,490890000.0,,,,68818000.0,25847000.0,39407000.0,35227000.0 +2020_06_01_15_41_Q-Exactive-HF-X-Orbitrap_6070,,54655000.0,91447000000.0,88627000.0,67107000.0,,432280000.0,2003200000.0,1809000000.0,,,113970000.0,61425000.0,1187900000.0,74170000.0,23177000.0,22471000.0,191420000.0,54614000.0,93136000.0,3666400000.0,93590000.0,,2460700000.0,444540000.0,2799600000.0,24352000.0,2504000000.0,672920000.0,194690000.0,330060000.0,15771000.0,8175400000.0,55135000.0,572860000.0,300900000.0,49982000000.0,30691000.0,3745500000.0,133420000.0,118620000.0,630460000.0,3931000000.0,2685300000.0,781040000.0,2474900000.0,,,,,,27344000000.0,465120000.0,2540700000.0,,,,354080000.0,53343000.0,8741500000.0,,86055000.0,9706700.0,,160120000.0,451590000.0,,5561500000.0,673270000.0,1012500000.0,,,,184470000.0,,80588000.0,281420000.0,81121000.0,,3008000000.0,274610000.0,,,,384310000.0,69754000.0,,62063000.0,,152950000.0,,,,67785000.0,,434230000.0,158540000.0,,460170000.0,214760000.0,7158600000.0,1595800000.0,934630000.0,179700000.0,,10521000000.0,,,,115820000.0,4423700000.0,,74640000.0,66143000.0,89312000.0,,,1207200000.0,640700000.0,300880000.0,886220000.0,578980000.0,254810000.0,2833200000.0,,85345000.0,,1645500000.0,,456900000.0,,63657000.0,204510000.0,26018000.0,,122580000.0,265070000.0,22577000000.0,825990000.0,35656000.0,29577000.0,,57637000.0,3318900000.0,,,190920000.0,67961000.0,45050000.0,,556100000.0,,84590000.0,667910000.0,674820000.0,353850000.0,285440000.0,71969000.0,3912400000.0,22163000.0,3571000000.0,,362770000.0,222850000.0,40673000.0,164480000.0,159210000.0,205210000.0,46317000.0,209910000.0,1054600000.0,751010000.0,,,82409000.0,,3312200000.0,254240000.0,751260000.0,103210000.0,25285000.0,37017000.0,,450890000.0,,1352500000.0,78802000.0,3504800000.0,3821000000.0,216780000.0,361780000.0,464200000.0,,2053800000.0,107420000.0,,2814500000.0,1145300000.0,1091700000.0,,1175100000.0,105800000.0,2566200000.0,,6833900000.0,103460000.0,,383310000.0,35300000.0,,118010000.0,,59149000
00.0,,134170000.0,,2293400000.0,3403900000.0,3682300000.0,100520000.0,,1022300000.0,,154840000.0,57897000000.0,111310000.0,,14658000.0,,,245800000.0,1608400000.0,462900000.0,,,697120000.0,,1042500000.0,34572000.0,61133000.0,11907000.0,,1374200000.0,351910000.0,521210000.0,365140000.0,510680000.0,33721000000.0,704310000.0,21510000000.0,57803000.0,,,77518000.0,,119990000.0,959620000.0,18065000000.0,,406460000.0,758590000.0,,,8470300000.0,2063500000.0,62815000.0,,,164290000.0,9705600000.0,796180000.0,408980000.0,80951000000.0,6573500000.0,258120000.0,,6598500000.0,2994300000.0,283620000.0,,446850000.0,191870000.0,4196700000.0,117810000.0,,,,282020000.0,2194200000.0,901790000.0,717750000.0,32743000.0,29178000000.0,,217480000.0,7467000000.0,,2340800000.0,,37414000.0,3238000000.0,171310000.0,,277440000.0,611840000.0,214670000.0,1345700000.0,19914000000.0,193470000.0,,,72760000.0,,110120000.0,219090000.0,,436480000.0,890290000.0,16739000000.0,735690000.0,,3664400000.0,1585900000.0,9266600000.0,15962000000.0,,231950000.0,,92825000.0,131590000.0,121790000.0,166280000.0,,552920000.0,109560000.0,23822000000.0,1064000000.0,,,3331800000.0,10382000000.0,,3906900000.0,409320000.0,939440000.0,1112800000.0,1200200000.0,591440000.0,267420000.0,6823100000.0,1505300000.0,458900000.0,89570000.0,791220000.0,537760000.0,7345900000.0,,336280000.0,55059000.0,83858000.0,129240000.0,985200000.0,,6535300000.0,84085000.0,202090000.0,4354100000.0,3636600000.0,1135300000.0,,379980000.0,5021600000.0,1625900000.0,353730000.0,,387350000.0,,,88513000.0,683060000.0,28750000.0,,,213550000.0,,2645900000.0,16008000000.0,612290000.0,,240670000.0,15447000000.0,,143620000.0,85581000.0,141510000.0,,448200000.0,,34153000.0,675760000.0,462530000.0,,126840000.0,308450000.0,930350000.0,555180000.0,246580000.0,,763020000.0,350890000.0,31272000.0,153200000.0,337310000.0,,24099000000.0,233100000.0,172810000.0,,,93631000.0,830770000.0,192600000.0,93967000.0,321910000.0,11947000000.0,,,251150000.0,417060000.0,1518400
00.0,1123400000.0,,23619000000.0,75746000.0,9311700000.0,,142750000.0,1340200000.0,276920000.0,6410000000.0,101930000.0,,,6104800000.0,,1910800000.0,1441300000.0,201690000.0,,993580000.0,136270000.0,,67984000.0,139900000.0 +2020_06_02_09_41_Q-Exactive-HF-X-Orbitrap_6070,,,54795000000.0,505350000.0,,,,1422800000.0,611290000.0,112210000.0,77665000.0,,154770000.0,1418800000.0,252810000.0,158600000.0,,567720000.0,,,2686500000.0,,,2941200000.0,631590000.0,1514900000.0,,1049100000.0,166030000.0,86645000.0,159910000.0,97610000.0,7707400000.0,,46104000.0,259910000.0,45412000000.0,56571000.0,6773800000.0,,899030000.0,308030000.0,4596700000.0,1517500000.0,661300000.0,2436800000.0,166120000.0,124390000.0,1371900000.0,206270000.0,67845000.0,,1191700000.0,793560000.0,,66906000.0,155990000.0,1358800000.0,600570000.0,5935400000.0,221630000.0,239810000.0,,207150000.0,,,47434000.0,6237200000.0,,1346800000.0,145400000.0,71843000.0,13672000.0,339080000.0,446410000.0,,2550000000.0,450820000.0,201960000.0,4131700000.0,487220000.0,185410000.0,442290000.0,,,120430000.0,150590000.0,273140000.0,341760000.0,63893000.0,81216000.0,35455000.0,139960000.0,169770000.0,12002000.0,77102000.0,340140000.0,31760000.0,413620000.0,139110000.0,6530400000.0,2394800000.0,1778000000.0,,35376000.0,11337000000.0,,575550000.0,,88353000.0,928010000.0,564840000.0,,,,103780000.0,,418330000.0,1735300000.0,182630000.0,850280000.0,1011200000.0,158620000.0,3305500000.0,191130000.0,,49056000.0,1482400000.0,,678560000.0,66879000.0,105160000.0,139160000.0,29009000.0,46447000.0,8382700.0,345970000.0,13084000000.0,,39240000.0,1522600000.0,691590000.0,102740000.0,4334800000.0,30596000.0,17413000.0,174660000.0,31880000.0,152250000.0,,647800000.0,,119860000.0,80722000.0,484990000.0,1472700000.0,436810000.0,207080000.0,1435100000.0,346070000.0,3700600000.0,,679000000.0,185970000.0,139320000.0,248950000.0,375050000.0,266060000.0,295540000.0,1185000000.0,368650000.0,146270000.0,25266000.0,28334000.0,48993000.0,,1766100000.0,228
380000.0,716300000.0,,211090000.0,272370000.0,368770000.0,887160000.0,,4544900000.0,95955000.0,5551200000.0,1991800000.0,140140000.0,518930000.0,351720000.0,,2057900000.0,47339000.0,,3775000000.0,1190900000.0,473080000.0,,791520000.0,317410000.0,1366700000.0,17431000.0,19727000000.0,87242000.0,300830000.0,229720000.0,117020000.0,,204210000.0,32253000.0,3385000000.0,,579770000.0,1096900000.0,,1695600000.0,2296000000.0,65880000.0,190270000.0,2432200000.0,168370000.0,87852000.0,32616000000.0,135580000.0,32068000.0,50445000.0,,,,1013200000.0,191650000.0,149300000.0,,,100610000.0,571850000.0,139910000.0,260800000.0,359110000.0,,1619800000.0,128320000.0,732750000.0,,312010000.0,18393000000.0,2322500000.0,16502000000.0,289710000.0,187210000.0,,279280000.0,491830000.0,211740000.0,1227500000.0,7210600000.0,46571000.0,322460000.0,761930000.0,41030000.0,11656000.0,6148300000.0,3263100000.0,,58675000.0,256410000.0,445520000.0,7725400000.0,667910000.0,783490000.0,28803000000.0,8431000000.0,822570000.0,301180000.0,4292600000.0,2447500000.0,665450000.0,,379840000.0,317070000.0,3712100000.0,736290000.0,976500000.0,67860000.0,,340230000.0,687760000.0,1742200000.0,756040000.0,33531000.0,19729000000.0,,1102100000.0,13414000000.0,,3696900000.0,18533000.0,63805000.0,3401300000.0,56421000.0,115990000.0,1013400000.0,292290000.0,341020000.0,,,342070000.0,108570000.0,40504000.0,36543000.0,328370000.0,244230000.0,234160000.0,26040000.0,57531000.0,526260000.0,11838000000.0,508580000.0,590530000.0,3672600000.0,1021100000.0,8795800000.0,,294560000.0,95530000.0,328490000.0,,131850000.0,,,121930000.0,517180000.0,285310000.0,21658000000.0,1165500000.0,131530000.0,66890000.0,2920200000.0,15280000000.0,,2148800000.0,392260000.0,1362500000.0,731290000.0,2627100000.0,624790000.0,468200000.0,2684800000.0,2624000000.0,,323760000.0,545850000.0,600800000.0,4989800000.0,,,195110000.0,55007000.0,71862000.0,375480000.0,,3254800000.0,65776000.0,467850000.0,2090900000.0,3522500000.0,2668100000.0,523770000.0,,6
62880000.0,364060000.0,159080000.0,233300000.0,417000000.0,,,,1302500000.0,20339000.0,71377000.0,185190000.0,318070000.0,,3192700000.0,13410000000.0,634830000.0,,203680000.0,7460000000.0,,,289120000.0,90977000.0,181540000.0,141860000.0,,282790000.0,659420000.0,1637000000.0,67166000.0,884350000.0,616300000.0,1144600000.0,2002200000.0,286780000.0,129810000.0,3445500000.0,299110000.0,105040000.0,,414140000.0,,15866000000.0,1173000000.0,,13291000.0,,,,198080000.0,69451000.0,528950000.0,7501200000.0,18721000.0,,707700000.0,841990000.0,,4144600000.0,319360000.0,11734000000.0,216610000.0,15484000000.0,72631000.0,1031500000.0,993740000.0,242940000.0,703540000.0,207100000.0,,153170000.0,11233000000.0,73347000.0,4184000000.0,293420000.0,135630000.0,42044000.0,,,52246000.0,,121840000.0 diff --git a/project/erda_00_maxquant_file_reader.ipynb b/project/erda_00_maxquant_file_reader.ipynb deleted file mode 100644 index 26b0e9ac8..000000000 --- a/project/erda_00_maxquant_file_reader.ipynb +++ /dev/null @@ -1,527 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "# MaxQuant (MQ) Output-Files\n", - "\n", - "Compare a single experiment\n", - "\n", - "Files compared:\n", - "1. `Summary.txt`\n", - "2. `mqpar.xml`\n", - "3. `peptides.txt`\n", - "4. `proteins.txt`\n", - "\n", - "There is are many files more, where several files seem to be available in several times in different formats." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import sys\n", - "import logging\n", - "from pathlib import Path\n", - "import random\n", - "from tqdm.notebook import tqdm\n", - "\n", - "import pandas as pd\n", - "import ipywidgets as widgets\n", - "\n", - "from vaep.io import PathsList\n", - "from vaep.io.mq import MaxQuantOutputDynamic\n", - "from vaep.io.mq import ExtractFromPeptidesTxt\n", - "import vaep.io.mq as mq\n", - "\n", - "\n", - "from src.file_utils import load_summary, load_mqpar_xml\n", - "from vaep.logging import setup_logger_w_file\n", - "\n", - "##################\n", - "##### CONFIG #####\n", - "##################\n", - "from config import FOLDER_MQ_TXT_DATA, FOLDER_PROCESSED\n", - "from config import FOLDER_KEY # defines how filenames are parsed for use as indices\n", - "\n", - "from config import FOLDER_DATA # project folder for storing the data\n", - "print(f\"Search Raw-Files on path: {FOLDER_MQ_TXT_DATA}\")\n", - "\n", - "##################\n", - "### Logging ######\n", - "##################\n", - "\n", - "#Delete Jupyter notebook root logger handler\n", - "root_logger = logging.getLogger()\n", - "root_logger.handlers = []\n", - "\n", - "logger = logging.getLogger('vaep')\n", - "logger = setup_logger_w_file(logger, fname_base='log_00_maxquant_file_reader')\n", - "\n", - "logger.info('Start with handlers: \\n' + \"\\n\".join(f\"- {repr(log_)}\" for log_ in logger.handlers))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "folders = [folder for folder in Path(FOLDER_MQ_TXT_DATA).iterdir() if folder.is_dir()]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "folders_dict = {folder.name: folder for folder in sorted(folders) }\n", - "assert len(folders_dict) == len(folders), \"Non unique file names\"" - ] - }, - { - "cell_type": 
"code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "# w_file = widgets.Dropdown(options=[folder for folder in folders], description='View files')\n", - "w_file = widgets.Dropdown(options=folders_dict, description='View files')\n", - "w_file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mq_output = MaxQuantOutputDynamic(w_file.value)\n", - "mq_output" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Results will be saved in a subfolder under `vaep/project/data` using the name of the specified input-folder per default. Change to your liking:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> Go to the block you are interested in!" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "## MQ Summary files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mq_output.summary.iloc[0].to_dict()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "### File Handler\n", - "\n", - "- dictionary of run name to run output folder\n", - "- find class with expected output folders" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "# load_summary??" 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "### Summaries\n", - "\n", - "- aggregated in `vaep/project/erda_01_mq_aggregate_summaries.ipynb` \n", - " - file selection based on summaries for further analysis thereafter" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# paths_summaries = [str(folder / 'summary.txt') for folder in folders_dict.values()]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "# # if paths_summaries.files:\n", - "# if folders_dict:\n", - "# # df, names, failed = process_files(handler_fct=load_summary, filepaths=paths_summaries.files, key=FOLDER_KEY, relative_to='paths_summaries.folder')\n", - "# df, names, failed = process_files(handler_fct=load_summary, filepaths=paths_summaries, key=FOLDER_KEY, relative_to=None)\n", - "# df.columns = names\n", - "# print(f\"Number of failed reads: {len(failed)}\")\n", - "# display(df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# # if paths_summaries.files:\n", - "# if paths_summaries:\n", - "# df.to_csv(os.path.join(FOLDER_PROCESSED, 'all_summary_txt.csv'))\n", - "# df.to_pickle(os.path.join(FOLDER_PROCESSED, 'all_summary_txt.pkl'))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- SIL - MS2 based on precursor which was a set of peaks\n", - "- PEAK - MS2 scan based on a single peak on precursor spectrum\n", - "- ISO - isotopic pattern detection\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# # if paths_summaries.files:\n", - "# if paths_summaries:\n", - "# MS_spectra = df.loc[['MS', 'MS/MS Identified']].T.astype('int64')\n", - "# mask = MS_spectra['MS/MS Identified'] > 0\n", - "# display(MS_spectra.loc[mask].describe())\n", - 
"# MS_spectra.to_csv(os.path.join(FOLDER_PROCESSED, 'overview_stats.csv'))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "## MaxQuant Parameter File\n", - "\n", - "- partly in a separate subfolder\n", - "- mainly in run folders\n", - "- rebase on folders_dictionary (check for `.xml` files in all folders)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mqpar_files = (Path(FOLDER_DATA) / 'mqpar_files')\n", - "\n", - "mqpar_files = [file for file in mqpar_files.iterdir() if file.suffix == '.xml']\n", - "len(mqpar_files) # nested search needed" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "w_file = widgets.Dropdown(options=mqpar_files, description='Select a file')\n", - "w_file" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "### Parameter Files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "fname_mqpar_xml = os.path.join(FOLDER_PROCESSED, 'peptide_intensities.{}')\n", - "\n", - "d_mqpar = dict()\n", - "for file in tqdm(mqpar_files):\n", - " d_mqpar[file.stem] = load_mqpar_xml(file)['MaxQuantParams']\n", - " \n", - "df_mqpar = pd.DataFrame(d_mqpar.values() , index=d_mqpar.keys()).convert_dtypes()\n", - "df_mqpar" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The number of threads used might differ" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_mqpar['numThreads'].value_counts()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The parameter files would need further parsing, which is skipped for now:\n", - " - `OrderedDict` would need to be flattend\n", - " - in the example below, it is 
not easy to see how entries should be easily combined\n", - " (list of `OrderedDict`s where only the `fastaFilePath` is different)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_mqpar.iloc[0].loc['fastaFiles']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "in order to see if there are different setting based on the string columns, drop duplicates \n", - "\n", - "- only one should remain" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_mqpar.select_dtypes('string').drop('numThreads', axis=1).drop_duplicates()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "## Peptides\n", - "\n", - "- peptides combined (combining different charged states): `peptides`\n", - "- single peptides (with differing charges): `evidence`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "pd.set_option('max_columns', 60)\n", - "\n", - "# mq_output = MaxQuantOutputDynamic(\n", - "# folder=folders[random.randint(0, len(paths_peptides.files)-1)])\n", - "mq_output.peptides" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mq_output.evidence" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mq_output.peptides.Intensity # as is in peptides.txt, comma seperated thousands" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "### Create peptide intensity dumps for each MQ outputfolder\n", - "\n", - "- idea was: dump peptides found for each (unique) gene\n", - " - creates a `json` file for each gene with the gene contained\n", - "\n", - "- decision: discard\n", - " - rather dump peptide information per sample. 
Mapping of peptides to gene can be done\n", - " using the fasta file on the pytorch level." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# folders[:10]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check if the output folder contains already parsed files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# import json\n", - "\n", - "# with open(src.config.FN_FASTA_DB) as f:\n", - "# data_fasta = json.load(f)\n", - "# print(f'Number of proteins in fasta file DB: {len(data_fasta)}')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# %%time\n", - "# FOLDER_PEP_PER_GENE = Path(FOLDER_PROCESSED) / 'agg_peptides_per_gene'\n", - "# FOLDER_PEP_PER_GENE.mkdir(parents=True, exist_ok=True)\n", - "# set_previously_loaded = {folder.name for folder in FOLDER_PEP_PER_GENE.iterdir()}\n", - "\n", - "# FORCE = True\n", - "\n", - "# for folder in folders:\n", - "# if folder.name in set_previously_loaded and not FORCE and (folder / '0_completness_all_genes.json').exists():\n", - "# pass\n", - "# else:\n", - "# logger.info(f'\\n\\nProcess: {folder.name}')\n", - "# mq_output = MaxQuantOutputDynamic(folder)\n", - "# peptide_extractor = ExtractFromPeptidesTxt(\n", - "# out_folder=FOLDER_PEP_PER_GENE, mq_output_object=mq_output, fasta_db=data_fasta)\n", - "# completeness_per_gene = peptide_extractor()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "## Theoretial Peptides from used fasta-file\n", - "\n", - "> `01_explore_FASTA.ipynb` (formely `misc_FASTA_tryptic_digest.ipynb`)\n", - "\n", - "- check if peptides are part of theoretical peptides" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - 
"file_extension": ".py", - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.15" - }, - "mimetype": "text/x-python", - "name": "python", - "npconvert_exporter": "python", - "pygments_lexer": "ipython3", - "toc-autonumbering": true, - "version": 3, - "vscode": { - "interpreter": { - "hash": "79d0f0394ff693752da6f78eb84feea9ce495e5d1d56e189f7fad91f86783599" - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/project/erda_00_maxquant_file_reader.py b/project/erda_00_maxquant_file_reader.py deleted file mode 100644 index f441a31ac..000000000 --- a/project/erda_00_maxquant_file_reader.py +++ /dev/null @@ -1,265 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.14.5 -# kernelspec: -# display_name: Python 3 (ipykernel) -# language: python -# name: python3 -# --- - -# %% [markdown] Collapsed="false" -# # MaxQuant (MQ) Output-Files -# -# Compare a single experiment -# -# Files compared: -# 1. `Summary.txt` -# 2. `mqpar.xml` -# 3. `peptides.txt` -# 4. `proteins.txt` -# -# There is are many files more, where several files seem to be available in several times in different formats. 
- -# %% -import os -import sys -import logging -from pathlib import Path -import random -from tqdm.notebook import tqdm - -import pandas as pd -import ipywidgets as widgets - -from vaep.io import PathsList -from vaep.io.mq import MaxQuantOutputDynamic -from vaep.io.mq import ExtractFromPeptidesTxt -import vaep.io.mq as mq - - -from src.file_utils import load_summary, load_mqpar_xml -from vaep.logging import setup_logger_w_file - -################## -##### CONFIG ##### -################## -from config import FOLDER_MQ_TXT_DATA, FOLDER_PROCESSED -from config import FOLDER_KEY # defines how filenames are parsed for use as indices - -from config import FOLDER_DATA # project folder for storing the data -print(f"Search Raw-Files on path: {FOLDER_MQ_TXT_DATA}") - -################## -### Logging ###### -################## - -#Delete Jupyter notebook root logger handler -root_logger = logging.getLogger() -root_logger.handlers = [] - -logger = logging.getLogger('vaep') -logger = setup_logger_w_file(logger, fname_base='log_00_maxquant_file_reader') - -logger.info('Start with handlers: \n' + "\n".join(f"- {repr(log_)}" for log_ in logger.handlers)) - -# %% -folders = [folder for folder in Path(FOLDER_MQ_TXT_DATA).iterdir() if folder.is_dir()] - -# %% -folders_dict = {folder.name: folder for folder in sorted(folders) } -assert len(folders_dict) == len(folders), "Non unique file names" - -# %% Collapsed="false" -# w_file = widgets.Dropdown(options=[folder for folder in folders], description='View files') -w_file = widgets.Dropdown(options=folders_dict, description='View files') -w_file - -# %% -mq_output = MaxQuantOutputDynamic(w_file.value) -mq_output - -# %% [markdown] -# Results will be saved in a subfolder under `vaep/project/data` using the name of the specified input-folder per default. Change to your liking: - -# %% [markdown] -# > Go to the block you are interested in! 
- -# %% [markdown] Collapsed="false" -# ## MQ Summary files - -# %% -mq_output.summary.iloc[0].to_dict() - -# %% [markdown] Collapsed="false" -# ### File Handler -# -# - dictionary of run name to run output folder -# - find class with expected output folders - -# %% Collapsed="false" -# # load_summary?? - -# %% [markdown] Collapsed="false" -# ### Summaries -# -# - aggregated in `vaep/project/erda_01_mq_aggregate_summaries.ipynb` -# - file selection based on summaries for further analysis thereafter - -# %% -# paths_summaries = [str(folder / 'summary.txt') for folder in folders_dict.values()] - -# %% Collapsed="false" -# # if paths_summaries.files: -# if folders_dict: -# # df, names, failed = process_files(handler_fct=load_summary, filepaths=paths_summaries.files, key=FOLDER_KEY, relative_to='paths_summaries.folder') -# df, names, failed = process_files(handler_fct=load_summary, filepaths=paths_summaries, key=FOLDER_KEY, relative_to=None) -# df.columns = names -# print(f"Number of failed reads: {len(failed)}") -# display(df) - -# %% -# # if paths_summaries.files: -# if paths_summaries: -# df.to_csv(os.path.join(FOLDER_PROCESSED, 'all_summary_txt.csv')) -# df.to_pickle(os.path.join(FOLDER_PROCESSED, 'all_summary_txt.pkl')) - -# %% [markdown] -# - SIL - MS2 based on precursor which was a set of peaks -# - PEAK - MS2 scan based on a single peak on precursor spectrum -# - ISO - isotopic pattern detection -# - -# %% -# # if paths_summaries.files: -# if paths_summaries: -# MS_spectra = df.loc[['MS', 'MS/MS Identified']].T.astype('int64') -# mask = MS_spectra['MS/MS Identified'] > 0 -# display(MS_spectra.loc[mask].describe()) -# MS_spectra.to_csv(os.path.join(FOLDER_PROCESSED, 'overview_stats.csv')) - -# %% [markdown] Collapsed="false" -# ## MaxQuant Parameter File -# -# - partly in a separate subfolder -# - mainly in run folders -# - rebase on folders_dictionary (check for `.xml` files in all folders) - -# %% -mqpar_files = (Path(FOLDER_DATA) / 'mqpar_files') - 
-mqpar_files = [file for file in mqpar_files.iterdir() if file.suffix == '.xml'] -len(mqpar_files) # nested search needed - -# %% Collapsed="false" -w_file = widgets.Dropdown(options=mqpar_files, description='Select a file') -w_file - -# %% [markdown] Collapsed="false" -# ### Parameter Files - -# %% Collapsed="false" -fname_mqpar_xml = os.path.join(FOLDER_PROCESSED, 'peptide_intensities.{}') - -d_mqpar = dict() -for file in tqdm(mqpar_files): - d_mqpar[file.stem] = load_mqpar_xml(file)['MaxQuantParams'] - -df_mqpar = pd.DataFrame(d_mqpar.values() , index=d_mqpar.keys()).convert_dtypes() -df_mqpar - -# %% [markdown] -# The number of threads used might differ - -# %% -df_mqpar['numThreads'].value_counts() - -# %% [markdown] -# The parameter files would need further parsing, which is skipped for now: -# - `OrderedDict` would need to be flattend -# - in the example below, it is not easy to see how entries should be easily combined -# (list of `OrderedDict`s where only the `fastaFilePath` is different) - -# %% -df_mqpar.iloc[0].loc['fastaFiles'] - -# %% [markdown] -# in order to see if there are different setting based on the string columns, drop duplicates -# -# - only one should remain - -# %% -df_mqpar.select_dtypes('string').drop('numThreads', axis=1).drop_duplicates() - -# %% [markdown] Collapsed="false" -# ## Peptides -# -# - peptides combined (combining different charged states): `peptides` -# - single peptides (with differing charges): `evidence` - -# %% Collapsed="false" -pd.set_option('max_columns', 60) - -# mq_output = MaxQuantOutputDynamic( -# folder=folders[random.randint(0, len(paths_peptides.files)-1)]) -mq_output.peptides - -# %% -mq_output.evidence - -# %% -mq_output.peptides.Intensity # as is in peptides.txt, comma seperated thousands - -# %% [markdown] Collapsed="false" -# ### Create peptide intensity dumps for each MQ outputfolder -# -# - idea was: dump peptides found for each (unique) gene -# - creates a `json` file for each gene with the gene 
contained -# -# - decision: discard -# - rather dump peptide information per sample. Mapping of peptides to gene can be done -# using the fasta file on the pytorch level. - -# %% -# folders[:10] - -# %% [markdown] -# Check if the output folder contains already parsed files - -# %% -# import json - -# with open(src.config.FN_FASTA_DB) as f: -# data_fasta = json.load(f) -# print(f'Number of proteins in fasta file DB: {len(data_fasta)}') - -# %% -# # %%time -# FOLDER_PEP_PER_GENE = Path(FOLDER_PROCESSED) / 'agg_peptides_per_gene' -# FOLDER_PEP_PER_GENE.mkdir(parents=True, exist_ok=True) -# set_previously_loaded = {folder.name for folder in FOLDER_PEP_PER_GENE.iterdir()} - -# FORCE = True - -# for folder in folders: -# if folder.name in set_previously_loaded and not FORCE and (folder / '0_completness_all_genes.json').exists(): -# pass -# else: -# logger.info(f'\n\nProcess: {folder.name}') -# mq_output = MaxQuantOutputDynamic(folder) -# peptide_extractor = ExtractFromPeptidesTxt( -# out_folder=FOLDER_PEP_PER_GENE, mq_output_object=mq_output, fasta_db=data_fasta) -# completeness_per_gene = peptide_extractor() - -# %% [markdown] Collapsed="false" -# ## Theoretial Peptides from used fasta-file -# -# > `01_explore_FASTA.ipynb` (formely `misc_FASTA_tryptic_digest.ipynb`) -# -# - check if peptides are part of theoretical peptides - -# %% Collapsed="false" diff --git a/project/erda_01_mq_select_runs.ipynb b/project/erda_01_mq_select_runs.ipynb deleted file mode 100644 index af7dd493a..000000000 --- a/project/erda_01_mq_select_runs.ipynb +++ /dev/null @@ -1,410 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "# MaxQuant (MQ) Output-Files\n", - "\n", - "Files compared:\n", - "1. `Summary.txt`\n", - "2. `mqpar.xml`\n", - "3. `peptides.txt`\n", - "4. `proteins.txt`\n", - "\n", - "There is are many files more, where several files seem to be available in several times in different formats." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "import logging\n", - "from pathlib import Path, PurePosixPath\n", - "import yaml\n", - "import random\n", - "\n", - "##################\n", - "### Logging ######\n", - "##################\n", - "\n", - "# Setup logging in notebooks\n", - "from vaep.logging import setup_nb_logger\n", - "setup_nb_logger()\n", - "logger = logging.getLogger()\n", - "\n", - "logging.info('Start with handlers: \\n' + \"\\n\".join(f\"- {repr(log_)}\" for log_ in logger.handlers))\n", - "\n", - "### Other imports\n", - "\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import pandas as pd\n", - "import ipywidgets as widgets\n", - "\n", - "from vaep.io.mq import MaxQuantOutputDynamic\n", - "from vaep import plotting\n", - "\n", - "from vaep.io import data_objects\n", - "from vaep.io.data_objects import MqAllSummaries \n", - "\n", - "##################\n", - "##### CONFIG #####\n", - "##################\n", - "import config\n", - "from config import FOLDER_MQ_TXT_DATA, FOLDER_PROCESSED\n", - "\n", - "ELIGABLE_FILES_YAML = Path('config/eligable_files.yaml')\n", - "MAP_FOLDER_PATH = Path('config/file_paths')\n", - "FPATH_ALL_SUMMARIES = FOLDER_PROCESSED / 'all_summaries.json'\n", - "FN_RAWFILE_METADATA = 'data/rawfile_metadata.csv'\n", - "\n", - "from config import FOLDER_DATA # project folder for storing the data\n", - "logger.info(f\"Search Raw-Files on path: {FOLDER_MQ_TXT_DATA}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "folders = [folder for folder in Path(FOLDER_MQ_TXT_DATA).iterdir() if folder.is_dir() and not folder.name.startswith('.')]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "folders_dict = {folder.name: folder for folder in 
sorted(folders)}\n", - "assert len(folders_dict) == len(folders), \"Non unique file names\"\n", - "\n", - "with open(MAP_FOLDER_PATH, 'w') as f:\n", - " yaml.dump({ k: str(PurePosixPath(v)) for k, v in folders_dict.items()} , f)\n", - "logger.info(f\"Save map of file names to file paths to: {str(MAP_FOLDER_PATH)}\")\n", - "\n", - "# w_file = widgets.Dropdown(options=[folder for folder in folders], description='View files')\n", - "w_file = widgets.Dropdown(options=folders_dict, description='View files')\n", - "w_file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mq_output = MaxQuantOutputDynamic(w_file.value)\n", - "mq_output" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(f\"Results will be saved in subfolders in\\n\\t{str(FOLDER_PROCESSED.absolute())}\"\n", - " \"\\nusing the name of the specified input-folder per default. Change to your liking.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> Go to the block you are interested in!" 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "### Summaries Data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "pd.options.display.max_columns = 49\n", - "mq_all_summaries = MqAllSummaries(FPATH_ALL_SUMMARIES)\n", - "mq_all_summaries.load_new_samples(folders=folders)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "if mq_all_summaries.empty_folders:\n", - " print(mq_all_summaries.empty_folders)\n", - " with open('log_empty_folder.txt', 'a') as f:\n", - " f.writelines(mq_all_summaries.empty_folders)\n", - "print(f\"In total processed: {len(mq_all_summaries):5}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "pd.options.display.max_columns = len(mq_all_summaries.df.columns)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mq_all_summaries.df.info()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- SIL - MS2 based on precursor which was a set of peaks\n", - "- PEAK - MS2 scan based on a single peak on precursor spectrum\n", - "- ISO - isotopic pattern detection\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class col_summary:\n", - " MS1 = 'MS'\n", - " MS2 = 'MS/MS' \n", - " MS2_identified = 'MS/MS Identified'\n", - " peptides_identified = 'Peptide Sequences Identified' # 'peptides.txt' should have this number of peptides\n", - "\n", - "df = mq_all_summaries.df\n", - "if df is not None:\n", - " MS_spectra = df[[col_summary.MS1, col_summary.MS2, col_summary.MS2_identified, col_summary.peptides_identified]]\n", - "\n", - " def compute_summary(threshold_identified):\n", - " mask = 
MS_spectra[col_summary.peptides_identified] >= threshold_identified\n", - " display(MS_spectra.loc[mask].describe(np.linspace(0.05, 0.95, 10)))\n", - " \n", - " w_ions_range = widgets.IntSlider(value=15_000, min=.0, max=MS_spectra[col_summary.peptides_identified].max())\n", - " display(widgets.interactive(compute_summary, threshold_identified=w_ions_range))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mask = MS_spectra[col_summary.peptides_identified] >= w_ions_range.value\n", - "logger.warning(f\"Save {mask.sum()} file names to configuration file of selected samples: \"\n", - "f\"{ELIGABLE_FILES_YAML} \"\n", - "f\"based on a minimum of {w_ions_range.value} peptides.\")\n", - "idx_selected = MS_spectra.loc[mask].index\n", - "MS_spectra.loc[idx_selected]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Select Date Range\n", - "\n", - "- based on metadata" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_meta_rawfiles = pd.read_csv(FN_RAWFILE_METADATA, header=[0, 1], index_col=0)\n", - "date_col = ('FileProperties', 'Content Creation Date')\n", - "df_meta_rawfiles[date_col] = pd.to_datetime(\n", - " df_meta_rawfiles[date_col])\n", - "df_meta_rawfiles = df_meta_rawfiles.loc[idx_selected]\n", - "df_meta_rawfiles.sort_values(date_col, inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "w_date_range = widgets.SelectionRangeSlider(options=df_meta_rawfiles[date_col], value=[min(df_meta_rawfiles[date_col]),max(df_meta_rawfiles[date_col]) ] )\n", - "\n", - "def show(range):\n", - " mask = df_meta_rawfiles[date_col].between(*range)\n", - " df_view = MS_spectra.loc[idx_selected].loc[mask]\n", - " display(df_view)\n", - "\n", - "\n", - "int_date_range = widgets.interactive(show, range=w_date_range)\n", - "display(int_date_range)" - ] 
- }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mask = df_meta_rawfiles[date_col].between(*w_date_range.value)\n", - "idx_selected = mask.loc[mask].index\n", - "idx_selected" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Write out selected, eligable files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with open(ELIGABLE_FILES_YAML, 'w') as f:\n", - " yaml.dump(data={'files': idx_selected.to_list()}, stream=f)\n", - "logger.info(f\"Dumped yaml file with eligable files under key 'files' to {str(ELIGABLE_FILES_YAML)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Plot number of samples\n", - "\n", - "- binned by 10k steps" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "_max = MS_spectra[col_summary.peptides_identified].max() + 10_001\n", - "fig, ax = plt.subplots(figsize=(10,10))\n", - "_ = MS_spectra[col_summary.peptides_identified].hist(\n", - " bins=range(0,_max, 10_000),\n", - " legend=True,\n", - " ax = ax)\n", - "fig.suptitle('Number of samples, binned in 10K steps.')\n", - "fig.tight_layout()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "MS_spectra[col_summary.peptides_identified].mean(), MS_spectra[col_summary.peptides_identified].std() # including folders with 0 identified peptides" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def calc_cutoff(threshold=1):\n", - " s = MS_spectra[col_summary.peptides_identified]\n", - " mask = s >= threshold\n", - " s = s.loc[mask]\n", - " display(f\"Threshold selected (inclusive): {threshold} \")\n", - " display(f\"mean: {s.mean():.2f}, std-dev: {s.std():.2f}\")\n", - "\n", - "\n", - "# calc_cutoff()\n", - 
"display(widgets.interactive(calc_cutoff, threshold=widgets.IntSlider(value=10000.0, min=.0, max=MS_spectra[col_summary.peptides_identified].max())))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fig, axes = plt.subplots(2,2, figsize=(20,20), sharex=True)\n", - "\n", - "ylim_hist = (0,600)\n", - "xlim_dens = (0, 70_000)\n", - "\n", - "ax = axes[0,0]\n", - "ax = mq_all_summaries.df[col_summary.peptides_identified].plot(kind='hist', bins=50, title=\"Histogram including samples with zero identified peptides\", grid=True, ax=ax, ylim=ylim_hist)\n", - "ax = axes[1,0]\n", - "_ = mq_all_summaries.df[col_summary.peptides_identified].astype(float).plot.kde(ax=ax, title=\"Density plot including samples with zero identified peptides.\", xlim=xlim_dens)\n", - "\n", - "threshold_m2_identified = 15_000\n", - "mask = mq_all_summaries.df[col_summary.peptides_identified] >= threshold_m2_identified\n", - "\n", - "ax = axes[0,1]\n", - "ax = mq_all_summaries.df.loc[mask, col_summary.peptides_identified].plot(kind='hist', bins=40, title=f\"Histogram including samples with {threshold_m2_identified:,d} and more identified peptides\", grid=True, ax=ax, ylim=ylim_hist)\n", - "ax = axes[1,1]\n", - "_ = mq_all_summaries.df.loc[mask, col_summary.peptides_identified].astype(float).plot.kde(ax=ax, title=f\"Density plot including samples with {threshold_m2_identified:,d} and more identified peptides.\", xlim=xlim_dens)\n", - "\n", - "plotting._savefig(fig, name='distribution_peptides_in_samples', folder=config.FIGUREFOLDER)" - ] - } - ], - "metadata": { - "file_extension": ".py", - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": 
"3.8.15" - }, - "mimetype": "text/x-python", - "name": "python", - "npconvert_exporter": "python", - "pygments_lexer": "ipython3", - "toc-autonumbering": true, - "version": 3 - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/project/erda_01_mq_select_runs.py b/project/erda_01_mq_select_runs.py deleted file mode 100644 index 070de215e..000000000 --- a/project/erda_01_mq_select_runs.py +++ /dev/null @@ -1,240 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.14.5 -# kernelspec: -# display_name: Python 3 (ipykernel) -# language: python -# name: python3 -# --- - -# %% [markdown] Collapsed="false" -# # MaxQuant (MQ) Output-Files -# -# Files compared: -# 1. `Summary.txt` -# 2. `mqpar.xml` -# 3. `peptides.txt` -# 4. `proteins.txt` -# -# There is are many files more, where several files seem to be available in several times in different formats. - -# %% -import sys -import logging -from pathlib import Path, PurePosixPath -import yaml -import random - -################## -### Logging ###### -################## - -# Setup logging in notebooks -from vaep.logging import setup_nb_logger -setup_nb_logger() -logger = logging.getLogger() - -logging.info('Start with handlers: \n' + "\n".join(f"- {repr(log_)}" for log_ in logger.handlers)) - -### Other imports - -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -import ipywidgets as widgets - -from vaep.io.mq import MaxQuantOutputDynamic -from vaep import plotting - -from vaep.io import data_objects -from vaep.io.data_objects import MqAllSummaries - -################## -##### CONFIG ##### -################## -import config -from config import FOLDER_MQ_TXT_DATA, FOLDER_PROCESSED - -ELIGABLE_FILES_YAML = Path('config/eligable_files.yaml') -MAP_FOLDER_PATH = Path('config/file_paths') -FPATH_ALL_SUMMARIES = FOLDER_PROCESSED / 'all_summaries.json' -FN_RAWFILE_METADATA = 'data/rawfile_metadata.csv' - 
-from config import FOLDER_DATA # project folder for storing the data -logger.info(f"Search Raw-Files on path: {FOLDER_MQ_TXT_DATA}") - -# %% Collapsed="false" -folders = [folder for folder in Path(FOLDER_MQ_TXT_DATA).iterdir() if folder.is_dir() and not folder.name.startswith('.')] - -# %% Collapsed="false" -folders_dict = {folder.name: folder for folder in sorted(folders)} -assert len(folders_dict) == len(folders), "Non unique file names" - -with open(MAP_FOLDER_PATH, 'w') as f: - yaml.dump({ k: str(PurePosixPath(v)) for k, v in folders_dict.items()} , f) -logger.info(f"Save map of file names to file paths to: {str(MAP_FOLDER_PATH)}") - -# w_file = widgets.Dropdown(options=[folder for folder in folders], description='View files') -w_file = widgets.Dropdown(options=folders_dict, description='View files') -w_file - -# %% -mq_output = MaxQuantOutputDynamic(w_file.value) -mq_output - -# %% -print(f"Results will be saved in subfolders in\n\t{str(FOLDER_PROCESSED.absolute())}" - "\nusing the name of the specified input-folder per default. Change to your liking.") - -# %% [markdown] -# > Go to the block you are interested in! 
- -# %% [markdown] Collapsed="false" -# ### Summaries Data - -# %% -# %%time -pd.options.display.max_columns = 49 -mq_all_summaries = MqAllSummaries(FPATH_ALL_SUMMARIES) -mq_all_summaries.load_new_samples(folders=folders) - -# %% Collapsed="false" -if mq_all_summaries.empty_folders: - print(mq_all_summaries.empty_folders) - with open('log_empty_folder.txt', 'a') as f: - f.writelines(mq_all_summaries.empty_folders) -print(f"In total processed: {len(mq_all_summaries):5}") - -# %% Collapsed="false" -pd.options.display.max_columns = len(mq_all_summaries.df.columns) - -# %% -mq_all_summaries.df.info() - - -# %% [markdown] -# - SIL - MS2 based on precursor which was a set of peaks -# - PEAK - MS2 scan based on a single peak on precursor spectrum -# - ISO - isotopic pattern detection -# - -# %% -class col_summary: - MS1 = 'MS' - MS2 = 'MS/MS' - MS2_identified = 'MS/MS Identified' - peptides_identified = 'Peptide Sequences Identified' # 'peptides.txt' should have this number of peptides - -df = mq_all_summaries.df -if df is not None: - MS_spectra = df[[col_summary.MS1, col_summary.MS2, col_summary.MS2_identified, col_summary.peptides_identified]] - - def compute_summary(threshold_identified): - mask = MS_spectra[col_summary.peptides_identified] >= threshold_identified - display(MS_spectra.loc[mask].describe(np.linspace(0.05, 0.95, 10))) - - w_ions_range = widgets.IntSlider(value=15_000, min=.0, max=MS_spectra[col_summary.peptides_identified].max()) - display(widgets.interactive(compute_summary, threshold_identified=w_ions_range)) - -# %% -mask = MS_spectra[col_summary.peptides_identified] >= w_ions_range.value -logger.warning(f"Save {mask.sum()} file names to configuration file of selected samples: " -f"{ELIGABLE_FILES_YAML} " -f"based on a minimum of {w_ions_range.value} peptides.") -idx_selected = MS_spectra.loc[mask].index -MS_spectra.loc[idx_selected] - -# %% [markdown] -# ### Select Date Range -# -# - based on metadata - -# %% -df_meta_rawfiles = 
pd.read_csv(FN_RAWFILE_METADATA, header=[0, 1], index_col=0) -date_col = ('FileProperties', 'Content Creation Date') -df_meta_rawfiles[date_col] = pd.to_datetime( - df_meta_rawfiles[date_col]) -df_meta_rawfiles = df_meta_rawfiles.loc[idx_selected] -df_meta_rawfiles.sort_values(date_col, inplace=True) - -# %% -w_date_range = widgets.SelectionRangeSlider(options=df_meta_rawfiles[date_col], value=[min(df_meta_rawfiles[date_col]),max(df_meta_rawfiles[date_col]) ] ) - -def show(range): - mask = df_meta_rawfiles[date_col].between(*range) - df_view = MS_spectra.loc[idx_selected].loc[mask] - display(df_view) - - -int_date_range = widgets.interactive(show, range=w_date_range) -display(int_date_range) - -# %% -mask = df_meta_rawfiles[date_col].between(*w_date_range.value) -idx_selected = mask.loc[mask].index -idx_selected - -# %% [markdown] -# ### Write out selected, eligable files - -# %% -with open(ELIGABLE_FILES_YAML, 'w') as f: - yaml.dump(data={'files': idx_selected.to_list()}, stream=f) -logger.info(f"Dumped yaml file with eligable files under key 'files' to {str(ELIGABLE_FILES_YAML)}") - -# %% [markdown] -# ## Plot number of samples -# -# - binned by 10k steps - -# %% -_max = MS_spectra[col_summary.peptides_identified].max() + 10_001 -fig, ax = plt.subplots(figsize=(10,10)) -_ = MS_spectra[col_summary.peptides_identified].hist( - bins=range(0,_max, 10_000), - legend=True, - ax = ax) -fig.suptitle('Number of samples, binned in 10K steps.') -fig.tight_layout() - -# %% -MS_spectra[col_summary.peptides_identified].mean(), MS_spectra[col_summary.peptides_identified].std() # including folders with 0 identified peptides - - -# %% -def calc_cutoff(threshold=1): - s = MS_spectra[col_summary.peptides_identified] - mask = s >= threshold - s = s.loc[mask] - display(f"Threshold selected (inclusive): {threshold} ") - display(f"mean: {s.mean():.2f}, std-dev: {s.std():.2f}") - - -# calc_cutoff() -display(widgets.interactive(calc_cutoff, threshold=widgets.IntSlider(value=10000.0, 
min=.0, max=MS_spectra[col_summary.peptides_identified].max()))) - -# %% -fig, axes = plt.subplots(2,2, figsize=(20,20), sharex=True) - -ylim_hist = (0,600) -xlim_dens = (0, 70_000) - -ax = axes[0,0] -ax = mq_all_summaries.df[col_summary.peptides_identified].plot(kind='hist', bins=50, title="Histogram including samples with zero identified peptides", grid=True, ax=ax, ylim=ylim_hist) -ax = axes[1,0] -_ = mq_all_summaries.df[col_summary.peptides_identified].astype(float).plot.kde(ax=ax, title="Density plot including samples with zero identified peptides.", xlim=xlim_dens) - -threshold_m2_identified = 15_000 -mask = mq_all_summaries.df[col_summary.peptides_identified] >= threshold_m2_identified - -ax = axes[0,1] -ax = mq_all_summaries.df.loc[mask, col_summary.peptides_identified].plot(kind='hist', bins=40, title=f"Histogram including samples with {threshold_m2_identified:,d} and more identified peptides", grid=True, ax=ax, ylim=ylim_hist) -ax = axes[1,1] -_ = mq_all_summaries.df.loc[mask, col_summary.peptides_identified].astype(float).plot.kde(ax=ax, title=f"Density plot including samples with {threshold_m2_identified:,d} and more identified peptides.", xlim=xlim_dens) - -plotting._savefig(fig, name='distribution_peptides_in_samples', folder=config.FIGUREFOLDER) diff --git a/project/erda_02_mq_count_features.ipynb b/project/erda_02_mq_count_features.ipynb deleted file mode 100644 index bc1e89640..000000000 --- a/project/erda_02_mq_count_features.ipynb +++ /dev/null @@ -1,769 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Count peptides over all files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import sys\n", - "import logging\n", - "from pathlib import Path\n", - "import random\n", - "import yaml\n", - "import json\n", - "\n", - "import pandas as pd\n", - "import ipywidgets as widgets\n", - "\n", - "### Logging setup ######\n", - "from 
vaep.logging import setup_nb_logger\n", - "setup_nb_logger()\n", - "\n", - "### vaep imports ######\n", - "from vaep.io.mq import MaxQuantOutputDynamic\n", - "from vaep.io.data_objects import MqAllSummaries\n", - "from vaep.io.data_objects import PeptideCounter\n", - "import vaep.pandas\n", - "\n", - "##################\n", - "##### CONFIG #####\n", - "##################\n", - "from config import FOLDER_MQ_TXT_DATA, FOLDER_PROCESSED\n", - "\n", - "from config import FOLDER_DATA # project folder for storing the data\n", - "logging.info(f\"Search Raw-Files on path: {FOLDER_MQ_TXT_DATA}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Use samples previously loaded." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ELIGABLE_FILES_YAML = Path('config/eligable_files.yaml')\n", - "MAP_FOLDER_PATH = Path('config/file_paths')\n", - "\n", - "with open(ELIGABLE_FILES_YAML) as f:\n", - " files = set(yaml.safe_load(f)['files'])\n", - " logging.info(f\"Found a total of {len(files):,d} eligable files.\")\n", - "with open(MAP_FOLDER_PATH) as f:\n", - " folders_dict = yaml.safe_load(f)\n", - " folders_dict = {folder: folders_dict[folder] for folder in files} # only select folders selected\n", - "\n", - "folders = [Path(folders_dict[folder]) for folder in files]\n", - "assert len(files) == len(folders_dict) == len(folders)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fn_id_old_new: str = 'data/rename/selected_old_new_id_mapping.csv' # selected samples with pride and original id\n", - "df_ids = pd.read_csv(fn_id_old_new)\n", - "df_ids" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "folders_dict = { sample_id: FOLDER_MQ_TXT_DATA / sample_id for sample_id in df_ids['Sample ID']}\n", - "# folders_dict = {p.stem : p.parent / p.stem for p in 
folders_dict}\n", - "# folders_dict" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "OVERWRITE = False\n", - "\n", - "from config import FNAME_C_PEPTIDES, FNAME_C_EVIDENCE, FNAME_C_PG, FNAME_C_GENES\n", - "\n", - "FNAME_C_PEPTIDES, FNAME_C_EVIDENCE, FNAME_C_PG, FNAME_C_GENES" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Random example" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import random\n", - "pd.set_option('display.max_columns', 60)\n", - "random_folder, random_path = random.sample(folders_dict.items(), 1)[0]\n", - "mq_output = MaxQuantOutputDynamic(random_path)\n", - "print(f\"peptides.txt from {random_folder!s}\")\n", - "mq_output.peptides" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "use_columns = mq_output.peptides.columns[33:45]\n", - "df = mq_output.peptides[use_columns].convert_dtypes() #.to_json('test.json')\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_json_string = df.to_json(orient='index', indent=4)\n", - "df_json_string[:1000]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_csv = df.to_csv()\n", - "df_csv[:1000]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pd.read_json(df_json_string, orient='index')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mq_output.peptides.Intensity # as is in peptides.txt, comma seperated thousands" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Count aggregated peptides" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": 
[], - "source": [ - "peptide_counter = PeptideCounter(FNAME_C_PEPTIDES, overwrite=OVERWRITE)\n", - "peptide_counter" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if peptide_counter.loaded:\n", - " print(peptide_counter.counter.most_common(10),\n", - " len(peptide_counter.loaded),\n", - " sep='\\n')\n", - "else:\n", - " print('New file created.')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- creates peptide intensity dumps for each MQ outputfolder per default `count_peptides` function (default processing function for `PeptideCounter`)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "%%time\n", - "# folders = [Path(folder_path) for folder_path in folders_dict.values()]\n", - "c = peptide_counter.sum_over_files(folders=folders)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "c.most_common(10) # peptide_counter.counter.most_common(10)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# To share as python file\n", - "N = 1000\n", - "with open(FOLDER_PROCESSED / f'most_common_{10}_peptides.py', 'w') as f:\n", - " f.write('import pandas as pd\\n\\n')\n", - " \n", - " #pprint.pformat list -> do this using standardlibrary\n", - " # https://docs.python.org/3/library/pprint.html\n", - " f.write(f\"most_common = [\\n \")\n", - " f.write(',\\n '.join(f\"{str(t)}\" for t in c.most_common(N)))\n", - " f.write(\"\\n]\\n\\n\")\n", - " \n", - " #peptide_counter.loaded()\n", - " \n", - " f.write(\"pd.DataFrame.from_records(most_common, index='Sequence', columns=['Sequence', 'counts'])\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "## Peptides by charge\n", - "\n", - "- count peptides by charge state (which are 
aggregated in `peptides.txt`)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "evidence_cols = vaep.pandas.get_columns_accessor(mq_output.evidence.reset_index())\n", - "evidence_cols # vaep.mq get this list" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "evidence = mq_output.evidence.set_index(evidence_cols.Charge, append=True)\n", - "evidence" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Modifikationen könnten noch zum index hinzugefügt werden" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "evidence.Modifications.value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vaep.pandas.prop_unique_index(evidence)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Using the protein AA sequence and it's charge as identifiers, does not yield a unique index.\n", - "\n", - "First potential contaminants and peptides with zero intensity (or missing intensity) can be removed from the table.\n", - "\n", - "These are apparently peptides identified by an MS2 spectrum but which could not be quantified by a MS1 scans" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mask = evidence[evidence_cols.Intensity].isna()\n", - "evidence.loc[mask, evidence_cols.Type].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "evidence_cols = vaep.io.data_objects.evidence_cols\n", - "use_cols = [evidence_cols.mz, evidence_cols.Protein_group_IDs, evidence_cols.Intensity, evidence_cols.Score, evidence_cols.Potential_contaminant]\n", - "\n", - "evidence_selected = vaep.io.data_objects.select_evidence(evidence[use_cols])\n", - 
"evidence_selected" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "evidence_selected = evidence_selected.sort_values(by=['Sequence', 'Charge', 'Score'], ascending=False)\n", - "evidence_selected" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "evidence_selected = vaep.pandas.select_max_by(evidence_selected.reset_index(), [evidence_cols.Sequence, evidence_cols.Charge], evidence_cols.Score)\n", - "evidence_selected" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from collections import Counter\n", - "c = Counter()\n", - "c.update(evidence.index)\n", - "c.most_common(10)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "example = evidence.loc[c.most_common(10)[0][0]]\n", - "\n", - "vaep.pandas.show_columns_with_variation(example)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- `Type`: only `MULTI-MSMS` and `MULIT-SECPEP` are quantified (does this mean a matching MS1 spectrum?)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "evidence[evidence_cols.Type].value_counts()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Some peptides can be assigned to different protein group IDs (razor peptides)\n", - " - option: discared non-unique peptides (and Protein group IDs can be already a combination of several isotopes)\n", - " - option: select on `Score` or `Intensity` (is there a relationship?)\n", - " - option: select based on `Number of isotopic peaks`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "evidence[evidence_cols.Protein_group_IDs].value_counts()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - 
"source": [ - "## Count peptides based on evidence files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "evidence_counter = vaep.io.data_objects.EvidenceCounter(FNAME_C_EVIDENCE, overwrite=OVERWRITE)\n", - "c = evidence_counter.sum_over_files(folders=folders)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Protein Groups\n", - "\n", - "- protein groups between files\n", - " - aggregate by GENE ?\n", - " - " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mq_output.proteinGroups.describe(include='all')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pg_cols = vaep.pandas.get_columns_accessor(mq_output.proteinGroups.reset_index())\n", - "pg_cols" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "use_cols = [\n", - "# pg_cols.Protein_IDs,\n", - " pg_cols.Majority_protein_IDs,\n", - " pg_cols.Gene_names,\n", - " pg_cols.Evidence_IDs,\n", - " pg_cols.Q_value,\n", - " pg_cols.Score,\n", - " pg_cols.Only_identified_by_site,\n", - " pg_cols.Reverse,\n", - " pg_cols.Potential_contaminant,\n", - " pg_cols.Intensity,\n", - "]\n", - "\n", - "pd.options.display.max_rows = 100\n", - "pd.options.display.min_rows = 40\n", - "mask = mq_output.proteinGroups[[pg_cols.Only_identified_by_site, pg_cols.Reverse, pg_cols.Potential_contaminant]].notna().sum(axis=1) > 0\n", - "mq_output.proteinGroups.loc[mask, use_cols]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "msg = \"Omitting the data drops {0:.3f} % of the data.\"\n", - "print(msg.format(\n", - "mask.sum() / len(mask) * 100\n", - "))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - 
"selection = mq_output.proteinGroups.loc[~mask, use_cols]\n", - "gene_counts = selection[pg_cols.Gene_names].value_counts() # Gene Names not unique\n", - "msg = 'proportion of entries with non-unique genes: {:.3f}'\n", - "print(msg.format(gene_counts.loc[gene_counts > 1].sum() / gene_counts.sum()))\n", - "gene_counts.head(20)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mask = selection.Intensity > 0 \n", - "msg = \"Proportion of non-zero Intensities: {:.3f} (zero_ count = {})\"\n", - "print(msg.format(mask.sum() / len(mask), (~mask).sum()))\n", - "selection.loc[~mask]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "selection = selection.loc[mask]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Some Proteins have no gene annotation\n", - " - P56181 -> mitochondrial\n", - "\n", - "In the online version of Uniprot these seems to be annotated (brief check). \n", - "So latest version probably has a gene annotation, so therefore these files are kept" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gene_set = selection[pg_cols.Gene_names].str.split(';')\n", - "\n", - "col_loc_gene_names = selection.columns.get_loc(pg_cols.Gene_names)\n", - "_ = selection.insert(col_loc_gene_names+1, 'Number of Genes', gene_set.apply(vaep.pandas.length))\n", - "\n", - "mask = gene_set.isna()\n", - "selection.loc[mask]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cols = vaep.pandas.get_columns_accessor(selection)\n", - "gene_counts = vaep.pandas.counts_with_proportion(selection[cols.Number_of_Genes])\n", - "gene_counts" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Most `proteinGroups` have single genes assigned to them. 
If one only looks at gene sets,\n", - "one can increase uniquely identified `proteinGroups` further. \n", - "\n", - "> Can `geneGroups` (sets of `Gene Names`) be used instead of `proteinGroups`?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gene_sets_counts = selection[cols.Gene_names].value_counts()\n", - "gene_sets_counts.value_counts()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Potential solutions:\n", - "- summarize intensity per gene. One of the isoforms seems to have the major proportion of intensity assigned.\n", - "- select maximum by score (higher scores seem to be related to higher intensity)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "non_unique_genes = gene_sets_counts.loc[gene_sets_counts > 1].index\n", - "\n", - "mask = selection[cols.Gene_names].isin(non_unique_genes)\n", - "selection.loc[mask].reset_index().set_index(cols.Gene_names).sort_index()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Protein Groups with Gene set with three and more genes:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "selection.loc[selection[cols.Number_of_Genes] > 2]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "logging.info(f\"Selection shape before dropping duplicates by gene: {selection.shape}\")\n", - "mask_no_gene = selection[pg_cols.Gene_names].isna()\n", - "selection_no_gene = selection.loc[mask_no_gene]\n", - "logging.info(f\"Entries without any gene annotation: {len(selection_no_gene)}\")\n", - "selection_no_gene" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "selection = vaep.pandas.select_max_by(df=selection.loc[~mask_no_gene].reset_index(), 
grouping_columns=[pg_cols.Gene_names], selection_column=pg_cols.Score)\n", - "logging.info(f\"Selection shape after dropping duplicates by gene: {selection.shape}\")\n", - "selection = selection.set_index(pg_cols.Protein_IDs)\n", - "mask = selection[cols.Gene_names].isin(non_unique_genes)\n", - "selection.loc[mask]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "selection = selection.append(selection_no_gene)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "protein_groups_counter = vaep.io.data_objects.ProteinGroupsCounter(FNAME_C_PG, overwrite=OVERWRITE)\n", - "c = protein_groups_counter.sum_over_files(folders=folders)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vaep.pandas.counts_with_proportion(pd.Series(c)) # Most proteinGroups are unique" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Count genes\n", - "Genes sets could be used to identify common features.\n", - "\n", - "> The assignment of isoforms to one proteinGroup or another might be volatile. 
\n", - "> A single (unique) peptide could lead to different assignments.\n", - "> Imputation on the evidence level could be a way to alleviate this problem\n", - "\n", - "- If genes set are not unique for a single run, one would have to decide which to take" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gene_counter = vaep.io.data_objects.GeneCounter(FNAME_C_GENES, overwrite=OVERWRITE)\n", - "\n", - "if not gene_counter.dumps:\n", - " #empty dict, replace\n", - " gene_counter.dumps = dict(protein_groups_counter.dumps) # prot proteinGroups files to GeneCounter\n", - "pg_dumps = list(gene_counter.dumps.values())\n", - "\n", - "c_genes = gene_counter.sum_over_files(folders=pg_dumps)\n", - "\n", - "c_genes = pd.Series(c_genes)\n", - "vaep.pandas.counts_with_proportion(c_genes) # Most proteinGroups are unique" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "## Theoretial Peptides from used fasta-file\n", - "\n", - "> `01_explore_FASTA.ipynb` (formely `misc_FASTA_tryptic_digest.ipynb`)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.15" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/project/erda_02_mq_count_features.py b/project/erda_02_mq_count_features.py deleted file mode 100644 index e14a92657..000000000 --- a/project/erda_02_mq_count_features.py +++ /dev/null @@ -1,391 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.14.5 -# 
kernelspec: -# display_name: Python 3 (ipykernel) -# language: python -# name: python3 -# --- - -# %% [markdown] -# # Count peptides over all files - -# %% -import os -import sys -import logging -from pathlib import Path -import random -import yaml -import json - -import pandas as pd -import ipywidgets as widgets - -### Logging setup ###### -from vaep.logging import setup_nb_logger -setup_nb_logger() - -### vaep imports ###### -from vaep.io.mq import MaxQuantOutputDynamic -from vaep.io.data_objects import MqAllSummaries -from vaep.io.data_objects import PeptideCounter -import vaep.pandas - -################## -##### CONFIG ##### -################## -from config import FOLDER_MQ_TXT_DATA, FOLDER_PROCESSED - -from config import FOLDER_DATA # project folder for storing the data -logging.info(f"Search Raw-Files on path: {FOLDER_MQ_TXT_DATA}") - -# %% [markdown] -# Use samples previously loaded. - -# %% -ELIGABLE_FILES_YAML = Path('config/eligable_files.yaml') -MAP_FOLDER_PATH = Path('config/file_paths') - -with open(ELIGABLE_FILES_YAML) as f: - files = set(yaml.safe_load(f)['files']) - logging.info(f"Found a total of {len(files):,d} eligable files.") -with open(MAP_FOLDER_PATH) as f: - folders_dict = yaml.safe_load(f) - folders_dict = {folder: folders_dict[folder] for folder in files} # only select folders selected - -folders = [Path(folders_dict[folder]) for folder in files] -assert len(files) == len(folders_dict) == len(folders) - -# %% -fn_id_old_new: str = 'data/rename/selected_old_new_id_mapping.csv' # selected samples with pride and original id -df_ids = pd.read_csv(fn_id_old_new) -df_ids - -# %% -folders_dict = { sample_id: FOLDER_MQ_TXT_DATA / sample_id for sample_id in df_ids['Sample ID']} -# folders_dict = {p.stem : p.parent / p.stem for p in folders_dict} -# folders_dict - -# %% -OVERWRITE = False - -from config import FNAME_C_PEPTIDES, FNAME_C_EVIDENCE, FNAME_C_PG, FNAME_C_GENES - -FNAME_C_PEPTIDES, FNAME_C_EVIDENCE, FNAME_C_PG, FNAME_C_GENES - -# %% 
[markdown] -# ## Random example - -# %% -import random -pd.set_option('display.max_columns', 60) -random_folder, random_path = random.sample(folders_dict.items(), 1)[0] -mq_output = MaxQuantOutputDynamic(random_path) -print(f"peptides.txt from {random_folder!s}") -mq_output.peptides - -# %% -use_columns = mq_output.peptides.columns[33:45] -df = mq_output.peptides[use_columns].convert_dtypes() #.to_json('test.json') -df - -# %% -df_json_string = df.to_json(orient='index', indent=4) -df_json_string[:1000] - -# %% -df_csv = df.to_csv() -df_csv[:1000] - -# %% -pd.read_json(df_json_string, orient='index') - -# %% -mq_output.peptides.Intensity # as is in peptides.txt, comma seperated thousands - -# %% [markdown] -# ## Count aggregated peptides - -# %% -peptide_counter = PeptideCounter(FNAME_C_PEPTIDES, overwrite=OVERWRITE) -peptide_counter - -# %% -if peptide_counter.loaded: - print(peptide_counter.counter.most_common(10), - len(peptide_counter.loaded), - sep='\n') -else: - print('New file created.') - -# %% [markdown] -# - creates peptide intensity dumps for each MQ outputfolder per default `count_peptides` function (default processing function for `PeptideCounter`) - -# %% -# %%time -# folders = [Path(folder_path) for folder_path in folders_dict.values()] -c = peptide_counter.sum_over_files(folders=folders) - -# %% -c.most_common(10) # peptide_counter.counter.most_common(10) - -# %% -# To share as python file -N = 1000 -with open(FOLDER_PROCESSED / f'most_common_{10}_peptides.py', 'w') as f: - f.write('import pandas as pd\n\n') - - #pprint.pformat list -> do this using standardlibrary - # https://docs.python.org/3/library/pprint.html - f.write(f"most_common = [\n ") - f.write(',\n '.join(f"{str(t)}" for t in c.most_common(N))) - f.write("\n]\n\n") - - #peptide_counter.loaded() - - f.write("pd.DataFrame.from_records(most_common, index='Sequence', columns=['Sequence', 'counts'])\n") - -# %% [markdown] Collapsed="false" -# ## Peptides by charge -# -# - count peptides by 
charge state (which are aggregated in `peptides.txt`) - -# %% -evidence_cols = vaep.pandas.get_columns_accessor(mq_output.evidence.reset_index()) -evidence_cols # vaep.mq get this list - -# %% -evidence = mq_output.evidence.set_index(evidence_cols.Charge, append=True) -evidence - -# %% [markdown] -# Modifikationen könnten noch zum index hinzugefügt werden - -# %% -evidence.Modifications.value_counts() - -# %% -vaep.pandas.prop_unique_index(evidence) - -# %% [markdown] -# Using the protein AA sequence and it's charge as identifiers, does not yield a unique index. -# -# First potential contaminants and peptides with zero intensity (or missing intensity) can be removed from the table. -# -# These are apparently peptides identified by an MS2 spectrum but which could not be quantified by a MS1 scans - -# %% -mask = evidence[evidence_cols.Intensity].isna() -evidence.loc[mask, evidence_cols.Type].value_counts() - -# %% -evidence_cols = vaep.io.data_objects.evidence_cols -use_cols = [evidence_cols.mz, evidence_cols.Protein_group_IDs, evidence_cols.Intensity, evidence_cols.Score, evidence_cols.Potential_contaminant] - -evidence_selected = vaep.io.data_objects.select_evidence(evidence[use_cols]) -evidence_selected - -# %% -evidence_selected = evidence_selected.sort_values(by=['Sequence', 'Charge', 'Score'], ascending=False) -evidence_selected - -# %% -evidence_selected = vaep.pandas.select_max_by(evidence_selected.reset_index(), [evidence_cols.Sequence, evidence_cols.Charge], evidence_cols.Score) -evidence_selected - -# %% -from collections import Counter -c = Counter() -c.update(evidence.index) -c.most_common(10) - -# %% -example = evidence.loc[c.most_common(10)[0][0]] - -vaep.pandas.show_columns_with_variation(example) - -# %% [markdown] -# - `Type`: only `MULTI-MSMS` and `MULIT-SECPEP` are quantified (does this mean a matching MS1 spectrum?) 
- -# %% -evidence[evidence_cols.Type].value_counts() - -# %% [markdown] -# Some peptides can be assigned to different protein group IDs (razor peptides) -# - option: discared non-unique peptides (and Protein group IDs can be already a combination of several isotopes) -# - option: select on `Score` or `Intensity` (is there a relationship?) -# - option: select based on `Number of isotopic peaks` - -# %% -evidence[evidence_cols.Protein_group_IDs].value_counts() - -# %% [markdown] -# ## Count peptides based on evidence files - -# %% -evidence_counter = vaep.io.data_objects.EvidenceCounter(FNAME_C_EVIDENCE, overwrite=OVERWRITE) -c = evidence_counter.sum_over_files(folders=folders) - -# %% [markdown] -# ## Protein Groups -# -# - protein groups between files -# - aggregate by GENE ? -# - - -# %% -mq_output.proteinGroups.describe(include='all') - -# %% -pg_cols = vaep.pandas.get_columns_accessor(mq_output.proteinGroups.reset_index()) -pg_cols - -# %% -use_cols = [ -# pg_cols.Protein_IDs, - pg_cols.Majority_protein_IDs, - pg_cols.Gene_names, - pg_cols.Evidence_IDs, - pg_cols.Q_value, - pg_cols.Score, - pg_cols.Only_identified_by_site, - pg_cols.Reverse, - pg_cols.Potential_contaminant, - pg_cols.Intensity, -] - -pd.options.display.max_rows = 100 -pd.options.display.min_rows = 40 -mask = mq_output.proteinGroups[[pg_cols.Only_identified_by_site, pg_cols.Reverse, pg_cols.Potential_contaminant]].notna().sum(axis=1) > 0 -mq_output.proteinGroups.loc[mask, use_cols] - -# %% -msg = "Omitting the data drops {0:.3f} % of the data." 
-print(msg.format( -mask.sum() / len(mask) * 100 -)) - -# %% -selection = mq_output.proteinGroups.loc[~mask, use_cols] -gene_counts = selection[pg_cols.Gene_names].value_counts() # Gene Names not unique -msg = 'proportion of entries with non-unique genes: {:.3f}' -print(msg.format(gene_counts.loc[gene_counts > 1].sum() / gene_counts.sum())) -gene_counts.head(20) - -# %% -mask = selection.Intensity > 0 -msg = "Proportion of non-zero Intensities: {:.3f} (zero_ count = {})" -print(msg.format(mask.sum() / len(mask), (~mask).sum())) -selection.loc[~mask] - -# %% -selection = selection.loc[mask] - -# %% [markdown] -# Some Proteins have no gene annotation -# - P56181 -> mitochondrial -# -# In the online version of Uniprot these seems to be annotated (brief check). -# So latest version probably has a gene annotation, so therefore these files are kept - -# %% -gene_set = selection[pg_cols.Gene_names].str.split(';') - -col_loc_gene_names = selection.columns.get_loc(pg_cols.Gene_names) -_ = selection.insert(col_loc_gene_names+1, 'Number of Genes', gene_set.apply(vaep.pandas.length)) - -mask = gene_set.isna() -selection.loc[mask] - -# %% -cols = vaep.pandas.get_columns_accessor(selection) -gene_counts = vaep.pandas.counts_with_proportion(selection[cols.Number_of_Genes]) -gene_counts - -# %% [markdown] -# Most `proteinGroups` have single genes assigned to them. If one only looks at gene sets, -# one can increase uniquely identified `proteinGroups` further. -# -# > Can `geneGroups` (sets of `Gene Names`) be used instead of `proteinGroups`? - -# %% -gene_sets_counts = selection[cols.Gene_names].value_counts() -gene_sets_counts.value_counts() - -# %% [markdown] -# Potential solutions: -# - summarize intensity per gene. One of the isoforms seems to have the major proportion of intensity assigned. 
-# - select maximum by score (higher scores seem to be related to higher intensity) - -# %% -non_unique_genes = gene_sets_counts.loc[gene_sets_counts > 1].index - -mask = selection[cols.Gene_names].isin(non_unique_genes) -selection.loc[mask].reset_index().set_index(cols.Gene_names).sort_index() - -# %% [markdown] -# Protein Groups with Gene set with three and more genes: - -# %% -selection.loc[selection[cols.Number_of_Genes] > 2] - -# %% -logging.info(f"Selection shape before dropping duplicates by gene: {selection.shape}") -mask_no_gene = selection[pg_cols.Gene_names].isna() -selection_no_gene = selection.loc[mask_no_gene] -logging.info(f"Entries without any gene annotation: {len(selection_no_gene)}") -selection_no_gene - -# %% -selection = vaep.pandas.select_max_by(df=selection.loc[~mask_no_gene].reset_index(), grouping_columns=[pg_cols.Gene_names], selection_column=pg_cols.Score) -logging.info(f"Selection shape after dropping duplicates by gene: {selection.shape}") -selection = selection.set_index(pg_cols.Protein_IDs) -mask = selection[cols.Gene_names].isin(non_unique_genes) -selection.loc[mask] - -# %% -selection = selection.append(selection_no_gene) - -# %% -protein_groups_counter = vaep.io.data_objects.ProteinGroupsCounter(FNAME_C_PG, overwrite=OVERWRITE) -c = protein_groups_counter.sum_over_files(folders=folders) - -# %% -vaep.pandas.counts_with_proportion(pd.Series(c)) # Most proteinGroups are unique - -# %% [markdown] -# ### Count genes -# Genes sets could be used to identify common features. -# -# > The assignment of isoforms to one proteinGroup or another might be volatile. -# > A single (unique) peptide could lead to different assignments. 
-# > Imputation on the evidence level could be a way to alleviate this problem -# -# - If genes set are not unique for a single run, one would have to decide which to take - -# %% -gene_counter = vaep.io.data_objects.GeneCounter(FNAME_C_GENES, overwrite=OVERWRITE) - -if not gene_counter.dumps: - #empty dict, replace - gene_counter.dumps = dict(protein_groups_counter.dumps) # prot proteinGroups files to GeneCounter -pg_dumps = list(gene_counter.dumps.values()) - -c_genes = gene_counter.sum_over_files(folders=pg_dumps) - -c_genes = pd.Series(c_genes) -vaep.pandas.counts_with_proportion(c_genes) # Most proteinGroups are unique - -# %% [markdown] Collapsed="false" -# ## Theoretial Peptides from used fasta-file -# -# > `01_explore_FASTA.ipynb` (formely `misc_FASTA_tryptic_digest.ipynb`) - -# %% [markdown] -# diff --git a/project/erda_03_training_data.ipynb b/project/erda_03_training_data.ipynb deleted file mode 100644 index 269f55bc7..000000000 --- a/project/erda_03_training_data.ipynb +++ /dev/null @@ -1,400 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Build a set of training data\n", - "\n", - "Use a set of (most) common peptides to create inital data sets\n", - "\n", - "- based on `Counter` over all outputs from search (here: MaxQuant)\n", - " - keep based on threshold `FEAT_COMPLETNESS_CUTOFF` possible features\n", - " - option: select samples based on `YEARS` (e.g. 
due constrain by a batch of strains)\n", - " - collect in wide format data from output files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from functools import partial\n", - "from pathlib import Path\n", - "import logging\n", - "import multiprocessing\n", - "\n", - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "from tqdm.notebook import tqdm_notebook\n", - "\n", - "import vaep\n", - "\n", - "import config" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def join_as_str(seq):\n", - " ret = \"_\".join(str(x) for x in seq)\n", - " return ret" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "incorrectly_encoded_metadata": "[tag=parameters]", - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "RANDOM_SEED: int = 42 # Random seed for reproducibility\n", - "FEAT_COMPLETNESS_CUTOFF = 0.25 # Minimal proportion of samples which have to share a feature\n", - "SAMPLE_COL = 'Sample ID'\n", - "OUT_FOLDER = 'data/selected/'\n", - "FN_ID_OLD_NEW: str = 'data/rename/selected_old_new_id_mapping.csv' # selected samples with pride and original id" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Select a specific config file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "# options = ['peptides', 'evidence', 'proteinGroups']\n", - "from config.training_data import peptides as cfg\n", - "# from config.training_data import evidence as cfg\n", - "# from config.training_data import proteinGroups as cfg\n", - "\n", - "cfg_dict = {k: getattr(cfg, k) for k in dir(cfg) if not k.startswith('_')}\n", - "cfg_dict" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Set 
defaults from file (allows to potentially overwrite parameters)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# normal structure of config.py files\n", - "NAME = cfg.NAME\n", - "BASE_NAME = cfg.BASE_NAME\n", - "\n", - "TYPES_DUMP = cfg.TYPES_DUMP\n", - "TYPES_COUNT = cfg.TYPES_COUNT\n", - "\n", - "IDX_COLS_LONG = cfg.IDX_COLS_LONG\n", - "\n", - "LOAD_DUMP = cfg.LOAD_DUMP\n", - "\n", - "CounterClass = cfg.CounterClass\n", - "FNAME_COUNTER = cfg.FNAME_COUNTER" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "out_folder = Path(OUT_FOLDER) / cfg.NAME\n", - "out_folder.mkdir(exist_ok=True, parents=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Selected IDs\n", - "\n", - "- currently only `Sample ID` is used\n", - "- path are to `.raw` raw files, not the output folder (could be changed)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_ids = pd.read_csv(FN_ID_OLD_NEW)\n", - "df_ids" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Counter" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "counter = CounterClass(FNAME_COUNTER)\n", - "counts = counter.get_df_counts()\n", - "counts" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if TYPES_COUNT:\n", - " counts = counts.convert_dtypes().astype({'Charge': int}) #\n", - "mask = counts['proportion'] >= FEAT_COMPLETNESS_CUTOFF\n", - "counts.loc[mask]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Based on selected samples, retain features that potentially could be in the subset\n", - "\n", - "- if 1000 samples are selected, and given at treshold of 25%, one would need at 
least 250 observations" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "treshold_counts = int(len(df_ids) * FEAT_COMPLETNESS_CUTOFF)\n", - "mask = counts['counts'] >= treshold_counts\n", - "counts.loc[mask]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "IDX_selected = counts.loc[mask].set_index(cfg.IDX_COLS_LONG[1:]).index\n", - "if len(cfg.IDX_COLS_LONG[1:]) > 1:\n", - " IDX_selected = IDX_selected.map(join_as_str)\n", - "IDX_selected" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Select Dumps" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "selected_dumps = df_ids[\"Sample ID\"]\n", - "selected_dumps = {k: counter.dumps[k] for k in selected_dumps}\n", - "selected_dumps = list(selected_dumps.items())\n", - "print(f\"Selected # {len(selected_dumps):,d} dumps.\")\n", - "selected_dumps[:10]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Collect in parallel" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def load_fct(path):\n", - " s = (\n", - " pd.read_csv(path, index_col=cfg.IDX_COLS_LONG[1:], usecols=[*cfg.IDX_COLS_LONG[1:], \"Intensity\"])\n", - " .squeeze()\n", - " .astype(pd.Int64Dtype())\n", - " )\n", - " if len(cfg.IDX_COLS_LONG[1:]) > 1:\n", - " s.index = s.index.map(join_as_str)\n", - " \n", - " return s\n", - "load_fct(selected_dumps[0][-1])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def collect(folders, index, load_fct):\n", - " current = multiprocessing.current_process()\n", - " i = current._identity[0] % N_WORKERS + 1\n", - " print(\" \", end=\"\", flush=True)\n", - "\n", - " failed = []\n", - " all = pd.DataFrame(index=index)\n", - "\n", - " with 
tqdm_notebook(total=len(folders), position=i) as pbar:\n", - " for id, path in folders:\n", - " try:\n", - " s = load_fct(path)\n", - " s.name = id\n", - " all = all.join(s, how='left')\n", - " except FileNotFoundError:\n", - " logging.warning(f\"File not found: {path}\")\n", - " failed.append((id, path))\n", - " except pd.errors.EmptyDataError:\n", - " logging.warning(f\"Empty file: {path}\")\n", - " failed.append((id, path))\n", - " pbar.update(1)\n", - " \n", - " return all" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Collect intensities in parallel" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "all = None # free memory\n", - "\n", - "collect_intensities = partial(collect, index=IDX_selected, load_fct=load_fct)\n", - "\n", - "N_WORKERS = 8\n", - "\n", - "with multiprocessing.Pool(N_WORKERS) as p:\n", - " all = list(\n", - " tqdm_notebook(\n", - " p.imap(collect_intensities,\n", - " np.array_split(selected_dumps, N_WORKERS)),\n", - " total=N_WORKERS,\n", - " )\n", - " ) \n", - " \n", - "all = pd.concat(all, axis=1)\n", - "all" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "all.memory_usage(deep=True).sum() / (2**20)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# all = pd.read_pickle('data/selected/proteinGroups/intensities_wide_selected_N00100_M07444.pkl')\n", - "all = all.rename(df_ids.set_index(\"Sample ID\")['new_sample_id'], axis=1)\n", - "all.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "fname = out_folder / config.insert_shape(all, 'intensities_wide_selected{}.pkl') \n", - "all.to_pickle(fname)\n", - "fname" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - 
"%%time\n", - "all.to_csv(fname.with_suffix('.csv'), chunksize=1_000)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Samples as rows, feature columns as columns\n", - "\n", - "- can fail due to memory -> next notebook" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.15" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/project/erda_03_training_data.py b/project/erda_03_training_data.py deleted file mode 100644 index 1d32f85d4..000000000 --- a/project/erda_03_training_data.py +++ /dev/null @@ -1,231 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.14.5 -# kernelspec: -# display_name: Python 3 (ipykernel) -# language: python -# name: python3 -# --- - -# %% [markdown] -# # Build a set of training data -# -# Use a set of (most) common peptides to create inital data sets -# -# - based on `Counter` over all outputs from search (here: MaxQuant) -# - keep based on threshold `FEAT_COMPLETNESS_CUTOFF` possible features -# - option: select samples based on `YEARS` (e.g. 
due constrain by a batch of strains) -# - collect in wide format data from output files - -# %% -from functools import partial -from pathlib import Path -import logging -import multiprocessing - -import numpy as np -import pandas as pd - -from tqdm.notebook import tqdm_notebook - -import vaep - -import config - - -# %% -def join_as_str(seq): - ret = "_".join(str(x) for x in seq) - return ret - - -# %% [markdown] -# ## Setup - -# %% [tag=parameters] -RANDOM_SEED: int = 42 # Random seed for reproducibility -FEAT_COMPLETNESS_CUTOFF = 0.25 # Minimal proportion of samples which have to share a feature -SAMPLE_COL = 'Sample ID' -OUT_FOLDER = 'data/selected/' -FN_ID_OLD_NEW: str = 'data/rename/selected_old_new_id_mapping.csv' # selected samples with pride and original id - - -# %% [markdown] -# Select a specific config file - -# %% -# options = ['peptides', 'evidence', 'proteinGroups'] -from config.training_data import peptides as cfg -# from config.training_data import evidence as cfg -# from config.training_data import proteinGroups as cfg - -cfg_dict = {k: getattr(cfg, k) for k in dir(cfg) if not k.startswith('_')} -cfg_dict - - -# %% [markdown] -# Set defaults from file (allows to potentially overwrite parameters) - -# %% -# normal structure of config.py files -NAME = cfg.NAME -BASE_NAME = cfg.BASE_NAME - -TYPES_DUMP = cfg.TYPES_DUMP -TYPES_COUNT = cfg.TYPES_COUNT - -IDX_COLS_LONG = cfg.IDX_COLS_LONG - -LOAD_DUMP = cfg.LOAD_DUMP - -CounterClass = cfg.CounterClass -FNAME_COUNTER = cfg.FNAME_COUNTER - -# %% -out_folder = Path(OUT_FOLDER) / cfg.NAME -out_folder.mkdir(exist_ok=True, parents=True) - - -# %% [markdown] -# ## Selected IDs -# -# - currently only `Sample ID` is used -# - path are to `.raw` raw files, not the output folder (could be changed) - -# %% -df_ids = pd.read_csv(FN_ID_OLD_NEW) -df_ids - -# %% [markdown] -# ## Counter - -# %% -counter = CounterClass(FNAME_COUNTER) -counts = counter.get_df_counts() -counts - -# %% -if TYPES_COUNT: - counts = 
counts.convert_dtypes().astype({'Charge': int}) # -mask = counts['proportion'] >= FEAT_COMPLETNESS_CUTOFF -counts.loc[mask] - -# %% [markdown] -# Based on selected samples, retain features that potentially could be in the subset -# -# - if 1000 samples are selected, and given at treshold of 25%, one would need at least 250 observations - -# %% -treshold_counts = int(len(df_ids) * FEAT_COMPLETNESS_CUTOFF) -mask = counts['counts'] >= treshold_counts -counts.loc[mask] - -# %% -IDX_selected = counts.loc[mask].set_index(cfg.IDX_COLS_LONG[1:]).index -if len(cfg.IDX_COLS_LONG[1:]) > 1: - IDX_selected = IDX_selected.map(join_as_str) -IDX_selected - -# %% [markdown] -# ## Select Dumps - -# %% -selected_dumps = df_ids["Sample ID"] -selected_dumps = {k: counter.dumps[k] for k in selected_dumps} -selected_dumps = list(selected_dumps.items()) -print(f"Selected # {len(selected_dumps):,d} dumps.") -selected_dumps[:10] - - -# %% [markdown] -# ## Collect in parallel - -# %% -def load_fct(path): - s = ( - pd.read_csv(path, index_col=cfg.IDX_COLS_LONG[1:], usecols=[*cfg.IDX_COLS_LONG[1:], "Intensity"]) - .squeeze() - .astype(pd.Int64Dtype()) - ) - if len(cfg.IDX_COLS_LONG[1:]) > 1: - s.index = s.index.map(join_as_str) - - return s -load_fct(selected_dumps[0][-1]) - - -# %% -def collect(folders, index, load_fct): - current = multiprocessing.current_process() - i = current._identity[0] % N_WORKERS + 1 - print(" ", end="", flush=True) - - failed = [] - all = pd.DataFrame(index=index) - - with tqdm_notebook(total=len(folders), position=i) as pbar: - for id, path in folders: - try: - s = load_fct(path) - s.name = id - all = all.join(s, how='left') - except FileNotFoundError: - logging.warning(f"File not found: {path}") - failed.append((id, path)) - except pd.errors.EmptyDataError: - logging.warning(f"Empty file: {path}") - failed.append((id, path)) - pbar.update(1) - - return all - - -# %% [markdown] -# ## Collect intensities in parallel - -# %% -all = None # free memory - 
-collect_intensities = partial(collect, index=IDX_selected, load_fct=load_fct) - -N_WORKERS = 8 - -with multiprocessing.Pool(N_WORKERS) as p: - all = list( - tqdm_notebook( - p.imap(collect_intensities, - np.array_split(selected_dumps, N_WORKERS)), - total=N_WORKERS, - ) - ) - -all = pd.concat(all, axis=1) -all - -# %% -all.memory_usage(deep=True).sum() / (2**20) - -# %% -# all = pd.read_pickle('data/selected/proteinGroups/intensities_wide_selected_N00100_M07444.pkl') -all = all.rename(df_ids.set_index("Sample ID")['new_sample_id'], axis=1) -all.head() - -# %% -# %%time -fname = out_folder / config.insert_shape(all, 'intensities_wide_selected{}.pkl') -all.to_pickle(fname) -fname - -# %% -# %%time -all.to_csv(fname.with_suffix('.csv'), chunksize=1_000) - -# %% [markdown] -# Samples as rows, feature columns as columns -# -# - can fail due to memory -> next notebook diff --git a/project/erda_04_transpose_file.ipynb b/project/erda_04_transpose_file.ipynb deleted file mode 100644 index 6ddcf659e..000000000 --- a/project/erda_04_transpose_file.ipynb +++ /dev/null @@ -1,284 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "8460a0bd-e679-4d04-ac84-ab0998900099", - "metadata": {}, - "source": [ - "# Transpose file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "76501fbd-b7fe-4010-8137-bb4cba2bee99", - "metadata": {}, - "outputs": [], - "source": [ - "from pathlib import Path\n", - "import pandas as pd\n", - "\n", - "import vaep\n", - "\n", - "import config" - ] - }, - { - "cell_type": "markdown", - "id": "23dabaa3-4ffb-41ff-8931-92f0684dd617", - "metadata": {}, - "source": [ - "Paramters" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "98952a84-2e8b-4416-a079-1ad9c755cfe5", - "metadata": {}, - "outputs": [], - "source": [ - "# out_folder = Path('data/selected/proteinGroups') \n", - "# fname = out_folder / 'intensities_wide_selected_N04550_M07444.pkl'\n", - "\n", - "# out_folder = Path('data/selected/peptides') 
\n", - "# fname = out_folder / 'intensities_wide_selected_N42881_M07441.pkl'\n", - "\n", - "out_folder = Path('data/selected/evidence') \n", - "fname = out_folder / 'intensities_wide_selected_N49560_M07444.pkl'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ffe6a395-66a7-4b50-9f58-d58746666122", - "metadata": {}, - "outputs": [], - "source": [ - "def get_template(fname, split='_N'):\n", - " ext = fname.suffix\n", - " stem = fname.stem.split(split)[0]\n", - " return f\"{stem}{{}}{ext}\"\n", - "\n", - "def memory_usage_in_mb(df):\n", - " return df.memory_usage(deep=True).sum() / (2**20)\n", - "\n", - "template = get_template(fname)\n", - "template" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d4abfa6d-7398-4c1f-8dac-bbf055931b70", - "metadata": {}, - "outputs": [], - "source": [ - "files_out = {}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "771acfc5-f9d1-4aaf-8f3c-a5b2298ab559", - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "df = pd.read_pickle(fname)\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1f2d73d4-96f3-4883-89a2-4085433a71c4", - "metadata": {}, - "outputs": [], - "source": [ - "df.memory_usage(deep=True).sum() / (2**20)" - ] - }, - { - "cell_type": "markdown", - "id": "eb613ede-5949-4c04-a5c5-a3e9b112de0d", - "metadata": {}, - "source": [ - "Here reading the csv file is slightly faster and consumes less memory.\n", - "\n", - "- dtype: `float64` -> missing values as `np.nan`\n", - "- but: saving to csv will be larger." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2ac14afc-7e94-45b3-b1c4-008ca8db5f00", - "metadata": {}, - "outputs": [], - "source": [ - "# %%time\n", - "# df = pd.read_csv(fname.with_suffix('.csv'), index_col=0)\n", - "# df.memory_usage(deep=True).sum() / (2**20) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "632fa73a-1db4-410f-b513-2f80b549948a", - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "count_samples = df.notna().sum()\n", - "\n", - "fname = out_folder / 'count_samples.json'\n", - "count_samples.to_json(fname)\n", - "\n", - "vaep.plotting.make_large_descriptors(size='medium')\n", - "\n", - "ax = count_samples.sort_values().plot(rot=90, ylabel='observations')\n", - "ax.yaxis.set_major_formatter(\"{x:,.0f}\")\n", - "vaep.savefig(ax.get_figure(), fname)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "57eca231-2bfe-4939-bc93-3b31836a0379", - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "df = df.T\n", - "df.memory_usage(deep=True).sum() / (2**20)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "11e30e15-d802-41fb-9cba-7e08d91eaf9d", - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "fname = out_folder / config.insert_shape(df, template=template)\n", - "files_out[fname.name] = fname.as_posix()\n", - "df.to_pickle(fname)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cc4f2a51-0c7e-4394-bb34-eda4146844ff", - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "fname = fname.with_suffix('.csv')\n", - "files_out[fname.name] = fname.as_posix()\n", - "df.to_csv(fname, chunksize=1_000)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3bc9250b-9a0c-4a44-b375-e1c06d9881b3", - "metadata": {}, - "outputs": [], - "source": [ - "count_features = df.notna().sum()\n", - "fname = out_folder / 'count_feat.json'\n", - "count_features.to_json(fname)\n", - "\n", - "ax = 
count_features.sort_values().plot(rot=90, ylabel='observations')\n", - "ax.yaxis.set_major_formatter(\"{x:,.0f}\")\n", - "vaep.savefig(ax.get_figure(), fname)" - ] - }, - { - "cell_type": "markdown", - "id": "448678a9-f239-4fc0-89a5-afbf6fea62f8", - "metadata": {}, - "source": [ - "## Present abesent pattern" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "02f74eb0-2ffd-4014-9d52-583f33ba4e29", - "metadata": {}, - "outputs": [], - "source": [ - "df = df.notna().astype(pd.Int8Dtype())\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "017e027a-6503-4033-947d-d306a08e7a27", - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "fname = out_folder / config.insert_shape(df, 'absent_0_present_1_selected{}.pkl')\n", - "\n", - "files_out[fname.name] = fname.as_posix()\n", - "df.to_pickle(fname)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d3fa48d3-cc66-4f90-8081-c42cced74d72", - "metadata": {}, - "outputs": [], - "source": [ - "files_outfname = fname.with_suffix('.csv')\n", - "files_out[fname.name] = fname.as_posix()\n", - "df.replace(0, pd.NA).to_csv(fname.with_suffix('.csv'), chunksize=1_000)" - ] - }, - { - "cell_type": "markdown", - "id": "4b565b50-13fd-48ac-a0c2-c52ce31d10a5", - "metadata": {}, - "source": [ - "## Files written" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8cd67887-c1b8-4718-910c-7d0871f71f71", - "metadata": {}, - "outputs": [], - "source": [ - "files_out" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.15" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git 
a/project/erda_04_transpose_file.py b/project/erda_04_transpose_file.py deleted file mode 100644 index c9c6db02c..000000000 --- a/project/erda_04_transpose_file.py +++ /dev/null @@ -1,136 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.14.5 -# kernelspec: -# display_name: Python 3 (ipykernel) -# language: python -# name: python3 -# --- - -# %% [markdown] -# # Transpose file - -# %% -from pathlib import Path -import pandas as pd - -import vaep - -import config - -# %% [markdown] -# Paramters - -# %% -# out_folder = Path('data/selected/proteinGroups') -# fname = out_folder / 'intensities_wide_selected_N04550_M07444.pkl' - -# out_folder = Path('data/selected/peptides') -# fname = out_folder / 'intensities_wide_selected_N42881_M07441.pkl' - -out_folder = Path('data/selected/evidence') -fname = out_folder / 'intensities_wide_selected_N49560_M07444.pkl' - - -# %% -def get_template(fname, split='_N'): - ext = fname.suffix - stem = fname.stem.split(split)[0] - return f"{stem}{{}}{ext}" - -def memory_usage_in_mb(df): - return df.memory_usage(deep=True).sum() / (2**20) - -template = get_template(fname) -template - -# %% -files_out = {} - -# %% -# %%time -df = pd.read_pickle(fname) -df.head() - -# %% -df.memory_usage(deep=True).sum() / (2**20) - -# %% [markdown] -# Here reading the csv file is slightly faster and consumes less memory. -# -# - dtype: `float64` -> missing values as `np.nan` -# - but: saving to csv will be larger. 
- -# %% -# # %%time -# df = pd.read_csv(fname.with_suffix('.csv'), index_col=0) -# df.memory_usage(deep=True).sum() / (2**20) - -# %% -# %%time -count_samples = df.notna().sum() - -fname = out_folder / 'count_samples.json' -count_samples.to_json(fname) - -vaep.plotting.make_large_descriptors(size='medium') - -ax = count_samples.sort_values().plot(rot=90, ylabel='observations') -ax.yaxis.set_major_formatter("{x:,.0f}") -vaep.savefig(ax.get_figure(), fname) - -# %% -# %%time -df = df.T -df.memory_usage(deep=True).sum() / (2**20) - -# %% -# %%time -fname = out_folder / config.insert_shape(df, template=template) -files_out[fname.name] = fname.as_posix() -df.to_pickle(fname) - -# %% -# %%time -fname = fname.with_suffix('.csv') -files_out[fname.name] = fname.as_posix() -df.to_csv(fname, chunksize=1_000) - -# %% -count_features = df.notna().sum() -fname = out_folder / 'count_feat.json' -count_features.to_json(fname) - -ax = count_features.sort_values().plot(rot=90, ylabel='observations') -ax.yaxis.set_major_formatter("{x:,.0f}") -vaep.savefig(ax.get_figure(), fname) - -# %% [markdown] -# ## Present abesent pattern - -# %% -df = df.notna().astype(pd.Int8Dtype()) -df - -# %% -# %%time -fname = out_folder / config.insert_shape(df, 'absent_0_present_1_selected{}.pkl') - -files_out[fname.name] = fname.as_posix() -df.to_pickle(fname) - -# %% -files_outfname = fname.with_suffix('.csv') -files_out[fname.name] = fname.as_posix() -df.replace(0, pd.NA).to_csv(fname.with_suffix('.csv'), chunksize=1_000) - -# %% [markdown] -# ## Files written - -# %% -files_out diff --git a/project/erda_05_parse_paramter_files.ipynb b/project/erda_05_parse_paramter_files.ipynb deleted file mode 100644 index 60a05b72d..000000000 --- a/project/erda_05_parse_paramter_files.ipynb +++ /dev/null @@ -1,548 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "368f6451-9bec-4ca6-9921-c5ab69c23153", - "metadata": {}, - "source": [ - "# Parse parameter files" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "3c3a8745-20e5-4353-a3d6-950a3bc1dd6c", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from pprint import pprint\n", - "import collections\n", - "from pathlib import Path\n", - "from tqdm.notebook import tqdm\n", - "\n", - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "92c20a10-a814-4a50-8186-d05fa1e14498", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "test_file = 'data/mqpar_example.xml'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e1645c86-7799-46db-92cd-cd5157cd11d8", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def extend_tuple(t, target_length: int):\n", - " if not isinstance(t, tuple):\n", - " raise TypeError(\n", - " f\"Wrong type provided. Expected tuple, got {type(t)} : {t!r}\")\n", - " if len(t) > target_length:\n", - " raise ValueError(\n", - " f\"Tuple is too long (got {len(t)}, expected {target_length}: {t!r}\")\n", - " return t + (None,) * (target_length - len(t))\n", - "# extend_tuple(\"test\", 4)\n", - "# extend_tuple(('k1', 'k2'), 1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4341046b-13d2-49c5-924c-a73fd9f366d1", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def extend_tuples_with_none(list_of_tuples, target_length):\n", - " extended_tuples = []\n", - " for tuple_ in list_of_tuples:\n", - " # if len(tuple_) > target_length:\n", - " # raise ValueError(f\"tuple is too long: {len(tuple_)}\")\n", - " extended_tuple = extend_tuple(tuple_, target_length)\n", - " extended_tuples.append(extended_tuple)\n", - " return extended_tuples\n", - "\n", - "\n", - "list_of_tuples = [(1, 2), (3, 4, 5), (6,)]\n", - "extend_tuples_with_none(list_of_tuples, 3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8691214b-65a1-4c27-92d7-f927dbac61bf", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import 
xml.etree.ElementTree as ET\n", - "\n", - "def add_record(data, tag, record):\n", - " if tag in data:\n", - " if isinstance(data[tag], list):\n", - " data[tag].append(record)\n", - " else:\n", - " data[tag] = [data[tag], record]\n", - " else:\n", - " data[tag] = record\n", - " return data\n", - "\n", - "\n", - "def read_xml_record(element):\n", - " data = dict()\n", - " for child in element:\n", - " if len(child) > 1 and child.tag:\n", - " # if there is a list, process each element one by one\n", - " # either nested or a plain text\n", - " data[child.tag] = [add_record(dict(), tag=child.tag, record=read_xml_record(child) if not (\n", - " child.text and child.text.strip()) else child.text.strip()) for child in child]\n", - " elif child.text and child.text.strip():\n", - " # just plain text record\n", - " data = add_record(data=data, tag=child.tag,\n", - " record=child.text.strip())\n", - " else:\n", - " record = read_xml_record(child)\n", - " data = add_record(data, child.tag, record)\n", - " if not data:\n", - " # empty strings and None are normalzied to None\n", - " return None\n", - " return data\n", - "\n", - "\n", - "tree = ET.parse(test_file)\n", - "root = tree.getroot()\n", - "\n", - "record_example = read_xml_record(root)\n", - "record_example" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "10f3103a-8133-4d01-9c6a-efcc75d85295", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "\n", - "\n", - "def flatten_dict_of_dicts(d: dict, parent_key: str = '') -> dict:\n", - " \"\"\"Build tuples for nested dictionaries for use as `pandas.MultiIndex`.\n", - "\n", - " Parameters\n", - " ----------\n", - " d : dict\n", - " Nested dictionary for which all keys are flattened to tuples.\n", - " parent_key : str, optional\n", - " Outer key (used for recursion), by default ''\n", - "\n", - " Returns\n", - " -------\n", - " dict\n", - " Flattend dictionary with tuple keys: {(outer_key, ..., inner_key) : value}\n", - " \"\"\"\n", - " # 
simplified and adapted from: https://stackoverflow.com/a/6027615/9684872\n", - " items = []\n", - " for k, v in d.items():\n", - " new_key = parent_key + (k,) if parent_key else (k,)\n", - " if isinstance(v, collections.abc.MutableMapping):\n", - " items.extend(flatten_dict_of_dicts(v, parent_key=new_key))\n", - " elif isinstance(v, list):\n", - " for item in v:\n", - " if isinstance(item, collections.abc.MutableMapping):\n", - " items.extend(flatten_dict_of_dicts(\n", - " item, parent_key=new_key))\n", - " elif isinstance(item, str):\n", - " items.append((new_key, item))\n", - " else:\n", - " raise ValueError(f\"Unknown item: {item:r}\")\n", - " else:\n", - " items.append((new_key, v))\n", - " return items\n", - "\n", - "\n", - "case_1 = {'k': 'v'}\n", - "case_2 = {'k1': {'k2': 'v1', 'k3': 'v2'}}\n", - "case_3 = {'k1': {'k2': [{'k4': 'v1'}, {'k4': 'v2'}]}}\n", - "case_4 = {'k1': [{'k2': {'k4': 'v1', 'k5': 'v2'}},\n", - " {'k2': {'k4': 'v1', 'k5': 'v2'}}]}\n", - "case_5 = {'restrictMods': [{'string': 'Oxidation (M)'},\n", - " {'string': 'Acetyl (Protein N-term)'}]}\n", - "case_6 = {'variableModifications': {\n", - " 'string': ['Oxidation (M)',\n", - " 'Acetyl (Protein N-term)']}}\n", - "\n", - "test_cases = [case_1, case_2, case_3, case_4, case_5, case_6]\n", - "\n", - "for case in (test_cases):\n", - " pprint(flatten_dict_of_dicts(case))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c8e19d55-d012-44be-8869-0271e16a7093", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "entries = list()\n", - "for case in test_cases:\n", - " entries.extend(flatten_dict_of_dicts(case))\n", - "[(extend_tuple(k, 4), v) for (k, v) in entries]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "76092ebb-e31e-4bf2-b350-090c51d1e1bc", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def build_Series_from_records(records, index_length=4):\n", - " records = flatten_dict_of_dicts(records)\n", - " idx = 
pd.MultiIndex.from_tuples(\n", - " (extend_tuple(k, index_length) for (k, v) in records))\n", - " return pd.Series((v for (k, v) in records), index=idx)\n", - "\n", - "\n", - "tree = ET.parse(test_file)\n", - "root = tree.getroot()\n", - "\n", - "record_example = read_xml_record(root)\n", - "flattend = build_Series_from_records(record_example, 4)\n", - "flattend.to_frame('example')" - ] - }, - { - "cell_type": "markdown", - "id": "e63a712a-a6e8-46dc-befc-bc6a98a6a153", - "metadata": {}, - "source": [ - "## Parameters" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e718825e-428a-4d99-81e6-03cce50da2fc", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# folders to check\n", - "folder_w_params = Path('/home/jovyan/work/mqpar_files')\n", - "root = Path('/home/jovyan/work/')\n", - "dumped_folder = 'mq_out'\n", - "dumped_folder_names = 'mq_out_folder.txt'\n", - "# out\n", - "fname_out = 'data/all_parameter_files.csv'" - ] - }, - { - "cell_type": "markdown", - "id": "891ee5ec-03a2-4d66-845b-a2938c9018f7", - "metadata": {}, - "source": [ - "## Dump of some parameter files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fbdc4c32-9995-43ae-aff3-9b6358cf9ea2", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def read_file(file, name, idx_levels=4) -> pd.Series:\n", - " tree = ET.parse(file)\n", - " root = tree.getroot()\n", - " record = read_xml_record(root)\n", - " s = build_Series_from_records(record, idx_levels)\n", - " s.name = name\n", - " return s" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c779378e-8b0c-440b-a43a-c1a10939cf8f", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "parameter_files_part_1 = list()\n", - "for file in tqdm(folder_w_params.iterdir()):\n", - " s_parameters = read_file(file, name=file.stem[6:])\n", - " parameter_files_part_1.append(s_parameters)\n", - "\n", - "parameter_files_part_1 = 
pd.concat(parameter_files_part_1, axis=1).T\n", - "parameter_files_part_1" - ] - }, - { - "cell_type": "markdown", - "id": "a5db69d3-89ed-4670-ae9a-d5e548e43106", - "metadata": {}, - "source": [ - "## Search for parameter files in output folders\n", - "\n", - "- read folders from dump (for stable execution on erda)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6dea94bd-dc1b-4e5b-ad99-0c6f1dc34682", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# # read as generator if file does not exist:\n", - "# folders = list(Path('/home/jovyan/work/mq_out').iterdir())\n", - "\n", - "root = Path('/home/jovyan/work/')\n", - "with open(root / dumped_folder_names) as f:\n", - " folders = list()\n", - " for line in f:\n", - " fpath = root / dumped_folder / line.strip()\n", - " folders.append(fpath)" - ] - }, - { - "cell_type": "markdown", - "id": "fd65f35b-7818-4275-961f-816aedfaa486", - "metadata": {}, - "source": [ - "read paramter files:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "32add481-89ba-4c22-b419-f025f98c2f2c", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "parameter_files_part_2 = list()\n", - "i = 0\n", - "for folder in tqdm(folders):\n", - " for file in folder.iterdir():\n", - " if file.suffix == '.xml':\n", - " s_parameters = read_file(file, file.parent.name)\n", - " parameter_files_part_2.append(s_parameters)\n", - " i += 1\n", - "\n", - "parameter_files_part_2 = pd.concat(parameter_files_part_2, axis=1).T\n", - "parameter_files_part_2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bd945d90-8416-4ddd-9b46-ab7e47ed1840", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "print(f\"Found {i} parameter files\")" - ] - }, - { - "cell_type": "markdown", - "id": "71cf0a35-0cf9-4fb1-9abc-08cad21d4fae", - "metadata": {}, - "source": [ - "## Combine both sets" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": 
"985d4fab-49ad-45b7-ae2d-de66cbdae5a4", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "parameter_files = pd.concat([parameter_files_part_1, parameter_files_part_2])\n", - "# del parameter_files_part_1, parameter_files_part_2\n", - "parameter_files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "81944709-5e74-4067-ab4a-d20e2054ecd0", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# 11066" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "48197990-124f-4abc-8c42-3b4982f3cd4b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "parameter_files = parameter_files.infer_objects()\n", - "parameter_files.dtypes.value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4e0627e5-22c2-4b08-be5b-b57866b15d13", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "parameter_files.to_csv(fname_out)" - ] - }, - { - "cell_type": "markdown", - "id": "33bbf09d-6059-4abc-96d6-677f1dfb3eb5", - "metadata": {}, - "source": [ - "Read aggregated parameters dump" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d749fdc5-9f56-45ab-879c-1e01977e733a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "parameter_files = pd.read_csv(fname_out, index_col=0, header=list(range(4)))\n", - "parameter_files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "858a7dbf-a5e3-47d3-98e4-f130306cfbf0", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "parameter_files.dtypes.value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "88c9f10e-5a4d-4653-b6f8-cfe6152f1b5a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "parameter_files.loc[:, parameter_files.dtypes == 'object']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9a6c70d1-c5eb-49ab-82f1-fe919a8b60e7", - "metadata": { - "tags": [] - }, - "outputs": 
[], - "source": [ - "parameter_files['fastaFiles']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "319e99f2-9236-406c-b95a-493864dcbf03", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "parameter_files.droplevel(-1, axis=1)['fastaFiles']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1f1a185a-b0e8-40bd-b35c-4c9be49099f7", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "parameter_files.columns.to_list()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5319b502-58ac-4a5c-94dc-4c9915f302ee", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/project/erda_05_parse_paramter_files.py b/project/erda_05_parse_paramter_files.py deleted file mode 100644 index 7f3cb7603..000000000 --- a/project/erda_05_parse_paramter_files.py +++ /dev/null @@ -1,290 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.15.1 -# kernelspec: -# display_name: Python 3 (ipykernel) -# language: python -# name: python3 -# --- - -# %% [markdown] -# # Parse parameter files - -# %% -from pprint import pprint -import collections -from pathlib import Path -from tqdm.notebook import tqdm - -import pandas as pd - -# %% -import logging - -import xml.etree.ElementTree as ET - -logger = logging.getLogger() - -test_file = 'data/mqpar_example.xml' - - -# %% -def extend_tuple(t, target_length: int): - if not isinstance(t, tuple): - raise TypeError( - f"Wrong 
type provided. Expected tuple, got {type(t)} : {t!r}") - if len(t) > target_length: - raise ValueError( - f"Tuple is too long (got {len(t)}, expected {target_length}: {t!r}") - return t + (None,) * (target_length - len(t)) -# extend_tuple("test", 4) -# extend_tuple(('k1', 'k2'), 1) - - -# %% -def extend_tuples_with_none(list_of_tuples, target_length): - extended_tuples = [] - for tuple_ in list_of_tuples: - # if len(tuple_) > target_length: - # raise ValueError(f"tuple is too long: {len(tuple_)}") - extended_tuple = extend_tuple(tuple_, target_length) - extended_tuples.append(extended_tuple) - return extended_tuples - - -list_of_tuples = [(1, 2), (3, 4, 5), (6,)] -extend_tuples_with_none(list_of_tuples, 3) - -# %% - - -def add_record(data, tag, record): - if tag in data: - if isinstance(data[tag], list): - data[tag].append(record) - else: - data[tag] = [data[tag], record] - else: - data[tag] = record - return data - - -def read_xml_record(element): - data = dict() - for child in element: - if len(child) > 1 and child.tag: - # if there is a list, process each element one by one - # either nested or a plain text - data[child.tag] = [add_record(dict(), tag=child.tag, record=read_xml_record(child) if not ( - child.text and child.text.strip()) else child.text.strip()) for child in child] - elif child.text and child.text.strip(): - # just plain text record - data = add_record(data=data, tag=child.tag, - record=child.text.strip()) - else: - record = read_xml_record(child) - data = add_record(data, child.tag, record) - if not data: - # empty strings and None are normalzied to None - return None - return data - - -tree = ET.parse(test_file) -root = tree.getroot() - -record_example = read_xml_record(root) -record_example - -# %% - - -def flatten_dict_of_dicts(d: dict, parent_key: str = '') -> dict: - """Build tuples for nested dictionaries for use as `pandas.MultiIndex`. - - Parameters - ---------- - d : dict - Nested dictionary for which all keys are flattened to tuples. 
- parent_key : str, optional - Outer key (used for recursion), by default '' - - Returns - ------- - dict - Flattend dictionary with tuple keys: {(outer_key, ..., inner_key) : value} - """ - # simplified and adapted from: https://stackoverflow.com/a/6027615/9684872 - items = [] - for k, v in d.items(): - new_key = parent_key + (k,) if parent_key else (k,) - if isinstance(v, collections.abc.MutableMapping): - items.extend(flatten_dict_of_dicts(v, parent_key=new_key)) - elif isinstance(v, list): - for item in v: - if isinstance(item, collections.abc.MutableMapping): - items.extend(flatten_dict_of_dicts( - item, parent_key=new_key)) - elif isinstance(item, str): - items.append((new_key, item)) - else: - raise ValueError(f"Unknown item: {item:r}") - else: - items.append((new_key, v)) - return items - - -case_1 = {'k': 'v'} -case_2 = {'k1': {'k2': 'v1', 'k3': 'v2'}} -case_3 = {'k1': {'k2': [{'k4': 'v1'}, {'k4': 'v2'}]}} -case_4 = {'k1': [{'k2': {'k4': 'v1', 'k5': 'v2'}}, - {'k2': {'k4': 'v1', 'k5': 'v2'}}]} -case_5 = {'restrictMods': [{'string': 'Oxidation (M)'}, - {'string': 'Acetyl (Protein N-term)'}]} -case_6 = {'variableModifications': { - 'string': ['Oxidation (M)', - 'Acetyl (Protein N-term)']}} - -test_cases = [case_1, case_2, case_3, case_4, case_5, case_6] - -for case in (test_cases): - pprint(flatten_dict_of_dicts(case)) - -# %% -entries = list() -for case in test_cases: - entries.extend(flatten_dict_of_dicts(case)) -[(extend_tuple(k, 4), v) for (k, v) in entries] - - -# %% -def build_Series_from_records(records, index_length=4): - records = flatten_dict_of_dicts(records) - idx = pd.MultiIndex.from_tuples( - (extend_tuple(k, index_length) for (k, v) in records)) - return pd.Series((v for (k, v) in records), index=idx) - - -tree = ET.parse(test_file) -root = tree.getroot() - -record_example = read_xml_record(root) -flattend = build_Series_from_records(record_example, 4) -flattend.to_frame('example') - -# %% [markdown] -# ## Parameters - -# %% -# folders to 
check -folder_w_params = Path('/home/jovyan/work/mqpar_files') -root = Path('/home/jovyan/work/') -dumped_folder = 'mq_out' -dumped_folder_names = 'mq_out_folder.txt' -# out -fname_out = 'data/all_parameter_files.csv' - - -# %% [markdown] -# ## Dump of some parameter files - -# %% -def read_file(file, name, idx_levels=4) -> pd.Series: - tree = ET.parse(file) - root = tree.getroot() - record = read_xml_record(root) - s = build_Series_from_records(record, idx_levels) - s.name = name - return s - - -# %% -parameter_files_part_1 = list() -for file in tqdm(folder_w_params.iterdir()): - s_parameters = read_file(file, name=file.stem[6:]) - parameter_files_part_1.append(s_parameters) - -parameter_files_part_1 = pd.concat(parameter_files_part_1, axis=1).T -parameter_files_part_1 - -# %% [markdown] -# ## Search for parameter files in output folders -# -# - read folders from dump (for stable execution on erda) - -# %% -# # read as generator if file does not exist: -# folders = list(Path('/home/jovyan/work/mq_out').iterdir()) - -root = Path('/home/jovyan/work/') -with open(root / dumped_folder_names) as f: - folders = list() - for line in f: - fpath = root / dumped_folder / line.strip() - folders.append(fpath) - -# %% [markdown] -# read paramter files: - -# %% -parameter_files_part_2 = list() -i = 0 -for folder in tqdm(folders): - for file in folder.iterdir(): - if file.suffix == '.xml': - s_parameters = read_file(file, file.parent.name) - parameter_files_part_2.append(s_parameters) - i += 1 - -parameter_files_part_2 = pd.concat(parameter_files_part_2, axis=1).T -parameter_files_part_2 - -# %% -print(f"Found {i} parameter files") - -# %% [markdown] -# ## Combine both sets - -# %% -parameter_files = pd.concat([parameter_files_part_1, parameter_files_part_2]) -# del parameter_files_part_1, parameter_files_part_2 -parameter_files - -# %% -# 11066 - -# %% -parameter_files = parameter_files.infer_objects() -parameter_files.dtypes.value_counts() - -# %% 
-parameter_files.to_csv(fname_out) - -# %% [markdown] -# Read aggregated parameters dump - -# %% -parameter_files = pd.read_csv(fname_out, index_col=0, header=list(range(4))) -parameter_files - -# %% -parameter_files.dtypes.value_counts() - -# %% -parameter_files.loc[:, parameter_files.dtypes == 'object'] - -# %% -parameter_files['fastaFiles'] - -# %% -parameter_files.droplevel(-1, axis=1)['fastaFiles'] - -# %% -parameter_files.columns.to_list() - -# %% diff --git a/project/erda_06_analyze_parameters.ipynb b/project/erda_06_analyze_parameters.ipynb deleted file mode 100644 index 9887c58b8..000000000 --- a/project/erda_06_analyze_parameters.ipynb +++ /dev/null @@ -1,153 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "8154dacf", - "metadata": {}, - "source": [ - "# Analyzse and rename dumped parameters" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ad32c7c1", - "metadata": {}, - "outputs": [], - "source": [ - "import vaep\n", - "import pandas as pd\n", - "\n", - "fname_mq_params = 'data/all_parameter_files.csv'\n", - "fname_id_mappings = 'data/rename/selected_old_new_id_mapping.csv'\n", - "\n", - "fname_out = 'data/selected_parameter_files.csv'\n", - "\n", - "parameter_files = pd.read_csv(fname_mq_params, index_col=0, header=list(range(4))\n", - " )\n", - "parameter_files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "09d029d6", - "metadata": {}, - "outputs": [], - "source": [ - "# thread experiments...\n", - "vaep.pandas.show_columns_with_variation(\n", - " parameter_files\n", - " .loc[parameter_files.index.duplicated(keep=False)])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "05f5fc43", - "metadata": {}, - "outputs": [], - "source": [ - "parameter_files = parameter_files.loc[~parameter_files.index.duplicated()]\n", - "parameter_files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b92ac981", - "metadata": {}, - "outputs": [], - "source": [ - 
"id_mappings = pd.read_csv(fname_id_mappings, index_col=0, usecols=['Sample ID', 'new_sample_id'])\n", - "id_mappings.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "245e795a", - "metadata": {}, - "outputs": [], - "source": [ - "parameter_files.loc[id_mappings.index]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ef8ffc10", - "metadata": {}, - "outputs": [], - "source": [ - "sel = (parameter_files\n", - " .loc[id_mappings.index]\n", - " .drop('filePaths', axis=1)\n", - " .rename(id_mappings['new_sample_id']))\n", - "sel.to_csv(fname_out)\n", - "sel" - ] - }, - { - "cell_type": "markdown", - "id": "b1f69026", - "metadata": {}, - "source": [ - "-inf and + inf cannot be handled correctly (fullMinMz, fullMaxMz)\n", - "number of Threads differs as the setting was varied in the beginning (most runs used 4 threads)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6b5ae165", - "metadata": {}, - "outputs": [], - "source": [ - "sel_with_diffs = vaep.pandas.show_columns_with_variation(sel)\n", - "sel_with_diffs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "15f413d3", - "metadata": {}, - "outputs": [], - "source": [ - "sel_with_diffs.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ee554c42", - "metadata": {}, - "outputs": [], - "source": [ - "sel[('numThreads', 'nan', 'nan', 'nan')].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8dc29350", - "metadata": {}, - "outputs": [], - "source": [ - "# 388 columns are identical\n", - "sel.drop(sel_with_diffs.columns, axis=1\n", - " ).drop_duplicates()" - ] - } - ], - "metadata": { - "jupytext": { - "cell_metadata_filter": "-all", - "main_language": "python", - "notebook_metadata_filter": "-all" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/project/erda_06_analyze_parameters.py b/project/erda_06_analyze_parameters.py deleted file mode 
100644 index e2449ae5d..000000000 --- a/project/erda_06_analyze_parameters.py +++ /dev/null @@ -1,59 +0,0 @@ -# %% [markdown] -# # Analyzse and rename dumped parameters - -# %% -import vaep -import pandas as pd - -fname_mq_params = 'data/all_parameter_files.csv' -fname_id_mappings = 'data/rename/selected_old_new_id_mapping.csv' - -fname_out = 'data/selected_parameter_files.csv' - -parameter_files = pd.read_csv(fname_mq_params, index_col=0, header=list(range(4)) - ) -parameter_files - -# %% -# thread experiments... -vaep.pandas.show_columns_with_variation( - parameter_files - .loc[parameter_files.index.duplicated(keep=False)]) - -# %% -parameter_files = parameter_files.loc[~parameter_files.index.duplicated()] -parameter_files - -# %% -id_mappings = pd.read_csv(fname_id_mappings, index_col=0, usecols=['Sample ID', 'new_sample_id']) -id_mappings.head() - -# %% -parameter_files.loc[id_mappings.index] - -# %% -sel = (parameter_files - .loc[id_mappings.index] - .drop('filePaths', axis=1) - .rename(id_mappings['new_sample_id'])) -sel.to_csv(fname_out) -sel - -# %% [markdown] -# -inf and + inf cannot be handled correctly (fullMinMz, fullMaxMz) -# number of Threads differs as the setting was varied in the beginning (most runs used 4 threads) - -# %% -sel_with_diffs = vaep.pandas.show_columns_with_variation(sel) -sel_with_diffs - -# %% -sel_with_diffs.describe() - -# %% -sel[('numThreads', 'nan', 'nan', 'nan')].value_counts() - -# %% -# 388 columns are identical -sel.drop(sel_with_diffs.columns, axis=1 - ).drop_duplicates() diff --git a/project/erda_12_explore_raw_MQ_data.ipynb b/project/erda_12_explore_raw_MQ_data.ipynb deleted file mode 100644 index 03dfa8716..000000000 --- a/project/erda_12_explore_raw_MQ_data.ipynb +++ /dev/null @@ -1,1624 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Explore MaxQuant (MQ) output files of single runs\n", - "\n", - "The `erda_03_training_data.ipynb` notebook does extract information to be used 
as training data.\n", - "File specific one could also use the retention time analysis to identify _valid_ co-occurring peptides to be use during training.\n", - "Potentially this preprocessing step can be used at inference time.\n", - "\n", - "This notebook contains some relevant analysis for a specific `txt` output-folder in the current project\n", - "\n", - "##### Analysis overview\n", - "\n", - "> Report for example data\n", - "\n", - "- relation between `peptides.txt` and `evidence.txt`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import logging\n", - "import random\n", - "\n", - "import ipywidgets as widgets\n", - "\n", - "from numpy.testing import assert_almost_equal\n", - "from numpy import random\n", - "import pandas as pd\n", - "# pd.options.display.float_format = '{:,.1f}'.format\n", - "\n", - "import vaep.pandas\n", - "from vaep.pandas import length\n", - "from vaep.io.mq import FASTA_KEYS, MaxQuantOutput\n", - "from vaep.io import search_subfolders\n", - "import vaep.io.mq as mq\n", - "from vaep.io.mq import mq_col\n", - "\n", - "\n", - "from vaep.logging import setup_nb_logger\n", - "logger = setup_nb_logger()\n", - "\n", - "##################\n", - "##### CONFIG #####\n", - "##################\n", - "\n", - "import config\n", - "from config import FIGUREFOLDER\n", - "# from config import FOLDER_RAW_DATA\n", - "from config import FOLDER_MQ_TXT_DATA as FOLDER_RAW_DATA\n", - "\n", - "\n", - "print(f\"Search Raw-Files on path: {FOLDER_RAW_DATA}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "folders = search_subfolders(path=FOLDER_RAW_DATA, depth=1, exclude_root=True)\n", - "w_folder = widgets.Dropdown(options=folders, description='Select a folder')\n", - "w_folder" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mq_output = 
MaxQuantOutput(folder=w_folder.value)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Some important columns\n", - "\n", - "Grouped by a namedtuple allowing attribute access" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mq_col" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## `peptides.txt`\n", - "\n", - "> For reference on final \"result\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pd.options.display.max_columns = len(mq_output.peptides.columns)\n", - "mq_output.peptides" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`peptides.txt` contains aggregated peptides" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "intensities = mq_output.peptides.Intensity\n", - "intensities" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Not all peptides are associated with a Protein or Gene by MQ, although there is evidence for the peptide. This is due to potential `CON_`taminants in the medium which is encouded by default by MQ." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mq_output.peptides[FASTA_KEYS].isna().sum()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## `evidence.txt` \n", - "\n", - "contains\n", - "- retention time for peptides\n", - "- has repeated measures of the same sequences, which are all aggregated in `peptides.txt`\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pd.options.display.max_columns = len(mq_output.evidence.columns)\n", - "mq_output.evidence" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mq_output.evidence.Charge.value_counts().sort_index()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mask = mq_output.evidence[mq_col.RETENTION_TIME] != mq_output.evidence[mq_col.CALIBRATED_RETENTION_TIME]\n", - "print(\"Number of non-matching retention times between calibrated and non-calibrated column:\", mask.sum())\n", - "\n", - "# try:\n", - "# assert mask.sum() == 0, \"More than one replica?\"\n", - "# except AssertionError as e:\n", - "# logger.warning(e)\n", - "assert mask.sum() == 0, \"More than one replica?\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Using only one quality control sample, leaves the initial retention time as is." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "rt = mq_output.evidence[mq_col.CALIBRATED_RETENTION_TIME]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pep_measured_freq_in_evidence = rt.index.value_counts()\n", - "pep_measured_freq_in_evidence.iloc[:10] # top10 repeatedly measured peptides" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "max_observed_pep_evidence = pep_measured_freq_in_evidence.index[0]\n", - "mq_output.evidence.loc[\n", - " max_observed_pep_evidence,\n", - " :\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The retention time index is non-unique." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print('The retention time index is unique: {}'.format(rt.index.is_unique))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Peptides observed more than once at different times." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mask_duplicated = rt.index.duplicated(keep=False)\n", - "rt_duplicates = rt.loc[mask_duplicated]\n", - "rt_duplicates" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mq_output.evidence.loc[mask_duplicated, [\n", - " 'Charge', 'Calibrated retention time', 'Intensity']]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Calculate median intensity and calculate standard deviation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "_agg_functions = ['median', 'std']\n", - "\n", - "rt_summary = rt.groupby(level=0).agg(_agg_functions)\n", - "rt_summary" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's see several quartiles for both median and standard deviation (the columns are independent from each other) for the retention time" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "rt_summary.describe(percentiles=[0.8, 0.9, 0.95, 0.96, 0.97, 0.98, 0.99])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "rt_summary['median']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A large standard-deviation indicates that the intensity values originate from time points (in min) widely spread." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Peptides observed several times a different points of experimental run" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mask = rt_summary['std'] > 30.0\n", - "mask_indices = mask[mask].index\n", - "rt.loc[mask_indices]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Peptides with differen RT have different charges." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mq_output.evidence.loc[mask_indices]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Model evaluation possibility: Discard samples with several measurements from an experiment and predict value. See which intensity measurement corresponds more closely. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "_peptide = random.choice(mask_indices)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "f'evidence_{_peptide}_{w_folder.value.stem}'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "peptide_view = mq_output.evidence.loc[_peptide]\n", - "peptide_view = (peptide_view[\n", - " vaep.pandas.get_unique_non_unique_columns(peptide_view).non_unique]\n", - " .dropna(axis=1, how='all')\n", - " .set_index('Charge', append=True))\n", - "peptide_view" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fname = w_folder.value.parent / f'evidence_{_peptide}_{w_folder.value.stem}.xlsx'\n", - "peptide_view.to_excel(fname)\n", - "fname" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`Type` column indicates if peptide is based on one or more MS-MS spectra." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mq_output.peptides.loc[_peptide].to_frame().T" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Differences in intensities b/w peptides.txt and evidence.txt\n", - "\n", - "\n", - "The intensity reported in `peptides.txt` corresponds to roughly to the sum of the intensities found in different scans:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "col_intensity = mq_col.INTENSITY\n", - "try:\n", - "\n", - " assert_almost_equal(\n", - " _pep_int_evidence := mq_output.evidence.loc[_peptide, col_intensity].sum(),\n", - " _pep_int_peptides := mq_output.peptides.loc[_peptide, col_intensity],\n", - " err_msg='Mismatch between evidence.txt summed peptide intensities to reported peptide intensities in peptides.txt')\n", - "except AssertionError as e:\n", - " logging.error(\n", - " f\"{e}\\n Difference: {_pep_int_evidence - _pep_int_peptides:,.2f}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mq_output.evidence.loc[_peptide, col_intensity]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mq_output.peptides.loc[_peptide, col_intensity]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Make this comparison for all peptides" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "_pep_int_evidence = mq_output.evidence.groupby(\n", - " level=0).agg({col_intensity: [sum, len]})\n", - "_pep_int_evidence.columns = [col_intensity, 'count']\n", - "_pep_int_evidence" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "_diff = _pep_int_evidence[col_intensity] - \\\n", - " 
mq_output.peptides[col_intensity].astype(float)\n", - "mask_diff = _diff != 0.0\n", - "_pep_int_evidence.loc[mask_diff].describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "_diff.loc[mask_diff]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "_diff[mask_diff].describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Several smaller and larger differences in an intensity range way below the detection limit arise for some sequences. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Ideas on source of difference\n", - " - Are all peptides (sequences) which are based on single observations in `evidence.txt` represented as is in `peptides.txt`?\n", - " - how many peptides with more than one PTM have non-zero differences between the sum of intensity values in `evidence.txt` and the respective value in `peptides.txt`?\n", - " - maybe some peptides are filtered based on assignment as contaminent (`CON__`)?" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# ToDo see above" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "_diff_indices = _diff[mask_diff].index\n", - "# some pep-seq in peptides.txt not in evidence.txt\n", - "_diff_indices = _diff_indices.intersection(mq_output.evidence.index.unique())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sample_index = random.choice(_diff_indices)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mq_output.evidence.loc[sample_index]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mq_output.peptides.loc[sample_index].to_frame().T" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Modifications" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mq_output.evidence.Modifications.value_counts()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Potential contaminant peptides" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `CON__` entries are possible contaminations resulting from sample preparation using a e.g. 
a serum:\n", - "\n", - "```python\n", - "data_fasta['ENSEMBL:ENSBTAP00000024146']\n", - "data_fasta['P12763'] # bovine serum protein -> present in cell cultures and in list of default contaminant in MQ\n", - "data_fasta['P00735'] # also bovin serum protein\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mask = mq_output.peptides['Potential contaminant'].notna()\n", - "mq_output.peptides.loc[mask]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Aggregate identifiers in evidence.txt\n", - "\n", - "- `Proteins`: All proteins that contain peptide sequence" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fasta_keys = [\"Proteins\", \"Leading proteins\",\n", - " \"Leading razor protein\", \"Gene names\"]\n", - "mq_output.evidence[fasta_keys]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The protein assignment information is not entirely unique for each group of peptides." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## align intensities and retention time (RT) for peptides\n", - "\n", - "- intensities are values reported in `peptides.txt`\n", - "- some (few) peptides in `peptides.txt` are not in `evidence.txt`, but then probably zero" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "intensities.index" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "seq_w_summed_intensities = intensities.to_frame().merge(\n", - " rt_summary, left_index=True, right_index=True, how='left')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "seq_w_summed_intensities" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mask = ~mq_output.evidence.reset_index(\n", - ")[[\"Sequence\", \"Proteins\", \"Gene names\"]].duplicated()\n", - "mask.index = mq_output.evidence.index" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "diff_ = seq_w_summed_intensities.index.unique().difference(mask.index.unique())\n", - "diff_.to_list()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# mq_output.msms.set_index('Sequence').loc['GIPNMLLSEEETES']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# There is no evidence, but then it is reported in peptides?!\n", - "# Is this the case for more than one MQ-RUN (last or first not written to file?)\n", - "try:\n", - " if len(diff_) > 0:\n", - " mq_output.evidence.loc[diff_]\n", - "except KeyError as e:\n", - " logging.error(e)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - 
"mq_output.peptides.loc[diff_]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Option: Peptide scan with highest score for repeatedly measured peptides\n", - "\n", - "- only select one of repeated peptide scans, namely the one with the highest score\n", - "- discards information, no summation of peptide intensities\n", - "- yields unique retention time per peptide, by discarding additional information" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "COL_SCORE = 'Score'\n", - "mq_output.evidence.groupby(level=0)[COL_SCORE].max()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mask_max_per_seq = mq_output.evidence.groupby(\n", - " level=0)[COL_SCORE].transform(\"max\").eq(mq_output.evidence[COL_SCORE])\n", - "mask_intensity_not_na = mq_output.evidence.Intensity.notna()\n", - "mask = mask_max_per_seq & mask_intensity_not_na" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This leads to a non-unique mapping, as some scores are exactly the same for two peptides." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mask_duplicates = mq_output.evidence.loc[mask].sort_values(\n", - " mq_col.INTENSITY).index.duplicated()\n", - "sequences_duplicated = mq_output.evidence.loc[mask].index[mask_duplicates]\n", - "mq_output.evidence.loc[mask].loc[sequences_duplicated, [\n", - " COL_SCORE, mq_col.INTENSITY, mq_col.RETENTION_TIME]] # .groupby(level=0).agg({mq_col.INTENSITY : max})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mask = mq_output.evidence.reset_index().sort_values(\n", - " by=[\"Sequence\", \"Score\", mq_col.INTENSITY]).duplicated(subset=[\"Sequence\", \"Score\"], keep='last')\n", - "_sequences = mq_output.evidence.index[mask]\n", - "mq_output.evidence.loc[_sequences, [\n", - " \"Score\", \"Retention time\", mq_col.INTENSITY, \"Proteins\"]]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- random, non missing intensity?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "aggregators = [\"Sequence\", \"Score\", mq_col.INTENSITY]\n", - "mask_intensity_not_na = mq_output.evidence.Intensity.notna()\n", - "seq_max_score_max_intensity = mq_output.evidence.loc[mask_intensity_not_na].reset_index(\n", - ")[aggregators+[\"Proteins\", \"Gene names\"]].sort_values(by=aggregators).set_index(\"Sequence\").groupby(level=0).last()\n", - "seq_max_score_max_intensity" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# drop NA intensities first.\n", - "assert seq_max_score_max_intensity.Intensity.isna().sum() == 0" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Certain peptides have no Protein or gene assigned." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "seq_max_score_max_intensity.isna().sum()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mask_seq_selected_not_assigned = seq_max_score_max_intensity.Proteins.isna(\n", - ") | seq_max_score_max_intensity[\"Gene names\"].isna()\n", - "seq_max_score_max_intensity.loc[mask_seq_selected_not_assigned]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "These might be a candiate for evaluating predictions, as the information is measured, but unknown. \n", - "If they cannot be assigned, the closest fit on different genes with model predictions could be a criterion for selection" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create dumps of intensities in `peptides.txt`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# mq_output.evidence.loc[\"AAAGGGGGGAAAAGR\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# ToDo: dump this?\n", - "mq_output.dump_intensity(folder='data/peptides_txt_intensities/')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create dumps per gene" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Some hundred peptides map to more than two genes " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "seq_max_score_max_intensity[mq_col.GENE_NAMES].str.split(\";\"\n", - " ).apply(lambda x: length(x)\n", - " ).value_counts(\n", - ").sort_index()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Mostly unique genes associated with a peptide." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Select sensible training data per gene\n", - "- sequence coverage information?\n", - "- minimal number or minimal sequence coverage, otherwise discared\n", - "- multiple genes:\n", - " - select first and add reference in others\n", - " - split and dump repeatedly\n", - " \n", - "Load fasta-file information" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with open(config.FN_FASTA_DB) as f:\n", - " data_fasta = json.load(f)\n", - "print(f'Number of proteins in fasta file DB: {len(data_fasta)}')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# schema validation? Load class with schema?\n", - "# -> Fasta-File creation should save schema with it" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Fasta Entries considered as contaminants by MQ" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mask_potential_contaminant = mq_output.peptides['Potential contaminant'] == '+'\n", - "contaminants = mq_output.peptides.loc[mask_potential_contaminant, [mq_col.PROTEINS, mq_col.LEADING_RAZOR_PROTEIN]]\n", - "contaminants.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "unique_cont = contaminants[mq_col.PROTEINS].str.split(';').to_list()\n", - "set_all = set().union(*unique_cont)\n", - "set_cont = {x.split('CON__')[-1] for x in set_all if 'CON__' in x}\n", - "set_proteins_to_remove = set_all.intersection(set_cont)\n", - "set_proteins_to_remove" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "List of proteins which are both in the fasta file and potential contaminants" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - 
"mask = mq_output.peptides[mq_col.LEADING_RAZOR_PROTEIN].isin(set_proteins_to_remove)\n", - "mq_output.peptides.loc[mask, 'Potential contaminant'].value_counts() # ToDo: Remove potential contaminants, check evidence.txt" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### `id_map`: Find genes based on fasta file\n", - "\n", - "Using `ID_MAP`, all protein entries for that gene are queried and combined." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# # slow! discarded for now\n", - "\n", - "# from config import FN_ID_MAP\n", - "\n", - "# with open(FN_ID_MAP) as f:\n", - "# id_map = json.load(f)\n", - "# id_map = pd.read_json(FN_ID_MAP, orient=\"split\")\n", - "\n", - "# protein_groups_per_gene = id_map.groupby(by=\"gene\")\n", - "# gene_found = []\n", - "# for name, gene_data in protein_groups_per_gene:\n", - "\n", - "# _peptides = set()\n", - "# for protein_id in gene_data.index:\n", - "# _peptides = _peptides.union(p for p_list in data_fasta[protein_id]['peptides']\n", - "# for p in p_list)\n", - "\n", - "# # select intersection of theoretical peptides for gene with observed peptides\n", - "# _matched = mq_output.peptides.index.intersection(_peptides)\n", - "# # add completness?\n", - "# if not _matched.empty and len(_matched) > 3:\n", - "# gene_found.append(name)\n", - "# #\n", - "# if not len(gene_found) % 500 :\n", - "# print(f\"Found {len(gene_found):6}\")\n", - "# print(f\"Total: {len(gene_found):5}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Compare this with the entries in the `Gene names` column of `peptides.txt`\n", - "\n", - "> Mapping is non-unique. MQ has no treshold on number of identified peptides. 
(How many (unique) peptides does MQ need?)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### `peptides.txt`: Multiple Genes per peptides\n", - "\n", - "- can gene name be collapsed meaningfully?\n", - "- some gene groups share common stem -> can this be used?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mq_output.peptides[mq_col.GENE_NAMES].head(10)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gene_sets_unique = mq_output.peptides[\"Gene names\"].unique()\n", - "\n", - "N_GENE_SETS = len(gene_sets_unique)\n", - "print(f'There are {N_GENE_SETS} unique sets of genes.')\n", - "assert N_GENE_SETS != 0, 'No genes?'\n", - "\n", - "genes_single_unique = mq.get_set_of_genes(gene_sets_unique)\n", - "N_GENE_SINGLE_UNIQUE = len(genes_single_unique)\n", - "\n", - "mq.validate_gene_set(N_GENE_SINGLE_UNIQUE, N_GENE_SETS)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "How often do genes names appear in unique sets?" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "genes_counted_each_in_unique_sets = pd.Series(mq.count_genes_in_sets(\n", - " gene_sets=gene_sets_unique))\n", - "\n", - "title_ = 'Frequency of counts for each gene in unique set of genes'\n", - "\n", - "ax = genes_counted_each_in_unique_sets.value_counts().sort_index().plot(\n", - " kind='bar',\n", - " title=title_,\n", - " xlabel='Count of a gene',\n", - " ylabel='Frequency of counts',\n", - " ax=None,\n", - ")\n", - "fig = ax.get_figure()\n", - "\n", - "fig_folder = FIGUREFOLDER / mq_output.folder.stem\n", - "fig_folder.mkdir(exist_ok=True)\n", - "fig.savefig(fig_folder / f'{title_}.pdf')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Unique gene sets with more than one gene:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gene_sets_unique = pd.Series(gene_sets_unique).dropna()\n", - "\n", - "mask_more_than_one_gene = gene_sets_unique.str.contains(';')\n", - "gene_sets_unique.loc[mask_more_than_one_gene]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Long format for genes - `peptides_with_single_gene`\n", - "\n", - "Expand the rows for sets of genes using [`pandas.DataFrame.explode`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.explode.html).\n", - "\n", - "Does a group of peptide only assigns unique set of genes? 
Genes can have more than one protein.\n", - " - first build groups\n", - " - then see matches (see further below)\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "peptides_with_single_gene = mq.get_peptides_with_single_gene(\n", - " peptides=mq_output.peptides)\n", - "peptides_with_single_gene" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "peptides_with_single_gene.dtypes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\n", - " f\"DataFrame has due to unfolding now {len(peptides_with_single_gene)} instead of {len(mq_output.peptides)} rows\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Should peptides from potential contaminants be considered?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mask = peptides_with_single_gene['Proteins'].str.contains('CON__')\n", - "peptides_with_single_gene.loc[mask]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "_mask_con = peptides_with_single_gene.loc[mask, mq_col.PROTEINS].str.split(\";\"\n", - " ).apply(lambda x: [True if \"CON_\" in item else False for item in x]\n", - " ).apply(all)\n", - "\n", - "assert _mask_con.sum() == 0, \"There are peptides resulting only from possible confounders: {}\".format(\n", - " \", \".join(str(x) for x in peptides_with_single_gene.loc[mask, mq_col.PROTEINS].loc[_mask_con].index))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "peptides_per_gene = peptides_with_single_gene.value_counts(mq_col.GENE_NAMES)\n", - "peptides_per_gene" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "#### Find genes based on `Gene names` 
column in elonged data-set\n", - "\n", - "More efficient as it does not query unnecessary data or data twice." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "protein_groups_per_gene = peptides_with_single_gene.groupby(\n", - " by=mq_col.GENE_NAMES, dropna=True)\n", - "\n", - "gene_data = protein_groups_per_gene.get_group(peptides_per_gene.index[3])\n", - "gene_data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "list_of_proteins = gene_data[mq_col.PROTEINS].str.split(';').to_list()\n", - "set_of_proteins = set().union(*list_of_proteins)\n", - "set_of_proteins = {x for x in set_of_proteins if 'CON__' not in x}\n", - "set_of_proteins" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gene_data[mq_col.PROTEINS].value_counts() # combine? select first in case of a CON_ as leading razor protein?" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "protein_id = set_of_proteins.pop()\n", - "print(protein_id)\n", - "data_fasta[protein_id]['seq']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_fasta[protein_id]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Sample completeness\n", - "Find a sample with a certain completeness level:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "peps_exact_cleaved = mq.find_exact_cleaved_peptides_for_razor_protein(\n", - " gene_data, fasta_db=data_fasta)\n", - "peps_exact_cleaved[:10]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then search the list of possible peptides originating from the fasta files assuming no miscleavages to the set of found peptides.\n", - "\n", - "- How many unique exact-cleaved peptides can be mapped to any peptide found in the sample (**completness**)?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "peps_in_data = gene_data.index\n", - "\n", - "mq.calculate_completness_for_sample(\n", - " peps_exact_cleaved=peps_exact_cleaved, \n", - " peps_in_data=peps_in_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The number of peptides found can be then used to calculate the completeness" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Select candidates by completeness of training data in single samples and save by experiment name" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mq_output.folder.stem # needs to go to root?" 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### GeneData accessor?\n", - "\n", - "- [Registering custom accessors tutorial](https://pandas.pydata.org/pandas-docs/stable/development/extending.html#registering-custom-accessors)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# @pd.api.extensions.register_dataframe_accessor('gene')\n", - "# class GeneDataAccessor:\n", - "\n", - "# COL_INTENSITY = mq_col.INTENSITY\n", - "# COL_RAZOR_PROT = 'Leading razor protein'\n", - "# COL_PROTEINS = 'Proteins'\n", - "# COL_GENE_NAME = 'Gene names'\n", - "\n", - "# COLS_EXPECTED = {COL_INTENSITY, COL_RAZOR_PROT, COL_PROTEINS, COL_GENE_NAME}\n", - "\n", - "# def __init__(self, pandas_df):\n", - "# self._validate(df=pandas_df)\n", - "\n", - "# @classmethod\n", - "# def _validate(cls, df):\n", - "# \"\"\"Verify if expected columns and layout apply to panda.DataFrame (view)\"\"\"\n", - "# _found_columns = cls.COLS_EXPECTED.intersection(df.columns)\n", - "# if not _found_columns == cls.COLS_EXPECTED:\n", - "# raise AttributeError(\"Expected columns not in DataFrame: {}\".format(\n", - "# list(cls.COLS_EXPECTED - _found_columns)))\n", - "# if not len(df[COL_RAZOR_PROT].unique()) != 1:\n", - "\n", - "\n", - "# # GeneDataAccessor(gene_data.drop(mq_col.INTENSITY, axis=1))\n", - "# # GeneDataAccessor(gene_data)\n", - "# # gene_data.drop(mq_col.INTENSITY, axis=1).gene\n", - "# gene_data.gene" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Gene Data Mapper?" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class GeneDataMapper:\n", - "\n", - " COL_INTENSITY = mq_col.INTENSITY\n", - " COL_RAZOR_PROT = mq_col.LEADING_RAZOR_PROTEIN\n", - " COL_PROTEINS = mq_col.PROTEINS\n", - " COL_GENE_NAME = mq_col.GENE_NAMES\n", - "\n", - " COLS_EXPECTED = {COL_INTENSITY, COL_RAZOR_PROT,\n", - " COL_PROTEINS, COL_GENE_NAME}\n", - "\n", - " def __init__(self, pandas_df, fasta_dict):\n", - " self._validate(df=pandas_df)\n", - " self._df = pandas_df\n", - " self._fasta_dict = fasta_dict\n", - "\n", - " # self.log?\n", - "\n", - " @classmethod\n", - " def _validate(cls, df):\n", - " \"\"\"Verify if expected columns and layout apply to panda.DataFrame (view)\"\"\"\n", - " _found_columns = cls.COLS_EXPECTED.intersection(df.columns)\n", - " if not _found_columns == cls.COLS_EXPECTED:\n", - " raise AttributeError(\"Expected columns not in DataFrame: {}\".format(\n", - " list(cls.COLS_EXPECTED - _found_columns)))\n", - " if len(df[cls.COL_RAZOR_PROT].unique()) != 1:\n", - " raise ValueError(\n", - " \"Non-unique razor-protein in DataFrame: \", df[cls.COL_RAZOR_PROT].unique())\n", - "\n", - " def __repr__(self):\n", - " return f\"{self.__class__.__name__} at {id(self)}\"\n", - "\n", - "\n", - "GeneDataMapper(gene_data, data_fasta)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Dump samples as json\n", - "\n", - "- select unique gene-names in set (have to be shared by all peptides)\n", - "- dump peptide intensities as json from `peptides.txt`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "peptides_with_single_gene # long-format with repeated peptide information by gene" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "root_logger = logging.getLogger()\n", - "root_logger.handlers = []\n", - "root_logger.handlers" - ] 
- }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "genes_counted_each_in_unique_sets = pd.Series(mq.count_genes_in_sets(\n", - " gene_sets=gene_sets_unique))\n", - "\n", - "# # ToDo: Develop\n", - "# class MaxQuantTrainingDataExtractor():\n", - "# \"\"\"Class to extract training data from `MaxQuantOutput`.\"\"\"\n", - "\n", - "# def __init__(self, out_folder):\n", - "# self.out_folder = Path(out_folder)\n", - "# self.out_folder.mkdir(exist_ok=True)\n", - "# self.fname_template = '{gene}.json'\n", - "\n", - "completeness_per_gene = mq.ExtractFromPeptidesTxt(\n", - " out_folder='train', mq_output_object=mq_output, fasta_db=data_fasta)()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# same code fails in `vaep.io.mq`, ABC needed?\n", - "isinstance(mq_output, MaxQuantOutput), type(mq_output)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Descriptics" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "s_completeness = pd.Series(completeness_per_gene, name='completenes_by_gene')\n", - "s_completeness.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "N_BINS = 20\n", - "ax = s_completeness.plot(kind='hist',\n", - " bins=N_BINS,\n", - " xticks=[x/100 for x in range(0, 101, 5)],\n", - " figsize=(10, 5),\n", - " rot=90,\n", - " title=f\"Frequency of proportion of observed exact peptides (completness) per razor protein from 0 to 1 in {N_BINS} bins\"\n", - " f\"\\nin sample {mq_output.folder.stem}\")\n", - "\n", - "_ = ax.set_xlabel(\n", - " \"Proportion of exactly observed peptides (including up to 2 mis-cleavages)\")\n", - "\n", - "fig = ax.get_figure()\n", - "fig.tight_layout()\n", - "fig.savefig(FIGUREFOLDER / mq_output.folder.stem / 'freq_completeness.png')" - ] - }, - { - 
"cell_type": "markdown", - "metadata": {}, - "source": [ - "based on completeness, select valid training data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# continously decrease this number in the scope of the project\n", - "mask = s_completeness > .6\n", - "s_completeness.loc[mask]" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.8.13 ('vaep')", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - }, - "toc-autonumbering": false, - "vscode": { - "interpreter": { - "hash": "cf83e9cb890c7f96eb0ae04f39a82254555f56a1a0ed2f03b23a8b40fe6cd31c" - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/project/erda_12_explore_raw_MQ_data.py b/project/erda_12_explore_raw_MQ_data.py deleted file mode 100644 index fc788ad17..000000000 --- a/project/erda_12_explore_raw_MQ_data.py +++ /dev/null @@ -1,829 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.14.5 -# kernelspec: -# display_name: Python 3.8.13 ('vaep') -# language: python -# name: python3 -# --- - -# %% [markdown] -# # Explore MaxQuant (MQ) output files of single runs -# -# The `erda_03_training_data.ipynb` notebook does extract information to be used as training data. -# File specific one could also use the retention time analysis to identify _valid_ co-occurring peptides to be use during training. -# Potentially this preprocessing step can be used at inference time. 
-# -# This notebook contains some relevant analysis for a specific `txt` output-folder in the current project -# -# ##### Analysis overview -# -# > Report for example data -# -# - relation between `peptides.txt` and `evidence.txt` - -# %% -import json -import logging -import random - -import ipywidgets as widgets - -from numpy.testing import assert_almost_equal -from numpy import random -import pandas as pd -# pd.options.display.float_format = '{:,.1f}'.format - -import vaep.pandas -from vaep.pandas import length -from vaep.io.mq import FASTA_KEYS, MaxQuantOutput -from vaep.io import search_subfolders -import vaep.io.mq as mq -from vaep.io.mq import mq_col - - -from vaep.logging import setup_nb_logger -logger = setup_nb_logger() - -################## -##### CONFIG ##### -################## - -import config -from config import FIGUREFOLDER -# from config import FOLDER_RAW_DATA -from config import FOLDER_MQ_TXT_DATA as FOLDER_RAW_DATA - - -print(f"Search Raw-Files on path: {FOLDER_RAW_DATA}") - -# %% -folders = search_subfolders(path=FOLDER_RAW_DATA, depth=1, exclude_root=True) -w_folder = widgets.Dropdown(options=folders, description='Select a folder') -w_folder - -# %% -mq_output = MaxQuantOutput(folder=w_folder.value) - -# %% [markdown] -# ## Some important columns -# -# Grouped by a namedtuple allowing attribute access - -# %% -mq_col - -# %% [markdown] -# ## `peptides.txt` -# -# > For reference on final "result" - -# %% -pd.options.display.max_columns = len(mq_output.peptides.columns) -mq_output.peptides - -# %% [markdown] -# `peptides.txt` contains aggregated peptides - -# %% -intensities = mq_output.peptides.Intensity -intensities - -# %% [markdown] -# Not all peptides are associated with a Protein or Gene by MQ, although there is evidence for the peptide. This is due to potential `CON_`taminants in the medium which is encouded by default by MQ. 
- -# %% -mq_output.peptides[FASTA_KEYS].isna().sum() - -# %% [markdown] -# ## `evidence.txt` -# -# contains -# - retention time for peptides -# - has repeated measures of the same sequences, which are all aggregated in `peptides.txt` -# - -# %% -pd.options.display.max_columns = len(mq_output.evidence.columns) -mq_output.evidence - -# %% -mq_output.evidence.Charge.value_counts().sort_index() - -# %% -mask = mq_output.evidence[mq_col.RETENTION_TIME] != mq_output.evidence[mq_col.CALIBRATED_RETENTION_TIME] -print("Number of non-matching retention times between calibrated and non-calibrated column:", mask.sum()) - -# try: -# assert mask.sum() == 0, "More than one replica?" -# except AssertionError as e: -# logger.warning(e) -assert mask.sum() == 0, "More than one replica?" - -# %% [markdown] -# Using only one quality control sample, leaves the initial retention time as is. - -# %% -rt = mq_output.evidence[mq_col.CALIBRATED_RETENTION_TIME] - -# %% -pep_measured_freq_in_evidence = rt.index.value_counts() -pep_measured_freq_in_evidence.iloc[:10] # top10 repeatedly measured peptides - -# %% -max_observed_pep_evidence = pep_measured_freq_in_evidence.index[0] -mq_output.evidence.loc[ - max_observed_pep_evidence, - : -] - -# %% [markdown] -# The retention time index is non-unique. - -# %% -print('The retention time index is unique: {}'.format(rt.index.is_unique)) - -# %% [markdown] -# Peptides observed more than once at different times. 
- -# %% -mask_duplicated = rt.index.duplicated(keep=False) -rt_duplicates = rt.loc[mask_duplicated] -rt_duplicates - -# %% -mq_output.evidence.loc[mask_duplicated, [ - 'Charge', 'Calibrated retention time', 'Intensity']] - -# %% [markdown] -# Calculate median intensity and calculate standard deviation - -# %% -_agg_functions = ['median', 'std'] - -rt_summary = rt.groupby(level=0).agg(_agg_functions) -rt_summary - -# %% [markdown] -# Let's see several quartiles for both median and standard deviation (the columns are independent from each other) for the retention time - -# %% -rt_summary.describe(percentiles=[0.8, 0.9, 0.95, 0.96, 0.97, 0.98, 0.99]) - -# %% -rt_summary['median'] - -# %% [markdown] -# A large standard-deviation indicates that the intensity values originate from time points (in min) widely spread. - -# %% [markdown] -# ### Peptides observed several times a different points of experimental run - -# %% -mask = rt_summary['std'] > 30.0 -mask_indices = mask[mask].index -rt.loc[mask_indices] - -# %% [markdown] -# Peptides with differen RT have different charges. - -# %% -mq_output.evidence.loc[mask_indices] - -# %% [markdown] -# Model evaluation possibility: Discard samples with several measurements from an experiment and predict value. See which intensity measurement corresponds more closely. - -# %% -_peptide = random.choice(mask_indices) - -# %% -f'evidence_{_peptide}_{w_folder.value.stem}' - -# %% - -peptide_view = mq_output.evidence.loc[_peptide] -peptide_view = (peptide_view[ - vaep.pandas.get_unique_non_unique_columns(peptide_view).non_unique] - .dropna(axis=1, how='all') - .set_index('Charge', append=True)) -peptide_view - -# %% -fname = w_folder.value.parent / f'evidence_{_peptide}_{w_folder.value.stem}.xlsx' -peptide_view.to_excel(fname) -fname - -# %% [markdown] -# `Type` column indicates if peptide is based on one or more MS-MS spectra. 
- -# %% -mq_output.peptides.loc[_peptide].to_frame().T - -# %% [markdown] -# ## Differences in intensities b/w peptides.txt and evidence.txt -# -# -# The intensity reported in `peptides.txt` corresponds to roughly to the sum of the intensities found in different scans: - -# %% -col_intensity = mq_col.INTENSITY -try: - - assert_almost_equal( - _pep_int_evidence := mq_output.evidence.loc[_peptide, col_intensity].sum(), - _pep_int_peptides := mq_output.peptides.loc[_peptide, col_intensity], - err_msg='Mismatch between evidence.txt summed peptide intensities to reported peptide intensities in peptides.txt') -except AssertionError as e: - logging.error( - f"{e}\n Difference: {_pep_int_evidence - _pep_int_peptides:,.2f}") - -# %% -mq_output.evidence.loc[_peptide, col_intensity] - -# %% -mq_output.peptides.loc[_peptide, col_intensity] - -# %% [markdown] -# Make this comparison for all peptides - -# %% -_pep_int_evidence = mq_output.evidence.groupby( - level=0).agg({col_intensity: [sum, len]}) -_pep_int_evidence.columns = [col_intensity, 'count'] -_pep_int_evidence - -# %% -_diff = _pep_int_evidence[col_intensity] - \ - mq_output.peptides[col_intensity].astype(float) -mask_diff = _diff != 0.0 -_pep_int_evidence.loc[mask_diff].describe() - -# %% -_diff.loc[mask_diff] - -# %% -_diff[mask_diff].describe() - -# %% [markdown] -# Several smaller and larger differences in an intensity range way below the detection limit arise for some sequences. - -# %% [markdown] -# ### Ideas on source of difference -# - Are all peptides (sequences) which are based on single observations in `evidence.txt` represented as is in `peptides.txt`? -# - how many peptides with more than one PTM have non-zero differences between the sum of intensity values in `evidence.txt` and the respective value in `peptides.txt`? -# - maybe some peptides are filtered based on assignment as contaminent (`CON__`)? 
- -# %% -# ToDo see above - -# %% -_diff_indices = _diff[mask_diff].index -# some pep-seq in peptides.txt not in evidence.txt -_diff_indices = _diff_indices.intersection(mq_output.evidence.index.unique()) - -# %% -sample_index = random.choice(_diff_indices) - -# %% -mq_output.evidence.loc[sample_index] - -# %% -mq_output.peptides.loc[sample_index].to_frame().T - -# %% [markdown] -# ### Modifications - -# %% -mq_output.evidence.Modifications.value_counts() - -# %% [markdown] -# ### Potential contaminant peptides - -# %% [markdown] -# The `CON__` entries are possible contaminations resulting from sample preparation using a e.g. a serum: -# -# ```python -# data_fasta['ENSEMBL:ENSBTAP00000024146'] -# data_fasta['P12763'] # bovine serum protein -> present in cell cultures and in list of default contaminant in MQ -# data_fasta['P00735'] # also bovin serum protein -# ``` - -# %% -mask = mq_output.peptides['Potential contaminant'].notna() -mq_output.peptides.loc[mask] - -# %% [markdown] -# ### Aggregate identifiers in evidence.txt -# -# - `Proteins`: All proteins that contain peptide sequence - -# %% -fasta_keys = ["Proteins", "Leading proteins", - "Leading razor protein", "Gene names"] -mq_output.evidence[fasta_keys] - -# %% [markdown] -# The protein assignment information is not entirely unique for each group of peptides. 
- -# %% [markdown] -# ## align intensities and retention time (RT) for peptides -# -# - intensities are values reported in `peptides.txt` -# - some (few) peptides in `peptides.txt` are not in `evidence.txt`, but then probably zero - -# %% -intensities.index - -# %% -seq_w_summed_intensities = intensities.to_frame().merge( - rt_summary, left_index=True, right_index=True, how='left') - -# %% -seq_w_summed_intensities - -# %% -mask = ~mq_output.evidence.reset_index( -)[["Sequence", "Proteins", "Gene names"]].duplicated() -mask.index = mq_output.evidence.index - -# %% -diff_ = seq_w_summed_intensities.index.unique().difference(mask.index.unique()) -diff_.to_list() - -# %% -# mq_output.msms.set_index('Sequence').loc['GIPNMLLSEEETES'] - -# %% -# There is no evidence, but then it is reported in peptides?! -# Is this the case for more than one MQ-RUN (last or first not written to file?) -try: - if len(diff_) > 0: - mq_output.evidence.loc[diff_] -except KeyError as e: - logging.error(e) - -# %% -mq_output.peptides.loc[diff_] - -# %% [markdown] -# ### Option: Peptide scan with highest score for repeatedly measured peptides -# -# - only select one of repeated peptide scans, namely the one with the highest score -# - discards information, no summation of peptide intensities -# - yields unique retention time per peptide, by discarding additional information - -# %% -COL_SCORE = 'Score' -mq_output.evidence.groupby(level=0)[COL_SCORE].max() - -# %% -mask_max_per_seq = mq_output.evidence.groupby( - level=0)[COL_SCORE].transform("max").eq(mq_output.evidence[COL_SCORE]) -mask_intensity_not_na = mq_output.evidence.Intensity.notna() -mask = mask_max_per_seq & mask_intensity_not_na - -# %% [markdown] -# This leads to a non-unique mapping, as some scores are exactly the same for two peptides. 
- -# %% -mask_duplicates = mq_output.evidence.loc[mask].sort_values( - mq_col.INTENSITY).index.duplicated() -sequences_duplicated = mq_output.evidence.loc[mask].index[mask_duplicates] -mq_output.evidence.loc[mask].loc[sequences_duplicated, [ - COL_SCORE, mq_col.INTENSITY, mq_col.RETENTION_TIME]] # .groupby(level=0).agg({mq_col.INTENSITY : max}) - -# %% -mask = mq_output.evidence.reset_index().sort_values( - by=["Sequence", "Score", mq_col.INTENSITY]).duplicated(subset=["Sequence", "Score"], keep='last') -_sequences = mq_output.evidence.index[mask] -mq_output.evidence.loc[_sequences, [ - "Score", "Retention time", mq_col.INTENSITY, "Proteins"]] - -# %% [markdown] -# - random, non missing intensity? - -# %% -aggregators = ["Sequence", "Score", mq_col.INTENSITY] -mask_intensity_not_na = mq_output.evidence.Intensity.notna() -seq_max_score_max_intensity = mq_output.evidence.loc[mask_intensity_not_na].reset_index( -)[aggregators+["Proteins", "Gene names"]].sort_values(by=aggregators).set_index("Sequence").groupby(level=0).last() -seq_max_score_max_intensity - -# %% -# drop NA intensities first. -assert seq_max_score_max_intensity.Intensity.isna().sum() == 0 - -# %% [markdown] -# Certain peptides have no Protein or gene assigned. - -# %% -seq_max_score_max_intensity.isna().sum() - -# %% -mask_seq_selected_not_assigned = seq_max_score_max_intensity.Proteins.isna( -) | seq_max_score_max_intensity["Gene names"].isna() -seq_max_score_max_intensity.loc[mask_seq_selected_not_assigned] - -# %% [markdown] -# These might be a candiate for evaluating predictions, as the information is measured, but unknown. -# If they cannot be assigned, the closest fit on different genes with model predictions could be a criterion for selection - -# %% [markdown] -# ## Create dumps of intensities in `peptides.txt` - -# %% -# mq_output.evidence.loc["AAAGGGGGGAAAAGR"] - -# %% -# ToDo: dump this? 
-mq_output.dump_intensity(folder='data/peptides_txt_intensities/') - -# %% [markdown] -# ## Create dumps per gene - -# %% [markdown] -# Some hundred peptides map to more than two genes - -# %% -seq_max_score_max_intensity[mq_col.GENE_NAMES].str.split(";" - ).apply(lambda x: length(x) - ).value_counts( -).sort_index() - -# %% [markdown] -# Mostly unique genes associated with a peptide. - -# %% [markdown] -# ### Select sensible training data per gene -# - sequence coverage information? -# - minimal number or minimal sequence coverage, otherwise discared -# - multiple genes: -# - select first and add reference in others -# - split and dump repeatedly -# -# Load fasta-file information - -# %% -with open(config.FN_FASTA_DB) as f: - data_fasta = json.load(f) -print(f'Number of proteins in fasta file DB: {len(data_fasta)}') - -# %% -# schema validation? Load class with schema? -# -> Fasta-File creation should save schema with it - -# %% [markdown] -# ### Fasta Entries considered as contaminants by MQ - -# %% -mask_potential_contaminant = mq_output.peptides['Potential contaminant'] == '+' -contaminants = mq_output.peptides.loc[mask_potential_contaminant, [mq_col.PROTEINS, mq_col.LEADING_RAZOR_PROTEIN]] -contaminants.head() - -# %% -unique_cont = contaminants[mq_col.PROTEINS].str.split(';').to_list() -set_all = set().union(*unique_cont) -set_cont = {x.split('CON__')[-1] for x in set_all if 'CON__' in x} -set_proteins_to_remove = set_all.intersection(set_cont) -set_proteins_to_remove - -# %% [markdown] -# List of proteins which are both in the fasta file and potential contaminants - -# %% -mask = mq_output.peptides[mq_col.LEADING_RAZOR_PROTEIN].isin(set_proteins_to_remove) -mq_output.peptides.loc[mask, 'Potential contaminant'].value_counts() # ToDo: Remove potential contaminants, check evidence.txt - -# %% [markdown] -# ### `id_map`: Find genes based on fasta file -# -# Using `ID_MAP`, all protein entries for that gene are queried and combined. - -# %% -# # slow! 
discarded for now - -# from config import FN_ID_MAP - -# with open(FN_ID_MAP) as f: -# id_map = json.load(f) -# id_map = pd.read_json(FN_ID_MAP, orient="split") - -# protein_groups_per_gene = id_map.groupby(by="gene") -# gene_found = [] -# for name, gene_data in protein_groups_per_gene: - -# _peptides = set() -# for protein_id in gene_data.index: -# _peptides = _peptides.union(p for p_list in data_fasta[protein_id]['peptides'] -# for p in p_list) - -# # select intersection of theoretical peptides for gene with observed peptides -# _matched = mq_output.peptides.index.intersection(_peptides) -# # add completness? -# if not _matched.empty and len(_matched) > 3: -# gene_found.append(name) -# # -# if not len(gene_found) % 500 : -# print(f"Found {len(gene_found):6}") -# print(f"Total: {len(gene_found):5}") - -# %% [markdown] -# Compare this with the entries in the `Gene names` column of `peptides.txt` -# -# > Mapping is non-unique. MQ has no treshold on number of identified peptides. (How many (unique) peptides does MQ need?) - -# %% [markdown] -# ### `peptides.txt`: Multiple Genes per peptides -# -# - can gene name be collapsed meaningfully? -# - some gene groups share common stem -> can this be used? - -# %% -mq_output.peptides[mq_col.GENE_NAMES].head(10) - -# %% -gene_sets_unique = mq_output.peptides["Gene names"].unique() - -N_GENE_SETS = len(gene_sets_unique) -print(f'There are {N_GENE_SETS} unique sets of genes.') -assert N_GENE_SETS != 0, 'No genes?' - -genes_single_unique = mq.get_set_of_genes(gene_sets_unique) -N_GENE_SINGLE_UNIQUE = len(genes_single_unique) - -mq.validate_gene_set(N_GENE_SINGLE_UNIQUE, N_GENE_SETS) - -# %% [markdown] -# How often do genes names appear in unique sets? 
- -# %% -genes_counted_each_in_unique_sets = pd.Series(mq.count_genes_in_sets( - gene_sets=gene_sets_unique)) - -title_ = 'Frequency of counts for each gene in unique set of genes' - -ax = genes_counted_each_in_unique_sets.value_counts().sort_index().plot( - kind='bar', - title=title_, - xlabel='Count of a gene', - ylabel='Frequency of counts', - ax=None, -) -fig = ax.get_figure() - -fig_folder = FIGUREFOLDER / mq_output.folder.stem -fig_folder.mkdir(exist_ok=True) -fig.savefig(fig_folder / f'{title_}.pdf') - -# %% [markdown] -# Unique gene sets with more than one gene: - -# %% -gene_sets_unique = pd.Series(gene_sets_unique).dropna() - -mask_more_than_one_gene = gene_sets_unique.str.contains(';') -gene_sets_unique.loc[mask_more_than_one_gene] - -# %% [markdown] -# ### Long format for genes - `peptides_with_single_gene` -# -# Expand the rows for sets of genes using [`pandas.DataFrame.explode`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.explode.html). -# -# Does a group of peptide only assigns unique set of genes? Genes can have more than one protein. -# - first build groups -# - then see matches (see further below) -# - -# %% -peptides_with_single_gene = mq.get_peptides_with_single_gene( - peptides=mq_output.peptides) -peptides_with_single_gene - -# %% -peptides_with_single_gene.dtypes - -# %% -print( - f"DataFrame has due to unfolding now {len(peptides_with_single_gene)} instead of {len(mq_output.peptides)} rows") - -# %% [markdown] -# Should peptides from potential contaminants be considered? 
- -# %% -mask = peptides_with_single_gene['Proteins'].str.contains('CON__') -peptides_with_single_gene.loc[mask] - -# %% -_mask_con = peptides_with_single_gene.loc[mask, mq_col.PROTEINS].str.split(";" - ).apply(lambda x: [True if "CON_" in item else False for item in x] - ).apply(all) - -assert _mask_con.sum() == 0, "There are peptides resulting only from possible confounders: {}".format( - ", ".join(str(x) for x in peptides_with_single_gene.loc[mask, mq_col.PROTEINS].loc[_mask_con].index)) - -# %% -peptides_per_gene = peptides_with_single_gene.value_counts(mq_col.GENE_NAMES) -peptides_per_gene - -# %% [markdown] -# -# #### Find genes based on `Gene names` column in elonged data-set -# -# More efficient as it does not query unnecessary data or data twice. - -# %% -protein_groups_per_gene = peptides_with_single_gene.groupby( - by=mq_col.GENE_NAMES, dropna=True) - -gene_data = protein_groups_per_gene.get_group(peptides_per_gene.index[3]) -gene_data - -# %% -list_of_proteins = gene_data[mq_col.PROTEINS].str.split(';').to_list() -set_of_proteins = set().union(*list_of_proteins) -set_of_proteins = {x for x in set_of_proteins if 'CON__' not in x} -set_of_proteins - -# %% -gene_data[mq_col.PROTEINS].value_counts() # combine? select first in case of a CON_ as leading razor protein? - -# %% -protein_id = set_of_proteins.pop() -print(protein_id) -data_fasta[protein_id]['seq'] - -# %% -data_fasta[protein_id] - -# %% [markdown] -# ### Sample completeness -# Find a sample with a certain completeness level: - -# %% -peps_exact_cleaved = mq.find_exact_cleaved_peptides_for_razor_protein( - gene_data, fasta_db=data_fasta) -peps_exact_cleaved[:10] - -# %% [markdown] -# Then search the list of possible peptides originating from the fasta files assuming no miscleavages to the set of found peptides. -# -# - How many unique exact-cleaved peptides can be mapped to any peptide found in the sample (**completness**)? 
- -# %% -peps_in_data = gene_data.index - -mq.calculate_completness_for_sample( - peps_exact_cleaved=peps_exact_cleaved, - peps_in_data=peps_in_data) - -# %% [markdown] -# The number of peptides found can be then used to calculate the completeness - -# %% [markdown] -# Select candidates by completeness of training data in single samples and save by experiment name - -# %% -mq_output.folder.stem # needs to go to root? - -# %% [markdown] -# ### GeneData accessor? -# -# - [Registering custom accessors tutorial](https://pandas.pydata.org/pandas-docs/stable/development/extending.html#registering-custom-accessors) - -# %% -# @pd.api.extensions.register_dataframe_accessor('gene') -# class GeneDataAccessor: - -# COL_INTENSITY = mq_col.INTENSITY -# COL_RAZOR_PROT = 'Leading razor protein' -# COL_PROTEINS = 'Proteins' -# COL_GENE_NAME = 'Gene names' - -# COLS_EXPECTED = {COL_INTENSITY, COL_RAZOR_PROT, COL_PROTEINS, COL_GENE_NAME} - -# def __init__(self, pandas_df): -# self._validate(df=pandas_df) - -# @classmethod -# def _validate(cls, df): -# """Verify if expected columns and layout apply to panda.DataFrame (view)""" -# _found_columns = cls.COLS_EXPECTED.intersection(df.columns) -# if not _found_columns == cls.COLS_EXPECTED: -# raise AttributeError("Expected columns not in DataFrame: {}".format( -# list(cls.COLS_EXPECTED - _found_columns))) -# if not len(df[COL_RAZOR_PROT].unique()) != 1: - - -# # GeneDataAccessor(gene_data.drop(mq_col.INTENSITY, axis=1)) -# # GeneDataAccessor(gene_data) -# # gene_data.drop(mq_col.INTENSITY, axis=1).gene -# gene_data.gene - -# %% [markdown] -# ### Gene Data Mapper? 
- -# %% -class GeneDataMapper: - - COL_INTENSITY = mq_col.INTENSITY - COL_RAZOR_PROT = mq_col.LEADING_RAZOR_PROTEIN - COL_PROTEINS = mq_col.PROTEINS - COL_GENE_NAME = mq_col.GENE_NAMES - - COLS_EXPECTED = {COL_INTENSITY, COL_RAZOR_PROT, - COL_PROTEINS, COL_GENE_NAME} - - def __init__(self, pandas_df, fasta_dict): - self._validate(df=pandas_df) - self._df = pandas_df - self._fasta_dict = fasta_dict - - # # self.log? - - @classmethod - def _validate(cls, df): - """Verify if expected columns and layout apply to panda.DataFrame (view)""" - _found_columns = cls.COLS_EXPECTED.intersection(df.columns) - if not _found_columns == cls.COLS_EXPECTED: - raise AttributeError("Expected columns not in DataFrame: {}".format( - list(cls.COLS_EXPECTED - _found_columns))) - if len(df[cls.COL_RAZOR_PROT].unique()) != 1: - raise ValueError( - "Non-unique razor-protein in DataFrame: ", df[cls.COL_RAZOR_PROT].unique()) - - def __repr__(self): - return f"{self.__class__.__name__} at {id(self)}" - - -GeneDataMapper(gene_data, data_fasta) - -# %% [markdown] -# ### Dump samples as json -# -# - select unique gene-names in set (have to be shared by all peptides) -# - dump peptide intensities as json from `peptides.txt` - -# %% -peptides_with_single_gene # long-format with repeated peptide information by gene - -# %% -root_logger = logging.getLogger() -root_logger.handlers = [] -root_logger.handlers - -# %% -genes_counted_each_in_unique_sets = pd.Series(mq.count_genes_in_sets( - gene_sets=gene_sets_unique)) - -# # ToDo: Develop -# class MaxQuantTrainingDataExtractor(): -# """Class to extract training data from `MaxQuantOutput`.""" - -# def __init__(self, out_folder): -# self.out_folder = Path(out_folder) -# self.out_folder.mkdir(exist_ok=True) -# self.fname_template = '{gene}.json' - -completeness_per_gene = mq.ExtractFromPeptidesTxt( - out_folder='train', mq_output_object=mq_output, fasta_db=data_fasta)() - -# %% -# same code fails in `vaep.io.mq`, ABC needed? 
-isinstance(mq_output, MaxQuantOutput), type(mq_output) - -# %% [markdown] -# #### Descriptics - -# %% -s_completeness = pd.Series(completeness_per_gene, name='completenes_by_gene') -s_completeness.describe() - -# %% -N_BINS = 20 -ax = s_completeness.plot(kind='hist', - bins=N_BINS, - xticks=[x/100 for x in range(0, 101, 5)], - figsize=(10, 5), - rot=90, - title=f"Frequency of proportion of observed exact peptides (completness) per razor protein from 0 to 1 in {N_BINS} bins" - f"\nin sample {mq_output.folder.stem}") - -_ = ax.set_xlabel( - "Proportion of exactly observed peptides (including up to 2 mis-cleavages)") - -fig = ax.get_figure() -fig.tight_layout() -fig.savefig(FIGUREFOLDER / mq_output.folder.stem / 'freq_completeness.png') - -# %% [markdown] -# based on completeness, select valid training data - -# %% -# continously decrease this number in the scope of the project -mask = s_completeness > .6 -s_completeness.loc[mask] diff --git a/project/erda_data_available.ipynb b/project/erda_data_available.ipynb deleted file mode 100644 index c9e9f2283..000000000 --- a/project/erda_data_available.ipynb +++ /dev/null @@ -1,214 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "a1cac16d-ce04-4573-b98b-ac87f1abdf4c", - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "import matplotlib\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "import vaep\n", - "from vaep.io import data_objects\n", - "from vaep.logging import setup_nb_logger\n", - "setup_nb_logger(level=logging.INFO)\n", - "\n", - "from config import FNAME_C_PEPTIDES, FNAME_C_EVIDENCE, FNAME_C_PG, FNAME_C_GENES\n", - "\n", - "FNAME_C_PEPTIDES, FNAME_C_EVIDENCE, FNAME_C_PG, FNAME_C_GENES" - ] - }, - { - "cell_type": "markdown", - "id": "1a3645b4", - "metadata": {}, - "source": [ - "## Aggregated Peptides" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "81592213", - "metadata": 
{}, - "outputs": [], - "source": [ - "peptide_counter = data_objects.PeptideCounter(FNAME_C_PEPTIDES)\n", - "N_SAMPLES = len(peptide_counter.loaded)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c3532c20", - "metadata": {}, - "outputs": [], - "source": [ - "peptide_counter" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cecb0d20", - "metadata": {}, - "outputs": [], - "source": [ - "peptide_counts = peptide_counter.get_df_counts()\n", - "# peptide_counts.index += 1 \n", - "peptide_counts.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dec794c3-5b0f-481a-a14f-e84c231c6365", - "metadata": {}, - "outputs": [], - "source": [ - "peptide_counts.describe(percentiles=np.linspace(0.1,1,10))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fc5d608f-4969-4964-b03f-923d706a5b33", - "metadata": {}, - "outputs": [], - "source": [ - "vaep.plotting.make_large_descriptors()\n", - "ax = peptide_counter.plot_counts()" - ] - }, - { - "cell_type": "markdown", - "id": "6d75c390", - "metadata": {}, - "source": [ - "## Evidence - Peptides by charge and modifications\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2edab869", - "metadata": {}, - "outputs": [], - "source": [ - "evidence_counter = data_objects.EvidenceCounter(FNAME_C_EVIDENCE)\n", - "evidence_count = evidence_counter.get_df_counts()\n", - "evidence_count.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8a9f11bd", - "metadata": {}, - "outputs": [], - "source": [ - "ax = evidence_counter.plot_counts()" - ] - }, - { - "cell_type": "markdown", - "id": "9a980a8e", - "metadata": {}, - "source": [ - "## Protein Groups" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "85264cf5", - "metadata": {}, - "outputs": [], - "source": [ - "pg_counter = data_objects.ProteinGroupsCounter(FNAME_C_PG)\n", - "pg_count = pg_counter.get_df_counts()\n", - 
"pg_count.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6d060cd5", - "metadata": {}, - "outputs": [], - "source": [ - "ax = pg_counter.plot_counts()" - ] - }, - { - "cell_type": "markdown", - "id": "cf95d49f", - "metadata": {}, - "source": [ - "## Genes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "30623792", - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "gene_counter = data_objects.GeneCounter(FNAME_C_GENES)\n", - "gene_count = gene_counter.get_df_counts()\n", - "gene_count.head() # remove NaN entry" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18f788c4", - "metadata": {}, - "outputs": [], - "source": [ - "gene_count = gene_count.iloc[1:]\n", - "gene_count.head() " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "805a3ccc", - "metadata": {}, - "outputs": [], - "source": [ - "ax = gene_counter.plot_counts(df_counts=gene_count) # provide manuelly manipulated gene counts" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "vaep", - "language": "python", - "name": "vaep" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/project/erda_data_available.py b/project/erda_data_available.py deleted file mode 100644 index 81d787108..000000000 --- a/project/erda_data_available.py +++ /dev/null @@ -1,91 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.14.5 -# kernelspec: -# display_name: vaep -# language: python -# name: vaep -# --- - -# %% -import logging -import matplotlib -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd - 
-import vaep -from vaep.io import data_objects -from vaep.logging import setup_nb_logger -setup_nb_logger(level=logging.INFO) - -from config import FNAME_C_PEPTIDES, FNAME_C_EVIDENCE, FNAME_C_PG, FNAME_C_GENES - -FNAME_C_PEPTIDES, FNAME_C_EVIDENCE, FNAME_C_PG, FNAME_C_GENES - -# %% [markdown] -# ## Aggregated Peptides - -# %% -peptide_counter = data_objects.PeptideCounter(FNAME_C_PEPTIDES) -N_SAMPLES = len(peptide_counter.loaded) - -# %% -peptide_counter - -# %% -peptide_counts = peptide_counter.get_df_counts() -# peptide_counts.index += 1 -peptide_counts.head() - -# %% -peptide_counts.describe(percentiles=np.linspace(0.1,1,10)) - -# %% -vaep.plotting.make_large_descriptors() -ax = peptide_counter.plot_counts() - -# %% [markdown] -# ## Evidence - Peptides by charge and modifications -# -# - -# %% -evidence_counter = data_objects.EvidenceCounter(FNAME_C_EVIDENCE) -evidence_count = evidence_counter.get_df_counts() -evidence_count.head() - -# %% -ax = evidence_counter.plot_counts() - -# %% [markdown] -# ## Protein Groups - -# %% -pg_counter = data_objects.ProteinGroupsCounter(FNAME_C_PG) -pg_count = pg_counter.get_df_counts() -pg_count.head() - -# %% -ax = pg_counter.plot_counts() - -# %% [markdown] -# ## Genes - -# %% -gene_counter = data_objects.GeneCounter(FNAME_C_GENES) -gene_count = gene_counter.get_df_counts() -gene_count.head() # remove NaN entry - - -# %% -gene_count = gene_count.iloc[1:] -gene_count.head() - -# %% -ax = gene_counter.plot_counts(df_counts=gene_count) # provide manuelly manipulated gene counts diff --git a/project/misc_FASTA_data_agg_by_gene.ipynb b/project/misc_FASTA_data_agg_by_gene.ipynb deleted file mode 100644 index c4733a5c7..000000000 --- a/project/misc_FASTA_data_agg_by_gene.ipynb +++ /dev/null @@ -1,238 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Protein sequence aggregation by gene" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - 
"source": [ - "from collections import defaultdict\n", - "import json\n", - "from tqdm.notebook import tqdm\n", - "\n", - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "from Bio import Align\n", - "\n", - "from config import FN_FASTA_DB\n", - "from config import fasta_entry as fasta_keys" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with open(FN_FASTA_DB) as f:\n", - " data_fasta = json.load(f)#, indent=4, sort_keys=False)\n", - "len(data_fasta)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gene_isotopes = defaultdict(list)\n", - "protein_wo_gene = []\n", - "for key, fasta_entry in tqdm(data_fasta.items()):\n", - " gene = fasta_entry[fasta_keys.gene]\n", - " if gene:\n", - " gene_isotopes[gene].append(key)\n", - " else:\n", - " protein_wo_gene.append(key)\n", - "\n", - "print(f\"#{len(protein_wo_gene)} proteins have not gene associated: {', '.join(protein_wo_gene[:10])}, ...\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gene = 'ACTG1' # Actin as a contaminant protein\n", - "gene_isotopes[gene]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pprint import pprint\n", - "for isotope in gene_isotopes[gene]:\n", - " pprint(data_fasta[isotope])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Sequences" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sequences = {}\n", - "for isotope in gene_isotopes[gene]:\n", - " sequences[isotope] = data_fasta[isotope][fasta_keys.seq]\n", - "sequences" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sorted(sequences.values(), key=len)" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sequences = pd.Series(sequences)\n", - "sequences.str.len()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "aligner = Align.PairwiseAligner()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "alignments = aligner.align(sequences.loc['I3L1U9'], sequences.loc['I3L3I0']) # Identical? Maybe check if this is more than once the case?\n", - "for alignment in alignments:\n", - " print(alignment)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_fasta['I3L1U9'][fasta_keys.seq] == data_fasta['I3L3I0'][fasta_keys.seq]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "alignments = aligner.align(sequences.loc['I3L1U9'], sequences.loc['I3L3R2']) # Identical?\n", - "for alignment in alignments:\n", - " print(alignment)\n", - " break" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "alignments = aligner.align(sequences.loc['P63261'], sequences.loc['K7EM38']) # Identical?\n", - "for alignment in alignments:\n", - " print(alignment)\n", - " break" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Unique Peptides" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import itertools\n", - "peptides = {}\n", - "for isotope in gene_isotopes[gene]:\n", - " sequences[isotope] = data_fasta[isotope][fasta_keys.peptides][0]\n", - "\n", - "for peptides in itertools.zip_longest(*sequences.values, fillvalue=''):\n", - " if len(set(peptides)) == 1: \n", - " print(f'all identical: {peptides[0]}')\n", - " else:\n", - " print('\\t'.join(peptides))" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for j, peptides in enumerate(sequences.values):\n", - " if j==0:\n", - " set_overlap = set(peptides)\n", - " else:\n", - " set_overlap = set_overlap.intersection(peptides)\n", - "set_overlap" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "vaep", - "language": "python", - "name": "vaep" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/project/misc_FASTA_data_agg_by_gene.py b/project/misc_FASTA_data_agg_by_gene.py deleted file mode 100644 index 60d7a8888..000000000 --- a/project/misc_FASTA_data_agg_by_gene.py +++ /dev/null @@ -1,119 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.14.0 -# kernelspec: -# display_name: vaep -# language: python -# name: vaep -# --- - -# %% [markdown] -# # Protein sequence aggregation by gene - -# %% -from collections import defaultdict -import json -from tqdm.notebook import tqdm - -import numpy as np -import pandas as pd - -from Bio import Align - -from config import FN_FASTA_DB -from config import fasta_entry as fasta_keys - -# %% -with open(FN_FASTA_DB) as f: - data_fasta = json.load(f)#, indent=4, sort_keys=False) -len(data_fasta) - -# %% -gene_isotopes = defaultdict(list) -protein_wo_gene = [] -for key, fasta_entry in tqdm(data_fasta.items()): - gene = fasta_entry[fasta_keys.gene] - if gene: - gene_isotopes[gene].append(key) - else: - protein_wo_gene.append(key) - -print(f"#{len(protein_wo_gene)} proteins have not gene associated: {', 
'.join(protein_wo_gene[:10])}, ...") - -# %% -gene = 'ACTG1' # Actin as a contaminant protein -gene_isotopes[gene] - -# %% -from pprint import pprint -for isotope in gene_isotopes[gene]: - pprint(data_fasta[isotope]) - -# %% [markdown] -# ## Sequences - -# %% -sequences = {} -for isotope in gene_isotopes[gene]: - sequences[isotope] = data_fasta[isotope][fasta_keys.seq] -sequences - -# %% -sorted(sequences.values(), key=len) - -# %% -sequences = pd.Series(sequences) -sequences.str.len() - -# %% -aligner = Align.PairwiseAligner() - -# %% -alignments = aligner.align(sequences.loc['I3L1U9'], sequences.loc['I3L3I0']) # Identical? Maybe check if this is more than once the case? -for alignment in alignments: - print(alignment) - -# %% -data_fasta['I3L1U9'][fasta_keys.seq] == data_fasta['I3L3I0'][fasta_keys.seq] - -# %% -alignments = aligner.align(sequences.loc['I3L1U9'], sequences.loc['I3L3R2']) # Identical? -for alignment in alignments: - print(alignment) - break - -# %% -alignments = aligner.align(sequences.loc['P63261'], sequences.loc['K7EM38']) # Identical? 
-for alignment in alignments: - print(alignment) - break - -# %% [markdown] -# ## Unique Peptides - -# %% -import itertools -peptides = {} -for isotope in gene_isotopes[gene]: - sequences[isotope] = data_fasta[isotope][fasta_keys.peptides][0] - -for peptides in itertools.zip_longest(*sequences.values, fillvalue=''): - if len(set(peptides)) == 1: - print(f'all identical: {peptides[0]}') - else: - print('\t'.join(peptides)) - -# %% -for j, peptides in enumerate(sequences.values): - if j==0: - set_overlap = set(peptides) - else: - set_overlap = set_overlap.intersection(peptides) -set_overlap - -# %% diff --git a/project/misc_FASTA_tryptic_digest.ipynb b/project/misc_FASTA_tryptic_digest.ipynb deleted file mode 100644 index a2d970059..000000000 --- a/project/misc_FASTA_tryptic_digest.ipynb +++ /dev/null @@ -1,1259 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Process FASTA files\n", - "> uses only the provided fasta files in `src.config.py` by `FOLDER_FASTA` \n", - "\n", - "- create theoretically considered peptides considered by search engines\n", - "- dump results as human readable json to `FN_FASTA_DB` file specifed in src.config.\n", - "\n", - "> Based on notebook received by [Annelaura Bach](https://www.cpr.ku.dk/staff/mann-group/?pure=en/persons/443836) and created by Johannes B. 
Müller \\[[scholar](https://scholar.google.com/citations?user=Rn1OS8oAAAAJ&hl=de), [MPI Biochemistry](https://www.biochem.mpg.de/person/93696/2253)\\]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from collections import defaultdict, namedtuple\n", - "import os\n", - "import json\n", - "import logging\n", - "from pathlib import Path\n", - "\n", - "# import matplotlib\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import pandas as pd\n", - "from tqdm.notebook import tqdm" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from vaep.fasta import cleave_to_tryptic\n", - "from vaep.fasta import iterFlatten\n", - "from vaep.fasta import count_peptide_matches\n", - "from vaep.io import search_files\n", - "from vaep.pandas import combine_value_counts\n", - "from vaep.databases.uniprot import query_uniprot_id_mapping\n", - "from vaep.utils import sample_iterable\n", - "from vaep.plotting import _savefig" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from config import FN_FASTA_DB\n", - "from config import FIGUREFOLDER\n", - "from config import FN_ID_MAP\n", - "from config import FN_PROT_GENE_MAP\n", - "from config import FN_PEP_TO_PROT" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Core Functionality - Example\n", - "\n", - "- write tests for core functinality\n", - "- refactor to file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "test_data = {\n", - " \"meta\": \">tr|A0A024R1R8|A0A024R1R8_HUMAN HCG2014768, isoform CRA_a OS=Homo sapiens OX=9606 GN=hCG_2014768 PE=4 SV=1\",\n", - " \"seq\": \"MSSHEGGKKKALKQPKKQAKEMDEEEKAFKQKQKEEQKKLEVLKAKVVGKGPLATGGIKKSGKK\",\n", - " \"peptides\": [\"MSSHEGGK\", \"EMDEEEK\", \"GPLATGGIK\"],\n", - "}" - ] - }, - { - 
"cell_type": "markdown", - "metadata": {}, - "source": [ - "regex is slower than native string replacing and splitting in Python" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# import re\n", - "# cut_by_trypsin = re.compile('([^K]+K)|([^R]+R)')\n", - "# _res = cut_by_trypsin.split(test_data['seq'])\n", - "# [_pep for _pep in _res if _pep != '' and _pep != None]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "- map peptide set of peptides (how to deal with mis-cleavages?)\n", - " - mis-cleavages can happen both to the peptide before and after.\n", - " > `pep1, pep2, pep3, pep4, pep5` \n", - " > `pep1pep2, pep2pep3, pep3pep4, pep4pep5`\n", - " - sliding windows can pass trough the list of peptides - should work with recursion" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "l_peptides = test_data[\"seq\"].replace(\"K\", \"K \").replace(\"R\", \"R \").split()\n", - "l_peptides" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`add_rxk` should add pattern of starting R and trailing K ? 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "last_pep = \"\"\n", - "temp_peps = []\n", - "num_missed_cleavages = 1\n", - "add_rxk = True\n", - "\n", - "sec_last_pep = \"\"\n", - "\n", - "pep_rdx = []\n", - "\n", - "for pep in l_peptides:\n", - " if last_pep != \"\":\n", - " temp_peps.append(last_pep + pep)\n", - " if add_rxk and sec_last_pep != \"\" and len(sec_last_pep) <= 2:\n", - " _pep_rxk = sec_last_pep + last_pep + pep\n", - " print(_pep_rxk)\n", - " pep_rdx.append(_pep_rxk)\n", - " temp_peps.append(_pep_rxk)\n", - "\n", - " sec_last_pep = last_pep # sec_last_pep, last_pep = last_pep, pep ?\n", - " last_pep = pep\n", - "temp_peps" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "repr(pep_rdx)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Missed cleavages core functionality (adapted)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "example_peptides_fasta = cleave_to_tryptic(\n", - " test_data[\"seq\"], num_missed_cleavages=2, add_rxk=True\n", - ")\n", - "print(\"number of peptides: \", [len(_l) for _l in example_peptides_fasta])\n", - "example_peptides_fasta[-1]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"\".join(example_peptides_fasta[0]), *example_peptides_fasta, sep=\"\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "rdx peptides are a subset of two missed cleavage sites peptides. There are omitted when two and more cleavage site can be skipped." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "example_peptides_fasta = cleave_to_tryptic(\n", - " test_data[\"seq\"], num_missed_cleavages=1, add_rxk=True\n", - ")\n", - "print(\"number of peptides: \", [len(_l) for _l in example_peptides_fasta])\n", - "example_peptides_fasta[-1]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Data Structure is no a list of list. Maybe this could be improved. Information what kind of type the peptide is from, is still interesting." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Process Fasta Files\n", - "\n", - "First define input Folder and the file location of the created peptides:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fasta_files = search_files(path=\".\", query=\".fasta\")\n", - "print(fasta_files)\n", - "print(\"\\n\".join(fasta_files.files))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define Setup\n", - "\n", - "Set input FASTA, Output .txt name, lower legth cutoff, missed cleavages and if to report reverse. \n", - "\n", - "Tryptic digest of Fastas to Peptides >6 in list for matching with measured peptides " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "CUTOFF_LEN_PEP = 7\n", - "MAX_MISSED_CLEAVAGES = 2 # default in MaxQuant\n", - "DECOY_REVERSE = False\n", - "SUMMARY_FILE = \"tex/fasta_tryptic_analysis.tex\"\n", - "\n", - "_summary_text = (\n", - " \"The theoretical analysis of the fasta files gives an idea about how many possible peptides \\n\"\n", - " \"can be expected by cleaving proteins using trypsin. 
The hyperparameters for peptide creation are \\n\"\n", - " f\"to consider the minimal peptide length to be {CUTOFF_LEN_PEP} amino acids, \\n\"\n", - " f\"to consider a maximum of {MAX_MISSED_CLEAVAGES} missed cleavage sites (default in MaxQuant) and \\n\"\n", - " f\"to {'not ' if not DECOY_REVERSE else ''}add decoy peptides by reversing peptide sequences. \\n\"\n", - ")\n", - "print(_summary_text, sep=\"\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From the [Fasta Meta information](https://ebi14.uniprot.org/help/fasta-headers) the Identifier is extracted.\n", - "\n", - "```\n", - ">db|UniqueIdentifier|EntryName ProteinName OS=OrganismName OX=OrganismIdentifier [GN=GeneName ]PE=ProteinExistence SV=SequenceVersion\n", - "```\n", - "- db is `sp` for UniProtKB/Swiss-Prot and `tr` for UniProtKB/TrEMBL.\n", - "- `UniqueIdentifier` is the primary *accession number* of the UniProtKB entry. (seems to be used by MQ)\n", - "- `EntryName` is the entry name of the UniProtKB entry.\n", - "- `ProteinName` is the recommended name of the UniProtKB entry as annotated in the *RecName* field. For UniProtKB/TrEMBL entries without a *RecName* field, the *SubName* field is used. In case of multiple SubNames, the first one is used. The 'precursor' attribute is excluded, 'Fragment' is included with the name if applicable." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`>tr` or `>sp`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Schema for single fasta entry" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from config import KEY_FASTA_HEADER, KEY_FASTA_SEQ, KEY_GENE_NAME, KEY_PEPTIDES\n", - "\n", - "from vaep.fasta import read_fasta\n", - "\n", - "data_fasta = {}\n", - "\n", - "# # add Schema?\n", - "# schema_fasta_entry = {\n", - "# KEY_FASTA_HEADER: str,\n", - "# KEY_GENE_NAME: str,\n", - "# KEY_FASTA_SEQ: str,\n", - "# KEY_PEPTIDES: (list, (2,2))\n", - "# }\n", - "# # or dataclass\n", - "# from dataclasses import make_dataclass\n", - "# FastaEntry = make_dataclass(cls_name='FastaEntry', \n", - "# fields=[\n", - "# (KEY_FASTA_HEADER, 'str'),\n", - "# (KEY_GENE_NAME, 'str'),\n", - "# (KEY_FASTA_SEQ, 'str'),\n", - "# (KEY_PEPTIDES, list)\n", - "# ])\n", - "# # or namedtuple\n", - "# FastaEntry = namedtuple('FastaEntry', [KEY_FASTA_HEADER, KEY_GENE_NAME, KEY_FASTA_SEQ, KEY_PEPTIDES])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "How to validate schema of fasta entry stored as dictionary?\n", - "- [schema](https://stackoverflow.com/questions/45812387/how-to-validate-structure-or-schema-of-dictionary-in-python) validation in python discussion" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Process Fasta file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for _fasta in tqdm(fasta_files.files):\n", - "\n", - " with open(_fasta) as fp:\n", - " for i, (metainfo, seq) in tqdm(enumerate(read_fasta(fp))):\n", - " identifier = metainfo.split(\"|\")[1]\n", - " gene = \"|\".join([x.split(\"=\")[-1] for x in metainfo.split() if \"GN=\" in x])\n", - " if identifier in data_fasta:\n", - " raise ValueError(\"Key seen before: 
{}\".format(identifier))\n", - " _all_peptides = cleave_to_tryptic(\n", - " seq, num_missed_cleavages=MAX_MISSED_CLEAVAGES, reversed=DECOY_REVERSE\n", - " )\n", - " data_fasta[identifier] = {\n", - " KEY_FASTA_HEADER: metainfo,\n", - " KEY_GENE_NAME: gene,\n", - " KEY_FASTA_SEQ: seq,\n", - " KEY_PEPTIDES: [\n", - " [_pep for _pep in _peptides if len(_pep) >= CUTOFF_LEN_PEP]\n", - " for _peptides in _all_peptides\n", - " ],\n", - " }" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`fasta_data` holds all information to pick a subset of peptides from peptides intensity tables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# from random import sample\n", - "# sample_ids = sample(list(data_fasta), 10)\n", - "# for _id in sample_ids:\n", - "# print(\"Unique Identifier: {}: \\n\\t AA-Seq: {} \\n\\t Header: {} \\n\\t Peptides: {}\\n\".format(_id, data_fasta[_id]['seq'], data_fasta[_id]['meta'], data_fasta[_id]['peptides']))\n", - "data_fasta[\"A0A024R1R8\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "d_seq_length = {}\n", - "for _key, _data in data_fasta.items():\n", - " d_seq_length[_key] = len(_data[KEY_FASTA_SEQ])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "d_seq_length = pd.Series(d_seq_length)\n", - "d_seq_length.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "test_series = pd.Series({\"A\": 4, \"B\": 1, \"C\": 0, \"D\": 4})\n", - "\n", - "def get_indices_with_value(s: pd.Series, value):\n", - " \"\"\"Return indices for with the value is true\"\"\"\n", - " return s[s == value].index\n", - "\n", - "\n", - "get_indices_with_value(test_series, 4)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Boolean Indexing, remember to set 
[parantheses](https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-indexing)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "MIN_AA_IN_SEQ = 10\n", - "MAX_AA_IN_SEQ = 2000\n", - "mask_min = d_seq_length < MIN_AA_IN_SEQ\n", - "mask_max = d_seq_length > MAX_AA_IN_SEQ\n", - "# _summary_text += f\"\\nThe FASTA file contain {sum(mask_min)} proteins with less than {MIN_AA_IN_SEQ} amino acids (AAs) and {sum(mask_max)} with more than {MAX_AA_IN_SEQ} AAs.\"\n", - "_summary_text += (\n", - " f\"The minimal AA sequence length is {min(d_seq_length)} of UniProt ID {', '.join(get_indices_with_value(d_seq_length, min(d_seq_length)))} \"\n", - " f\"and the maximal sequence lenght is {max(d_seq_length)} for UniProt ID {', '.join(get_indices_with_value(d_seq_length, max(d_seq_length)))}\"\n", - ")\n", - "print(_summary_text)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "_ = d_seq_length.loc[(~mask_max)].to_frame(name=\"AA Seq Length\").plot.hist(bins=200)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "l_genes = []\n", - "n_set = 0\n", - "for _key, _data in data_fasta.items():\n", - " _gene_name = _data[KEY_GENE_NAME]\n", - " if _gene_name:\n", - " l_genes.append(_gene_name)\n", - " n_set += 1" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "_summary_text += (\n", - " f\"\\nIn the FASTA header file {n_set} proteins have a set gene of a total of {len(data_fasta)} proteins,\"\n", - " f\" i.e. {len(data_fasta) - n_set} have an undefined origin. 
There are {len(set(l_genes))} unique gene names in the FASTA file specified.\\n\"\n", - ")\n", - "print(_summary_text)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "len(set(l_genes))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Number of well-defined peptides per protein (isotope)\n", - "\n", - "- well-defined peptides := no cleavage site is missed" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "peps_exact_count_freq = defaultdict(int)\n", - "\n", - "for key, d_data in data_fasta.items():\n", - " _N = len(d_data[KEY_PEPTIDES][0])\n", - " # if _N == 0:\n", - " # print(key)\n", - " # print(d_data)\n", - " peps_exact_count_freq[_N] += 1\n", - "peps_exact_count_freq = pd.Series(dict(peps_exact_count_freq)).sort_index()\n", - "peps_exact_count_freq" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "n_first = 40\n", - "ax = peps_exact_count_freq.iloc[:n_first].plot(kind='bar',\n", - " figsize=(20, 5),\n", - " title=f'Frequency of number of exact peptides (up to {peps_exact_count_freq.iloc[:40].index[-1]})'\n", - " f' representing {peps_exact_count_freq.iloc[:40].sum()} proteins out of '\n", - " f'{peps_exact_count_freq.sum()} ({peps_exact_count_freq.iloc[:40].sum()/peps_exact_count_freq.sum():.2f}%)',\n", - " xlabel=\"Number of exact peptides (considered) in protein sequence\",\n", - " ylabel=\"Number of protein(s) (incl. 
isotopes)\",\n", - " fontsize=10)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "peps_exact_count_freq = pd.Series(dict(peps_exact_count_freq)).sort_index()\n", - "fig = ax.get_figure()\n", - "fig.savefig(Path(FIGUREFOLDER) / 'fasta_exact_peptide_count_freq.png')\n", - "fig.savefig(Path(FIGUREFOLDER) / 'fasta_exact_peptide_count_freq.pdf')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Proteins' Isoforms" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Possible to join \"isoforms\" by joining all variants to one. Isoforms are numbered from the second on by appending `-i` for $i>1$, i.e. starting with `-2`. The gene name of which the protein (isoform) originate can be obtained by using [id mapping](https://www.uniprot.org/help/api_idmapping). Isoforms are not mapped automatically by Uniprot to its GENENAME, i.e. you have to strip all `-i`, e.g `-2`, `-3`, for querying. Here the protein, gene pairs are mapped to the unique protein identifiers." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "prot_ids = list(data_fasta.keys())\n", - "prot_ids = pd.Series(prot_ids)\n", - "prot_ids" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mask = prot_ids.str.contains(\"-\")\n", - "isoforms = prot_ids.copy().loc[mask]\n", - "isoforms" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "N_prot_with_isoform = isoforms.str.split(\"-\").str[0].nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "n_unique_proteins_wo_isoforms = len(prot_ids) - len(isoforms)\n", - "_summary_text += \"\\nA total of {} proteins have at least one more isoform. 
\".format(\n", - " N_prot_with_isoform\n", - ")\n", - "_summary_text += f\"Collapsing isoforms into one protein results in {n_unique_proteins_wo_isoforms} proteins.\"\n", - "print(_summary_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Remove Isoforms from list. How to integrate this information before?\n", - "\n", - "fasta-data has to be merge one-to-many." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "id_map = pd.DataFrame(\n", - " prot_ids.str.split(\"-\").str[0], columns=[\"protein\"]\n", - ") # , index=list(prot_ids))\n", - "id_map.index = pd.Index(prot_ids, name=\"prot_id\")\n", - "id_map" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "id_map.loc[id_map.index.str.contains(\"A0A096LP49|Q9Y6Z5|W5XKT8\")]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "l_proteins = id_map.protein.unique()\n", - "print(\n", - " f\"There are {len(l_proteins)} unique proteins without isoforms listed in the used fasta files.\"\n", - ")\n", - "# Check with pervious result.\n", - "assert n_unique_proteins_wo_isoforms == len(l_proteins)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "try:\n", - " with open(FN_PROT_GENE_MAP) as f:\n", - " dict_protein_to_gene = json.load(f)\n", - " logging.warning(f\"Loaded pre-cached map dict_protein_to_gene: {FN_PROT_GENE_MAP}\")\n", - "except FileNotFoundError:\n", - " dict_protein_to_gene = {}\n", - " start = 0\n", - " for end in list(range(10000, len(l_proteins), 10000)):\n", - " print(f\"Retrieve items {start+1:6} to {end:6}\")\n", - " _id_to_gene = query_uniprot_id_mapping(l_proteins[start:end])\n", - " print(f\"Found {len(_id_to_gene)} gene names\")\n", - " dict_protein_to_gene.update(_id_to_gene)\n", - " start = end\n", - " print(f\"Retrieve 
items {start:6} to {len(l_proteins):6}\")\n", - " _id_to_gene = query_uniprot_id_mapping(l_proteins[start:])\n", - " print(f\"Found {len(_id_to_gene)} gene names\")\n", - " dict_protein_to_gene.update(_id_to_gene)\n", - " with open(FN_PROT_GENE_MAP, \"w\") as f:\n", - " json.dump(dict_protein_to_gene, f, indent=4, sort_keys=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "genes = pd.Series(dict_protein_to_gene, name=\"gene\")\n", - "genes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "assert (\n", - " len(genes) == 72471\n", - "), f\"The number of proteins associated to a gene found on 11.11.2020 was 72471, now it's {len(genes)}\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Add gene names from UniProt to `id_map` DataFrame by an outer join (keeping all information based on the protein names shared by isotopes)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "id_map = id_map.merge(genes, how=\"outer\", left_on=\"protein\", right_index=True)\n", - "id_map.sort_values(by=[\"gene\", \"protein\"], inplace=True)\n", - "id_map" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "id_map.replace('', np.nan)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Add the gene name collected previously from the Fasta Header" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "genes_fasta_offline = pd.DataFrame(\n", - " ((_key, _data[KEY_GENE_NAME]) for _key, _data in data_fasta.items()),\n", - " columns=[\"prot_id\", \"gene_fasta\"],\n", - " ).set_index(\"prot_id\"\n", - " ).replace('', np.nan)\n", - "genes_fasta_offline.loc[genes_fasta_offline.gene_fasta.isna()]" - ] - }, - { - "cell_type": 
"code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "id_map = id_map.merge(\n", - " genes_fasta_offline,\n", - " how=\"outer\",\n", - " left_index=True,\n", - " right_index=True)\n", - "id_map.sort_values(by=[\"gene\", \"protein\"], inplace=True)\n", - "id_map" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mask_no_gene = id_map.gene.isna()\n", - "id_map.loc[mask_no_gene]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Using the genes from the fasta file header reduces the number of missing genes, but additionally other differences arise in the comparison to the lastest version." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mask_gene_diffs = id_map.gene != id_map.gene_fasta\n", - "id_map.loc[mask_gene_diffs]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "id_map.gene.isna().sum(), id_map.gene_fasta.isna()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "id_map.loc[(id_map.gene.isna()) & (id_map.gene_fasta.isna())]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "_summary_text += (\n", - " f\"\\nThere are {id_map.gene.isna().sum()} protein IDs (or {id_map.loc[mask_no_gene].protein.nunique()} proteins) \"\n", - " \"without a gene associated to them in the current online version of UniProt, \"\n", - " f\"whereas there are no genes for only {id_map.gene_fasta.isna().sum()} in the headers of proteins in the used FASTA files.\"\n", - ")\n", - "print(_summary_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Isotopes mapping\n", - "\n", - "Isotopes are mapped now to a protein with the same name. 
The same can be achieved by just discarding everything behind the hypen `-`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "id_map.loc[id_map.index.str.contains(\"-\")]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Save id_map" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "id_map.to_json(FN_ID_MAP, orient=\"split\", indent=4)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Most proteins with a missing gene are deleted\n", - "\n", - "If one checks manually some of the examples (e.g. the hundred provided here), one sees that all are deleted from Uniprot.\n", - "\n", - "> How to obtain different versions of UniProt?!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if not len(dict_protein_to_gene) == len(l_proteins):\n", - " print(\"Not all ids are mapped.\")\n", - " _diff = set(l_proteins).difference(dict_protein_to_gene.keys())\n", - " print(f\"Number of protein identifiers not mapped to a gene in UniProt online: {len(_diff)}\")\n", - " print(f'Look at {100} examples: {\", \".join(sample_iterable(_diff, 100))}')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "_summary_text += (\n", - " f\"\\nMost of the {len(_diff)} proteins ({len(_diff)/len(l_proteins)*100:.2f} percent of the unique proteins) \"\n", - " \"not mapped to a gene name are deleted in the most current version of UniProt (online). 
\"\n", - " \"The versioning of the fasta-files has to be investigated, as there arise differences over time due to updates.\"\n", - ")\n", - "_summary_text += (\n", - " f\"\\nProteins are mapped to a total number of genes of {id_map.gene.nunique()} in the online UniProt version and {id_map.gene_fasta.nunique()} in the offline used one.\\n\"\n", - ")\n", - "print(_summary_text)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "f\"Proteins are mapped to a total number of genes of {len(set(dict_protein_to_gene.values()))}\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Map peptide to either identifier, common protein or gene \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "peptide_to_prot = defaultdict(list)\n", - "for _id, _data in tqdm(data_fasta.items()):\n", - " for _pep in iterFlatten(_data[\"peptides\"]):\n", - " peptide_to_prot[_pep].append(_id)\n", - "\n", - "_summary_text += f\"\\nConsidering {MAX_MISSED_CLEAVAGES} missed cleavage site(s) there are {len(peptide_to_prot):,d} unique peptides.\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(_summary_text)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "{_key: peptide_to_prot[_key] for _key in sample_iterable(peptide_to_prot.keys())}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "with open(FN_PEP_TO_PROT, \"w\") as f:\n", - " json.dump(peptide_to_prot, f, indent=4, sort_keys=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Plot histograms for different levels of abstraction\n", - "\n", - "Plot counts of matched \n", - " 1. protein IDs\n", - " 2. proteins (joining isoforms)\n", - " 3. 
genes\n", - " \n", - "to their peptides. See how many unique peptides exist. The number of peptides should stay the same, so the counts do not have to be normalized." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "USE_OFFLINE_FASTA_GENES = True\n", - "if USE_OFFLINE_FASTA_GENES:\n", - " dict_protein_to_gene = genes_fasta_offline.loc[~genes_fasta_offline.index.str.contains('-')]\n", - " dict_protein_to_gene = dict_protein_to_gene.dropna().to_dict()['gene_fasta']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "{_key: dict_protein_to_gene[_key] for _key in sample_iterable(dict_protein_to_gene.keys(), 10)}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "len(dict_protein_to_gene)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "counters = {}\n", - "levels = [\"protein_id\", \"protein\", \"gene\"]\n", - "for level in levels:\n", - " counters[level] = pd.Series(\n", - " count_peptide_matches(peptide_to_prot, dict_protein_to_gene, level=level)\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for level in levels:\n", - " print(f\"{level}: {counters[level]['AACLCFR']}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "peptide_to_prot[\"AACLCFR\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "_prots = {x.split(\"-\")[0] for x in peptide_to_prot[\"AACLCFR\"]}\n", - "{dict_protein_to_gene[_prot] for _prot in _prots}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "counts_by_level = combine_value_counts(pd.DataFrame(counters))\n", - 
"counts_by_level = counts_by_level.replace(np.nan, 0).astype(int)\n", - "counts_by_level" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Interpretation: Peptides are assigned \\# of times to a protein_id, protein or gene respectively." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check that for all levels the same number of peptides are counted." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "counts_by_level.sum()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Plot the frequency of matched proteins to one peptide sequence:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fig, ax = plt.subplots(figsize=(13, 7))\n", - "\n", - "ax = counts_by_level.iloc[:5].plot(kind=\"bar\", ax=ax)\n", - "ax.set_ylabel(\"peptide counts\")\n", - "ax.set_xlabel(\"number of matched levels\")\n", - "# ax.yaxis.set_major_formatter(\"{x:,}\")\n", - "_y_ticks = ax.set_yticks(list(range(0, 3_500_000, 500_000))) # is there a ways to transform float to int in matplotlib?\n", - "_y_ticks_labels = ax.set_yticklabels([f\"{x:,}\" for x in range(0, 3_500_000, 500_000)])\n", - "\n", - "_savefig(fig, folder=\"figures\", name=\"fasta_top4\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fig, axes = plt.subplots(2, 2, figsize=(17, 10))\n", - "axes = axes.flatten()\n", - "\n", - "counts_by_level.iloc[:10].plot(kind=\"bar\", ax=axes[0])\n", - "axes[0].set_title(\"up to 9 matches\")\n", - "axes[0].set_yticks(list(range(0, 3_500_000, 500_000)))\n", - "axes[0].set_yticklabels(['0', '500,000', '1,000,000', '1,500,000', '2,000,000', '2,500,000', '3,000,000'])\n", - "\n", - "_start = 10\n", - "for i, _end in enumerate([31, 61], start=1):\n", - " counts_by_level.iloc[_start:_end].plot(kind=\"bar\", ax=axes[i])\n", - " 
axes[i].set_title(f\"{_start} to {_end-1} matches\")\n", - " _start = _end\n", - "\n", - "i += 1\n", - "counts_by_level.iloc[-30:].plot(kind=\"bar\", ax=axes[i])\n", - "axes[i].set_title(f\"{30} most frequent matches\")\n", - "\n", - "\n", - "axes = axes.reshape((2, 2))\n", - "\n", - "pad = 5 # in point\n", - "for i in range(2):\n", - " axes[-1, i].set_xlabel(\"Count of number of matches for a peptide\")\n", - " axes[i, 0].set_ylabel(\"number of peptides\")\n", - "\n", - "_ = fig.suptitle(\n", - " \"Frequency of peptides matched to x items of protein IDs, proteins (combining isotopes) and genes\",\n", - " fontsize=16,\n", - ")\n", - "\n", - "\n", - "fig.tight_layout()\n", - "_savefig(fig, folder=\"figures\", name=\"fasta_mapping_counts\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "check for homology of sequences in python?" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create Automated report\n", - "\n", - "- paragraph in tex for report\n", - "- summary table" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(_summary_text)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "Path(SUMMARY_FILE).parent.mkdir(exist_ok=True)\n", - "with open(Path(SUMMARY_FILE), \"w\") as f:\n", - " f.write(_summary_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save mappings as JSON\n", - "\n", - "Each `protein_id` is an entry with the following information:\n", - "```\n", - "'meta': \n", - "'seq': \n", - "'peptides': \n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "with open(FN_FASTA_DB, \"w\") as f:\n", - " json.dump(data_fasta, f, indent=4, sort_keys=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - 
"source": [ - "os.stat(FN_FASTA_DB).st_size / 1024 / 1024" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "vaep", - "language": "python", - "name": "vaep" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13 (default, Mar 28 2022, 06:59:08) [MSC v.1916 64 bit (AMD64)]" - }, - "toc-autonumbering": true - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/project/misc_FASTA_tryptic_digest.py b/project/misc_FASTA_tryptic_digest.py deleted file mode 100644 index 80dde87eb..000000000 --- a/project/misc_FASTA_tryptic_digest.py +++ /dev/null @@ -1,687 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.14.0 -# kernelspec: -# display_name: vaep -# language: python -# name: vaep -# --- - -# %% [markdown] -# # Process FASTA files -# > uses only the provided fasta files in `src.config.py` by `FOLDER_FASTA` -# -# - create theoretically considered peptides considered by search engines -# - dump results as human readable json to `FN_FASTA_DB` file specifed in src.config. -# -# > Based on notebook received by [Annelaura Bach](https://www.cpr.ku.dk/staff/mann-group/?pure=en/persons/443836) and created by Johannes B. 
Müller \[[scholar](https://scholar.google.com/citations?user=Rn1OS8oAAAAJ&hl=de), [MPI Biochemistry](https://www.biochem.mpg.de/person/93696/2253)\] - -# %% -from collections import defaultdict, namedtuple -import os -import json -import logging -from pathlib import Path - -# import matplotlib -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -from tqdm.notebook import tqdm - -# %% -from vaep.fasta import cleave_to_tryptic -from vaep.fasta import iterFlatten -from vaep.fasta import count_peptide_matches -from vaep.io import search_files -from vaep.pandas import combine_value_counts -from vaep.databases.uniprot import query_uniprot_id_mapping -from vaep.utils import sample_iterable -from vaep.plotting import _savefig - -# %% -from config import FN_FASTA_DB -from config import FIGUREFOLDER -from config import FN_ID_MAP -from config import FN_PROT_GENE_MAP -from config import FN_PEP_TO_PROT - -# %% [markdown] -# ## Core Functionality - Example -# -# - write tests for core functinality -# - refactor to file - -# %% -test_data = { - "meta": ">tr|A0A024R1R8|A0A024R1R8_HUMAN HCG2014768, isoform CRA_a OS=Homo sapiens OX=9606 GN=hCG_2014768 PE=4 SV=1", - "seq": "MSSHEGGKKKALKQPKKQAKEMDEEEKAFKQKQKEEQKKLEVLKAKVVGKGPLATGGIKKSGKK", - "peptides": ["MSSHEGGK", "EMDEEEK", "GPLATGGIK"], -} - -# %% [markdown] -# regex is slower than native string replacing and splitting in Python - -# %% -# import re -# cut_by_trypsin = re.compile('([^K]+K)|([^R]+R)') -# _res = cut_by_trypsin.split(test_data['seq']) -# [_pep for _pep in _res if _pep != '' and _pep != None] - -# %% [markdown] -# -# - map peptide set of peptides (how to deal with mis-cleavages?) -# - mis-cleavages can happen both to the peptide before and after. 
-# > `pep1, pep2, pep3, pep4, pep5` -# > `pep1pep2, pep2pep3, pep3pep4, pep4pep5` -# - sliding windows can pass trough the list of peptides - should work with recursion - -# %% -l_peptides = test_data["seq"].replace("K", "K ").replace("R", "R ").split() -l_peptides - -# %% [markdown] -# `add_rxk` should add pattern of starting R and trailing K ? - -# %% -last_pep = "" -temp_peps = [] -num_missed_cleavages = 1 -add_rxk = True - -sec_last_pep = "" - -pep_rdx = [] - -for pep in l_peptides: - if last_pep != "": - temp_peps.append(last_pep + pep) - if add_rxk and sec_last_pep != "" and len(sec_last_pep) <= 2: - _pep_rxk = sec_last_pep + last_pep + pep - print(_pep_rxk) - pep_rdx.append(_pep_rxk) - temp_peps.append(_pep_rxk) - - sec_last_pep = last_pep # sec_last_pep, last_pep = last_pep, pep ? - last_pep = pep -temp_peps - -# %% -repr(pep_rdx) - -# %% [markdown] -# Missed cleavages core functionality (adapted) - -# %% -example_peptides_fasta = cleave_to_tryptic( - test_data["seq"], num_missed_cleavages=2, add_rxk=True -) -print("number of peptides: ", [len(_l) for _l in example_peptides_fasta]) -example_peptides_fasta[-1] - -# %% -print("".join(example_peptides_fasta[0]), *example_peptides_fasta, sep="\n") - -# %% [markdown] -# rdx peptides are a subset of two missed cleavage sites peptides. There are omitted when two and more cleavage site can be skipped. - -# %% -example_peptides_fasta = cleave_to_tryptic( - test_data["seq"], num_missed_cleavages=1, add_rxk=True -) -print("number of peptides: ", [len(_l) for _l in example_peptides_fasta]) -example_peptides_fasta[-1] - -# %% [markdown] -# Data Structure is no a list of list. Maybe this could be improved. Information what kind of type the peptide is from, is still interesting. 
- -# %% [markdown] -# ## Process Fasta Files -# -# First define input Folder and the file location of the created peptides: - -# %% -fasta_files = search_files(path=".", query=".fasta") -print(fasta_files) -print("\n".join(fasta_files.files)) - -# %% [markdown] -# ### Define Setup -# -# Set input FASTA, Output .txt name, lower legth cutoff, missed cleavages and if to report reverse. -# -# Tryptic digest of Fastas to Peptides >6 in list for matching with measured peptides - -# %% -CUTOFF_LEN_PEP = 7 -MAX_MISSED_CLEAVAGES = 2 # default in MaxQuant -DECOY_REVERSE = False -SUMMARY_FILE = "tex/fasta_tryptic_analysis.tex" - -_summary_text = ( - "The theoretical analysis of the fasta files gives an idea about how many possible peptides \n" - "can be expected by cleaving proteins using trypsin. The hyperparameters for peptide creation are \n" - f"to consider the minimal peptide length to be {CUTOFF_LEN_PEP} amino acids, \n" - f"to consider a maximum of {MAX_MISSED_CLEAVAGES} missed cleavage sites (default in MaxQuant) and \n" - f"to {'not ' if not DECOY_REVERSE else ''}add decoy peptides by reversing peptide sequences. \n" -) -print(_summary_text, sep="\n") - -# %% [markdown] -# From the [Fasta Meta information](https://ebi14.uniprot.org/help/fasta-headers) the Identifier is extracted. -# -# ``` -# >db|UniqueIdentifier|EntryName ProteinName OS=OrganismName OX=OrganismIdentifier [GN=GeneName ]PE=ProteinExistence SV=SequenceVersion -# ``` -# - db is `sp` for UniProtKB/Swiss-Prot and `tr` for UniProtKB/TrEMBL. -# - `UniqueIdentifier` is the primary *accession number* of the UniProtKB entry. (seems to be used by MQ) -# - `EntryName` is the entry name of the UniProtKB entry. -# - `ProteinName` is the recommended name of the UniProtKB entry as annotated in the *RecName* field. For UniProtKB/TrEMBL entries without a *RecName* field, the *SubName* field is used. In case of multiple SubNames, the first one is used. 
The 'precursor' attribute is excluded, 'Fragment' is included with the name if applicable. - -# %% [markdown] -# `>tr` or `>sp` - -# %% [markdown] -# ### Schema for single fasta entry - -# %% -from config import KEY_FASTA_HEADER, KEY_FASTA_SEQ, KEY_GENE_NAME, KEY_PEPTIDES - -from vaep.fasta import read_fasta - -data_fasta = {} - -# # add Schema? -# schema_fasta_entry = { -# KEY_FASTA_HEADER: str, -# KEY_GENE_NAME: str, -# KEY_FASTA_SEQ: str, -# KEY_PEPTIDES: (list, (2,2)) -# } -# # or dataclass -# from dataclasses import make_dataclass -# FastaEntry = make_dataclass(cls_name='FastaEntry', -# fields=[ -# (KEY_FASTA_HEADER, 'str'), -# (KEY_GENE_NAME, 'str'), -# (KEY_FASTA_SEQ, 'str'), -# (KEY_PEPTIDES, list) -# ]) -# # or namedtuple -# FastaEntry = namedtuple('FastaEntry', [KEY_FASTA_HEADER, KEY_GENE_NAME, KEY_FASTA_SEQ, KEY_PEPTIDES]) - -# %% [markdown] -# How to validate schema of fasta entry stored as dictionary? -# - [schema](https://stackoverflow.com/questions/45812387/how-to-validate-structure-or-schema-of-dictionary-in-python) validation in python discussion - -# %% [markdown] -# ### Process Fasta file - -# %% -for _fasta in tqdm(fasta_files.files): - - with open(_fasta) as fp: - for i, (metainfo, seq) in tqdm(enumerate(read_fasta(fp))): - identifier = metainfo.split("|")[1] - gene = "|".join([x.split("=")[-1] for x in metainfo.split() if "GN=" in x]) - if identifier in data_fasta: - raise ValueError("Key seen before: {}".format(identifier)) - _all_peptides = cleave_to_tryptic( - seq, num_missed_cleavages=MAX_MISSED_CLEAVAGES, reversed=DECOY_REVERSE - ) - data_fasta[identifier] = { - KEY_FASTA_HEADER: metainfo, - KEY_GENE_NAME: gene, - KEY_FASTA_SEQ: seq, - KEY_PEPTIDES: [ - [_pep for _pep in _peptides if len(_pep) >= CUTOFF_LEN_PEP] - for _peptides in _all_peptides - ], - } - -# %% [markdown] -# `fasta_data` holds all information to pick a subset of peptides from peptides intensity tables - -# %% -# from random import sample -# sample_ids = 
sample(list(data_fasta), 10) -# for _id in sample_ids: -# print("Unique Identifier: {}: \n\t AA-Seq: {} \n\t Header: {} \n\t Peptides: {}\n".format(_id, data_fasta[_id]['seq'], data_fasta[_id]['meta'], data_fasta[_id]['peptides'])) -data_fasta["A0A024R1R8"] - -# %% -d_seq_length = {} -for _key, _data in data_fasta.items(): - d_seq_length[_key] = len(_data[KEY_FASTA_SEQ]) - -# %% -d_seq_length = pd.Series(d_seq_length) -d_seq_length.describe() - -# %% -test_series = pd.Series({"A": 4, "B": 1, "C": 0, "D": 4}) - -def get_indices_with_value(s: pd.Series, value): - """Return indices for with the value is true""" - return s[s == value].index - - -get_indices_with_value(test_series, 4) - -# %% [markdown] -# Boolean Indexing, remember to set [parantheses](https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-indexing) - -# %% -MIN_AA_IN_SEQ = 10 -MAX_AA_IN_SEQ = 2000 -mask_min = d_seq_length < MIN_AA_IN_SEQ -mask_max = d_seq_length > MAX_AA_IN_SEQ -# _summary_text += f"\nThe FASTA file contain {sum(mask_min)} proteins with less than {MIN_AA_IN_SEQ} amino acids (AAs) and {sum(mask_max)} with more than {MAX_AA_IN_SEQ} AAs." -_summary_text += ( - f"The minimal AA sequence length is {min(d_seq_length)} of UniProt ID {', '.join(get_indices_with_value(d_seq_length, min(d_seq_length)))} " - f"and the maximal sequence lenght is {max(d_seq_length)} for UniProt ID {', '.join(get_indices_with_value(d_seq_length, max(d_seq_length)))}" -) -print(_summary_text) - -# %% -_ = d_seq_length.loc[(~mask_max)].to_frame(name="AA Seq Length").plot.hist(bins=200) - -# %% -l_genes = [] -n_set = 0 -for _key, _data in data_fasta.items(): - _gene_name = _data[KEY_GENE_NAME] - if _gene_name: - l_genes.append(_gene_name) - n_set += 1 - -# %% -_summary_text += ( - f"\nIn the FASTA header file {n_set} proteins have a set gene of a total of {len(data_fasta)} proteins," - f" i.e. {len(data_fasta) - n_set} have an undefined origin. 
There are {len(set(l_genes))} unique gene names in the FASTA file specified.\n" -) -print(_summary_text) - -# %% -len(set(l_genes)) - -# %% [markdown] -# ## Number of well-defined peptides per protein (isotope) -# -# - well-defined peptides := no cleavage site is missed - -# %% -peps_exact_count_freq = defaultdict(int) - -for key, d_data in data_fasta.items(): - _N = len(d_data[KEY_PEPTIDES][0]) - # if _N == 0: - # print(key) - # print(d_data) - peps_exact_count_freq[_N] += 1 -peps_exact_count_freq = pd.Series(dict(peps_exact_count_freq)).sort_index() -peps_exact_count_freq - -# %% -n_first = 40 -ax = peps_exact_count_freq.iloc[:n_first].plot(kind='bar', - figsize=(20, 5), - title=f'Frequency of number of exact peptides (up to {peps_exact_count_freq.iloc[:40].index[-1]})' - f' representing {peps_exact_count_freq.iloc[:40].sum()} proteins out of ' - f'{peps_exact_count_freq.sum()} ({peps_exact_count_freq.iloc[:40].sum()/peps_exact_count_freq.sum():.2f}%)', - xlabel="Number of exact peptides (considered) in protein sequence", - ylabel="Number of protein(s) (incl. isotopes)", - fontsize=10) - -# %% -peps_exact_count_freq = pd.Series(dict(peps_exact_count_freq)).sort_index() -fig = ax.get_figure() -fig.savefig(Path(FIGUREFOLDER) / 'fasta_exact_peptide_count_freq.png') -fig.savefig(Path(FIGUREFOLDER) / 'fasta_exact_peptide_count_freq.pdf') - -# %% [markdown] -# ### Proteins' Isoforms - -# %% [markdown] -# Possible to join "isoforms" by joining all variants to one. Isoforms are numbered from the second on by appending `-i` for $i>1$, i.e. starting with `-2`. The gene name of which the protein (isoform) originate can be obtained by using [id mapping](https://www.uniprot.org/help/api_idmapping). Isoforms are not mapped automatically by Uniprot to its GENENAME, i.e. you have to strip all `-i`, e.g `-2`, `-3`, for querying. Here the protein, gene pairs are mapped to the unique protein identifiers. 
- -# %% -prot_ids = list(data_fasta.keys()) -prot_ids = pd.Series(prot_ids) -prot_ids - -# %% -mask = prot_ids.str.contains("-") -isoforms = prot_ids.copy().loc[mask] -isoforms - -# %% -N_prot_with_isoform = isoforms.str.split("-").str[0].nunique() - -# %% -n_unique_proteins_wo_isoforms = len(prot_ids) - len(isoforms) -_summary_text += "\nA total of {} proteins have at least one more isoform. ".format( - N_prot_with_isoform -) -_summary_text += f"Collapsing isoforms into one protein results in {n_unique_proteins_wo_isoforms} proteins." -print(_summary_text) - -# %% [markdown] -# Remove Isoforms from list. How to integrate this information before? -# -# fasta-data has to be merge one-to-many. - -# %% -id_map = pd.DataFrame( - prot_ids.str.split("-").str[0], columns=["protein"] -) # , index=list(prot_ids)) -id_map.index = pd.Index(prot_ids, name="prot_id") -id_map - -# %% -id_map.loc[id_map.index.str.contains("A0A096LP49|Q9Y6Z5|W5XKT8")] - -# %% -l_proteins = id_map.protein.unique() -print( - f"There are {len(l_proteins)} unique proteins without isoforms listed in the used fasta files." -) -# Check with pervious result. 
-assert n_unique_proteins_wo_isoforms == len(l_proteins) - -# %% -try: - with open(FN_PROT_GENE_MAP) as f: - dict_protein_to_gene = json.load(f) - logging.warning(f"Loaded pre-cached map dict_protein_to_gene: {FN_PROT_GENE_MAP}") -except FileNotFoundError: - dict_protein_to_gene = {} - start = 0 - for end in list(range(10000, len(l_proteins), 10000)): - print(f"Retrieve items {start+1:6} to {end:6}") - _id_to_gene = query_uniprot_id_mapping(l_proteins[start:end]) - print(f"Found {len(_id_to_gene)} gene names") - dict_protein_to_gene.update(_id_to_gene) - start = end - print(f"Retrieve items {start:6} to {len(l_proteins):6}") - _id_to_gene = query_uniprot_id_mapping(l_proteins[start:]) - print(f"Found {len(_id_to_gene)} gene names") - dict_protein_to_gene.update(_id_to_gene) - with open(FN_PROT_GENE_MAP, "w") as f: - json.dump(dict_protein_to_gene, f, indent=4, sort_keys=False) - -# %% -genes = pd.Series(dict_protein_to_gene, name="gene") -genes - -# %% -assert ( - len(genes) == 72471 -), f"The number of proteins associated to a gene found on 11.11.2020 was 72471, now it's {len(genes)}" - -# %% [markdown] -# Add gene names from UniProt to `id_map` DataFrame by an outer join (keeping all information based on the protein names shared by isotopes) - -# %% -id_map = id_map.merge(genes, how="outer", left_on="protein", right_index=True) -id_map.sort_values(by=["gene", "protein"], inplace=True) -id_map - -# %% -id_map.replace('', np.nan) - -# %% [markdown] -# Add the gene name collected previously from the Fasta Header - -# %% -genes_fasta_offline = pd.DataFrame( - ((_key, _data[KEY_GENE_NAME]) for _key, _data in data_fasta.items()), - columns=["prot_id", "gene_fasta"], - ).set_index("prot_id" - ).replace('', np.nan) -genes_fasta_offline.loc[genes_fasta_offline.gene_fasta.isna()] - -# %% -id_map = id_map.merge( - genes_fasta_offline, - how="outer", - left_index=True, - right_index=True) -id_map.sort_values(by=["gene", "protein"], inplace=True) -id_map - -# %% -mask_no_gene 
= id_map.gene.isna() -id_map.loc[mask_no_gene] - -# %% [markdown] -# Using the genes from the fasta file header reduces the number of missing genes, but additionally other differences arise in the comparison to the lastest version. - -# %% -mask_gene_diffs = id_map.gene != id_map.gene_fasta -id_map.loc[mask_gene_diffs] - -# %% -id_map.gene.isna().sum(), id_map.gene_fasta.isna() - -# %% -id_map.loc[(id_map.gene.isna()) & (id_map.gene_fasta.isna())] - -# %% -_summary_text += ( - f"\nThere are {id_map.gene.isna().sum()} protein IDs (or {id_map.loc[mask_no_gene].protein.nunique()} proteins) " - "without a gene associated to them in the current online version of UniProt, " - f"whereas there are no genes for only {id_map.gene_fasta.isna().sum()} in the headers of proteins in the used FASTA files." -) -print(_summary_text) - -# %% [markdown] -# ### Isotopes mapping -# -# Isotopes are mapped now to a protein with the same name. The same can be achieved by just discarding everything behind the hypen `-` - -# %% -id_map.loc[id_map.index.str.contains("-")] - -# %% [markdown] -# Save id_map - -# %% -id_map.to_json(FN_ID_MAP, orient="split", indent=4) - -# %% [markdown] -# ### Most proteins with a missing gene are deleted -# -# If one checks manually some of the examples (e.g. the hundred provided here), one sees that all are deleted from Uniprot. -# -# > How to obtain different versions of UniProt?! - -# %% -if not len(dict_protein_to_gene) == len(l_proteins): - print("Not all ids are mapped.") - _diff = set(l_proteins).difference(dict_protein_to_gene.keys()) - print(f"Number of protein identifiers not mapped to a gene in UniProt online: {len(_diff)}") - print(f'Look at {100} examples: {", ".join(sample_iterable(_diff, 100))}') - -# %% -_summary_text += ( - f"\nMost of the {len(_diff)} proteins ({len(_diff)/len(l_proteins)*100:.2f} percent of the unique proteins) " - "not mapped to a gene name are deleted in the most current version of UniProt (online). 
" - "The versioning of the fasta-files has to be investigated, as there arise differences over time due to updates." -) -_summary_text += ( - f"\nProteins are mapped to a total number of genes of {id_map.gene.nunique()} in the online UniProt version and {id_map.gene_fasta.nunique()} in the offline used one.\n" -) -print(_summary_text) - -# %% -f"Proteins are mapped to a total number of genes of {len(set(dict_protein_to_gene.values()))}" - -# %% [markdown] -# ### Map peptide to either identifier, common protein or gene -# - -# %% -peptide_to_prot = defaultdict(list) -for _id, _data in tqdm(data_fasta.items()): - for _pep in iterFlatten(_data["peptides"]): - peptide_to_prot[_pep].append(_id) - -_summary_text += f"\nConsidering {MAX_MISSED_CLEAVAGES} missed cleavage site(s) there are {len(peptide_to_prot):,d} unique peptides." - -# %% -print(_summary_text) - -# %% -{_key: peptide_to_prot[_key] for _key in sample_iterable(peptide_to_prot.keys())} - -# %% -# %%time -with open(FN_PEP_TO_PROT, "w") as f: - json.dump(peptide_to_prot, f, indent=4, sort_keys=False) - -# %% [markdown] -# ### Plot histograms for different levels of abstraction -# -# Plot counts of matched -# 1. protein IDs -# 2. proteins (joining isoforms) -# 3. genes -# -# to their peptides. See how many unique peptides exist. The number of peptides should stay the same, so the counts do not have to be normalized. 
- -# %% -USE_OFFLINE_FASTA_GENES = True -if USE_OFFLINE_FASTA_GENES: - dict_protein_to_gene = genes_fasta_offline.loc[~genes_fasta_offline.index.str.contains('-')] - dict_protein_to_gene = dict_protein_to_gene.dropna().to_dict()['gene_fasta'] - -# %% -{_key: dict_protein_to_gene[_key] for _key in sample_iterable(dict_protein_to_gene.keys(), 10)} - -# %% -len(dict_protein_to_gene) - -# %% -counters = {} -levels = ["protein_id", "protein", "gene"] -for level in levels: - counters[level] = pd.Series( - count_peptide_matches(peptide_to_prot, dict_protein_to_gene, level=level) - ) - -# %% -for level in levels: - print(f"{level}: {counters[level]['AACLCFR']}") - -# %% -peptide_to_prot["AACLCFR"] - -# %% -_prots = {x.split("-")[0] for x in peptide_to_prot["AACLCFR"]} -{dict_protein_to_gene[_prot] for _prot in _prots} - -# %% -counts_by_level = combine_value_counts(pd.DataFrame(counters)) -counts_by_level = counts_by_level.replace(np.nan, 0).astype(int) -counts_by_level - -# %% [markdown] -# Interpretation: Peptides are assigned \# of times to a protein_id, protein or gene respectively. - -# %% [markdown] -# Check that for all levels the same number of peptides are counted. - -# %% -counts_by_level.sum() - -# %% [markdown] -# Plot the frequency of matched proteins to one peptide sequence: - -# %% -fig, ax = plt.subplots(figsize=(13, 7)) - -ax = counts_by_level.iloc[:5].plot(kind="bar", ax=ax) -ax.set_ylabel("peptide counts") -ax.set_xlabel("number of matched levels") -# ax.yaxis.set_major_formatter("{x:,}") -_y_ticks = ax.set_yticks(list(range(0, 3_500_000, 500_000))) # is there a ways to transform float to int in matplotlib? 
-_y_ticks_labels = ax.set_yticklabels([f"{x:,}" for x in range(0, 3_500_000, 500_000)]) - -_savefig(fig, folder="figures", name="fasta_top4") - -# %% -fig, axes = plt.subplots(2, 2, figsize=(17, 10)) -axes = axes.flatten() - -counts_by_level.iloc[:10].plot(kind="bar", ax=axes[0]) -axes[0].set_title("up to 9 matches") -axes[0].set_yticks(list(range(0, 3_500_000, 500_000))) -axes[0].set_yticklabels(['0', '500,000', '1,000,000', '1,500,000', '2,000,000', '2,500,000', '3,000,000']) - -_start = 10 -for i, _end in enumerate([31, 61], start=1): - counts_by_level.iloc[_start:_end].plot(kind="bar", ax=axes[i]) - axes[i].set_title(f"{_start} to {_end-1} matches") - _start = _end - -i += 1 -counts_by_level.iloc[-30:].plot(kind="bar", ax=axes[i]) -axes[i].set_title(f"{30} most frequent matches") - - -axes = axes.reshape((2, 2)) - -pad = 5 # in point -for i in range(2): - axes[-1, i].set_xlabel("Count of number of matches for a peptide") - axes[i, 0].set_ylabel("number of peptides") - -_ = fig.suptitle( - "Frequency of peptides matched to x items of protein IDs, proteins (combining isotopes) and genes", - fontsize=16, -) - - -fig.tight_layout() -_savefig(fig, folder="figures", name="fasta_mapping_counts") - -# %% [markdown] -# check for homology of sequences in python? 
- -# %% [markdown] -# ## Create Automated report -# -# - paragraph in tex for report -# - summary table - -# %% -print(_summary_text) - -# %% -Path(SUMMARY_FILE).parent.mkdir(exist_ok=True) -with open(Path(SUMMARY_FILE), "w") as f: - f.write(_summary_text) - -# %% [markdown] -# ## Save mappings as JSON -# -# Each `protein_id` is an entry with the following information: -# ``` -# 'meta': -# 'seq': -# 'peptides': -# ``` - -# %% -# %%time -with open(FN_FASTA_DB, "w") as f: - json.dump(data_fasta, f, indent=4, sort_keys=False) - -# %% -os.stat(FN_FASTA_DB).st_size / 1024 / 1024 diff --git a/project/misc_MaxQuantOutput.ipynb b/project/misc_MaxQuantOutput.ipynb deleted file mode 100644 index d855747e1..000000000 --- a/project/misc_MaxQuantOutput.ipynb +++ /dev/null @@ -1,2556 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# MaxQuantOutput" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "8016b389a5db4e868898f50e9e1987b9", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Dropdown(description='Select a folder', options=(WindowsPath('data/mq_out/20190611_QX3_LiSc_MA_Hela_500ng_LC15…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from pathlib import Path\n", - "import ipywidgets as w\n", - "\n", - "import vaep.io as io\n", - "import vaep.io.mq as mq\n", - "\n", - "from config import FOLDER_MQ_TXT_DATA\n", - "\n", - "folders = io.search_subfolders(path=FOLDER_MQ_TXT_DATA, depth=1, exclude_root=True)\n", - "w_folder = w.Dropdown(options=folders, description='Select a folder')\n", - "w_folder" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## MaxQuantOutput class\n", - "\n", - "Instead of handling the files manually in a MQ folder, e.g. 
like" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "PathsList(files=['.ipynb_checkpoints', 'allPeptides.txt', 'evidence.txt', 'matchedFeatures.txt', 'modificationSpecificPeptides.txt', 'ms3Scans.txt', 'msms.txt', 'msmsScans.txt', 'mzRange.txt', 'parameters.txt', 'peptides.txt', 'proteinGroups.txt', 'summary.txt', 'tables.pdf'], folder=WindowsPath('data/mq_out/20190611_QX3_LiSc_MA_Hela_500ng_LC15'))" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "all_files = io.search_files(path=w_folder.value, query='')\n", - "all_files" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "files can just be accessed using the `MaxQuantOutput` class." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "MaxQuantOutput(WindowsPath('data/mq_out/20190611_QX3_LiSc_MA_Hela_500ng_LC15'))" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mq_output = mq.MaxQuantOutput(w_folder.value)\n", - "mq_output" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This lists the files in the current folder for you (calling `search_files`):" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['allPeptides.txt',\n", - " 'evidence.txt',\n", - " 'matchedFeatures.txt',\n", - " 'modificationSpecificPeptides.txt',\n", - " 'ms3Scans.txt',\n", - " 'msms.txt',\n", - " 'msmsScans.txt',\n", - " 'mzRange.txt',\n", - " 'parameters.txt',\n", - " 'peptides.txt',\n", - " 'proteinGroups.txt',\n", - " 'summary.txt']" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mq_output.files" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - 
"And extends the class attributes on intialization by the expected files (statically):" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['NAME_FILE_MAP',\n", - " 'dump_intensity',\n", - " 'find_attribute',\n", - " 'get_files',\n", - " 'get_list_of_attributes',\n", - " 'load',\n", - " 'register_file']" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mq_output._inital_attritubutes" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['NAME_FILE_MAP',\n", - " 'OxidationSites',\n", - " '_inital_attritubutes',\n", - " 'allPeptides',\n", - " 'dump_intensity',\n", - " 'evidence',\n", - " 'files',\n", - " 'find_attribute',\n", - " 'folder',\n", - " 'get_files',\n", - " 'get_list_of_attributes',\n", - " 'load',\n", - " 'matchedFeatures',\n", - " 'modificationSpecificPeptides',\n", - " 'ms3Scans',\n", - " 'msms',\n", - " 'msmsScans',\n", - " 'mzRange',\n", - " 'parameters',\n", - " 'paths',\n", - " 'peptides',\n", - " 'proteinGroups',\n", - " 'register_file',\n", - " 'summary']" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mq_output.get_list_of_attributes()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# not able to delete yet. 
__getitem__ better alternative?\n", - "# lookup\n", - "# del mq_output.OxidationMSites" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'allPeptides': 'allPeptides.txt',\n", - " 'evidence': 'evidence.txt',\n", - " 'matchedFeatures': 'matchedFeatures.txt',\n", - " 'modificationSpecificPeptides': 'modificationSpecificPeptides.txt',\n", - " 'ms3Scans': 'ms3Scans.txt',\n", - " 'msms': 'msms.txt',\n", - " 'msmsScans': 'msmsScans.txt',\n", - " 'mzRange': 'mzRange.txt',\n", - " 'parameters': 'parameters.txt',\n", - " 'peptides': 'peptides.txt',\n", - " 'proteinGroups': 'proteinGroups.txt',\n", - " 'summary': 'summary.txt'}" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "{Path(x).stem: x for x in mq_output.files}" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
N-term cleavage windowC-term cleavage windowAmino acid beforeFirst amino acidSecond amino acidSecond last amino acidLast amino acidAmino acid afterA CountR Count...Potential contaminantidProtein group IDsMod. peptide IDsEvidence IDsMS/MS IDsBest MS/MSOxidation (M) site IDsTaxonomy IDsMS/MS Count
Sequence
AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPGAVGER______________________________LSGPAEVGPGAVGERTPRKKEPPRASPPGGKAAERT191...NaN017740000.0NaN96061
AAAAAAAAAVSRTTSSRVLRGGRDRGRAAAAAAAAAVSRRRKRGRAAAAAAAAAVSRRRKAEYPRRRRSSPSRAASRR91...NaN12311111.0NaN96061
AAAAAAAGDSDSWDADAFSVEDPVRK______________________________SWDADAFSVEDPVRKVGGGGTAGGDRWEGEMAARKV91...NaN218772222.0NaN96061
AAAAAAALQAKTILRQARNHKLRVDKAAAAAAALQAKSDEKRVDKAAAAAAALQAKSDEKAAVAGKKPVVGKAAAKS80...NaN32461333;44.0NaN96062
AAAAAAGAASGLPGPVAQGLK______________________________GAASGLPGPVAQGLKEALVDTLTGILSPVQMAALKE90...NaN4416244;55;66.0NaN96062
..................................................................
YYTSASGDEMVSLKHEDSQNRKKLSELLRYYTSASGDEMVSLKDRYYTSASGDEMVSLKDYCTRMKENQKHIYYRYYLKD10...NaN38,783207740966;4096749202;49203;49204;49205;49206;49207;49208;4920954670;54671;54672;54673;54674;54675;54676;5467...54,679.01311960610
YYTVFDRDNNRPSGPLWILGDVFIGRYYTVFDRDNNRVGFAFIGRYYTVFDRDNNRVGFAEAARL______RYYNRV02...NaN38,78437940968492105468054,680.0NaN96061
YYVLNALKGQPVKVRVSYQKLLKYYVLNALKHRPPKAQSYQKLLKYYVLNALKHRPPKAQKKRYLFRSKYYLKH10...NaN38,785352140969492115468154,681.0NaN96061
YYVTIIDAPGHRGITIDISLWKFETSKYYVTIIDAPGHRDFITSKYYVTIIDAPGHRDFIKNMITGTSQADCKYYHRD11...NaN38,78628734097049212;49213;4921454682;54683;54684;54685;5468654,683.0NaN96065
YYYIPQYKREVKEHVGTDQFGNKYYYIPQYKNWRGQTITDQFGNKYYYIPQYKNWRGQTIREKRIVEAKYYYKN00...NaN38,787374540971492155468754,687.0NaN96061
\n", - "

38788 rows × 56 columns

\n", - "
" - ], - "text/plain": [ - " N-term cleavage window \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... ______________________________ \n", - "AAAAAAAAAVSR TTSSRVLRGGRDRGRAAAAAAAAAVSRRRK \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK ______________________________ \n", - "AAAAAAALQAK TILRQARNHKLRVDKAAAAAAALQAKSDEK \n", - "AAAAAAGAASGLPGPVAQGLK ______________________________ \n", - "... ... \n", - "YYTSASGDEMVSLK HEDSQNRKKLSELLRYYTSASGDEMVSLKD \n", - "YYTVFDRDNNR PSGPLWILGDVFIGRYYTVFDRDNNRVGFA \n", - "YYVLNALK GQPVKVRVSYQKLLKYYVLNALKHRPPKAQ \n", - "YYVTIIDAPGHR GITIDISLWKFETSKYYVTIIDAPGHRDFI \n", - "YYYIPQYK REVKEHVGTDQFGNKYYYIPQYKNWRGQTI \n", - "\n", - " C-term cleavage window \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... LSGPAEVGPGAVGERTPRKKEPPRASPPGG \n", - "AAAAAAAAAVSR RGRAAAAAAAAAVSRRRKAEYPRRRRSSPS \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK SWDADAFSVEDPVRKVGGGGTAGGDRWEGE \n", - "AAAAAAALQAK RVDKAAAAAAALQAKSDEKAAVAGKKPVVG \n", - "AAAAAAGAASGLPGPVAQGLK GAASGLPGPVAQGLKEALVDTLTGILSPVQ \n", - "... ... \n", - "YYTSASGDEMVSLK RYYTSASGDEMVSLKDYCTRMKENQKHIYY \n", - "YYTVFDRDNNR FIGRYYTVFDRDNNRVGFAEAARL______ \n", - "YYVLNALK SYQKLLKYYVLNALKHRPPKAQKKRYLFRS \n", - "YYVTIIDAPGHR TSKYYVTIIDAPGHRDFIKNMITGTSQADC \n", - "YYYIPQYK TDQFGNKYYYIPQYKNWRGQTIREKRIVEA \n", - "\n", - " Amino acid before \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... K \n", - "AAAAAAAAAVSR R \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK M \n", - "AAAAAAALQAK K \n", - "AAAAAAGAASGLPGPVAQGLK M \n", - "... ... \n", - "YYTSASGDEMVSLK R \n", - "YYTVFDRDNNR R \n", - "YYVLNALK K \n", - "YYVTIIDAPGHR K \n", - "YYYIPQYK K \n", - "\n", - " First amino acid \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... A \n", - "AAAAAAAAAVSR A \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK A \n", - "AAAAAAALQAK A \n", - "AAAAAAGAASGLPGPVAQGLK A \n", - "... ... 
\n", - "YYTSASGDEMVSLK Y \n", - "YYTVFDRDNNR Y \n", - "YYVLNALK Y \n", - "YYVTIIDAPGHR Y \n", - "YYYIPQYK Y \n", - "\n", - " Second amino acid \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... A \n", - "AAAAAAAAAVSR A \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK A \n", - "AAAAAAALQAK A \n", - "AAAAAAGAASGLPGPVAQGLK A \n", - "... ... \n", - "YYTSASGDEMVSLK Y \n", - "YYTVFDRDNNR Y \n", - "YYVLNALK Y \n", - "YYVTIIDAPGHR Y \n", - "YYYIPQYK Y \n", - "\n", - " Second last amino acid \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... E \n", - "AAAAAAAAAVSR S \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK R \n", - "AAAAAAALQAK A \n", - "AAAAAAGAASGLPGPVAQGLK L \n", - "... ... \n", - "YYTSASGDEMVSLK L \n", - "YYTVFDRDNNR N \n", - "YYVLNALK L \n", - "YYVTIIDAPGHR H \n", - "YYYIPQYK Y \n", - "\n", - " Last amino acid \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... R \n", - "AAAAAAAAAVSR R \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK K \n", - "AAAAAAALQAK K \n", - "AAAAAAGAASGLPGPVAQGLK K \n", - "... ... \n", - "YYTSASGDEMVSLK K \n", - "YYTVFDRDNNR R \n", - "YYVLNALK K \n", - "YYVTIIDAPGHR R \n", - "YYYIPQYK K \n", - "\n", - " Amino acid after A Count \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... T 19 \n", - "AAAAAAAAAVSR R 9 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK V 9 \n", - "AAAAAAALQAK S 8 \n", - "AAAAAAGAASGLPGPVAQGLK E 9 \n", - "... ... ... \n", - "YYTSASGDEMVSLK D 1 \n", - "YYTVFDRDNNR V 0 \n", - "YYVLNALK H 1 \n", - "YYVTIIDAPGHR D 1 \n", - "YYYIPQYK N 0 \n", - "\n", - " R Count ... \\\n", - "Sequence ... \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 1 ... \n", - "AAAAAAAAAVSR 1 ... \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 1 ... \n", - "AAAAAAALQAK 0 ... \n", - "AAAAAAGAASGLPGPVAQGLK 0 ... \n", - "... ... ... \n", - "YYTSASGDEMVSLK 0 ... \n", - "YYTVFDRDNNR 2 ... \n", - "YYVLNALK 0 ... \n", - "YYVTIIDAPGHR 1 ... \n", - "YYYIPQYK 0 ... 
\n", - "\n", - " Potential contaminant \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... NaN \n", - "AAAAAAAAAVSR NaN \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK NaN \n", - "AAAAAAALQAK NaN \n", - "AAAAAAGAASGLPGPVAQGLK NaN \n", - "... ... \n", - "YYTSASGDEMVSLK NaN \n", - "YYTVFDRDNNR NaN \n", - "YYVLNALK NaN \n", - "YYVTIIDAPGHR NaN \n", - "YYYIPQYK NaN \n", - "\n", - " id Protein group IDs \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 0 1774 \n", - "AAAAAAAAAVSR 1 231 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 2 1877 \n", - "AAAAAAALQAK 3 2461 \n", - "AAAAAAGAASGLPGPVAQGLK 4 4162 \n", - "... ... ... \n", - "YYTSASGDEMVSLK 38,783 2077 \n", - "YYTVFDRDNNR 38,784 379 \n", - "YYVLNALK 38,785 3521 \n", - "YYVTIIDAPGHR 38,786 2873 \n", - "YYYIPQYK 38,787 3745 \n", - "\n", - " Mod. peptide IDs \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 0 \n", - "AAAAAAAAAVSR 1 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 2 \n", - "AAAAAAALQAK 3 \n", - "AAAAAAGAASGLPGPVAQGLK 4 \n", - "... ... \n", - "YYTSASGDEMVSLK 40966;40967 \n", - "YYTVFDRDNNR 40968 \n", - "YYVLNALK 40969 \n", - "YYVTIIDAPGHR 40970 \n", - "YYYIPQYK 40971 \n", - "\n", - " Evidence IDs \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 0 \n", - "AAAAAAAAAVSR 1 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 2 \n", - "AAAAAAALQAK 3 \n", - "AAAAAAGAASGLPGPVAQGLK 4;5 \n", - "... ... \n", - "YYTSASGDEMVSLK 49202;49203;49204;49205;49206;49207;49208;49209 \n", - "YYTVFDRDNNR 49210 \n", - "YYVLNALK 49211 \n", - "YYVTIIDAPGHR 49212;49213;49214 \n", - "YYYIPQYK 49215 \n", - "\n", - " MS/MS IDs \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 0 \n", - "AAAAAAAAAVSR 1 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 2 \n", - "AAAAAAALQAK 3;4 \n", - "AAAAAAGAASGLPGPVAQGLK 5;6 \n", - "... ... \n", - "YYTSASGDEMVSLK 54670;54671;54672;54673;54674;54675;54676;5467... 
\n", - "YYTVFDRDNNR 54680 \n", - "YYVLNALK 54681 \n", - "YYVTIIDAPGHR 54682;54683;54684;54685;54686 \n", - "YYYIPQYK 54687 \n", - "\n", - " Best MS/MS \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 0.0 \n", - "AAAAAAAAAVSR 1.0 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 2.0 \n", - "AAAAAAALQAK 4.0 \n", - "AAAAAAGAASGLPGPVAQGLK 6.0 \n", - "... ... \n", - "YYTSASGDEMVSLK 54,679.0 \n", - "YYTVFDRDNNR 54,680.0 \n", - "YYVLNALK 54,681.0 \n", - "YYVTIIDAPGHR 54,683.0 \n", - "YYYIPQYK 54,687.0 \n", - "\n", - " Oxidation (M) site IDs \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... NaN \n", - "AAAAAAAAAVSR NaN \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK NaN \n", - "AAAAAAALQAK NaN \n", - "AAAAAAGAASGLPGPVAQGLK NaN \n", - "... ... \n", - "YYTSASGDEMVSLK 1311 \n", - "YYTVFDRDNNR NaN \n", - "YYVLNALK NaN \n", - "YYVTIIDAPGHR NaN \n", - "YYYIPQYK NaN \n", - "\n", - " Taxonomy IDs MS/MS Count \n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 9606 1 \n", - "AAAAAAAAAVSR 9606 1 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 9606 1 \n", - "AAAAAAALQAK 9606 2 \n", - "AAAAAAGAASGLPGPVAQGLK 9606 2 \n", - "... ... ... \n", - "YYTSASGDEMVSLK 9606 10 \n", - "YYTVFDRDNNR 9606 1 \n", - "YYVLNALK 9606 1 \n", - "YYVTIIDAPGHR 9606 5 \n", - "YYYIPQYK 9606 1 \n", - "\n", - "[38788 rows x 56 columns]" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# mq_output.evidence(mq_output)\n", - "mq_output.peptides" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Dynamic Attribute lookup\n", - "\n", - "try to use `__getattr__`, maybe `__setattr__`?\n", - "\n", - "This version offers less inspection possibilities as the attributes are only set when they are looked up." 
- ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "MaxQuantOutputDynamic(WindowsPath('data/mq_out/20190611_QX3_LiSc_MA_Hela_500ng_LC15'))" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mq_output = mq.MaxQuantOutputDynamic(w_folder.value)\n", - "mq_output" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['allPeptides',\n", - " 'evidence',\n", - " 'matchedFeatures',\n", - " 'modificationSpecificPeptides',\n", - " 'ms3Scans',\n", - " 'msms',\n", - " 'msmsScans',\n", - " 'mzRange',\n", - " 'parameters',\n", - " 'peptides',\n", - " 'proteinGroups',\n", - " 'summary']" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mq_output.file_keys" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
N-term cleavage windowC-term cleavage windowAmino acid beforeFirst amino acidSecond amino acidSecond last amino acidLast amino acidAmino acid afterA CountR Count...Potential contaminantidProtein group IDsMod. peptide IDsEvidence IDsMS/MS IDsBest MS/MSOxidation (M) site IDsTaxonomy IDsMS/MS Count
Sequence
AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPGAVGER______________________________LSGPAEVGPGAVGERTPRKKEPPRASPPGGKAAERT191...NaN017740000.0NaN96061
AAAAAAAAAVSRTTSSRVLRGGRDRGRAAAAAAAAAVSRRRKRGRAAAAAAAAAVSRRRKAEYPRRRRSSPSRAASRR91...NaN12311111.0NaN96061
AAAAAAAGDSDSWDADAFSVEDPVRK______________________________SWDADAFSVEDPVRKVGGGGTAGGDRWEGEMAARKV91...NaN218772222.0NaN96061
AAAAAAALQAKTILRQARNHKLRVDKAAAAAAALQAKSDEKRVDKAAAAAAALQAKSDEKAAVAGKKPVVGKAAAKS80...NaN32461333;44.0NaN96062
AAAAAAGAASGLPGPVAQGLK______________________________GAASGLPGPVAQGLKEALVDTLTGILSPVQMAALKE90...NaN4416244;55;66.0NaN96062
..................................................................
YYTSASGDEMVSLKHEDSQNRKKLSELLRYYTSASGDEMVSLKDRYYTSASGDEMVSLKDYCTRMKENQKHIYYRYYLKD10...NaN38,783207740966;4096749202;49203;49204;49205;49206;49207;49208;4920954670;54671;54672;54673;54674;54675;54676;5467...54,679.01311960610
YYTVFDRDNNRPSGPLWILGDVFIGRYYTVFDRDNNRVGFAFIGRYYTVFDRDNNRVGFAEAARL______RYYNRV02...NaN38,78437940968492105468054,680.0NaN96061
YYVLNALKGQPVKVRVSYQKLLKYYVLNALKHRPPKAQSYQKLLKYYVLNALKHRPPKAQKKRYLFRSKYYLKH10...NaN38,785352140969492115468154,681.0NaN96061
YYVTIIDAPGHRGITIDISLWKFETSKYYVTIIDAPGHRDFITSKYYVTIIDAPGHRDFIKNMITGTSQADCKYYHRD11...NaN38,78628734097049212;49213;4921454682;54683;54684;54685;5468654,683.0NaN96065
YYYIPQYKREVKEHVGTDQFGNKYYYIPQYKNWRGQTITDQFGNKYYYIPQYKNWRGQTIREKRIVEAKYYYKN00...NaN38,787374540971492155468754,687.0NaN96061
\n", - "

38788 rows × 56 columns

\n", - "
" - ], - "text/plain": [ - " N-term cleavage window \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... ______________________________ \n", - "AAAAAAAAAVSR TTSSRVLRGGRDRGRAAAAAAAAAVSRRRK \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK ______________________________ \n", - "AAAAAAALQAK TILRQARNHKLRVDKAAAAAAALQAKSDEK \n", - "AAAAAAGAASGLPGPVAQGLK ______________________________ \n", - "... ... \n", - "YYTSASGDEMVSLK HEDSQNRKKLSELLRYYTSASGDEMVSLKD \n", - "YYTVFDRDNNR PSGPLWILGDVFIGRYYTVFDRDNNRVGFA \n", - "YYVLNALK GQPVKVRVSYQKLLKYYVLNALKHRPPKAQ \n", - "YYVTIIDAPGHR GITIDISLWKFETSKYYVTIIDAPGHRDFI \n", - "YYYIPQYK REVKEHVGTDQFGNKYYYIPQYKNWRGQTI \n", - "\n", - " C-term cleavage window \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... LSGPAEVGPGAVGERTPRKKEPPRASPPGG \n", - "AAAAAAAAAVSR RGRAAAAAAAAAVSRRRKAEYPRRRRSSPS \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK SWDADAFSVEDPVRKVGGGGTAGGDRWEGE \n", - "AAAAAAALQAK RVDKAAAAAAALQAKSDEKAAVAGKKPVVG \n", - "AAAAAAGAASGLPGPVAQGLK GAASGLPGPVAQGLKEALVDTLTGILSPVQ \n", - "... ... \n", - "YYTSASGDEMVSLK RYYTSASGDEMVSLKDYCTRMKENQKHIYY \n", - "YYTVFDRDNNR FIGRYYTVFDRDNNRVGFAEAARL______ \n", - "YYVLNALK SYQKLLKYYVLNALKHRPPKAQKKRYLFRS \n", - "YYVTIIDAPGHR TSKYYVTIIDAPGHRDFIKNMITGTSQADC \n", - "YYYIPQYK TDQFGNKYYYIPQYKNWRGQTIREKRIVEA \n", - "\n", - " Amino acid before \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... K \n", - "AAAAAAAAAVSR R \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK M \n", - "AAAAAAALQAK K \n", - "AAAAAAGAASGLPGPVAQGLK M \n", - "... ... \n", - "YYTSASGDEMVSLK R \n", - "YYTVFDRDNNR R \n", - "YYVLNALK K \n", - "YYVTIIDAPGHR K \n", - "YYYIPQYK K \n", - "\n", - " First amino acid \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... A \n", - "AAAAAAAAAVSR A \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK A \n", - "AAAAAAALQAK A \n", - "AAAAAAGAASGLPGPVAQGLK A \n", - "... ... 
\n", - "YYTSASGDEMVSLK Y \n", - "YYTVFDRDNNR Y \n", - "YYVLNALK Y \n", - "YYVTIIDAPGHR Y \n", - "YYYIPQYK Y \n", - "\n", - " Second amino acid \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... A \n", - "AAAAAAAAAVSR A \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK A \n", - "AAAAAAALQAK A \n", - "AAAAAAGAASGLPGPVAQGLK A \n", - "... ... \n", - "YYTSASGDEMVSLK Y \n", - "YYTVFDRDNNR Y \n", - "YYVLNALK Y \n", - "YYVTIIDAPGHR Y \n", - "YYYIPQYK Y \n", - "\n", - " Second last amino acid \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... E \n", - "AAAAAAAAAVSR S \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK R \n", - "AAAAAAALQAK A \n", - "AAAAAAGAASGLPGPVAQGLK L \n", - "... ... \n", - "YYTSASGDEMVSLK L \n", - "YYTVFDRDNNR N \n", - "YYVLNALK L \n", - "YYVTIIDAPGHR H \n", - "YYYIPQYK Y \n", - "\n", - " Last amino acid \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... R \n", - "AAAAAAAAAVSR R \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK K \n", - "AAAAAAALQAK K \n", - "AAAAAAGAASGLPGPVAQGLK K \n", - "... ... \n", - "YYTSASGDEMVSLK K \n", - "YYTVFDRDNNR R \n", - "YYVLNALK K \n", - "YYVTIIDAPGHR R \n", - "YYYIPQYK K \n", - "\n", - " Amino acid after A Count \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... T 19 \n", - "AAAAAAAAAVSR R 9 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK V 9 \n", - "AAAAAAALQAK S 8 \n", - "AAAAAAGAASGLPGPVAQGLK E 9 \n", - "... ... ... \n", - "YYTSASGDEMVSLK D 1 \n", - "YYTVFDRDNNR V 0 \n", - "YYVLNALK H 1 \n", - "YYVTIIDAPGHR D 1 \n", - "YYYIPQYK N 0 \n", - "\n", - " R Count ... \\\n", - "Sequence ... \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 1 ... \n", - "AAAAAAAAAVSR 1 ... \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 1 ... \n", - "AAAAAAALQAK 0 ... \n", - "AAAAAAGAASGLPGPVAQGLK 0 ... \n", - "... ... ... \n", - "YYTSASGDEMVSLK 0 ... \n", - "YYTVFDRDNNR 2 ... \n", - "YYVLNALK 0 ... \n", - "YYVTIIDAPGHR 1 ... \n", - "YYYIPQYK 0 ... 
\n", - "\n", - " Potential contaminant \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... NaN \n", - "AAAAAAAAAVSR NaN \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK NaN \n", - "AAAAAAALQAK NaN \n", - "AAAAAAGAASGLPGPVAQGLK NaN \n", - "... ... \n", - "YYTSASGDEMVSLK NaN \n", - "YYTVFDRDNNR NaN \n", - "YYVLNALK NaN \n", - "YYVTIIDAPGHR NaN \n", - "YYYIPQYK NaN \n", - "\n", - " id Protein group IDs \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 0 1774 \n", - "AAAAAAAAAVSR 1 231 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 2 1877 \n", - "AAAAAAALQAK 3 2461 \n", - "AAAAAAGAASGLPGPVAQGLK 4 4162 \n", - "... ... ... \n", - "YYTSASGDEMVSLK 38,783 2077 \n", - "YYTVFDRDNNR 38,784 379 \n", - "YYVLNALK 38,785 3521 \n", - "YYVTIIDAPGHR 38,786 2873 \n", - "YYYIPQYK 38,787 3745 \n", - "\n", - " Mod. peptide IDs \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 0 \n", - "AAAAAAAAAVSR 1 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 2 \n", - "AAAAAAALQAK 3 \n", - "AAAAAAGAASGLPGPVAQGLK 4 \n", - "... ... \n", - "YYTSASGDEMVSLK 40966;40967 \n", - "YYTVFDRDNNR 40968 \n", - "YYVLNALK 40969 \n", - "YYVTIIDAPGHR 40970 \n", - "YYYIPQYK 40971 \n", - "\n", - " Evidence IDs \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 0 \n", - "AAAAAAAAAVSR 1 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 2 \n", - "AAAAAAALQAK 3 \n", - "AAAAAAGAASGLPGPVAQGLK 4;5 \n", - "... ... \n", - "YYTSASGDEMVSLK 49202;49203;49204;49205;49206;49207;49208;49209 \n", - "YYTVFDRDNNR 49210 \n", - "YYVLNALK 49211 \n", - "YYVTIIDAPGHR 49212;49213;49214 \n", - "YYYIPQYK 49215 \n", - "\n", - " MS/MS IDs \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 0 \n", - "AAAAAAAAAVSR 1 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 2 \n", - "AAAAAAALQAK 3;4 \n", - "AAAAAAGAASGLPGPVAQGLK 5;6 \n", - "... ... \n", - "YYTSASGDEMVSLK 54670;54671;54672;54673;54674;54675;54676;5467... 
\n", - "YYTVFDRDNNR 54680 \n", - "YYVLNALK 54681 \n", - "YYVTIIDAPGHR 54682;54683;54684;54685;54686 \n", - "YYYIPQYK 54687 \n", - "\n", - " Best MS/MS \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 0.0 \n", - "AAAAAAAAAVSR 1.0 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 2.0 \n", - "AAAAAAALQAK 4.0 \n", - "AAAAAAGAASGLPGPVAQGLK 6.0 \n", - "... ... \n", - "YYTSASGDEMVSLK 54,679.0 \n", - "YYTVFDRDNNR 54,680.0 \n", - "YYVLNALK 54,681.0 \n", - "YYVTIIDAPGHR 54,683.0 \n", - "YYYIPQYK 54,687.0 \n", - "\n", - " Oxidation (M) site IDs \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... NaN \n", - "AAAAAAAAAVSR NaN \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK NaN \n", - "AAAAAAALQAK NaN \n", - "AAAAAAGAASGLPGPVAQGLK NaN \n", - "... ... \n", - "YYTSASGDEMVSLK 1311 \n", - "YYTVFDRDNNR NaN \n", - "YYVLNALK NaN \n", - "YYVTIIDAPGHR NaN \n", - "YYYIPQYK NaN \n", - "\n", - " Taxonomy IDs MS/MS Count \n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 9606 1 \n", - "AAAAAAAAAVSR 9606 1 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 9606 1 \n", - "AAAAAAALQAK 9606 2 \n", - "AAAAAAGAASGLPGPVAQGLK 9606 2 \n", - "... ... ... 
\n", - "YYTSASGDEMVSLK 9606 10 \n", - "YYTVFDRDNNR 9606 1 \n", - "YYVLNALK 9606 1 \n", - "YYVTIIDAPGHR 9606 5 \n", - "YYYIPQYK 9606 1 \n", - "\n", - "[38788 rows x 56 columns]" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mq_output.peptides" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "No such file: peptides_.txt: Choose one of the following:\n", - "allPeptides, evidence, matchedFeatures, modificationSpecificPeptides, ms3Scans, msms, msmsScans, mzRange, parameters, peptides, proteinGroups, summary\n" - ] - } - ], - "source": [ - "try:\n", - " mq_output.peptides_\n", - "except AttributeError as e:\n", - " print(*e.args)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['NAME_FILE_MAP',\n", - " 'OxidationSites',\n", - " '_inital_attritubutes',\n", - " '_peptides',\n", - " 'allPeptides',\n", - " 'dump_intensity',\n", - " 'evidence',\n", - " 'file_keys',\n", - " 'files',\n", - " 'find_attribute',\n", - " 'folder',\n", - " 'get_files',\n", - " 'get_list_of_attributes',\n", - " 'load',\n", - " 'matchedFeatures',\n", - " 'modificationSpecificPeptides',\n", - " 'ms3Scans',\n", - " 'msms',\n", - " 'msmsScans',\n", - " 'mzRange',\n", - " 'name_file_map',\n", - " 'parameters',\n", - " 'paths',\n", - " 'peptides',\n", - " 'proteinGroups',\n", - " 'register_file',\n", - " 'summary']" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mq_output.get_list_of_attributes()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Files" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### evidence.txt\n", - "\n", - "> some columns throw a warning" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - 
"metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\users\\kzl465\\documents\\repos\\vaep\\vaep\\io\\mq.py:87: DtypeWarning: Columns (50,53,58) have mixed types.Specify dtype option on import or set low_memory=False.\n", - " return cls.find_attribute(f'_{filename}')\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
LengthModificationsModified sequenceOxidation (M) ProbabilitiesOxidation (M) Score DiffsAcetyl (Protein N-term)Oxidation (M)Missed cleavagesProteinsLeading proteinsLeading razor proteinGene namesProtein namesTypeRaw fileMS/MS m/zChargem/zMassUncalibrated - Calibrated m/z [ppm]Uncalibrated - Calibrated m/z [Da]Mass error [ppm]Mass error [Da]Uncalibrated mass error [ppm]Uncalibrated mass error [Da]Max intensity m/z 0Retention timeRetention lengthCalibrated retention timeCalibrated retention time startCalibrated retention time finishRetention time calibrationMatch time differenceMatch m/z differenceMatch q-valueMatch scoreNumber of data pointsNumber of scansNumber of isotopic peaksPIFFraction of total spectrumBase peak fractionPEPMS/MS countMS/MS scan numberScoreDelta scoreCombinatoricsIntensityReversePotential contaminantidProtein group IDsPeptide IDMod. peptide IDMS/MS IDsBest MS/MSOxidation (M) site IDsTaxonomy IDs
Sequence
AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPGAVGER52Unmodified_AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVG...NaNNaN000R4GMQ1;O60341;O60341-2R4GMQ1R4GMQ1KDM1ALysine-specific histone demethylase 1AMULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC151,101.841,101.34,401.12.40.00.20.02.60.01,101.885.60.385.685.585.80.0nannannannan84.022.05.00000.01106,834102.791.3157,739,000.0NaNNaN017740000NaN9606
AAAAAAAAAVSR12Unmodified_AAAAAAAAAVSR_NaNNaN000A0A0A6YYC7;Q96JP5-2;Q96JP5A0A0A6YYC7A0A0A6YYC7ZFP91-CNTF;ZFP91E3 ubiquitin-protein ligase ZFP91MULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15500.82500.8999.51.40.00.50.02.00.0500.825.60.225.625.425.60.0nannannannan37.016.03.00000.0130,18468.646.9187,575,000.0NaNNaN12311111NaN9606
AAAAAAAGDSDSWDADAFSVEDPVRK26Acetyl (Protein N-term)_(Acetyl (Protein N-term))AAAAAAAGDSDSWDADAFSV...NaNNaN101O75822;O75822-3;O75822-2O75822O75822EIF3JEukaryotic translation initiation factor 3 sub...MULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15879.43879.12,634.22.20.0-0.8-0.01.40.0879.495.00.695.094.795.30.0nannannannan157.047.05.00000.01118,493157.9144.31442,780,000.0NaNNaN218772222NaN9606
AAAAAAALQAK11Unmodified_AAAAAAALQAK_NaNNaN000P36578;H3BM89;H3BU31P36578P36578RPL460S ribosomal protein L4MULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15479.32478.8955.51.90.0-0.3-0.01.60.0478.826.70.626.726.527.10.0nannannannan163.046.05.00000.0231,655144.4106.813,166,700,000.0NaNNaN32461333;44NaN9606
AAAAAAGAASGLPGPVAQGLK21Acetyl (Protein N-term)_(Acetyl (Protein N-term))AAAAAAGAASGLPGPVAQGLK_NaNNaN100Q96P70Q96P70Q96P70IPO9Importin-9MULTI-MSMS20190611_QX3_LiSc_MA_Hela_500ng_LC15598.03597.71,790.02.40.0-0.7-0.01.80.0597.796.90.596.996.697.10.0nannannannan85.034.03.00000.01120,70646.636.8140,166,000.0NaNNaN441624455NaN9606
\n", - "
" - ], - "text/plain": [ - " Length \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 52 \n", - "AAAAAAAAAVSR 12 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 26 \n", - "AAAAAAALQAK 11 \n", - "AAAAAAGAASGLPGPVAQGLK 21 \n", - "\n", - " Modifications \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... Unmodified \n", - "AAAAAAAAAVSR Unmodified \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK Acetyl (Protein N-term) \n", - "AAAAAAALQAK Unmodified \n", - "AAAAAAGAASGLPGPVAQGLK Acetyl (Protein N-term) \n", - "\n", - " Modified sequence \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... _AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVG... \n", - "AAAAAAAAAVSR _AAAAAAAAAVSR_ \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK _(Acetyl (Protein N-term))AAAAAAAGDSDSWDADAFSV... \n", - "AAAAAAALQAK _AAAAAAALQAK_ \n", - "AAAAAAGAASGLPGPVAQGLK _(Acetyl (Protein N-term))AAAAAAGAASGLPGPVAQGLK_ \n", - "\n", - " Oxidation (M) Probabilities \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... NaN \n", - "AAAAAAAAAVSR NaN \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK NaN \n", - "AAAAAAALQAK NaN \n", - "AAAAAAGAASGLPGPVAQGLK NaN \n", - "\n", - " Oxidation (M) Score Diffs \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... NaN \n", - "AAAAAAAAAVSR NaN \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK NaN \n", - "AAAAAAALQAK NaN \n", - "AAAAAAGAASGLPGPVAQGLK NaN \n", - "\n", - " Acetyl (Protein N-term) \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 0 \n", - "AAAAAAAAAVSR 0 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 1 \n", - "AAAAAAALQAK 0 \n", - "AAAAAAGAASGLPGPVAQGLK 1 \n", - "\n", - " Oxidation (M) \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 0 \n", - "AAAAAAAAAVSR 0 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 0 \n", - "AAAAAAALQAK 0 \n", - "AAAAAAGAASGLPGPVAQGLK 0 \n", - "\n", - " Missed cleavages \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 
0 \n", - "AAAAAAAAAVSR 0 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 1 \n", - "AAAAAAALQAK 0 \n", - "AAAAAAGAASGLPGPVAQGLK 0 \n", - "\n", - " Proteins \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... R4GMQ1;O60341;O60341-2 \n", - "AAAAAAAAAVSR A0A0A6YYC7;Q96JP5-2;Q96JP5 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK O75822;O75822-3;O75822-2 \n", - "AAAAAAALQAK P36578;H3BM89;H3BU31 \n", - "AAAAAAGAASGLPGPVAQGLK Q96P70 \n", - "\n", - " Leading proteins \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... R4GMQ1 \n", - "AAAAAAAAAVSR A0A0A6YYC7 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK O75822 \n", - "AAAAAAALQAK P36578 \n", - "AAAAAAGAASGLPGPVAQGLK Q96P70 \n", - "\n", - " Leading razor protein \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... R4GMQ1 \n", - "AAAAAAAAAVSR A0A0A6YYC7 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK O75822 \n", - "AAAAAAALQAK P36578 \n", - "AAAAAAGAASGLPGPVAQGLK Q96P70 \n", - "\n", - " Gene names \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... KDM1A \n", - "AAAAAAAAAVSR ZFP91-CNTF;ZFP91 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK EIF3J \n", - "AAAAAAALQAK RPL4 \n", - "AAAAAAGAASGLPGPVAQGLK IPO9 \n", - "\n", - " Protein names \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... Lysine-specific histone demethylase 1A \n", - "AAAAAAAAAVSR E3 ubiquitin-protein ligase ZFP91 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK Eukaryotic translation initiation factor 3 sub... \n", - "AAAAAAALQAK 60S ribosomal protein L4 \n", - "AAAAAAGAASGLPGPVAQGLK Importin-9 \n", - "\n", - " Type \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... MULTI-MSMS \n", - "AAAAAAAAAVSR MULTI-MSMS \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK MULTI-MSMS \n", - "AAAAAAALQAK MULTI-MSMS \n", - "AAAAAAGAASGLPGPVAQGLK MULTI-MSMS \n", - "\n", - " Raw file \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 
20190611_QX3_LiSc_MA_Hela_500ng_LC15 \n", - "AAAAAAAAAVSR 20190611_QX3_LiSc_MA_Hela_500ng_LC15 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 20190611_QX3_LiSc_MA_Hela_500ng_LC15 \n", - "AAAAAAALQAK 20190611_QX3_LiSc_MA_Hela_500ng_LC15 \n", - "AAAAAAGAASGLPGPVAQGLK 20190611_QX3_LiSc_MA_Hela_500ng_LC15 \n", - "\n", - " MS/MS m/z Charge m/z \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 1,101.8 4 1,101.3 \n", - "AAAAAAAAAVSR 500.8 2 500.8 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 879.4 3 879.1 \n", - "AAAAAAALQAK 479.3 2 478.8 \n", - "AAAAAAGAASGLPGPVAQGLK 598.0 3 597.7 \n", - "\n", - " Mass \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 4,401.1 \n", - "AAAAAAAAAVSR 999.5 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 2,634.2 \n", - "AAAAAAALQAK 955.5 \n", - "AAAAAAGAASGLPGPVAQGLK 1,790.0 \n", - "\n", - " Uncalibrated - Calibrated m/z [ppm] \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 2.4 \n", - "AAAAAAAAAVSR 1.4 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 2.2 \n", - "AAAAAAALQAK 1.9 \n", - "AAAAAAGAASGLPGPVAQGLK 2.4 \n", - "\n", - " Uncalibrated - Calibrated m/z [Da] \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 0.0 \n", - "AAAAAAAAAVSR 0.0 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 0.0 \n", - "AAAAAAALQAK 0.0 \n", - "AAAAAAGAASGLPGPVAQGLK 0.0 \n", - "\n", - " Mass error [ppm] \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 0.2 \n", - "AAAAAAAAAVSR 0.5 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK -0.8 \n", - "AAAAAAALQAK -0.3 \n", - "AAAAAAGAASGLPGPVAQGLK -0.7 \n", - "\n", - " Mass error [Da] \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 0.0 \n", - "AAAAAAAAAVSR 0.0 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK -0.0 \n", - "AAAAAAALQAK -0.0 \n", - "AAAAAAGAASGLPGPVAQGLK -0.0 \n", - "\n", - " Uncalibrated mass error [ppm] \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 
2.6 \n", - "AAAAAAAAAVSR 2.0 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 1.4 \n", - "AAAAAAALQAK 1.6 \n", - "AAAAAAGAASGLPGPVAQGLK 1.8 \n", - "\n", - " Uncalibrated mass error [Da] \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 0.0 \n", - "AAAAAAAAAVSR 0.0 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 0.0 \n", - "AAAAAAALQAK 0.0 \n", - "AAAAAAGAASGLPGPVAQGLK 0.0 \n", - "\n", - " Max intensity m/z 0 \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 1,101.8 \n", - "AAAAAAAAAVSR 500.8 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 879.4 \n", - "AAAAAAALQAK 478.8 \n", - "AAAAAAGAASGLPGPVAQGLK 597.7 \n", - "\n", - " Retention time \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 85.6 \n", - "AAAAAAAAAVSR 25.6 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 95.0 \n", - "AAAAAAALQAK 26.7 \n", - "AAAAAAGAASGLPGPVAQGLK 96.9 \n", - "\n", - " Retention length \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 0.3 \n", - "AAAAAAAAAVSR 0.2 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 0.6 \n", - "AAAAAAALQAK 0.6 \n", - "AAAAAAGAASGLPGPVAQGLK 0.5 \n", - "\n", - " Calibrated retention time \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 85.6 \n", - "AAAAAAAAAVSR 25.6 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 95.0 \n", - "AAAAAAALQAK 26.7 \n", - "AAAAAAGAASGLPGPVAQGLK 96.9 \n", - "\n", - " Calibrated retention time start \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 85.5 \n", - "AAAAAAAAAVSR 25.4 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 94.7 \n", - "AAAAAAALQAK 26.5 \n", - "AAAAAAGAASGLPGPVAQGLK 96.6 \n", - "\n", - " Calibrated retention time finish \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 85.8 \n", - "AAAAAAAAAVSR 25.6 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 95.3 \n", - "AAAAAAALQAK 27.1 \n", - "AAAAAAGAASGLPGPVAQGLK 97.1 \n", - "\n", - " Retention time calibration \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 
0.0 \n", - "AAAAAAAAAVSR 0.0 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 0.0 \n", - "AAAAAAALQAK 0.0 \n", - "AAAAAAGAASGLPGPVAQGLK 0.0 \n", - "\n", - " Match time difference \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... nan \n", - "AAAAAAAAAVSR nan \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK nan \n", - "AAAAAAALQAK nan \n", - "AAAAAAGAASGLPGPVAQGLK nan \n", - "\n", - " Match m/z difference \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... nan \n", - "AAAAAAAAAVSR nan \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK nan \n", - "AAAAAAALQAK nan \n", - "AAAAAAGAASGLPGPVAQGLK nan \n", - "\n", - " Match q-value \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... nan \n", - "AAAAAAAAAVSR nan \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK nan \n", - "AAAAAAALQAK nan \n", - "AAAAAAGAASGLPGPVAQGLK nan \n", - "\n", - " Match score \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... nan \n", - "AAAAAAAAAVSR nan \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK nan \n", - "AAAAAAALQAK nan \n", - "AAAAAAGAASGLPGPVAQGLK nan \n", - "\n", - " Number of data points \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 84.0 \n", - "AAAAAAAAAVSR 37.0 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 157.0 \n", - "AAAAAAALQAK 163.0 \n", - "AAAAAAGAASGLPGPVAQGLK 85.0 \n", - "\n", - " Number of scans \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 22.0 \n", - "AAAAAAAAAVSR 16.0 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 47.0 \n", - "AAAAAAALQAK 46.0 \n", - "AAAAAAGAASGLPGPVAQGLK 34.0 \n", - "\n", - " Number of isotopic peaks \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 5.0 \n", - "AAAAAAAAAVSR 3.0 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 5.0 \n", - "AAAAAAALQAK 5.0 \n", - "AAAAAAGAASGLPGPVAQGLK 3.0 \n", - "\n", - " PIF \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 
0 \n", - "AAAAAAAAAVSR 0 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 0 \n", - "AAAAAAALQAK 0 \n", - "AAAAAAGAASGLPGPVAQGLK 0 \n", - "\n", - " Fraction of total spectrum \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 0 \n", - "AAAAAAAAAVSR 0 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 0 \n", - "AAAAAAALQAK 0 \n", - "AAAAAAGAASGLPGPVAQGLK 0 \n", - "\n", - " Base peak fraction PEP \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 0 0.0 \n", - "AAAAAAAAAVSR 0 0.0 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 0 0.0 \n", - "AAAAAAALQAK 0 0.0 \n", - "AAAAAAGAASGLPGPVAQGLK 0 0.0 \n", - "\n", - " MS/MS count \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 1 \n", - "AAAAAAAAAVSR 1 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 1 \n", - "AAAAAAALQAK 2 \n", - "AAAAAAGAASGLPGPVAQGLK 1 \n", - "\n", - " MS/MS scan number Score \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 106,834 102.7 \n", - "AAAAAAAAAVSR 30,184 68.6 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 118,493 157.9 \n", - "AAAAAAALQAK 31,655 144.4 \n", - "AAAAAAGAASGLPGPVAQGLK 120,706 46.6 \n", - "\n", - " Delta score \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 91.3 \n", - "AAAAAAAAAVSR 46.9 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 144.3 \n", - "AAAAAAALQAK 106.8 \n", - "AAAAAAGAASGLPGPVAQGLK 36.8 \n", - "\n", - " Combinatorics \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 1 \n", - "AAAAAAAAAVSR 1 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 1 \n", - "AAAAAAALQAK 1 \n", - "AAAAAAGAASGLPGPVAQGLK 1 \n", - "\n", - " Intensity Reverse \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 
57,739,000.0 NaN \n", - "AAAAAAAAAVSR 87,575,000.0 NaN \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 442,780,000.0 NaN \n", - "AAAAAAALQAK 3,166,700,000.0 NaN \n", - "AAAAAAGAASGLPGPVAQGLK 40,166,000.0 NaN \n", - "\n", - " Potential contaminant id \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... NaN 0 \n", - "AAAAAAAAAVSR NaN 1 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK NaN 2 \n", - "AAAAAAALQAK NaN 3 \n", - "AAAAAAGAASGLPGPVAQGLK NaN 4 \n", - "\n", - " Protein group IDs \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 1774 \n", - "AAAAAAAAAVSR 231 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 1877 \n", - "AAAAAAALQAK 2461 \n", - "AAAAAAGAASGLPGPVAQGLK 4162 \n", - "\n", - " Peptide ID \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 0 \n", - "AAAAAAAAAVSR 1 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 2 \n", - "AAAAAAALQAK 3 \n", - "AAAAAAGAASGLPGPVAQGLK 4 \n", - "\n", - " Mod. peptide ID MS/MS IDs \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 0 0 \n", - "AAAAAAAAAVSR 1 1 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 2 2 \n", - "AAAAAAALQAK 3 3;4 \n", - "AAAAAAGAASGLPGPVAQGLK 4 5 \n", - "\n", - " Best MS/MS \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 0 \n", - "AAAAAAAAAVSR 1 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 2 \n", - "AAAAAAALQAK 4 \n", - "AAAAAAGAASGLPGPVAQGLK 5 \n", - "\n", - " Oxidation (M) site IDs \\\n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... NaN \n", - "AAAAAAAAAVSR NaN \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK NaN \n", - "AAAAAAALQAK NaN \n", - "AAAAAAGAASGLPGPVAQGLK NaN \n", - "\n", - " Taxonomy IDs \n", - "Sequence \n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPG... 
9606 \n", - "AAAAAAAAAVSR 9606 \n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 9606 \n", - "AAAAAAALQAK 9606 \n", - "AAAAAAGAASGLPGPVAQGLK 9606 " - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "pd.options.display.max_columns = len(mq_output.evidence.columns)\n", - "mq_output.evidence.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Sequence\n", - "AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGPGAVGER 0\n", - "AAAAAAAAAVSR 1\n", - "AAAAAAAGDSDSWDADAFSVEDPVRK 2\n", - "AAAAAAALQAK 3\n", - "AAAAAAGAASGLPGPVAQGLK 4\n", - " ... \n", - "YYVLNALK 38,785\n", - "YYVTIIDAPGHR 38,786\n", - "YYVTIIDAPGHR 38,786\n", - "YYVTIIDAPGHR 38,786\n", - "YYYIPQYK 38,787\n", - "Name: Peptide ID, Length: 49216, dtype: int64" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mixed_dtype_columns = mq_output.evidence.columns[[50, 53, 58]]\n", - "mq_output.evidence[mixed_dtype_columns][mixed_dtype_columns[1]]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "vaep", - "language": "python", - "name": "vaep" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13 (default, Mar 28 2022, 06:59:08) [MSC v.1916 64 bit (AMD64)]" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/project/misc_clustering_proteins.ipynb b/project/misc_clustering_proteins.ipynb deleted file mode 100644 index b7892c718..000000000 --- a/project/misc_clustering_proteins.ipynb +++ /dev/null @@ -1,1465 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": 
{ - "Collapsed": "false" - }, - "source": [ - "# Data views" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "#PCA-fct Annelaura\n", - "from sklearn.decomposition import PCA\n", - "def runPCA(data, components = 2):\n", - " result = {}\n", - " X = data.copy()\n", - " pca = PCA(n_components=components)\n", - " pca.fit(X)\n", - " X = pca.transform(X)\n", - " var_exp = pca.explained_variance_ratio_\n", - " args = {\"x_title\":\"PC1\"+\" ({0:.2f})\".format(var_exp[0]),\"y_title\":\"PC2\"+\" ({0:.2f})\".format(var_exp[1])}\n", - " if components == 2:\n", - " resultDf = pd.DataFrame(X, columns = [\"x\",\"y\"])\n", - " if components > 2:\n", - " args.update({\"z_title\":\"PC3\"+str(var_exp[2])})\n", - " resultDf = pd.DataFrame(X)\n", - " cols = []\n", - " if len(components)>3:\n", - " cols = resultDf.columns[4:]\n", - " resultDf.columns = [\"x\", \"y\", \"z\"] + cols\n", - " result['pca'] = resultDf\n", - " return resultDf, args" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "## Load Data \n", - "\n", - "Meta Data" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
indexDateMS_instrumentLCPIDColumnLengthshortdate
SampleID
40820180713_QE8_nLC5_ASD_QC_Hela1_proteinGroups.txt20180713QE8nLC5ASDNaN201807
31220180713_QE8_nLC5_ASD_QC_Hela2_proteinGroups.txt20180713QE8nLC5ASDNaN201807
28120180713_QE8_nLC5_ASD_QC_Hela2_20190226172112_...20180713QE8nLC5ASDNaN201807
8220190103_QE8_nLC0_LiNi_QC_MNT_15cm_Hela_02_pro...20190103QE8nLC0LiNi15.0201901
16120190103_QE8_nLC0_LiNi_QC_MNT_15cm_Hela_01_pro...20190103QE8nLC0LiNi15.0201901
\n", - "
" - ], - "text/plain": [ - " index Date \\\n", - "SampleID \n", - "408 20180713_QE8_nLC5_ASD_QC_Hela1_proteinGroups.txt 20180713 \n", - "312 20180713_QE8_nLC5_ASD_QC_Hela2_proteinGroups.txt 20180713 \n", - "281 20180713_QE8_nLC5_ASD_QC_Hela2_20190226172112_... 20180713 \n", - "82 20190103_QE8_nLC0_LiNi_QC_MNT_15cm_Hela_02_pro... 20190103 \n", - "161 20190103_QE8_nLC0_LiNi_QC_MNT_15cm_Hela_01_pro... 20190103 \n", - "\n", - " MS_instrument LC PID ColumnLength shortdate \n", - "SampleID \n", - "408 QE8 nLC5 ASD NaN 201807 \n", - "312 QE8 nLC5 ASD NaN 201807 \n", - "281 QE8 nLC5 ASD NaN 201807 \n", - "82 QE8 nLC0 LiNi 15.0 201901 \n", - "161 QE8 nLC0 LiNi 15.0 201901 " - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import os\n", - "import ipywidgets as w\n", - "import pandas as pd\n", - "from config import PROCESSED_DATA, PREFIX_META\n", - "meta_data = pd.read_pickle(os.path.join(PROCESSED_DATA, PREFIX_META + '.pkl' ))\n", - "meta_data.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "Select imputed protein data for protein groups present in at least the available percentage of the samples" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "46aeae4fd95846ea808833aff7f2d318", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Dropdown(options=('hela_imputed_proteins_50.pkl', 'hela_imputed_proteins_90.pkl'), value='hela_imputed_protein…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "w_data = w.Dropdown(options= [x for x in os.listdir(os.path.join(PROCESSED_DATA)) if '.pkl' in x and not 'meta' in x])\n", - "w_data" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "## PCA \n", - "- How good is the 
reconstruction using the first two principal components?\n", - " - it works relatively good on MNIST, also here?" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "### Explained Variance\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "data = pd.read_pickle(os.path.join(PROCESSED_DATA, w_data.value))" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "from sklearn.decomposition import PCA\n", - "pca = PCA()\n", - "pca = pca.fit(data)\n", - "CDF_var_explained = np.cumsum(pca.explained_variance_ratio_)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "import seaborn\n", - "import matplotlib as plt\n", - "g = seaborn.relplot(data=pd.Series(CDF_var_explained))\n", - "g.set_axis_labels(\"Principal Components\", \"cumulative share of explained variance\")\n", - "_ = g.fig.suptitle('Explained Variance by Principal Components', y=1.03)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "90 percent of Variance is explained by first 163 principal components\n" - ] - } - ], - "source": [ - "print(\"90 percent of Variance is explained by first {} principal components\".format(\n", - " np.argmax(CDF_var_explained >= 0.9)\n", - "))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "### First two Principal Components" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "n_components = 2\n", - "from sklearn.decomposition import PCA\n", - "pca = PCA(n_components=n_components)\n", - "data_2PC = pca.fit_transform(data)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Variance share of PC1: 0.414, PC2: 0.130\n" - ] - } - ], - "source": [ - "print(\"Variance share of PC1: {:.3f}, PC2: {:.3f}\".format(*pca.explained_variance_ratio_))" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
PC1PC2
SampleID
408-12.037074-27.105675
281-13.252902-31.794649
8234.371093-3.456893
16142.6033772.525639
25126.988896-5.811818
\n", - "
" - ], - "text/plain": [ - " PC1 PC2\n", - "SampleID \n", - "408 -12.037074 -27.105675\n", - "281 -13.252902 -31.794649\n", - "82 34.371093 -3.456893\n", - "161 42.603377 2.525639\n", - "251 26.988896 -5.811818" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data_2PC = pd.DataFrame(data_2PC, index=data.index, columns=[f'PC{i}' for i in range(1,n_components+1)])\n", - "data_2PC.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "### Plot Principal Components\n", - "- there must have been a change in HeLa batch in june 2019 \n", - "- there are also a few runs from before/after that month that falls in the wrong cluster" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "# PCA\n", - "# Original data\n", - "result, args = runPCA(data)\n", - "result.set_index(data.index, inplace = True)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "result = meta_data.join(result)\n", - "result['shortdate'] = result.shortdate.astype('category', copy=True)\n", - "\n", - "seaborn.relplot(x='x', y='y', hue='shortdate', data=result )" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "### Other Plots (work in progress)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/tzx804/env/fixjupyter/lib/python3.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", - " after removing the cwd from sys.path.\n" - ] - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# look at missing values\n", - "miss = x_50.T.isnull().sum().astype(float).tolist()\n", - "df = result_to_plot.loc[result_to_plot['variable']=='shortdate']\n", - "df['missingness']=[(mis/x_50.shape[1])*100 for mis in miss]\n", - "from ggplot import *\n", - "ggplot(aes(x = 'x', y='y', color='missingness'),data = df)+geom_point(size =75, alpha = 0.8) + scale_color_gradient(low = \"#00AFBB\", high = \"#FC4E07\")+theme_bw()+ggtitle('MISSINGNESS')" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Make 'before' and 'after' datasets\n", - "labels['datetime'] = pd.to_datetime(labels.Date, format = '%Y%m%d')\n", - "labels['time'] = 'after'\n", - "labels.loc[labels['datetime']<'2019-06-01','time']='before'\n", - "# check number of nans\n", - "nonan = x.isnull().sum(axis = 1)\n", - "nonnan = pd.DataFrame(nonan)\n", - "nonnan.columns = ['nans']\n", - "nonnan['time']=labels['time']\n", - "#from ggplot import *\n", - "ggplot(aes(x='nans', color = 'time'), data = nonnan) + geom_density() #+ ylim(0,0.025)\n", - "\n", - "## There are fewer identified proteins (more missingness) in the \"old\" HeLa" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(177, 25427)\n", - "(175, 2069)\n", - "(296, 25427)\n", - "(223, 2375)\n", - "overlapping: 1451\n" - ] - } - ], - "source": [ - "# datasets with 50% coverage for both 'before' and 'after' \n", - "\n", - "x_50_before = coverage(x.loc[labels['datetime']<'2019-06-01',:], 0.5,0.5)\n", - "print(x.loc[labels['datetime']<'2019-05-01',:].shape)\n", - "print(x_50_before.shape)\n", - "x_50_after = coverage(x.loc[labels['datetime']>='2019-06-01',:], 0.5,0.5)\n", - "print(x.loc[labels['datetime']>='2019-05-01',:].shape)\n", - "print(x_50_after.shape)\n", - "\n", - "# Overlapping proteins\n", - "print('overlapping:',len(list(set(x_50_before.columns.tolist()) & set(x_50_after.columns.tolist()))))" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/tzx804/env/fixjupyter/lib/python3.7/site-packages/ipykernel_launcher.py:4: RuntimeWarning: divide by zero 
encountered in log\n", - " after removing the cwd from sys.path.\n" - ] - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAArgAAAIhCAYAAAClqcmkAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nOzdeVxU9f4/8NcwwwybxCaikqjXIlyQcgE1RUKtxBbFBfdKzd17vWGLtnozc6lMzTTNXSvFbuK+FKgprimau2aQC4osErLMDHN+f9yv/KIBAZk5n5kzr+fj4aMHhzOH17whfTl+zmdUkiRJICIiIiJSCCfRAYiIiIiILIkFl4iIiIgUhQWXiIiIiBSFBZeIiIiIFIUFl4iIiIgUhQWXiIiIiBSFBZeIiIiIFIUFl8hBNGzYELt37y73c7m5uRg9ejQCAgLg5uaGFi1aYNmyZWbnffvttwgPD4e7uzv8/f0RHh6OBQsWoLzttJs1awYPDw94eHhArVbDxcWl9OOPPvoIy5cvh1qthoeHBzw9PdGyZUts3rzZ7Dr5+fnw8PDAs88+W+5z8vf3x927d0uPLVmyBJ07dy79eOPGjQgLC4Onpyf8/Pzw1FNP4cqVKwCA999/H87OzqW5PDw8MHPmzNLHbt68GW3btoW7uzt8fX0xcOBAXL16tfTzlT2H33//HSqVCo8//niZ3Ldv34ZWq0XDhg3LPBdXV9cyWcaNG2f2dTw8PNCoUSO8/PLLuHDhgtlM7klOToaTk1PpYwIDA9G3b18cOXLE7FxJktC4cWM0bdq0zPHKvod//VoqlQozZsyoMA8RkZxYcIkcnF6vR5cuXZCWloaUlBTcuXMHs2bNwptvvolPP/209LxPPvkE//znPzFp0iRkZGTg5s2bWLhwIfbv3w+9Xm923dOnTyM/Px/5+fno2LEj5s+fX/rx5MmTAQDt2rVDfn4+cnNzMWbMGMTFxSE3N7fMdTZs2ACdToddu3YhIyPD7OuUlJTg888/L/e5Xbp0CUOGDMEnn3yCO3fu4MqVKxg7dizUanXpOf369SvNlZ+fj9dffx0AkJCQgAEDBuBf//oXbt++jdOnT0On0+HJJ59ETk5O6eOr8hwKCgrw66+/ln68du1aNGrUyCzvpk2bymSZP3++2de5c+cOdu/eDVdXV7Rq1arMdf+uXr16yM/Px59//omDBw/iscceQ8eOHfHjjz+WOW/v3r24desWfvvttzIFuCrfQwBYsWIFfHx8sHLlygqzEBHJiQWXyMGtWrUK6enpWL9+PRo1agRnZ2c888wzmDt3Lt59913k5eXhzp07ePfdd7FgwQL07t0btWrVKn1lcs2aNdDpdDXK4OTkhMGDB+Pu3bu4ePFimc+tWLECo0aNQmhoKFavXm322EmTJmH27NlmpRIATpw4gUaNGiE6OhoqlQq1atVCbGwsGjRocN88kiThtddew9tvv40BAwbA1dUVAQEBWLJkCTw8PPDZZ59V6zkMHjwYK1asKP145cqVGDJkyH0zVEStVuMf//gHFixYgMjISLz//vuVPkalUiEwMBBTp07F8OHD8cYbb5T5/IoVK/DCCy+ge/fuZXJWxd27d5GQkIAvvvgCFy9exNGjR6v1eCIia2DBJXJwu3btwrPPPgt3d/cyx2NjY1FUVISUlBSkpKSguLgYL7zwglUylJSUYNmyZXB2dkZQUFDp8bS0NCQnJ2PgwIEYOHBgua8Qtm7dGp07d8bs2bPNPvfEE0/g3LlzmDhxIpKSkpCfn1+lPOfPn0d6ejr69OlT5riTkxNiY2Oxa9euKj8HABg0aBC+/fZblJSU4MyZM8jPz0d4eHiVstxPr169sG/fvmo/5pdffil
d1lFQUICEhITSGX/77bflviJfke+//x4eHh7o06cPnn766WoXZCIia2DBJXJwt2/fRt26dc2OazQa+Pn54fbt27h9+zb8/Pyg0WhKP9++fXt4eXnB1dUVe/fufaCvffDgQXh5ecHFxQXx8fFYvXo1/P39Sz+/atUqhIaGomnTpoiLi8Pp06dx/Phxs+tMnToV8+bNQ2ZmZpnjjRs3RnJyMq5du4a+ffvCz88PL730Upmiu27dOnh5eZX+un79Om7fvg0A5c6lbt26pZ+vynMAgMDAQAQHB2P37t1YuXIlBg8eXO48XnzxxTJZFi9efN/51atXD9nZ2fc9p7zHSJJU+or3999/D51Oh27duiEmJgYGgwFbtmyp8vVWrFiBfv36Qa1WY8CAAfj2229hMBiqlYmIyNJYcIkcnJ+fH27cuGF23Gg0lhZbX19f3L59G0ajsfTzBw4cQG5uLnx9fWEymR7oa0dERCA3Nxc5OTl4/vnnzV6NXLlyJQYOHAgAqF+/PiIjI8t9hbB58+bo0aMHPv7443K/xrp165CZmYl9+/Zh7969mDZtWunn+/bti9zc3NJf9erVg5+fHwCUO5cbN26Ufr4qz+GeIUOGYPny5fjmm28qLLg//PBDmSwjRowo97x7rl27Bh8fn/ueU95jVCoVvLy8APyvoPbt2xcajQYuLi6IjY2t8quwf/zxB5KSkkq/Ry+88AKKioqqVZCJiKyBBZfIwXXp0gXbtm0rsxMB8P9v7oqIiEC7du2g0+mwceNGq2Tw8PDAl19+iVWrVpW+QnvgwAFcvHgR06dPR0BAAAICAnDo0CGsXbu2TNG+54MPPsDixYtx7dq1Cr9OmzZt0KtXr/vemAUAwcHBCAwMxPr168scN5lM2LBhA6Kjo6v0HP4qNjYWW7ZsQePGjStdA1xV//3vf9GxY8dqP+aJJ56Au7s7rl69ip9++gmrV68unXFCQgK2bt1a5lXqiqxatQomkwnPPfccAgIC0LhxYxQVFXGZAhEJx4JL5EAMBgOKiopKfxmNRgwePBiBgYHo06cPfv/9dxgMBuzYsQMTJkzA+++/j4ceegheXl547733MGbMGCQkJODPP/+EyWTCiRMnzIrxg/Lx8cHw4cMxdepUAP97ZbFr1644c+YMTpw4gRMnTuDXX39FYWEhtm3bZvb4Jk2aoF+/fpg7d27psZ9//hmLFy/GrVu3AADnzp1DYmIiIiIi7ptFpVJh9uzZ+PDDD7F27VoUFRUhIyMDw4cPR15eHiZOnFil5/BX7u7u+Omnn7BkyZIqz6Q8JSUluHLlCsaPH4/k5GS89957lT5GkiRcu3YNH3zwAZYsWVK6xdeqVavw6KOP4vz586UzvnDhAgIDA/HNN99Uet0VK1bgvffeK33siRMnsGHDBmzduhVZWVk1ep5ERDUiEZFDCAoKkgCU+TVlyhRJkiQpKytLevXVVyV/f3/JxcVFatq0qbR48WKza6xevVpq06aN5OrqKvn5+Ult27aVFi1aJBUXF9/3a0dGRppdb9myZVKHDh3KHPvjjz8krVYrpaamSl5eXlJiYqLZtUaPHi3FxsaWPqddu3aVfi49PV3S6XRSZGSkJEmSdOrUKalHjx6Sv7+/5O7uLgUFBUmvv/66pNfrJUmSpPfee08aOHBghbl/+OEHqXXr1pKbm5vk7e0txcXFSenp6VV+DleuXJEASAaDwezau3btkoKCgko/DgoKklxcXCR3d/fSXy+++GLp13FycpLc3d0lNzc3qUGDBtKQIUOkM2fOVJg9KSlJUqlUpY+pW7euFBsbK6WkpJSeExwcLM2dO9fssTNmzJBatWpV5tjfv4cpKSmSTqeTbt26Zfb4pk2bSvPmzaswGxGRtakkqZwd2omIiIiI7BSXKBARERGRorDgEhEREZGisOASERERkaKw4BIRERGRorDgEhEREZGisOASERERkaKw4BIRERGRorDgEhEREZGisOASERERkaKw4BI
RERGRorDgEhEREZGisOASERERkaKw4BIRERGRorDgEhEREZGisOASERERkaKw4BIRERGRorDgEhEREZGisOASERERkaKw4BIRERGRorDgEhEREZGisOASERERkaKw4BIRERGRorDgEhEREZGisOASERERkaKw4BIRERGRorDgEhEREZGisOASERERkaKw4BIRERGRorDgEhEREZGisOASERERkaKw4BIRERGRorDgEhEREZGisOASERERkaKw4BIRERGRorDgEhEREZGisOASERERkaJoqnJSQUEBEhMTcfnyZbi5uSE6OhqhoaFm5xmNRmzbtg3nzp1DSUkJGjRogB49esDT09PiwYmIiIiIylOlV3C3bt0KtVqN+Ph49OrVC1u2bMGtW7fMzjt48CCuXr2K0aNH47XXXoOLiwu2bt1q8dBERERERBWptODq9XqcOXMGUVFR0Ol0CAoKQnBwMFJTU83Ozc3NxT/+8Q94eHjA2dkZzZs3R2ZmplWCExERERGVp9IlCllZWXBycoKfn1/psTp16iAtLc3s3Mcffxzbt29HXl4eXFxccPLkSTRp0qT083l5ecjPzy/zGL1eD3d395o8Bzg7O8NgMNToGnLTaDTw9vZGTk4OjEaj6DjVwnnLi/OWF+ctL85bXpy3vOxx3rVr1xYdwSIqLbh6vR46na7MMRcXFxQXF5ud6+vrC09PT3z66adQqVSoU6cOunfvXvr5Y8eOYc+ePWUeExkZiaioqAfNb/e8vb1FR3AonLe8OG95cd7y4rzlxXlTdVRacLVarVmZLS4uNiu9ALBlyxaUlJTg9ddfh1arxf79+7FmzRqMGDECANCqVSsEBweXeYxer6/xMgadTldu4bZl9vw3Us5bXpy3vDhveXHe8uK85WWP83aYV3B9fX1hMpmQlZUFX19fAEBGRka5A8jIyEB0dDTc3NwAAG3btkVSUhLu3r0Ld3d3eHp6mu2ocP369Rq/fK/RaOzunwDuMRqNdped85YX5y0vzltenLe8OG952fO87V2lN5lptVqEhIQgKSkJer0e6enpOH/+PFq2bGl2bv369ZGamoqioiKUlJTgyJEjqFWrVo3X2BIRERERVVWVtgmLiYmBwWDArFmzkJCQgJiYGPj7+yMtLQ3Tpk0rPa9bt27QaDSYO3cuZs6ciYsXL6Jfv35WC09ERERE9HdVeqMHNzc39O/f3+x4UFAQpkyZUua82NhYy6UjIiIiIqomvlUvERERESkKCy4RERERKQoLLhEREREpCgsuERERESkKCy4RERERKQoLLhEREREpCgsuERERESkKCy4RERERKQoLLhEREREpCgsuERERESkKCy4RERERKQoLLhERKY4kSaIjEJFAGtEBiIiIHpRer8exY8eQnJyM48eP49KlS8jJyYHBYICHhwcCAwPRvHlzREVFoUuXLnB3dxcdmYhkwIJLRER25/Lly1i9ejUSEhLQoEEDdO7cGWPHjkWTJk3g6+sLrVaLvLw8pKen4/jx41i/fj0mT56Mfv36YdSoUfD39xf9FIjIilhwiYjIbly8eBGffvopDhw4gLi4OGzevBlBQUHlnuvl5QUvLy+EhoZi6NChuHbtGhYtWoTo6GhMnDgRQ4cOhVqtlvkZEJEcuAaXiIhsXm5uLiZPnozY2Fg0a9YMBw4cwFtvvVVhuS1P/fr1MXXqVHz//fdITEzEkCFDkJ2dbcXURCQKCy4REdksSZKQkJCAqKgomEwm7NmzB+PGjavRWtpHHnkECQkJCAkJQffu3XH58mULJiYiW8AlCkREZJOys7MxadIkpKWlYenSpXj88cctdm2NRoO3334bTZo0QZ8+fbBq1So0a9bMYtcnIrH4Ci4REdmcpKQkdO3aFY0aNcKWLVssWm7/Ki4uDh988AEGDRrEV3KJFISv4BIRkc0wGo2YPn06EhMTMXfuXHTo0MHqX/O5555Dfn4+Bg4ciI0bN6JOnTpW/5pEZF0qSfB
u2FlZWXByqtkLyU5OTjCZTBZKJA+VSgWtVgu9Xm93G5Jz3vLivOXFecvrr/POzMzE8OHDodFosHjxYvj4+MiaZcaMGUhKSkJiYiK0Wm2F5yll3vaC85aXt7e36AgWIbzgXr9+vcbXcHV1RWFhoQXSyMfZ2Rm1a9dGZmYmDAaD6DjVwnnLi/OWF+ctr3vz/uWXXzBy5Ej07t0b8fHxQrbvMplMGDZsGOrWrYuPPvqowvOUMG97wnnLq169eqIjWATX4BIRkVDr1q3DSy+9hA8//BBvvPGGsL1pnZyc8PnnnyM5ORk7duwQkoGILINrcImISAhJkjBt2jSsX78eGzZswCOPPCI6Ejw9PfHZZ59h1KhRaN26NXx9fUVHIqIHwFdwiYhIdkVFRRg3bhz27NmDTZs22US5vSc8PByxsbF444037G7NJxH9DwsuERHJKjs7G/3794fRaMTGjRvh5+cnOpKZ+Ph4/Pbbb9i8ebPoKET0AFhwiYhINpcvX8Zzzz2Htm3b4ssvv4Srq6voSOVycXHBtGnTMHXqVBQUFIiOQ0TVxIJLRESyOHjwIHr16oWxY8firbfeqvEWkdbWrl07tG3bFnPnzhUdhYiqybZ/dyEiIkXYsGEDXn31VcybNw8DBgwQHafK3nnnHaxevRq//fab6ChEVA0suEREZDWSJOHTTz/FzJkzsX79enTq1El0pGoJCAjAyJEj8fHHH4uOQkTVwIJLRERWUVxcjAkTJuDHH3/Epk2bEBwcLDrSAxk2bBiOHj2KkydPio5CRFXEgktERBaXk5ODAQMGoLCwEAkJCfD39xcd6YG5ublhwoQJmDlzpugoRFRFLLhERGRRV65cwfPPP4+wsDB89dVXNrtTQnUMGDAAly5dwsGDB0VHIaIqYMElIiKLOXLkCHr27IkRI0bgnXfesfmdEqpKq9Xi3//+N2bPni06ChFVgTJ+5yEiIuF++OEHvPLKK/jss88wZMgQ0XEsrmfPnvjjjz9w5MgR0VGIqBIsuEREVCOSJOHzzz/HRx99hO+++w5RUVGiI1mFs7MzRo0ahXnz5omOQkSVYMElIqIHZjAYEB8fj23btiExMRFNmzYVHcmq4uLicPToUZw5c0Z0FCK6DxZcIiJ6IH/++SeGDh2KW7duYcOGDQgICBAdyepcXV0xYsQIzJgxQ3QUIroPFlwiIqq2jIwM9OrVC4GBgVi2bBnc3d1FR5LNyy+/jMTERNy8eVN0FCKqAAsuERFVy7lz5/D888/j+eefx4wZM6DRaERHkpWXlxfi4uKwYsUK0VGIqAIsuEREVGX79+9H37598eabb2L8+PFQqVSiIwkxbtw4rFy5Enq9XnQUIioHCy4REVXJ999/jzFjxuDLL79Er169RMcRqlmzZggODsbmzZtFRyGicrDgEhHRfUmShLlz52LGjBlYt24dOnToIDqSTRg2bBiWLl0qOgYRlYMFl4iIKmQymfD2229j06ZN2LhxI4KDg0VHshlPP/00bt26hVOnTomOQkR/U6U7AwoKCpCYmIjLly/Dzc0N0dHRCA0NNTtv9erVSEtLK/24pKQEfn5+GDNmjOUSExGRLPR6PSZOnIiMjAxs2LABnp6eoiPZFLVajbi4OHzzzTdo0aKF6DhE9BdVKrhbt26FWq1GfHw8MjIysHbtWgQEBMDf37/MeYMGDSrz8bJly9CoUSPLpSUiIlkUFhbi1VdfhUajwerVq+Hq6io6kk3q168funXrhnfeeYczIrIhlRZcvV6PM2fOYMyYMdDpdAgKCkJwcDBSU1PRtWvXCh+Xk5OD9PR0vPjii6XH8vLykJ+fb3b9mu6fqFar4ezsXKNryO3etjr2uL0O5y0vzltenDeQm5uLgQMHolGjRpgzZ45Vv4/2Pu+GDRvi8ccfx44dO9CnTx/BySpn7/O2N/Y4b6Wo9KclKysLTk5O8PPzKz1Wp06dMksRypOamooGDRrA29u79NixY8ewZ8+eMudFRkY
q9n3Lq+Kv8yHr47zlxXnLyxLzzsjIQGxsLKKjo/HJJ5/AyYm3alTk3rxHjx6NBQsWcDmelfH3E6qOKr2Cq9PpyhxzcXFBcXHxfR+XmpqKTp06lTnWqlUrsxsU9Ho9MjMzq5q3XDqdrtI8tkaj0cDb2xs5OTkwGo2i41QL5y0vzltejjzvmzdvomfPnujVqxdee+01ZGVlWTBl+ZQw73bt2mHMmDE4fPiwzS/LU8K87Yk9zrt27dqiI1hEpQVXq9WafXOKi4vNSu9fpaWlIT8/H02bNi1z3NPT0+wmhevXr8NgMFQnsxmNRlPja4hiNBrtLjvnLS/OW16OOu+MjAz06dMHffr0wYQJE2QrEkqYt5OTE3r27InVq1fjzTffFB3rvpQwb3tiz/O2d5X+25Ovry9MJlOZv8lnZGTct+GnpqYiJCTkviWYiIhsw40bN9C7d2/069cPEyZMEB3HLvXv3x/r169HSUmJ6ChEhCoUXK1Wi5CQECQlJUGv1yM9PR3nz59Hy5Ytyz3fYDDg9OnTCAsLs3hYIiKyrIyMDPTu3RsDBgzAuHHjRMexW8HBwfDz80NKSoroKESEKr7RQ0xMDAwGA2bNmoWEhATExMTA398faWlpmDZtWplzz507BxcXF5tfh0RE5Oiys7PRv39/xMXF8QYpC+jZsyd++OEH0TGICFXcB9fNzQ39+/c3Ox4UFIQpU6aUOdaiRQtueE1EZOPy8/MxePBgdOnSBePHjxcdRxFeeOEFdOnSBR9++CFcXFxExyFyaNz/hYjIwRQVFeGVV15Bs2bNMHnyZNFxFKNu3bpo2rQpkpKSREchcngsuEREDsRkMmHChAnw9vbG9OnToVKpREdSlJ49e+L7778XHYPI4bHgEhE5kOnTpyMzMxNz586FWq0WHUdxunfvjn379iEvL090FCKHxoJLROQg1qxZg23btuHrr7/mNo5W4uXlhQ4dOmDbtm2ioxA5NBZcIiIHsHfvXsyaNQsrV66Ej4+P6DiK9uKLL3I3BSLBWHCJiBTut99+w7hx47Bw4UI0btxYdBzF69KlC44fP47s7GzRUYgcFgsuEZGCFRQUYMSIEYiPj0dERIToOA7B1dUVHTt2xK5du0RHIXJYLLhERAolSRImTZqEFi1aYPDgwaLjOJQePXpg8+bNomMQOSwWXCIihfr6669x8eJFbgcmQHR0NA4fPow7d+6IjkLkkFhwiYgU6MiRI5g3bx6WLFkCV1dX0XEcjoeHB9q3b4/du3eLjkLkkFhwiYgUJi8vD+PHj8fMmTPRoEED0XEcVkxMDLZs2SI6BpFDYsElIlKYyZMno3Pnznj66adFR3FoXbt2xf79+5Gfny86CpHDYcElIlKQDRs24Ndff8V7770nOorDe+ihh9C2bVv8+OOPoqMQORwWXCIihfj999/x/vvvY/78+Vx3ayOeffZZ7NixQ3QMIofDgktEpAAmkwnjx4/HuHHj0Lx5c9Fx6P9ER0cjOTkZer1edBQih8KCS0SkAAsWLIAkSRgxYoToKPQXderUQePGjXHo0CHRUYgcCgsuEZGdS09Px/vvv485c+bAyYm/rduarl278l3NiGTG3wmJiOyYJEn497//jUmTJqFJkyai41A57hVcSZJERyFyGCy4RER27Ntvv8WdO3fw2muviY5CFQgJCYHJZML58+dFRyFyGCy4RER2KjMzE9OnT8dnn30GjUYjOg5VQKVSoVu3bti5c6foKEQOQyUJ/jeTrKysGq8Zc3JygslkslAieahUKmi1Wuj1erv7ZyvOW16ct7zsad5jx46Fn58fpk6dynnL6EF+vpOSkjB9+nThJddR5m0r7HHe3t7eoiNYhPC/8hcXF9f4Gq6urigsLLRAGvk4OzvDy8sLd+/ehcFgEB2nWjhveXHe8rKXeR8+fBjJyclITk6GwWDgvGX0ID/fTzzxBC5cuIC0tDT4+/tbOWHFHGXetsIe562UgsslCkREdsZoNGLy5Ml
499134eHhIToOVYFWq0VkZCTf1YxIJiy4RER2ZsWKFfD19cVzzz0nOgpVA9fhEsmHBZeIyI7cunULc+bMwbRp06BSqUTHoWro3LkzDhw4gKKiItFRiBSPBZeIyI7Mnj0bffv25Z63dsjHxwePPvooDh8+LDoKkeKx4BIR2Ylz585hx44dmDBhgugo9ICeeuopJCUliY5BpHgsuEREduLDDz/EhAkT8NBDD4mOQg8oKiqKBZdIBiy4RER2YM+ePfj9998xePBg0VGoBkJDQ5GdnY2rV6+KjkKkaCy4REQ2rqSkBP/5z38wZcoUaLVa0XGoBpycnBAZGYmffvpJdBQiRWPBJSKycevXr0etWrXwzDPPiI5CFhAVFYXk5GTRMYgUjQWXiMiGFRUVYfbs2Xj77be5LZhC3NsuTK/Xi45CpFgsuERENmz16tVo3rw5WrVqJToKWYiPjw+aNGnC7cKIrIgFl4jIRhUUFOCLL75AfHy86ChkYZ07d+ZuCkRWxIJLRGSjli1bhrZt26J58+aio5CFcR0ukXVpRAcgIiJzeXl5WLRoETZs2CA6CllBWFgYbt68iWvXrqF+/fqi4xApDl/BJSKyQUuWLEFUVBQeeeQR0VHICtRqNTp27Ih9+/aJjkKkSCy4REQ2Jjs7G0uXLsXEiRNFRyEr6tSpE/bu3Ss6BpEiseASEdmYRYsWoXv37mjYsKHoKGRFHTt2xM8//wyTySQ6CpHisOASEdmQ3NxcrF69GhMmTBAdhawsMDAQnp6eOHPmjOgoRIrDgktEZEOWLVuGbt26ITAwUHQUkkGnTp24DpfIClhwiYhsRH5+PpYuXYqxY8eKjkIy4TpcIutgwSUishGrVq3Ck08+iSZNmoiOQjJp3749jh07hqKiItFRiBSlSvvgFhQUIDExEZcvX4abmxuio6MRGhpa7rnXr1/H9u3bcePGDWi1WnTs2BEREREWDU1EpDSFhYX46quvsGbNGtFRSEaenp547LHHcPjwYXTq1El0HCLFqFLB3bp1K9RqNeLj45GRkYG1a9ciICAA/v7+Zc67e/cuVq9ejWeeeQZNmzZFSUkJ8vLyrBKciEhJvvvuO7Rs2RJNmzYVHYVkdm8/XBZcIsupdImCXq/HmTNnEBUVBZ1Oh6CgIAQHByM1NdXs3JSUFDRp0gShoaHQaDTQ6XSoXbu2VYITESmFXq/HF198wZ0THBRvNCOyvEpfweXXW1sAACAASURBVM3KyoKTkxP8/PxKj9WpUwdpaWlm5169ehV16tTBkiVLkJ2djcDAQHTv3h1eXl4A/vfWk/n5+WUeo9fr4e7uXqMnoVar4ezsXKNryE2j0ZT5rz3hvOXFectLxLwTEhLQpEkThIeHP9DjOW95WXre4eHh+P3335GXlwdfX1+LXLMinLe87HHeSlHpT4ter4dOpytzzMXFBcXFxWbn5uXl4caNGxgyZAj8/f2xa9cubNiwAcOGDQMAHDt2DHv27CnzmMjISERFRdXkOdg1b29v0REcCuctL867ciaTCQsXLsTcuXNr/C9enLe8LDnvyMhIpKamol+/fha7ptLw55uqo9KCq9VqzcpscXGxWekFAGdnZ4SEhKB+/foAgM6dO2PmzJkoKiqCi4sLWrVqheDg4DKP0ev1yMzMrMlzgE6nK7dw2zKNRgNvb2/k5OTAaDSKjlMtnLe8OG95yT3vXbt2Qa1WIzQ09IF/L+S85WWNebdr1w6bNm3CU089ZZHrVYTzlpc9zlspS0srLbi+vr4wmUzIysoq/aeTjIyMcgdQp06d+17L09MTnp6eZY5dv34dBoOhOpnNaDSaGl9DFKPRaHfZOW95cd7yknve8+fPx6hRoyzyBzfnLS9LzrtDhw5YsGAB9Ho9VCqVRa5ZHs5bXvY8b3tX6U1mWq0WISEhSEpKgl6vR3p6Os6fP4+WLVuanRsWFoZz587hxo0bKCkpwd69e9GgQQO4uLhYJTwRkT07ceIE0tPT0aN
HD9FRSLAmTZqgpKQEV65cER2FSBGq9EYPMTExMBgMmDVrFhISEhATEwN/f3+kpaVh2rRppec1btwY0dHRWLt2LWbNmoXs7GzExsZaLTwRkT1buHAhhg8fzptQCCqVCu3bt8eBAwdERyFShCrdkujm5ob+/fubHQ8KCsKUKVPKHGvTpg3atGljmXRERAqVnp6On3/+GbNnzxYdhWxEhw4dsGfPHgwaNEh0FCK7x7fqJSISYMmSJRgwYAA8PDxERyEb0aFDB6SkpECSJNFRiOweCy4RkcxycnKwYcMGvPLKK6KjkA15+OGH4eLigosXL4qOQmT3WHCJiGS2atUqdOvWDQEBAaKjkI3p0KED9u/fLzoGkd1jwSUiklFxcTGWL1+OkSNHio5CNog3mhFZBgsuEZGMNm/ejEcffRSPPfaY6Chkg9q3b4+UlBSYTCbRUYjsGgsuEZGMli1bxrW3VKG6devC29sbZ86cER2FyK6x4BIRyeT48eO4ffs2oqOjRUchG8ZlCkQ1x4JLRCSTpUuX4qWXXoJarRYdhWwYbzQjqjkWXCIiGdy6dQs//vgj4uLiREchG9e+fXscPnwYRqNRdBQiu8WCS0QkgzVr1qBHjx7w8vISHYVsnJ+fH+rVq4dff/1VdBQiu8WCS0RkZXq9HqtWrcLLL78sOgrZifbt23OZAlENsOASEVnZtm3b8I9//AMhISGio5Cd4I1mRDXDgktEZGVff/01twajaomIiMDRo0eh1+tFRyGySyy4RERWdPLkSWRkZKBr166io5Ad8fb2RsOGDZGamio6CpFdYsElIrKipUuXYujQodBoNKKjkJ3hdmFED44Fl4jISm7fvo2dO3eif//+oqOQHeKNZkQPjgWXiMhK1qxZg+7du8PHx0d0FLJD4eHhOHHiBIqKikRHIbI7LLhERFZgMBiwcuVKbg1GD6xWrVoIDg7G8ePHRUchsjssuEREVrB9+3YEBQWhWbNmoqOQHYuIiMDBgwdFxyCyOyy4RERWsGzZMm4NRjUWERGBlJQU0TGI7I5KkiRJZICsrCw4OdWsZzs5OcFkMlkokTxUKhW0Wi30ej0EfwuqjfOWF+ctL0vM+9SpU+jfvz9OnDghy+4Jjj5vuck57zt37qBFixa4dOkStFptja/HecvLHuft7e0tOoJFCN+3pri4uMbXcHV1RWFhoQXSyMfZ2RleXl64e/cuDAaD6DjVwnnLi/OWlyXm/eWXX2Lw4MEwGAyyPH9Hn7fc5Jy3VqtFw4YNcfDgQbRp06bG1+O85WWP81ZKweUSBSIiC8rOzsa2bdswcOBA0VFIIbgOl6j6WHCJiCzom2++wdNPPw1fX1/RUUgh2rVrx4JLVE0suEREFmI0GrF8+XLeXEYW1bZtWxw9ehRGo1F0FCK7wYJLRGQhO3fuRL169dCiRQvRUUhBfHx8EBgYiFOnTomOQmQ3WHCJiCxk6dKlfPWWrCIiIgKHDh0SHYPIbrDgEhFZwNmzZ3HlyhV0795ddBRSIO6HS1Q9LLhERBawbNkyDBo0CM7OzqKjkAJFRETgyJEjKCkpER2FyC6w4BIR1VBOTg42b96MQYMGiY5CClW7dm3Url0bZ8+eFR2FyC6w4BIR1dB3332H6Oho1K5dW3QUUrDw8HAuUyCqIhZcIqIaKCkpwfLlyzFs2DDRUUjh2rVrxxvNiKqIBZeIqAZ2794NPz8/hIWFiY5CCnfvHc1MJpPoKEQ2jwWXiKgGuDUYyaVu3bp46KGHcOHCBdFRiGweCy4R0QO6cOECLly4gB49eoiOQg4iPDycb9tLVAUsuERED2jp0qUYNGgQtFqt6CjkILgfLlHVsOASET2AO3fuIDExkVuDkazu3WgmSZLoKEQ2jQWXiOgBfPfdd4iKikKdOnVERyEH8vDDD0Or1eLy5cuioxDZNBZcIqJqurc1GG8uIxHu7aZARBVjwSUiqqYff/wRPj4+aNWqlego5IBYcIkqx4JLRFRNX3/
9NV+9JWHu3WjGdbhEFWPBJSKqhvPnz3NrMBKqUaNGkCQJ6enpoqMQ2SwWXCKiali6dCkGDx7MrcFIGJVKxWUKRJVgwSUiqqLc3Fxs2rQJgwcPFh2FHBz3wyW6P01VTiooKEBiYiIuX74MNzc3REdHIzQ01Oy8pKQk7Nu3D2q1uvTY6NGj4ePjY7nERESCfPPNN+jSpQtq164tOgo5uIiICCxYsEB0DCKbVaWCu3XrVqjVasTHxyMjIwNr165FQEAA/P39zc5t1qwZYmNjLR6UiEgko9GI5cuXY9GiRaKjEOGRRx7B3bt3ce3aNdSvX190HCKbU2nB1ev1OHPmDMaMGQOdToegoCAEBwcjNTUVXbt2rdYXy8vLQ35+vtn13d3dq5f6b9RqNZydnWt0DblpNJoy/7UnnLe8OG95VTTvnTt3IiAgAG3atBGQ6v6UOG9bZivzbteuHY4ePYqGDRtW+TGct7zscd5KUelPS1ZWFpycnODn51d6rE6dOkhLSyv3/AsXLuDjjz9GrVq10LZt2zJ/GBw7dgx79uwpc35kZCSioqIeNL/d8/b2Fh3BoXDe8lLSvFesWIH4+HibXp6gpHnbA9Hz7tatG44fP47Ro0cLzSEX0fMm+1KlV3B1Ol2ZYy4uLiguLjY7t1mzZmjVqhU8PDxw9epVrFu3Di4uLmjRogUAoFWrVggODja7fmZmZk2eA3Q6Xbl5bJlGo4G3tzdycnJgNBpFx6kWzltenLe8ypv36dOncf78eXTs2LHGv19Zg9LmbetsZd7NmzfHvHnzqvUzyXnLyx7nbct/ia+OSguuVqs1++YUFxeblV4AZdbkNmjQAOHh4Thz5kxpwfX09ISnp2eZx1y/fh0Gg+GBwt+j0WhqfA1RjEaj3WXnvOXFecurvHl/9dVXpTsn2PLzUcq87YXoeT/yyCPIzMzE9evXq1xKOG952fO87V2l24T5+vrCZDIhKyur9FhGRkaV/mdSqVR8pxUismvZ2dnYunUrBg0aJDoKURlqtRpt2rThfrhE5ai04Gq1WoSEhCApKQl6vR7p6ek4f/48WrZsaXbuuXPnUFhYCEmScPXqVRw6dAiPPfaYVYITEclhzZo1eOaZZ+Dr6ys6CpGZiIgIHDp0SHQMIptTpVsSY2JisHHjRsyaNQuurq6IiYmBv78/0tLSsHr1akyZMgUA8Ouvv2Ljxo0wGo3w9PREhw4dEBYWZtUnQERkLXq9HsuXL8fKlStFRyEqV0REBCZNmiQ6BpHNqVLBdXNzQ//+/c2OBwUFlZZbAOjdu7flkhERCbZx40Y88sgjaNasmegoROVq3rw5/vjjD+Tk5HCXAaK/4Fv1EhGVQ5IkLFq0CCNHjhQdhahCzs7OeOKJJ3DkyBHRUYhsCgsuEVE59u3bB5PJhM6dO4uOQnRfERERvNGM6G9YcImIyrFo0SK8+uqrUKlUoqMQ3RcLLpE5Flwior85d+4czpw5g549e4qOQlSpli1b4uLFi8jPzxcdhchmsOASEf3NV199haFDh5b7hjZEtsbFxQUtW7bkOlyiv2DBJSL6i5s3b2L79u0YMmSI6ChEVRYeHs5lCkR/wYJLRPQXS5YswQsvvAAfHx/RUYiqLDw8nG/4QPQXVdoHl4jIERQUFGD58uX44YcfREchqpbWrVvj9OnTKCwshKurq+g4RMLxFVwiov+zbt06hIeHo3HjxqKjEFWLm5sbHnvsMfzyyy+ioxDZBBZcIiIARqMRixYtwrhx40RHIXogERERXKZA9H9YcImIACQmJqJevXqIiIgQHYXogURERCAlJUV0DCKbwIJLRA7PZDJh/vz5GD9+vOgoRA+sTZs2SE1NhV6vFx2FSDgWXCJyeLt27YKzszMiIyNFRyF6YJ6enmjcuDFSU1NFRyESjgWXiByaJEmYN28exo8fz7flJbvH/XCJ/ocFl4gc2v79+5GXl4dnn31WdBSiGuONZkT/w4JLRA5t3rx5GDt2LNRqteg
oRDUWHh6Oo0ePwmg0io5CJBQLLhE5rF9++QVXrlxBr169REchsggfHx/Uq1cPp0+fFh2FSCgWXCJyWPPnz8fo0aPh7OwsOgqRxXAdLhELLhE5qLNnz+KXX35BXFyc6ChEFsV1uEQsuETkoD799FOMGjUKrq6uoqMQWVR4eDgOHToEk8kkOgqRMCpJkiSRAbKysuDkVLOe7eTkZHf/I6tUKmi1Wuj1egj+FlQb5y0vztvyTp06hb59++LYsWNwc3Mr8znOW16ct3W0bt0aK1euRNOmTcsc57zlZY/z9vb2Fh3BIjSiAxQXF9f4Gq6urigsLLRAGvk4OzvDy8sLd+/ehcFgEB2nWjhveXHeljd9+nSMGjUKKpXKbLact7w4b+sIDw9HcnIyGjVqVOY45y0ve5y3UgoulygQkUM5deoUjh8/jsGDB4uOQmQ1vNGMHB0LLhE5lE8++QRjx47l2ltStHbt2uHQoUN290/6RJbCgktEDiM1NRWnTp3CwIEDRUchsqrAwEBotVpcvnxZdBQiIVhwichhfPLJJxg3bhxcXFxERyGyunu7KRA5IhZcInIIR44cwdmzZ9G/f3/RUYhk0a5dO67DJYfFgktEiidJEj766CPEx8fz1VtyGPduNOM6XHJELLhEpHi7du3CnTt30Lt3b9FRiGTTuHFjGAwGXL16VXQUItmx4BKRopWUlGD69Ol46623oFarRcchko1KpUJERASXKZBDYsElIkVLSEiAt7c3unTpIjoKkexYcMlRseASkWIVFhZi9uzZmDx5MlQqleg4RLLjGz6Qo2LBJSLFWrFiBVq2bInWrVuLjkIkRHBwMHJzc5GRkSE6CpGsWHCJSJGys7OxYMECvPnmm6KjEAnj5OTE/XDJIbHgEpEizZ49Gy+88AKaNGkiOgqRUFymQI5IIzoAEZGlnT17Flu2bEFycrLoKETCtWvXDt99953oGESy4iu4RKQokiTh3XffxcSJE+Ht7S06DpFwTZs2xY0bN5CdnS06CpFsWHCJSFG2b9+O7OxsDBo0SHQUIpug0WjQunVrrsMlh8KCS0SKUVRUhKlTp+L999+HRsMVWET3hIeHIyUlRXQMItmw4BKRYixevBjNmjVDx44dRUchsincSYEcDV/iICJFuHr1KhYtWoQtW7aIjkJkc1q2bIkrV64gLy8Pzs7OouMQWR1fwSUiuydJEt5++20MHz4cQUFBouMQ2RytVouwsDBuF0YOgwWXiOze9u3b8fvvv2P06NGioxDZrIiICBw4cEB0DCJZVGmJQkFBARITE3H58mW4ubkhOjoaoaGhFZ5vNBqxcOFCFBcX47XXXrNYWCKiv8vPz8c777yD+fPnQ6fTiY5DZLMiIiIwY8YM0TGIZFGlgrt161ao1WrEx8cjIyMDa9euRUBAAPz9/cs9/8CBA3Bzc0NxcbFFwxIR/d3MmTPRqVMnREREiI5CZNMef/xxnD17FgUFBXBzcxMdh8iqKi24er0eZ86cwZgxY6DT6RAUFITg4GCkpqaia9euZufn5OTg5MmTePrpp5GYmFjmc3l5ecjPzze7vru7e42ehFqttrtF8/e2MLLHrYw4b3lx3hVLTU1FYmIi9u3bZ7EZcd7y4rzl4+zsjBYtWuDEiROIjIwUHafK7HXegH3+fCtFpT8tWVlZcHJygp+fX+mxOnXqIC0trdzzt27diujo6HJ/EI8dO4Y9e/aUORYZGYmoqKjq5lYMvtOSvDhveVlz3gaDAa+//jpmzZqF4OBgq30de8Kfb3nZ47yfeuopnDx5Er179xYdpdrscd4kTpVewf37ujYXF5dylx+cPXsWkiQhJCQEV65cMft8q1atzP4g0uv1yMzMrG7uMnQ6nd0th9BoNPD29kZOTg6MRqPoONXCecuL8y7fJ598Am9vbzz77LM1/j3krzhveXHe8mrVqhVmz56N8ePHi45SZfY8b3v8+a5du7boCBZRacHVarVm35zi4mKz0qvX67F
r1y4MHDiwwmt5enrC09OzzLHr16/DYDBUJ7MZjUZT42uIYjQa7S475y0vztvc6dOnsXjxYuzYscPif+Bx3vLivOXVunVrpKam4s8//4SLi4voONVij/O2559ve1dpwfX19YXJZEJWVhZ8fX0BABkZGWYNPysrC7m5uVi6dCkAoKSkBMXFxZg1axaGDx/Of1ogIovQ6/WYOHEi3n77bdSrV090HCK7UqtWLTz66KNITU1FeHi46DhEVlOlV3BDQkKQlJSE559/HhkZGTh//jyGDRtW5jx/f39MnDix9OM//vgDW7duxciRI2t8ExkR0T3z5s1DQEAA+vbtKzoKkV0KDw9HSkoKCy4pWpXe6CEmJgYGgwGzZs1CQkICYmJi4O/vj7S0NEybNg3A/+4UrFWrVukvV1dXqFQq1KpVC05OfD8JIqq5X3/9FStWrMCMGTOgUqlExyGySxERETh06JDoGERWVaU9N9zc3NC/f3+z40FBQZgyZUq5j2nUqBHf5IGILKawsBATJkzAO++8g7p164qOQ2S32rZti/Hjx8NgMHALK1IsvrRKRHbhP//5D4KDg+1yeyMiW+Ll5YWHH34Yp06dEh2FyGpYcInI5u3cuRM//fQTPv74Yy5NILKAdu3acZkCKRoLLhHZtBs3buD111/HvHnz8NBDD4mOQ6QI9240I1IqFlwislkmkwn//Oc/MXToULRp00Z0HCLFCA8Px5EjR1BSUiI6CpFVsOASkc1asGABjEYjJkyYIDoKkaLUrl0bderUwenTp0VHIbIKFlwiskkHDhzA119/jXnz5kGtVouOQ6Q47du3x4EDB0THILIKFlwisjkZGRkYN24cPv/8c9SvX190HCJF6tChA/bv3y86BpFVsOASkU0xGAwYNWoUhg4dik6dOomOQ6RY7dq1w5EjR2AwGERHIbI4FlwisinTpk2Dp6cnxo8fLzoKkaL5+Pjg4YcfxsmTJ0VHIbI4FlwishmbNm3Cjh078Pnnn/MtvolkwHW4pFT8E4SIbML58+cxefJkLFq0CN7e3qLjEDkErsMlpWLBJSLhsrOz8dJLL+G9995DaGio6DhEDiM8PBy//PILiouLRUchsigWXCISSq/X49VXX0WPHj3Qu3dv0XGIHMpDDz2EJk2a4Pjx46KjEFkUCy4RCSNJEt555x24u7vjzTffFB2HyCFxHS4pEQsuEQmzYsUKHDlyBPPnz+ebORAJ0qFDBxZcUhwWXCISYt++fZgzZw6WLVuGWrVqiY5D5LDatm2L1NRUFBYWio5CZDEsuEQku0uXLmHcuHFYsGABgoKCRMchcmju7u4ICQnB0aNHRUchshgWXCKSVWZmJgYPHoy33noL7du3Fx2HiMDtwkh5WHCJSDYFBQV46aWXEBsbi7i4ONFxiOj/8EYzUhoWXCKSRUlJCcaOHYsmTZrgtddeEx2HiP6idevWOHv2LPLz80VHIbIIjegAOp2uxm/J6eTkBFdXVwslkodKpUJBQQGcnZ2h0Qj/NlQL5y0vJcxbkiS88cYbKC4uxvz586HVakVHrJAS5m1POG95VTRvV1dXPP744zhx4gS6du0qIFnFlDhvsj7hPymWePcUV1dXu7v709nZGV5eXrh79y4MBoPoONXCectLCfNeuHAhfv75Z/z3v/9FSUmJTT8fJczbnnDe8rrfvCMiIpCcnIwnn3xS5lT3p9R52yqlvFU6lygQkVVt2rQJS5YswcqVK+Hp6Sk6DhFVgPvhkpKw4BKR1Rw6dAhTpkzB8uXLUb9+fdFxiOg+wsLCcOnSJdy5c0d0FKIaY8ElIqu4cOECXnnlFcydOxfNmzcXHYeIKqHT6fDEE0/g0KFDoqMQ1RgLLhFZ3M2bN/HMM89g8uTJ6Ny5s+g4RFRFHTp0wM8//yw6BlGNseASkUXl5+dj4MCBeOmllzBw4EDRcYioGp588km+4QMpAgsuEVmMwWDAyJEj0bJlS7zzzjui4xBRNYWGhuLGjRu
4efOm6ChENcKCS0QWIUkSJk2aBLVajRkzZkClUomORETVpFar0b59ey5TILvHgktEFjFz5kxcunQJCxcutLvN2Ino/+vYsSP27dsnOgZRjbDgElGNrVy5EomJiVixYgXc3NxExyGiGrhXcCVJEh2F6IGx4BJRjezYsQNz5szBmjVr4OvrKzoOEdVQo0aNoFarcenSJdFRiB4YCy4RPbCjR48iPj4eS5cuRcOGDUXHISILUKlU6NSpE5cpkF1jwSWiB3Lp0iUMHz4cc+bMQVhYmOg4RGRBXIdL9o4Fl4iq7datWxg8eDDefPNNREdHi45DRBbWoUMHHDx4EEajUXQUogfCgktE1ZKfn4/Bgwejb9++iIuLEx2HiKzAz88PgYGBOH78uOgoRA+EBZeIqsxoNGL06NEIDQ3Fv/71L9FxiMiKOnXqxP1wyW6x4BJRlUiShHfffRdGoxEfffQR38iBSOG4DpfsGXdjJ6IqWbJkCQ4ePIgffvgBzs7OouMQkZWFh4fj119/xd27d+Hu7i46DlG18BVcIqrUjh07sHDhQqxcuRKenp6i4xCRDFxdXdGyZUukpKSIjkJUbSy4RHRfJ0+eRHx8PL7++msEBgaKjkNEMuIyBbJXLLhEVKFr167h5ZdfxsyZM7nXLZED6tixI280I7vEgktE5crPz8fQoUMxYsQIPPvss6LjEJEAoaGhyMjIwM2bN0VHIaoWFlwiMlNSUoIxY8agVatWGDlypOg4RCSIWq1G+/bt+Sou2Z0q7aJQUFCAxMREXL58GW5uboiOjkZoaKjZeSkpKTh06BAKCgqg1WrRvHlzdO3aFWq12uLBich6Zs6ciYKCAnz44YfcDozIwT355JPYu3cvYmNjRUchqrIqFdytW7dCrVYjPj4eGRkZWLt2LQICAuDv71/mvODgYISFhcHV1RUFBQVYt24dDh06hPbt21slPBFZ3n//+18kJiZiy5Yt3A6MiNC5c2fMmTMHkiTxL7xkNyotuHq9HmfOnMGYMWOg0+kQFBSE4OBgpKamomvXrmXO9fHxKfOxSqVCdnZ26cd5eXnIz883u35N99dTq9V29wexRqMp8197wnnLS855nzhxAu+99x42bNiAOnXqPPB1OG95cd7ycrR5N2nSBO7u7rhw4QKaN29upWQVc7R5k2VU+tOSlZUFJycn+Pn5lR6rU6cO0tLSyj3/5MmT2Lx5M/R6Pdzc3PD000+Xfu7YsWPYs2dPmfMjIyMRFRX1oPntnre3t+gIDoXzrlhGRgaGDRuGxYsXIzIy0iLX5LzlxXnLy5HmHRMTg8OHDwv989qR5k01V6VXcHU6XZljLi4uKC4uLvf80NBQhIaGIisrC6mpqWVenW3VqhWCg4PNrp+Zmfkg2UvpdLoK89gqjUYDb29v5OTkwGg0io5TLZy3vOSYd3FxMXr27Im4uDg8+eSTNf5/kvOWF+ctL0ecd0REBBYuXIhXXnnFCqnuzxHnLVLt2rVFR7CISguuVqs1++YUFxebld6/8/X1Re3atbFlyxbExcUBADw9Pc3eBen69eswGAzVzV2GRqOp8TVEMRqNdped85aXtectSRLi4+Ph7++PCRMmWPRrcd7y4rzl5Ujzbtu2LUaOHImcnBx4eHhYIVnlHGneVHOVbhPm6+sLk8mErKys0mMZGRlVavgmkwk5OTk1S0hEVrVixQqcPHkSc+bMgZMTdw4kInPu7u4ICwvDgQMHREchqpJK/zTTarUICQlBUlIS9Ho90tPTcf78ebRs2dLs3GPHjpXeRHbr1i38/PPPaNSokeVTE5FFHDt2DJ9++ikWL15c45s9iUjZoqKikJycLDoGUZVU6ZbEmJgYbNy4EbNmzYKrqytiYmLg7++PtLQ0rF69GlOmTAEA/PHHH/jpp59KbzBr1qyZQ99ARmTLsrKyMGrUKMyePZt/ESWiSkVGRmL48OGiYxBVSZUKrpubG/r37292PCgoqLTcAsCLL75ouWREZDX
33qmsV69e6Natm+g4RGQHQkJCUFRUhCtXrvAvxWTzuOCOyAHNnDkTAPD6668LTkJE9kKlUqFz585cpkB2gQWXyMHs3LkT33//PRYsWMC30SaiaomMjGTBJbvAgkvkQK5cuYJJkyZh0aJF8PX1FR2HiOxMx44dcfDgQbvbtFdx+AAAIABJREFU25UcDwsukYMoKirCyJEjMXHiRDzxxBOi4xCRHfLx8cEjjzyCI0eOiI5CdF8suEQOYurUqWjUqBGGDh0qOgoR2TFuF0b2gAWXyAFs2bIFycnJmDVrFlQqleg4RGTHuA6X7AELLpHCpaen46233sKCBQvM3iqbiKi6wsLCcOPGDdy4cUN0FKIKseASKZher8eYMWMwbtw4hIWFiY5DRAqg0WgQGRmJn376SXQUogqx4BIp2IwZM+Dr64sRI0aIjkJECtKlSxf8+OOPomMQVYgFl0ihdu/ejcTERHz22Wdcd0tEFtW5c2fs378fRUVFoqMQlYsFl0iBrl+/jvj4eHzxxRfw8fERHYeIFMbHxwchISE4ePCg6ChE5WLBJVKYkpISjB8/Hq+88gratm0rOg4RKVR0dDR2794tOgZRuVhwiRTmyy+/hEqlwtixY0VHISIFu7cOV5Ik0VGIzLDgEinIqVOn8NVXX+Hzzz+HWq0WHYeIFOyxxx6D0WjExYsXRUchMsOCS6QQhYWFGDduHD744APUr19fdBwiUjiVSsXdFMhmseASKcS0adPQvHlz9OzZU3QUInIQLLhkq1hwiRQgKSkJO3fuxLRp00RHISIH0r59e5w6dQq5ubmioxCVwYJLZOeys7MRHx+Pzz77DF5eXqLjEJEDcXV1RUREBPbs2SM6ClEZLLhEdkySJEyaNAkvvvgiOnToIDoOETkgbhdGtogFl8iOfffdd0hLS8Prr78uOgoROajo6GgkJyejpKREdBSiUipJ8AZ2WVlZcHKqWc92cnKCyWSyUCJ5qFQqaLVa6PV6u9tDkPOWV0XzvnLlCrp164aNGzeiadOmApJVTInztmWct7w4b3MdO3bE7NmzER4ebvFrc97y8vb2Fh3BIjSiAxQXF9f4Gq6urigsLLRAGvk4OzvDy8sLd+/ehcFgEB2nWjhveZU3b6PRiFdffRXjx49Ho0aNbO77obR52zrOW16ct7mnnnoKW7ZsQWhoqMWvzXnLSykFl0sUiOzQvHnz4ObmhmHDhomOQkSEbt26YdeuXaJjEJUS/gouEVXP8ePHsXz5cmzfvr3Gy3uIiCwhLCwMubm5+O2339C4cWPRcYj4Ci6RPSkoKMD48ePx4Ycfom7duqLjEBEB+N9a027dumHnzp2ioxABYMElsisffPABWrVqheeee050FCKiMp555hls375ddAwiACy4RHZj586d2LNnDz788EPRUYiIzLRr1w4XLlxAZmam6ChELLhE9iAzMxNvvPEGPv/8c9SqVUt0HCIiMzqdDpGRkbzZjGwCCy6RjZMkCf+vvTuPi6rQ+wf+mRkYVtlRC30hmiJ5AxfwgrnAI+KC5aN5JcUF9dGUMs31CW6pFVqZWrbcXBNFKxFQFMhcgEJRr6i4UEp0Rcm1QcEBnAHm/P7ocX4RyDrMYWY+79eLV3nmnMOHj0f5ejhzzuLFizFhwoRWucckEZGu8DIFais44BK1cbGxsbhz5w4WLVokdhQionoFBQXh1KlTUCqVYkchE8cBl6gNKygoQExMDD777DPI5XKx4xAR1cvOzg6+vr7IyMgQOwqZOA64RG1UZWUlXn/9dSxbtgzPPPOM2HGIiBpl+PDhOHTokNgxyMRxwCVqoz7++GM4OjryaWVEZFBCQkJw7Ngxg3usLhkXDrhEbdC///1v7Nq1C+vWrYNEIhE7DhFRo3Xs2BEeHh7Izs4WOwqZMA64RG2MUqnE/Pnz8f7776N9+/ZixyEiarIRI0YgNTVV7BhkwjjgErUxb7/9NgICAjBixAixoxARNUtoaCi+++47VFd
Xix2FTBQHXKI2JCUlBadOncI777wjdhQiombz8PBA+/btcerUKbGjkInigEvURty6dQtRUVH49NNPYWNjI3YcIqIWGT16NA4ePCh2DDJRHHCJ2gCNRoMFCxYgIiICffv2FTsOEVGLjR49GqmpqbxMgUTBAZeoDdi8eTMePXqEefPmiR2FiEgnunbtCldXV5w+fVrsKGSCOOASiezy5cv47LPPsGHDBpiZmYkdh4hIZ3iZAomFAy6RiCoqKjBv3jy89dZbcHd3FzsOEZFO8TIFEgsHXCIRrV69Gt27d8c//vEPsaMQEelct27d4OzsjDNnzogdhUxMo34eWl5ejuTkZBQUFMDa2hpDhw6Ft7d3rfWOHz+O8+fPo6SkBNbW1vDz88Pzzz+v89BExiAjIwNpaWn4/vvv+bQyIjJajy9T+Pvf/y52FDIhjTqDm5qaCplMhsWLF2PcuHFISUnB3bt3a60nCALGjh2LZcuWYfLkyTh9+jQuXryo89BEhk6hUGDRokVYv349HB0dxY5DRNRqHl+moNFoxI5CJqTBM7hqtRp5eXmIjIyEhYUF3N3d4enpidzcXAwbNqzGugMHDtT+v4uLCzw9PXHjxg0899xzAIDS0lIolcpa+2/pPT9lMhnMzc1btA99e/xmIkN8UxH7bhlBELB06VKMGzcOQUFBDa7PvvWLfesX+9YvMfr28vKCo6Mjzp07B39//yZvz76pORo8WhQKBaRSKVxcXLTLOnTogMLCwnq3EwQB169fR79+/bTLcnJykJmZWWO9IUOGNOqbvLHi2Tv9agt9f/7557h79y727dsHCwsLseO0qrbQtylh3/rFvhsvPDwchw4dwgsvvNDsfbBvaopGncH96zdhS0tLqFSqerfLyMiAIAjo06ePdlm/fv3g6elZa//37t1rSuZaLCwsGszT1piZmcHR0RH3799HVVWV2HGahH0338WLF7F8+XKkpKSgtLS0Uduwb/1i3/rFvvVLrL5DQkIwatQoREVFNfmMJvvWL1dXV7Ej6ESDA65cLq/1m6NSqeo983Tq1Cnk5uZi+vTpNX6kYGdnBzs7uxrr3rx5E5WVlU3NXYOZmVmL9yGWqqoqg8vOvpunrKwMs2bNwsqVK9G5c+dG52Df+sW+9Yt965dYfbu5ucHd3R1Hjx7F0KFDm7UP9k1N0eCbzJydnaHRaKBQKLTLbt++/cQJ/+zZs8jKysLUqVNhb2+vu6REBi4qKgr9+/fH2LFjxY5CRKR348aNQ1JSktgxyEQ0OODK5XJ4eXkhPT0darUa169fx5UrV+Dj41Nr3QsXLuDo0aOYOnUqnJycWiUwkSGKj49Hbm4u3n33XbGjEBGJYvTo0Th69CjKysrEjkImoFFvSQwNDcX+/fuxZs0aWFlZITQ0FO3bt0dhYSHi4uIQHR0NADh27BgqKiqwadMm7bbe3t4tuqicyND98ssveOedd7Bnzx5YW1uLHYeISBQuLi7w9fXF999/z59kUatr1IBrbW2NiRMn1lru7u6uHW4BYMGCBbpLRmQEHj16hMjISCxZsgReXl5ixyEiEtXYsWORmJjIAZdaHR/VS9SKVq5ciS5dumDKlCliRyEiEt3w4cNx5syZGu/rIWoNHHCJWsnevXuRlZWFtWvX8lG8REQAbGxsMHToUBw4cEDsKGTkOOAStYK8vDysXLkSmzdvRrt27cSOQ0TUZjy+TIGoNXHAJdKxkpIS7f1ue/bsKXYcIqI2ZfDgwbh27VqDT0QlagkOuEQ6JAgC3njjDQQGBmLcuHFixyEianPMzc3x4osvIiEhQewoZMQ44BLp0BdffIG7d+/i7bffFjsKEVGbFRYWhj179kCj0YgdhYwUB1wiHcnKysKWLVuwcePGeh9lTURk6v72t7+hXbt2OHHihNhRyEhxwCXSgcLCQrz22mvYsGED3NzcxI5DRNSmSSQShIWF4dtvvxU7ChkpDrhELaRUKjFjxgzMnz8fgwYNEjsOEZF
BGDduHI4cOYLS0lKxo5AR4oBL1AIajQavv/46+vXrh4iICLHjEBEZDCcnJwwcOBDJycliRyEjxAGXqAU++ugj3L9/H++99x4f5kBE1ES8TIFaCwdcombav38/EhISsHnzZsjlcrHjEBEZnMDAQPz222/Iz88XOwoZGQ64RM1w9uxZvPXWW9i2bRtcXFzEjkNEZJDMzMwwfvx4nsUlneOAS9RE165dw8yZM7Fu3Tr06tVL7DhERAbt5ZdfRnx8PFQqldhRyIhwwCVqguLiYkyePBkLFy5EcHCw2HGIiAxe165d0bNnT6SlpYkdhYwIB1yiRqqoqMD06dMRGhqKKVOmiB2HiMhoTJ06FTt37hQ7BhkRDrhEjfD4dmBubm5YtmyZ2HGIiIxKSEgI/vOf/+DKlStiRyEjwQGXqAGCIGD58uUoLi7G+vXrIZXyjw0RkS6Zm5vj5ZdfRlxcnNhRyEjwOzVRA9auXYtTp05h27ZtsLCwEDsOEZFRCg8PR2JiIsrLy8WOQkaAAy5RPTZu3Ij9+/dj9+7dsLe3FzsOEZHRcnNzg5+fH/bv3y92FDICEkEQBDEDKBSKFv/IVyqVQqPR6CiRfkgkEsjlcqjVaoj8W9BkptJ3XFwc1qxZg5SUFHTq1KmVEz6ZqfTdVrBv/WLf+tXW+z58+DBWr16NY8eOaZexb/1ydHQUO4JOmIkdQBf3vbOyskJFRYUO0uiPubk5HBwcUFZWhsrKSrHjNIkp9H3w4EHExMQgPj4ezs7Oon69ptB3W8K+9Yt961db7zsgIAD379/Hjz/+CF9fXwDsW9+MZcDlJQpEf5GWlobo6Gjs3LkT3bp1EzsOEZHJkMlkmDFjBjZv3ix2FDJwHHCJ/iQ1NRVvvvkm4uLi+JQyIiIRvPzyy8jKykJRUZHYUciAccAl+j8HDx5EVFQU4uLi8Nxzz4kdh4jIJNna2uIf//gHvvrqK7GjkAHjgEsEIDk5Gf/85z+xa9cu/O1vfxM7DhGRSZs5cya++eYblJWViR2FDBQHXDJ5iYmJWL58OXbv3s3LEoiI2oDOnTtjwIAB2LNnj9hRyEBxwCWTtnXrVqxatQrffPMNnn32WbHjEBHR/5k1axa2bNlicLfZoraBAy6ZJEEQ8NFHH+Grr75CUlISPD09xY5ERER/4ufnB3t7exw6dEjsKGSAOOCSydFoNIiOjsbhw4exb98+dO7cWexIRET0FxKJBHPnzsWGDRsM7gEPJD4OuGRSHj16hFdffRVXr17F3r174eLiInYkIiJ6glGjRqGkpKTGk82IGoMDLpmMe/fuYfz48dBoNIiLi0O7du3EjkRERPWQyWR4/fXXsWrVKrGjkIHhgEsmIT8/HwEBARgwYAD+9a9/wdLSUuxIRETUCC+99BIKCgpw5swZsaOQAeGAS0bv+PHjGDNmDKKjoxEVFQWplIc9EZGhMDc3x9KlS/HJJ5+IHYUMCL/Tk1GLi4tDZGQkNm3ahOnTp4sdh4iImmHGjBk4f/488vLyxI5CBoIDLhkllUqFpUuXYuvWrUhMTMTAgQPFjkRERM1kaWmJV155BRs2bBA7ChkIDrhkdG7duoWXXnoJ9+/fx4EDB9CtWzexIxERUQtFRETg5MmTuHz5sthRyABwwCWjcurUKYwePRojRozApk2bYGtrK3YkIiLSAVtbW7z66qtYs2aN2FHIAHDAJaOg0WiwceNGzJ49G2vXrsVrr70GiUQidiwiItKhKVOm4NKlS8jJyRE7CrVxHHDJ4BUXF2PatGk4ePAgDh48iMDAQLEjERFRK7C0tMQbb7yBDz/8UOwo1MZxwCWDlp2djZCQEHh5eSExMZGP3SUiMnITJkxAUVERsrKyxI5CbZiZ2AGImqO6uhoff/wxdu3ahXXr1vGsLRGRiTA3N8eSJUsQExODlJQU3tuc6sSjggxOUVERwsLCcPr0aaSlpXG4JSIyMWPGjIFMJkNiYqLYUaiNatQZ3PLyciQnJ6OgoAD
W1tYYOnQovL29a633n//8B5mZmbh165b2OhkiXREEAXv27EFMTAzmzJmDV155BTKZTOxYRESkZxKJBMuXL8fcuXMRGhoKKysrsSNRG9OoM7ipqamQyWRYvHgxxo0bh5SUFNy9e7fWeubm5ujTpw+GDRum86Bk2u7du4cZM2Zgy5Yt+PbbbxEZGcnhlojIhPn5+cHPzw9ffvml2FGoDWpwwFWr1cjLy0NQUBAsLCzg7u4OT09P5Obm1lq3U6dO8PHxgaOjY6uEJdOUlpaGYcOGwdPTEykpKfDy8hI7EhERtQFRUVHYsmULbt26JXYUamMavERBoVBAKpXCxcVFu6xDhw4oLCxs8icrLS2FUqmssUytVsPGxqbJ+/ozmUwGc3PzFu1D38zMzGr815Doq2+FQoF//vOfOHv2LLZv3w4/P79m74t96xf71i/2rV/sW7/q67tr166IiIhATEwMNm7cqO9oDTLEvo1Fg3861Wo1LCwsaiyztLSESqVq8ifLyclBZmZmjWVDhgxBUFBQk/dlLHi2uzZBELB7924sWrQI4eHhiI2NbfE/gh5j3/rFvvWLfesX+9avJ/UdExODXr164fz587xEkrQaHHDlcnmtYValUtUaehujX79+8PT0rLFMrVbj3r17Td7Xn1lYWDRr4BaTmZkZHB0dcf/+fVRVVYkdp0las+8bN25g6dKluHXrFnbs2IE+ffqgvLwc5eXlLdov+9Yv9q1f7Fu/2Ld+NabvmJgYvPLKK8jMzISlpaWeEz6ZIfbt6uoqdgSdaHDAdXZ2hkajgUKhgLOzMwDg9u3bzSrAzs4OdnZ2NZbdvHkTlZWVTd7Xn5mZmbV4H2KpqqoyuOyt0Xd1dTViY2Oxbt06zJo1C5GRkTA3N9f552Hf+sW+9Yt96xf71q/6+g4MDETPnj3xySefYOHChXpO9mSG3Leha/BNZnK5HF5eXkhPT4darcb169dx5coV+Pj41FpXo9GgsrISGo0GAFBZWWlw/7ol/bt48SLGjBmDAwcOYN++fZg/fz6vWSIioiZZuXIltm3bhl9++UXsKNQGNOo2YaGhoaisrMSaNWuwd+9ehIaGon379igsLERMTIx2vce/3rVrF0pKShATE4OdO3e2WngybA8ePEB0dDSmTJmCyZMnIyEhAc8884zYsYiIyAC5ublh8eLFWLBgAaqrq8WOQyJr1FtAra2tMXHixFrL3d3dER0drf21h4cHVqxYobNwZJw0Gg3i4+OxevVqjBgxAunp6XyzBhERtdjUqVORmpqKjRs3IjIyUuw4JCLDu8cJGbTLly8jOjoaarUa27dvR+/evcWORERERkIqlWLt2rUYNWoUgoOD0aNHD7EjkUgadYkCUUspFApERUVh4sSJGD9+PA4cOMDhloiIdK5z585YtmwZ5s+fD7VaLXYcEgkHXGpVKpUKX375JQIDAyGTyZCRkYHJkyfzMbtERNRqwsPD0aFDB7z//vtiRyGR8BIFahWCICAlJQWrVq1Cjx49kJSUxDeQERGRXkgkEqxbtw7Dhw/HgAEDEBwcLHYk0jMOuKRzZ8+exbvvvouHDx/igw8+wKBBg8SOREREJsbJyQmff/45Zs+ejdTUVDz99NNiRyI94iUKpDM///wzZsyYgdmzZ2PChAk4dOgQh1siIhJN//79MX36dERGRvJ6XBPDAZdarLCwEPPmzUNYWBj8/f2RlZWFiRMn8jpbIiIS3bx58+Ds7Izo6GgIgiB2HNITDrjUbLdv38abb76J0NBQdO3aFVlZWZg9e3abeg44ERGZNqlUik8++QRnz57F9u3bxY5DesJrcKnJbty4gXXr1mH//v0ICwvDDz/8ACcnJ7FjERER1cnW1hbbtm3DmDFj0K1bNwwePFjsSNTKeAaXGq2goAALFy5EYGAg2rVrh8zMTLz99tscbomIqM1zd3fHv/71L7z22mu4dOmS2HGolfEMLjXop59+wqeffoqsrCxMnz4dOTk5sLC
wEDsWERFRkwQEBOD999/H1KlTkZCQAA8PD7EjUSvhgEt10mg0SE9Px+bNm3H16lXMmjULH374IWxtbWFlZYWKigqxIxIRETXZqFGjoFAoEB4ejsTERHTs2FHsSNQKOOBSDeXl5YiPj8fWrVthZWWFWbNm4YUXXuAZWyIiMhpTpkxBSUkJxo8fj/j4eDz11FNiRyId44BLAIBffvkFu3fvRnx8PPz8/PDBBx/A398fEolE7GhEREQ699prr0EikWiHXD4IwrhwwDVhKpUKaWlpiIuLQ35+PsLCwnDgwAF06dJF7GhERESt7tVXX4VMJsP48eOxe/dufv8zIhxwTYwgCLh8+TL27t2LhIQE9OrVC9OmTcPw4cMhl8vFjkdERKRXc+bMgY2NDcaOHYutW7eib9++YkciHeCAayKKioqQlJSEpKQklJWVYezYsTxbS0REhD+uye3YsSOmTZuGjz76CMOHDxc7ErUQB1wjplAokJaWhqSkJFy5cgWhoaF4//334evrC6mUt0AmIiJ6bNiwYYiLi8OMGTNw6dIlvPHGG/xeacA44BqZoqIipKWl4bvvvkNeXh6GDBmC2bNnIygoiJcgEBER1cPHxwepqamYO3cuzp8/jw0bNsDR0VHsWNQMHHANnCAIuHLlCr777jt89913+O233xASEoI5c+Zg0KBBsLS0FDsiERGRwejQoQO+/fZbvPfeexg+fDjWr1+P559/XuxY1EQSQRAEMQMoFIoW/whAKpVCo9HoKJF+SCQSyOVyqNVqNPW3oKSkBJmZmTh69CiOHj0KMzMzjBgxAqNHj4a/vz/MzFr33y2m1rfY2Ld+sW/9Yt/6xb6b5vDhw1iwYAH++7//G2+99VaTTxoZYt/GcsZa9AH35s2bLd6HIT5Zy9zcHK6urrh37x4qKyvrXVej0eDixYtIT09HRkYGfvrpJ/Tv3x+BgYEIDAxE165d9Xq/WmPvu61h3/rFvvWLfesX+2664uJiREVF4eLFi3jvvfcQFBTU6G0NsW9juR8wL1Fog6qrq5GXl4fs7GycPHkSp06dgqurKwIDA/HGG2+gf//+sLKyEjsmERGR0XNycsKXX36Jo0ePIjo6Gr169cKKFSvg5uYmdjSqBwfcNqCqqgoXL17EyZMnkZ2djTNnzqB9+/bw9/fHmDFjsGrVKj4rm4iISERDhw7FgAED8MUXXyAkJASTJk3C3Llz4eTkJHY0qgMHXBGo1WqcO3cOFy5cwOHDh3HmzBl06tQJ/v7+mDBhAtatWwcXFxexYxIREdGfWFlZYdGiRZg0aRI+/vhjDB48GP/zP/+DGTNmwM7OTux49CcccPXg0aNHOH/+vPaSg3PnzsHDwwNDhw7FtGnTsGHDBv4LkIiIyEA89dRT+OCDDzBnzhysX78eAQEBCAsLw8yZM3npQhvBAbcVVFRUICcnBydPnsTJkyeRm5uL7t27w9/fHzNnzkT//v3h6upqsG9SICIiIsDDwwMbNmzAb7/9hi1btiAkJASDBw/GpEmTeGsxkXHA1YHy8nKcOXMGJ06cwMmTJ3H58mX07NkTAQEBiIyMhJ+fH9q1ayd2TCIiImoFbm5uWL58ORYsWIDExESsXLkSFRUVmDp1Kl588UU89dRTYkc0ORxwm6GsrEw70GZnZyMvLw/PPfcc/P39sXDhQvj6+sLa2lrsmERERKRH9vb2mD59OiIiInDu3Dl88803CA4ORs+ePfHCCy9g9OjRfI+NnnDAbYSysjL8+9//RnZ2Nk6cOIGff/4Zzz33HAICArBkyRL4+vrytl1EREQE4I+HU/Tt2xfPP/883nnnHWRmZiI5ORkffPABvL29ERISguDgYLi7u4sd1WhxwK1DfQPtsmXL0K9fPw60RERE1CBLS0sMHz4cw4cPR0VFBTIyMnDkyBF8+umncHBwQHBwMIKDg+Hr69vqTyI1JWwSgFKprDHQXrlyBd7e3ggICMD//u//om/fvhxoiYiIqEWsrKw
wcuRIjBw5EhqNBhcuXMCRI0ewfPlyFBYWwt/fH4MGDcLAgQPRo0cPvT6l1NiY5ICrVCpx+vRpZGdnIzs7G1euXIGPjw8CAgIQFRWFPn36cKAlIiKiViOVStG7d2/07t0bixcvxu+//47jx48jKysLmzdvhkqlwsCBAzFu3LgmPR6Y/mBSA+4PP/yADz/8UDvQDhgwANHR0ejTpw8sLS3FjkdEREQmysXFBWPGjMGYMWMAAIWFhfjxxx9RVlYmcjLDZFIDrru7OwdaIiIiavPc3d35JrQWMLkBlwcLERERkXGTih2AiIiIiEiXOOASERERkVHhgEtERERERoUDLhEREREZFQ64RERERGRUOOASERERkVHhgEtERERERoUDLhEREREZlUY96KG8vBzJyckoKCiAtbU1hg4dCm9v71rrCYKAI0eO4OzZswCAvn37Ijg4GBKJRLepiYiIiIieoFEDbmpqKmQyGRYvXozbt29j9+7d6NixI9q3b19jvZycHPz888+YM2cOJBIJduzYAQcHB/j5+bVKeCIiIiKiv2pwwFWr1cjLy0NkZCQsLCzg7u4OT09P5ObmYtiwYTXWPX/+PAICAmBvbw8AGDBgAHJycrQDbmlpKZRKZa3929jYtOiLkMlkMDc3b9E+9M3MzKzGfw0J+9Yv9q1f7Fu/2Ld+sW/9MsS+jUWDR4tCoYBUKoWLi4t2WYcOHVBYWFhr3Xv37qFjx4411rt375721zk5OcjMzKyxzZAhQxAUFNSs8MbA0dFR7AgmhX3rF/vWL/atX+xbv9g3NUWjzuBaWFjUWGZpaQmVStXgupaWllCr1RAEARKJBP369YOnp2etbf48BDeHhYVFnXnaMjMzMzg6OuL+/fuoqqoSO06TsG/9Yt/6xb71i33rF/vWL0Ps29XVVewIOtHggCuXy2v95qhUqlpDb13rqlQqyOVy7ZvM7OzsYGdnV2ObmzdvorKyslnhHzMzM2vxPsRSVVVlcNnZt36xb/1i3/rFvvWLfeuXIfdt6Bq8TZizszM0Gg0UCoV22e3bt+uc8F1dXXHnzp0G1yMiIiIiai0NDrhyuRxeXl5IT0+HWq3G9evXceUokRHFAAANh0lEQVTKFfj4+NRa18fHB9nZ2SgtLUVpaSmys7PRu3fvVglORERERFSXRr0lMTQ0FPv378eaNWtgZWWF0NBQtG/fHoWFhYiLi0N0dDQAwNfXF/fv38cXX3wB4I/74Pr6+rZeeiIiIiKiv2jUgGttbY2JEyfWWu7u7q4dbgFAIpEgJCQEISEhuktIRERERNQEEkEQBLFDmKLS0lLk5OSgX79+td54R7rHvvWLfesX+9Yv9q1f7Juao8FrcKl1KJVKZGZm1nrwBbUO9q1f7Fu/2Ld+sW/9Yt/UHBxwiYiIiMiocMAlIiIiIqPCAZeIiIiIjIpsxYoVK8QOYYoEQYBcLkeXLl3qfCoc6Rb71i/2rV/sW7/Yt36xb2oO3kWBiIiIiIxKo+6DS41TVVWFlJQU/Prrr6ioqICjoyOCg4PRvXt3VFVVISEhATdv3kRJSQmmTZsGDw+PJ+6rvLwcycnJKCgogLW1NYYOHQpvb289fjVtny77/uqrr1BUVASp9I+rduzs7DBv3jx9fSkGob6+b9y4gfT0dNy8eRNSqRRdunTByJEj0a5duzr3xeO7Ybrsm8d3w+rr++7du0hKSsL9+/cBAE899RRGjhyJ9u3b17kvHt8N02XfPL6pTgLpjEqlEo4dOyYUFxcL1dXVws8//yzExMQIxcXFQmVlpXDixAnh2rVrwpo1a4Rff/213n3Fx8cLe/bsER49eiRcu3ZNWLVqlXDnzh09fSWGQZd9b9u2TThz5oyekhum+vq+evWqcOnSJaGiokJQqVRCUlKSsGPHjifui8d3w3TZN4/vhtXXd3l5uVBcXCxoNBqhurpayM7OFj7//PMn7ovHd8N02TePb6oL32SmQ3K5HEFBQXB0dIRUKoWnpyc
cHBxw69YtmJmZISAgAO7u7pBIJPXuR61WIy8vD0FBQbCwsIC7uzs8PT2Rm5urp6/EMOiqb2qc+vru3r07evXqBUtLS8jlcvTv3x83btyocz88vhtHV31T49TXt5WVFRwdHSGRSCAIAqRSKYqLi+vcD4/vxtFV30RPwksUWpFSqYRCoYCrq2uTtlMoFJBKpXBxcdEu69ChAwoLC3Ud0ag0t+/Hjh49iiNHjsDFxQX/9V//Ve8lDVR/34WFhU/8feDx3TzN7fsxHt9NU1ffq1evhlqthiAICAoKqnM7Ht/N09y+H+PxTX/FAbeVVFdXIyEhAb17927ywKVWq2u9U9TS0hIqlUqXEY1KS/oGgGHDhsHV1RUymQyXLl3C119/jTlz5sDJyakV0hq++vq+ffs2MjMzMXHixDq35fHddC3pG+Dx3VRP6vvNN9+EWq3G+fPn4eDgUOe2PL6briV9Azy+qW68RKEVaDQaJCYmQiaTYdSoUU3eXi6X1/rLUKVS8fYoT9DSvgGgU6dOsLCwgJmZGXr37o3OnTsjPz9fx0mNQ319KxQK7Nq1CyNHjoS7u3ud2/P4bpqW9g3w+G6Khv4+kcvl8PX1RVJSUp2PjuXx3TQt7Rvg8U1144CrY4IgIDk5GWVlZQgLC4NMJmvyPpydnaHRaKBQKLTLbt++3ewfvRszXfRdl8fXflFN9fX94MED7NixA4MHD4aPj88T98Hju/F00XddeHzXrbF/nwiCgMrKSjx8+LDWazy+G08XfdeFxzcBHHB17uDBg7h37x4mTpwIc3PzGq9VVVWhsrISwB8/kqmsrKzzD6FcLoeXlxfS09OhVqtx/fp1XLlypcnfxEyBLvquqKjAL7/8gsrKSlRXV+PChQsoLCzEM888o5evwZA8qe/S0lLExsaif//+8PPzq3cfPL4bTxd98/huvCf1XVBQgFu3bkGj0eDRo0c4dOgQLC0ta1xn+xiP78bTRd88vulJ+KAHHXrw4AE+/vhjyGQy7f34AOCFF16At7c31q9fj5KSkhrbzJ8/H46Ojvjhhx9w/fp1TJ48GcAf91Hcv38/fv31V1hZWSE4OJj3UfwLXfVdVlaGXbt24ffff4dEItG+SaFbt276/pLatPr6Li4uRkZGRq1/ZERHRwMAj+9m0FXfPL4bp76+ZTIZjh07htLSUpibm8PNzQ1Dhw5Fx44dAfD4bg5d9c3jm56EAy4RERERGRVeokBERERERoUDLhEREREZFQ64RERERGRUOOASERERkVHhgEtERERERoUDLhEREREZFQ64RERERGRUOOASERERkVHhgEtERERERoUDLhEREREZFQ64RERERGRUOOASERERkVHhgEtERERERoUDLhEREREZFQ64RERERGRUOOASERERkVHhgEtERERERoUDLhEREREZFQ64RNSmBAYGwtHRESqVSrtMEAQsW7YMzs7OcHZ2xrJlyyAIQo3tlEolbG1tMXLkyFr7LC4uxtixY2FjYwN3d3fs3r1b+9qqVatga2ur/bCysoJUKsXvv/8OAIiIiIBcLq+xTnV1NQAgIyMDUqlUu9zNzQ3Lly+v9fk/+eQTeHh4wMbGBl5eXrh69SoAYPv27ZDJZDX2bWtri5s3b0KlUmHmzJlwd3dHu3bt0Lt3b6SlpbW8YCIiE8ABl4jajGvXruHHH3+ERCJBcnKydvmmTZuwb98+5Obm4sKFCzhw4AA2btxYY9uEhARYWFjg8OHDuH37do3XXn31Vcjlcty5cwe7du3C3LlzcfnyZQBAVFQUlEql9mPZsmUIDAyEi4uLdvulS5fWWEcmk2lfe/rpp7XLs7KysHXrVuzbt0/7+pYtW7B161akpKRAqVTi4MGDNfYdEBBQY99KpRJPP/00qqqq0LlzZ2RmZqKkpATvvfceJkyYgGvXrumkayIiY8YBl4jajB07dsDf3x8RERGIjY3VLo+NjcWiRYvQqVMnuLm5YdGiRdi+fXuNbWNjYzF
nzhx4e3sjLi5Ou7ysrAwJCQl49913YWtri4EDB+LFF1/Ezp07a31+QRCwY8cOTJs2rVn5PTw8MGDAAOTl5QEANBoNVq5cifXr1+PZZ5+FRCJBt27d4OTk1OC+bGxssGLFCnTp0gVSqRSjR4+Gh4cHcnJympWNiMiUcMAlojZjx44dCA8PR3h4OA4dOoQ7d+4AAC5fvgwfHx/tej4+PtozsABQWFiIjIwM7bY7duzQvnb16lWYmZmhR48eT9z+sR9//BF3797FSy+9VGP5F198AScnJ/Tr1w8JCQlPzJ+fn4/jx4/D398fAFBUVISioiJcunQJnTt3hoeHB5YvXw6NRtPEZoA7d+7g6tWr6NWrV5O3JSIyNRxwiahNyMrKQmFhISZMmIB+/fqhW7du2mtllUol7O3tteva29tDqVRqr8PduXMnvL298eyzz+Lll1/G5cuXce7cOe22dnZ2NT6Xvb09Hj58WCtDbGwsxo8fD1tbW+2y119/Hfn5+bh79y7effddRERE4Pjx49rXb968CQcHB9jZ2aFHjx74+9//joEDBwL4Y8AFgO+//x4XL15Eeno6vv76a2zdulW7/cmTJ+Hg4KD96NatW61clZWVCA8Px7Rp09CzZ8+mFUtEZII44BJRmxAbG4uQkBDt9amTJk3SXqZga2uL0tJS7bqlpaWwtbWFRCIB8P/P/AKAm5sbhgwZ8sRtH2/frl27GsvKy8sRHx9f6/KEvn37wtnZGWZmZhg1ahTCw8ORmJioff3pp5/GgwcPUFpaigcPHsDKykq7DysrKwB/XMPr4OCALl264JVXXkFqaqp2e39/fzx48ED7UVBQUOPzazQaTJkyBXK5HJ999llTKiUiMllmYgcgIqqoqMCePXtQXV2Njh07AgBUKhUePHiA3Nxc9OrVC7m5uejfvz8AaJcBwIkTJ5Cfn4/Vq1dj7dq1AICHDx/i0qVL+Oijj9CjRw9UVVUhPz8f3bt3r7X9Y0lJSXByckJgYGC9WSUSSa07ODxmb2+PSZMmISwsDADg6ekJuVyuHcQfb99YgiBg5syZuHPnDlJTU2Fubt7obYmITBnP4BKR6Pbt2weZTIa8vDycP38e58+fx08//YRBgwZhx44dmDp1KtatW4fffvsNN2/exNq1axEREQHgjzO/w4YNq7HtpUuXUFFRgbS0NNjY2GDcuHF4++23UVZWhuPHj2P//v2YMmVKjQyxsbGYOnVqrQF07969UCqV0Gg0+P777xEXF4cXX3yxzq9DqVTim2++0Q7P1tbWCAsLw4cffoiHDx+iqKgImzZtwujRoxvVy9y5c/HTTz/hwIED2rPBRETUCAIRkciGDx8uLFy4sNbyb7/9VujQoYOgVquFJUuWCI6OjoKjo6OwZMkSQaPRCBUVFYKDg4OQnJxca9u5c+cKL730kiAIgqBQKIQxY8YI1tbWQufOnYVdu3bVWLeoqEiQyWRCfn5+rf0MHDhQsLOzE9q1ayd4e3sLX3/9tfa19PR0QSKRCDY2NoKNjY3g5OQkjBo1qsZ+SkpKhLCwMMHW1lbo1KmTsHLlSkGj0QiCIAhfffWVIJVKtds//jh9+rRw7do1AYBgYWFR47W4uLjmlUxEZEIkgvCEn7URERERERkgXqJAREREREaFAy4RERERGRUOuERERERkVDjgEhEREZFR4YBLREREREaFAy4RERERGRUOuERERERkVDjgEhEREZFR+X/r0LDogfWjIAAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# look at the distributions\n", - "from ggplot import *\n", - "ggplot(aes(x = 'A0A075B6E2'), data = x_50_before)+geom_density()+ggtitle('RAW DATA')\n", - "x_50_before_log = pd.DataFrame(np.log(x_50_before), columns = x_50_before.columns)\n", - "from ggplot import *\n", - "ggplot(aes(x = 'A0A075B6E2'), data = x_50_before_log)+geom_density()+ggtitle('LOG TRANSFORMED DATA')" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# heatmap of log trans, zeroone, and z-scaled data (before)\n", - "data_before_pca = log_z_zeroone_na(x_50_before)\n", - "sns.heatmap(data_before_pca)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# heatmap of log trans and z-scaled data (before)\n", - "data_log_z_na = log_z_na(x_50_before)\n", - "sns.heatmap(data_log_z_na)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# heatmap of log trans and z-scaled data (after)\n", - "data_after_pca = log_z_zeroone_na(x_50_after)\n", - "sns.heatmap(data_after_pca)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/tzx804/env/fixjupyter/lib/python3.7/site-packages/ipykernel_launcher.py:6: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", - " \n" - ] - } - ], - "source": [ - "## PCA\n", - "result, args = runPCA(data_before_pca)\n", - "result.set_index(x_50_before.index, inplace = True)\n", - "\n", - "info = labels[['index','MS_instrument', 'LC','ColumnLength','shortdate']] # i have not included pid because there are so many different..\n", - "info['ColumnLength'] = labels.ColumnLength.astype(int).astype(str)\n", - "info = info.loc[x_50_before.index,:]\n", - "result_to_plot = pd.merge(result,info, left_index = True, right_index = True)\n", - "result_to_plot = pd.melt(result_to_plot, id_vars =['index','x','y'], value_vars =['MS_instrument','LC','ColumnLength','shortdate']) \n", - "\n", - "#from ggplot import *\n", - "#ggplot(aes(x = 'x', y='y', color = 'value'), data = result_to_plot)+geom_point()+theme_bw()+facet_grid('variable')" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# take a closer look at each\n", - "date_data = result_to_plot.loc[result_to_plot['variable']=='ColumnLength']\n", - "from ggplot import *\n", - "ggplot(aes(x = 'x', y='y',color = 'value'), data = date_data)+geom_point(size =75, alpha = 0.8)+theme_bw()+scale_color_manual(values = ['grey','brown','mediumaquamarine','mediumaquamarine','orchid'])+ggtitle('COLUMN LENGTH')\n" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "date_data = result_to_plot.loc[result_to_plot['variable']=='MS_instrument']\n", - "from ggplot import *\n", - "ggplot(aes(x = 'x', y='y',color = 'value'), data = date_data)+geom_point(size =75, alpha = 0.8)+theme_bw()+scale_color_manual(values = ['brown','mediumaquamarine','orchid'])+ggtitle('MS INSTRUMENT')\n" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "date_data = result_to_plot.loc[result_to_plot['variable']=='LC']\n", - "from ggplot import *\n", - "ggplot(aes(x = 'x', y='y',color = 'value'), data = date_data)+geom_point(size =75, alpha = 0.8)+theme_bw()+scale_color_manual(values = ['grey','brown','orange','mediumaquamarine','royalblue','orchid'])+ggtitle('LC')\n" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# take a closer look at each\n", - "date_data = result_to_plot.loc[result_to_plot['variable']=='shortdate']\n", - "from ggplot import *\n", - "ggplot(aes(x = 'x', y='y',color = 'value'), data = date_data)+geom_point(size =75, alpha = 0.8)+theme_bw()+scale_color_manual(values = ['grey','brown','mediumaquamarine','mediumaquamarine','royalblue','orchid'])+ggtitle('DATE')\n" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/tzx804/env/fixjupyter/lib/python3.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", - " after removing the cwd from sys.path.\n" - ] - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# look at missing values\n", - "miss = x_50_before.T.isnull().sum().astype(float).tolist()\n", - "date_data = result_to_plot.loc[result_to_plot['variable']=='shortdate']\n", - "date_data['missingness']=[(mis/x_50_before.shape[1])*100 for mis in miss]\n", - "from ggplot import *\n", - "ggplot(aes(x = 'x', y='y', color='missingness'),data = date_data)+geom_point(size =75, alpha = 0.8) + scale_color_gradient(low = \"#00AFBB\", high = \"#E7B800\")+theme_bw()+ggtitle('MISSINGNESS')" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(177, 232)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/tzx804/env/fixjupyter/lib/python3.7/site-packages/ipykernel_launcher.py:10: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", - " # Remove the CWD from sys.path while we load stuff.\n" - ] - } - ], - "source": [ - "# grouping according to missing values! 
--> look at x_90\n", - "\n", - "x_90_before = coverage(x.loc[labels['datetime']<'2019-06-01',:], 0.9,0.9)\n", - "print(x_90_before.shape)\n", - "data_before_pca = log_z_zeroone_na(x_90_before)\n", - "result, args = runPCA(data_before_pca)\n", - "result.set_index(x_90_before.index, inplace = True)\n", - "\n", - "info = labels[['index','MS_instrument', 'LC','ColumnLength','shortdate']] # i have not included pid because there are so many different..\n", - "info['ColumnLength'] = labels.ColumnLength.astype(int).astype(str)\n", - "info = info.loc[x_90_before.index,:]\n", - "result_to_plot = pd.merge(result,info, left_index = True, right_index = True)\n", - "result_to_plot = pd.melt(result_to_plot, id_vars =['index','x','y'], value_vars =['MS_instrument','LC','ColumnLength','shortdate']) " - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/tzx804/env/fixjupyter/lib/python3.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", - " after removing the cwd from sys.path.\n" - ] - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# look at missing values\n", - "miss = x_90_before.T.isnull().sum().astype(float).tolist()\n", - "date_data = result_to_plot.loc[result_to_plot['variable']=='shortdate']\n", - "date_data['missingness']=[(mis/x_90_before.shape[1]) for mis in miss]\n", - "from ggplot import *\n", - "ggplot(aes(x = 'x', y='y', color='missingness'),data = date_data)+geom_point(size =75, alpha = 0.8) + scale_color_gradient(low = \"#00AFBB\", high = \"#E7B800\")+theme_bw()+ggtitle('MISSINGNESS')" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# take a closer look at each\n", - "date_data = result_to_plot.loc[result_to_plot['variable']=='ColumnLength']\n", - "from ggplot import *\n", - "ggplot(aes(x = 'x', y='y',color = 'value'), data = date_data)+geom_point(size =50, alpha = 0.8)+theme_bw()+scale_color_manual(values = ['grey','brown','mediumaquamarine','mediumaquamarine','orchid'])+ggtitle('COLUMN LENGTH')\n" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "date_data = result_to_plot.loc[result_to_plot['variable']=='LC']\n", - "from ggplot import *\n", - "ggplot(aes(x = 'x', y='y',color = 'value'), data = date_data)+geom_point(size =50, alpha = 0.8)+theme_bw()+scale_color_manual(values = ['grey','brown','orange','mediumaquamarine','royalblue','orchid'])+ggtitle('LC')\n" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "date_data = result_to_plot.loc[result_to_plot['variable']=='MS_instrument']\n", - "from ggplot import *\n", - "ggplot(aes(x = 'x', y='y',color = 'value'), data = date_data)+geom_point(size =50, alpha = 0.8)+theme_bw()+scale_color_manual(values = ['brown','mediumaquamarine','orchid'])+ggtitle('MS INSTRUMENT')\n" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "date_data = result_to_plot.loc[result_to_plot['variable']=='shortdate']\n", - "from ggplot import *\n", - "ggplot(aes(x = 'x', y='y',color = 'value'), data = date_data)+geom_point(size =50, alpha = 0.8)+theme_bw()+scale_color_manual(values = ['grey','brown','orange','mediumaquamarine','royalblue','orchid'])+ggtitle('DATE')\n" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'. Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.\n", - "'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'. Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.\n", - "'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'. Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.\n", - "'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'. 
Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.\n", - "'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'. Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.\n", - "'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'. Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.\n", - "'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'. Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.\n", - "'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'. Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.\n", - "'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'. Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.\n", - "'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'. 
Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.\n", - "'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'. Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.\n", - "'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'. Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.\n", - "'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'. Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.\n" - ] - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "info = labels[['PID']]\n", - "info = info.loc[x_90_before.index,:]\n", - "result_to_plot = pd.merge(result,info, left_index = True, right_index = True)\n", - "from ggplot import *\n", - "ggplot(aes(x = 'x', y='y',color = 'PID'), data = result_to_plot)+geom_point(size =50, alpha = 0.8)+theme_bw() #+scale_color_manual(values = ['brown','mediumaquamarine','orchid'])+ggtitle('MS INSTRUMENT')\n" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(272, 25427)\n", - "(236, 157)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/tzx804/env/fixjupyter/lib/python3.7/site-packages/ipykernel_launcher.py:11: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", - " # This is added back by InteractiveShellApp.init_path()\n" - ] - } - ], - "source": [ - "## PCA\n", - "x_90_after = coverage(x.loc[labels['datetime']>='2019-06-01',:], 0.9,0.9)\n", - "print(x.loc[labels['datetime']>='2019-06-01',:].shape)\n", - "print(x_90_after.shape)\n", - "data_after_pca = log_z_zeroone_na(x_90_after)\n", - "\n", - "result, args = runPCA(data_after_pca)\n", - "result.set_index(x_90_after.index, inplace = True)\n", - "\n", - "info = labels[['index','MS_instrument', 'LC','ColumnLength','shortdate','PID']] \n", - "info['ColumnLength'] = labels.ColumnLength.astype(int).astype(str)\n", - "info = info.loc[x_90_after.index,:]\n", - "result_to_plot = pd.merge(result,info, left_index 
= True, right_index = True)\n", - "result_to_plot = pd.melt(result_to_plot, id_vars =['index','x','y'], value_vars =['MS_instrument','LC','ColumnLength','shortdate','PID']) " - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/tzx804/env/fixjupyter/lib/python3.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", - " after removing the cwd from sys.path.\n" - ] - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# look at missing values\n", - "miss = x_90_after.T.isnull().sum().astype(float).tolist()\n", - "date_data = result_to_plot.loc[result_to_plot['variable']=='shortdate']\n", - "date_data['missingness']=[(mis/x_90_after.shape[1]) for mis in miss]\n", - "from ggplot import *\n", - "ggplot(aes(x = 'x', y='y', color='missingness'),data = date_data)+geom_point(size =50, alpha = 0.8) + scale_color_gradient(low = \"#00AFBB\", high = \"#E7B800\")+theme_bw()+ggtitle('MISSINGNESS')" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'. Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.\n", - "'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'. Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.\n", - "'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'. Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.\n", - "'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'. 
Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.\n" - ] - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "date_data = result_to_plot.loc[result_to_plot['variable']=='shortdate']\n", - "from ggplot import *\n", - "ggplot(aes(x = 'x', y='y',color = 'value'), data = date_data)+geom_point(size =50, alpha = 0.8)+theme_bw()#+scale_color_manual(values = ['grey','brown','orange','mediumaquamarine','royalblue','orchid'])+ggtitle('DATE')\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.12" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/project/misc_data_exploration_peptides.ipynb b/project/misc_data_exploration_peptides.ipynb deleted file mode 100644 index adeaccdf8..000000000 --- a/project/misc_data_exploration_peptides.ipynb +++ /dev/null @@ -1,765 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "# Peptides" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "import os\n", - "import config\n", - "from config import erda_dumps\n", - "from vaep.analyzers import analyzers" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "\n", - "from sklearn import preprocessing\n", - "from sklearn.decomposition import PCA" - ] - }, - { - "cell_type": "code", - "execution_count": 
null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "pd.options.display.max_columns = 100\n", - "pd.options.display.min_rows = 30" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Data file and other configurations:\n", - "\n", - "- [ ] file reader for peptide intensities" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# FN_PEPTIDE_INTENSITIES = config.FOLDER_DATA / 'df_intensities_N07813_M01000'\n", - "# analysis = analyzers.AnalyzePeptides.from_csv(FN_PEPTIDE_INTENSITIES, index_col=0)\n", - "# INDEX_NAME = 'Sample ID'\n", - "# analysis.df.index.name = INDEX_NAME\n", - "\n", - "FN_PEPTIDE_INTENSITIES = erda_dumps.FN_PEPTIDES # config.FOLDER_DATA / 'df_intensities_peptides_wide_2017_2018_2019_2020_N05011_M42725.pkl'\n", - "analysis = analyzers.AnalyzePeptides.from_pickle(FN_PEPTIDE_INTENSITIES)\n", - "\n", - "peptides = analysis.df\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "analysis.df.iloc[:10, :10]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "X = analysis.df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "N_MIN_OBS = 10\n", - "mask_min_obsevation = X.notna().sum() >= N_MIN_OBS\n", - "mask_min_obsevation.sum()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Cleaning step\n", - "\n", - "- remove fractionated samples (need to be re-run and added to the analysis)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "queries = set()\n", - "\n", - "def find_indices_containing_query(query):\n", - " mask = X.index.str.contains(query)\n", - " X_query = X.loc[mask].sort_index()\n", - " 
queries.add(query)\n", - " return X_query" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "X_frac = find_indices_containing_query('[Ff]rac')\n", - "X_frac.index" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def get_unique_stub(X:pd.Index):\n", - " # X_frac_unique = sorted(list(set())) # matches too much\n", - " ret = X.str.split('frac').str[0].str.rsplit('_', n=1).str[0]\n", - " return sorted(list(set(ret)))\n", - "\n", - "X_frac_unique = get_unique_stub(X_frac.index)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from functools import partial\n", - "import ipywidgets as widgets\n", - "\n", - "def show_fractions(stub:str, df):\n", - " subset = df[df.index.str.contains(stub)]\n", - " display(subset)\n", - " display(subset.notna().sum(axis=1))\n", - "\n", - "w_data = widgets.Dropdown(options=X_frac_unique, index=0)\n", - " \n", - "# show_fractions(stub=X_frac_unique[2], df=X_frac)\n", - "\n", - "show_fractions = partial(show_fractions, df=X_frac)\n", - "out_sel = widgets.interactive_output(show_fractions, {'stub': w_data})\n", - "widgets.VBox([w_data, out_sel])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- check for file names with `exp`. 
Some seem to be fractionated samples" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "X_exp = find_indices_containing_query('_exp\\d_')\n", - "X_exp.index" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "assert find_indices_containing_query('[gG][pP][fF]').empty" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "find_indices_containing_query('[cC][vV]').index" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "remove singe fraction samples (need to be quantified as one)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "X.drop(labels=X_frac.index, inplace=True)\n", - "X.drop(labels=X_exp.index, inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# should be part of analysis\n", - "mask_less_than_500 = X.notna().sum(axis=1) < 500\n", - "print(X.loc[mask_less_than_500].sort_index().notna().sum(axis=1).to_string()) #'samples_potentially_fractionated.txt'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "['concat', 'HpH', 'ingel']" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "## Peptitome is spares" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "def get_sorted_not_missing(X:pd.DataFrame):\n", - " \"\"\"Return a Dataframe with missing values. 
Order columns by degree of completness \n", - " over columns from variables least to most shared among observations.\"\"\"\n", - " X = X.notna().astype(int)\n", - " return X[X.mean().sort_values().index]\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false", - "tags": [] - }, - "outputs": [], - "source": [ - "%time not_missing = get_sorted_not_missing(X)\n", - "not_missing.iloc[:,-10:].describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "not_missing.iloc[:10,-10:]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false", - "tags": [] - }, - "outputs": [], - "source": [ - "grid_kws = {\"width_ratios\": (.9, .05), \"hspace\": 0.5}\n", - "N_MOST_COMMON_PEPTIDES = 300\n", - "data_to_visualize = not_missing.iloc[:, -N_MOST_COMMON_PEPTIDES:]\n", - "print(f\"Look at missingness pattern of {N_MOST_COMMON_PEPTIDES} most common peptides across sample.\\n\"\n", - " f\"Data matrix dimension used for printing: { data_to_visualize.shape}\" )\n", - "\n", - "fig_heatmap_missing, (axes_heatmap_missing, cbar_ax) = plt.subplots(1, 2, gridspec_kw=grid_kws, figsize=(12,8))\n", - "axes_heatmap_missing = sns.heatmap(data_to_visualize, \n", - " ax=axes_heatmap_missing,\n", - " cbar_ax=cbar_ax,\n", - " cbar_kws={\"orientation\": \"vertical\"})\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "White patches indicates that a peptide has been measured, black means it was not measured. Some samples (rows) have few of the most common peptides. This suggests to set a minimum of total peptides in a sample, which is common pratice. \n", - "\n", - "> An algorithm should work with the most common peptides and base it's inference capabilities after training on these." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false", - "tags": [] - }, - "outputs": [], - "source": [ - "# # This currently crashes if you want to have a pdf\n", - "from datetime import datetime\n", - "datetime_now = datetime.now()\n", - "\n", - "from vaep.plotting import _savefig\n", - "_savefig(fig_heatmap_missing, f'peptides_heatmap_missing_{datetime_now:%y%m%d}', folder=config.FIGUREFOLDER, pdf=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "## Sample stats" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "TYPE = 'peptides'\n", - "COL_NO_MISSING, COL_NO_IDENTIFIED = f'no_missing_{TYPE}', f'no_identified_{TYPE}'\n", - "COL_PROP_SAMPLES = 'prop_samples'\n", - "\n", - "def compute_stats_missing(X):\n", - " \"\"\"Dataset of repeated samples indicating if an observation\n", - " has the variables observed or missing x\\in\\{0,1\\}\"\"\"\n", - " sample_stats = X.index.to_frame(index=False).reset_index()\n", - " sample_stats.columns = ['SampleID_int', 'INDEX']\n", - " sample_stats.set_index('INDEX', inplace=True)\n", - " \n", - " sample_stats[COL_NO_IDENTIFIED] = X.sum(axis=1)\n", - " sample_stats[COL_NO_MISSING] = (X == 0).sum(axis=1)\n", - "\n", - " assert all(sample_stats[[COL_NO_IDENTIFIED, COL_NO_MISSING]].sum(axis=1) == X.shape[1])\n", - " sample_stats = sample_stats.sort_values(by=COL_NO_IDENTIFIED, ascending=False)\n", - " sample_stats[COL_PROP_SAMPLES] = np.array(range(1,len(sample_stats)+1)) / len(sample_stats)\n", - " return sample_stats\n", - "\n", - "sample_stats = compute_stats_missing(not_missing)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sample_stats" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, 
- "outputs": [], - "source": [ - "fig_ident = sns.relplot(x='SampleID_int', y=COL_NO_IDENTIFIED, data=sample_stats) \n", - "fig_ident.set_axis_labels('Sample ID', f'Frequency of identified {TYPE}')\n", - "fig_ident.fig.suptitle(f'Frequency of identified {TYPE} by sample id', y=1.03)\n", - "_savefig(fig_ident, f'identified_{TYPE}_by_sample', folder=config.FIGUREFOLDER)\n", - "\n", - "fig_ident_dist = sns.relplot(x=COL_PROP_SAMPLES, y=COL_NO_IDENTIFIED, data=sample_stats)\n", - "fig_ident_dist.set_axis_labels('Proportion of samples (sorted by frequency)', f'Frequency of identified {TYPE}')\n", - "fig_ident_dist.fig.suptitle(f'Frequency of identified {TYPE} groups by sample id', y=1.03)\n", - "_savefig(fig_ident_dist, f'identified_{TYPE}_ordered', folder=config.FIGUREFOLDER)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "COL_NO_MISSING_PROP = COL_NO_MISSING + '_PROP'\n", - "sample_stats[COL_NO_MISSING_PROP]= sample_stats[COL_NO_MISSING] / float(X.shape[1])\n", - "\n", - "# from ggplot import *\n", - "# ggplot(aes(x='nan_proc'), data = nonnan) + geom_histogram(binwidth = 0.005) #+ ylim(0,0.025)\n", - "sns.set(style=\"darkgrid\")\n", - "g = sns.relplot(x='prop_samples', y=COL_NO_MISSING_PROP, data=sample_stats)\n", - "plt.subplots_adjust(top=0.9)\n", - "g.set_axis_labels(\"Proportion of samples (sorted by frequency)\", \"proportion missing\")\n", - "g.fig.suptitle(f'Proportion of missing {TYPE} ordered')\n", - "_savefig(g, \"proportion_proteins_missing\", folder=config.FIGUREFOLDER)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "## Look at sequences" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "class SequenceAnalyser():\n", - " \n", - " def __init__(self, sequences : pd.Series):\n", - " if not isinstance(sequences, 
pd.Series):\n", - " raise ValueError(\"Please provide a pandas.Series, not {}\".format(type(sequences)))\n", - " self.sequences = sequences\n", - " \n", - " def calc_counts(self,n_characters):\n", - " return self.sequences.str[:n_characters].value_counts()\n", - " \n", - " def length(self):\n", - " return self.sequences.str.len().sort_values()\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sequences = SequenceAnalyser(analysis.df.columns.to_series())\n", - "sequences.length()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "import ipywidgets as w\n", - "w.interact(sequences.calc_counts, n_characters=w.IntSlider(value=4, min=1, max=55))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "sequences_p4 = sequences.calc_counts(4)\n", - "display(sequences_p4.head())\n", - "sequences_p4.loc[sequences_p4.isin(('CON_','REV_'))].sort_index()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "What to do when \n", - "\n", - "\n", - "```\n", - "AAAAAAAAAAGAAGGRGSGPGR\n", - "AAAAAAAAAAGAAGGRGSGPGRR\n", - "\n", - "AAAANSGSSLPLFDCPTWAGKPPPGLHLDVVK\n", - "AAAANSGSSLPLFDCPTWAGKPPPGLHLDVVKGDK\n", - "```\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "## Select Proteins" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "### Minumum required sample quality\n", - "First define the minum requirement of a sample to be kept in " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false", - "tags": [] - }, - "outputs": [], - "source": [ - "import ipywidgets as w\n", - "MIN_DEPTH_SAMPLE = int(X.shape[-1] 
* 0.25)\n", - "w_min_depth_sample = w.IntSlider(value=MIN_DEPTH_SAMPLE, min=0, max=max(sample_stats[COL_NO_IDENTIFIED]))\n", - "print(f'Minimum {TYPE} per sample observed:')\n", - "w_min_depth_sample" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false", - "tags": [] - }, - "outputs": [], - "source": [ - "mask_samples = sample_stats[COL_NO_IDENTIFIED] >= w_min_depth_sample.value\n", - "print(f\"Selected {mask_samples.sum()} samples\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "### Distribution of Intensity values\n", - "- comparing non-transformed to $\\log_{10}$ transformed" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false", - "tags": [] - }, - "outputs": [], - "source": [ - "from vaep.transform import log\n", - "from random import sample\n", - "sample = X.sample(axis=0).iloc[0]\n", - "sample_id = sample.name # int(sample_stats.loc[sample.index].SampleID_int)\n", - "print(\"Sample ID:\", sample_id)\n", - "sns.set(style=\"darkgrid\")\n", - "sample = sample.dropna()\n", - "fig, axes = plt.subplots(1,2, figsize=(10,3))\n", - "sns.distplot(sample, bins=100, ax=axes[0])\n", - "axes[0].set_title(\"Unnormalized distribution\")\n", - "\n", - "sample_log = np.log(sample) # natural logarithm, could also be base_2, base_10 logarithm\n", - "sns.distplot(sample_log, bins=100, ax=axes[1])\n", - "axes[1].set_title('log (ln) normalized distribution')\n", - "\n", - "_ = fig.suptitle(f\"Dynamic Range of measured intensities in sample {sample_id}\")\n", - "fig.tight_layout(rect=[0, 0.03, 1, 0.95])\n", - "_savefig(fig, 'distribution_peptides_sample_' + str(sample_id), folder=config.FIGUREFOLDER)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false", - "tags": [] - }, - "outputs": [], - "source": [ - "from vaep.transform import log\n", - "from random import sample\n", - "sample = 
X.sample(axis=1)\n", - "sample_id = sample.columns[0]\n", - "print(\"Sample ID:\", sample_id)\n", - "sns.set(style=\"darkgrid\")\n", - "sample = sample.dropna()\n", - "fig, axes = plt.subplots(1,2, figsize=(10,3))\n", - "sns.distplot(sample, bins=100, ax=axes[0])\n", - "axes[0].set_title(\"Unnormalized distribution\")\n", - "\n", - "sample_log = np.log2(sample) # natural logarithm, could also be base_2, base_10 logarithm\n", - "sns.distplot(sample_log, bins=100, ax=axes[1])\n", - "axes[1].set_title('log (ln) normalized distribution')\n", - "\n", - "fig.suptitle(f\"Dynamic range of {sample_id} between samples\")\n", - "fig.tight_layout(rect=[0, 0.03, 1, 0.95])\n", - "_savefig(fig, 'distribution_peptides_sample_' + str(sample_id), folder=config.FIGUREFOLDER)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "### Reference table intensities (natural logarithm)\n", - "\n", - "14 to 23 spans a dynamic range of 3 orders of base 10" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "dynamic_range = pd.DataFrame(range(14, 24), columns=['x'])\n", - "dynamic_range['$e^x$'] = dynamic_range.x.apply(np.exp)\n", - "dynamic_range.set_index('x', inplace=True)\n", - "dynamic_range.index.name = ''\n", - "dynamic_range.T" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "## Next UP" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "### Find Protein of Peptides\n", - "- check with some reference list of peptides: This is created in `project\\FASTA_tryptic_digest.ipynb` " - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "vaep", - "language": "python", - "name": "vaep" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": 
"python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13 (default, Mar 28 2022, 06:59:08) [MSC v.1916 64 bit (AMD64)]" - }, - "nbdime-conflicts": { - "local_diff": [ - { - "diff": [ - { - "diff": [ - { - "diff": [ - { - "key": 4, - "op": "addrange", - "valuelist": "5" - }, - { - "key": 4, - "length": 1, - "op": "removerange" - } - ], - "key": 0, - "op": "patch" - } - ], - "key": "version", - "op": "patch" - } - ], - "key": "language_info", - "op": "patch" - } - ], - "remote_diff": [ - { - "diff": [ - { - "diff": [ - { - "key": 0, - "length": 1, - "op": "removerange" - } - ], - "key": "version", - "op": "patch" - } - ], - "key": "language_info", - "op": "patch" - } - ] - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/project/misc_data_exploration_proteins.ipynb b/project/misc_data_exploration_proteins.ipynb deleted file mode 100644 index faf1fe18e..000000000 --- a/project/misc_data_exploration_proteins.ipynb +++ /dev/null @@ -1,2133 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "# Denoising proteomics - data exploration" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "import os\n", - "from config import FN_PROTEIN_TSV \n", - "from config import FOLDER_DATA, FIGUREFOLDER" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "\n", - "from sklearn import preprocessing\n", - "from sklearn.decomposition import PCA" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "import os\n", - "import logging\n", - "logger = logging.getLogger()\n", - "def _savefig(fig, name, 
folder=FIGUREFOLDER, pdf=True):\n", - " \"\"\"Save matplotlib Figure (having method `savefig`) as pdf and png.\"\"\"\n", - " filename = os.path.join(folder, name)\n", - " fig.savefig(filename + '.png')\n", - " if pdf: fig.savefig(filename + '.pdf')\n", - " logger.info(f\"Saved Figures to {filename}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "#coverage\n", - "def coverage(X:pd.DataFrame, coverage_col:float, coverage_row:float):\n", - " \"\"\"Select proteins by column depending on their coverage. \n", - " Of these selected proteins, where the rows have a certain number of overall proteins.\n", - " \"\"\"\n", - " mask_col = X.isnull().mean() <= 1-coverage_col\n", - " df = X.loc[:,mask_col]\n", - " mask_row = df.isnull().mean(axis=1) <= 1-coverage_row\n", - " df = df.loc[mask_row,:]\n", - " return df" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false", - "toc-hr-collapsed": false - }, - "source": [ - "## Load Data" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "df = pd.read_table(FN_PROTEIN_TSV, sep = '\\t')\n", - "df.sort_values(by = ['Date'], inplace = True)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
indexDateMS_instrumentLCPIDColumnLengthA0A024QZ33A0A024QZ42A0A024QZP7A0A024QZX5...X6RK76X6RK96X6RKB4X6RKL2X6RKY7X6RLL4X6RLN4X6RLR1X6RLX0X6RM59
40820180713_QE8_nLC5_ASD_QC_Hela1_proteinGroups.txt20180713QE8nLC5ASDNaN46239000.0NaNNaN2.586900e+09...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
31220180713_QE8_nLC5_ASD_QC_Hela2_proteinGroups.txt20180713QE8nLC5ASDNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
28120180713_QE8_nLC5_ASD_QC_Hela2_20190226172112_...20180713QE8nLC5ASDNaNNaNNaNNaN3.078800e+09...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
8220190103_QE8_nLC0_LiNi_QC_MNT_15cm_Hela_02_pro...20190103QE8nLC0LiNi15.0NaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
16120190103_QE8_nLC0_LiNi_QC_MNT_15cm_Hela_01_pro...20190103QE8nLC0LiNi15.0NaNNaNNaN6.685000e+08...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", - "

5 rows × 25433 columns

\n", - "
" - ], - "text/plain": [ - " index Date \\\n", - "408 20180713_QE8_nLC5_ASD_QC_Hela1_proteinGroups.txt 20180713 \n", - "312 20180713_QE8_nLC5_ASD_QC_Hela2_proteinGroups.txt 20180713 \n", - "281 20180713_QE8_nLC5_ASD_QC_Hela2_20190226172112_... 20180713 \n", - "82 20190103_QE8_nLC0_LiNi_QC_MNT_15cm_Hela_02_pro... 20190103 \n", - "161 20190103_QE8_nLC0_LiNi_QC_MNT_15cm_Hela_01_pro... 20190103 \n", - "\n", - " MS_instrument LC PID ColumnLength A0A024QZ33 A0A024QZ42 \\\n", - "408 QE8 nLC5 ASD NaN 46239000.0 NaN \n", - "312 QE8 nLC5 ASD NaN NaN NaN \n", - "281 QE8 nLC5 ASD NaN NaN NaN \n", - "82 QE8 nLC0 LiNi 15.0 NaN NaN \n", - "161 QE8 nLC0 LiNi 15.0 NaN NaN \n", - "\n", - " A0A024QZP7 A0A024QZX5 ... X6RK76 X6RK96 X6RKB4 X6RKL2 X6RKY7 \\\n", - "408 NaN 2.586900e+09 ... NaN NaN NaN NaN NaN \n", - "312 NaN NaN ... NaN NaN NaN NaN NaN \n", - "281 NaN 3.078800e+09 ... NaN NaN NaN NaN NaN \n", - "82 NaN NaN ... NaN NaN NaN NaN NaN \n", - "161 NaN 6.685000e+08 ... NaN NaN NaN NaN NaN \n", - "\n", - " X6RLL4 X6RLN4 X6RLR1 X6RLX0 X6RM59 \n", - "408 NaN NaN NaN NaN NaN \n", - "312 NaN NaN NaN NaN NaN \n", - "281 NaN NaN NaN NaN NaN \n", - "82 NaN NaN NaN NaN NaN \n", - "161 NaN NaN NaN NaN NaN \n", - "\n", - "[5 rows x 25433 columns]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "# import datetime \n", - "# pd.to_datetime(df[\"Date\"],format='%Y%m%d') == datetime.datetime(2019, 4, 22)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "### Separate Data into proteome and meta-data" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "labels = df.iloc[:,0:6]\n", - "labels['shortdate']=labels['Date'].astype(str).str[:6]\n", - "X = 
df.iloc[:,6:] # ToDo: Rename everything from x to X -> code can be (potentially) copy-pasted\n", - "X.columns.name = 'proteins'" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
proteinsA0A024QZ33A0A024QZ42A0A024QZP7A0A024QZX5A0A024R161A0A024R1R8A0A024R341A0A024R368A0A024R3B9A0A024R3M2...X6RK76X6RK96X6RKB4X6RKL2X6RKY7X6RLL4X6RLN4X6RLR1X6RLX0X6RM59
40846239000.0NaNNaN2.586900e+09NaN7.753100e+08210280000.0NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
312NaNNaNNaNNaNNaNNaNNaNNaN3154400.0NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
281NaNNaNNaN3.078800e+09NaN1.238300e+09225030000.0NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
82NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
161NaNNaNNaN6.685000e+08NaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", - "

5 rows × 25427 columns

\n", - "
" - ], - "text/plain": [ - "proteins A0A024QZ33 A0A024QZ42 A0A024QZP7 A0A024QZX5 A0A024R161 \\\n", - "408 46239000.0 NaN NaN 2.586900e+09 NaN \n", - "312 NaN NaN NaN NaN NaN \n", - "281 NaN NaN NaN 3.078800e+09 NaN \n", - "82 NaN NaN NaN NaN NaN \n", - "161 NaN NaN NaN 6.685000e+08 NaN \n", - "\n", - "proteins A0A024R1R8 A0A024R341 A0A024R368 A0A024R3B9 A0A024R3M2 ... \\\n", - "408 7.753100e+08 210280000.0 NaN NaN NaN ... \n", - "312 NaN NaN NaN 3154400.0 NaN ... \n", - "281 1.238300e+09 225030000.0 NaN NaN NaN ... \n", - "82 NaN NaN NaN NaN NaN ... \n", - "161 NaN NaN NaN NaN NaN ... \n", - "\n", - "proteins X6RK76 X6RK96 X6RKB4 X6RKL2 X6RKY7 X6RLL4 X6RLN4 X6RLR1 \\\n", - "408 NaN NaN NaN NaN NaN NaN NaN NaN \n", - "312 NaN NaN NaN NaN NaN NaN NaN NaN \n", - "281 NaN NaN NaN NaN NaN NaN NaN NaN \n", - "82 NaN NaN NaN NaN NaN NaN NaN NaN \n", - "161 NaN NaN NaN NaN NaN NaN NaN NaN \n", - "\n", - "proteins X6RLX0 X6RM59 \n", - "408 NaN NaN \n", - "312 NaN NaN \n", - "281 NaN NaN \n", - "82 NaN NaN \n", - "161 NaN NaN \n", - "\n", - "[5 rows x 25427 columns]" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "X.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
indexDateMS_instrumentLCPIDColumnLengthshortdate
40820180713_QE8_nLC5_ASD_QC_Hela1_proteinGroups.txt20180713QE8nLC5ASDNaN201807
31220180713_QE8_nLC5_ASD_QC_Hela2_proteinGroups.txt20180713QE8nLC5ASDNaN201807
28120180713_QE8_nLC5_ASD_QC_Hela2_20190226172112_...20180713QE8nLC5ASDNaN201807
8220190103_QE8_nLC0_LiNi_QC_MNT_15cm_Hela_02_pro...20190103QE8nLC0LiNi15.0201901
16120190103_QE8_nLC0_LiNi_QC_MNT_15cm_Hela_01_pro...20190103QE8nLC0LiNi15.0201901
\n", - "
" - ], - "text/plain": [ - " index Date \\\n", - "408 20180713_QE8_nLC5_ASD_QC_Hela1_proteinGroups.txt 20180713 \n", - "312 20180713_QE8_nLC5_ASD_QC_Hela2_proteinGroups.txt 20180713 \n", - "281 20180713_QE8_nLC5_ASD_QC_Hela2_20190226172112_... 20180713 \n", - "82 20190103_QE8_nLC0_LiNi_QC_MNT_15cm_Hela_02_pro... 20190103 \n", - "161 20190103_QE8_nLC0_LiNi_QC_MNT_15cm_Hela_01_pro... 20190103 \n", - "\n", - " MS_instrument LC PID ColumnLength shortdate \n", - "408 QE8 nLC5 ASD NaN 201807 \n", - "312 QE8 nLC5 ASD NaN 201807 \n", - "281 QE8 nLC5 ASD NaN 201807 \n", - "82 QE8 nLC0 LiNi 15.0 201901 \n", - "161 QE8 nLC0 LiNi 15.0 201901 " - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "labels.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false", - "toc-hr-collapsed": false - }, - "source": [ - "### Proteome is sparse\n", - "Proteins that are only identified for a minority of samples should be removed" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "missingness = X.notnull().astype('int')\n", - "missingness = missingness[missingness.mean().sort_values().index]\n", - "missingness.index.name = 'SampleID'\n", - "missingness_index_id = missingness.index.to_frame()\n", - "missingness_index_id[\"DateID\"] = labels.Date\n", - "missingness.index = labels.Date\n", - "missingness.sort_index(inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "grid_kws = {\"width_ratios\": (.9, .05), \"hspace\": 0.5}\n", - "fig_heatmap_missing, (axes_heatmap_missing, cbar_ax) = plt.subplots(1, 2, gridspec_kw=grid_kws, figsize=(12,8))\n", - "axes_heatmap_missing = sns.heatmap(missingness, ax=axes_heatmap_missing,\n", - " cbar_ax=cbar_ax,\n", - " cbar_kws={\"orientation\": \"vertical\"})" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "# # This currently crashes if you want to have a pdf\n", - "_savefig(fig_heatmap_missing, 'proteins_heatmap_missing', pdf=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "#### Samples have 6 to 4461 identified protein-groups\n", - "- cutoff for minimum quality of peptide, e.g. at least 2500 identified proteins?\n", - "- Proteome is not consistent over many runs" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
SampleIDDateno_identified_proteinsno_missing_proteins_of_proteomeprop_samples
0408201907124461209660.002114
1312201907124437209900.004228
2281201909024304211230.006342
382201909104292211350.008457
4161201907084281211460.010571
..................
4684072019030616254110.991543
469712019030310254170.993658
47012201904139254180.995772
471375201907028254190.997886
47233201909066254211.000000
\n", - "

473 rows × 5 columns

\n", - "
" - ], - "text/plain": [ - " SampleID Date no_identified_proteins \\\n", - "0 408 20190712 4461 \n", - "1 312 20190712 4437 \n", - "2 281 20190902 4304 \n", - "3 82 20190910 4292 \n", - "4 161 20190708 4281 \n", - ".. ... ... ... \n", - "468 407 20190306 16 \n", - "469 71 20190303 10 \n", - "470 12 20190413 9 \n", - "471 375 20190702 8 \n", - "472 33 20190906 6 \n", - "\n", - " no_missing_proteins_of_proteome prop_samples \n", - "0 20966 0.002114 \n", - "1 20990 0.004228 \n", - "2 21123 0.006342 \n", - "3 21135 0.008457 \n", - "4 21146 0.010571 \n", - ".. ... ... \n", - "468 25411 0.991543 \n", - "469 25417 0.993658 \n", - "470 25418 0.995772 \n", - "471 25419 0.997886 \n", - "472 25421 1.000000 \n", - "\n", - "[473 rows x 5 columns]" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "COL_NO_MISSING_PROT, COL_NO_IDENTIFIED_PROT ='no_missing_proteins_of_proteome', 'no_identified_proteins'\n", - "COL_PROP_SAMPLES = 'prop_samples'\n", - "\n", - "sample_stats = missingness.sum(axis=1).to_frame(COL_NO_IDENTIFIED_PROT)\n", - "sample_stats[COL_NO_MISSING_PROT] = (missingness == 0).sum(axis=1)\n", - "\n", - "assert all(sample_stats.sum(axis=1) == X.shape[1])\n", - "sample_stats = sample_stats.sort_values(by=COL_NO_IDENTIFIED_PROT, ascending=False).reset_index()\n", - "sample_stats[COL_PROP_SAMPLES] = np.array(range(1,len(sample_stats)+1)) / len(sample_stats)\n", - "sample_stats = sample_stats.set_index(missingness_index_id.SampleID)\n", - "sample_stats.reset_index(inplace=True)\n", - "sample_stats" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAV8AAAF1CAYAAABPgunRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAgAElEQVR4nOzde5xbdZ3w8c/35OQyk5nptDPTAr0jF0EfUDsqgo/CooCogLbughYQWS7is7jrsy5eV0V9dl1UVlddBJHrCiIsguJlUUR2QYQWBbkK2pZe6G06nVsmk0nyff44v6Rn0sw0bSeTmeT7fr3mNcnJSfI9yck3v/yuoqoYY4yZWl6tAzDGmEZkydcYY2rAkq8xxtSAJV9jjKkBS77GGFMDlnyNMaYGLPnWiIjME5EHRGRARL5S5varROTTE9xfReSQKsX2lIgc7y6LiFwnIr0i8oiI/G8ReW4fH/d4EdkwqcFOIhEZFJGDax3HVBGR+0Xkr2sdx2QRkbUi8pZ9uN+E57SIXC8iX9i/6HbnT/YDishaYB6QC20+TFU3TfZzzXAXAtuBNi3T2VpVL56KIETkemCDqn4q9NyvCO3yRuCtwAJVHXLbDp+K2PaHiCwB1gBRVc1Wch9VbalmTGZ6UtX/pgbndLVKvu9U1ZbQ326JV0QmPfHPMIuBp8sl3mlmMbA2lHinhZl8/szk2M0kUtVJ/QPWAm8ps30JoMD5wIvAA277McBDwE7gceD40H2WAr8GBoB7gW8AN7vbjicosZV9boIvlo8BfwJ6gNuAOSWxnOti2Q58MvQ4EeAT7r4DwGpgIfBN4Cslz/kj4G/HeS2OBR4F+tz/Y93264FRIAMMjvN6XQ98IXT9o8BLwCbgAy7+Q9xtceDL7li2AFcBTeHXCfi/wFb3GOe52y4sieNH4dfRvVdpgl8xg8DnSl934CDgDmAbQUnz0tBtTe44eoGn3TFsKPdauf0VuBT4s3tPrgA8d9v7gQeBK4EdwBfce/wpYJ07thuBWW7/F93jDbq/N7jtHwCecTH9HFhc8vyHhF7/bwL3uHPgt8DLJoj9HBdHD/Bpxp6LnwVuB24G+oG/du/Zv7r3c5O7HA8d6/+UeW3CsV1F8JkYIPiMLHa3iXuNthKcd08Arxwn5vuBfwIecfvexa7PyD3A35Ts/wRwRpnHSbhj6yH4HD8KzHO3nede7wH3vl4Uut/xBOfmP7Dr3DwDOBX4o3ufPxHav/A6ft893mPA0Xv7+S8T//GMPadf7R57wD3XrYQ+i5OWKyf9AfecfG8EkgQfzPnuhTnVvVhvdde73H1+A3zVnahvci9Gpcn3b4GHgQXu/t8GbimJ5RoXx9HACHBEKNH9geCniLjbO4DXEXxQCgmhE0gVTrSSWOYQfMDPJqjeOctd7yiXXMvcv3g7cApBUn2le+2+x9gP478Cd7vnbCX4Qvin0OuUBS4Hou61TgGzx4uj5HV8P6FEEH7d3Xu2GvhHIAYcTPABO9nd/s/Af7u4FgJPlr5nZRLMr9z+iwg+gH8diiML/I17PZsIEukL7nlbgP8Ebip5j/3Q45/h9j/CPcangIcmSHA73HvuA/8B3DpO3EcSJPg3utfhywRfauHkO+qe33OxX05wfs4FuggKIJ8v95qPE9sAwWciDnytsD9wsntP2gnO3SOAA8eJ+35gI7vOqzvY9fn6S+C3oX2PJvhsxso8zkUE51wzQcFlGUF1GsDbgZe5WN5McO69puTc/EeCc/MCgi/x7xGcx68g+PI/uOR1XOH2/3t2VS1BhZ//iZKve//WAX/nnmOFe84Zk3wHCb4BdwI/LPkwHBza9zLchyW07ecEJdJF7
o1Jhm77HpUn32eAE0O3HeheRD8Uy4LQ7Y8AZ7rLzwGnj3N8zwBvdZf/D/CTcfY7G3ikZNtvgPeHPkCVJt/vAv8cuu0wF/8h7qQeIlQqA94ArAm9TsOMTUJbgWPGi4PKk+/rgRdL7vtx4Dp3+c/AKaHbLix9z0ruqyX7XwL8MhRH6XP9ErgkdP3wMu9x+Lh/Cpwfuu4RJIPFoecPJ7jvhPY9FXh2nLj/kdAHmyAJZRibfB8ouc+fgFND108mqN7Z7TUfJ7ZbQ7e1EPw6WQj8BcGX1jG4QsIEr/f9JefVkS7uCEHC2gEc6m77MvCtcR7nAwRfHkdN9Hxu3x8CHy45NyPueqs7zteH9l+NK2271/HhkvfvJeB/783nv0xMx7PrnH4TQQFLQrc/RBWSb7XqfM9Q1Xb3d0bJbetDlxcD7xGRnYU/gtLDgQQ/Z3t1bF3jur2IYTFwZ+hxnyE4QeeF9tkcupwiOIkhOIn/NM7j3gCsdJdXAjeNs99BZeJdR1Da31sHMfZ1Cz9uF8GHfXXoWH/mthf06NhGp/Cx7o/FwEEl798n2PUaTxT3eEr3P2ic2wqPH37MdQSJdx7lLQa+Fop1B8GX13jvyXjnR6kxx6mqKYJSYlglsR9E5cLPN0hwLAep6n0E1XPfBLaIyNUi0lbJ47gYokCnqo4Q/FRfKSIewS+38c71mwgKTbeKyCYR+RcRiQKIyNtE5GER2eFe81MJfjEW9KhqoXF+2P3fErp9mLGve/i48wTVFuVet0o+/+UcBGxUl3Wdvck7FatFV7PwQa0nKPm2h/6SqvrPBN9os0UkGdp/UejyEEHSAUBEIoxNOOuBt5U8dkJVN1YQ43qCn0rl3AycLiJHE/yk++E4+20iOAHCFhH8zNtbLxF8IYQfp2A7wQn6itBxztLKW+51z7uMaz1BCTv8Greq6qkVxD2e0v3DjbWlsZa+xoVfS1vK7FuI96KSeJtU9aEK4prISwQ/bwEQkSaCaqqwSmIvHGvpuX1AmedcGLq9haCqZhOAqn5dVZcR/Gw/jKAabTylr/cowTkFQUHjfcCJQEpVf1PuAVR1VFU/p6pHErRzvAM4R0TiBFUZXyaommsHfkLwhbevwsftEbzu5XpS7evn/yVgvoiEY6zkvN1rte7nezPwThE5WUQiIpJwfUEXqOo6YBXwORGJicgbgXeG7vtHICEib3ffsp8i+KlUcBXwRRFZDCAiXSJyeoVxfQf4vIgc6vq5HiUiHQCquoGgQeEm4A5VHR7nMX4CHCYi7xURX0T+iuBn3Y8rjCHsNuD9InKkiDQDnync4L79rwGuFJG57ljni8jJFT72FoI6033xCNAvIpeJSJN7D18pIq8Nxf1xEZktIgsI6mv35KNu/4XAhwkaPMZzC/B3IrLUJaD/B3zflfK3AfmSY7vKxfMKABGZJSLv2ZsDHsftBOfxsSISI2iY3FOCuQX4lDsvOwmqLm52tz0OvEJEXiUiCYKf26VOFZE3uuf7PEH97HoRea2IvN59JobY1WA6npWh8+py4PZCSdQl2zzwFcYv9SIiJ4jI/3IFoH6CBJ4jqD+NE7wXWRF5G3DSHl6XPVkmIu92PUb+lqCt5uEy++3r5/83BF/gl7rP7bsJ6v0nXU2Tr6quB04n+Km6jeDb6qOhuN5LUK+4gyDh3Bi6bx9BneB3CEqTQwQ/QQq+RtAI9V8iMkDwBr2+wtC+SpA4/ovgZLqWoJGk4AbgfzHBCamqPQQlgP9L8BP0H4B3qOr28e4zwWP9lKBR7T6CBqP7Sna5zG1/WET6gV9Qeb/Fa4Ej3c+z8Urx48WVI/hCfBVBw8d2gvdjltvlcwQ/2dYQvJbjvl4hdxHU8/2eoMX92gn2/a57zAfcc6RxCd799P8i8KA7tmNU9U7gSwQ/j/sJGgDfVunxjkdVn3LPeytByWmAoF59ZIK7fYGgcPEEQePuY24bqvpHgkT4C+B54H/K3P97BJ+JHQQNXO9z29sIvox72
dX74ssTxHETQR3yZoJeC5eW3H4jwbl+M+M7gOALqJ/g5/2vCdpmBtzj3ebieS/BZ3J/3AX8Fbsas9+tqqNl9tunz7+qZoB3E9S797rn+s/9jLksGVu1Mb2JyGcJGh1W7mnfKsfxJoKTcYkreZpJICJK0MDzQq1j2R+uFL6T4FjWVOHxr6dkYEy1iMg5wIWq+sZqP1cFsXyWafD5nyy1rnaYcdzPuQ8TtIRb4jUAiMg7RaTZtVF8maA0u7a2Ue0fVxVxCXB1rWOpR5Z894KIHEFQojmQoBrAmILT2TVg4lCCbosz52dlCddmsI2gTeB7NQ6nLs2oagdjjKkXVvI1xpgasORrjDE1YMnXGGNqwJKvMcbUgCVfY4ypAUu+xhhTA5Z8jTGmBupyOZNTTjlFf/azn9U6DGOMgXEmWarLku/27Xs9d40xxkypuky+xhgz3VnyNcaYGrDka4wxNWDJ1xhjasCSrzHG1IAlX2OMqQFLvsYYUwOWfI0xpgYs+RpjTA3U5fDifZXPKz1DGTLZHDE/QkcyhueVHRlojDH7xZKvk88rz20Z4IIbV7Ghd5gFs5u45pxuDp/XagnYGDPpLPk6PUMZLrhxFV0tcT79jiNpb4qyuS/NvLY4c5LxWodnjKkzlnydTDZHV0ucvz/5cC6744li6ffbK5fR3mTVD8aYyWUNbk7Mj3DpiYcWEy/Aht5hLrp5NT1DmRpHZ4ypN5Z8nY5kjKWdyWLiLdjQO0wmm6tRVMaYemXJ1/E8oTkeYcHspjHbF8xuIuZHahSVMaZeWfIN6UzGueac7mICLvR46EjGahyZMabeWINbiOcJh89r5c5LjrO+vsaYqrLkW8LzhK5W61pmjKkuS74lbJSbMWYqWJ1vSGGU2yfvfIIXd6QYHBllfW+Krf1p8nmtdXjGmDpiyTekZyjDlfc+xyUnHEJzLEJqJIcnQiqTY1PfMKOj1uXMGDM5LPmGZLI5li9byGhWSUQ9cqrc/Js1xPyg2mFTf5pNO4fJZvM1jtQYM9NZ8g0p1PF2tsSIeBHueXwjZx2zmO2DGW56aA0AIrB5IM2LPUNsHbDqCGPMvrEGt5COZIzhTJZMTvEEVnQvIpuDex7fyPLuhexMBcOMM1mlsyVG/3CWkdE8B7Yl8H37HjPGVM4yRojnCQfNaiLue+QVIp4Uk/DG3jSZbFDKjfnC2p4UPYMjvLB1kLU7hqwEbIzZK5Z8S/i+x4FtCWK+EAsl4eZYhM6WGJmsksrkePhP25iTjLG4o5m4H2FgxCbfMcZUzpJvGb7vsaC9mbamCFGXhFOZHDmFzpYY9z+7hbcfPZ/bV71IxBPiUY/RrLKxN8W6niFrlDPG7JEl33F4ntCWiDO/rYnWuMeCOU34npDToBqiUA+cHs2SymTZ1JfmxofWEI0IeVU29g1bEjbGjMuS7x74vses5gRL5yRpTUSI+x4RT4r1wBEvUmyUK+0Z4UeEnqERNrgSsQ3WMMYUWG+HCvm+R0dLglnZPJsH0mSyeZpjEQojj0t7RqRHs4xkcwxncvieR1dbHM0rm/qGyeWVpmiEzpa4DV02pkFZyXcv+b7HvJZ4sR44r5TtGRHxIvSlsiSiHslEhJ2pDH/ePsTlP3qKZzcP8OKOFBt6U1YtYUyDspLvPohGI8xNxhie00QunyMejeB7HunRfLFnhCdB41xOIZuDjb1pbnlkHZeccAhRVxIezebZ2DdMNOIxtyVufYWNaSCWfPdRLOazdE6SHakMIhCNQE6ViJsLolC1W6hVaI5FOO+4pTRFPeLRCJv70tzz+EbOPW4pnkDvcIZCBcRINk82r5aUjaljlnz3g+97zG1LFK+3xPIMZTIscCXipphfHJiRyuRY0tlMLs+YBrrBkSyqSjwaLFXUP5zlnsc38t5jlhQb7EbzavXExtQZS76TyPc9ZvkJkrEYO1IZ/AgMkiOvyvzZCUQET4JkXGig29ibZuGcZgprdFqDnTGNwZJvFYRLxO1NS
n86QzanZHJKKhdk2YhLmuEeExAk5Q29wyyc00zvUIa2pgjxaNBgN5zJkckqC+ckEIFtA2krFRszQ1nyrTLPE9qbg2WJ8nll484UOdViA124fhgYt8GuL5Ul5gttTRHS2TyZbLZYKp7TEkMEdgyNkMurJWRjZoCqt+SISEREficiP3bXl4rIb0XkeRH5vojE3Pa4u/6Cu31J6DE+7rY/JyInVzvmavE8YX57M+1NUVriHlFfmD87QS6fw4+AH2FMF7acBg12hUS8Y2i0OKij0I0t6gv9wxmGMll2pDJs7k/TlxqlKRZBVdk6kGZDb8qmwDRmmpmKZvQPA8+Ern8JuFJVDwV6gfPd9vOBXlU9BLjS7YeIHAmcCbwCOAX4lohEpiDuqiiUhNubE8xva6IjGaO9KUYy5tMS82lxQ5lz+RwxX4r9iHO6q4qikIwjXmTMKLvSfsWb+9Nscck46ntkRvO2LJIx00RVk6+ILADeDnzHXRfgL4Db3S43AGe4y6e767jbT3T7nw7cqqojqroGeAF4XTXjniqFUXPzZjXR1ZqgszVBuxvKPLs5Tks8QswXVyKWMYM6CqXicgm50JCXcStyRH1ha3+avuFdJeJNfcM25NmYGqp2yfdfgX8ACsO4OoCdqpp11zcA893l+cB6AHd7n9u/uL3MfYpE5EIRWSUiq7Zt2zbZxzGlCg12c5IJFhSrKSIsdCXiQjLOK2UTciEpF5JxuZF2N7lJgETgJZeIbSIgY6ZO1ZKviLwD2Kqqq8Oby+yqe7htovvs2qB6tap2q2p3V1fXXsc7XRWqKeYkExzc2UJXa8JVTwSl4nCdcbiaojAFZrkS8f3PbuGsYxaTyeV5yc3GBpDLB7OxbbHSsDFVV83eDscBp4nIqUACaCMoCbeLiO9KtwuATW7/DcBCYIOI+MAsYEdoe0H4Pg3F84Q5yXjxeqEb28hoHs8TIt7YfsW+J2RdEg2PtCv0MYZd/Yo37RzmugfXsHzZQjqSMUZGcxw0q8lG1xlTJVX7ZKnqx1V1gaouIWgwu09V3wf8CljhdjsXuMtdvttdx91+n6qq236m6w2xFDgUeKRacc8khVJxoc44XE0xpzlWLB2XlogLkwCFJwK67sE1XHLCISxob2L+7CY8ETb2DVudsDFVUot+vpcBt4rIF4DfAde67dcCN4nICwQl3jMBVPUpEbkNeBrIAh9S1dzUhz0zhPsVQ9C3eGAkQ99wtlgijkY8RlzdbqFfcWHeCZGgcc4PTf6zvjdl80wYM8kkKFzWl+7ubl21alWtw5hW8vldI+08DwZGcqgb7PH8lsHivBPhUXX9w1m+cd/zxaqIrtY4B7UliEZnbE8/Y2qh7CgnG+HWIEpLxG3xPH3pDBEPFsxpKs47ER5V9437nufcY5dyw0NBXXDBfKsLNma/WfJtUIU+xgAtsRjbBkcYcvNOFBrnli9byA0PrRkzB7Hmlc0DaRu+bMx+suRrgtU52hJs3JlCoTgNZkcyNmYO4nKT+9h8EsbsG0u+Btg170S4ca6rNU5etTgHcenkPpAnk80znMmxY2iU5liEVCbH4jnNLOlMWgI2ZgJWcWeKPE+Y1RQvdldrTUTwRMad3KeQkFOZHA//aRtzkjEWdzQT9T0GRjK1PhxjpjVLvmY34VF1TdHIuJP7FBLy/c9u4e1Hz+eKnz/L81sH2dKfZmcqy+io9Qg0ZjyWfM2EOlvixMeZ3KeQkFd0Lyr2jHhsbQ+zmqLk8sqWwRFLwMaMw5KvmdDYOYjHTu5TSMgRT1i+bCEPPDe2BLy5L82m/rQlYGPKsAY3s0fhPsLtTXF2DmeKQ449D1KZPB3JGCu6F3HFz58tdk1rC5WA57XEbXCGMSGWfM1eKZ3cB6A1liPnupoVuqaNZJWbf7OG9x6zhJjvsSOVse5oxoRY8jX7LRqNcFBbgi2DIxwwK0EuD/c8vm63VZgz2WAEXX86y0g2z4FtCRspZxqWnflmUkSjEea1xItd0
wqzpRUmc4ddDXZx3yPnRspZfbBpVJZ8zaSJRiPFrmmlqzAXljTKaVAdIRLss2VwxBb3NA3Jqh3MpOpsiTOSzSGyq1tasD2YsKdQHbEzFQzCKFZFDGcZGbWqCNM47Cw3k6rQNa2tKTJmFebCkkaF6ojC/BFWFWEalSVfM+k8T2hLxMeswhz3vTHVEeWqImJ+0PthU3/aFvM0dc+Sr6ma8CrMB7Yl3JJGXnFxz8LCnvc8vpGzjlnM9sEMl//oKXamMojA5oG0LW9v6pYlXzMlfN9jQag6wvfGVkUUJm//yEmH0RSLsLU/TV9qlJjvkXFLGVkSNvXEGtzMlClURzT7UfrSGVKZPCPZPBE32GL5soX4XqS4lFFhPbnC/MF5VTb1DdtADVMXLPmaKVdYRWNWNs+m/mE8EdKjwRDlQte0nAbryRXmD+5PZxnO5IoLe3rAtoG0jZozM5ZVO5iaCVdFRH1xk7dTrI4Izx/cl8qSiHokExFSmSw7Uhn+vH2Imx5aQzQixVU1NvamWNczZA12Ztqz5GtqqlAVMb+tibZExDXKyW7zBxca5wqTuG/sTXP/s1s465jFZHJ5hjJZNvWl+Zw12JkZwqodzLTg+x6z/QSzmrS4lBEwZqBGuEahORYpNtQVFBrsPFdXbFUUZjqzkq+ZVsJLGc1q8ovzBxdKw4W/VCZHxJMxq2oUGuwqqaJ4qW/YqidMTVnJ10xLhSTcGo+xczgDKIPkyGtQDJ4/O0E04jESSpylDXaFUvHG3jQP/2lbsYpi+2CGex7fyHuPWYIfEff4MJLNk80r0YjH3Ja4DXM2VWXJ10xr4fmD25uU/nSGbE4hFkzknkdRl5C7WuOM5twk7xNUUdzz+MbidJdKsH//cJZ7Ht/IucctxRPoHc4gQC6vVl1hqmKPyVdEXgZsUNURETkeOAq4UVV3Vjs4Y8LCK2oUtMXz9KUzqAbJeHAkyLCFuSMgqKLoat2VMFd0L2JD7zAL5zSPSchnHbOYwZEsqko8GiGTzRe7t81piRV7VFgJ2UyGSs6aO4CciBwCXAssBb5X1aiMqVChz3BnazCMuVBXHHOLfvqRXVUUhfri8HSXhb9CybgwB3E2R7HuOOoL/cOZYo+KG13dcTQi9AyNsMF1b7NeFWZvVFLtkFfVrIi8C/hXVf03EfldtQMzZl+E64rHq6LwPW9MLwqgOMqukJRhV93x+h2pYim5UELO5PIMjmTpGcxw/7NbOPe4pVYyNnulkuQ7KiJnAecC73TbotULyZj9N1EVRcSjON1l3C3q6Xse6dH8bknZk7EJOVx3XNqQl8nli3XH4cY8S8amnEqS73nAxcAXVXWNiCwFbq5uWMZMvkIVBUBLLMYONxhDCKoncqrMn50oJmVPhExWxyTkSKixrVxf49LGvNKGvO1DI2TzSj6vJGIROpPWgNeopNBSXE+6u7t11apVtQ7DzDDZbFAyzuYUzxMirgEvPZorNsIVSsgQVEcs7mge8xi5vBYb8wB+8Oi6oGTsGu8gKGW3NUXJ5RU/IsxriRN1JXBTl8p+u1bS2+E44LPAYre/AKqqB09mdMbUWrhkXFDo3jYymsfzhKgrIasrJZf2NQ435sHYhryWuE9bU4SRbDB5/LnHLSUa8diRylh3tgZUSbXDtcDfAasBW9/FNJRydcctMde9rUxf49LGvHBDXngdu/Ea7TK5YO5iqx+uf5Uk3z5V/WnVIzFmhigtIYf7GvuRsY154Ya80snjYWyjXWHkXaF+ePNA2krDdayS5PsrEbkC+E9gpLBRVR+rWlTGzCClyTjcmBcNNeT5npDN67iNduGBHqVd2F7qG7YeE3WmkuT7eve/O7RNgb+Y/HCMmfkKa9cVFKspFFKjueLk8TB29F24frjcXBThoc95G/Y84+0x+arqCVMRiDH1KlwyzueVwUymbKNduH64dC6K8NBnERkz7Lmwxp0l4Zll3OQrIitV9WYR+Ui521X1q9ULy5j6VLqOXbjRL
uJKxOXmoiiUiBfOaR53jTtrrJtZJir5Jt3/1qkIxJhGUq7RbsiViEu7sJUOfS63xt3WgV3TZObyysa+YRLRCF1WEp62xk2+qvpt9/9zUxeOMY3J9z1m+QmSrn5YZOxcFKVDnwtJeG1PMO/EPY+vY3n3QnamMmSySmdLjIF0lkw2z4FtCSsFT0N7fEdE5DAR+aWIPOmuHyUin6p+aMY0nkKJeE4ywfy2JlrjPi3xYGa1wtDncmvcreheRF8qWHop5gupTI5kPEI0ImweSPNizxBbB2zWtemkkq/Da4CPA6MAqvoEcGY1gzLG7ErE7c1BIu5IxmhvitES37XQaKE0HPGEzpYYmawWp8Hc2p9mS3+avtQoiVgED1s+aTqpJPk2q+ojJduy1QjGGFNeIRHPm9U0Zt7iXWvceeR01yrPG3vTxURcWMuuMBcxUKwX3mJzENdMJf18t7vVLBRARFYAL1U1KmPMhErXuPNEyStksvniNJila9kVZlzbtHOY6x5cw/JlC+lIxhgZzXHQrCarF55ilSTfDwFXAy8XkY3AGuB9VY3KGFOR8Bp3yWiOzQPBZO7h4cwFheWTbnlkHZeccAjR0OxqmwfSNrvaFKvkq05V9S1AF/ByVX1jhfczxkyhaDTCgW0JYq5xzvekuHRSePmk845bSlPUI6fB7GoxX/A9oSeVYUNvyhrnpkila7ihqkOqOuC23V69kIwx+8r3PRa0NzO3NV5smCusZRfzgxnXDpiVIOJFxoyc2xJqnBMRUiM5NvUNMzpqExlWy0Qj3F4OvAKYJSLvDt3UBiTK38sYU2vhaTDzeS2uZVeYcU1E8EQnnGt4Rfci8ipsGRxhbjJGLFZJDaXZGxO9oocD7wDa2bV2G8AAcEE1gzLGTI7S+YhbYjG2DY4wlMuNO9fw24+ez+2rXmRF9yIy2TxbhzLMBUvAk2yPywiJyBtU9TdTFM+ksGWEjBlfPq9s3JlC3FwShdWZPQm6oN2+6kXefvT84nBlT4S8qg1X3ndlX7BK6nzXi8idIrJVRLaIyB0ismCSgzPGTBHPE+a3N9PWFCmOnCs0zkU8YUX3ojHd0j7/46d4dvMA63ek2NCbssEZk6SS5HsdcDdwEDAf+JHbZoyZoQqzqxVGzu0aNecVE/DG3jTXPbiGS044hDnNMTqSMTI55aX+tCXgSVBJ8p2rqtepatb9XU/Q7WxCIpIQkUdE5HEReUpEPue2LxWR34rI8yLyfRGJue1xd2LpSF4AACAASURBVP0Fd/uS0GN93G1/TkRO3qcjNcbspjByrrM1GDXX1hQpJuBCtzRh13wRcd8r9gu2nhD7p5Lku01EVopIxP2tBHoquN8I8BeqejTwKuAUETkG+BJwpaoeCvQC57v9zwd6VfUQ4Eq3HyJyJMFcEq8ATgG+JSLWE9yYSVYoDc9NxsZ0SysMU86p8vkfP8VzWwbY3JdmU78l4P1RSfL9APCXwGaCYcUr3LYJaWDQXY26v8LyQ4V+wjcAZ7jLp7vruNtPFBFx229V1RFVXQO8ALyugriNMfsgFvOZm4wVu6UV5ov4xn3PF6sgFsxOEPGETf1pm6RnH02YfF0Jc7mqnqaqXao6V1XPUNV1lTy4Kyn/HtgK3Av8CdipqoWJeTYQ1CPj/q8HcLf3AR3h7WXuY4ypgljMZ+mcJPGIVxymXKiCKEzeftNDa8hk86RHc2weSJPJ2Hxbe2PC5KuqOYKS5z5R1ZyqvgpYQFBaPaLcbu5/ue4YOsH2MUTkQhFZJSKrtm3btq8hG2Mc3/eY15Yg7nvkFQ6YlWDH0GhxdFyhP3Aur8X+wJaAK1dJr+kHReQbwPeBocLGvVk6XlV3isj9wDFAu4j4rnS7ANjkdtsALAQ2iIgPzAJ2hLYXhO8Tfo6rCSYAoru72walGzMJPE84sC3Bpv5htGTy9kJ/4POuf5QNvcMsmN3Ev69cxhHzWm2GtApU8godS9DYdTnwFff35T3dSUS6RKTdXW4C3
gI8A/yKoN4Y4FzgLnf5bncdd/t9GowAuRs40/WGWAocCpTOL2yMqZLCfBFx1wgX7g98yX88xobeYQA29A7zwZtXs3VwpMYRzwzVXDr+QOAGV2/sAbep6o9F5GngVhH5AvA74Fq3/7XATSLyAkGJ90z3/E+JyG3A0wSTuH/IVYcYY6aI5wlzWxOkMjlGczkS8SiZbL6YeAs29A6TzVnjWyX2mHxFpAP4DPBGgrrW/wEuV9UJu5u55YZeXWb7nynTW0FV08B7xnmsLwJf3FOsxpjq8TxhSUeyOHn7ILBgdhNdLXEuPv5ltDdFSWVyJGxO4IpUUud7K/AAsNxdfx9B/e9bqhWUMWZ6Ck/e3uxnue6817J9YISP3v5Esd73mrO76bQ5IPaokuQ7R1U/H7r+BRE5Y9y9jTENIRbzmZWIcsXPnuXT7ziS9qYoO4dHufIXz/HFdx1FV2t8zw/SwCppcPuViJwpIp77+0vgnmoHZoyZ/lSVc49dyh2r17NzeJSOZIyPve0IZPfeoKZEJSXfi4CPADe56xFgSEQ+QjCQra1awRljprecwg0PreHcY5dy2R27qh6+vXIZc5JW9TCRSno7tE5FIMaYmUdVWb5sITc8tIYrVhzFwtlNKMJoLs+W/jTz2hKWgMdhU9MbY/ZZzI+wYHYTl5xwCAL0DGXYMTRKcyxCKpNjeDTHko6kJeAyLPkaY/ZZRzLGSDbH81sGaYn7ZPN5Pn3Xk8XqhytWHEV7c7TYQ8LsYsnXGLPPvJJ14M7+7iN0tcSLvR9SmRyjNuNZWROtXjxnojuq6o7JD8cYM9PE/aCKIafQ1RLnM6cdSe/QaPH2gZEsXXm1qocSE5V8V7NrVrFFBBOfC8Fqxi8CS6senTFm2utIxljc0YwnwidOPYLhTG63qoc5yZhVPZQYt5+vqi5V1YOBnwPvVNVOVe0gWE7+P6cqQGPM9FYYdtze7HPArERxtBsEcz189PYnGM7YdCylKhlk8VpV/Unhiqr+FHhz9UIyxsw0nie0N8dRtOxkOzkbc7GbShrctovIp4CbCaohKl3DzRjTYBLRoOvZsQd3cMGbDibilqRvTdj8vqUqeUXOIlit+E731+W2GWPMGJ3JOLdc8HpWvmExV/z8WZ7fOsjOVIb+4ZwttlmikhFuO4APi0hLaEFMY4zZjecJEc8rLrbZOzRKLq/8edsQo7k8B3e2WK8HZ48lXxE51k2A/rS7frSIfKvqkRljZqTRXJ7zjlta7PXwV1c/zKfvepJtAyPsHM7UOrxpo5I63yuBkwmW80FVHxeRN1U1KmPMjBWNeBwwK8HZ19qAi4lUVAuuqutLNlnljTGmrLktcTwRulri/P3Jh/PY2h7muL7Ao3m1ul+nkpLvehE5FlARiQGXEiyEaYwxu/F9j6ZohEtPPJQHnttiKxyPo5LkezHwNWA+wTLu/wV8qJpBGWNmts6WOEs7kyyY3cR51z86pvph+8AIO1rjzG1L1DrMmqqkt8N2gnXbjDGmIp4nNMcjDKa17HwP/enRhl/nbaKJdf5BVf9FRP4Ndl8TRFUvrWpkxpgZrTMZJ5PNF+d7uOWRdSxftpCOZAxPhP50hvbmxp3vYaKS79Pu/6qpCMQYU188T5jXEkcVPv/jp8ouNdSWiDVs6Xei5PtXwI+BdlX92hTFY4ypI9FoZLelhg5oS5BTZftghp3DmYad7Wyi5sZlIrIY+ICIzBaROeG/qQrQGDOzJWK7lhpqjkVY25OiZzDDcCbHjqEM+XxjzrozUcn3KuBnwMEEc/uGfxuo226MMRMq1P2+tDNddpmhRp3rd9zkq6pfB74uIv+uqh+cwpiMMXWkUKdrywyNNVFvhzZV7Qc+Wa6awZYRMsZUKu5HSI/mi6Pewg1vV61cRldr4y0xP1Gd7/fc/9UEPR5Wh/6sB4QxpmIdyRhx3+PSEw8tJl4IJlq/+ObV9Aw13oQ7E1U7vMP9t7XajDH7xfOEA9sSZPNB6Tfc62FzX
5p8vvGqHiqZUvKXlWwzxpiJ+L5HW5PPZ087EoB/+ukz/GnbEIlohHQ2T7bB6n4nqvNNAM1Ap4jMZldvhzbgoCmIzRhTZ3J52DE0yi2PrNt90MXZyzjigLaGqfudqOR7EUH97ssZW997F/DN6odmjKk3o9k8zbEIy5ct3K3u96KbGqvud6I6368BXxORv1HVf5vCmIwxdSrmR0hlcnQkY2VXOc5kG2eu30pmNfs3N5/vkvD+qnpjFeMyxtShDjepOsCC2U1jEvCC2U3E/EitQptye0y+InIT8DLg9+xawUIBS77GmL3iecKSjiT96QzXn/da1u8YpjkWlIYXdzTTkYzVOsQpU8lk6t3AkaramAOwjTGTyvOEtkSMl/pGxgw1vuac7lqHNqUqWcfjSeCAagdijGkcPUMZLrhx1ZgGtwtuXGUNbiU6gadF5BFgpLBRVU+rWlTGmLqWyeaswa2CfT5b7SCMMY0l5kcavsFtj9UOqvprYC0QdZcfBR6rclzGmDrWkYxxzTndLJjdBFCs87UGtxARuQC4EJhD0OthPsFcvydWNzRjTL3yPOHwea3ceclxZLI5Yn4kWNutQUa3QWXVDh8CXgf8FkBVnxeRuVWNyhhT9zxP6GptvEnUCyrp7TCiqsUmSBHxKbOasTHGmMpVknx/LSKfAJpE5K3AD4AfVTcsY4ypb5VUO3wMOB/4A8FkOz8BvlPNoIwxjSGbzbN1cITRXJ5oxGNuSxzfr6RMOPNVMrdDHrjG/RljzKTIZvM8u2WAi29ePWZJoZfPa22IBDzRfL5/YIK6XVU9qioRGWMawtbBkWLihV1LCt120Rs4qL2pxtFV30Ql33e4/x9y/29y/98HpKoWkTGmIYzm8mVHuWVzjbGixUTz+a4DEJHjVPW40E0fE5EHgcurHZwxpn5FI17ZUW5+pP6rHKCy3g5JEXlj4Yqb2zdZvZCMMY1gbkucq1YuGzPK7aqVy5jb0hh9fyvp7XA+8F0RmeWu7wQ+UL2QjDGNwPc9Xj6vldsuegPZXB7fejuMpaqrgaNFpA0QVe2rfljGmEbg+15DNK6VM1Fvh5WqerOIfKRkOwCq+tUqx2aMaQD5vNIzlGm4OR4mKt8X6nVbx/mbkIgsFJFficgzIvKUiHzYbZ8jIveKyPPu/2y3XUTk6yLygog8ISKvCT3WuW7/50Xk3H08VmPMNJPPK89tGeBd33qQ4770K971rQd5bssA+Xz9z2Ag1VodSEQOBA5U1cdEpJVg2fkzgPcDO1T1n0XkY8BsVb1MRE4F/gY4FXg98DVVfb2IzAFWESxnpO5xlqlq73jP3d3dratWrarKcRljJs+2gRE+eecTLF+2kPamKDuHR7lj9Xq++K6j6mnSnbLF+KrVbKvqS6r6mLs8ADxDMB3l6cANbrcbCBIybvuNGngYaHcJ/GTgXlXd4RLuvcAp1YrbGDN18vk85x67lDtWr2fn8CgdyRgfe9sRSAPM3TUlzYoisgR4NcG0lPNU9SUIEjRQmJ5yPrA+dLcNbtt420uf40IRWSUiq7Zt2zbZh2CMqYKcwg0PreGSEw4hFvHI5ZV1PSl2Do/WfdXDuMk3VEd73Hj7VEJEWoA7gL9V1f6Jdi2zTSfYPnaD6tWq2q2q3V1dXfsWrDFmSqkq5x23lOFMjlseWcfO4VFaEz6eCP3p+l5Mc6KS73nu/7/t64OLSJQg8f6Hqv6n27zFVScU6oW3uu0bgIWhuy8ANk2w3Rgzw8X8CAfMSnDdg2uK1Q89Qxl2DGXoG86SzdbvUOOJku8zIrIWONz1Pij8/UFEntjTA0vQJ+1a4JmSbml3A4UeC+cCd4W2n+N6PRwD9LlqiZ8DJ4nIbNcz4iS3zRgzw3UkY3giLF+2kBseChLw53/8NCuu+g3v+85veW5r/fZ8mLC3g4gcQJDodlsmvjD3wwT3fSPw3wTzABe+vj5BUO97G7AIeBF4j6rucMn6GwSNaSngPFVd5R7rA+6+AF9U1
esmem7r7WDMzLG1P82LO1L0DGW4Y/X6euz5ULa3Q0VdzUQkBhzmrj6nqqOTGNiks+RrzMyRzysbelMMjGTZmRrlsjueKM7v+6XlR3FIV5J5s2b0KLh962omIm8Gnge+CXwL+KOIvGlyYzPGNCrPEw6a1cSspmgx8UIwveRldzxBrj5rHSqaWOerwEmq+hyAiBwG3AIsq2ZgxpjGUZhMZ0PvMK9e2M7Fx7+sWPUQqdORxpX0840WEi+Aqv4RiFYvJGNMI4r7EU46ci6fOe1IYm5O31jEY2AkW5eNbpWUfFeJyLWMXclidfVCMsY0oo5kjM+d9grW9qT49F1PFut9r1hxFHOSMeYkZ3Sj224qKfl+EHgKuBT4MPA0cHE1gzLGNB7PE/IKH719bL3vR29/guFMrsbRTb5K5vMdIaj3tSkkjTFVlVMtu65bPTa6NcaU8caYGSERjRSXFSpYMLuJRLT+UlX9HZExZsbqTMa55pzuMeu6XXNON511Vt8LFVQ7iMgrVfXJqQjGGNPYPE84fF4rd15yXN2vbFFJb4er3Ai364HvqerO6oZkjGlkniczfThxRfZY7aCqbyToXraQoNvZ90TkrVWPzBhj6lhFdb6q+jzwKeAy4M3A10XkWRF5dzWDM8aYelXJ3A5HiciVBMsA/QXwTlU9wl2+ssrxGWNMXaqkzvcbwDXAJ1S12AFPVTeJyKeqFpkxxtSxSpLvqcCwquYARMQDEqqaUtWbJr6rMcaYciqp8/0FEO713Oy2GWNMVeTzyraBETb2ptg2MNKwE+skVHWwcEVVB0WkuYoxGWMaWD6vPLdlgAtuXFWcXOeac7o5fF5rXfX3raTkOyQirylcEZFlwPAE+xtjzD7rGcoUEy8EcztccOMqeobqazXjSkq+fwv8QEQKKwYfCPxV9UIyxjSyTDZXdnKdTLa+ZjarZFazR0Xk5cDhBGsRPTvd13AzxsxcMT+YXCecgBfMbiLmR2oY1eSrdGKd1wJHAa8GzhKRc6oXkjGmkXUkY2Un1+lIxmoc2eSqZGKdm4CXAb8HCuV+BW6sYlzGmAbVKJPrVFLn2w0cqZWsMW+MMZOgESbXqaTa4UnggGoHYowxjaSSkm8n8LSIPAKMFDaq6mlVi8oYY+pcJcn3s9UOwhhjGk0lXc1+LSKLgUNV9RdudFt99fkwxpgpVsmUkhcAtwPfdpvmAz+sZlDGGFPvKmlw+xBwHNAPxYnV51YzKGOMqXeVJN8RVS0OqhYRn6CfrzHGmH1USfL9tYh8Amhya7f9APhRdcMyxpj6Vkny/RiwDfgDcBHwE4L13IwxxuyjSno75AmWEbqm+uEYY0xjqGRuhzWUqeNV1YOrEpExpuHl80rPUMbmdghdTgDvAeZUJxxjTKOzlSwcVe0J/W1U1X8lWDbeGGMmna1k4YSXECJI1t1Aa9UiMsY0NFvJYpevhC5ngbXAX1YlGmNMw2uUlSwq6e1wwlQEYowxsGslix8+tp4V3YuIeELM92hPVFJWnDkqqXb4yES3q+pXJy8cY0yj8zzhkM4k73jVAs67/tFio9tVK5fx8nmt+H6lq59Nb5UcRTfwQYIJdeYDFwNHEtT7Wt2vMWbSbRvK8MGbV49pdLv45tVsHRzZwz1njkonU3+Nqg4AiMhngR+o6l9XMzBjTOMazeXLNrplc/kaRTT5Kin5LgLCfTwywJKqRGOMMUA04hVXLy5YMLsJP1IfVQ5QWfK9CXhERD4rIp8BfoutXGyMqaK5LXGuWrlszPLxV61cxtyW+llUUypZlNj19f3f7uoDqvq7qka1n7q7u3XVqlW1DsMYsx+y2TxbB0fI5vL4EY+5LfGZ2thWdlhepX03moF+Vb1ORLpEZKmqrpm82IwxZizf9ziovWnPO85QlXQ1+wxBj4fDgeuAKHAzweoWxhhTNfU8wU4lJd93Aa8GHgNQ1U0iYl3MjDFVVe8T7FRSgZLRoGJYAUQkWd2QjDGm/ifYqST53iYi3
wba3UrGv8AmVjfGVFm9T7BTydwOX3Zrt/UT1Pv+o6reW/XIjDENrd4n2Jmw5CsiERH5hareq6ofVdW/t8RrjJkKhQl2wn19rzmnm45krMaRTY4JS76qmhORlIjMUtW+qQrKGGM8Tzh8Xit3XnJcXfZ2qKTONw38QUSuFZGvF/72dCcR+a6IbBWRJ0Pb5ojIvSLyvPs/220X97gviMgT4QncReRct//zInLuvhykMWZm8jyhqzXO/NnNdLXG6ybxQmXJ9x7g08ADwOrQ355cD5xSsu1jwC9V9VDgl+46wNuAQ93fhcC/Q5Csgc8ArwdeB3ymkLCNMfUvn1e2DYywsTfFtoER8vk9j8idKcatdhCRRar6oqresC8PrKoPiMiSks2nA8e7yzcA9wOXue03ui5tD4tIu4gc6Pa9V1V3uJjuJUjot+xLTMaYmaPQz/fKe59j+bKFdCRjDGeyHDSraaYOMx5joiP4YeGCiNwxSc83T1VfAnD/57rt84H1of02sGv+4HLbjTF1rmcow5X3Pse5xy7ljtXr6RnKsHVghI19w2SzM39qyYka3MKVKwdXOY5yFTk6wfbdH0DkQoIqCxYtWjR5kRljaiKTzbF82UJueGgN5x67lMvueKI40u3bZy/jiAPaZnQd8EQlXx3n8v7Y4qoTcP+3uu0bgIWh/RYAmybYvnuwqlerareqdnd1dU1SuMaYWin0bli+bGEx8UIw0OKim1bP+JFuEyXfo0WkX0QGgKPc5X4RGRCR/n18vruBQo+Fc4G7QtvPcb0ejgH6XLXEz4GTRGS2a2g7yW0zxtS5jmSMua1xOpKxuhzpNm61g6ru1zASEbmFoMGsU0Q2EPRa+GeC4crnAy8C73G7/wQ4FXgBSAHnuRh2iMjngUfdfpcXGt+MMfXN84SDZjWhwElHzuW845ZyQFuCnCrbBzM0xWb2SLeKJlOfaWwydWPqx+hojrW9KVIjWXzPo60pSi6v+BFhXkucaHTaJ+GyFdMzv7+GMaau7Uxn2Tk0SiLqkVPl5t+sIeYH+WxTf5pNO2dm7wdLvsaYaS2TzdHZEiPiRbjn8Y2cdcxitg9muPxHT7EzlcGPCFsG0qzrGZpRidiSrzFmWov5EXIKnsCK7kVkc/CN+57nIycdxqzmKJv70tz40Boy2Tzp0RybB9JkMtlah71Hla7hZowxNdGRjJEezTGSzRNx/XqXL1uI70XI5uCexzfy9qPnc971jxb7Af/7ymUcMa91Wo+Es+RrjJnWPE84sC3Bpv5hPBHSo/lgdjPXjLWiexHnXf8oxx7cwQVvOpiIJ+QV+tIZOloStQ1+AtP3a8EYYxzf91jQ3kxbU4SoH8x0llfIK0Q84diDO1j5hsVc8fNn2dKfJuLB4EiOrf3paTsZjyVfY8yM4HlCWyLO/LYm2hIRYr7gRyDme1z45pfxjfue55ITDqE5FiE1kiPme2Syedb3pqZlErZqB2PMjOL7HrP9BLOalP50BlByeWX5soWMZpW2pggiwtb+NJms0tkSoz+dZSSb58C2xLSpB7bka4yZkTxPaG+OA5DKDNORjNGRjJFT6B3KEPOFmC+s7UkFpeFMjpFsjoM7W6bFhDzT4yvAGGP2w9yWOF2t8WKXtM6WGJmsksrkuOWRdewcHqU14eOJuNJy7VnyNcbMeL7vcVBbgrjvkVfIaZCAr3twDZeccAixiEcur6zrSbF9MDMt6n+t2sEYUxei0UixSxpAJqucd9xShjM5Pn3Xk8U+wFesOIo5yRhzkvGaxmvJ1xhTNwpd0gZGMvQP5zhgVoKzr32ErpY433zvq5nbliCXUwbSWbI5pbOldotyWvI1xtQVzxNmNcVJRvNs7BumqyXOPy1/JZ4Ia7cP8dHbd62Icc3Z3Rx+QGtNErAlX2NMXfJ9j6ZohEtPPBTfi7B+R4pbHlnHFSuOGjMv8M7hTE2qICz5GmPqVmdLnKWdSfIa9Pe95IRDGM7kOPu7j4xZD669KTblp
V/r7WCMqVueJzTHI+QVmmI+vUOjxWoHqO16cJZ8jTF1rTMZJ+4Lvic0xyLTZj04S77GmLrmecL89mbiUY9UJseC2U1jbl8wu4mYP/VLEVnyNcbUPc8T5rUmWNzRzBUrjiom4AWzm7jmnG46krEpj8ka3IwxDcHzhCUdSdqbo3z/wmPIKSSiHp3J2vT1teRrjGkYnidBt7JkrSOxagdjjKkJS77GGFMDlnyNMaYGLPkaY0wNWPI1xpgasORrjDE1YMnXGGNqwJKvMcbUgCVfY4ypARvhZoxpKPm80jOUIZPNEfMjdCSnfi5fsORrjGkg+bzy3JYBLrhx1a6lhM7p5vB5U7+UkFU7GGMaRs9Qpph4IZjL94IbV9lk6sYYU02ZbM4mUzfGmKkW8yM2mboxxky1jmSMa87ptsnUjTFmKnmecPi8Vu685Djr7WCMMVPJ84Su1nitw7BqB2OMqQVLvsYYUwNW7WCMaTjTYZSbJV9jTEOZLqPcrNrBGNNQpssoN0u+xpiGMl1GuVnyNcY0lOkyys2SrzGmoUyXUW7W4GaMaSieJxza1cJtF72B0VyeaMRjbkvcejsYY0w15fPK89sGa97bwZKvMaahFHo7dLXEuWLFURzQliCnsKU/zby2xJQlYEu+xpiGksnm6GqJ85nTjkSAtT0pmmMR1mdyDI/mWNKRnJIEbA1uxpiGEvMjXHrioYxmlVQmx8N/2sacZIzFHc1EIx4DI1PT39eSrzGmoXQkYyztTNLZEuP+Z7fw9qPnc/uqF8nlldFcnoF0jkwmW/U4LPkaYxqK5wnN8Qg5hRXdi7jn8Y3FBJyIBinxpYERNu0cJpvNVy+Oqj3yJBORU0TkORF5QUQ+Vut4jDEzV2cyTtz3iHhSTMBnHbOY7YMZLv/RU+xMZRCBzQNp1vUMsbU/TT6vkxrDjGhwE5EI8E3grcAG4FERuVtVn65tZMaYmcjzhAPbEmweSJPJ5lnRvYhsDr5x3/N85KTD8ERYu32Ij97+xK7uaGd3c/gBk9cdbaaUfF8HvKCqf1bVDHArcHqNYzLGzGC+7zGvJU7MlYA9geXLFuJ7ETb2pouJF9zkOzdN7uQ7MyX5zgfWh65vcNuKRORCEVklIqu2bds2pcEZY2amaDTC3GSMmO+R16AxzhNojkWqPvnOTEm+5cr5YypgVPVqVe1W1e6urq4pCssYM9PFYj5zkzGifrC2W14hlclVffKdmZJ8NwALQ9cXAJtqFIsxps7EYj7z25poS0SI+cL82QmuWHHU2Ml3zp7cyXdmRIMb8ChwqIgsBTYCZwLvrW1Ixph64vses/0Es5qU/nSGWYkot154DPm8kohG6JzkyXdmRPJV1ayI/B/g50AE+K6qPlXjsIwxdcjzhPbm6i8tPyOSL4Cq/gT4Sa3jMMaYyTBT6nyNMaauWPI1xpgasORrjDE1YMnXGGNqwJKvMcbUgCVfY4ypAUu+xhhTA5Z8jTGmBkR1cicIng5EZBuwbj8eohPYPknhTAd2PNObHc/0tr/Hs11VTyndWJfJd3+JyCpV7a51HJPFjmd6s+OZ3qp1PFbtYIwxNWDJ1xhjasCSb3lX1zqASWbHM73Z8UxvVTkeq/M1xpgasJKvMcbUQEMnXxE5RUSeE5EXRORjZW6Pi8j33e2/FZElUx9lZSo4lo+IyNMi8oSI/FJEFtcizkrt6XhC+60QERWRad26XsnxiMhfuvfoKRH53lTHuDcqON8WicivROR37pw7tRZxVkpEvisiW0XkyXFuFxH5ujveJ0TkNfv9pKrakH8EK2L8CTgYiAGPA0eW7HMJcJW7fCbw/VrHvR/HcgLQ7C5/cLoeS6XH4/ZrBR4AHga6ax33fr4/hwK/A2a763NrHfd+Hs/VwAfd5SOBtbWOew/H9CbgNcCT49x+KvBTgsV8jwF+u7/P2cgl39cBL6jqn1U1A9wKnF6yz+nADe7y7cCJIjJ5izhNnj0ei6r+SlVT7urDBIuQTleVv
DcAnwf+BUhPZXD7oJLjuQD4pqr2Aqjq1imOcW9UcjwKtLnLs5jmC96q6gPAjgl2OR24UQMPA+0icuD+PGcjJ9/5wPrQ9Q1uW9l9VDUL9AEdUxLd3qnkWMLOJ/gWn672eDwi8mpgoar+eCoD20eVvD+HAYeJyIMi8rCI7DYiahqp5Hg+C6wUkQ0Ey3/9vB43dwAACjxJREFUzdSEVjV7+xnboxmzhlsVlCvBlnb9qGSf6aDiOEVkJdANvLmqEe2fCY9HRDzgSuD9UxXQfqrk/fEJqh6OJ/hV8t8i8kpV3Vnl2PZFJcdzFnC9qn5FRN4A3OSOJ1/98Kpi0nNBI5d8NwALQ9cXsPtPo+I+IuIT/Hya6KdJrVRyLIjIW4BPAqep6sgUxbYv9nQ8rcArgftFZC1BHdzd07jRrdJz7S5VHVXVNcBzBMl4OqrkeM4HbgNQ1d8ACYI5Emaqij5je6ORk++jwKEislREYgQNaneX7HM3cK67vAK4T13t+zSzx2NxP9O/TZB4p3N9IuzheFS1T1U7VXWJqi4hqMM+TVVX1SbcParkXPshQaMoItJJUA3x5ymNsnKVHM+LwIkAInIEQfLdNqVRTq67gXNcr4djgD5VfWm/HrHWrYw1buE8FfgjQcvtJ922ywk+yBCcMD8AXgAeAQ6udcz7cSy/ALYAv3d/d9c65v05npJ972ca93ao8P0R4KvA08AfgDNrHfN+Hs+RwIMEPSF+D5xU65j3cDy3AC8BowSl3POBi4GLQ+/PN93x/mEyzjcb4WaMMTXQyNUOxhhTM5Z8jTGmBiz5GmNMDVjyNcaYGrDka4wxNWDJd4qISE5Efi8iT4rID0SkeYqf/wwROTJ0/XI36KKaz3mLmwHq76r5PHuI4XgR2e8hyK5/530i0rbnvff4WO8XkYP28j5Lys24tb/HJyLvEZFnRORX+/oY04GIvENEPlfrOPaGJd+pM6yqr1LVVwIZgj6ERe7DXZX3w43OO4Og7yUAqvqPqvqLajyfe84DgGNV9ShVvbJazzOFTgUeV9X+/XkQEYkQDIveq+RbRecDl6jqCeGN7pyZSe4BTpvqQs3+sORbG/8NHOJKM8+IyLeAx4CFInKWiPzBlZC/VLiDiAyKyFdE5DE3H2+X2/4qNxHLEyJyp4jMdtvvF5H/JyK/Bi4DTgOucKXvl4nI9SKywu17opt39Q9uXtO4275WRD7nnvMPIvLy0gOR/9/emcdYUWVh/Pc1tjYgQoxoROPgoA2RqBjEGU0TmYzyz2jUKJLY7jsa1MRlxiXEJVEnkBkDhjCBgXbBKLjEuMQGlxYR0iDYgkvTbqAzwaiJomxuffzjnCdFWa+XxOY18X7Jy7t169a5595bdercU3W/kmokzYv9b0oqXcSLgP2jvrG5YyZE+96StCTyhkp6LepaLemEyB8n6VVJCyS1SbpXUr2kFVHnsCjXIGlWyGiTdEqBrv2jfStD19Mif2TIa4l+LFrWWw88nZHzXOj/tqSJXejHKZKW4pwHxwLzo76+kkZHG1dJalSwZUX+W5KWA1d3cD7tE2P/bvRBlaRLJP1y05N0maR/5fpjClAHzJI0Ve6RL5T0TIwfkm6M/lqT9Swl3Srn831RPsO5IfKbFMu8Je0nX/6NpD5RR0nWFZnxbZL0uKRWSfMlZw6UNEbSsuiDFZIGxPiOyujxuqSjzBcsNAG/Gvdei0qvLPm9/IDN8b8HfhFPAoYC7cCfY98QfFnm4Cj3MnB67DOgPtJTgPsjvQY4MdJ3AvdFugmYmam/ATgrv42v4vsUqI38B4HrIr0emBzpq4A5Be26HpgX6RGhf020rRw36lrgoEgPiv9+QE2kDwfeiPQ44GvgQGAv4P/AHbHv2kx7G4AXcIficHyVUk0c/2yUuRs4t1QvvkKrPzAj07d7An0LdN4ADIj0mcDszL6BXejHmzLlm4gVUkA1sAwYHNsTgbkFYzu1qD+jfdtxbt0+wOIY1/74aqzqKLcMO
LLg+KwuF0a/7Rvb43FeXkW/Povz3o6OMeyH00Z+ANxQIG8/gscXuBy4LdJ7AW8Ah4b+m3CuhCpgOX5D2BNfXj0mjtkHvyYuyIx5LXGexHY9MKPS13pXf8nz3XXoK6kFP+k+Af4b+RvM+UEBxgBNZvaFOYXlfPxkBzfSj0X6YaBO0kDceL0a+Q9kypMp3xGGAx+bWVsZGU/G/yrcoOZRBzwEYGatuJGq7aTO14EGSZfhBgPcCM2WtBZf0n1EpvxKM9toTgb0IeGV4QYgq9MCM2s3s/fxCzfvqY8H/hHj0IQbzEPwC/4WSX8H/mBm2wp03tfMvs3Ue5Kkf0oaa2ab6Lwfy43FcJwkaHHodRtwcMHYPlTmeIAV5ty6P+HLZOvMbAt+8z4lZizVZra2AxklLDazEnnU+Pi9ic/MRuA3trHAU2a21TwMk+d1KMJ4nBuhBWjGqVlLM4wVZvY/c8azFnxMhwMbzWwlgJl9E9fEwmhTNXAxftMt4XN6TzinU+xucZ3dGdvMbFQ2I2ZXW7JZ3ZDXlXXhWzov0mmdJfaznyg+X7pNLm9mV0r6E/A3oCWmkZNx7omjcQ8oS5CeZWBrz2y353TK90kRReiZZrYul/+epObQp1HSpWb2cq7Mj5Kqwri3SRqNx4HvkbSIzg1QubEQ8I6ZHb9TpjSoQP9yKNfuOcAtQCswr4uy8ufjPWb2n5xu13Wg24/sCGfW5GRNNrPGnKxx7Dy+pfNMRXWY2VZJi3Fy87PxEE4JNUDRjbNXInm+vQvNwIkRK+uDxwdLnk8VPp0EOAdYGh7XV9oRUz0vUz6Pb3EqxjxagaGSDuuCjCIswad7SKrFPcm8cdsJkoaZWbOZTQG+xKn6BuKeTnvo0KcjGWUwIeKdw/BpeF6PRmByJqZ4TPz/EfjIzKbjRvSoAtnrQibyNxW2mtnDwDT88zPd6cfsWKwDBss5b5FULWmkOY/vJkl1Ua6+g3YfJ2cYq8LDFksBzKwZ79tzcI+4u2gELpa0d+h2kKT98TE/I+LVA4BTM8esx8MSsON8LcmaFB4rkmol9e+g7lZgiKQxUX6AdjwEnANMx2dEWYrXWqDwG2y9Ecnz7UUws42SbgZewe/8z5vZ07F7CzBS0io8RjYx8i/AH5j0w6faF5UR/yg+rb+GzEVhZtslXQQsjJN7JTCrG2rPjPrX4l7PhWb2nTr+2tJU+UMtAS/hzFczgSckTYj2d8Vrz2MdbvAOwNmotuf0uAu4D1gTBng9/oBmIv7VhR+Az/DYeR7P4fHJD4Ajow3tOAvWpG72YwPeZ9uA4/HxmB6hhj1Cx3fwsZwraStuvMphOXBv6LUEeCqzbwEwyuLzRN2BmS2S00Euj37cjMfMV0t6DA8RbMAfIJcwDVgg6Tw87FHCHDycsDr6/gv8DZxydX8vf5A5Q1Jf3KM9CX92skrSN/zam/8LcHN321kpJFaz3QSSNpvZ3pXWo7dCUgP+YO3xHpJ/IP4Nr5N7Qn5PQf4O8L/N7KUerON23ChO66k6cvUNwWP2I2KmhKQDgEfM7K+7QoffAinskJDQBZgTZ8/Wb7DIYldA0iBJbfizhh4zvLsaks7Hw3O32s6fJDoEf/Nmt0HyfBMSEhIqgOT5JiQkJFQAyfgmJCQkVADJ+CYkJCRUAMn4JiQkJFQAyfgmJCQkVADJ+CYkJCRUAD8Dsu11nJTCETEAAAAASUVORK5CYII=", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "fig_ident_prot = sns.relplot(x='SampleID', y=COL_NO_IDENTIFIED_PROT, data=sample_stats) \n", - "fig_ident_prot.set_axis_labels(\"Sample ID\", \"Frequency of identified proteins\")\n", - "fig_ident_prot.fig.suptitle('Frequency of identified protein groups by sample id', y=1.03)\n", - "_savefig(fig_ident_prot, 'identified_proteins_by_sample')\n", - "\n", - "fig_ident_prot_dist = sns.relplot(x=COL_PROP_SAMPLES, y=COL_NO_IDENTIFIED_PROT, data=sample_stats)\n", - "fig_ident_prot_dist.set_axis_labels(\"Proportion of samples (sorted by frequency)\", \"Frequency of identified proteins\")\n", - "fig_ident_prot_dist.fig.suptitle('Frequency of identified protein groups by sample id', y=1.03)\n", - "_savefig(fig_ident_prot_dist, 'identified_proteins_ordered')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "### Protein-Groups do not match well\n", - "- Are the assigned protein-group names sensible inbetween runs? \n", - " > Each sample has at least 80 percent missing values regarding all identified proteins\n", - "- Clustering by MaxQuant into protein-groups is sample dependent and therefore some groups might represent a very similar set of peptides" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "COL_NO_MISSING_PROT_PROP = COL_NO_MISSING_PROT + '_PROP'\n", - "sample_stats[COL_NO_MISSING_PROT_PROP]= sample_stats[COL_NO_MISSING_PROT] / float(X.shape[1])\n", - "\n", - "# from ggplot import *\n", - "# ggplot(aes(x='nan_proc'), data = nonnan) + geom_histogram(binwidth = 0.005) #+ ylim(0,0.025)\n", - "sns.set(style=\"darkgrid\")\n", - "g = sns.relplot(x='prop_samples', y=COL_NO_MISSING_PROT_PROP, data=sample_stats)\n", - "plt.subplots_adjust(top=0.9)\n", - "g.set_axis_labels(\"Proportion of samples (sorted by frequency)\", \"proportion missing\")\n", - "g.fig.suptitle('Proportion of missing values ordered')\n", - "_savefig(g, \"proportion_proteins_missing.pdf\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "### Protein-Groups" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "protein_groups = X.columns.to_frame('proteins')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "Some protein groups are ensembles?" 
- ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "proteins\n", - "X6RM59 6\n", - "K7EQF5 6\n", - "K7EQG1 6\n", - "K7EQG9 6\n", - "K7EQH1 6\n", - " ..\n", - "CON__ENSEMBL:ENSBTAP00000024462 31\n", - "CON__ENSEMBL:ENSBTAP00000032840 31\n", - "CON__ENSEMBL:ENSBTAP00000038329 31\n", - "CON__ENSEMBL:ENSBTAP00000001528 31\n", - "CON__ENSEMBL:ENSBTAP00000018229 31\n", - "Name: proteins, Length: 25427, dtype: int64" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "protein_groups.proteins.str.len().sort_values()" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Top 24 grouped by first characters\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "ce9bfeb6c8e34b8493d15e92a20bf337", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "interactive(children=(IntSlider(value=4, description='n_characters', max=10, min=1), Output()), _dom_classes=(…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import ipywidgets as w\n", - "N=24\n", - "print(f'Top {N} grouped by first characters')\n", - "def calc_counts(n_characters):\n", - " return protein_groups.proteins.str[:n_characters].value_counts()[:N]\n", - "w.interact(calc_counts, n_characters=w.IntSlider(value=4, min=1, max=10))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "How does the naming work?" 
- ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "protein_groups['p4'] = protein_groups.proteins.str[:4]\n", - "protein_groups['p8'] = protein_groups.proteins.str[:8]\n", - "protein_groups['p9'] = protein_groups.proteins.str[:9]" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
proteinsp4p8p9
proteins
A0A024QZ33A0A024QZ33A0A0A0A024QZA0A024QZ3
A0A024QZ42A0A024QZ42A0A0A0A024QZA0A024QZ4
A0A024QZP7A0A024QZP7A0A0A0A024QZA0A024QZP
A0A024QZX5A0A024QZX5A0A0A0A024QZA0A024QZX
A0A024R161A0A024R161A0A0A0A024R1A0A024R16
...............
X6RLL4X6RLL4X6RLX6RLL4X6RLL4
X6RLN4X6RLN4X6RLX6RLN4X6RLN4
X6RLR1X6RLR1X6RLX6RLR1X6RLR1
X6RLX0X6RLX0X6RLX6RLX0X6RLX0
X6RM59X6RM59X6RMX6RM59X6RM59
\n", - "

25427 rows × 4 columns

\n", - "
" - ], - "text/plain": [ - " proteins p4 p8 p9\n", - "proteins \n", - "A0A024QZ33 A0A024QZ33 A0A0 A0A024QZ A0A024QZ3\n", - "A0A024QZ42 A0A024QZ42 A0A0 A0A024QZ A0A024QZ4\n", - "A0A024QZP7 A0A024QZP7 A0A0 A0A024QZ A0A024QZP\n", - "A0A024QZX5 A0A024QZX5 A0A0 A0A024QZ A0A024QZX\n", - "A0A024R161 A0A024R161 A0A0 A0A024R1 A0A024R16\n", - "... ... ... ... ...\n", - "X6RLL4 X6RLL4 X6RL X6RLL4 X6RLL4\n", - "X6RLN4 X6RLN4 X6RL X6RLN4 X6RLN4\n", - "X6RLR1 X6RLR1 X6RL X6RLR1 X6RLR1\n", - "X6RLX0 X6RLX0 X6RL X6RLX0 X6RLX0\n", - "X6RM59 X6RM59 X6RM X6RM59 X6RM59\n", - "\n", - "[25427 rows x 4 columns]" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "protein_groups.sort_values('p8')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "Prefixed `CON__` or `REV__`" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
proteinsp4p8p9
proteins
CON__A2I7N0CON__A2I7N0CON_CON__A2ICON__A2I7
CON__A2I7N1CON__A2I7N1CON_CON__A2ICON__A2I7
CON__A2I7N3CON__A2I7N3CON_CON__A2ICON__A2I7
CON__ENSEMBL:ENSBTAP00000001528CON__ENSEMBL:ENSBTAP00000001528CON_CON__ENSCON__ENSE
CON__ENSEMBL:ENSBTAP00000006074CON__ENSEMBL:ENSBTAP00000006074CON_CON__ENSCON__ENSE
...............
REV__Q96HY6REV__Q96HY6REV_REV__Q96REV__Q96H
REV__Q96HY6-2REV__Q96HY6-2REV_REV__Q96REV__Q96H
REV__Q96PE2REV__Q96PE2REV_REV__Q96REV__Q96P
REV__Q9BYP7-4REV__Q9BYP7-4REV_REV__Q9BREV__Q9BY
REV__Q9Y2X3REV__Q9Y2X3REV_REV__Q9YREV__Q9Y2
\n", - "

84 rows × 4 columns

\n", - "
" - ], - "text/plain": [ - " proteins p4 \\\n", - "proteins \n", - "CON__A2I7N0 CON__A2I7N0 CON_ \n", - "CON__A2I7N1 CON__A2I7N1 CON_ \n", - "CON__A2I7N3 CON__A2I7N3 CON_ \n", - "CON__ENSEMBL:ENSBTAP00000001528 CON__ENSEMBL:ENSBTAP00000001528 CON_ \n", - "CON__ENSEMBL:ENSBTAP00000006074 CON__ENSEMBL:ENSBTAP00000006074 CON_ \n", - "... ... ... \n", - "REV__Q96HY6 REV__Q96HY6 REV_ \n", - "REV__Q96HY6-2 REV__Q96HY6-2 REV_ \n", - "REV__Q96PE2 REV__Q96PE2 REV_ \n", - "REV__Q9BYP7-4 REV__Q9BYP7-4 REV_ \n", - "REV__Q9Y2X3 REV__Q9Y2X3 REV_ \n", - "\n", - " p8 p9 \n", - "proteins \n", - "CON__A2I7N0 CON__A2I CON__A2I7 \n", - "CON__A2I7N1 CON__A2I CON__A2I7 \n", - "CON__A2I7N3 CON__A2I CON__A2I7 \n", - "CON__ENSEMBL:ENSBTAP00000001528 CON__ENS CON__ENSE \n", - "CON__ENSEMBL:ENSBTAP00000006074 CON__ENS CON__ENSE \n", - "... ... ... \n", - "REV__Q96HY6 REV__Q96 REV__Q96H \n", - "REV__Q96HY6-2 REV__Q96 REV__Q96H \n", - "REV__Q96PE2 REV__Q96 REV__Q96P \n", - "REV__Q9BYP7-4 REV__Q9B REV__Q9BY \n", - "REV__Q9Y2X3 REV__Q9Y REV__Q9Y2 \n", - "\n", - "[84 rows x 4 columns]" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "protein_groups.loc[protein_groups.p4.isin(('CON_','REV_'))].sort_index()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false", - "toc-hr-collapsed": false - }, - "source": [ - "## Select Proteins" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "### Minumum required sample quality\n", - "First define the minum requirement of a sample to be kept in " - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5652baf0957f48ef99021159aee75fe3", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "IntSlider(value=1500, max=4461)" - ] - }, - "metadata": {}, - "output_type": 
"display_data" - } - ], - "source": [ - "import ipywidgets as w\n", - "MIN_DEPTH_SAMPLE = 1500\n", - "w_min_depth_sample = w.IntSlider(value=MIN_DEPTH_SAMPLE, min=0, max=max(sample_stats[COL_NO_IDENTIFIED_PROT]))\n", - "w_min_depth_sample" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Selected 413 samples\n" - ] - } - ], - "source": [ - "mask_samples = sample_stats[COL_NO_IDENTIFIED_PROT] >= w_min_depth_sample.value\n", - "print(f\"Selected {mask_samples.sum()} samples\")" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "# datasets with 50% and 90% coverage + log transformation, z-normalization and fill missingness (for pca)\n", - "x_50 = coverage(X.loc[mask_samples], coverage_col=0.5, coverage_row=0.2)\n", - "# x_50_pca = log_z_zeroone_na(x_50) # there is a huge difference if NA is set to low value or mean!!\n", - "x_90 = coverage(X.loc[mask_samples], 0.9, 0.9)\n", - "# x_90_pca = log_z_zeroone_na(x_90)" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
proteinsA0A075B736A0A087WYC1A0A0A0MSI0A0A0B4J2C3A0A0C4DGZ5A0A0J9YXZ5A0A140T936A0A2R8Y6G6A0A3B3IS95A0A494C1A5...Q9NYU2Q9UG63-2Q9UIG0Q9UQ80-2Q9Y265-2Q9Y2W1Q9Y3A5Q9Y3F4-2Q9Y617-2V9GYZ6
SampleID
82.257000e+094.630300e+101.383400e+112.232300e+101.554300e+104.390000e+102.025100e+102.889800e+111.090300e+113.493700e+10...2.272000e+091.636800e+108.111000e+095.147400e+101.253200e+107.201600e+095.729700e+091.356700e+101.721900e+105.427000e+10
\n", - "

1 rows × 147 columns

\n", - "
" - ], - "text/plain": [ - "proteins A0A075B736 A0A087WYC1 A0A0A0MSI0 A0A0B4J2C3 \\\n", - "SampleID \n", - "8 2.257000e+09 4.630300e+10 1.383400e+11 2.232300e+10 \n", - "\n", - "proteins A0A0C4DGZ5 A0A0J9YXZ5 A0A140T936 A0A2R8Y6G6 \\\n", - "SampleID \n", - "8 1.554300e+10 4.390000e+10 2.025100e+10 2.889800e+11 \n", - "\n", - "proteins A0A3B3IS95 A0A494C1A5 ... Q9NYU2 Q9UG63-2 \\\n", - "SampleID ... \n", - "8 1.090300e+11 3.493700e+10 ... 2.272000e+09 1.636800e+10 \n", - "\n", - "proteins Q9UIG0 Q9UQ80-2 Q9Y265-2 Q9Y2W1 \\\n", - "SampleID \n", - "8 8.111000e+09 5.147400e+10 1.253200e+10 7.201600e+09 \n", - "\n", - "proteins Q9Y3A5 Q9Y3F4-2 Q9Y617-2 V9GYZ6 \n", - "SampleID \n", - "8 5.729700e+09 1.356700e+10 1.721900e+10 5.427000e+10 \n", - "\n", - "[1 rows x 147 columns]" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "x_90.sample()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "### Distribution of Intensity values\n", - "- comparing non-transformed to $\\log_{10}$ transformed" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Sample ID: 366\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from vaep.transform import log\n", - "from random import sample\n", - "sample = x_50.sample()\n", - "sample_id = sample.index[0]\n", - "print(\"Sample ID:\", sample_id)\n", - "sns.set(style=\"darkgrid\")\n", - "fig, axes = plt.subplots(1,2, figsize=(10,3))\n", - "sns.distplot(sample, bins=100, ax=axes[0])\n", - "sample_log = log(sample) # natural logarithm, could also be base_2, base_10 logarithm\n", - "sns.distplot(sample_log, bins=100, ax=axes[1])\n", - "_ = fig.suptitle(\"Normalized vs. log (ln) normalized distribution\")\n", - "plt.tight_layout()\n", - "_savefig(fig, 'distribution_sample_' + str(sample_id))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "ToDo: Select a logarithm for the transformation which is either interpretable or gives best to normal distribution:\n", - "1. $log_2$: Interpretable as continous doubling of the intensity\n", - "2. KDensity measure of goodness of fit to normal distribution." - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "x_50 = x_50.apply(log)\n", - "x_90 = x_90.apply(log)" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": { - "Collapsed": "false" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
proteinsA0A075B6E2A0A075B6F9A0A075B6Q0A0A075B736A0A087WSY9A0A087WTB8A0A087WTW0A0A087WUC6A0A087WUE9A0A087WUZ3...U3KQ85V5IRT4V9GYP5V9GYY3V9GYZ6X6R8A1X6RA14X6RA30X6RAC9X6RFL8
SampleID
40823.30892220.59128818.53392023.16108122.94706720.54162420.20455920.24031520.73795723.491972...19.93149118.11639521.10982023.61274823.45359920.90116121.53588419.92890520.952471NaN
28123.30029620.42208018.91592623.13371822.89925520.76938620.39777120.53468120.70705523.667126...19.51932617.81077121.16936123.75359323.71799820.95818021.89395119.753284NaNNaN
8222.241750NaNNaN22.498286NaN19.45943718.80689518.96943819.87395622.474429...20.96393517.666934NaNNaN22.38143719.978657NaN18.560182NaNNaN
16122.241465NaNNaN22.683572NaN19.93314518.912018NaN19.67167122.149430...NaNNaN20.086215NaN22.41363819.699251NaN18.398496NaNNaN
25120.808342NaN18.82120022.698971NaN20.19363119.569891NaN20.333197NaN...NaN18.384494NaN23.11733622.46713020.543469NaN19.538115NaNNaN
\n", - "

5 rows × 1897 columns

\n", - "
" - ], - "text/plain": [ - "proteins A0A075B6E2 A0A075B6F9 A0A075B6Q0 A0A075B736 A0A087WSY9 \\\n", - "SampleID \n", - "408 23.308922 20.591288 18.533920 23.161081 22.947067 \n", - "281 23.300296 20.422080 18.915926 23.133718 22.899255 \n", - "82 22.241750 NaN NaN 22.498286 NaN \n", - "161 22.241465 NaN NaN 22.683572 NaN \n", - "251 20.808342 NaN 18.821200 22.698971 NaN \n", - "\n", - "proteins A0A087WTB8 A0A087WTW0 A0A087WUC6 A0A087WUE9 A0A087WUZ3 ... \\\n", - "SampleID ... \n", - "408 20.541624 20.204559 20.240315 20.737957 23.491972 ... \n", - "281 20.769386 20.397771 20.534681 20.707055 23.667126 ... \n", - "82 19.459437 18.806895 18.969438 19.873956 22.474429 ... \n", - "161 19.933145 18.912018 NaN 19.671671 22.149430 ... \n", - "251 20.193631 19.569891 NaN 20.333197 NaN ... \n", - "\n", - "proteins U3KQ85 V5IRT4 V9GYP5 V9GYY3 V9GYZ6 X6R8A1 \\\n", - "SampleID \n", - "408 19.931491 18.116395 21.109820 23.612748 23.453599 20.901161 \n", - "281 19.519326 17.810771 21.169361 23.753593 23.717998 20.958180 \n", - "82 20.963935 17.666934 NaN NaN 22.381437 19.978657 \n", - "161 NaN NaN 20.086215 NaN 22.413638 19.699251 \n", - "251 NaN 18.384494 NaN 23.117336 22.467130 20.543469 \n", - "\n", - "proteins X6RA14 X6RA30 X6RAC9 X6RFL8 \n", - "SampleID \n", - "408 21.535884 19.928905 20.952471 NaN \n", - "281 21.893951 19.753284 NaN NaN \n", - "82 NaN 18.560182 NaN NaN \n", - "161 NaN 18.398496 NaN NaN \n", - "251 NaN 19.538115 NaN NaN \n", - "\n", - "[5 rows x 1897 columns]" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "x_50.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, - "source": [ - "## Imputing missing values in log-transformed space\n", - "\n", - "Options:\n", - "1. Mean Imputation\n", - "2. Zero Imputation\n", - "2. 
Shifted Mean Imputation (and randomization)" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "from vaep.imputation import imputation_normal_distribution\n", - "_manuael_imputed = imputation_normal_distribution(x_50.iloc[:,0])\n", - "_applied_imputed = x_50.iloc[:,:3].apply(imputation_normal_distribution).iloc[:,0]\n", - "assert _manuael_imputed.equals(_applied_imputed), \"You got apply wrong\"" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "from vaep.imputation import imputation_normal_distribution\n", - "x_50 = x_50.apply(imputation_normal_distribution)\n", - "x_90 = x_90.apply(imputation_normal_distribution)" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "from config import PROCESSED_DATA, PREFIX_IMPUTED, PREFIX_META\n", - "x_50.to_pickle(os.path.join(PROCESSED_DATA, PREFIX_IMPUTED+'_50.pkl'))\n", - "x_90.to_pickle(os.path.join(PROCESSED_DATA, PREFIX_IMPUTED+'_90.pkl'))" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [ - "labels.to_pickle(os.path.join(PROCESSED_DATA, PREFIX_META + '.pkl'))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "vaep", - "language": "python", - "name": "vaep" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13 (default, Mar 28 2022, 06:59:08) [MSC v.1916 64 bit (AMD64)]" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git 
a/project/misc_embeddings.ipynb b/project/misc_embeddings.ipynb index 97d53839e..36c60b2dc 100644 --- a/project/misc_embeddings.ipynb +++ b/project/misc_embeddings.ipynb @@ -138,9 +138,9 @@ ], "metadata": { "kernelspec": { - "display_name": "vaep", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "vaep" + "name": "python3" }, "language_info": { "codemirror_mode": { diff --git a/project/misc_embeddings.py b/project/misc_embeddings.py new file mode 100644 index 000000000..fcbc27eb4 --- /dev/null +++ b/project/misc_embeddings.py @@ -0,0 +1,39 @@ +# --- +# jupyter: +# jupytext: +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.15.2 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# ## Understanding Embeddings + +# %% +from fastai.tabular.all import * +from fastai.collab import * + +# %% +# Embedding? + +# %% +Embedding + +# %% +e = Embedding(100, 10) + +# %% +idx = torch.tensor([1, 3]) +e(idx).detach().numpy() + +# %% +idx = torch.tensor([1, 2]) +e(idx).detach().numpy() + +# %% diff --git a/project/misc_id_mapper.ipynb b/project/misc_id_mapper.ipynb deleted file mode 100644 index b549dd839..000000000 --- a/project/misc_id_mapper.ipynb +++ /dev/null @@ -1,766 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Analyse peptides\n", - "\n", - "## Specification\n", - "- access different levels of peptides easily\n", - "- select training data per gene easily\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import logging\n", - "logging.basicConfig(level=logging.INFO) # configures root logger\n", - "logger = logging.getLogger()\n", - "logger.info(\"test\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - 
"from config import FN_FASTA_DB, FN_ID_MAP, FN_PEPTIDE_INTENSITIES" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "id_map = pd.read_json(FN_ID_MAP, orient=\"split\")\n", - "\n", - "mask_no_gene = id_map.gene.isna()\n", - "id_map.loc[mask_no_gene, \"gene\"] = \"-\"\n", - "\n", - "\n", - "with open(FN_FASTA_DB) as f:\n", - " data_fasta = json.load(f)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_peptides = pd.read_pickle(FN_PEPTIDE_INTENSITIES)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "set_peptides = set(data_peptides.columns)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- switch between list of proteins with any support and non\n", - " - set threshold of number of peptides per protein over all samples (some peptides uniquely matched to one protein in on sample is just noise -> check razor peptides)\n", - "- show support" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from collections import defaultdict\n", - "import ipywidgets as w\n", - "from config import KEY_FASTA_HEADER, KEY_FASTA_SEQ, KEY_PEPTIDES, KEY_GENE_NAME, KEY_GENE_NAME_FASTA\n", - "\n", - "TGREEN = \"\\033[32m\" # Green Text\n", - "RESET = \"\\033[0;0m\"\n", - "\n", - "w_first_letter = w.Dropdown(\n", - " options=id_map[KEY_GENE_NAME_FASTA].str[0].unique())\n", - "w_genes = w.Dropdown(\n", - " options=id_map.gene.loc[id_map[KEY_GENE_NAME_FASTA].str[0] == w_first_letter.value].unique(),\n", - " value='ACTB'\n", - ")\n", - "\n", - "mask = id_map.gene == w_genes.value\n", - "selected = id_map.loc[mask, \"protein\"]\n", - "\n", - "\n", - "w_proteins_ids = w.Dropdown(options=selected.index)\n", - "w_protein = w.Dropdown(options=selected.unique())\n", - "\n", - "\n", - "def 
update_gene_list(first_letter):\n", - " \"\"\"Update proteins when new gene is selected\"\"\"\n", - " mask_selected_genes = id_map[KEY_GENE_NAME_FASTA].str[0] == w_first_letter.value\n", - " w_genes.options = id_map.gene.loc[mask_selected_genes].unique()\n", - "\n", - "\n", - "_ = w.interactive_output(update_gene_list, {\"first_letter\": w_first_letter})\n", - "\n", - "\n", - "def update_protein_list(gene):\n", - " mask = id_map[KEY_GENE_NAME_FASTA] == gene\n", - " selected = id_map.loc[mask, \"protein\"]\n", - " w_protein.options = selected.unique()\n", - "# w_proteins_ids.options = selected.loc[selected == w_protein.value].index\n", - "\n", - "\n", - "_ = w.interactive_output(update_protein_list, {\"gene\": w_genes})\n", - " \n", - "\n", - "def update_protein_id_list(protein):\n", - " \"\"\"Update isotope list when protein is selected\"\"\"\n", - " mask = id_map.protein == w_protein.value\n", - " selected = id_map.protein.loc[mask]\n", - " w_proteins_ids.options = selected.index\n", - "\n", - "_ = w.interactive_output(update_protein_id_list, {'protein': w_protein})\n", - "\n", - "d_peptides_observed_prot_id = defaultdict(list)\n", - "\n", - "def show_sequences(prot_id):\n", - " _data = data_fasta[prot_id]\n", - " print(f\"Protein_ID on Uniport: {prot_id}\")\n", - " print(f\"HEADER: {_data[KEY_FASTA_HEADER]}\")\n", - "# print(f\"Seq : {_data[KEY_FASTA_SEQ]}\")\n", - " annotate_seq = \"Peptides: \"\n", - " global d_peptides_observed_prot_id\n", - " for i, _l in enumerate(_data[KEY_PEPTIDES]):\n", - " annotate_seq += f\"\\nNo. 
of missed K or R: {i}\"\n", - " prot_seq_annotated = _data[KEY_FASTA_SEQ]\n", - " for j, _pep in enumerate(_l):\n", - " if _pep in set_peptides:\n", - " d_peptides_observed_prot_id[prot_id].append(_pep)\n", - " _pep_in_green = TGREEN + f\"{_pep}\" + RESET\n", - " prot_seq_annotated = prot_seq_annotated.replace(_pep, _pep_in_green)\n", - " _pep = _pep_in_green\n", - " if j==0:\n", - " annotate_seq += \"\\n\\t\" + _pep\n", - " else:\n", - " annotate_seq += \",\\n\\t\" + _pep\n", - " print(f\"Seq {i}: {prot_seq_annotated}\")\n", - " print(annotate_seq)\n", - " \n", - " \n", - " display(data_peptides[d_peptides_observed_prot_id[prot_id]].dropna(how='all'))\n", - "\n", - "w_out = w.interactive_output(show_sequences, {\"prot_id\": w_proteins_ids})\n", - "\n", - "label_first_letter = w.Label(value='First letter of Gene')\n", - "label_genes = w.Label('Gene')\n", - "label_protein = w.Label('Protein')\n", - "label_proteins_ids = w.Label('Protein Isotopes')\n", - "\n", - "panel_levels = w.VBox([\n", - " w.HBox([\n", - " w.VBox([label_first_letter, w_first_letter]),\n", - " w.VBox([label_genes, w_genes]),\n", - " w.VBox([label_protein, w_protein]),\n", - " w.VBox([label_proteins_ids, w_proteins_ids])\n", - " ]),\n", - " w_out]\n", - ")\n", - "panel_levels" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- relatively short peptides resulting from one missed cleaveage, do not appear in the upper part." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- `gene` `->` `Protein_ID` (contains information of `gene` `->` `protein_isotopes`\n", - "- `protein_ID` `->` `sequences` (`FN_FASTA_DB`)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pickle\n", - "from tqdm.notebook import tqdm\n", - "from config import FN_PROTEIN_SUPPORT_MAP, FN_PROTEIN_SUPPORT_FREQ\n", - "try:\n", - " df_protein_support = pd.read_pickle(FN_PROTEIN_SUPPORT_MAP)\n", - " with open(FN_PROTEIN_SUPPORT_FREQ, 'rb') as f:\n", - " d_protein_support_freq = pickle.load(f)\n", - "except FileNotFoundError:\n", - " from vaep.utils import sample_iterable\n", - " d_protein_support = {}\n", - " d_protein_support_freq = {}\n", - " for prot_id in tqdm(data_fasta.keys()):\n", - " _data = data_fasta[prot_id]\n", - " peptides_measured = []\n", - " for i, _l in enumerate(_data[KEY_PEPTIDES]):\n", - " for _pep in _l:\n", - " if _pep in set_peptides:\n", - " peptides_measured.append(_pep)\n", - " _d_protein_support = {}\n", - " _df_support_protein = data_peptides[peptides_measured].dropna(how='all')\n", - "\n", - " _n_samples = len(_df_support_protein)\n", - " if _n_samples > 0:\n", - " _d_protein_support['N_samples'] = _n_samples\n", - " d_protein_support_freq[prot_id] = _df_support_protein.notna().sum().to_dict()\n", - " d_protein_support[prot_id] = _d_protein_support\n", - " else:\n", - " d_protein_support[prot_id] = None\n", - " \n", - " df_protein_support = pd.DataFrame(d_protein_support).T.dropna()\n", - " df_protein_support = df_protein_support.join(id_map)\n", - " df_protein_support.to_pickle(FN_PROTEIN_SUPPORT_MAP)\n", - " \n", - " with open(FN_PROTEIN_SUPPORT_FREQ, 'wb') as f:\n", - " pickle.dump(d_protein_support_freq, f)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "l_proteins_good_support = 
df_protein_support.sort_values(by='N_samples').tail(100).index.to_list()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "d_protein_support_freq['I3L3I0']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Connect to experimental peptide data\n", - "\n", - "Check if counts by `data_fasta`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from tqdm.notebook import tqdm\n", - "\n", - "counts_observed_by_missed_cleavages = {}\n", - "for _protein_id, _data in tqdm(data_fasta.items()):\n", - " _peptides = _data[KEY_PEPTIDES]\n", - " _counts = {}\n", - " for i, _l in enumerate(_peptides):\n", - " _counts[i] = 0\n", - " for _pep in _l:\n", - " if _pep in set_peptides:\n", - " _counts[i] += 1\n", - " counts_observed_by_missed_cleavages[_protein_id] = _counts" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_counts_observed_by_missed_cleavages = pd.DataFrame(\n", - " counts_observed_by_missed_cleavages\n", - ").T" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "from matplotlib import table\n", - "\n", - "fig, axes = plt.subplots(ncols=2, gridspec_kw={\"width_ratios\": [5, 1], \"wspace\": 0.2}, figsize=(10,4))\n", - "\n", - "_counts_summed = df_counts_observed_by_missed_cleavages.sum()\n", - "_counts_summed.name = \"frequency\"\n", - "\n", - "ax = axes[0]\n", - "_ = _counts_summed.plot(kind=\"bar\", ax=ax)\n", - "ax.set_xlabel(\"peptides from n miscleavages\")\n", - "ax.set_ylabel(\"frequency\")\n", - "\n", - "ax = axes[1]\n", - "ax.axis(\"off\")\n", - "_ = pd.plotting.table(ax=ax, data=_counts_summed, loc=\"best\", colWidths=[1], edges='open')\n", - "_ = fig.suptitle('Peptides frequencies')" - ] - }, - { - "cell_type": "markdown", - 
"metadata": {}, - "source": [ - "These are unnormalized counts in the meaning of that _razor_ peptides are counted as often as they are matched." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mask = df_counts_observed_by_missed_cleavages != 0\n", - "df_prot_observed = df_counts_observed_by_missed_cleavages.replace(0, pd.NA)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_prot_observed = df_prot_observed.dropna(axis=0, how=\"all\")\n", - "df_prot_observed = df_prot_observed.fillna(0)\n", - "df_prot_observed = df_prot_observed.convert_dtypes()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from vaep.pandas import combine_value_counts\n", - "\n", - "combine_value_counts(df_prot_observed)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "freq_pep_mapped_to_protID = df_prot_observed.sum(axis=1).value_counts()\n", - "freq_pep_mapped_to_protID = freq_pep_mapped_to_protID.sort_index()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "freq_pep_mapped_to_protID" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Genes with support in data\n", - "\n", - "try software to identify the _most likely_ protein. OpenMS or russian alternative? 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Imputation: Train model\n", - "\n", - "> Select Gene or Protein\n", - "\n", - "As the samples are all obtained from the same biological sample (in principal), the single run should somehow be comparable.\n", - "An description of variablity (from the Data Scientist perspective) can highlight some commenly known facts about proteomics experiments:\n", - " - batch effects: Measurements on consecutive days are have to be normalized to each other\n", - " - scoring: PSM are assigned to a peptide based on a score. Small variations can lead to different assignments\n", - " \n", - "Can a complex representation of a sample level out experimental variation on an in principle comparable data. \n", - "\n", - "### Strategy\n", - "- first start using peptides from single Protein_IDs\n", - "- then move to all models from genes\n", - "- explore structure" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import torch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "d_peptides_observed_prot_id" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "w_select_proteins_good_support = w.Dropdown(options=l_proteins_good_support)\n", - "w_select_proteins_queried = w.Dropdown(options=list(d_peptides_observed_prot_id.keys()))\n", - "w.HBox(\n", - " [\n", - " w.VBox(\n", - " [\n", - " w.Label(f\"Top {len(l_proteins_good_support)} covered proteins\"),\n", - " w_select_proteins_good_support,\n", - " ]\n", - " ),\n", - " w.VBox([w.Label(\"Queried proteins from above\"), w_select_proteins_queried]),\n", - " ]\n", - ")\n", - "# select from top100 or above selection" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, 
- "source": [ - "Idea: Select a protein which leads to training. Each selection will create a dump of the selected data, which can be used in the `XZY.ipynb` for model fine-tuning." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "prot_id = w_select_proteins_good_support.value\n", - "id_map.loc[prot_id]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "prot_id = 'P00338' # 'I3L3I0' # w_select_proteins_queried.value # \n", - "_protein, _gene, _ = id_map.loc[prot_id]\n", - "# _gene_fasta" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "w_first_letter.value = _gene[0]\n", - "w_genes.value = _gene\n", - "w_protein.value = _protein\n", - "w_proteins_ids.value = prot_id" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "peptides_measured = d_peptides_observed_prot_id[prot_id]\n", - "n_peptides_in_selection = len(peptides_measured)\n", - "print(f\"Selected a total of {n_peptides_in_selection} peptides.\") " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_peptides[peptides_measured].notna().sum(axis=1).value_counts().sort_index()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "PROP_DATA_COMPLETENESS = 0.75\n", - "mask_samples_selected = data_peptides[peptides_measured].notna().sum(axis=1) >= int(n_peptides_in_selection * 0.75)\n", - "print(f\"Using a share of at least {PROP_DATA_COMPLETENESS}, i.e. 
at least {int(n_peptides_in_selection * 0.75)} out of {n_peptides_in_selection}.\",\n", - " f\"In total {mask_samples_selected.sum()} samples.\", sep=\"\\n\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from config import PROTEIN_DUMPS\n", - "_ = data_peptides.loc[mask_samples_selected, peptides_measured]\n", - "_.to_json(PROTEIN_DUMPS / f\"{prot_id}.pkl\")\n", - "_" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import vaep\n", - "from vaep.transform import log\n", - "\n", - "peptides_selected_log10 = data_peptides.loc[mask_samples_selected, peptides_measured].apply(log) # selected in widget overview above\n", - "peptides_selected_log10" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> The data to be seen here should be **assigned** peptides. Razor peptides are for now not put to one or the other protein (focus only on unique peptides?)." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Hyperparameters" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "n_samples, n_features = peptides_selected_log10.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from vaep.models.cmd import parser\n", - "\n", - "BATCH_SIZE = 16\n", - "EPOCHS = 600\n", - "args = ['--batch-size', str(BATCH_SIZE), '--seed', '43', '--epochs', str(EPOCHS), '--log-interval', str(BATCH_SIZE)]\n", - "args = parser.parse_args(args)\n", - "args.cuda = not args.no_cuda and torch.cuda.is_available()\n", - "args" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "torch.manual_seed(args.seed)\n", - "device = torch.device(\"cuda\" if args.cuda else \"cpu\")\n", - "device = torch.device(\"cpu\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "torch.device?" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Dataset and DataLoader\n", - "\n", - "The `torch.utils.data.Dataset` can load data into memory, or just create a mapping to data somewhere to be continously loaded by the `torch.utils.data.DataLoader`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "peptide_intensities = peptides_selected_log10\n", - "detection_limit = float(int(peptide_intensities.min().min()))\n", - "detection_limit " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# from vaep.model import PeptideDatasetInMemory\n", - "\n", - "from torch.utils.data import Dataset\n", - "class PeptideDatasetInMemory(Dataset):\n", - " \"\"\"Peptide Dataset fully in memory.\"\"\"\n", - "\n", - " def __init__(self, data: pd.DataFrame, fill_na=0):\n", - " self.mask_obs = torch.from_numpy(data.notna().values)\n", - " data = data.fillna(fill_na)\n", - " self.peptides = torch.from_numpy(data.values)\n", - " self.length_ = len(data)\n", - "\n", - " def __len__(self):\n", - " return self.length_\n", - "\n", - " def __getitem__(self, idx):\n", - " return self.peptides[idx], self.mask_obs[idx]\n", - "\n", - "\n", - "dataset_in_memory = PeptideDatasetInMemory(peptide_intensities.copy(), detection_limit)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "kwargs = {'num_workers': 1, 'pin_memory': True} if device=='cuda' else {}\n", - "train_loader = torch.utils.data.DataLoader(\n", - " dataset=dataset_in_memory,\n", - " batch_size=args.batch_size, shuffle=True, **kwargs)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for i, (data, mask) in enumerate(train_loader):\n", - " print(\"Nummber of samples in mini-batch: {}\".format(len(data)),\n", - " \"\\tObject-Type: {}\".format(type(mask)))\n", - "# print(data)\n", - "# print(mask)\n", - " break" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data[~mask] = 0\n", - "plt.imshow(data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - 
"source": [ - "create logged information for tensorboard, see tutorial and docs." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from datetime import datetime\n", - "from torch.utils.tensorboard import SummaryWriter\n", - "writer = SummaryWriter(f'runs/{prot_id}_{format(datetime.now(), \"%y%m%d_%H%M\")}')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "writer.add_image(f'{len(data)} samples heatmap', data, dataformats='HW')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# import importlib; importlib.reload(vaep.model)\n", - "from IPython.core.debugger import set_trace\n", - "\n", - "from torch import optim\n", - "from vaep.models.ae import VAE\n", - "from vaep.models.ae import loss_function\n", - "\n", - "model = VAE(n_features=n_features, n_neurons=30).double().to(device)\n", - "writer.add_graph(model, input_to_model=data)\n", - "\n", - "optimizer = optim.Adam(model.parameters(), lr=1e-4)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "vaep", - "language": "python", - "name": "vaep" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13 (default, Mar 28 2022, 06:59:08) [MSC v.1916 64 bit (AMD64)]" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/project/misc_illustrations.ipynb b/project/misc_illustrations.ipynb index dd7e6350b..02012ad42 100644 --- a/project/misc_illustrations.ipynb +++ b/project/misc_illustrations.ipynb @@ -13,19 +13,22 @@ "metadata": {}, "outputs": [], "source": [ + "from pathlib import Path\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", - "import pandas as pd\n", "import scipy.stats" ] }, { 
"cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [], "source": [ - "import config as cfg" + "FIGUREFOLDER = Path('figures')\n", + "FIGUREFOLDER.mkdir(exist_ok=True, parents=True)" ] }, { @@ -36,9 +39,9 @@ "source": [ "plt.rcParams.update({'xtick.labelsize': 'xx-large',\n", " 'ytick.labelsize': 'xx-large',\n", - " 'axes.titlesize' : 'xx-large',\n", - " 'axes.labelsize' : 'xx-large',\n", - " })\n", + " 'axes.titlesize': 'xx-large',\n", + " 'axes.labelsize': 'xx-large',\n", + " })\n", "# {k:v for k,v in plt.rcParams.items() if 'tick' in k and 'size' in k}" ] }, @@ -63,16 +66,16 @@ "mu = 25.0\n", "stddev = 1.0\n", "\n", - "x = np.linspace(mu -5, mu + 5, num=101)\n", + "x = np.linspace(mu - 5, mu + 5, num=101)\n", "\n", "y_normal = scipy.stats.norm.pdf(x, loc=mu, scale=stddev)\n", "\n", - "mu_shifted = mu - (1.8*stddev)\n", - "stddev_shifted = 0.3*stddev\n", + "mu_shifted = mu - (1.8 * stddev)\n", + "stddev_shifted = 0.3 * stddev\n", "print(f\"Downshifted: {mu_shifted = }, {stddev_shifted = }\")\n", - "y_impute = scipy.stats.norm.pdf(x, loc=mu - (1.8*stddev), scale=0.3*stddev)\n", + "y_impute = scipy.stats.norm.pdf(x, loc=mu - (1.8 * stddev), scale=0.3 * stddev)\n", "\n", - "colors = plt.cm.viridis([0.25,0.75]) \n", + "colors = plt.cm.viridis([0.25, 0.75])\n", "\n", "fig, ax = plt.subplots(1, 1, figsize=(5, 4))\n", "\n", @@ -93,9 +96,9 @@ "outputs": [], "source": [ "\n", - "fig.savefig(cfg.FIGUREFOLDER / 'illustration_normal_imputation')\n", - "fig.savefig(cfg.FIGUREFOLDER / 'illustration_normal_imputation.pdf')\n", - "fig.savefig(cfg.FIGUREFOLDER / 'illustration_normal_imputation_highres', dpi=600)" + "fig.savefig(FIGUREFOLDER / 'illustration_normal_imputation')\n", + "fig.savefig(FIGUREFOLDER / 'illustration_normal_imputation.pdf')\n", + "fig.savefig(FIGUREFOLDER / 'illustration_normal_imputation_highres', dpi=600)" ] }, { @@ -106,7 +109,8 @@ "\n", "- what does log2 transformation mean for the 
error\n", "\n", - "If the error is calculated in log2 space, the larger values have to be predicted with higher precision (in comparison to the original space)" + "If the error is calculated in log2 space, the larger values have to be\n", + "predicted with higher precision (in comparison to the original space)" ] }, { @@ -115,22 +119,23 @@ "metadata": {}, "outputs": [], "source": [ - "def get_original_error_log2(x:float, error_log2:float):\n", - " return 2 ** (np.log2(x) + error_log2) - x \n", + "def get_original_error_log2(x: float, error_log2: float):\n", + " return 2 ** (np.log2(x) + error_log2) - x\n", + "\n", "\n", "print(\n", " f\"{get_original_error_log2(1e9, 0.5) = :,.1f}\",\n", " f\"{get_original_error_log2(1e8, 0.5) = :,.1f}\",\n", " sep='\\n'\n", - " )" + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "If we try to find the rel log2 error equalling the original error, this can be done by \n", - "equating: \n", + "If we try to find the rel log2 error equalling the original error, this can be done by\n", + "equating:\n", "\n", "$$ \\exp(\\ln(a)+e) - a = \\exp(\\ln(a)+e^*) - b $$\n", "\n", @@ -147,12 +152,13 @@ "source": [ "def rel_error(measurment, log_error, other_measurment):\n", " numerator = 2 ** (np.log2(measurment) + log_error)\n", - " numerator-=measurment\n", - " numerator+=other_measurment\n", - " \n", + " numerator -= measurment\n", + " numerator += other_measurment\n", + "\n", " denominator = other_measurment\n", " return np.log2(numerator / denominator)\n", "\n", + "\n", "rel_error = rel_error(1.e9, 0.5, 1e8)\n", "print(f\"{rel_error = :.3f}\")" ] @@ -167,14 +173,14 @@ " f\"0.500 rel to 1e9: {get_original_error_log2(1e9, 0.5) :,.1f}\",\n", " f\"{rel_error:.3f} rel to 1e8: {get_original_error_log2(1e8, rel_error) :,.1f}\",\n", " sep='\\n'\n", - " )" + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "So the relative error of 0.5 for $10^9$ is five times larger for $10^8$ in the logspace, \n", + "So the 
relative error of 0.5 for $10^9$ is five times larger for $10^8$ in the logspace,\n", "whereas the error in the original space is the same" ] }, diff --git a/project/misc_illustrations.py b/project/misc_illustrations.py index 55328e65d..cdfd362fb 100644 --- a/project/misc_illustrations.py +++ b/project/misc_illustrations.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.0 +# jupytext_version: 1.15.2 # kernelspec: # display_name: Python 3 # language: python @@ -16,20 +16,22 @@ # # Figures for Illustration of concepts # %% +from pathlib import Path import matplotlib.pyplot as plt import numpy as np -import pandas as pd import scipy.stats # %% -import config as cfg +FIGUREFOLDER = Path('figures') +FIGUREFOLDER.mkdir(exist_ok=True, parents=True) + # %% plt.rcParams.update({'xtick.labelsize': 'xx-large', 'ytick.labelsize': 'xx-large', - 'axes.titlesize' : 'xx-large', - 'axes.labelsize' : 'xx-large', - }) + 'axes.titlesize': 'xx-large', + 'axes.labelsize': 'xx-large', + }) # {k:v for k,v in plt.rcParams.items() if 'tick' in k and 'size' in k} # %% [markdown] @@ -42,16 +44,16 @@ mu = 25.0 stddev = 1.0 -x = np.linspace(mu -5, mu + 5, num=101) +x = np.linspace(mu - 5, mu + 5, num=101) y_normal = scipy.stats.norm.pdf(x, loc=mu, scale=stddev) -mu_shifted = mu - (1.8*stddev) -stddev_shifted = 0.3*stddev +mu_shifted = mu - (1.8 * stddev) +stddev_shifted = 0.3 * stddev print(f"Downshifted: {mu_shifted = }, {stddev_shifted = }") -y_impute = scipy.stats.norm.pdf(x, loc=mu - (1.8*stddev), scale=0.3*stddev) +y_impute = scipy.stats.norm.pdf(x, loc=mu - (1.8 * stddev), scale=0.3 * stddev) -colors = plt.cm.viridis([0.25,0.75]) +colors = plt.cm.viridis([0.25, 0.75]) fig, ax = plt.subplots(1, 1, figsize=(5, 4)) @@ -65,9 +67,9 @@ fig.tight_layout() # %% -fig.savefig(cfg.FIGUREFOLDER / 'illustration_normal_imputation') -fig.savefig(cfg.FIGUREFOLDER / 'illustration_normal_imputation.pdf') -fig.savefig(cfg.FIGUREFOLDER / 
'illustration_normal_imputation_highres', dpi=600) +fig.savefig(FIGUREFOLDER / 'illustration_normal_imputation') +fig.savefig(FIGUREFOLDER / 'illustration_normal_imputation.pdf') +fig.savefig(FIGUREFOLDER / 'illustration_normal_imputation_highres', dpi=600) # %% [markdown] @@ -75,22 +77,24 @@ # # - what does log2 transformation mean for the error # -# If the error is calculated in log2 space, the larger values have to be predicted with higher precision (in comparison to the original space) +# If the error is calculated in log2 space, the larger values have to be +# predicted with higher precision (in comparison to the original space) # %% -def get_original_error_log2(x:float, error_log2:float): - return 2 ** (np.log2(x) + error_log2) - x +def get_original_error_log2(x: float, error_log2: float): + return 2 ** (np.log2(x) + error_log2) - x + print( f"{get_original_error_log2(1e9, 0.5) = :,.1f}", f"{get_original_error_log2(1e8, 0.5) = :,.1f}", sep='\n' - ) +) # %% [markdown] -# If we try to find the rel log2 error equalling the original error, this can be done by -# equating: +# If we try to find the rel log2 error equalling the original error, this can be done by +# equating: # # $$ \exp(\ln(a)+e) - a = \exp(\ln(a)+e^*) - b $$ # @@ -101,12 +105,13 @@ def get_original_error_log2(x:float, error_log2:float): # %% def rel_error(measurment, log_error, other_measurment): numerator = 2 ** (np.log2(measurment) + log_error) - numerator-=measurment - numerator+=other_measurment - + numerator -= measurment + numerator += other_measurment + denominator = other_measurment return np.log2(numerator / denominator) + rel_error = rel_error(1.e9, 0.5, 1e8) print(f"{rel_error = :.3f}") @@ -115,10 +120,10 @@ def rel_error(measurment, log_error, other_measurment): f"0.500 rel to 1e9: {get_original_error_log2(1e9, 0.5) :,.1f}", f"{rel_error:.3f} rel to 1e8: {get_original_error_log2(1e8, rel_error) :,.1f}", sep='\n' - ) +) # %% [markdown] -# So the relative error of 0.5 for $10^9$ is five 
times larger for $10^8$ in the logspace, +# So the relative error of 0.5 for $10^9$ is five times larger for $10^8$ in the logspace, # whereas the error in the original space is the same # %% [markdown] diff --git a/project/misc_json_formats.ipynb b/project/misc_json_formats.ipynb index 81f6d10ee..b503d2a04 100644 --- a/project/misc_json_formats.ipynb +++ b/project/misc_json_formats.ipynb @@ -226,9 +226,9 @@ ], "metadata": { "kernelspec": { - "display_name": "vaep", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "vaep" + "name": "python3" }, "language_info": { "codemirror_mode": { diff --git a/project/misc_json_formats.py b/project/misc_json_formats.py new file mode 100644 index 000000000..b589ea91c --- /dev/null +++ b/project/misc_json_formats.py @@ -0,0 +1,112 @@ +# --- +# jupyter: +# jupytext: +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.15.2 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Json Formats +# +# - object is loaded with the correct conversions (but this is re-computed) +# - can shared information be saved as "meta" information? 
+# +# - [`pd.json_normalize`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.json_normalize.html) should be able to efficiently combine information + +# %% +import pandas as pd +from vaep.io.data_objects import MqAllSummaries +from vaep.pandas import get_unique_non_unique_columns + +mq_all_summaries = MqAllSummaries() + +# %% [markdown] +# ## summaries.json + +# %% [markdown] +# ### Table format with schema + +# %% +# json format with categories +columns = get_unique_non_unique_columns(mq_all_summaries.df) +columns.unique[:2] + +# %% +mq_all_summaries.df[columns.unique[:3]].dtypes + +# %% +type(mq_all_summaries.df.iloc[0,3]) + +# %% +meta = mq_all_summaries.df[columns.unique].iloc[0].to_json(indent=4, orient='table') +# print(meta) + +# %% +pd.read_json(meta, orient='table').T.convert_dtypes() + +# %% +pd.read_json(meta, orient='table') # produces errors when an int column contains NaN + +# %% +pd.options.display.max_columns = len(columns.non_unique) +# mq_all_summaries.df[columns.non_unique] + +# %% +data = mq_all_summaries.df[columns.non_unique].iloc[0:3].to_json() +data = pd.read_json(data) +data + +# %% +mq_all_summaries.fp_summaries.parent / mq_all_summaries.fp_summaries.stem / '_meta.json' + +# %% +meta = mq_all_summaries.df[columns.unique].iloc[0].to_json(indent=4) +meta = pd.read_json(meta, typ='series') +meta + +# %% +for col, value in meta.items(): + data[col] = value + +# %% +data + +# %% [markdown] +# ## Table schema bug +# +# - filed bug report on pandas [#40255](https://github.com/pandas-dev/pandas/issues/40255) + +# %% +pd.show_versions() + +# %% +pd.__version__ + +# %% +import traceback +import pandas +data = {'A' : [1, 2, 2, pd.NA, 4, 8, 8, 8, 8, 9], + 'B': [pd.NA] * 10} +data = pd.DataFrame(data) +data = data.astype(pd.Int64Dtype()) # in my example I get this from data.convert_dtypes() +data_json = data.to_json(orient='table', indent=4) +try: + pd.read_json(data_json, orient='table') #ValueError: Cannot convert non-finite values
(NA or inf) to integer +except ValueError as e: + print(e) + traceback.print_exc() + +# %% +print(data.to_string()) + +# %% +N = 3 +meta = mq_all_summaries.df[columns.unique[:N]].iloc[0:2].reset_index(drop=True) +meta.to_dict() diff --git a/project/misc_protein_support.ipynb b/project/misc_protein_support.ipynb deleted file mode 100644 index 846693e82..000000000 --- a/project/misc_protein_support.ipynb +++ /dev/null @@ -1,739 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Analyse peptides\n", - "\n", - "## Specification\n", - "- access different levels of peptides easily\n", - "- select training data per gene easily\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import time\n", - "import json\n", - "import logging\n", - "\n", - "import pandas as pd\n", - "from IPython.core.debugger import set_trace\n", - "\n", - "\n", - "pd.options.display.float_format = '{:,.1f}'.format\n", - "\n", - "from config import erda_dumps\n", - "from config import FN_FASTA_DB, FN_ID_MAP, FN_PEPTIDE_INTENSITIES, FN_PEPTIDE_STUMP, FOLDER_DATA\n", - "\n", - "logging.basicConfig(level=logging.INFO) # configures root logger\n", - "logger = logging.getLogger()\n", - "logger.info(\"test\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "id_map = pd.read_json(FN_ID_MAP, orient=\"split\")\n", - "\n", - "mask_no_gene = id_map.gene.isna()\n", - "id_map.loc[mask_no_gene, \"gene\"] = \"-\"\n", - "\n", - "with open(FN_FASTA_DB) as f:\n", - " data_fasta = json.load(f)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "peptides_intensities = pd.read_pickle(erda_dumps.FN_PEPTIDES)\n", - "peptides_intensities.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# idx = 
peptides_intensities.index.levels[0][:1000]\n", - "peptides_intensities.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "peptides_intensities = peptides_intensities.unstack()\n", - "peptides_intensities.head()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_peptides = peptides_intensities\n", - "set(data_peptides.dtypes)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "set_peptides = set(data_peptides.columns)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- switch between list of proteins with any support and non\n", - " - set threshold of number of peptides per protein over all samples (some peptides uniquely matched to one protein in on sample is just noise -> check razor peptides)\n", - "- show support" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "peptides_2 = ('TTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGR',\n", - " 'LDLAGRDLTDYLMK')\n", - "\n", - "peptides_4 = (\"ILTERGYSFTTTAEREIVR\",\n", - " \"GYSFTTTAEREIVRDIK\",\n", - " \"EIVRDIKEK\",\n", - " \"DIKEKLCYVALDFEQEMATAASSSSLEK\")\n", - "peptides_4[:0:-1]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# logger.setLevel(logging.DEBUG)\n", - "COLORS = [\"\\033[32;2m\", \"\\033[32;1m\", \"0;34;47m\"]\n", - "\n", - "\n", - "def annotate_overlap(peptides):\n", - " i = len(peptides)\n", - " if i > 3:\n", - " raise ValueError(\"Two many peptides provided.\")\n", - " logging.debug(f\"First peptide: {peptides[0]} \")\n", - " base_peptide = peptides[0][::-1]\n", - " logging.debug(f\"Reversed pep: {base_peptide}\")\n", - " colored_part = \"\"\n", - " overlaps = []\n", - " logging.debug(peptides[:0:-1])\n", - " for pep in peptides[:0:-1]:\n", - "\n", - " 
logger.debug(f\"Find overlap for: {pep}\")\n", - " overlap = \"\"\n", - " overlap_in_last_step = False\n", - " for j, amino_acid in enumerate(pep):\n", - " overlap += amino_acid\n", - " if overlap[::-1] != base_peptide[:len(overlap)]:\n", - " overlap_now = False\n", - " else:\n", - " overlap_in_last_step = True\n", - " logger.debug(f\"Found overlap: {overlap}\")\n", - " if overlap_in_last_step and not overlap_now:\n", - " overlaps.append(overlap)\n", - " break\n", - " logger.debug(\n", - " f\"Search remaining peptide: {base_peptide[len(overlap)::]}\")\n", - " base_peptide = base_peptide[len(overlap)::]\n", - " overlaps.append(base_peptide[::-1])\n", - " return overlaps[::-1]\n", - "\n", - "\n", - "assert ''.join(annotate_overlap(peptides_2)\n", - " ) == \"TTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGR\"\n", - "# annotate_overlap(peptides_4) # should raise ValueError\n", - "assert ''.join(annotate_overlap(peptides_4[0:3])) == 'ILTERGYSFTTTAEREIVR'\n", - "assert ''.join(annotate_overlap(peptides_4[1:])) == 'GYSFTTTAEREIVRDIK'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pep_0missed = \"GYSFTTTAER\"\n", - "pep_1missed = [\"ILTERGYSFTTTAER\",\n", - " \"GYSFTTTAEREIVR\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from collections import defaultdict\n", - "import ipywidgets as w\n", - "from config import KEY_FASTA_HEADER, KEY_FASTA_SEQ, KEY_PEPTIDES, KEY_GENE_NAME_FASTA\n", - "\n", - "\n", - "pd.options.display.float_format = '{:,.1f}'.format\n", - "\n", - "TGREEN = \"\\033[32;2m\" # Green Text\n", - "TGREEN_2 = \"\\033[32;1m\" # Green Text\n", - "RESET = \"\\033[0;0m\"\n", - "\n", - "w_first_letter = w.Dropdown(\n", - " options=id_map[KEY_GENE_NAME_FASTA].str[0].unique())\n", - "\n", - "w_genes = w.Dropdown(\n", - " options=id_map.gene.loc[id_map[KEY_GENE_NAME_FASTA].str[0]\n", - " == w_first_letter.value].unique(),\n", - " 
value='ACTB'\n", - ")\n", - "\n", - "mask = id_map.gene == w_genes.value\n", - "selected = id_map.loc[mask, \"protein\"]\n", - "\n", - "\n", - "w_proteins_ids = w.Dropdown(options=selected.index)\n", - "w_protein = w.Dropdown(options=selected.unique())\n", - "\n", - "\n", - "def update_gene_list(first_letter):\n", - " \"\"\"Update proteins when new gene is selected\"\"\"\n", - " mask_selected_genes = id_map[KEY_GENE_NAME_FASTA].str[0] == w_first_letter.value\n", - " w_genes.options = id_map[KEY_GENE_NAME_FASTA].loc[mask_selected_genes].unique(\n", - " )\n", - "\n", - "\n", - "_ = w.interactive_output(update_gene_list, {\"first_letter\": w_first_letter})\n", - "\n", - "\n", - "def update_protein_list(gene):\n", - " mask = id_map[KEY_GENE_NAME_FASTA] == gene\n", - " selected = id_map.loc[mask, \"protein\"]\n", - " w_protein.options = selected.unique()\n", - "# w_proteins_ids.options = selected.loc[selected == w_protein.value].index\n", - "\n", - "\n", - "_ = w.interactive_output(update_protein_list, {\"gene\": w_genes})\n", - "\n", - "\n", - "def update_protein_id_list(protein):\n", - " \"\"\"Update isotope list when protein is selected\"\"\"\n", - " mask = id_map.protein == w_protein.value\n", - " selected = id_map.protein.loc[mask]\n", - " w_proteins_ids.options = selected.index\n", - "\n", - "\n", - "_ = w.interactive_output(update_protein_id_list, {'protein': w_protein})\n", - "\n", - "d_peptides_observed_prot_id = defaultdict(list)\n", - "\n", - "\n", - "def show_sequences(prot_id):\n", - " _data = data_fasta[prot_id]\n", - " print(f\"Protein_ID on Uniport: {prot_id}\")\n", - " print(f\"HEADER: {_data[KEY_FASTA_HEADER]}\")\n", - "# print(f\"Seq : {_data[KEY_FASTA_SEQ]}\")\n", - " annotate_seq = \"Peptides: \"\n", - " global d_peptides_observed_prot_id\n", - " for i, _l in enumerate(_data[KEY_PEPTIDES]):\n", - " annotate_seq += f\"\\nNo. 
of missed K or R: {i}\"\n", - " prot_seq_annotated = _data[KEY_FASTA_SEQ]\n", - " _change_color = False\n", - " for j, _pep in enumerate(_l):\n", - " if _pep in set_peptides:\n", - " d_peptides_observed_prot_id[prot_id].append(_pep)\n", - " if _change_color is False:\n", - " _pep_in_green = TGREEN + f\"{_pep}\" + RESET\n", - " _change_color = True\n", - " else:\n", - " _pep_in_green = TGREEN_2 + f\"{_pep}\" + RESET\n", - " _change_color = False\n", - " prot_seq_annotated = prot_seq_annotated.replace(\n", - " _pep, _pep_in_green)\n", - " _pep = _pep_in_green\n", - " else:\n", - " _change_color = False\n", - " if j == 0:\n", - " annotate_seq += \"\\n\\t\"\n", - " else:\n", - " annotate_seq += \",\\n\\t\"\n", - " annotate_seq += _pep\n", - "\n", - " print(f\"Seq {i}: {prot_seq_annotated}\")\n", - " print(annotate_seq)\n", - "\n", - " _ = data_peptides[d_peptides_observed_prot_id[prot_id]].dropna(how='all')\n", - " if _.columns.size > 2:\n", - " display(_)\n", - " display(_.describe())\n", - " else:\n", - " print(\"\\nNo empirical evidence for protein\")\n", - "\n", - "\n", - "w_out = w.interactive_output(show_sequences, {\"prot_id\": w_proteins_ids})\n", - "\n", - "label_first_letter = w.Label(value='First letter of Gene')\n", - "label_genes = w.Label('Gene')\n", - "label_protein = w.Label('Protein')\n", - "label_proteins_ids = w.Label('Protein Isotopes')\n", - "\n", - "panel_levels = w.VBox([\n", - " w.HBox([\n", - " w.VBox([label_first_letter, w_first_letter]),\n", - " w.VBox([label_genes, w_genes]),\n", - " w.VBox([label_protein, w_protein]),\n", - " w.VBox([label_proteins_ids, w_proteins_ids])\n", - " ]),\n", - " w_out]\n", - ")\n", - "panel_levels" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> create styler object?\n", - "\n", - "- [ ] replace zeros with NaN\n", - "- [ ] display summary statistics on log-scale (but do not compute summary based on log-scale)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - 
"Get meta-data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "query_template = \"https://www.uniprot.org/uniprot/?query=accession:{prot_id}&format=txt\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- relatively short peptides resulting from one missed cleaveage, do not appear in the upper part." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- `gene` `->` `Protein_ID` (contains information of `gene` `->` `protein_isotopes`\n", - "- `protein_ID` `->` `sequences` (`FN_FASTA_DB`)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pickle\n", - "from tqdm.notebook import tqdm\n", - "from config import FN_PROTEIN_SUPPORT_MAP, FN_PROTEIN_SUPPORT_FREQ\n", - "# from vaep.utils import sample_iterable\n", - "\n", - "try:\n", - " if (time.time() - os.path.getmtime(FN_PROTEIN_SUPPORT_MAP)) / 3600 / 24 > 7:\n", - " # recompute file every week\n", - " raise FileNotFoundError\n", - " df_protein_support = pd.read_pickle(FN_PROTEIN_SUPPORT_MAP)\n", - " with open(FN_PROTEIN_SUPPORT_FREQ, 'rb') as f:\n", - " d_protein_support_freq = pickle.load(f)\n", - "except FileNotFoundError:\n", - " d_protein_support = {}\n", - " d_protein_support_freq = {}\n", - " for prot_id in tqdm(data_fasta.keys()):\n", - " _data = data_fasta[prot_id]\n", - " peptides_measured = []\n", - " for i, _l in enumerate(_data[KEY_PEPTIDES]):\n", - " for _pep in _l:\n", - " if _pep in set_peptides:\n", - " peptides_measured.append(_pep)\n", - " _d_protein_support = {}\n", - " _df_support_protein = data_peptides[peptides_measured].dropna(\n", - " how='all')\n", - "\n", - " _n_samples = len(_df_support_protein)\n", - " if _n_samples > 0:\n", - " _d_protein_support['N_samples'] = _n_samples\n", - " d_protein_support_freq[prot_id] = _df_support_protein.notna(\n", - " ).sum().to_dict()\n", - " d_protein_support[prot_id] = 
_d_protein_support\n", - " else:\n", - " d_protein_support[prot_id] = None\n", - "\n", - " df_protein_support = pd.DataFrame(d_protein_support).T.dropna()\n", - " df_protein_support = df_protein_support.join(id_map)\n", - " df_protein_support.to_pickle(FN_PROTEIN_SUPPORT_MAP)\n", - "\n", - " with open(FN_PROTEIN_SUPPORT_FREQ, 'wb') as f:\n", - " pickle.dump(d_protein_support_freq, f)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "l_proteins_good_support = df_protein_support.sort_values(\n", - " by='N_samples').tail(100).index.to_list()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "d_protein_support_freq['I3L3I0']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Connect to experimental peptide data\n", - "\n", - "Check if counts by `data_fasta`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from tqdm.notebook import tqdm\n", - "\n", - "counts_observed_by_missed_cleavages = {}\n", - "for _protein_id, _data in tqdm(data_fasta.items()):\n", - " _peptides = _data[KEY_PEPTIDES]\n", - " _counts = {}\n", - " for i, _l in enumerate(_peptides):\n", - " _counts[i] = 0\n", - " for _pep in _l:\n", - " if _pep in set_peptides:\n", - " _counts[i] += 1\n", - " counts_observed_by_missed_cleavages[_protein_id] = _counts" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_counts_observed_by_missed_cleavages = pd.DataFrame(\n", - " counts_observed_by_missed_cleavages\n", - ").T" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "from matplotlib import table\n", - "\n", - "fig, axes = plt.subplots(ncols=2, gridspec_kw={\"width_ratios\": [\n", - " 5, 1], \"wspace\": 0.2}, figsize=(10, 
4))\n", - "\n", - "_counts_summed = df_counts_observed_by_missed_cleavages.sum()\n", - "_counts_summed.name = \"frequency\"\n", - "\n", - "ax = axes[0]\n", - "_ = _counts_summed.plot(kind=\"bar\", ax=ax)\n", - "ax.set_xlabel(\"peptides from n miscleavages\")\n", - "ax.set_ylabel(\"frequency\")\n", - "\n", - "ax = axes[1]\n", - "ax.axis(\"off\")\n", - "_ = pd.plotting.table(ax=ax, data=_counts_summed,\n", - " loc=\"best\", colWidths=[1], edges='open')\n", - "_ = fig.suptitle('Peptides frequencies')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "These are unnormalized counts in the meaning of that _razor_ peptides are counted as often as they are matched." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mask = df_counts_observed_by_missed_cleavages != 0\n", - "df_prot_observed = df_counts_observed_by_missed_cleavages.replace(0, pd.NA)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_prot_observed = df_prot_observed.dropna(axis=0, how=\"all\")\n", - "df_prot_observed = df_prot_observed.fillna(0)\n", - "df_prot_observed = df_prot_observed.convert_dtypes()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from vaep.pandas import combine_value_counts\n", - "\n", - "combine_value_counts(df_prot_observed)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "freq_pep_mapped_to_protID = df_prot_observed.sum(axis=1).value_counts()\n", - "freq_pep_mapped_to_protID = freq_pep_mapped_to_protID.sort_index()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "freq_pep_mapped_to_protID" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Genes with support in data\n", - "\n", - "try software to identify 
the _most likely_ protein. [PyOpenMS](https://pyopenms.readthedocs.io/en/latest/) or [Pyteomics](https://pyteomics.readthedocs.io/en/latest/)? " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Imputation: Train model\n", - "\n", - "> Select Gene or Protein\n", - "\n", - "As the samples are all obtained from the same biological sample (in principal), the single run should somehow be comparable.\n", - "An description of variablity (from the Data Scientist perspective) can highlight some commenly known facts about proteomics experiments:\n", - " - batch effects: Measurements on consecutive days are have to be normalized to each other\n", - " - scoring: PSM are assigned to a peptide based on a score. Small variations can lead to different assignments\n", - " \n", - "Can a complex representation of a sample level out experimental variation on an in principle comparable data. 
\n", - "\n", - "### Strategy\n", - "- first start using peptides from single Protein_IDs\n", - "- then move to all models from genes\n", - "- explore structure" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "d_peptides_observed_prot_id" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_peptides.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from config import PROTEIN_DUMPS\n", - "from vaep.transform import log\n", - "import vaep\n", - "w_select_proteins_good_support = w.Dropdown(options=l_proteins_good_support)\n", - "w_select_proteins_queried = w.Dropdown(\n", - " options=list(d_peptides_observed_prot_id.keys()))\n", - "\n", - "# select from top100 or above\n", - "\n", - "\n", - "def main_trigger(prot_id):\n", - " \"\"\"Explore protein data\n", - "\n", - " Global Variables used\n", - " ---------------------\n", - " data_peptides : pandas.DataFrame\n", - " id_map : pandas.DataFrame\n", - " d_peptides_observed_prot_id: dict\n", - "\n", - "\n", - " Global variables set\n", - " --------------------\n", - " peptides_selected_log10: pandas.DataFrame\n", - " Current selection of data for protein_id. All possible features are returned. 
log10 transformed\n", - " prod_id : str\n", - " Passed prot_id to function exposed globally\n", - " \"\"\"\n", - " print(f'Protein Identifier: {prot_id}')\n", - " # Select gene name, based on selected FASTA-File\n", - " _gene_name = id_map.loc[prot_id, KEY_GENE_NAME_FASTA]\n", - " # Protein Name summarized several UNIPROT isotopes (PROT, PROT_2, PROT_3, etc)\n", - " _protein = id_map.protein.loc[prot_id]\n", - " print(f'Gene Identifier {_gene_name}')\n", - " # configure viewer above\n", - " w_first_letter.value = _gene_name[0]\n", - " w_genes.value = _gene_name\n", - " w_protein.value = _protein\n", - " w_proteins_ids.value = prot_id\n", - "\n", - " # get observed peptides according to pre-computed dictionary\n", - " peptides_measured = d_peptides_observed_prot_id[prot_id]\n", - " n_peptides_in_selection = len(peptides_measured)\n", - " print(\n", - " f\"Found {n_peptides_in_selection} peptides measured of this protein.\\n\\n\")\n", - "\n", - " # select subsample (as view) of peptides\n", - " peptides_selected = data_peptides[peptides_measured]\n", - " mask_selected_notna = data_peptides[peptides_measured].notna()\n", - " selected_notna_summed_ax1 = mask_selected_notna.sum(axis=1)\n", - " print(\"How many samples have how many peptides quantified?\")\n", - " for n_peptides, n_samples in selected_notna_summed_ax1.value_counts().sort_index().tail(10).items():\n", - " print(f\"In {n_samples:5} samples are {n_peptides:5} peptides measured.\")\n", - "\n", - " PROP_DATA_COMPLETENESS = 0.5\n", - " mask_samples_selected = selected_notna_summed_ax1 >= int(\n", - " n_peptides_in_selection * PROP_DATA_COMPLETENESS)\n", - " print(f\"\\nUsing a share of at least {PROP_DATA_COMPLETENESS}, \"\n", - " f\"i.e. 
at least {int(n_peptides_in_selection * PROP_DATA_COMPLETENESS)} out of {n_peptides_in_selection}.\",\n", - " f\"In total {mask_samples_selected.sum()} samples are selected for further analysis.\", sep=\"\\n\")\n", - " # from IPython.core.debugger import set_trace; set_trace()\n", - " _ = peptides_selected.loc[mask_samples_selected, peptides_measured]\n", - " _.index.name = f\"protein_id {prot_id}\"\n", - " # _.to_json(PROTEIN_DUMPS / f\"{prot_id}.json\")\n", - "\n", - " display(_)\n", - " # display(_.describe())\n", - " global peptides_selected_log10\n", - " peptides_selected_log10 = _.apply(log) # selected in widget overview above\n", - " display(peptides_selected_log10)\n", - " display(peptides_selected_log10.describe())\n", - " global prot_last\n", - " prot_last = prot_id\n", - "\n", - "\n", - "w.VBox([\n", - " w.HBox(\n", - " [\n", - " w.VBox(\n", - " [\n", - " w.Label(\n", - " f\"Top {len(l_proteins_good_support)} covered proteins\"),\n", - " w_select_proteins_good_support,\n", - " ]\n", - " ),\n", - " w.VBox([w.Label(\"Queried proteins from above\"),\n", - " w_select_proteins_queried]),\n", - " ]\n", - " ),\n", - " w.interactive_output(\n", - " main_trigger, {\"prot_id\": w_select_proteins_good_support})\n", - "])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Idea: Select a protein which leads to training. Each selection will create a dump of the selected data, which can be used in the `XZY.ipynb` for model fine-tuning. 
( A model per protein/gene?)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "vaep", - "language": "python", - "name": "vaep" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/project/misc_pytorch_fastai_dataloaders.ipynb b/project/misc_pytorch_fastai_dataloaders.ipynb index fe025a473..71ab4b573 100644 --- a/project/misc_pytorch_fastai_dataloaders.ipynb +++ b/project/misc_pytorch_fastai_dataloaders.ipynb @@ -62,7 +62,8 @@ " def setups(self, to):\n", " store_attr(but='to', na_dict={n:self.fill_strategy(to[n], self.fill_vals[n])\n", " for n in to.conts.keys()})\n", - " self.fill_strategy = self.fill_strategy.__name__\n" + " self.fill_strategy = self.fill_strategy.__name__\n", + "\n" ] }, { @@ -97,11 +98,11 @@ "X_test = create_df(int(N*0.1), M, prop_na=.1, start_idx=len(X))\n", "\n", "data = DataSplits(train_X=X.loc[X.index.difference(idx_val)],\n", - " val_X=X.loc[idx_val],\n", - " test_X=X_test,\n", + " val_y=X.loc[idx_val],\n", + " test_y=X_test,\n", " is_wide_format=True)\n", "\n", - "data.val_X.loc[data.val_X.isna().any(axis=1), data.val_X.isna().any(axis=0)]" + "data.val_y.loc[data.val_y.isna().any(axis=1), data.val_y.isna().any(axis=0)]" ] }, { @@ -167,9 +168,9 @@ "metadata": {}, "outputs": [], "source": [ - "X = data.train_X.append(data.val_X)\n", + "X = data.train_X.append(data.val_y)\n", "\n", - "splits = X.index.get_indexer(data.val_X.index) # In Tabular iloc is used, not loc for splitting\n", + "splits = X.index.get_indexer(data.val_y.index) # In Tabular iloc is used, not loc for splitting\n", "splits = IndexSplitter(splits)(X) # splits is are to list of integer 
indicies (for iloc)\n", " \n", "procs = [Normalize, FillMissingKeepAll]\n", @@ -304,8 +305,8 @@ "metadata": {}, "outputs": [], "source": [ - "# test_ds = TabularPandas(data.test_X, cont_names=data.test_X.columns.to_list())\n", - "dl_test = dls.test_dl(data.test_X.copy())\n", + "# test_ds = TabularPandas(data.test_y, cont_names=data.test_y.columns.to_list())\n", + "dl_test = dls.test_dl(data.test_y.copy())\n", "dl_test.xs.head()" ] }, @@ -334,7 +335,7 @@ "metadata": {}, "outputs": [], "source": [ - "to_test = TabularPandas(data.test_X.copy(), procs=None, cont_names=data.test_X.columns.to_list(), splits=None, do_setup=True)\n", + "to_test = TabularPandas(data.test_y.copy(), procs=None, cont_names=data.test_y.columns.to_list(), splits=None, do_setup=True)\n", "_ = procs(to_test) # inplace operation\n", "to_test.items.head()" ] @@ -346,7 +347,7 @@ "metadata": {}, "outputs": [], "source": [ - "data.test_X.head()" + "data.test_y.head()" ] }, { @@ -422,7 +423,7 @@ "outputs": [], "source": [ "train_ds = DatasetWithMaskAndNoTarget(df=data.train_X)\n", - "valid_ds = DatasetWithMaskAndNoTarget(df=data.val_X)\n", + "valid_ds = DatasetWithMaskAndNoTarget(df=data.val_y)\n", "train_ds[-1]" ] }, @@ -481,7 +482,7 @@ " \n", "o_tf_norm = Normalize()\n", "o_tf_norm.setup(data.train_X)\n", - "o_tf_norm(data.val_X.head()) # apply this manueally to each dataset" + "o_tf_norm(data.val_y.head()) # apply this manueally to each dataset" ] }, { @@ -502,7 +503,7 @@ "outputs": [], "source": [ "train_ds = DatasetWithMaskAndNoTarget(df=o_tf_norm(data.train_X))\n", - "valid_ds = DatasetWithMaskAndNoTarget(df=o_tf_norm(data.val_X))\n", + "valid_ds = DatasetWithMaskAndNoTarget(df=o_tf_norm(data.val_y))\n", "\n", "dls = DataLoaders.from_dsets(\n", " train_ds,\n", @@ -526,9 +527,9 @@ "\n", "assert (dls.valid.one_batch()[1] < 0.0).any(), \"Normalization did not work.\"\n", "with pytest.raises(AttributeError):\n", - " DatasetWithMaskAndNoTarget(df=data.val_X, transformer=o_tf_norm)\n", + " 
DatasetWithMaskAndNoTarget(df=data.val_y, transformer=o_tf_norm)\n", " \n", - "# assert_array_almost_equal(DatasetWithMaskAndNoTarget(df=data.val_X, transformer=o_tf_norm)[0][1], DatasetWithMaskAndNoTarget(df=o_tf_norm(data.val_X))[0][1])\n", + "# assert_array_almost_equal(DatasetWithMaskAndNoTarget(df=data.val_y, transformer=o_tf_norm)[0][1], DatasetWithMaskAndNoTarget(df=o_tf_norm(data.val_y))[0][1])\n", "# with pytest.raises(AttributeError):\n", "# valid_ds.inverse_transform(dls.valid.one_batch()[1])" ] @@ -576,7 +577,7 @@ "metadata": {}, "outputs": [], "source": [ - "valid_ds = DatasetWithMaskAndNoTarget(data.val_X, dae_transforms)\n", + "valid_ds = DatasetWithMaskAndNoTarget(data.val_y, dae_transforms)\n", "valid_ds[:4]" ] }, @@ -588,7 +589,7 @@ "outputs": [], "source": [ "from vaep.io.dataloaders import get_dls\n", - "dls = get_dls(data.train_X, data.val_X, dae_transforms, bs=4) \n", + "dls = get_dls(data.train_X, data.val_y, dae_transforms, bs=4) \n", "dls.valid.one_batch()" ] }, @@ -600,7 +601,7 @@ "outputs": [], "source": [ "test_dl = DataLoader(\n", - " dataset=DatasetWithMaskAndNoTarget(data.test_X, dae_transforms),\n", + " dataset=DatasetWithMaskAndNoTarget(data.test_y, dae_transforms),\n", " shuffle=False,\n", " bs=4)\n", "test_dl.one_batch()" @@ -623,7 +624,7 @@ "metadata": {}, "outputs": [], "source": [ - "data.test_X.head(4)" + "data.test_y.head(4)" ] }, { @@ -671,8 +672,8 @@ " range(len(data.train_X)),\n", " DatasetTransform(data.train_X))\n", "valid_tl = TfmdLists(\n", - " range(len(data.val_X)),\n", - " DatasetTransform(data.val_X))\n", + " range(len(data.val_y)),\n", + " DatasetTransform(data.val_y))\n", "\n", "dls = DataLoaders.from_dsets(train_tl, valid_tl,\n", "# after_item=[Normalize],\n", @@ -707,7 +708,7 @@ "_transform_fct = scaler.transform\n", "\n", "train_ds = DatasetWithMaskAndNoTarget(df=_transform_fct(data.train_X))\n", - "valid_ds = DatasetWithMaskAndNoTarget(df=_transform_fct(data.val_X))\n", + "valid_ds = 
DatasetWithMaskAndNoTarget(df=_transform_fct(data.val_y))\n", "\n", "dls = DataLoaders.from_dsets(train_ds, valid_ds,\n", " bs=4)\n", @@ -733,9 +734,9 @@ ], "metadata": { "kernelspec": { - "display_name": "vaep", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "vaep" + "name": "python3" }, "language_info": { "codemirror_mode": { diff --git a/project/misc_pytorch_fastai_dataloaders.py b/project/misc_pytorch_fastai_dataloaders.py new file mode 100644 index 000000000..0dddfeef6 --- /dev/null +++ b/project/misc_pytorch_fastai_dataloaders.py @@ -0,0 +1,416 @@ +# --- +# jupyter: +# jupytext: +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.15.2 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # `DataLoaders` for feeding data into models + +# %% +import numpy as np +import pandas as pd + +import fastai +from fastai.tabular.core import Normalize +from fastai.tabular.core import FillMissing +from fastai.tabular.core import TabularPandas +from fastai.tabular.core import IndexSplitter +# make DataLoaders.test_dl work for DataFrames as test_items: + +# from fastai.tabular.all import * +from fastai.tabular.all import TabularDataLoaders +from fastcore.transform import Pipeline + +import torch + +from vaep.logging import setup_nb_logger +setup_nb_logger() + +from vaep.io.datasplits import DataSplits +from vaep.io.datasets import DatasetWithMaskAndNoTarget, to_tensor +from vaep.transform import VaepPipeline +from vaep.models import ae +from vaep.utils import create_random_df + +np.random.seed(42) +print(f"fastai version: {fastai.__version__}") +print(f"torch version: {torch.__version__}") + +# %% +from fastcore.transform import Pipeline + +from fastcore.basics import store_attr +class FillMissingKeepAll(FillMissing): + """Replacement for `FillMissing` including also non-missing features + in the training data which might 
be missing in the validation or test data. + """ + def setups(self, to): + store_attr(but='to', na_dict={n:self.fill_strategy(to[n], self.fill_vals[n]) + for n in to.conts.keys()}) + self.fill_strategy = self.fill_strategy.__name__ + + + +# %% [markdown] +# Create data +# +# - train data without missings +# - validation and test data with missings +# +# Could be adapted to have more or less missing in training, validation or test data. Choosen as in current version the validation data cannot contain features with missing values which were not missing in the training data. + +# %% +N, M = 150, 15 + +create_df = create_random_df + +X = create_df(N, M) +X = X.append(create_df(int(N*0.3), M, prop_na=.1, start_idx=len(X))) + +idx_val = X.index[N:] # RandomSplitter could be used, but used to show IndexSplitter usage with Tabular + +X_test = create_df(int(N*0.1), M, prop_na=.1, start_idx=len(X)) + +data = DataSplits(train_X=X.loc[X.index.difference(idx_val)], + val_y=X.loc[idx_val], + test_y=X_test, + is_wide_format=True) + +data.val_y.loc[data.val_y.isna().any(axis=1), data.val_y.isna().any(axis=0)] + +# %% [markdown] +# ## Collab + +# %% + +# %% [markdown] +# ## Denoising Autoencoder + +# %% [markdown] +# ### DataSet `Tabular` +# +# - `fastai.tabular.core.Tabular` +# +# +# Adding procs / transforms manually +# +# ```python +# cont_names = list(splits.train_X.columns) +# to = TabularPandas(splits.train_X, cont_names=cont_names, do_setup=False) +# +# tf_norm = NORMALIZER() +# tf_fillna = FillMissing(add_col=True) +# +# _ = tf_norm.setups(to) # returns to +# _ = tf_fillna.setup(to) +# ``` +# +# No added in a manuel pipeline. See [opened issue](https://github.com/fastai/fastai/issues/3530) on `Tabular` behaviour. +# Setting transformation (procs) in the constructor is somehow not persistent, although very similar code is called. 
+# +# ``` +# # not entirely empty, but to.procs.fs needs to be populated +# type(to.procs), to.procs.fs # __call__, setup, decode, fs +# ``` + +# %% +X = data.train_X.append(data.val_y) + +splits = X.index.get_indexer(data.val_y.index) # In Tabular iloc is used, not loc for splitting +splits = IndexSplitter(splits)(X) # splits is are to list of integer indicies (for iloc) + +procs = [Normalize, FillMissingKeepAll] + +to = TabularPandas(X, procs=procs, cont_names=X.columns.to_list(), splits=splits) # to = tabular object + +print("Tabular object:", type(to)) +to.items.head() + +# %% [markdown] +# Test data with procs + +# %% +procs = to.procs +procs.fs + +# %% [markdown] +# Let's format this to see what it does +# +# ```python +# # (#2) +# [ +# FillMissingKeepAll -- +# {'fill_strategy': , +# 'add_col': True, +# 'fill_vals': defaultdict(, {'feat_00': 0, 'feat_01': 0, 'feat_02': 0, ..., 'feat_14': 13.972452} +# }: +# encodes: (object,object) -> encodes +# decodes: , +# Normalize -- +# {'mean': None, 'std': None, 'axes': (0, 2, 3), +# 'means': {'feat_00': 14.982738, 'feat_01': 13.158741, 'feat_02': 14.800485, ..., 'feat_14': 8.372757} +# }: +# encodes: (TensorImage,object) -> encodes +# (Tabular,object) -> encodes +# decodes: (TensorImage,object) -> decodes +# (Tabular,object) -> decodes +# ] +# +# ``` + +# %% +procs + +# %% +# Check behaviour +procs.encodes + +# %% [markdown] +# #### DataLoader + +# %% +dls = to.dataloaders(bs=4) +dls.show_batch() + +# %% +dls.one_batch() + +# %% +[x.dtype for x in dls.one_batch()] + +# %% [markdown] +# #### transfrom test data using `DataLoaders.test_dl` + +# %% +# test_ds = TabularPandas(data.test_y, cont_names=data.test_y.columns.to_list()) +dl_test = dls.test_dl(data.test_y.copy()) +dl_test.xs.head() + +# %% +dl_test.show_batch() + +# %% [markdown] +# #### Transform test data manuelly + +# %% +to_test = TabularPandas(data.test_y.copy(), procs=None, cont_names=data.test_y.columns.to_list(), splits=None, do_setup=True) +_ = 
procs(to_test) # inplace operation +to_test.items.head() + +# %% +data.test_y.head() + +# %% [markdown] +# #### Feeding one batch to the model + +# %% +cats, conts, ys = dls.one_batch() + +# %% +model = ae.Autoencoder(n_features=M, n_neurons=int( + M/2), last_decoder_activation=None, dim_latent=10) +model + +# %% [markdown] +# The forward pass just uses the conts features + +# %% +model(conts) + +# %% [markdown] +# #### target +# - missing puzzle piece is to have a `callable` y-block which transforms part of the input. In principle it could be the same as the continous features + +# %% [markdown] +# ### PyTorch Dataset + +# %% +train_ds = DatasetWithMaskAndNoTarget(df=data.train_X) +valid_ds = DatasetWithMaskAndNoTarget(df=data.val_y) +train_ds[-1] + +# %% [markdown] +# #### DataLoaders + +# %% +from fastai.data.core import DataLoaders + +dls = DataLoaders.from_dsets(train_ds, valid_ds, + bs=4) + +dls.valid.one_batch() + +# %% [markdown] +# #### DataLoaders with Normalization fastai Transform + +# %% +from fastai.tabular.all import * +class Normalize(Transform): + def setup(self, array): + self.mean = array.mean() # this assumes tensor, numpy arrays and alike + # should be applied along axis 0 (over the samples) + self.std = array.std() # ddof=0 in scikit-learn + + def encodes(self, x): # -> torch.Tensor: # with type annotation this throws an error + x_enc = (x - self.mean) / self.std + return x_enc + + def decodes(self, x_enc:torch.tensor) -> torch.Tensor: + x = (self.std * x_enc) + self.mean + return x + +o_tf_norm = Normalize() +o_tf_norm.setup(data.train_X) +o_tf_norm(data.val_y.head()) # apply this manueally to each dataset + +# %% +o_tf_norm.encodes # object= everything + +# %% +train_ds = DatasetWithMaskAndNoTarget(df=o_tf_norm(data.train_X)) +valid_ds = DatasetWithMaskAndNoTarget(df=o_tf_norm(data.val_y)) + +dls = DataLoaders.from_dsets( + train_ds, + valid_ds, + # tfms=[o_tf_norm], + # after_batch=[o_tf_norm], + bs=4) + +dls.valid.one_batch() + +# %% 
+import pytest +from numpy.testing import assert_array_almost_equal, assert_array_less + +assert (dls.valid.one_batch()[1] < 0.0).any(), "Normalization did not work." +with pytest.raises(AttributeError): + DatasetWithMaskAndNoTarget(df=data.val_y, transformer=o_tf_norm) + +# assert_array_almost_equal(DatasetWithMaskAndNoTarget(df=data.val_y, transformer=o_tf_norm)[0][1], DatasetWithMaskAndNoTarget(df=o_tf_norm(data.val_y))[0][1]) +# with pytest.raises(AttributeError): +# valid_ds.inverse_transform(dls.valid.one_batch()[1]) + +# %% [markdown] +# #### DataLoaders with Normalization sklearn transform +# +# - solve transformation problem by composition +# - inverse transform only used for + +# %% +import sklearn +# from sklearn import preprocessing +from sklearn.impute import SimpleImputer +from sklearn.preprocessing import StandardScaler + +import vaep +# import importlib; importlib.reload(vaep); importlib.reload(vaep.transform) + +dae_default_pipeline = sklearn.pipeline.Pipeline( + [ + ('normalize', StandardScaler()), + ('impute', SimpleImputer(add_indicator=False)) + ]) +# new procs, transform equal encode, inverse_transform equals decode +dae_transforms = VaepPipeline( + df_train=data.train_X, encode=dae_default_pipeline, decode=['normalize']) + +# %% +valid_ds = DatasetWithMaskAndNoTarget(data.val_y, dae_transforms) +valid_ds[:4] + +# %% +from vaep.io.dataloaders import get_dls +dls = get_dls(data.train_X, data.val_y, dae_transforms, bs=4) +dls.valid.one_batch() + +# %% +test_dl = DataLoader( + dataset=DatasetWithMaskAndNoTarget(data.test_y, dae_transforms), + shuffle=False, + bs=4) +test_dl.one_batch() + +# %% +dae_transforms.inverse_transform(test_dl.one_batch()[1]) # here the missings are not replaced + +# %% +data.test_y.head(4) + +# %% [markdown] +# ### FastAi Transfrom (as Dataset) +# +# - adding `Transforms` not possible, I openend a [discussion](https://forums.fast.ai/t/correct-output-type-for-tensor-created-from-dataframe-custom-new-task-tutorial/92564) + 
+# %% +from typing import Tuple +from fastai.tabular.all import * +# from fastai.torch_core import TensorBase + + +class DatasetTransform(Transform): + def __init__(self, df: pd.DataFrame): + if not issubclass(type(df), pd.DataFrame): + raise ValueError( + f'please pass a pandas DataFrame, not: {type(df) = }') + self.mask_obs = df.isna() # .astype('uint8') # in case 0,1 is preferred + self.data = df + + def encodes(self, idx): # -> Tuple[torch.Tensor, torch.Tensor]: # annotation is interpreted + mask = self.mask_obs.iloc[idx] + data = self.data.iloc[idx] + # return (self.to_tensor(mask), self.to_tensor(data)) + # return (Tensor(mask), Tensor(data)) + return (tensor(data), tensor(mask)) #TabData, TabMask + + def to_tensor(self, s: pd.Series) -> torch.Tensor: + return torch.from_numpy(s.values) + + +train_tl = TfmdLists( + range(len(data.train_X)), + DatasetTransform(data.train_X)) +valid_tl = TfmdLists( + range(len(data.val_y)), + DatasetTransform(data.val_y)) + +dls = DataLoaders.from_dsets(train_tl, valid_tl, +# after_item=[Normalize], +# after_batch=[Normalize], + bs=4) +print(f"\n{DatasetTransform.encodes = }") +dls.one_batch() + +# %% [markdown] +# ## Variational Autoencoder + +# %% +from vaep.transform import MinMaxScaler + +args_vae = {} +args_vae['SCALER'] = MinMaxScaler +# select initial data: transformed vs not log transformed +scaler = args_vae['SCALER']().fit(data.train_X) + +_transform_fct = scaler.transform + +train_ds = DatasetWithMaskAndNoTarget(df=_transform_fct(data.train_X)) +valid_ds = DatasetWithMaskAndNoTarget(df=_transform_fct(data.val_y)) + +dls = DataLoaders.from_dsets(train_ds, valid_ds, + bs=4) +dls.one_batch() + +# %% [markdown] +# ## FastAi version + +# %% diff --git a/project/misc_pytorch_fastai_dataset.ipynb b/project/misc_pytorch_fastai_dataset.ipynb index c7404c1b3..622f2ac4d 100644 --- a/project/misc_pytorch_fastai_dataset.ipynb +++ b/project/misc_pytorch_fastai_dataset.ipynb @@ -251,9 +251,9 @@ "from vaep.io.datasplits import 
long_format\n", "\n", "\n", - "\n", + "data = pd.DataFrame(data)\n", + "data.index.name, data.columns.name = ('Sample ID', 'peptide')\n", "df_long = long_format(pd.DataFrame(data))\n", - "df_long.index.names = ('Sample ID', 'peptide')\n", "df_long.reset_index(inplace=True)\n", "df_long.head()" ] @@ -397,10 +397,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [], "source": [ - "next(iter(dls.dataset))\n" + "next(iter(dls.dataset))" ] }, { @@ -416,9 +418,9 @@ "hash": "ca718f398b3a596c3df6787ca2afa269ec54c58eb9478d66aeb41db8e6cb8262" }, "kernelspec": { - "display_name": "vaep", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "vaep" + "name": "python3" }, "language_info": { "codemirror_mode": { diff --git a/project/misc_pytorch_fastai_dataset.py b/project/misc_pytorch_fastai_dataset.py new file mode 100644 index 000000000..fb9317e17 --- /dev/null +++ b/project/misc_pytorch_fastai_dataset.py @@ -0,0 +1,211 @@ +# --- +# jupyter: +# jupytext: +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.15.2 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Datasets +# +# Datasets are `Iterable` (through their `__getitem__` and `__len__` attribute). +# Datasets are provided to `DataLoaders` which perform the aggreation to batches. 
+ +# %% +import random +import numpy as np +import pandas as pd +import vaep.io.datasets as datasets +import vaep.utils as test_data + +# %% +N, M = 15, 7 +data = test_data.create_random_missing_data(N, M, prop_missing=.4) + +# %% [markdown] +# ## Datasets +# +# - `PeptideDatasetInMemory` +# - `PeptideDatasetInMemoryMasked` +# - `PeptideDatasetInMemoryNoMissings` + +# %% [markdown] +# ## `DatasetWithMaskAndNoTarget` + +# %% +dataset = datasets.DatasetWithMaskAndNoTarget(df=pd.DataFrame(data)) +for _mask, _array in dataset: + break +_array, _mask + +# %% [markdown] +# ### `PeptideDatasetInMemory` +# +# - with duplicated target in memory + +# %% +dataset = datasets.PeptideDatasetInMemory(data) +for _array, _mask, _target in dataset: + break +_array, _mask, _target + +# %% +id(_array), id(_mask), id(_target) + +# %% +_array is _target # should be true + +# %% +data = test_data.create_random_missing_data(N, M, prop_missing=0.3) +dataset = datasets.PeptideDatasetInMemoryMasked(df=pd.DataFrame(data), fill_na=25.0) + +for _array, _mask in dataset: + if any(_mask): + print(_array, _mask) + break + +# %% [markdown] +# ### `DatasetWithTarget` + +# %% +data = test_data.create_random_missing_data(N, M, prop_missing=0.3) +dataset = datasets.DatasetWithTarget(df=pd.DataFrame(data)) + +for _mask, _array, target in dataset: + if any(_mask): + print(_array, _mask, target, sep='\n') + break + +# %% [markdown] +# ### `DatasetWithTargetSpecifyTarget` + +# %% +data = test_data.create_random_missing_data(N, M, prop_missing=0.2) + +df = pd.DataFrame(data) + +val_y = df.stack().groupby(level=0).sample(frac=0.2) +# targets = val_y.unstack().sort_index() +targets = val_y.unstack() + +df[targets.notna()] = pd.NA +df + +# %% [markdown] +# The targets are complementary + +# %% +targets + +# %% +dataset = datasets.DatasetWithTargetSpecifyTarget(df=df, targets=targets) +for _mask, _array, target in dataset: + if any(_mask): + print(_mask, _array, target, sep='\n') + break + +# %% +row = 
random.randint(0,len(dataset)-1) +print(f"{row = }") +dataset[row] + +# %% [markdown] +# ### `PeptideDatasetInMemoryNoMissings` + +# %% +# data and pd.DataFrame.data share the same memory +try: + dataset = datasets.PeptideDatasetInMemoryNoMissings(data) + for _array in dataset: + print(_array) + break +except AssertionError as e: + print(e) + +# %% [markdown] +# ## DataLoaders +# +# FastAI DataLoaders accept pytorch datasets + +# %% +from fastai.collab import CollabDataLoaders +# , MSELossFlat, Learner +# from fastai.collab import EmbeddingDotBias + +from vaep.io.datasplits import long_format + + +data = pd.DataFrame(data) +data.index.name, data.columns.name = ('Sample ID', 'peptide') +df_long = long_format(pd.DataFrame(data)) +df_long.reset_index(inplace=True) +df_long.head() + +# %% +dls = CollabDataLoaders.from_df(df_long, valid_pct=0.15, + user_name='Sample ID', item_name='peptide', rating_name='intensity', + bs=4) +type(dls.dataset), dls.dataset._dl_type # no __mro__? + +# %% [markdown] +# Iterating over the dataset gives the column names + +# %% +for x in dls.dataset: + print(x) + +# %% [markdown] +# Training DataFrame is hidden under items + +# %% +dls.dataset.items + +# %% +for x in dls.train_ds: + print(x) + break + +# %% +dls.train_ds + +# %% [markdown] +# Iterating over the dataset returns columns, not single rows + +# %% +# dls.train_ds.__getitem__?? + +# %% +dls.train_ds.items['Sample ID'] + +# %% [markdown] +# But the `DataLoader` return the numeric representation in batches: + +# %% +for batch in dls.train_ds: + break +batch + +# %% +# dls.train.__iter__?? + +# %% +from torch.utils.data.dataloader import _SingleProcessDataLoaderIter +# _SingleProcessDataLoaderIter?? + +# %% [markdown] +# So.. 
It seems too complicated +# - the `_collate_fn` seems to aggrete the data from the DataFrame +# - should be possible to keep track of that + +# %% +next(iter(dls.dataset)) + + +# %% diff --git a/project/misc_sampling_in_pandas.ipynb b/project/misc_sampling_in_pandas.ipynb index 6bb163201..05500f95d 100644 --- a/project/misc_sampling_in_pandas.ipynb +++ b/project/misc_sampling_in_pandas.ipynb @@ -305,9 +305,9 @@ ], "metadata": { "kernelspec": { - "display_name": "vaep", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "vaep" + "name": "python3" }, "language_info": { "codemirror_mode": { diff --git a/project/misc_sampling_in_pandas.py b/project/misc_sampling_in_pandas.py new file mode 100644 index 000000000..0924f7453 --- /dev/null +++ b/project/misc_sampling_in_pandas.py @@ -0,0 +1,132 @@ +# --- +# jupyter: +# jupytext: +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.15.2 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# ## Sampling with weights in Pandas +# +# - sampling core utilities is based on numpy (see docstring) +# - [file](https://github.com/pandas-dev/pandas/blob/49d371364b734b47c85733aac74b03ac4400c629/pandas/core/sample.py) containing sampling functions + +# %% [markdown] +# ## Some random data + +# %% +from vaep.utils import create_random_df +X = create_random_df(100, 15, prop_na=0.1).stack().to_frame( + 'intensity').reset_index() + +freq = X.peptide.value_counts().sort_index() +freq.name = 'freq' + +X = X.set_index(keys=list(X.columns[0:2])) # to_list as an alternative +freq + +# %% +X + +# %% +print(f"Based on total number of rows, 95% is roughly: {int(len(X) * 0.95)}") +print("Based on each sample's 95% obs, it is roughly: {}".format( + X.groupby('Sample ID').apply(lambda df: int(len(df) * 0.95)).sum())) + +# %% [markdown] +# ## Samling using a column with the weights + +# %% +X = X.join(freq, 
on='peptide') +X + +# %% +t = X.groupby('Sample ID').get_group('sample_003') +t + +# %% +t.sample(frac=0.75, weights='freq') + +# %% [markdown] +# Sampling the entire DataFrame based on the freq will normalize on N of all rows. The normalization leaves relative frequency the same (if no floating point unprecision is reached) + +# %% +# number of rows not the same as when using groupby (see above) +X.sample(frac=0.95, weights='freq') + +# %% [markdown] +# ### Sampling fails with groupby, reindexing needed + +# %% [markdown] +# The above is not mapped one to one to the groupby sample method. One needs to apply it to every single df. + +# %% +# X.groupby('Sample ID').sample(frac=0.95, weights='freq') # does not work +X.groupby('Sample ID').apply( + lambda df: df.reset_index(0, drop=True).sample(frac=0.95, weights='freq') +).drop('freq', axis=1) + +# %% [markdown] +# And passing a Series need the original X to be indexed the same (multi-indices are not supported) + +# %% +# for i, t in X.groupby('Sample ID'): +# t = t.sample(frac=0.75, weights=freq) +# t + +# %% +X = X.reset_index('Sample ID') +X + +# %% +X.groupby(by='Sample ID').sample(frac=0.95, weights=freq) + +# %% +X.groupby(by='Sample ID').get_group('sample_002') + +# %% [markdown] +# ## Sanity check: Downsampling the first feature + +# %% +freq.loc['feat_00'] = 1 # none should be selected + +# %% +freq = freq / freq.sum() +freq + +# %% +X.groupby(by='Sample ID').sample( + frac=0.5, weights=freq).sort_index().reset_index().peptide.value_counts() + +# %% [markdown] +# ## Using a series +# +# - in the above approach, sampling weights might be readjusted based on the values present in `sample` as `NAN`s lead to the weights not summing up. Alteratively one could loop through the wide format rows and sample values from these. 
+ +# %% +freq + +# %% +X = X.drop('freq', axis=1).set_index( + 'Sample ID', append=True).squeeze().unstack(0) +X + +# %% +X.iloc[0].sample(frac=0.8, weights=freq).sort_index() + +# %% [markdown] +# Sampling using the wide format would garuantee that the weights are not adjusted based on missing values, but that instead missing values are sample into on or the other set. Ultimately `NaN`s are dropped also in this approach. + +# %% +import pandas as pd +data = {} +for row_key in X.index: + data[row_key] = X.loc[row_key].sample(frac=0.8, weights=freq) +pd.DataFrame(data).stack() diff --git a/project/run_snakemake.sh b/project/run_snakemake.sh deleted file mode 100644 index 83b5aa572..000000000 --- a/project/run_snakemake.sh +++ /dev/null @@ -1,51 +0,0 @@ -#!/bin/sh -### Note: No commands may be executed until after the #PBS lines -### Account information -#PBS -W group_list=cpr_10006 -A cpr_10006 -### Job name (comment out the next line to get the name of the script used as the job name) -#PBS -N snakemake -### Output files (comment out the next 2 lines to get the job name used instead) -#PBS -e ${PBS_JOBNAME}.${PBS_JOBID}.e -#PBS -o ${PBS_JOBNAME}.${PBS_JOBID}.o -### Email notification: a=aborts, b=begins, e=ends, n=no notifications -#PBS -m ae -M henry.webel@cpr.ku.dk -### Number of nodes -### other: #PBS -l nodes=1:ppn=40:gpus=1 -#PBS -l nodes=1:ppn=40 -### Requesting timeformat is ::: -#PBS -l walltime=1:00:00:00 -### Forward all environment variables -### if authentification is done using pw in the environment -#PBS -V - -module load tools git/2.15.0 -module load anaconda3/2021.11 - -# >>> conda initialize >>> -# !! Contents within this block are managed by 'conda init' !! -__conda_setup="$('/services/tools/anaconda3/2021.11/bin/conda' 'shell.bash' 'hook' 2> /dev/null)" -if [ $? -eq 0 ]; then - eval "$__conda_setup" -else - if [ -f "/services/tools/anaconda3/2021.11/etc/profile.d/conda.sh" ]; then - . 
"/services/tools/anaconda3/2021.11/etc/profile.d/conda.sh" - else - export PATH="/services/tools/anaconda3/2021.11/bin:$PATH" - fi -fi -unset __conda_setup -# <<< conda initialize <<< - - -# Go to the directory from where the job was submitted (initial directory is $HOME) -echo Working directory is $PBS_O_WORKDIR -cd $PBS_O_WORKDIR - - -conda activate vaep - -# try to influence how many jobs are run in parallel in one job training a model -export MKL_NUM_THREADS=10 - -snakemake --snakefile workflow/Snakefile_grid.smk --rerun-incomplete -f -j 4 -c 40 - diff --git a/project/src/R_NAGuideR/GSimp.R b/project/src/R_NAGuideR/GSimp.R new file mode 100644 index 000000000..3346e5066 --- /dev/null +++ b/project/src/R_NAGuideR/GSimp.R @@ -0,0 +1,339 @@ +# formatted from https://github.com/WandeRum/GSimp/blob/9f661e5ebf991b160ccedb7728c5bcd825dc963b/GSimp.R +# require(missForest) +require(imputeLCMD) +require(magrittr) +require(glmnet) +require(abind) +require(foreach) +require(doParallel) +# require(MASS) + + +## Source ## +# source('MVI_global.R') +# source('Prediction_funcs.R') + +## Draw n samples from a truncated normal distribution N(mu, std^2|[lo, hi]) ## +rnorm_trunc <- function (n, + mu, + std, + lo = -Inf, + hi = Inf) { + p_lo <- pnorm(lo, mu, std) + p_hi <- pnorm(hi, mu, std) + p_hi[p_hi < .01] <- .01 + u <- runif(n, p_lo, p_hi) + return(qnorm(u, mu, std)) +} + +## Initialize the missing data ## +## lsym will draw samples from the right tail of the distribution and transformed to the left tail +miss_init <- + function(miss_data, + method = c('lsym', 'qrilc', 'rsym')[1]) { + init_data <- miss_data + if (method == 'lsym') { + for (i in 1:ncol(init_data)) { + col_temp <- init_data[, i] + na_idx <- which(is.na(col_temp)) + prop <- mean(is.na(col_temp)) + min_temp <- min(col_temp, na.rm = T) + col_temp[na_idx] <- min_temp - 1 + med_temp <- median(col_temp) + col_temp[na_idx] <- + med_temp - (sample(col_temp[col_temp >= quantile(col_temp, 1 - prop)], length(na_idx), 
replace = + T) - med_temp) + init_data[, i] <- col_temp + } + } + if (method == 'rsym') { + for (i in 1:ncol(init_data)) { + col_temp <- init_data[, i] + na_idx <- which(is.na(col_temp)) + prop <- mean(is.na(col_temp)) + max_temp <- max(col_temp, na.rm = T) + col_temp[na_idx] <- max_temp + 1 + med_temp <- median(col_temp) + col_temp[na_idx] <- + med_temp + (med_temp - sample(col_temp[col_temp <= quantile(col_temp, prop)], length(na_idx), replace = + T)) + init_data[, i] <- col_temp + } + } + if (method == 'qrilc') { + init_data <- impute.QRILC(miss_data)[[1]] + } + return(init_data) + } + +## Single missing variable imputation based on Gibbs sampler ## +single_impute_iters <- + function(x, + y, + y_miss, + y_real = NULL, + imp_model = 'glmnet_pred', + lo = -Inf, + hi = Inf, + iters_each = 100, + gibbs = c()) { + y_res <- y + x <- as.matrix(x) + na_idx <- which(is.na(y_miss)) + imp_model_func <- getFunction(imp_model) + nrmse_vec <- c() + gibbs_res <- array(NA, dim = c(3, length(gibbs), iters_each)) + dimnames(gibbs_res) <- list(c('std', 'yhat', 'yres'), NULL, NULL) + + for (i in 1:iters_each) { + y_hat <- imp_model_func(x, y_res) + std <- sqrt(sum((y_hat[na_idx] - y_res[na_idx]) ^ 2) / length(na_idx)) + y_res[na_idx] <- + rnorm_trunc(length(na_idx), y_hat[na_idx], std, lo, hi) + if (length(gibbs) > 0) { + gibbs_res[1, , i] <- std + gibbs_res[2, , i] <- y_hat[gibbs] + gibbs_res[3, , i] <- y_res[gibbs] + } + ## The following code is for prediction function testing when y_real availabe ## + if (!is.null(y_real)) { + Sys.sleep(.5) + par(mfrow = c(2, 2)) + nrmse_vec <- c(nrmse_vec, nrmse(y_res, y_miss, y_real)) + plot(y_real ~ y_res) + plot(y_real ~ y_hat) + plot(y_hat ~ y_res) + plot(nrmse_vec) + } + } + return(list(y_imp = y_res, gibbs_res = gibbs_res)) + } + + +## Multiple missing variables imputation ## +## iters_each=number (100); vector of numbers, e.g. rep(100, 20) while iters_all=20 +## lo/hi=numer; vector; functions like min/max/median/mean... 
+## initial=character ('qrilc'/'lysm'); initialized data maatrix +## n_cores=1 is sequentially (non-parallel) computing +multi_impute <- + function(data_miss, + iters_each = 100, + iters_all = 20, + initial = 'qrilc', + lo = -Inf, + hi = 'min', + n_cores = 1, + imp_model = 'glmnet_pred', + gibbs = data.frame(row = integer(), col = integer())) { + ## Convert to data.frame ## + data_miss %<>% data.frame() + + ## Make vector for iters_each ## + if (length(iters_each) == 1) { + iters_each <- rep(iters_each, iters_all) + } else if (length(iters_each) == iters_all) { + iters_each <- iters_each + } else { + stop('improper argument: iters_each') + } + + + ## Missing count in each column ## + miss_count <- data_miss %>% apply(., 2, function(x) + sum(is.na(x))) + ## Index of missing variables, sorted (increasing) by the number of missings + miss_col_idx <- + order(miss_count, decreasing = T) %>% extract(1:sum(miss_count != 0)) %>% rev() + + if (!all(gibbs$col %in% miss_col_idx)) { + stop('improper argument: gibbs') + } + gibbs_sort <- gibbs + if (nrow(gibbs_sort) > 0) { + gibbs_sort$order <- c(1:nrow(gibbs_sort)) + gibbs_sort <- gibbs_sort[order(gibbs_sort$row),] + gibbs_sort <- + gibbs_sort[order(match(gibbs_sort$col, miss_col_idx)),] + } else { + gibbs_sort$order <- integer() + } + + ## Make vectors for lo and hi ## + if (length(lo) > 1) { + if (length(lo) != ncol(data_miss)) { + stop('Length of lo should equal to one or the number of variables') + } + else { + lo_vec <- lo + } + } else if (is.numeric(lo)) { + lo_vec <- rep(lo, ncol(data_miss)) + } else if (is.character(lo)) { + lo_fun <- getFunction(lo) + lo_vec <- + apply(data_miss, 2, function(x) + x %>% na.omit %>% lo_fun) + } + + if (length(hi) > 1) { + if (length(hi) != ncol(data_miss)) { + stop('Length of hi should equal to one or the number of variables') + } + else { + hi_vec <- hi + } + } else if (is.numeric(hi)) { + hi_vec <- rep(hi, ncol(data_miss)) + } else if (is.character(hi)) { + hi_fun <- getFunction(hi) + 
hi_vec <- + apply(data_miss, 2, function(x) + x %>% na.omit %>% hi_fun) + } + + # Check whether lo is lower than hi + if (!all(lo_vec < hi_vec)) { + stop('lo should be lower than hi') + } + + ## Initialization using build-in method or input initial matrix ## + if (is.character(initial)) { + data_init <- miss_init(data_miss, method = initial) + } else if (is.data.frame(initial) & + identical(data_miss[!is.na(data_miss)], initial[!is.na(data_miss)])) { + data_init <- initial + } else { + stop('improper argument: initial') + } + + data_imp <- data_init + gibbs_res_final <- array(NA, dim = c(3, nrow(gibbs), 0)) + + ## Iterations for the whole data matrix ## + for (i in 1:iters_all) { + cat('Iteration', i, 'start...') + + ## Parallel computing ## + if (n_cores > 1) { + cat(paste0('Parallel computing (n_cores=', n_cores, ')...')) + ## Parallel on missing variables + cl <- makeCluster(n_cores) + registerDoParallel(cl) + core_res <- + foreach ( + k = miss_col_idx, + .combine = 'cbind_abind', + .export = c('single_impute_iters', 'rnorm_trunc'), + .packages = c('magrittr') + ) %dopar% { + source('Prediction_funcs.R') + gibbs_sort_temp <- gibbs_sort[gibbs_sort$col == k,] + y_imp_res <- + single_impute_iters( + data_imp[,-k], + data_imp[, k], + data_miss[, k], + imp_model = imp_model, + lo = lo_vec[k], + hi = hi_vec[k], + iters_each = iters_each[i], + gibbs = gibbs_sort_temp$row + ) + y_imp_df <- y_imp_res$y_imp %>% data.frame + colnames(y_imp_df) <- colnames(data_miss)[k] + gibbs_res <- y_imp_res$gibbs_res + list(y_imp = y_imp_df, gibbs_res = gibbs_res) + } + stopCluster(cl) + y_imp_df <- core_res$y_imp + gibbs_res_final <- + abind(gibbs_res_final, core_res$gibbs_res, along = 3) + miss_col_idx_match <- + match(colnames(y_imp_df), colnames(data_miss)) + data_imp[, miss_col_idx_match] <- y_imp_df + } else { + ## Sequential computing ## + gibbs_res_j <- array(NA, dim = c(3, 0, iters_each[i])) + for (j in miss_col_idx) { + gibbs_sort_temp <- gibbs_sort[gibbs_sort$col == j,] + 
y_miss <- data_miss[, j] + y_imp_res <- + single_impute_iters( + data_imp[,-j], + data_imp[, j], + y_miss, + imp_model = imp_model, + lo = lo_vec[j], + hi = hi_vec[j], + iters_each = iters_each[i], + gibbs = gibbs_sort_temp$row + ) + y_imp <- y_imp_res$y_imp + gibbs_res_j <- + abind(gibbs_res_j, y_imp_res$gibbs_res, along = 2) + data_imp[is.na(y_miss), j] <- y_imp[is.na(y_miss)] + } + gibbs_res_final <- + abind(gibbs_res_final, gibbs_res_j, along = 3) + } + cat('end!\n') + } + gibbs_res_final_reorder <- gibbs_res_final[, gibbs_sort$order,] + return(list(data_imp = data_imp, gibbs_res = gibbs_res_final_reorder)) + } + + +# GS_impute --------------------------------------------------------------- +GS_impute <- multi_impute + +# ------------------------------------------------------------------------------ + +# GSimp: MVI_global.R +# https://github.com/WandeRum/GSimp/blob/9f661e5ebf991b160ccedb7728c5bcd825dc963b/MVI_global.R#L41-L61 + +# Scale and recover ------------------------------------------------------- +scale_recover <- function(data, + method = 'scale', + param_df = NULL) { + results <- list() + data_res <- data + if (!is.null(param_df)) { + if (method == 'scale') { + data_res[] <- scale(data, center = param_df$mean, scale = param_df$std) + } else if (method == 'recover') { + data_res[] <- t(t(data) * param_df$std + param_df$mean) + } + } else { + if (method == 'scale') { + param_df <- + data.frame(mean = sapply(data, function(x) + mean(x, na.rm = T)), + std = sapply(data, function(x) + sd(x, na.rm = T))) + data_res[] <- + scale(data, center = param_df$mean, scale = param_df$std) + } else { + stop('no param_df found for recover...') + } + } + results[[1]] <- data_res + results[[2]] <- param_df + return(results) +} + + +# ------------------------------------------------------------------------------ +# https://github.com/WandeRum/GSimp/blob/9f661e5ebf991b160ccedb7728c5bcd825dc963b/Prediction_funcs.R#L27C1-L32C2 +# Prediction_funcs.R + +glmnet_pred <- 
function(x, y, alpha = .5, lambda = .01) { + x_mat <- as.matrix(x) + model <- glmnet( + x = x_mat, + y = y, + alpha = alpha, + lambda = lambda + ) + y_hat <- predict(model, newx = x_mat)[, 1] + return(y_hat) +} diff --git a/project/src/file_utils.py b/project/src/file_utils.py deleted file mode 100644 index 20d317f71..000000000 --- a/project/src/file_utils.py +++ /dev/null @@ -1,223 +0,0 @@ -import os -from collections import namedtuple -from pathlib import Path -import logging - -from tqdm import tqdm - -import pandas as pd -from pandas.errors import EmptyDataError - -import xmltodict -from numpy import dtype - -logger = logging.getLogger('src.file_utils.py') - - -MQ_VERSION = '1.6.12.0' - - -def check_for_key(iterable, key): - """Check for key in items of Iterable - using `in`(`__contains__`). - - Parameters - ---------- - iterable : Iterable of Strings - Iterable of items which the key can be checked for - key : String - key to check for using `key in item` of Iterable. - - Returns - ------- - string, int - Returns zero if nothing is found, otherwise a string. - If only one item is found containing the key, return this. - Multiple hits are returned connacotaed using an underscore. - """ - hits = [x for x in iterable if key in x] - n_hits = len(hits) - if n_hits == 1: - return hits[0] - elif n_hits == 0: - return 0 - elif n_hits > 1: - return '_'.join(iterable) - - -# can file-loading be made concurrent? -# check tf.data - - -def load_summary(filepath: str = 'summary.txt') -> pd.DataFrame: - f"""Load MaxQuant {MQ_VERSION} summary.txt file. 
- - Parameters - ---------- - filepath : str, optional - filepath, by default 'summary.txt' - - Returns - ------- - pd.DataFrame - Text-File is returned as pandas.DataFrame - """ - df = pd.read_table(filepath) - df = df.T - df = df.iloc[:, :-1] - return df - - -def load_mqpar_xml(filepath:Path) -> dict: - f"""Load MaxQuant {MQ_VERSION}parameter file in xml format which stores parameters for MaxQuant run, - including version numbers. - - Parameters - ---------- - filepath : str, optional - filepath to xml- parameter file - - Returns - ------- - dict - XML-File parsed as dictionary - """ - with open(filepath) as f: - _ = f.readline() - xml = f.read() - - return xmltodict.parse(xml) - - -types_peptides = {'N-term cleavage window': dtype('O'), - 'C-term cleavage window': dtype('O'), - 'Amino acid before': dtype('O'), - 'First amino acid': dtype('O'), - 'Second amino acid': dtype('O'), - 'Second last amino acid': dtype('O'), - 'Last amino acid': dtype('O'), - 'Amino acid after': dtype('O'), - 'A Count': dtype('int64'), - 'R Count': dtype('int64'), - 'N Count': dtype('int64'), - 'D Count': dtype('int64'), - 'C Count': dtype('int64'), - 'Q Count': dtype('int64'), - 'E Count': dtype('int64'), - 'G Count': dtype('int64'), - 'H Count': dtype('int64'), - 'I Count': dtype('int64'), - 'L Count': dtype('int64'), - 'K Count': dtype('int64'), - 'M Count': dtype('int64'), - 'F Count': dtype('int64'), - 'P Count': dtype('int64'), - 'S Count': dtype('int64'), - 'T Count': dtype('int64'), - 'W Count': dtype('int64'), - 'Y Count': dtype('int64'), - 'V Count': dtype('int64'), - 'U Count': dtype('int64'), - 'O Count': dtype('int64'), - 'Length': dtype('int64'), - 'Missed cleavages': dtype('int64'), - 'Mass': dtype('float64'), - 'Proteins': dtype('O'), - 'Leading razor protein': dtype('O'), - 'Start position': dtype('float64'), - 'End position': dtype('float64'), - 'Gene names': dtype('O'), - 'Protein names': dtype('O'), - 'Unique (Groups)': dtype('O'), - 'Unique (Proteins)': dtype('O'), 
- 'Charges': dtype('O'), - 'PEP': dtype('float64'), - 'Score': dtype('float64'), - 'Intensity': dtype('int64'), - 'Reverse': dtype('O'), - 'Potential contaminant': dtype('O'), - 'id': dtype('int64'), - 'Protein group IDs': dtype('O'), - 'Mod. peptide IDs': dtype('O'), - 'Evidence IDs': dtype('O'), - 'MS/MS IDs': dtype('O'), - 'Best MS/MS': dtype('float64'), - 'Oxidation (M) site IDs': dtype('O'), - 'MS/MS Count': dtype('int64')} - - -# def load_peptide_intensities(filepath): -# f"""Load Intensities from `peptides.txt`. -# Data types of columns as of in MaxQuant {MQ_VERSION} - -# Parameters -# ---------- -# filepath : str -# filepath (rel or absolute) to MQ peptides.txt - -# Returns -# ------- -# pandas.DataFrame -# Return text file as DataFrame. -# """ -# df = pd.read_table(filepath, index_col='Sequence', dtype=types_peptides) -# return df[['Intensity']] - - -dtypes_proteins = {'Protein IDs': dtype('O'), - 'Majority protein IDs': dtype('O'), - 'Peptide counts (all)': dtype('O'), - 'Peptide counts (razor+unique)': dtype('O'), - 'Peptide counts (unique)': dtype('O'), - 'Protein names': dtype('O'), - 'Gene names': dtype('O'), - 'Fasta headers': dtype('O'), - 'Number of proteins': dtype('int64'), - 'Peptides': dtype('int64'), - 'Razor + unique peptides': dtype('int64'), - 'Unique peptides': dtype('int64'), - 'Sequence coverage [%]': dtype('float64'), - 'Unique + razor sequence coverage [%]': dtype('float64'), - 'Unique sequence coverage [%]': dtype('float64'), - 'Mol. weight [kDa]': dtype('float64'), - 'Sequence length': dtype('int64'), - 'Sequence lengths': dtype('O'), - 'Q-value': dtype('float64'), - 'Score': dtype('float64'), - 'Intensity': dtype('int64'), - 'MS/MS count': dtype('int64'), - 'Only identified by site': dtype('O'), - 'Reverse': dtype('O'), - 'Potential contaminant': dtype('O'), - 'id': dtype('int64'), - 'Peptide IDs': dtype('O'), - 'Peptide is razor': dtype('O'), - 'Mod. 
peptide IDs': dtype('O'), - 'Evidence IDs': dtype('O'), - 'MS/MS IDs': dtype('O'), - 'Best MS/MS': dtype('O'), - 'Oxidation (M) site IDs': dtype('O'), - 'Oxidation (M) site positions': dtype('O'), - 'Taxonomy IDs': dtype('O')} - - -def load_protein_intensities(filepath): - f"""Load Intensities from `proteins.txt`. - Data types of columns as of in MaxQuant {MQ_VERSION} - - Parameters - ---------- - filepath : str - filepath (rel or absolute) to MQ proteins.txt - - Returns - ------- - pandas.DataFrame - Return text file as DataFrame. - """ - df = pd.read_table( - filepath, index_col='Majority protein IDs', dtype=dtypes_proteins) - return df[['Intensity']] - - - diff --git a/project/src/setup_logging.py b/project/src/setup_logging.py deleted file mode 100644 index 4eb0f3e2a..000000000 --- a/project/src/setup_logging.py +++ /dev/null @@ -1,9 +0,0 @@ -from .logging import * - -logger = logging.getLogger() # returns root-logger -logger.setLevel(logging.CRITICAL) # silence for everything else -logger.handlers = [] - - -logger = setup_logger(logger=logging.getLogger('vaep')) -logger.info("'vaep' logger setup completed.") \ No newline at end of file diff --git a/project/workflow/README.md b/project/workflow/README.md index 477c6c61b..f0a9f6d2c 100644 --- a/project/workflow/README.md +++ b/project/workflow/README.md @@ -92,4 +92,8 @@ Executes both workflows for model training and comparison ten times: - `nolock` : ensure that parent workflow does not block child workflow ```bash snakemake -s workflow\Snakefile_ald_comparison_repeated.smk -p -c1 --nolock --drop-metadata -F -n -``` \ No newline at end of file +``` + +## Test `misc_*` notebooks + +`TestNotebooks.smk` tests the `misc_*` notebooks in the project folder.
\ No newline at end of file diff --git a/project/workflow/Snakefile b/project/workflow/Snakefile index 867d4c8ce..96b8e7a23 100644 --- a/project/workflow/Snakefile +++ b/project/workflow/Snakefile @@ -3,16 +3,31 @@ Document how all the notebooks for a single experiment are connected. """ from snakemake.logging import logger + configfile: "config/single_dev_dataset/proteinGroups_N50/config.yaml" + +MAX_WALLTIME = "24:00:00" +# Thinnode resources sharing: 40 cores and 196 GB RAM (minus 2GB for snakemake) +# JOB_RAM_MB = int(204_800 / 40 * config['THREATS_MQ']) +JOB_RAM_MB = "4gb" folder_experiment = config["folder_experiment"] logger.info(f"{folder_experiment = }") +# local rules are executed in the process (job) running snakemake +localrules: + all, + comparison, + transform_NAGuideR_predictions, + transform_data_to_wide_format, + create_splits, + + rule all: input: - f"{folder_experiment}/figures/errors_binned_by_int_test.pdf", - f"{folder_experiment}/01_2_performance_summary.xlsx" + f"{folder_experiment}/figures/2_1_test_errors_binned_by_int.pdf", + f"{folder_experiment}/01_2_performance_summary.xlsx", nb = "01_2_performance_plots.ipynb" @@ -21,6 +36,9 @@ MODELS = config["models"].copy() if config["NAGuideR_methods"]: MODELS += config["NAGuideR_methods"] +nb_stem = "01_2_performance_summary" + + rule comparison: input: nb=nb, @@ -30,48 +48,58 @@ rule comparison: model=MODELS, ), output: - xlsx="{folder_experiment}/01_2_performance_summary.xlsx", - pdf="{folder_experiment}/figures/errors_binned_by_int_test.pdf", + xlsx=f"{{folder_experiment}}/{nb_stem}.xlsx", + pdf="{folder_experiment}/figures/2_1_test_errors_binned_by_int.pdf", nb="{folder_experiment}" f"/{nb}", params: meta_data=config["fn_rawfile_metadata"], models=",".join(MODELS), + err=f"{{folder_experiment}}/{nb_stem}.e", + out=f"{{folder_experiment}}/{nb_stem}.o", shell: - "papermill {input.nb} {output.nb}" - " -r fn_rawfile_metadata {params.meta_data:q}" + "papermill {input.nb} {output.nb:q}" + " -p
fn_rawfile_metadata {params.meta_data:q}" " -r folder_experiment {wildcards.folder_experiment:q}" " -r models {params.models:q}" - " && jupyter nbconvert --to html {output.nb}" + " && jupyter nbconvert --to html {output.nb:q}" + ########################################################################################## # train NaGuideR methods nb_stem = "01_1_transfer_NAGuideR_pred" + + rule transform_NAGuideR_predictions: - input: + input: dumps=expand( "{{folder_experiment}}/preds/pred_all_{method}.csv", method=config["NAGuideR_methods"], ), nb=f"{nb_stem}.ipynb", output: - # "{{folder_experiment}}/preds/pred_real_na_{method}.csv"), - expand( ( - "{{folder_experiment}}/preds/pred_val_{method}.csv", - "{{folder_experiment}}/preds/pred_test_{method}.csv"), + # "{{folder_experiment}}/preds/pred_real_na_{method}.csv"), + expand( + ( + "{{folder_experiment}}/preds/pred_val_{method}.csv", + "{{folder_experiment}}/preds/pred_test_{method}.csv", + ), method=config["NAGuideR_methods"], ), nb="{folder_experiment}/01_1_transfer_NAGuideR_pred.ipynb", benchmark: - "{folder_experiment}/"f"{nb_stem}.tsv", + "{folder_experiment}/" f"{nb_stem}.tsv" params: + err=f"{{folder_experiment}}/{nb_stem}.e", + out=f"{{folder_experiment}}/{nb_stem}.o", folder_experiment="{folder_experiment}", # https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#non-file-parameters-for-rules - dumps_as_str=lambda wildcards, input: ','.join(input.dumps) + dumps_as_str=lambda wildcards, input: ",".join(input.dumps), shell: - "papermill {input.nb} {output.nb}" - " -r folder_experiment {params.folder_experiment}" + "papermill {input.nb} {output.nb:q}" + " -r folder_experiment {params.folder_experiment:q}" " -p dumps {params.dumps_as_str}" - " && jupyter nbconvert --to html {output.nb}" + " && jupyter nbconvert --to html {output.nb:q}" + rule train_NAGuideR_model: input: @@ -79,32 +107,51 @@ rule train_NAGuideR_model: train_split="{folder_experiment}/data/data_wide_sample_cols.csv", output: 
nb="{folder_experiment}/01_1_train_NAGuideR_{method}.ipynb", - dump=temp("{folder_experiment}/preds/pred_all_{method}.csv") + dump="{folder_experiment}/preds/pred_all_{method}.csv", + resources: + mem_mb=JOB_RAM_MB, + walltime=MAX_WALLTIME, + threads: 1 # R is single threaded benchmark: "{folder_experiment}/01_1_train_NAGuideR_{method}.tsv" params: + err="{folder_experiment}/01_1_train_NAGuideR_{method}.e", + out="{folder_experiment}/01_1_train_NAGuideR_{method}.o", folder_experiment="{folder_experiment}", method="{method}", + name="{method}", + # log: + # err="{folder_experiment}/01_1_train_NAGuideR_{method}.log", + conda: + "vaep" shell: - "papermill {input.nb} {output.nb}" - " -r train_split {input.train_split}" + "papermill {input.nb} {output.nb:q}" + " -r train_split {input.train_split:q}" " -r method {params.method}" - " -r folder_experiment {params.folder_experiment}" - " && jupyter nbconvert --to html {output.nb}" + " -r folder_experiment {params.folder_experiment:q}" + # " 2> {log.err}" + " && jupyter nbconvert --to html {output.nb:q}" + + +nb_stem = "01_0_transform_data_to_wide_format" + rule transform_data_to_wide_format: input: - nb="01_0_transform_data_to_wide_format.ipynb", + nb=f"{nb_stem}.ipynb", train_split="{folder_experiment}/data/train_X.csv", output: nb="{folder_experiment}/01_0_transform_data_to_wide_format.ipynb", - train_split=temp("{folder_experiment}/data/data_wide_sample_cols.csv"), + train_split="{folder_experiment}/data/data_wide_sample_cols.csv", params: folder_experiment="{folder_experiment}", + err=f"{{folder_experiment}}/{nb_stem}.e", + out=f"{{folder_experiment}}/{nb_stem}.o", shell: - "papermill {input.nb} {output.nb}" - " -r folder_experiment {params.folder_experiment}" - " && jupyter nbconvert --to html {output.nb}" + "papermill {input.nb} {output.nb:q}" + " -r folder_experiment {params.folder_experiment:q}" + " && jupyter nbconvert --to html {output.nb:q}" + 
########################################################################################## # train models in python @@ -115,39 +162,50 @@ rule train_models: configfile=config["config_train"], output: nb="{folder_experiment}/01_1_train_{model}.ipynb", - pred="{folder_experiment}/preds/pred_test_{model}.csv" + pred="{folder_experiment}/preds/pred_test_{model}.csv", benchmark: "{folder_experiment}/01_1_train_{model}.tsv" params: folder_experiment="{folder_experiment}", meta_data=config["fn_rawfile_metadata"], + err="{folder_experiment}/01_1_train_{model}.e", + out="{folder_experiment}/01_1_train_{model}.o", + name="{model}", + log: + err="{folder_experiment}/01_1_train_{model}.log", + conda: + "vaep" shell: - "papermill {input.nb} {output.nb}" - " -f {input.configfile}" - " -r folder_experiment {params.folder_experiment}" - " -p fn_rawfile_metadata {params.meta_data}" - " -r model_key {wildcards.model}" - " && jupyter nbconvert --to html {output.nb}" + "papermill {input.nb:q} {output.nb:q}" + " -f {input.configfile:q}" + " -r folder_experiment {params.folder_experiment:q}" + " -p fn_rawfile_metadata {params.meta_data:q}" + " -r model_key {wildcards.model:q}" + " 2> {log.err}" + " && jupyter nbconvert --to html {output.nb:q}" ########################################################################################## # Create Data splits # separate workflow by level -> provide custom configs -nb = "01_0_split_data.ipynb" +nb_stem = "01_0_split_data" + rule create_splits: input: - nb=nb, + nb=f"{nb_stem}.ipynb", configfile=config["config_split"], output: train_split="{folder_experiment}/data/train_X.csv", - nb="{folder_experiment}" f"/{nb}", + nb="{folder_experiment}" f"/{nb_stem}.ipynb", params: folder_experiment="{folder_experiment}", meta_data=config["fn_rawfile_metadata"], + err=f"{{folder_experiment}}/{nb_stem}.e", + out=f"{{folder_experiment}}/{nb_stem}.o", shell: "papermill {input.nb} {output.nb}" - " -f {input.configfile}" - " -r folder_experiment 
{params.folder_experiment}" - " -p fn_rawfile_metadata {params.meta_data}" - " && jupyter nbconvert --to html {output.nb}" \ No newline at end of file + " -f {input.configfile:q}" + " -r folder_experiment {params.folder_experiment:q}" + " -p fn_rawfile_metadata {params.meta_data:q}" + " && jupyter nbconvert --to html {output.nb:q}" diff --git a/project/workflow/Snakefile_ald_comparison.smk b/project/workflow/Snakefile_ald_comparison.smk index ecfc10e5d..0b79e9c1b 100644 --- a/project/workflow/Snakefile_ald_comparison.smk +++ b/project/workflow/Snakefile_ald_comparison.smk @@ -22,7 +22,8 @@ target_cutoff = dict(kleiner="2") target = "kleiner" -all_methods = [config["baseline"], 'None', *config["methods"]] +all_methods = [config["baseline"], "None", *config["methods"]] + wildcard_constraints: target=target, @@ -34,13 +35,14 @@ wildcard_constraints: rule all: input: expand( - out_folder + 'diff_analysis_compare_DA.xlsx', + out_folder + "diff_analysis_compare_DA.xlsx", target=target, - out_folder=config["out_folder"],), + out_folder=config["out_folder"], + ), expand( [ out_folder_two_methods_cp + "diff_analysis_comparision_2_{model}.pdf", - out_folder_two_methods_cp + "mrmr_feat_by_model.xlsx" + out_folder_two_methods_cp + "mrmr_feat_by_model.xlsx", ], target=[target], baseline=config["baseline"], @@ -48,21 +50,23 @@ rule all: out_folder=config["out_folder"], ), + ########################################################################################## # Create plots for featues where decisions between model differ (if computed) nb = "10_4_ald_compare_single_pg.ipynb" + rule plot_intensities_for_diverging_results: input: - expand(folder_experiment + "/preds/pred_real_na_{method}.csv", - method=[ - config["baseline"], - *config["methods"]],), expand( - [ - out_folder + "scores/diff_analysis_scores_{model}.pkl", - ], + folder_experiment + "/preds/pred_real_na_{method}.csv", + method=[config["baseline"], *config["methods"]], + ), + expand( + [ + out_folder + 
"scores/diff_analysis_scores_{model}.pkl", + ], target=[target], baseline=config["baseline"], model=all_methods, @@ -71,19 +75,19 @@ rule plot_intensities_for_diverging_results: nb=nb, fn_clinical_data="data/ALD_study/processed/ald_metadata_cli.csv", output: - diff_da = out_folder + 'diff_analysis_compare_DA.xlsx', - qvalues = out_folder + 'qvalues_target.pkl', - nb=out_folder + nb + diff_da=out_folder + "diff_analysis_compare_DA.xlsx", + qvalues=out_folder + "qvalues_target.pkl", + nb=out_folder + nb, params: baseline=config["baseline"], cutoff=lambda wildcards: config["cutoffs"][wildcards.target], make_plots=config["make_plots"], - ref_method_score = config['ref_method_score'] # None, + ref_method_score=config["ref_method_score"], # None, shell: "papermill {input.nb} {output.nb}" f" -r folder_experiment {folder_experiment}" " -r target {wildcards.target}" - " -r baseline {params.baseline}" # not yet used + " -r baseline {params.baseline}" " -r out_folder {wildcards.out_folder}" " -p cutoff_target {params.cutoff}" " -p make_plots {params.make_plots}" @@ -119,16 +123,18 @@ rule ml_comparison: " -r fn_clinical_data {input.fn_clinical_data}" " && jupyter nbconvert --to html {output.nb}" + ########################################################################################## # basemethod vs other methods nb = "10_2_ald_compare_methods.ipynb" nb_stem = "10_2_ald_compare_methods" + rule compare_diff_analysis: input: nb=nb, score_base=out_folder + "scores/diff_analysis_scores_{baseline}.pkl", - score_model=out_folder + "scores/diff_analysis_scores_{model}.pkl" + score_model=out_folder + "scores/diff_analysis_scores_{model}.pkl", output: nb=out_folder_two_methods_cp + nb, figure=out_folder_two_methods_cp + "diff_analysis_comparision_2_{model}.pdf", @@ -136,7 +142,7 @@ rule compare_diff_analysis: disease_ontology=lambda wildcards: config["disease_ontology"][wildcards.target], annotaitons_gene_col=config["annotaitons_gene_col"], benchmark: - out_folder_two_methods_cp + 
f"{nb_stem}.tsv", + out_folder_two_methods_cp + f"{nb_stem}.tsv" shell: "papermill {input.nb} {output.nb}" f" -r folder_experiment {folder_experiment}" @@ -172,6 +178,3 @@ rule differential_analysis: " -r model_key {wildcards.model}" " -r out_folder {wildcards.out_folder}" " && jupyter nbconvert --to html {output.nb}" - - - diff --git a/project/workflow/Snakefile_ald_comparison_repeated.smk b/project/workflow/Snakefile_ald_comparison_repeated.smk index 38e45ee70..b4aa9299a 100644 --- a/project/workflow/Snakefile_ald_comparison_repeated.smk +++ b/project/workflow/Snakefile_ald_comparison_repeated.smk @@ -3,7 +3,7 @@ Try to execute several time the same Snakemake workflow using another Snakemake - one by one? (-> one process at a time?) """ -folder_experiment = "runs/appl_ald_data/plasma/proteinGroups/reps" +folder_experiment = "runs/appl_ald_data_2023_11/reps/plasma/proteinGroups" folder_run = folder_experiment + "/run_{run}" out_folder = folder_run + "/{sub_folder}/{target}" @@ -12,19 +12,21 @@ sub_folder = "diff_analysis" N = 10 make_plots = False + rule all: input: f"{folder_experiment}/agg_differences_compared.xlsx", + rule compare_repetitions: input: - qvalues = expand( + qvalues=expand( f"{out_folder}/qvalues_target.pkl", target=target, sub_folder=sub_folder, run=range(N), ), - equality_rejected_target = expand( + equality_rejected_target=expand( f"{out_folder}/equality_rejected_target.pkl", target=target, sub_folder=sub_folder, @@ -39,9 +41,10 @@ rule compare_repetitions: notebook: "../10_5_comp_diff_analysis_repetitions.ipynb" + rule run_comparison_workflow: input: - f"{folder_run}/figures/errors_binned_by_int_test.pdf", + f"{folder_run}/figures/2_1_test_errors_binned_by_feat_medians.pdf", output: excel=f"{out_folder}/equality_rejected_target.pkl", qvalues=f"{out_folder}/qvalues_target.pkl", @@ -59,9 +62,9 @@ rule run_comparison_workflow: rule run_models: output: - f"{folder_run}/figures/errors_binned_by_int_test.pdf", + 
f"{folder_run}/figures/2_1_test_errors_binned_by_feat_medians.pdf", params: - configfile="config/appl_ald_data/plasma/proteinGroups/config.yaml", + configfile="config/appl_ald_data/plasma/proteinGroups/config_reps.yaml", folder_experiment=folder_run, shell: "snakemake --configfile {params.configfile}" diff --git a/project/workflow/Snakefile_best_across_datasets.smk b/project/workflow/Snakefile_best_across_datasets.smk index 0850bb919..ea208a1c4 100644 --- a/project/workflow/Snakefile_best_across_datasets.smk +++ b/project/workflow/Snakefile_best_across_datasets.smk @@ -55,6 +55,7 @@ rule collect_metrics: REPITITION_NAME = params.repitition_name + # key fully specified in path def key_from_fname(fname): key = (fname.parents[2].name, fname.parents[1].name) diff --git a/project/workflow/Snakefile_best_repeated_split.smk b/project/workflow/Snakefile_best_repeated_split.smk index 44f37f860..b0c709114 100644 --- a/project/workflow/Snakefile_best_repeated_split.smk +++ b/project/workflow/Snakefile_best_repeated_split.smk @@ -10,8 +10,10 @@ config["folder_experiment"] = folder_experiment MODELS = ["DAE", "VAE", "CF"] + wildcard_constraints: - model="|".join(MODELS) + model="|".join(MODELS), + rule all: input: @@ -58,6 +60,7 @@ rule collect_metrics: nb = "01_0_split_data.ipynb" + rule create_splits: input: nb=nb, @@ -69,7 +72,7 @@ rule create_splits: folder_experiment=f"{folder_experiment}", meta_data=config["fn_rawfile_metadata"], file_format=config["file_format"], - random_state="{repeat}" + random_state="{repeat}", shell: "papermill {input.nb} {output.nb}" " -f {input.configfile}" @@ -79,6 +82,7 @@ rule create_splits: " -p random_state {params.random_state}" " && jupyter nbconvert --to html {output.nb}" + rule train_models: input: nb="01_1_train_{model}.ipynb", @@ -93,6 +97,7 @@ rule train_models: model_key="{model}", meta_data=config["fn_rawfile_metadata"], file_format=config["file_format"], + cuda=config["cuda"], shell: "papermill {input.nb} {output.nb}" " -f 
{input.configfile}" @@ -100,4 +105,5 @@ rule train_models: " -r fn_rawfile_metadata {params.meta_data}" " -r file_format {params.file_format}" " -r model_key {params.model_key}" + " -p cuda {params.cuda}" " && jupyter nbconvert --to html {output.nb}" diff --git a/project/workflow/Snakefile_best_repeated_train.smk b/project/workflow/Snakefile_best_repeated_train.smk index 7e6f26a58..2bbac4f86 100644 --- a/project/workflow/Snakefile_best_repeated_train.smk +++ b/project/workflow/Snakefile_best_repeated_train.smk @@ -10,6 +10,7 @@ config["folder_experiment"] = folder_experiment MODELS = ["DAE", "VAE", "CF"] + rule all: input: f"{config['folder']}/model_performance_repeated_runs.pdf", @@ -53,9 +54,9 @@ rule collect_metrics: "notebooks/best_repeated_train_collect_metrics.ipynb" - nb = "01_0_split_data.ipynb" + rule create_splits: input: nb=nb, @@ -75,6 +76,7 @@ rule create_splits: " -r file_format {params.file_format}" " && jupyter nbconvert --to html {output.nb}" + rule train_models: input: nb="01_1_train_{model}.ipynb", @@ -89,6 +91,7 @@ rule train_models: model_key="{model}_{repeat}", meta_data=config["fn_rawfile_metadata"], file_format=config["file_format"], + cuda=config['cuda'], shell: "papermill {input.nb} {output.nb}" " -f {input.configfile}" @@ -96,4 +99,5 @@ rule train_models: " -r fn_rawfile_metadata {params.meta_data}" " -r file_format {params.file_format}" " -r model_key {params.model_key}" + " -p cuda {params.cuda}" " && jupyter nbconvert --to html {output.nb}" diff --git a/project/workflow/Snakefile_grid.smk b/project/workflow/Snakefile_grid.smk index 4d4cfc148..299a512ea 100644 --- a/project/workflow/Snakefile_grid.smk +++ b/project/workflow/Snakefile_grid.smk @@ -4,7 +4,8 @@ from snakemake.utils import min_version min_version("6.0") -configfile: "config/config_grid_small.yaml" +configfile: "config/grid_search_large_data/config_grid_small.yaml" + # prefix: "grid_search" # could be used to redirect all outputs @@ -229,15 +230,17 @@ rule train_ae_models: 
params: folder_dataset=f"{root_model}/{run_id_template}", # model_key="HL_{hidden_layers}_LD_{hidden_layers}", # ToDo + # add log + # https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#log-files threads: 10 shell: "papermill {input.nb} {output.nb}" " -f {input.configfile}" " -r folder_experiment {params.folder_dataset}" - # " -r model_key {params.model_key}" " && jupyter nbconvert --to html {output.nb}" + rule build_train_config_ae: output: config_train=f"{root_model}/{run_id_template}/config_nb_train.yaml", @@ -309,7 +312,7 @@ rule train_CF_model: metric=f"{root_model}/{run_id_template}/metrics_{_model}.json", config=f"{root_model}/{run_id_template}/model_config_{_model}.yaml", benchmark: - f"{root_model}/{run_id_template}/01_1_train_{_model}.tsv", + f"{root_model}/{run_id_template}/01_1_train_{_model}.tsv" threads: 10 params: folder_experiment=f"{root_model}/{run_id_template}", @@ -324,7 +327,6 @@ rule train_CF_model: " && jupyter nbconvert --to html {output.nb}" - rule build_train_config_collab: output: config_nb_train=f"{root_model}/{run_id_template}/config_nb_train_CF.yaml", diff --git a/project/workflow/Snakefile_small_N.smk b/project/workflow/Snakefile_small_N.smk index 01aab082d..807f7d211 100644 --- a/project/workflow/Snakefile_small_N.smk +++ b/project/workflow/Snakefile_small_N.smk @@ -8,6 +8,7 @@ min_version("6.0") configfile: "config/single_dev_dataset/proteinGroups_N50/config.yaml" + # prefix: "grid_search" # could be used to redirect all outputs @@ -20,6 +21,7 @@ module single_experiment: config: config + root_experiment = Path(config["folder_experiment"]) # runs/dev_dataset_small/proteinGroups_N50 @@ -27,12 +29,14 @@ folder_experiment = config["folder_experiment"][:-2] + "{N}" config["folder_experiment"] = folder_experiment - logger.info(f"{folder_experiment = }") logger.info(f"{root_experiment = }") + rule all: - input: combined_xlsx=f"{root_experiment.parent}/{root_experiment.name}_all_small.xlsx" + input: + 
combined_xlsx=f"{root_experiment.parent}/{root_experiment.name}_all_small.xlsx", + rule combine_result_tables: input: @@ -65,7 +69,6 @@ MODELS = single_experiment.MODELS # logger.info(f"{config['NAGuideR_methods'] = }") - # # MODELS = config["models"].copy() # # # ! needed to run NAGuideR methods, but needs to be switched off for comparison nb # # # ? how is the original config imported here? @@ -75,6 +78,7 @@ MODELS = single_experiment.MODELS nb = "01_2_performance_plots.ipynb" + use rule comparison from single_experiment as adapted_comparison with: input: nb=nb, @@ -86,7 +90,7 @@ use rule comparison from single_experiment as adapted_comparison with: ), output: xlsx="{folder_experiment}/01_2_performance_summary.xlsx", - pdf="{folder_experiment}/figures/errors_binned_by_int_test.pdf", + pdf="{folder_experiment}/figures/2_1_test_errors_binned_by_int.pdf", nb="{folder_experiment}" f"/{nb}", diff --git a/project/workflow/Snakefile_v2 b/project/workflow/Snakefile_v2 new file mode 100644 index 000000000..bbd2a95b7 --- /dev/null +++ b/project/workflow/Snakefile_v2 @@ -0,0 +1,260 @@ +""" +Document how all the notebooks for a single experiment are connected. 
+""" +from snakemake.logging import logger + + +configfile: "config/single_dev_dataset/proteinGroups_N50/config_v2.yaml" + + +MAX_WALLTIME = "24:00:00" +# Thinnode resources sharing: 40 cores and 196 GB RAM (minus 2GB for snakemake) +# JOB_RAM_MB = int(204_800 / 40 * config['THREATS_MQ']) +JOB_RAM_MB = "8gb" +folder_experiment = config["folder_experiment"] +logger.info(f"{folder_experiment = }") +logger.info(f"{config = }") + + +# local rules are executed in the process (job) running snakemake +localrules: + all, + comparison, + transform_NAGuideR_predictions, + transform_data_to_wide_format, + create_splits, + dump_train_config, + dump_split_config, + + +rule all: + input: + f"{folder_experiment}/figures/2_1_test_errors_binned_by_int.pdf", + f"{folder_experiment}/01_2_performance_summary.xlsx", + + +nb = "01_2_performance_plots.ipynb" + +if "frac_mnar" in config: + config["split_data"]["frac_mnar"] = config["frac_mnar"] + +# print(config['split_data']) +# MODELS = config["models"].copy() + +MODELS = list() +model_configs = dict() +for m in config["models"]: + for model, cfg_model in m.items(): + MODELS.append(model) + model_configs[model] = dict(cfg_model) +else: + del model, cfg_model + +if config["NAGuideR_methods"]: + MODELS += config["NAGuideR_methods"] + +nb_stem = "01_2_performance_summary" + + +rule comparison: + input: + nb=nb, + runs=expand( + "{folder_experiment}/preds/pred_test_{model}.csv", + folder_experiment=config["folder_experiment"], + model=MODELS, + ), + output: + xlsx=f"{{folder_experiment}}/{nb_stem}.xlsx", + pdf="{folder_experiment}/figures/2_1_test_errors_binned_by_int.pdf", + nb="{folder_experiment}" f"/{nb}", + params: + meta_data=config["fn_rawfile_metadata"], + models=",".join(MODELS), + err=f"{{folder_experiment}}/{nb_stem}.e", + out=f"{{folder_experiment}}/{nb_stem}.o", + shell: + "papermill {input.nb} {output.nb}" + " -r fn_rawfile_metadata {params.meta_data:q}" + " -r folder_experiment {wildcards.folder_experiment:q}" + " -r models
{params.models:q}" + " && jupyter nbconvert --to html {output.nb}" + + +########################################################################################## +# train NaGuideR methods +nb_stem = "01_1_transfer_NAGuideR_pred" + + +rule transform_NAGuideR_predictions: + input: + dumps=expand( + "{{folder_experiment}}/preds/pred_all_{method}.csv", + method=config["NAGuideR_methods"], + ), + nb=f"{nb_stem}.ipynb", + output: + # "{{folder_experiment}}/preds/pred_real_na_{method}.csv"), + expand( + ( + "{{folder_experiment}}/preds/pred_val_{method}.csv", + "{{folder_experiment}}/preds/pred_test_{method}.csv", + ), + method=config["NAGuideR_methods"], + ), + nb="{folder_experiment}/01_1_transfer_NAGuideR_pred.ipynb", + benchmark: + "{folder_experiment}/" f"{nb_stem}.tsv" + params: + err=f"{{folder_experiment}}/{nb_stem}.e", + out=f"{{folder_experiment}}/{nb_stem}.o", + folder_experiment="{folder_experiment}", + # https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#non-file-parameters-for-rules + dumps_as_str=lambda wildcards, input: ",".join(input.dumps), + shell: + "papermill {input.nb} {output.nb}" + " -r folder_experiment {params.folder_experiment}" + " -p dumps {params.dumps_as_str}" + " && jupyter nbconvert --to html {output.nb}" + + +rule train_NAGuideR_model: + input: + nb="01_1_train_NAGuideR_methods.ipynb", + train_split="{folder_experiment}/data/data_wide_sample_cols.csv", + output: + nb="{folder_experiment}/01_1_train_NAGuideR_{method}.ipynb", + dump="{folder_experiment}/preds/pred_all_{method}.csv", + resources: + mem_mb=JOB_RAM_MB, + walltime=MAX_WALLTIME, + threads: 1 # R is single threaded + benchmark: + "{folder_experiment}/01_1_train_NAGuideR_{method}.tsv" + params: + err="{folder_experiment}/01_1_train_NAGuideR_{method}.e", + out="{folder_experiment}/01_1_train_NAGuideR_{method}.o", + folder_experiment="{folder_experiment}", + method="{method}", + name="{method}", + conda: + "vaep" + shell: + "papermill {input.nb} {output.nb}" + " -r 
train_split {input.train_split}" + " -r method {params.method}" + " -r folder_experiment {params.folder_experiment}" + " && jupyter nbconvert --to html {output.nb}" + + +nb_stem = "01_0_transform_data_to_wide_format" + + +rule transform_data_to_wide_format: + input: + nb=f"{nb_stem}.ipynb", + train_split="{folder_experiment}/data/train_X.csv", + output: + nb="{folder_experiment}/01_0_transform_data_to_wide_format.ipynb", + train_split="{folder_experiment}/data/data_wide_sample_cols.csv", + params: + folder_experiment="{folder_experiment}", + err=f"{{folder_experiment}}/{nb_stem}.e", + out=f"{{folder_experiment}}/{nb_stem}.o", + shell: + "papermill {input.nb} {output.nb}" + " -r folder_experiment {params.folder_experiment}" + " && jupyter nbconvert --to html {output.nb}" + + +########################################################################################## +# train models in python +rule train_models: + input: + nb=lambda wildcards: "01_1_train_{}.ipynb".format( + model_configs[wildcards.model]["model"] + ), + train_split="{folder_experiment}/data/train_X.csv", + configfile=config["config_train"], + output: + nb="{folder_experiment}/01_1_train_{model}.ipynb", + pred="{folder_experiment}/preds/pred_test_{model}.csv", + benchmark: + "{folder_experiment}/01_1_train_{model}.tsv" + resources: + mem_mb=JOB_RAM_MB, + walltime=MAX_WALLTIME, + params: + folder_experiment="{folder_experiment}", + meta_data=config["fn_rawfile_metadata"], + err="{folder_experiment}/01_1_train_{model}.e", + out="{folder_experiment}/01_1_train_{model}.o", + name="{model}", + conda: + "vaep" + shell: + "papermill {input.nb} {output.nb}" + " -f {input.configfile}" + " -r folder_experiment {params.folder_experiment}" + " -p fn_rawfile_metadata {params.meta_data}" + " -r model_key {wildcards.model}" + " && jupyter nbconvert --to html {output.nb}" + + +########################################################################################## +# create config file dumps for each model + + 
+rule dump_train_config: + output: + configfile=config["config_train"], + run: + import yaml + + with open(output.configfile, "w") as f: + f.write("# Build in Snakemake workflow\n") + yaml.dump(model_configs[wildcards.model], f, sort_keys=False) + + +########################################################################################## +# Create Data splits +# separate workflow by level -> provide custom configs +nb_stem = "01_0_split_data" + + +rule create_splits: + input: + nb=f"{nb_stem}.ipynb", + configfile=config["config_split"], + output: + train_split="{folder_experiment}/data/train_X.csv", + nb="{folder_experiment}" f"/{nb_stem}.ipynb", + params: + folder_experiment="{folder_experiment}", + meta_data=config["fn_rawfile_metadata"], + err=f"{{folder_experiment}}/{nb_stem}.e", + out=f"{{folder_experiment}}/{nb_stem}.o", + shell: + "papermill {input.nb} {output.nb}" + " -f {input.configfile}" + " -r folder_experiment {params.folder_experiment}" + " -p fn_rawfile_metadata {params.meta_data}" + " && jupyter nbconvert --to html {output.nb}" + + +########################################################################################## +# create config file dumps for each model + + +rule dump_split_config: + output: + configfile=config["config_split"], + run: + import yaml + # recreating dict, otherwise Null becomes string "Null" in yaml dump... 
+ cfg = dict() + for k, v in config["split_data"].items(): + cfg[k] = v + with open(output.configfile, "w") as f: + f.write("# Build in Snakemake workflow (from v2)\n") + yaml.dump(cfg, f, sort_keys=False) diff --git a/project/workflow/Snakemake_project.smk b/project/workflow/Snakemake_project.smk deleted file mode 100644 index 87058d84e..000000000 --- a/project/workflow/Snakemake_project.smk +++ /dev/null @@ -1,113 +0,0 @@ -"""Workflow to produce global analysis for the project.""" -nb_outfolder = "runs" - -DATASETS=[ "df_intensities_proteinGroups_long", - "df_intensities_peptides_long", - "df_intensities_evidence_long" - ] - -OUT_INFO = "dataset_info" - -rule all: - input: - "data/files_per_instrument.yaml", # nested: model, attribute, serial number - "data/files_selected_metadata.csv", - "data/files_selected_per_instrument.yaml", - "data/files_selected_per_instrument_counts.csv", # counts - f'{nb_outfolder}/{"00_2_hela_all_raw_files.ipynb"}', - "data/samples_selected.yaml", - expand( - "data/single_datasets/{dataset}/{OUT_INFO}.json", - dataset=DATASETS, - OUT_INFO=OUT_INFO, - ), - - -nb = "00_2_hela_all_raw_files.ipynb" - - -rule metadata: - input: - nb=nb, - # meta='../workflows/metadata/rawfile_metadata.json', - meta="data/all_raw_files_dump_2021_10_29.txt", - summaries="data/processed/all_summaries.json", - intensities="data/df_intensities_N07285_M01000", # csv - output: - nb=f"{nb_outfolder}/{nb}", - # # final config - # {'FN_ALL_RAW_FILES': 'data/all_raw_files_dump_2021_10_29.txt', # input - # 'FN_ALL_SUMMARIES': 'data/processed/all_summaries.json', # input - # 'FN_ALL_RAW_FILES_UNIQUE': 'data/all_raw_files_dump_2021_10_29_unique_N50521_M00003.csv', - # 'FN_ALL_RAW_FILES_DUPLICATED': 'data/all_raw_files_dump_2021_10_29_duplicated.txt', - # 'raw_file_overview': 'Figures/raw_file_overview.pdf', - # 'fname_1000_most_common_peptides': 'data/df_intensities_N07285_M01000', - # 'figure_1': 'Figures/figure_1.pdf', - # 'remote_files': 'data/remote_files.yaml'} - 
shell: - "papermill {input.nb} {output.nb}" - " -p FN_ALL_RAW_FILES {input.meta}" - " -p FN_ALL_SUMMARIES {input.summaries}" - " -p FN_PEPTIDE_INTENSITIES {input.intensities}" - " && jupyter nbconvert --to html {output.nb}" - - -nb = "00_1_hela_MQ_summaries.ipynb" - - -rule summaries: - input: - nb=nb, - summaries="data/processed/all_summaries.json", - output: - nb=f"{nb_outfolder}/{nb}", - selected="data/samples_selected.yaml", - shell: - "papermill {input.nb} {output.nb}" - " -r FN_ALL_SUMMARIES {input.summaries}" - " && jupyter nbconvert --to html {output.nb}" - - -nb = "00_0_hela_metadata_rawfiles.ipynb" - - -rule metadata_rawfiles: - input: - "data/rawfile_metadata.csv", - "data/samples_selected.yaml", - nb=nb, - output: - "data/files_per_instrument.yaml", # nested: model, attribute, serial number - "data/files_selected_metadata.csv", - "data/files_selected_per_instrument.yaml", - "data/files_selected_per_instrument_counts.csv", # counts - nb=f"{nb_outfolder}/{nb}", - shell: - "papermill {input.nb} {output.nb}" - " && jupyter nbconvert --to html {output.nb}" # run with defaults - - - -nb='00_3_hela_development_dataset_splitting.ipynb' -outfolder=f'dev_datasets' -ROOT_DUMPS = "C:/Users/kzl465/OneDrive - University of Copenhagen/vaep/project/data" - - -rule split_data: - input: - nb=nb, - data=f"{ROOT_DUMPS}/{{dataset}}.pkl", - output: - nb=f"data/dev_datasets/{{dataset}}/{nb}", - json=f'data/dev_datasets/{{dataset}}/{OUT_INFO}.xlsx' - params: - folder_datasets="single_datasets/{dataset}", - shell: - # papermill parameters with whitespaces > - "papermill {input.nb} {output.nb}" - ' -r DUMP "{input.data}" ' - " -r FILE_EXT pkl" - " -r FOLDER_DATASETS {params.folder_datasets}" - ' -r SAMPLE_ID "Sample ID" ' - f" -r OUT_INFO {OUT_INFO} " - " && jupyter nbconvert --to html {output.nb}" diff --git a/project/workflow/TestNotebooks.smk b/project/workflow/TestNotebooks.smk index 1abd20b1e..9d8399827 100644 --- a/project/workflow/TestNotebooks.smk +++ 
b/project/workflow/TestNotebooks.smk @@ -1,33 +1,9 @@ notebookes = [ - "misc_FASTA_data_agg_by_gene.ipynb", - "misc_FASTA_tryptic_digest.ipynb", - # "2_clustering_proteins.ipynb", # Reference Annelaura - "erda_data_available.ipynb", - "misc_data_exploration_peptides.ipynb", - "misc_data_exploration_proteins.ipynb", - "00_0_hela_metadata_rawfiles.ipynb", - "00_1_hela_MQ_summaries.ipynb", - "00_2_hela_all_raw_files.ipynb", - "misc_protein_support.ipynb", - "00_5_training_data_exploration.ipynb", - "00_4_development_dataset_support.ipynb", - "01_0_split_data.ipynb", - "14_experiment_03_dataloaders.ipynb", - "14_experiment_03_dataset.ipynb", - # "02_3_grid_search_analysis.ipynb", # needs parametrization for testing - "14_experiment_03_latent_space_analysis.ipynb", "misc_embeddings.ipynb", "misc_illustrations.ipynb", "misc_pytorch_fastai_dataloaders.ipynb", "misc_pytorch_fastai_dataset.ipynb", - "erda_00_maxquant_file_reader.ipynb", - "erda_01_mq_aggregate_summaries.ipynb", - "erda_02_mq_count_peptides.ipynb", - # "erda_10_training_data.ipynb", - # "erda_11_select_training_data.ipynb", - # "erda_12_explore_raw_MQ_data.ipynb", - # "VAEP_POC.ipynb", # to discard - # "misc_id_mapper.ipynb", # to discard + "misc_sampling_in_pandas.ipynb" ] diff --git a/project/workflow/bin/README.md b/project/workflow/bin/README.md new file mode 100644 index 000000000..a45c2b8a8 --- /dev/null +++ b/project/workflow/bin/README.md @@ -0,0 +1,14 @@ +# Scripts for pbs-torque cluster execution + +We ran the software partly on a pbs-torque cluster + +`qsub-status_v2.py` is used by snakemake to query the status of a submitted job in case the job, +in case the job is not ran locally within the main process running snakemake. + +`create_qsub_commands.py` is a script which create some job submission commands. + +`jobscript.sh` is a script which sets up conda before the subcommand create from within +a snakemake job is run. 
+ + +> None of this is needed in case snakemake is non-distributed on a single node. \ No newline at end of file diff --git a/workflows/maxquant/qsub-status.py b/project/workflow/bin/qsub-status.py old mode 100644 new mode 100755 similarity index 94% rename from workflows/maxquant/qsub-status.py rename to project/workflow/bin/qsub-status.py index e6cdad7bb..acdd47554 --- a/workflows/maxquant/qsub-status.py +++ b/project/workflow/bin/qsub-status.py @@ -1,26 +1,26 @@ -#!/usr/bin/env python3 -# https://github.com/Snakemake-Profiles/pbs-torque/blob/master/%7B%7Bcookiecutter.profile_name%7D%7D/pbs-status.py -import sys -import subprocess -import xml.etree.cElementTree as ET - -jobid = sys.argv[1] - -try: - res = subprocess.run("qstat -f -x {}".format(jobid), check=True, - stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) - - xmldoc = ET.ElementTree(ET.fromstring(res.stdout.decode())).getroot() - job_state = xmldoc.findall('.//job_state')[0].text - - if job_state == "C": - exit_status = xmldoc.findall('.//exit_status')[0].text - if exit_status == '0': - print("success") - else: - print("failed") - else: - print("running") - -except (subprocess.CalledProcessError, IndexError, KeyboardInterrupt) as e: - print("failed") +#!/usr/bin/env python3 +# https://github.com/Snakemake-Profiles/pbs-torque/blob/master/%7B%7Bcookiecutter.profile_name%7D%7D/pbs-status.py +import sys +import subprocess +import xml.etree.cElementTree as ET + +jobid = sys.argv[1] + +try: + res = subprocess.run("qstat -f -x {}".format(jobid), check=True, + stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) + + xmldoc = ET.ElementTree(ET.fromstring(res.stdout.decode())).getroot() + job_state = xmldoc.findall('.//job_state')[0].text + + if job_state == "C": + exit_status = xmldoc.findall('.//exit_status')[0].text + if exit_status == '0': + print("success") + else: + print("failed") + else: + print("running") + +except (subprocess.CalledProcessError, IndexError, KeyboardInterrupt) as e: + 
print("failed") \ No newline at end of file diff --git a/project/workflow/bin/qsub-status_v2.py b/project/workflow/bin/qsub-status_v2.py new file mode 100755 index 000000000..4f67d1a02 --- /dev/null +++ b/project/workflow/bin/qsub-status_v2.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 +# https://github.com/Snakemake-Profiles/pbs-torque/blob/master/%7B%7Bcookiecutter.profile_name%7D%7D/pbs-status.py +import sys +import subprocess + +jobid = sys.argv[1] + +try: + # ! do net query full information + qstat = subprocess.run("qstat {}".format(jobid), check=True, + stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) + res = qstat.stdout.decode(errors='ignore') + + if "C" in res: + full = subprocess.run("qstat -f -x {}".format(jobid), check=True, + stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) + full = full.stdout.decode(errors='ignore') + if "0" in full: + print("success") + else: + print("failed") + else: + print("running") + +except (subprocess.CalledProcessError, IndexError, KeyboardInterrupt) as e: + print("failed") \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index 4e33952ce..9ac3edd9e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -22,9 +22,9 @@ include_package_data = True install_requires = numpy matplotlib - pandas<2 + pandas plotly - torch<2 + torch scikit-learn>=1.0 scipy seaborn diff --git a/setup.py b/setup.py index 8ab824cc7..8bf1ba938 100644 --- a/setup.py +++ b/setup.py @@ -1,2 +1,2 @@ from setuptools import setup -setup() \ No newline at end of file +setup() diff --git a/vaep/__init__.py b/vaep/__init__.py index 7402b9bc1..20ad35526 100644 --- a/vaep/__init__.py +++ b/vaep/__init__.py @@ -23,7 +23,7 @@ savefig = vaep.plotting.savefig __license__ = 'GPLv3' -__version__ = (0, 1, 0) +__version__ = (0, 2, 0) # set some defaults diff --git a/vaep/analyzers/__init__.py b/vaep/analyzers/__init__.py index 4274cfd4d..c50b18b25 100644 --- a/vaep/analyzers/__init__.py +++ b/vaep/analyzers/__init__.py @@ -1,9 +1,10 @@ from types 
import SimpleNamespace -from . import diff_analysis +from . import diff_analysis from . import compare_predictions __all__ = ['diff_analysis', 'compare_predictions', 'Analysis'] + class Analysis(SimpleNamespace): - pass \ No newline at end of file + pass diff --git a/vaep/analyzers/analyzers.py b/vaep/analyzers/analyzers.py index 3f0609dfa..9c48d4ed8 100644 --- a/vaep/analyzers/analyzers.py +++ b/vaep/analyzers/analyzers.py @@ -2,6 +2,8 @@ from pathlib import Path from types import SimpleNamespace from typing import Tuple, Union, List + +import logging import random @@ -21,8 +23,8 @@ from vaep.pandas import _add_indices from vaep.io.datasplits import long_format, wide_format -from . import metadata +logger = logging.getLogger(__name__) __doc__ = 'A collection of Analyzers to perform certain type of analysis.' @@ -31,22 +33,22 @@ def verify_df(df, - fname, - index_col:str, # could be potentially 0 for the first column - verify_fname: bool = False, - usecols=None, - ): + fname, + index_col: str, # could be potentially 0 for the first column + verify_fname: bool = False, + usecols=None, + ): if usecols and isinstance(index_col, str): assert index_col in usecols, 'Add index_col to usecols Sequence' if verify_fname: if not len(df.shape) == 2: raise ValueError(f"Expected 2 -dimensional array, not {len(df.shape)} -dimensional," - f" of type: {type(df)}") + f" of type: {type(df)}") N, M = df.shape assert f'N{N:05d}' in str(fname) and f'M{M:05d}' in str(fname), \ ("Filename number don't match loaded numbers: " f"{fname} should contain N{N} and M{M}") - + class AnalyzePeptides(SimpleNamespace): """Namespace for current analysis @@ -61,7 +63,7 @@ class AnalyzePeptides(SimpleNamespace): Many more attributes are set dynamically depending on the concrete analysis. 
""" - def __init__(self, data:pd.DataFrame, + def __init__(self, data: pd.DataFrame, is_log_transformed: bool = False, is_wide_format: bool = True, ind_unstack: str = '',): if not is_wide_format: @@ -115,7 +117,7 @@ def from_pickle(cls, fname: str, def get_consecutive_dates(self, n_samples, seed=42): """Select n consecutive samples using a seed. - + Updated the original DataFrame attribute: df """ self.df.sort_index(inplace=True) @@ -138,7 +140,11 @@ def df_long(self): return self._df_long return self.to_long_format(colname_values='intensity', index_name=self.index_col) - def to_long_format(self, colname_values: str = 'intensity', index_name: str = 'Sample ID', inplace: str = False) -> pd.DataFrame: + def to_long_format( + self, + colname_values: str = 'intensity', + index_name: str = 'Sample ID', + inplace: str = False) -> pd.DataFrame: """[summary] Parameters @@ -178,7 +184,11 @@ def to_long_format(self, colname_values: str = 'intensity', index_name: str = 'S def df_wide(self): return self.to_wide_format() - def to_wide_format(self, columns: str = 'Sample ID', name_values: str = 'intensity', inplace: bool = False) -> pd.DataFrame: + def to_wide_format( + self, + columns: str = 'Sample ID', + name_values: str = 'intensity', + inplace: bool = False) -> pd.DataFrame: """[summary] Parameters @@ -197,11 +207,11 @@ def to_wide_format(self, columns: str = 'Sample ID', name_values: str = 'intensi """ """Build wide data view. - + Return df attribute in case this is in wide-format. If df attribute is in long-format this is used. If df is wide, but long-format exist, then the wide format is build. - - + + """ if self.is_wide_format: return self.df @@ -235,18 +245,6 @@ def get_prop_not_na(self): """Get prop. 
of not NA values for each sample.""" return self.df.notna().sum(axis=1) / self.df.shape[-1] - def add_metadata(self, add_prop_not_na=True): - d_meta = metadata.get_metadata_from_filenames(self.df.index) - self.df_meta = pd.DataFrame.from_dict( - d_meta, orient='index') - self.df_meta.index.name = self.df.index.name - print(f'Created metadata DataFrame attribute `df_meta`.') - # add proportion on not NA to meta data - if add_prop_not_na: - self.df_meta['prop_not_na'] = self.get_prop_not_na() - print(f'Added proportion of not NA values based on `df` intensities.') - return self.df_meta - def get_PCA(self, n_components=2, imputer=SimpleImputer): self.imputer_ = imputer() X = self.imputer_.fit_transform(self.df) @@ -255,24 +253,26 @@ def get_PCA(self, n_components=2, imputer=SimpleImputer): PCs, self.pca_ = run_pca(X, n_components=n_components) if not hasattr(self, 'df_meta'): - _ = self.add_metadata() + logger.warning('No metadata available, please set "df_meta" first.') try: PCs['ms_instrument'] = self.df_meta['ms_instrument'].astype('category') except KeyError: - print("No MS instrument added.") + logger.warning("No MS instrument added.") + except AttributeError: + logger.warning("No metadata available, please set 'df_meta' first.") + logger.warning("No MS instrument added.") return PCs def calculate_PCs(self, new_df, is_wide=True): if not is_wide: - new_df = new_df.unstack(new_df.index.names[1:]) - + new_df = new_df.unstack(new_df.index.names[1:]) + X = self.imputer_.transform(new_df) X = _add_indices(X, new_df) PCs = self.pca_.transform(X) PCs = _add_indices(PCs, new_df, index_only=True) PCs.columns = [f'PC {i+1}' for i in range(PCs.shape[-1])] - return PCs - + return PCs def plot_pca(self,): """Create principal component plot with three heatmaps showing @@ -282,7 +282,7 @@ def plot_pca(self,): self.is_wide_format = True if not hasattr(self, 'df_meta'): - _ = self.add_metadata() + raise AttributeError('No metadata available, please set "df_meta" first.') PCs = 
self.get_PCA() cols = list(PCs.columns) @@ -294,7 +294,8 @@ def plot_pca(self,): self.dim = Dim(*self.df.shape) fig.suptitle( - f'First two Principal Components of {self.dim.M} most abundant peptides \n for {self.dim.N} samples', fontsize=30) + f'First two Principal Components of {self.dim.M} most abundant peptides \n for {self.dim.N} samples', + fontsize=30) # by instrument ax = axes[0] @@ -401,38 +402,19 @@ def _plot(self, fct, meta_key: str, save: bool = True): title=f'{self.model_name} latent space PCA of {self.latent_dim} dimensions by {meta_key}') if save: vaep.plotting._savefig(fig, name=f'{self.model_name}_latent_by_{meta_key}', - folder=self.folder) + folder=self.folder) return fig, ax # def read_csv(fname:str, nrows:int, index_col:str=None)-> pd.DataFrame: # return pd.read_csv(fname, index_col=index_col, low_memory=False, nrows=nrows) -def build_metadata_df(filenames:pd.Index) -> pd.DataFrame: - """Build a DataFrame based on a list of strings (an Index) to parse. - Is strongly coupled to the analysis context. - - Parameters - ---------- - filenames : pd.Index - An Iterable with strings. - - Returns - ------- - pd.DataFrame - A DataFrame with the parsed metadata. 
- """ - - d_meta = metadata.get_metadata_from_filenames(filenames) - df_meta = pd.DataFrame.from_dict(d_meta, orient='index') - df_meta.index.name = filenames.name - return df_meta def get_consecutive_data_indices(df, n_samples): index = df.sort_index().index start_sample = len(index) - n_samples start_sample = random.randint(0, start_sample) - return df.loc[index[start_sample:start_sample+n_samples]] + return df.loc[index[start_sample:start_sample + n_samples]] def corr_lower_triangle(df): @@ -446,20 +428,20 @@ def corr_lower_triangle(df): def plot_corr_histogram(corr_lower_triangle, bins=10): fig, axes = plt.subplots(ncols=2, gridspec_kw={"width_ratios": [ - 5, 1], "wspace": 0.2}, figsize=(10, 4)) + 5, 1], "wspace": 0.2}, figsize=(8, 4)) values = pd.Series(corr_lower_triangle.to_numpy().flatten()).dropna() ax = axes[0] ax = values.hist(ax=ax, bins=bins) ax.yaxis.set_major_formatter("{x:,.0f}") ax = axes[1] plt.axis('off') - data = values.describe(percentiles=np.linspace(0.1,1,10)).round(2) + data = values.describe(percentiles=np.linspace(0.1, 1, 10)).round(2) data.name = '' _ = pd.plotting.table(ax=ax, data=data, loc="best", edges="open") return fig, axes -def run_pca(df_wide:pd.DataFrame, n_components:int=2) -> Tuple[pd.DataFrame, PCA]: +def run_pca(df_wide: pd.DataFrame, n_components: int = 2) -> Tuple[pd.DataFrame, PCA]: """Run PCA on DataFrame and return result. Parameters @@ -531,10 +513,10 @@ def seaborn_scatter(df, ax, seaborn.scatterplot(x=df[cols[0]], y=df[cols[1]], hue=meta, ax=ax, palette='deep', s=size, alpha=alpha) _ = ax.legend(fontsize=fontsize, - title_fontsize=fontsize, - markerscale=0.4, - title=meta.name, - ) + title_fontsize=fontsize, + markerscale=0.4, + title=meta.name, + ) ax.set_title(title, fontsize=fontsize) return ax @@ -546,9 +528,9 @@ def scatter_plot_w_dates(ax, df, size=2): """plot first vs. second column in DataFrame. Use dates to color data. 
- - - + + + errors : {'ignore', 'raise', 'coerce'}, default 'raise' Passed on to pandas.to_datetime - If 'raise', then invalid parsing will raise an exception. @@ -576,7 +558,7 @@ def scatter_plot_w_dates(ax, df, def add_date_colorbar(mappable, ax): loc = mdates.AutoDateLocator() cbar = ax.get_figure().colorbar(mappable, ax=ax, ticks=loc, - format=mdates.AutoDateFormatter(loc)) + format=mdates.AutoDateFormatter(loc)) return cbar diff --git a/vaep/analyzers/compare_predictions.py b/vaep/analyzers/compare_predictions.py index ea0d16935..75878537d 100644 --- a/vaep/analyzers/compare_predictions.py +++ b/vaep/analyzers/compare_predictions.py @@ -24,7 +24,7 @@ def load_predictions(pred_files: List, shared_columns=['observed']): def load_split_prediction_by_modelkey(experiment_folder: Path, split: str, - model_keys:list[str], + model_keys: list[str], allow_missing=False, shared_columns: list[str] = None): """Load predictions from a list of models. @@ -63,7 +63,7 @@ def load_split_prediction_by_modelkey(experiment_folder: Path, return load_predictions(pred_files, shared_columns=shared_columns) -def load_single_csv_pred_file(fname:str|Path, value_name:str='intensity') -> pd.Series: +def load_single_csv_pred_file(fname: str | Path, value_name: str = 'intensity') -> pd.Series: """Load a single pred file from a single model. Last column are measurments, other are index. @@ -79,7 +79,7 @@ def load_single_csv_pred_file(fname:str|Path, value_name:str='intensity') -> pd. 
pd.Series measurments as a single column with set indices """ - pred = pd.read_csv(fname) # getattr for other file formats + pred = pd.read_csv(fname) # getattr for other file formats pred = pred.set_index(pred.columns[:-1].tolist()) pred = pred.squeeze() pred.name = value_name diff --git a/vaep/analyzers/diff_analysis.py b/vaep/analyzers/diff_analysis.py index deafe4311..6b115f6ba 100644 --- a/vaep/analyzers/diff_analysis.py +++ b/vaep/analyzers/diff_analysis.py @@ -31,7 +31,7 @@ def select_raw_data(df: pd.DataFrame, return df, Cutoffs(min_sample_for_feat, min_feat_per_sample) -def select_feat(df_qc:pd.DataFrame, threshold:float=0.4, axis:int=0): +def select_feat(df_qc: pd.DataFrame, threshold: float = 0.4, axis: int = 0): qc_cv_feat = df_qc.std(axis=axis) / df_qc.mean(axis=axis) mask = qc_cv_feat < threshold - return qc_cv_feat.loc[mask].index \ No newline at end of file + return qc_cv_feat.loc[mask].index diff --git a/vaep/analyzers/metadata.py b/vaep/analyzers/metadata.py deleted file mode 100644 index 27cb60906..000000000 --- a/vaep/analyzers/metadata.py +++ /dev/null @@ -1,179 +0,0 @@ -from typing import Iterable -import re -import logging - -logger = logging.getLogger('vaep') - -# from collections import namedtuple -# columns = 'date ms_instrument lc_instrument researcher rest'.split() -# RunMetaData = namedtuple('RunMetaData', columns) - -#Vyt, ss, pcp, lvs, teph -regex_researcher = '[_]*[A-Z]*[a-z]*[-]*[A-Z]*[a-zA-Z]*[_]*' - -assert re.search(regex_researcher, 'HeWe_').group() == 'HeWe_' -assert re.search(regex_researcher, '_HeWe_').group() == '_HeWe_' -assert re.search(regex_researcher, 'HeWE_').group() == 'HeWE_' -assert re.search(regex_researcher, '_HeWE_').group() == '_HeWE_' - -regex_lc_instrument = '[_]*([nN]|(UP|up))*((lc)|(LC)|(CL)|([eE][vV][oO]))[a-zA-Z0-9]*[_]*' - -assert re.search(regex_lc_instrument, '_nlc1_').group() == '_nlc1_' -assert re.search(regex_lc_instrument, '_LC6_').group() == '_LC6_' -assert re.search(regex_lc_instrument, 
'_nLC6_').group() == '_nLC6_' -assert re.search(regex_lc_instrument, 'nLC02_').group() == 'nLC02_' -assert re.search(regex_lc_instrument, '_UPCL_').group() == '_UPCL_' -assert re.search(regex_lc_instrument, '_Evo_').group() == '_Evo_' -assert re.search(regex_lc_instrument, '_EvO_').group() == '_EvO_' - - -# check not HeLa, HeLA, ON, OFF MNT, MA, QC, ALL -regex_not_researcher = '[Hh][eE][Ll][aA]|ON|OFF|MNT|MA|QC|ALL|method|Test' - -assert re.search(regex_not_researcher, 'HeLa').group() == 'HeLa' -assert re.search(regex_not_researcher, 'Hela').group() == 'Hela' -assert re.search(regex_not_researcher, 'hela').group() == 'hela' -assert re.search(regex_not_researcher, 'MNT').group() == 'MNT' -assert re.search(regex_not_researcher, 'MA').group() == 'MA' -assert re.search(regex_not_researcher, 'QC').group() == 'QC' -assert re.search(regex_not_researcher, 'MA_OFF').group() == 'MA' -assert re.search(regex_not_researcher, '_LiNi_') == None - - -type_run = {'MA': 'MNT', - 'MNT': 'MNT', - 'QC': 'QC'} - -# based on hints from core facility -ms_instrument_mapping = { - 'LUMOS1': 'LUMOS', - 'ECPL0': 'EXPL0', - 'Q10': 'QE10' - -} - -lc_instrument_mapping = { - f'LC{i}': f'LC{i:02}' for i in range(10) -} - - -def get_metadata_from_filenames(selected: Iterable, apply_cleaning=False): - data_meta = {} - for filename in selected: - # The first two fields are in order, the rest needs matching. 
- _entry = {} - try: - _entry['date'], _entry['ms_instrument'], _rest_filename = filename.split( - '_', maxsplit=2) - except ValueError: - logger.error(f'Unexpected filenaming format: {filename}') - _entry['rest'] = filename - data_meta[filename] = _entry - continue - - _entry['ms_instrument'] = _entry['ms_instrument'].upper() - if apply_cleaning and _entry['ms_instrument'] in ms_instrument_mapping: - _entry['ms_instrument'] = ms_instrument_mapping[_entry['ms_instrument']] - - _entry['lc_instrument'] = None - try: - for regex in [regex_lc_instrument, '[_]*[Bb][Rr][0-9]+[_]*']: - try: - _entry['lc_instrument'] = re.search( - regex, _rest_filename).group().strip('_') - break - except AttributeError: - pass - finally: - if _entry['lc_instrument']: - _rest_filename = _rest_filename.replace( - _entry['lc_instrument'], '').replace('__', '_') - _entry['lc_instrument'] = _entry['lc_instrument'].upper() - if _entry['lc_instrument'][0] == 'N': - if apply_cleaning: - _entry['lc_instrument'] = f"{_entry['lc_instrument'][1:]}" - else: - _entry['lc_instrument'] = f"n{_entry['lc_instrument'][1:]}" - if apply_cleaning and _entry['lc_instrument'] in lc_instrument_mapping: - _entry['lc_instrument'] = lc_instrument_mapping[_entry['lc_instrument']] - else: - # try rare cases: "20191216_QE4_nL4_MM_QC_MNT_HELA_01 - lc_rare_cases = { - 'nL4': 'nLC4', - 'nL0': 'nLC0', - 'nL2': 'nLC2', - } - for typo_key, replacement_key in lc_rare_cases.items(): - if typo_key in _rest_filename: - _entry['lc_instrument'] = replacement_key - _rest_filename = _rest_filename.replace( - f'{typo_key}_', '') - if not _entry['lc_instrument']: - logger.error(f'Could not find LC instrument in {filename}') - # researcher after LC instrument - try: - _entry['researcher'] = re.search( - regex_researcher, _rest_filename).group().strip('_') - _cleaned_filename = _rest_filename.replace( - _entry['researcher'], '').replace('__', '_') - while re.search(regex_not_researcher, _entry['researcher']): - _entry['researcher'] = 
re.search( - regex_researcher, _cleaned_filename) - if _entry['researcher']: - _entry['researcher'] = _entry['researcher'].group().strip('_') - else: - raise AttributeError - _cleaned_filename = _cleaned_filename.replace( - _entry['researcher'], '').replace('__', '_') - if _entry['researcher']: - _rest_filename = _rest_filename.replace( - _entry['researcher'], '').replace('__', '_') - else: - _entry['researcher'] = None - except AttributeError: - logger.critical(f'Found no researcher ID: {filename}') - _entry['researcher'] = None - - _entry['rest'] = _rest_filename - data_meta[filename] = _entry - return data_meta - - -test_cases = ['20131014_QE5_UPLC9_ALL_MNT_HELA_01', - '20150830_qe3_uplc9_LVS_MNT_HELA_07', - '20191216_QE4_nL4_MM_QC_MNT_HELA_01_20191217122319', - '20191012_QE1_nL0_GP_SA_HELA_L-CTR_M-VLX+THL_H-VLX+THL+MLN_GGIP_EXP4_F01', - '20181027_QE8_nL2_QC_AGF_MNT_BSA_01' - ] -# 20150622_QE5_UPLC8_ALL_QC_Hela_method_Test - -# print(get_metadata_from_filenames(test_cases)) - -assert get_metadata_from_filenames(test_cases) == { - '20131014_QE5_UPLC9_ALL_MNT_HELA_01': {'date': '20131014', - 'ms_instrument': 'QE5', - 'lc_instrument': 'UPLC9', - 'researcher': None, - 'rest': '_ALL_MNT_HELA_01'}, - '20150830_qe3_uplc9_LVS_MNT_HELA_07': {'date': '20150830', - 'ms_instrument': 'QE3', - 'lc_instrument': 'UPLC9', - 'researcher': 'LVS', - 'rest': '_MNT_HELA_07'}, - '20191216_QE4_nL4_MM_QC_MNT_HELA_01_20191217122319': {'date': '20191216', - 'ms_instrument': 'QE4', - 'lc_instrument': 'nLC4', - 'researcher': 'MM', - 'rest': '_QC_MNT_HELA_01_20191217122319'}, - '20191012_QE1_nL0_GP_SA_HELA_L-CTR_M-VLX+THL_H-VLX+THL+MLN_GGIP_EXP4_F01': - {'date': '20191012', - 'ms_instrument': 'QE1', - 'lc_instrument': 'nLC0', - 'researcher': 'GP', - 'rest': '_SA_HELA_L-CTR_M-VLX+THL_H-VLX+THL+MLN_GGIP_EXP4_F01'}, - '20181027_QE8_nL2_QC_AGF_MNT_BSA_01': {'date': '20181027', - 'ms_instrument': 'QE8', - 'lc_instrument': 'nLC2', - 'researcher': 'AGF', - 'rest': 'QC_MNT_BSA_01'}, -} diff --git 
a/vaep/databases/__init__.py b/vaep/databases/__init__.py index 1a1fb4a49..e69de29bb 100644 --- a/vaep/databases/__init__.py +++ b/vaep/databases/__init__.py @@ -1,2 +0,0 @@ -from . import uniprot - diff --git a/vaep/databases/diseases.py b/vaep/databases/diseases.py index edea11912..f4800f76f 100644 --- a/vaep/databases/diseases.py +++ b/vaep/databases/diseases.py @@ -3,7 +3,8 @@ logger = logging.getLogger(__name__) -def get_disease_association(doid:int, limit:int=1000): + +def get_disease_association(doid: int, limit: int = 1000): params = {'type1': -26, 'type2': 'value2', 'id1': f'DOID:{doid}', @@ -11,8 +12,8 @@ def get_disease_association(doid:int, limit:int=1000): 'limit': limit, 'format': 'json'} diseases_url_all = 'https://api.jensenlab.org/Integration' - - r = requests.get(diseases_url_all, params=params) + + r = requests.get(diseases_url_all, params=params) if r.status_code == 200: data, is_there_more = r.json() else: diff --git a/vaep/databases/uniprot.py b/vaep/databases/uniprot.py deleted file mode 100644 index 123c14ae2..000000000 --- a/vaep/databases/uniprot.py +++ /dev/null @@ -1,214 +0,0 @@ -""" -New UniProt REST API, see -https://www.uniprot.org/help/id_mapping. 
-""" -import re -import time -import json -import zlib -from xml.etree import ElementTree -from urllib.parse import urlparse, parse_qs, urlencode -import requests -from requests.adapters import HTTPAdapter, Retry - - -POLLING_INTERVAL = 3 -API_URL = "https://rest.uniprot.org" - - -retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504]) -session = requests.Session() -session.mount("https://", HTTPAdapter(max_retries=retries)) - - -def check_response(response): - try: - response.raise_for_status() - except requests.HTTPError: - print(response.json()) - raise - - -def submit_id_mapping(from_db, to_db, ids): - request = requests.post( - f"{API_URL}/idmapping/run", - data={"from": from_db, "to": to_db, "ids": ",".join(ids)}, - ) - check_response(request) - return request.json()["jobId"] - - -def get_next_link(headers): - re_next_link = re.compile(r'<(.+)>; rel="next"') - if "Link" in headers: - match = re_next_link.match(headers["Link"]) - if match: - return match.group(1) - - -def check_id_mapping_results_ready(job_id): - while True: - request = session.get(f"{API_URL}/idmapping/status/{job_id}") - check_response(request) - j = request.json() - if "jobStatus" in j: - if j["jobStatus"] == "RUNNING": - print(f"Retrying in {POLLING_INTERVAL}s") - time.sleep(POLLING_INTERVAL) - else: - raise Exception(j["jobStatus"]) - else: - return bool(j["results"] or j["failedIds"]) - - -def get_batch(batch_response, file_format, compressed): - batch_url = get_next_link(batch_response.headers) - while batch_url: - batch_response = session.get(batch_url) - batch_response.raise_for_status() - yield decode_results(batch_response, file_format, compressed) - batch_url = get_next_link(batch_response.headers) - - -def combine_batches(all_results, batch_results, file_format): - if file_format == "json": - for key in ("results", "failedIds"): - if key in batch_results and batch_results[key]: - all_results[key] += batch_results[key] - elif file_format == "tsv": - 
return all_results + batch_results[1:] - else: - return all_results + batch_results - return all_results - - -def get_id_mapping_results_link(job_id): - url = f"{API_URL}/idmapping/details/{job_id}" - request = session.get(url) - check_response(request) - return request.json()["redirectURL"] - - -def decode_results(response, file_format, compressed): - if compressed: - decompressed = zlib.decompress(response.content, 16 + zlib.MAX_WBITS) - if file_format == "json": - j = json.loads(decompressed.decode("utf-8")) - return j - elif file_format == "tsv": - return [line for line in decompressed.decode("utf-8").split("\n") if line] - elif file_format == "xlsx": - return [decompressed] - elif file_format == "xml": - return [decompressed.decode("utf-8")] - else: - return decompressed.decode("utf-8") - elif file_format == "json": - return response.json() - elif file_format == "tsv": - return [line for line in response.text.split("\n") if line] - elif file_format == "xlsx": - return [response.content] - elif file_format == "xml": - return [response.text] - return response.text - - -def get_xml_namespace(element): - m = re.match(r"\{(.*)\}", element.tag) - return m.groups()[0] if m else "" - - -def merge_xml_results(xml_results): - merged_root = ElementTree.fromstring(xml_results[0]) - for result in xml_results[1:]: - root = ElementTree.fromstring(result) - for child in root.findall("{http://uniprot.org/uniprot}entry"): - merged_root.insert(-1, child) - ElementTree.register_namespace("", get_xml_namespace(merged_root[0])) - return ElementTree.tostring(merged_root, encoding="utf-8", xml_declaration=True) - - -def print_progress_batches(batch_index, size, total): - n_fetched = min((batch_index + 1) * size, total) - print(f"Fetched: {n_fetched} / {total}") - - -def get_id_mapping_results_search(url): - parsed = urlparse(url) - query = parse_qs(parsed.query) - file_format = query["format"][0] if "format" in query else "json" - if "size" in query: - size = int(query["size"][0]) - 
else: - size = 500 - query["size"] = size - compressed = ( - query["compressed"][0].lower() == "true" if "compressed" in query else False - ) - parsed = parsed._replace(query=urlencode(query, doseq=True)) - url = parsed.geturl() - request = session.get(url) - check_response(request) - results = decode_results(request, file_format, compressed) - total = int(request.headers["x-total-results"]) - print_progress_batches(0, size, total) - for i, batch in enumerate(get_batch(request, file_format, compressed), 1): - results = combine_batches(results, batch, file_format) - print_progress_batches(i, size, total) - if file_format == "xml": - return merge_xml_results(results) - return results - - -def get_id_mapping_results_stream(url): - if "/stream/" not in url: - url = url.replace("/results/", "/results/stream/") - request = session.get(url) - check_response(request) - parsed = urlparse(url) - query = parse_qs(parsed.query) - file_format = query["format"][0] if "format" in query else "json" - compressed = ( - query["compressed"][0].lower() == "true" if "compressed" in query else False - ) - return decode_results(request, file_format, compressed) - - -def query_uniprot_id_mapping(query_list: list, FROM='UniProtKB_AC-ID', TO='Gene_Name', FORMAT='tab'): - """Query Uniprot ID mappings programatically (utility function) - See availabe mappings: https://www.uniprot.org/help/api_idmapping - Function is programmed to query gene IDs based on protein IDs. - - Parameters - ---------- - query_list : list - list of strings containing queries in format specified - in FROM parameter. 
- FROM : str, optional - Format of string-ids in query_list, by default 'ACC+ID' - TO : str, optional - Format to which strings-ids should be matched with, by default 'GENENAME' - FORMAT : str, optional - Separator for Uniprot-ID, by default 'tab' - - Returns - ------- - list: - List of tuples of type (FROM, TO) - """ - job_id = submit_id_mapping( - from_db=FROM, to_db=TO, ids=query_list - ) - if check_id_mapping_results_ready(job_id): - link = get_id_mapping_results_link(job_id) - results = get_id_mapping_results_search(link) - results = {d['from']: d['to'] for d in results['results']} - return results - - -if __name__ == "__main__": - ids= ['A0A075B6I0', 'A0A075B6I1', 'A0A075B6I6', 'A0A075B6I9',] - results = query_uniprot_id_mapping(ids) - print(results) - # {'A0A075B6I0': 'IGLV8-61', 'A0A075B6I1': 'IGLV4-60', 'A0A075B6I6': 'IGLV1-50', 'A0A075B6I9': 'IGLV7-46'} diff --git a/vaep/fasta.py b/vaep/fasta.py deleted file mode 100644 index bf1351ded..000000000 --- a/vaep/fasta.py +++ /dev/null @@ -1,163 +0,0 @@ -""" -Based on notebook received by [Annelaura Bach](https://www.cpr.ku.dk/staff/mann-group/?pure=en/persons/443836) -and created by Johannes B. Müller -[scholar](https://scholar.google.com/citations?user=Rn1OS8oAAAAJ&hl=de), -[MPI Biochemistry](https://www.biochem.mpg.de/person/93696/2253) -""" -import logging - -from tqdm import tqdm - -logger = logging.getLogger(__name__) - - -def get_n_miscleaved(pep_sequences: list, num_missed: int): - """Build miscleaved peptide sequences for a number of missed cleaving sides. - Call function recusively if you want not only a specific number of miscleavages.""" - _miscleaved = [] - for i in range(len(pep_sequences)): - if i >= num_missed: - _miscleaved.append(''.join(pep_sequences[i - num_missed:i + 1])) - return _miscleaved - - -def cleave_to_tryptic(seq, num_missed_cleavages=1, reversed=False, add_rxk=False): - """Takes a sequence and returns an array of peptides cleaved C-term to R and K. 
- - Parameters - ---------- - seq : str, optional - sequence of amino acids as string - num_missed_cleavages : int, optional - number of missed cleavages to consider, by default 1 - reversed : bool, optional - if reversed decoy peptide sequences should be added, by default False - """ - - seq.replace(' ', '') # remove white spaces - seq = seq.upper() - # add whitespace behind K and R for splitting - seq = seq.replace('K', 'K ').replace('R', 'R ').split() - - peps_seq = [seq, ] - - for i in range(1, num_missed_cleavages + 1): - _seq = get_n_miscleaved(seq, num_missed=i) - peps_seq.append(_seq) - - if add_rxk and num_missed_cleavages < 2: - _seq = find_rxk_peptides(seq) - peps_seq.append(_seq) - - return peps_seq - - -def find_rxk_peptides(l_peptides): - """Combine 3 peptides to one, if the first is an - 'RxK'-peptide: RX, XR, KX, XK - where the X can - be any other amino-acid. - - Returns - ------- - list - list of miscleaved peptides. Can be empty. - """ - if len(l_peptides) >= 3: - rdx_peptides = [] - for i in range(len(l_peptides) - 2): - if len(l_peptides[i]) <= 2: - rdx_peptides.append( - ''.join(l_peptides[i:i + 3]) - ) - return rdx_peptides - else: - return [] - - -def read_fasta(fp): - """Read a fasta file and yield continously header and sequences.""" - header, seq = None, [] - for line in fp: - line = line.rstrip() - if line.startswith(">"): - if header: - yield (header, ''.join(seq)) - header, seq = line, [] - else: - seq.append(line) - if header: - yield (header, ''.join(seq)) - - -def iterFlatten(root): - """Flatten a nested structure.""" - if isinstance(root, (list, tuple)): - for element in root: - for e in iterFlatten(element): - yield e - else: - yield root - - -def count_peptide_matches(peptide_to_proteinID: dict, - protein_to_gene: dict = None, - level: str = 'protein_id') -> dict: - """Count the number of matches of a peptide to the specified level. - Provides the basis for summary statistics (a counter of matches). 
- - Possibly to be extended in a class handling the matches of proteinIDs to - peptide sequences. - - Parameters - ---------- - peptide_to_proteinID : dict - mapping of peptides to a list of proteinIDs. - protein_to_gene : dict, optional - Uniport mapping of protein IDs (no isotopes ending "-2", "-3", etc) - in case of level='gene', by default None - level : str, optional - to which level the peptides should be matched. - 'protein_id', 'protein' or 'gene', by default 'protein_id' - - Returns - ------- - dict - Counter of number of matches of peptides to the specified level items. - {1 : 3, 3: 5} means that 3 peptides have been match uniquly to one item (e.g. gene) - and 5 peptides have been matched to 3 items (e.g. genes). - - Raises - ------ - KeyError - [description] - """ - assert level in ['protein_id', 'protein', 'gene'], ValueError( - 'Specify one of the three (in order of aggregation level): {}'.format( - ', '.join(['prot_id', 'prot', 'gene']))) - if level == 'gene': - assert protein_to_gene is not None, "Please provide protein to gene level name" - _set_missing_entires = set() - n_peptides_mapped_to_level = {} - - for _pep, _protein_ids in tqdm(peptide_to_proteinID.items()): - - if level == 'protein_id': - n_peptides_mapped_to_level[_pep] = len(_protein_ids) - elif level == 'protein': - n_peptides_mapped_to_level[_pep] = len( - {x.split('-')[0] for x in peptide_to_proteinID[_pep]}) - elif level == 'gene': - _proteins = {x.split('-')[0] for x in peptide_to_proteinID[_pep]} - _set_genes = set() - for _prot in _proteins: - try: - _set_genes.add(protein_to_gene[_prot]) - except KeyError: - _set_missing_entires.add(_prot) - n_peptides_mapped_to_level[_pep] = len(_set_genes) - else: - raise KeyError(f'unknown level: {level}') - if level == 'gene': - logger.warning( - f'Missing protein to gene mapppings: {len(set(_set_missing_entires))}') - return n_peptides_mapped_to_level diff --git a/vaep/io/__init__.py b/vaep/io/__init__.py index 5371ec649..21cfc5518 100644 
--- a/vaep/io/__init__.py +++ b/vaep/io/__init__.py @@ -15,9 +15,10 @@ logger = logging.getLogger(__name__) logger.info(f"Calling from {__name__}") + def search_files(path='.', query='.txt'): - """Uses Pathlib to find relative to path files - with the query text in their file names. Returns + """Uses Pathlib to find relative to path files + with the query text in their file names. Returns the path relative to the specified path. Parameters @@ -30,7 +31,7 @@ def search_files(path='.', query='.txt'): Returns ------- list - list with files as string containig query key. + list with files as string containig query key. """ path = Path(path) files = [] @@ -71,11 +72,11 @@ def get_subfolders(path): return directories -def resolve_path(path:Union[str, Path], to:Union[str, Path]='.')-> Path: +def resolve_path(path: Union[str, Path], to: Union[str, Path] = '.') -> Path: """Resolve a path partly overlapping with to another path.""" pwd = Path(to).absolute() pwd = [p for p in pwd.parts] - ret = [p for p in Path(path).parts if p not in pwd] + ret = [p for p in Path(path).parts if p not in pwd] return Path('/'.join(ret)) @@ -144,8 +145,8 @@ def parse_dict(input_dict: dict, d = dict() for k, v in input_dict.items(): for (old_type, fct) in types: - if isinstance(v, old_type): - v = fct(v) + if isinstance(v, old_type): + v = fct(v) d[k] = v return d @@ -170,4 +171,4 @@ def extend_name(fname: Union[str, Path], extend_by: str, ext: str = None) -> Pat ext = fname.suffix fname = fname.parent / f"{fname.stem}{extend_by}" fname = fname.with_suffix(ext) - return fname \ No newline at end of file + return fname diff --git a/vaep/io/data_objects.py b/vaep/io/data_objects.py deleted file mode 100644 index 6540ecbad..000000000 --- a/vaep/io/data_objects.py +++ /dev/null @@ -1,662 +0,0 @@ -from collections import Counter -import os -import sys -import logging -import json -from pathlib import Path -import multiprocessing -from types import SimpleNamespace -from typing import Callable, 
Iterable, List, Union - -from tqdm.notebook import tqdm -import numpy as np -import pandas as pd - -from fastcore.meta import delegates - -from vaep.io import dump_json, dump_to_csv -import vaep.io.mq as mq -from vaep.io.mq import MaxQuantOutputDynamic -import vaep.pandas -from vaep.plotting import plot_feat_counts -# from .config import FOLDER_MQ_TXT_DATA, FOLDER_PROCESSED - -logger = logging.getLogger(__name__) -logger.info(f"Calling from {__name__}") - -FOLDER_DATA = Path('data') -FOLDER_DATA.mkdir(exist_ok=True) - -FOLDER_PROCESSED = FOLDER_DATA / 'processed' -FOLDER_PROCESSED.mkdir(exist_ok=True) - -FOLDER_MQ_TXT_DATA = FOLDER_DATA / 'mq_out' - - -# from vaep.cfg import DEFAULTS -DEFAULTS = SimpleNamespace() -DEFAULTS.ALL_SUMMARIES = Path(FOLDER_PROCESSED) / 'all_summaries.json' -DEFAULTS.COUNT_ALL_PEPTIDES = FOLDER_PROCESSED / 'count_all_peptides.json' - -# fastcore.imports has in_notebook, etc functionality -from fastcore.imports import IN_IPYTHON, IN_JUPYTER, IN_COLAB, IN_NOTEBOOK -# IN_IPYTHON,IN_JUPYTER,IN_COLAB,IN_NOTEBOOK = in_ipython(),in_jupyter(),in_colab(),in_notebook() - -N_WORKERS_DEFAULT = os.cpu_count() - 1 if os.cpu_count() <= 16 else 16 -if sys.platform == 'win32' and IN_NOTEBOOK: - N_WORKERS_DEFAULT = 1 - logger.warn( - "only use main process due to issue with ipython and multiprocessing on Windows") - - -manager = multiprocessing.Manager() - - -def _convert_dtypes(df): - """Convert dtypes automatically and make string columns categories.""" - df = df.convert_dtypes() - l_string_columns = df.columns[df.dtypes == 'string'] - if not l_string_columns.empty: - df[l_string_columns] = df[l_string_columns].astype('category') - return df - - -def calc_chunksize(n_workers, len_iterable, factor=4): - """Calculate chunksize argument for Pool-methods. 
- - Source and reference: https://stackoverflow.com/a/54813527/9684872 - """ - chunksize, extra = divmod(len_iterable, n_workers * factor) - if extra: - chunksize += 1 - return chunksize - - -class col_summary: - MS = 'MS' - MS2 = 'MS/MS Identified' - - -class MqAllSummaries(): - - def __init__(self, fp_summaries=DEFAULTS.ALL_SUMMARIES): - fp_summaries = Path(fp_summaries) - if fp_summaries.exists(): - self.df = _convert_dtypes( - pd.read_json(fp_summaries, orient='index')) - print( - f"{self.__class__.__name__}: Load summaries of {len(self.df)} folders.") - else: - if not fp_summaries.parent.exists(): - raise FileNotFoundError( - f'Folder of filename not found: {fp_summaries.parent}') - self.df = None - self.fp_summaries = fp_summaries - self.usecolumns = col_summary() - - def __len__(self): - if self.df is not None: - return len(self.df) - else: - raise ValueError("No data loaded yet.") - - def load_summary(self, folder): - folder_name = folder.stem - try: - mq_output = MaxQuantOutputDynamic(folder) - # to_dict not very performant - return {folder_name: mq_output.summary.iloc[0].to_dict()} - except FileNotFoundError as e: - if not mq_output.files and len(list(mq_output.folder.iterdir())) == 0: - mq_output.folder.rmdir() - logger.warning(f'Remove empty folder: {mq_output}') - self.empty_folders.append(f"{folder_name}\n") - else: - logger.error(f"{mq_output}, No summary and not empty.") - return {} - - def load_new_samples(self, folders, workers: int = 1): - if self.df is not None: - d_summaries = self.df.to_dict(orient='index') - samples = set(folder.stem for folder in folders) - \ - set(d_summaries.keys()) - samples = [folder for folder in folders if folder.stem in samples] - else: - d_summaries = {} - samples = folders - if not hasattr(self, 'empty_folders'): - # should this depend on multiprocessing? 
- self.empty_folders = manager.list() - - if samples: - # process_chunch_fct = self.load_summary - # workers=workers - # desc = 'Load summaries' - if workers > 1: - with multiprocessing.Pool(workers) as p: - # set chunksize: https://stackoverflow.com/a/49533645/9684872 - chunksize = calc_chunksize(workers, len(samples), factor=2) - list_of_updates = list( - tqdm( - p.imap(self.load_summary, samples, - chunksize=chunksize), - total=len(samples), - desc='Load summaries')) - else: - list_of_updates = [self.load_summary( - folder) for folder in tqdm(samples)] - - print("Newly loaded samples:", len(list_of_updates)) - - for d in list_of_updates: - d_summaries.update(d) - - self.df = _convert_dtypes( - pd.DataFrame.from_dict(d_summaries, orient='index')) - self.save_state() - else: - print("No new sample added.") - return self.df - - def save_state(self): - """Save summaries DataFrame as json and pickled object.""" - self.df.to_json(self.fp_summaries, orient='index') - self.df.to_pickle(self.fp_summaries.parent / - f"{self.fp_summaries.stem}.pkl") - - def get_files_w_min_MS2(self, threshold=10_000, relativ_to=FOLDER_MQ_TXT_DATA): - """Get a list of file ids with a minimum MS2 observations.""" - threshold_ms2_identified = threshold - mask = self.df[self.usecolumns.MS2] > threshold_ms2_identified - print(f"Selected {mask.sum()} of {len(mask)} folders.") - return [Path(relativ_to) / folder for folder in self.df.loc[mask].index] - -# maybe move functions related to fnames -def get_fname(N, M): - """Helper function to get file for intensities""" - return f'df_intensities_N{N:05d}_M{M:05d}' - - -def get_folder_names(folders: Iterable[str]): - return set(Path(folder).stem for folder in folders) - - -def create_parent_folder_name(folder: Path) -> str: - return folder.stem[:4] - - -## plotting function for value_counts from FeatureCounter.get_df_counts - - - -def collect_in_chuncks(paths: Iterable[Union[str, Path]], - process_chunk_fct: Callable, - n_workers: int = 
N_WORKERS_DEFAULT, - chunks=10, - desc='Run chunks in parallel') -> List: - """collect the results from process_chunk_fct (chunk of files to loop over). - The idea is that process_chunk_fct creates a more memory-efficient intermediate - result than possible if only callling single fpaths in paths. - - Parameters - ---------- - paths : Iterable - Iterable of paths - process_chunk_fct : Iterable[str, Path] - Callable which takes a chunk of paths and returns an result to collect, e.g. a dict - n_workers : int, optional - number of processes, by default N_WORKERS_DEFAULT - - Returns - ------- - List - List of results returned by process_chunk_fct - """ - paths_splits = np.array_split(paths, min(chunks, len(paths))) - if n_workers > 1: - with multiprocessing.Pool(n_workers) as p: - collected = list(tqdm(p.imap(process_chunk_fct, paths_splits), - total=len(paths_splits), - desc=desc)) - else: - collected = map(process_chunk_fct, paths_splits) - return collected - - -class FeatureCounter(): - def __init__(self, fp_counter: str, counting_fct: Callable[[List], Counter], - idx_names:Union[List, None]=None, - feature_name='feature', - overwrite=False): - self.fp = Path(fp_counter) - self.counting_fct = counting_fct - self.idx_names = idx_names - self.feature_name = feature_name - if self.fp.exists() and not overwrite: - d = self.load(self.fp) - self.counter = d['counter'] - self.loaded = set(folder for folder in d['based_on']) - self.dumps = d['dumps'] - else: - self.loaded = set() # None - self.counter = Counter() - self.dumps = dict() - - def __repr__(self): - return f"{self.__class__.__name__}(fp_counter={str(self.fp)})" - - def get_new_folders(self, folders: List[str]): - ret = get_folder_names(folders) - self.loaded - return ret - - # combine multiprocessing into base class? 
- def sum_over_files(self, folders: List[Path], n_workers=N_WORKERS_DEFAULT, save=True): - if self.loaded: - new_folder_names = self.get_new_folders(folders) - logger.info(f'{len(new_folder_names)} new folders to process.') - if new_folder_names: - folders = [ - folder for folder in folders if folder.stem in new_folder_names] - else: - folders = [] - - if folders: - list_of_sample_dicts = collect_in_chuncks(folders, - process_chunk_fct=self.counting_fct, - n_workers = n_workers, - chunks=n_workers*3, - desc = 'Count features in 100 chunks') - - for d in tqdm(list_of_sample_dicts, - total=len(list_of_sample_dicts), - desc='combine counters from chunks'): - self.counter += d['counter'] - self.dumps.update(d['dumps']) - - if self.loaded: - self.loaded |= new_folder_names - else: - self.loaded = get_folder_names(folders) - if save: - self.save() - else: - logger.info('Nothing to process.') - return self.counter - - @property - def n_samples(self): - return len(self.loaded) - - - def get_df_counts(self) -> pd.DataFrame: - """Counted features as DataFrame with proportion values. 
- - Returns - ------- - pd.DataFrame - _description_ - """ - feat_counts = (pd.Series(self.counter) - .sort_values(ascending=False) - .to_frame('counts')) - feat_counts['proportion'] = feat_counts / self.n_samples - if self.idx_names: - feat_counts.index.names = self.idx_names - feat_counts.reset_index(inplace=True) - feat_counts.index.name = 'consecutive count' - return feat_counts - - def plot_counts(self, df_counts: pd.DataFrame = None, ax=None, prop_feat=0.25, min_feat_prop=.01): - """Plot counts based on get_df_counts.""" - if df_counts is None: - df_counts = self.get_df_counts() - ax = plot_feat_counts(df_counts, - feat_name=self.feature_name, - n_samples=self.n_samples, - count_col='counts', - ax=ax) - n_feat_cutoff = vaep.pandas.get_last_index_matching_proportion( - df_counts=df_counts, prop=prop_feat) - n_samples_cutoff = df_counts.loc[n_feat_cutoff, 'counts'] - logger.info(f'{n_feat_cutoff = }, {n_samples_cutoff = }') - x_lim_max = vaep.pandas.get_last_index_matching_proportion( - df_counts, min_feat_prop) - logger.info(f'{x_lim_max = }') - ax.set_xlim(-1, x_lim_max) - ax.axvline(n_feat_cutoff, c='red') - - # ax.text(n_feat_cutoff + 0.03 * x_lim_max, - # n_samples_cutoff, '25% cutoff', - # style='italic', fontsize=12, - # bbox={'facecolor': 'grey', 'alpha': 0.5, 'pad': 10}) - - ax.annotate(f'{prop_feat*100}% cutoff', - xy=(n_feat_cutoff, n_samples_cutoff), - xytext=(n_feat_cutoff + 0.1 * x_lim_max, n_samples_cutoff), - fontsize=16, - arrowprops=dict(facecolor='black', shrink=0.05)) - - return ax - - def save(self): - """Save state - - { - 'counter': Counter, - 'based_on': list, - 'dumps: dict, - } - """ - d = {'counter': self.counter, - 'based_on': list(self.loaded), - 'dumps': {k: str(v) for k, v in self.dumps.items()}} - logger.info(f"Save to: {self.fp}") - dump_json(d, filename=self.fp) - - def load(self, fp): - with open(self.fp) as f: - d = json.load(f) - d['counter'] = Counter(d['counter']) - d['dumps'] = {k: Path(v) for k,v in d['dumps'].items()} - 
return d - - def load_dump(self, fpath, fct=pd.read_csv, use_cols=None): - return fct(fpath, index=self.idx_names, usecols=None) - - -class Count(): - - def __init__(self, - process_folder_fct: Callable, - use_cols=None, - parent_folder_fct: Callable = create_parent_folder_name, - outfolder=FOLDER_PROCESSED / 'dumps', - dump=False): - self.outfolder = Path(outfolder) - self.outfolder.mkdir(exist_ok=True, parents=True) - self.use_cols = use_cols - self.process_folder_fct = process_folder_fct - self.parent_folder_fct = parent_folder_fct - self.dump = dump - - def __call__(self, folders, - **fct_args): - logging.debug( - f"Passed function arguments for process_folder_fct Callable: {fct_args}") - c = Counter() - fpath_dict = {} - for folder in tqdm(folders): - folder = Path(folder) - df = self.process_folder_fct( - folder=folder, use_cols=self.use_cols, **fct_args) - c.update(df.index) - if self.dump: - fpath_dict[folder.stem] = dump_to_csv(df, folder=folder, outfolder=self.outfolder, - parent_folder_fct=self.parent_folder_fct) - ret = {'counter': c, 'dumps': fpath_dict} - return ret - -### aggregated peptides - -# # check df for redundant information (same feature value for all entries) -usecols = mq.COLS_ + ['Potential contaminant', mq.mq_col.SEQUENCE] - - -def count_peptides(folders: List[Path], dump=True, - usecols=usecols, - parent_folder_fct: Callable = create_parent_folder_name, - outfolder=FOLDER_PROCESSED / 'agg_peptides_dumps'): - c = Counter() - fpath_dict = {} - for folder in folders: - peptides = pd.read_table(folder / 'peptides.txt', - usecols=usecols, - index_col=0) - mask = (peptides[mq.mq_col.INTENSITY] == 0) | ( - peptides["Potential contaminant"] == '+') - peptides = peptides.loc[~mask] - c.update(peptides.index) - if dump: - fpath_dict[folder.stem] = dump_to_csv(peptides.drop('Potential contaminant', axis=1), - folder=folder, outfolder=outfolder, - parent_folder_fct=parent_folder_fct) - ret = {'counter': c, 'dumps': fpath_dict} - return ret - 
-d_dtypes_training_sample = { - 'Sequence': pd.StringDtype(), - 'Proteins': pd.StringDtype(), - 'Leading razor protein': pd.StringDtype(), - 'Gene names': pd.StringDtype(), - 'Intensity': pd.Int64Dtype() -} - - -def load_agg_peptide_dump(fpath): - fpath = Path(fpath) - peptides = pd.read_csv(fpath, index_col=0, dtype=d_dtypes_training_sample) - return peptides - -@delegates() -class PeptideCounter(FeatureCounter): - - def __init__(self, - fp_counter: str, - counting_fct: Callable[[List], Counter] = count_peptides, - idx_names=['Sequence'], - feature_name='aggregated peptide', - **kwargs): - super().__init__(fp_counter, counting_fct=counting_fct, - idx_names=idx_names, feature_name=feature_name, **kwargs) - - @staticmethod - def load_dump(fpath): - return load_agg_peptide_dump(fpath) - - - -### Evidence -evidence_cols = mq.mq_evidence_cols - - -def select_evidence(df_evidence: pd.DataFrame) -> pd.DataFrame: - mask = (df_evidence[evidence_cols.Potential_contaminant] - == '+') | (df_evidence[evidence_cols.Intensity] == 0) - evidence = df_evidence.loc[~mask].drop( - evidence_cols.Potential_contaminant, axis=1) - evidence = evidence.dropna(subset=[evidence_cols.Intensity]) - return evidence - - -idx_columns_evidence = [evidence_cols.Sequence, evidence_cols.Charge] - - -def load_process_evidence(folder: Path, use_cols, select_by): - evidence = pd.read_table(folder / 'evidence.txt', - usecols=idx_columns_evidence + use_cols) - evidence = select_evidence(evidence) - evidence = vaep.pandas.select_max_by( - evidence, grouping_columns=idx_columns_evidence, selection_column=select_by) - evidence = evidence.set_index(idx_columns_evidence).sort_index() - return evidence - - -def count_evidence(folders: List[Path], - select_by: str = 'Score', - dump=True, - use_cols=[evidence_cols.mz, - evidence_cols.Protein_group_IDs, - evidence_cols.Intensity, - evidence_cols.Score, - evidence_cols.Potential_contaminant], - parent_folder_fct: Callable = create_parent_folder_name, - 
outfolder=FOLDER_PROCESSED / 'evidence_dumps'): - outfolder = Path(outfolder) - outfolder.mkdir(exist_ok=True, parents=True) - c = Counter() - if dump: - fpath_dict = {} - for folder in tqdm(folders): - folder = Path(folder) - evidence = load_process_evidence( - folder=folder, use_cols=use_cols, select_by=select_by) - c.update(evidence.index) - if dump: - fpath_dict[folder.stem] = dump_to_csv(evidence, folder=folder, outfolder=outfolder, - parent_folder_fct=parent_folder_fct) - ret = {'counter': c, 'dumps': fpath_dict} - return ret - - -@delegates() -class EvidenceCounter(FeatureCounter): - - def __init__(self, fp_counter: str, - counting_fct: Callable[[List], Counter] = count_evidence, - idx_names=['Sequence', 'Charge'], - feature_name='charged peptide', - **kwargs): - super().__init__(fp_counter, counting_fct, - idx_names=idx_names, feature_name=feature_name, **kwargs) - - # Methods should use super, otherwise non-specific duplication is needed. - def save(self): - """Save state - - { - 'counter': Counter with tuple keys, - 'based_on': list - } - """ - d = {'counter': vaep.pandas.create_dict_of_dicts(self.counter), - 'based_on': list(self.loaded), - 'dumps': {k: str(v) for k, v in self.dumps.items()}} - print(f"Save to: {self.fp}") - dump_json(d, filename=self.fp) - - def load(self, fp): - with open(self.fp) as f: - d = json.load(f) - d['counter'] = Counter( - vaep.pandas.flatten_dict_of_dicts(d['counter'])) - d['dumps'] = {k: Path(v) for k,v in d['dumps'].items()} - return d - - -def load_evidence_dump(fpath, index_col=['Sequence', 'Charge']): - df = pd.read_csv(fpath, index_col=index_col) - return df - -### Protein Groups - - -pg_cols = mq.mq_protein_groups_cols - -# def load_process_evidence(folder: Path, use_cols, select_by): - - -def load_and_process_proteinGroups(folder: Union[str, Path], - #use_cols not really a parameter (or needs asserts?) 
- use_cols: List = [ - pg_cols.Protein_IDs, - pg_cols.Majority_protein_IDs, - pg_cols.Gene_names, - pg_cols.Evidence_IDs, - pg_cols.Q_value, - pg_cols.Score, - pg_cols.Only_identified_by_site, - pg_cols.Reverse, - pg_cols.Potential_contaminant, - pg_cols.Intensity, -]): - folder = Path(folder) - pg = pd.read_table(folder / 'proteinGroups.txt', - usecols=use_cols) - mask = pg[[pg_cols.Only_identified_by_site, pg_cols.Reverse, - pg_cols.Potential_contaminant]].notna().sum(axis=1) > 0 - pg = pg.loc[~mask] - mask = pg[pg_cols.Intensity] > 1 - pg = pg.loc[mask] - gene_set = pg[pg_cols.Gene_names].str.split(';') - col_loc_gene_names = pg.columns.get_loc(pg_cols.Gene_names) - _ = pg.insert(col_loc_gene_names+1, 'Number of Genes', - gene_set.apply(vaep.pandas.length)) - mask_no_gene = pg[pg_cols.Gene_names].isna() - pg_no_gene = pg.loc[mask_no_gene] - logger.debug(f"Entries without any gene annotation: {len(pg_no_gene)}") - pg = vaep.pandas.select_max_by(df=pg.loc[~mask_no_gene], - grouping_columns=[pg_cols.Gene_names], - selection_column=pg_cols.Score) - pg = pg.append(pg_no_gene) - pg = pg.set_index(pg_cols.Protein_IDs) - return pg - - - - - -count_protein_groups = Count(load_and_process_proteinGroups, - use_cols=[ - pg_cols.Protein_IDs, - pg_cols.Majority_protein_IDs, - pg_cols.Gene_names, - pg_cols.Evidence_IDs, - pg_cols.Q_value, - pg_cols.Score, - pg_cols.Only_identified_by_site, - pg_cols.Reverse, - pg_cols.Potential_contaminant, - pg_cols.Intensity, - ], - outfolder=FOLDER_PROCESSED / 'proteinGroups_dumps', - dump=True) - - -@delegates() -class ProteinGroupsCounter(FeatureCounter): - - def __init__(self, fp_counter: str, - counting_fct: Callable[[List], - Counter] = count_protein_groups, - idx_names=[pg_cols.Protein_IDs], # mq_specfic - feature_name='protein group', - **kwargs): - super().__init__(fp_counter, counting_fct, idx_names=idx_names, - feature_name=feature_name, **kwargs) - - -def load_pg_dump(folder, use_cols=None): - logger.debug(f"Load: {folder}") - df 
= pd.read_csv(folder, index_col=pg_cols.Protein_IDs, usecols=use_cols) - return df - -## Gene Counter - -def pg_idx_gene_fct(folder:Union[str, Path], use_cols=None): - folder = Path(folder) - logger.debug(f"Load: {folder}") - df = pd.read_csv(folder, index_col=pg_cols.Gene_names, usecols=use_cols) - return df - - -count_genes = Count(pg_idx_gene_fct, - use_cols=[ - pg_cols.Protein_IDs, - pg_cols.Gene_names, - pg_cols.Intensity, - ], - outfolder=FOLDER_PROCESSED / 'gene_dumps', # don't dump, only read - dump=False) - - -#summing needs to be done over processed proteinGroup dumps -@delegates() -class GeneCounter(FeatureCounter): - """Gene Counter to count gene in dumped proteinGroups.""" - - def __init__(self, fp_counter: str, - counting_fct: Callable[[List], Counter] = count_genes, - feature_name='gene', - idx_names=['Gene names'], **kwargs): - super().__init__(fp_counter, counting_fct, idx_names=idx_names, - feature_name=feature_name, **kwargs) diff --git a/vaep/io/dataloaders.py b/vaep/io/dataloaders.py index e98d2dd03..7443d28d6 100644 --- a/vaep/io/dataloaders.py +++ b/vaep/io/dataloaders.py @@ -33,7 +33,7 @@ def __init__(self, scaler : [type] A pipeline of transform to apply to the dataset. DataSetClass : torch.utils.data.Dataset - Type of dataset to use for generating single samples based on + Type of dataset to use for generating single samples based on DataFrames. batch_size : int Batch size to use. 
@@ -49,7 +49,10 @@ def __init__(self, self.scaler = scaler self.batch_size = batch_size - def get_dls(self, shuffle_train: bool = True, **kwargs) -> Tuple[torch.utils.data.DataLoader, torch.utils.data.DataLoader]: + def get_dls(self, + shuffle_train: bool = True, + **kwargs) -> Tuple[torch.utils.data.DataLoader, + torch.utils.data.DataLoader]: self.shuffle_train = shuffle_train dl_train = DataLoader( dataset=self.data_train, @@ -106,7 +109,7 @@ def get_dls(train_X: pandas.DataFrame, transforms = VaepPipeline(df_train=train_X, encode=dae_default_pipeline, decode=['normalize']) - dls = get_dls(train_X, val_X, transforms, bs=4) + dls = get_dls(train_X, val_X, transforms, bs=4) """ train_ds = datasets.DatasetWithTarget(df=train_X, transformer=transformer) diff --git a/vaep/io/datasets.py b/vaep/io/datasets.py index 9f854e1ce..53839626d 100644 --- a/vaep/io/datasets.py +++ b/vaep/io/datasets.py @@ -9,6 +9,7 @@ DEFAULT_DTYPE = torch.get_default_dtype() + class PeptideDatasetInMemory(Dataset): """Peptide Dataset fully in memory.""" @@ -23,7 +24,7 @@ def __init__(self, data: np.array, mask: np.array = None, fill_na=0.0): Peptide data for training, potentially with missings. mask : [type], optional Mask selecting values for evaluation from data(y), by default None - If no mask is provided, all non-missing values from `data`-array + If no mask is provided, all non-missing values from `data`-array will be used. 
fill_na : int, optional value to replace missing values with, by default 0 @@ -83,19 +84,21 @@ def __getitem__(self, idx) -> Tuple[torch.Tensor, torch.Tensor]: mask_isna = self.mask_isna.iloc[idx] data = self.data.iloc[idx] mask_isna, data = to_tensor(mask_isna), to_tensor(data) - return mask_isna, data + return mask_isna, data + class DatasetWithTarget(DatasetWithMaskAndNoTarget): def __getitem__(self, idx) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: mask, data = super().__getitem__(idx) - return mask, data, data + return mask, data, data + class DatasetWithTargetSpecifyTarget(DatasetWithMaskAndNoTarget): - def __init__(self, df: pd.DataFrame, targets:pd.DataFrame, + def __init__(self, df: pd.DataFrame, targets: pd.DataFrame, transformer: sklearn.pipeline.Pipeline = None): - """Create a dataset for validation. + """Create a dataset for validation. Parameters ---------- @@ -113,7 +116,7 @@ def __init__(self, df: pd.DataFrame, targets:pd.DataFrame, self.columns = df.columns self.transformer = transformer - self.target = df.fillna(targets) # not really necessary, without mask would not be needed + self.target = df.fillna(targets) # not really necessary, without mask would not be needed if transformer: if hasattr(transformer, 'transform'): @@ -125,16 +128,16 @@ def __init__(self, df: pd.DataFrame, targets:pd.DataFrame, self.data = df self.length_ = len(self.data) - def __getitem__(self, idx) -> Tuple[torch.Tensor, torch.Tensor]: - mask_isna, data = super().__getitem__(idx) - target = to_tensor(self.target.iloc[idx]) + mask_isna, data = super().__getitem__(idx) + target = to_tensor(self.target.iloc[idx]) return mask_isna, data, target + class PeptideDatasetInMemoryMasked(DatasetWithMaskAndNoTarget): """Peptide Dataset fully in memory. - + Dataset: torch.utils.data.Dataset """ @@ -155,7 +158,7 @@ def __init__(self, *args, fill_na=0, **kwargs): class PeptideDatasetInMemoryNoMissings(Dataset): """Peptide Dataset fully in memory. 
- + Dataset: torch.utils.data.Dataset """ diff --git a/vaep/io/datasplits.py b/vaep/io/datasplits.py index a6de3ace5..daae63bc2 100644 --- a/vaep/io/datasplits.py +++ b/vaep/io/datasplits.py @@ -17,6 +17,7 @@ # 'pickle': 'to_pickle', 'csv': ('to_csv', 'read_csv')} + def long_format(df: pd.DataFrame, colname_values: str = 'intensity', # index_name: str = 'Sample ID' @@ -49,7 +50,6 @@ class DataSplits(): train_X: pd.DataFrame = None val_y: pd.DataFrame = None test_y: pd.DataFrame = None - def __post_init__(self): self._items = sorted(self.__dict__) @@ -64,12 +64,12 @@ def __dir__(self): return ['dump', 'from_folder', 'interpolate', 'load', 'test_X', 'test_y', 'to_long_format', 'to_wide_format', 'train_X', 'val_X', 'val_y'] - def dump(self, folder='data', file_format='csv')-> dict: + def dump(self, folder='data', file_format='csv') -> dict: """dump in long format.""" folder = Path(folder) folder.mkdir(parents=True, exist_ok=True) - if not file_format in FILE_FORMAT_TO_DUMP_FCT: + if file_format not in FILE_FORMAT_TO_DUMP_FCT: raise ValueError(f"Select one of these formats: {', '.join(FILE_FORMAT_TO_DUMP_FCT.keys())}") dumps = {} n_dumped = 0 @@ -128,11 +128,11 @@ def to_wide_format(self): _df = _series.unstack() setattr(self, _attr, _df) self._is_wide = True - - def to_long_format(self, name_values:str='intensity'): - if not self._is_wide: + + def to_long_format(self, name_values: str = 'intensity'): + if not self._is_wide: return - + for _attr, _df in self: if _df is None: continue @@ -142,7 +142,7 @@ def to_long_format(self, name_values:str='intensity'): self._is_wide = False # singledispatch possible - def interpolate(self, dataset:Union[str, pd.DataFrame]): + def interpolate(self, dataset: Union[str, pd.DataFrame]): if issubclass(type(dataset), pd.DataFrame): ds = dataset elif issubclass(type(dataset), pd.Series): @@ -152,7 +152,7 @@ def interpolate(self, dataset:Union[str, pd.DataFrame]): ds = getattr(self, dataset) except AttributeError: raise 
AttributeError(f"Please provide a valid attribute, not '{dataset}'. " - "Valid attributes are {}".format(', '.join(x for x in self._items))) + "Valid attributes are {}".format(', '.join(x for x in self._items))) if dataset[-1] in ['y', 'Y']: logger.warning( f'Attempting to interpolate target: {dataset} ' @@ -160,15 +160,13 @@ def interpolate(self, dataset:Union[str, pd.DataFrame]): if ds is None: raise ValueError(f'Attribute is None: {dataset!r}.') if not self._is_wide: - ds = ds.unstack() # series is unstack to DataFrame + ds = ds.unstack() # series is unstack to DataFrame else: raise TypeError(f"Unknown type: {classname(dataset)}." - f" None of str, {class_full_module(pd.DataFrame)}, {class_full_module(pd.Series)}" - ) - - return interpolate(wide_df=ds) - + f" None of str, {class_full_module(pd.DataFrame)}, {class_full_module(pd.Series)}" + ) + return interpolate(wide_df=ds) def load_items(folder: str, items: dict, use_wide_format=False, file_format='csv') -> dict: @@ -184,7 +182,8 @@ def load_items(folder: str, items: dict, use_wide_format=False, file_format='csv read_fct = getattr(pd, FILE_FORMAT_TO_DUMP_FCT[file_format][1]) _df = read_fct(fname) # logic below is suited for csv reader -> maybe split up loading and saving later? - if len(_df.shape) == 1: _df = _df.to_frame().reset_index() # in case Series was pickled + if len(_df.shape) == 1: + _df = _df.to_frame().reset_index() # in case Series was pickled cols = list(_df.columns) if use_wide_format: _df = wide_format(_df.set_index(cols[1:-1]), columns=cols[0], name_values=cols[-1]) @@ -196,7 +195,7 @@ def load_items(folder: str, items: dict, use_wide_format=False, file_format='csv # set default file name -> intergrate into DataSplits? 
-def load_freq(folder:str, file='freq_features.pkl'): +def load_freq(folder: str, file='freq_features.pkl'): folder = Path(folder) fname = folder / file if fname.suffix == '.json': @@ -206,4 +205,4 @@ def load_freq(folder:str, file='freq_features.pkl'): freq_per_feature = pd.read_pickle(fname) else: raise ValueError(f"Unknown Fileextension: {fname.suffix}") - return freq_per_feature \ No newline at end of file + return freq_per_feature diff --git a/vaep/io/filenames.py b/vaep/io/filenames.py deleted file mode 100644 index ee9c122fa..000000000 --- a/vaep/io/filenames.py +++ /dev/null @@ -1,14 +0,0 @@ -import re -import logging -import functools - -logger = logging.getLogger(__name__) - -def read_number_from_str(fname:str, regex:str='M[0-9]*', strip:int=1) -> int: - M = re.search(regex, fname).group() - logger.info(f"Found: {M}") - M = int(M[strip:]) - return M - -read_M_features = functools.partial(read_number_from_str, regex='M[0-9]*', strip=1) -read_N_samples = functools.partial(read_number_from_str, regex='N[0-9]*', strip=1) diff --git a/vaep/io/format.py b/vaep/io/format.py index f316f8b5f..8ac18df95 100644 --- a/vaep/io/format.py +++ b/vaep/io/format.py @@ -11,7 +11,7 @@ def classname(obj): """ Return entire object's class name (repr notation) as str. 
Source: https://gist.github.com/clbarnes/edd28ea32010eb159b34b075687bb49e - + Parameters ---------- obj : object diff --git a/vaep/io/mq.py b/vaep/io/mq.py deleted file mode 100644 index daa843b0e..000000000 --- a/vaep/io/mq.py +++ /dev/null @@ -1,694 +0,0 @@ -import logging -from collections import Counter, namedtuple -from pathlib import Path -from typing import Iterable -import omegaconf - -import pandas as pd -from pandas import Int64Dtype, StringDtype, Float64Dtype - -import vaep.io - -logger = logging.getLogger(__name__) -logger.addHandler(logging.NullHandler()) - - -mq_use_columns = ['Gene names', - 'Intensity', - 'Retention time', - 'Calibrated retention time', - 'Sequence', - 'Leading razor protein', - 'Proteins' - ] - -MqColumnsUsed = namedtuple(typename='MqColumns', - field_names=[ - s.upper().replace(' ', '_') for s in mq_use_columns - ]) -mq_col = MqColumnsUsed(*mq_use_columns) - -FASTA_KEYS = ["Proteins", "Gene names"] - -mq_evidence_cols = {'Sequence': 'Sequence', - 'Length': 'Length', - 'Modifications': 'Modifications', - 'Modified_sequence': 'Modified sequence', - 'Oxidation_M_Probabilities': 'Oxidation (M) Probabilities', - 'Oxidation_M_Score_Diffs': 'Oxidation (M) Score Diffs', - 'Acetyl_Protein_N-term': 'Acetyl (Protein N-term)', - 'Oxidation_M': 'Oxidation (M)', - 'Missed_cleavages': 'Missed cleavages', - 'Proteins': 'Proteins', - 'Leading_proteins': 'Leading proteins', - 'Leading_razor_protein': 'Leading razor protein', - 'Gene_names': 'Gene names', - 'Protein_names': 'Protein names', - 'Type': 'Type', - 'Raw_file': 'Raw file', - 'MSMS_mz': 'MS/MS m/z', - 'Charge': 'Charge', - 'mz': 'm/z', - 'Mass': 'Mass', - 'Uncalibrated_-_Calibrated_mz_[ppm]': 'Uncalibrated - Calibrated m/z [ppm]', - 'Uncalibrated_-_Calibrated_mz_[Da]': 'Uncalibrated - Calibrated m/z [Da]', - 'Mass_error_[ppm]': 'Mass error [ppm]', - 'Mass_error_[Da]': 'Mass error [Da]', - 'Uncalibrated_mass_error_[ppm]': 'Uncalibrated mass error [ppm]', - 'Uncalibrated_mass_error_[Da]': 
'Uncalibrated mass error [Da]', - 'Max_intensity_mz_0': 'Max intensity m/z 0', - 'Retention_time': 'Retention time', - 'Retention_length': 'Retention length', - 'Calibrated_retention_time': 'Calibrated retention time', - 'Calibrated_retention_time_start': - 'Calibrated retention time start', - 'Calibrated_retention_time_finish': 'Calibrated retention time finish', - 'Retention_time_calibration': 'Retention time calibration', - 'Match_time_difference': 'Match time difference', - 'Match_mz_difference': 'Match m/z difference', - 'Match_q-value': 'Match q-value', - 'Match_score': 'Match score', - 'Number_of_data_points': 'Number of data points', - 'Number_of_scans': 'Number of scans', - 'Number_of_isotopic_peaks': 'Number of isotopic peaks', - 'PIF': 'PIF', - 'Fraction_of_total_spectrum': 'Fraction of total spectrum', - 'Base_peak_fraction': 'Base peak fraction', - 'PEP': 'PEP', - 'MSMS_count': 'MS/MS count', - 'MSMS_scan_number': 'MS/MS scan number', - 'Score': 'Score', - 'Delta_score': 'Delta score', - 'Combinatorics': 'Combinatorics', - 'Intensity': 'Intensity', - 'Reverse': 'Reverse', - 'Potential_contaminant': 'Potential contaminant', - 'id': 'id', - 'Protein_group_IDs': 'Protein group IDs', - 'Peptide_ID': 'Peptide ID', - 'Mod._peptide_ID': 'Mod. 
peptide ID', - 'MSMS_IDs': 'MS/MS IDs', - 'Best_MSMS': 'Best MS/MS', - 'Oxidation_M_site_IDs': 'Oxidation (M) site IDs', - 'Taxonomy_IDs': 'Taxonomy IDs'} - -mq_evidence_cols = omegaconf.OmegaConf.create(mq_evidence_cols) - - -mq_evidence_dtypes = {'Length': Int64Dtype(), - 'Modifications': StringDtype, - 'Modified sequence': StringDtype, - 'Oxidation (M) Probabilities': StringDtype, - 'Oxidation (M) Score Diffs': StringDtype, - 'Acetyl (Protein N-term)': Int64Dtype(), - 'Oxidation (M)': Int64Dtype(), - 'Missed cleavages': Int64Dtype(), - 'Proteins': StringDtype, - 'Leading proteins': StringDtype, - 'Leading razor protein': StringDtype, - 'Gene names': StringDtype, - 'Protein names': StringDtype, - 'Type': StringDtype, - 'Raw file': StringDtype, - 'MS/MS m/z': Float64Dtype(), - 'm/z': Float64Dtype(), - 'Mass': Float64Dtype(), - 'Uncalibrated - Calibrated m/z [ppm]': Float64Dtype(), - 'Uncalibrated - Calibrated m/z [Da]': Float64Dtype(), - 'Mass error [ppm]': Float64Dtype(), - 'Mass error [Da]': Float64Dtype(), - 'Uncalibrated mass error [ppm]': Float64Dtype(), - 'Uncalibrated mass error [Da]': Float64Dtype(), - 'Max intensity m/z 0': Float64Dtype(), - 'Retention time': Float64Dtype(), - 'Retention length': Float64Dtype(), - 'Calibrated retention time': Float64Dtype(), - 'Calibrated retention time start': Float64Dtype(), - 'Calibrated retention time finish': Float64Dtype(), - 'Retention time calibration': Float64Dtype(), - 'Match time difference': Int64Dtype(), - 'Match m/z difference': Int64Dtype(), - 'Match q-value': Int64Dtype(), - 'Match score': Int64Dtype(), - 'Number of data points': Int64Dtype(), - 'Number of scans': Int64Dtype(), - 'Number of isotopic peaks': Int64Dtype(), - 'PIF': Int64Dtype(), - 'Fraction of total spectrum': Int64Dtype(), - 'Base peak fraction': Int64Dtype(), - 'PEP': Float64Dtype(), - 'MS/MS count': Int64Dtype(), - 'MS/MS scan number': Int64Dtype(), - 'Score': Float64Dtype(), - 'Delta score': Float64Dtype(), - 'Combinatorics': 
Int64Dtype(), - 'Intensity': Int64Dtype(), - 'Reverse': Int64Dtype(), - 'Potential contaminant': Int64Dtype(), - 'id': Int64Dtype(), - 'Protein group IDs': StringDtype, - 'Peptide ID': Int64Dtype(), - 'Mod. peptide ID': Int64Dtype(), - 'MS/MS IDs': StringDtype, - 'Best MS/MS': Int64Dtype(), - 'Oxidation (M) site IDs': StringDtype, - 'Taxonomy IDs': StringDtype, - } - - -mq_protein_groups_cols = {'Protein_IDs': 'Protein IDs', - 'Majority_protein_IDs': 'Majority protein IDs', - 'Peptide_counts_all': 'Peptide counts (all)', - 'Peptide_counts_razor+unique': 'Peptide counts (razor+unique)', - 'Peptide_counts_unique': 'Peptide counts (unique)', - 'Protein_names': 'Protein names', - 'Gene_names': 'Gene names', - 'Fasta_headers': 'Fasta headers', - 'Number_of_proteins': 'Number of proteins', - 'Peptides': 'Peptides', - 'Razor_+_unique_peptides': 'Razor + unique peptides', - 'Unique_peptides': 'Unique peptides', - 'Sequence_coverage_[%]': 'Sequence coverage [%]', - 'Unique_+_razor_sequence_coverage_[%]': 'Unique + razor sequence coverage [%]', - 'Unique_sequence_coverage_[%]': 'Unique sequence coverage [%]', - 'Mol._weight_[kDa]': 'Mol. weight [kDa]', - 'Sequence_length': 'Sequence length', - 'Sequence_lengths': 'Sequence lengths', - 'Q_value': 'Q-value', - 'Score': 'Score', - 'Intensity': 'Intensity', - 'MSMS_count': 'MS/MS count', - 'Only_identified_by_site': 'Only identified by site', - 'Reverse': 'Reverse', - 'Potential_contaminant': 'Potential contaminant', - 'id': 'id', - 'Peptide_IDs': 'Peptide IDs', - 'Peptide_is_razor': 'Peptide is razor', - 'Mod._peptide_IDs': 'Mod. 
peptide IDs', - 'Evidence_IDs': 'Evidence IDs', - 'MSMS_IDs': 'MS/MS IDs', - 'Best_MSMS': 'Best MS/MS', - 'Oxidation_M_site_IDs': 'Oxidation (M) site IDs', - 'Oxidation_M_site_positions': 'Oxidation (M) site positions', - 'Taxonomy_IDs': 'Taxonomy IDs'} - -mq_protein_groups_cols = omegaconf.OmegaConf.create(mq_protein_groups_cols) - -########################################################################################## -########################################################################################## -# import abc # abc.ABCMeta ? - - -class MaxQuantOutput(): - """Class assisting with MaxQuant txt output folder. - - Parameters - ---------- - folder: pathlib.Path, str - Path to Maxquant `txt` output folder. - - - Attributes - ---------- - self.files : list - list of files in `folder`. - _inital_attritubutes : list - Initial set of non-magic attributes - NAME_FILE_MAP : dict - Keys for known MaxQuant output files. - """ - NAME_FILE_MAP = {'allPeptides': 'allPeptides.txt', - 'evidence': 'evidence.txt', - 'matchedFeatures': 'matchedFeatures.txt', - 'modificationSpecificPeptides': 'modificationSpecificPeptides.txt', - 'ms3Scans': 'ms3Scans.txt', - 'msms': 'msms.txt', - 'msmsScans': 'msmsScans.txt', - 'mzRange': 'mzRange.txt', - 'OxidationSites': 'Oxidation (M)Sites.txt', - 'parameters': 'parameters.txt', - 'peptides': 'peptides.txt', - 'proteinGroups': 'proteinGroups.txt', - 'summary': 'summary.txt'} - - def __init__(self, folder): - self.folder = Path(folder) - self.files = self.get_files() - - def get_files(self): - """Get all txt files in output folder - - Attributes - --------- - paths: NamedTuple - """ - self.paths = vaep.io.search_files(path=self.folder, query='.txt') - return self.paths.files - - @classmethod - def register_file(cls, filename): - - @property - def fct(cls): - return cls.find_attribute(f'_{filename}') - - return fct - - def find_attribute(self, filename): - """Look up or load attribute.""" - if not hasattr(self, filename): - df = 
self.load(filename[1:]) - setattr(self, filename, df) - return getattr(self, filename) - - def load(self, file): - """Load a specified file into memory and return it. - Can be used """ - filepath = self.folder / self.NAME_FILE_MAP[file] - if not Path(filepath).exists(): - raise FileNotFoundError( - f"No such file: {file}.txt: Choose one of the following {', '.join(self.files)}") - - return pd.read_table(filepath, index_col=0) - - # def dump_training_data(self, ) - - def get_list_of_attributes(self): - """Return current list on non-magic instance attributes.""" - return [x for x in dir(self) if not x.startswith('__')] - - def __repr__(self): - return f'{self.__class__.__name__}({self.folder!r})' - - def dump_intensity(self, folder='.'): - """Dump all intensity values from peptides.txt""" - folder = Path(folder) - folder.mkdir(exist_ok=True) - fname = folder / f"{self.folder.stem}.json" - vaep.io.dump_json( - data_dict=self.peptides.Intensity.dropna().to_dict(), - filename=fname) - logger.info(f'Dumped intensities in peptides.txt: {fname}.') - - # needed to reset attributes on instance creation. - _inital_attritubutes = [x for x in dir() if not x.startswith('__')] - - -# register all properties -# Would be great to be able to do this at runtime based on the files actually present. -for filename in MaxQuantOutput.NAME_FILE_MAP.keys(): - setattr(MaxQuantOutput, filename, MaxQuantOutput.register_file(filename)) - -# This version offers less inspection possibilities as the attributes are only set when they are looked up. - - -class MaxQuantOutputDynamic(MaxQuantOutput): - """Class assisting with MaxQuant txt output folder. Fetches only availabe txt files. - - Parameters - ---------- - folder: pathlib.Path, str - Path to Maxquant `txt` output folder. - - Attributes - --------- - files : list - file names on disk - file_keys : list - keys for file name on disk to use for lookup - name_file_map : dict - Keys for known MaxQuant output files. 
- _inital_attritubutes : list - Initial set of non-magic attributes - """ - - def __init__(self, folder): - super().__init__(folder) - - # patch properties at instance creation? - self.name_file_map = {} - for file in self.files: - file_key = Path(file).stem - for symbol in " ()": - file_key = file_key.replace(symbol, '') - self.name_file_map[file_key] = file - self.file_keys = list(self.name_file_map) - - def __getattr__(self, filename): - if filename in self.name_file_map: - df = self.load(filename) - setattr(self, filename, df) - else: - msg = f"No such file: {filename}.txt: Choose one of the following:\n{', '.join(self.file_keys)}" - raise AttributeError(msg) - return df - - # needed to reset attributes on instance creation. - _inital_attritubutes = [x for x in dir() if not x.startswith('__')] - -########################################################################################## -########################################################################################## - - -def check_df(df, columns): - """Check DataFrame for specified columns - - Parameters - ---------- - df : pandas.DataFrame - DataFrame for which should contain `columns`. - columns : Iterable - Iterable of column names. - - Raises - ------ - AttributeError - One or more `columns` are missing. Specifies which. - """ - - missing = [] - for col in columns: - if not col in df: - missing.append(col) - - if missing: - raise AttributeError(f'Missing column(s): {", ".join(missing)}') - - -COLS_ = [mq_col.INTENSITY, mq_col.LEADING_RAZOR_PROTEIN] + FASTA_KEYS - - -def get_peptides_with_single_gene(peptides, keep_columns=COLS_, gene_column=mq_col.GENE_NAMES): - """Get long-data-format. Ungroup gene names. Peptides "shared" by genes - are assigned individual rows. retains only cases with full list of - features provided by `keep_columns`. - - Parameters - ---------- - peptides: pandas.DataFrame - MaxQuant txt output loaded as `pandas.DataFrame`. 
- keep_columns: list - List of columns to keep from the `peptides`.txt, default - {cols_} - gene_column: str - Column containing group information of format "group1;group2", - i.e. in MQ for genes "gene1;gene2". - """.format(cols_=COLS_) - if gene_column not in keep_columns: - keep_columns.append(gene_column) - check_df(peptides, COLS_) - peptides_with_single_gene = peptides[COLS_].dropna(how='any') - if len(peptides) < len(peptides_with_single_gene): - logger.warning('Removed {} of {} entries due to missing values.'.format( - len(peptides) - len(peptides_with_single_gene), - len(peptides) - )) - peptides_with_single_gene[gene_column] = peptides_with_single_gene[gene_column].str.split( - ';') - peptides_with_single_gene = peptides_with_single_gene.explode( - column=gene_column) - return peptides_with_single_gene - - -def get_set_of_genes(iterable, sep_in_str: str = ';'): - "Return the set of unique strings for an Iterable of strings (gene names)." - genes_single_unique = set() - for gene_iterable in pd.Series(iterable).str.split(sep_in_str): - try: - genes_single_unique.update(gene_iterable) - except TypeError: - pass - return genes_single_unique - - -def validate_gene_set(n_gene_single_unique, n_gene_sets): - """Compare N single geens to number of unqiue gene sets. - - Parameters - ---------- - n_gene_single_unique : int - Count in set. - n_gene_sets : int - Count in set. - - Raises - ------ - ValueError - [description] - """ - if n_gene_single_unique < n_gene_sets: - print( - f'There are however less unique-single genes {n_gene_single_unique} than sets.') - elif n_gene_single_unique == n_gene_sets: - print(f'Only realy unique gene (sets)') - else: - raise ValueError( - f'There are more gene-sets than unique genes: {n_gene_sets} vs. 
{n_gene_single_unique}.') - - -def count_genes_in_sets(gene_sets, sep=';'): - """Count for an Iterable of gene_sets - - Parameters - ---------- - gene_sets : Iterable - Iterable of gene_sets which entries are separated with `sep` - sep : str - Seperator of gene sets, default ';' - - Returns - ------- - collections.Counter - Counter with keys as genes and counts as value. - """ - genes_counted_each_in_unique_sets = Counter() - - for gene in pd.Series(gene_sets).dropna(): - try: - gene_iterable = gene.split(sep) - genes_counted_each_in_unique_sets.update(gene_iterable) - except TypeError: - print(f"Error on: {gene}") - - return genes_counted_each_in_unique_sets - - -def get_identifier_from_column(df: pd.DataFrame, identifier_col: str): - """Get unique identifier in a column of a DataFrame. - - Parameters - ---------- - df : pd.DataFrame - (Sub-) DataFrame with data for a gene. - identifier_col : str - Column name in which unique identifier is suspected - - Returns - ------- - Any - unique identifier in `identifier_col` - - Raises - ------ - ValueError - Non-unique identifier in column - """ - identifier = df[identifier_col].unique() - if len(identifier) == 1: - identifier = identifier[0] - else: - raise ValueError( - f"Found {len(identifier)} non-unique identifier: {identifier}") - return identifier - - -def find_exact_cleaved_peptides_for_razor_protein(gene_data, fasta_db, gene_id: str = None): - """Find exactly cleaved peptides based on razor protein in provided data-set - - Parameters - ---------- - gene_data : pandas.DataFrame - Pandas DataFrame with information from MQ peptides.txt output table. - gene_data.columns.name should be set to gene names of gene_data. - fasta_db : dict - Created fasta database with specific scheme. - gene_id : str, optional - gene name, by default None - - Returns - ------- - list - list of exact peptides for the razor protein of the gene. 
- - Raises - ------ - ValueError - Raised if no unique gene identifier could be inferred from the data if no gene-id - was set. - KeyError - If no protein could be found in fasta_db for specified gene. - """ - # ToDo: Replace with config from package - KEY_PEPTIDES = 'peptides' - - if not isinstance(gene_data.columns.name, str) or not gene_id: - try: - gene_id = get_identifier_from_column(gene_data, mq_col.GENE_NAMES) - except ValueError as e: - raise ValueError( - f"Could not identify single, unique identifier from {gene_id} column: {e}" - "Please set columns.name feature to a string-identifier (for genes separated by ;)" - f" not of {type(gene_data.columns.name)}: {gene_data.columns.name}") - protein_id = gene_data[mq_col.LEADING_RAZOR_PROTEIN].unique() - - # ToDo: Check for all proteins and decide on the best? - if len(protein_id) != 1: - logger.warning("- Gene: {:8}: More than one razor protein (try first): {} (Gene: {}) ".format( - gene_id, ", ".join(x for x in protein_id), gene_data.columns.name)) - protein_id = protein_id[0] - try: - peps_exact_cleaved = fasta_db[protein_id][KEY_PEPTIDES][0] - except KeyError: - # MQ marks potential contaminent proteins - if 'CON__' in protein_id: - logger.info( - f"- Gene: {gene_id:8}: " - f"Potential contaminent protein is leading razor protein: {protein_id}" - f" (Gene: {gene_data.columns.name})") - elif 'REV__' in protein_id: - logger.info( - f"- Gene: {gene_id:8}: " - f"Reversed protein is leading razor protein: {protein_id}" - f" (Gene: {gene_data.columns.name})") - else: - raise ValueError(f'Check case for {gene_id} on {protein_id}.') - # assert len(gene_data[mq_col.PROTEINS].unique()) == 1, f"{gene_data[mq_col.PROTEINS].unique()}" - protein_sets = gene_data[mq_col.PROTEINS].unique() - if len(protein_sets) > 1: - logger.warning( - f"More than one set of genes: {gene_data[mq_col.PROTEINS].unique()}") - # ToDo: find intersection of proteins between all sequences. 
- - # Enforce: proteins have to be share between all peptides - protein_sets = [set.split(';') for set in protein_sets] - # ToDo: Check if ordering is relevant (if not all proteins are checked) - proteins_shared_by_all = set( - protein_sets.pop()).intersection(*protein_sets) - # ToDo: Some CON_ proteins are also present in the fasta and appear twice. - # Remove all CON__ proteins from data globally, including their fasta - # pendants (e.g. Keratin: Q04695;CON__Q04695) - # exclude potential other contaminents - protein_sets = [ - x for x in proteins_shared_by_all if not 'CON__' in x] # .sorted() - if len(protein_sets) == 0: - # raise KeyError("No other overall protein found for sequences.") - logger.warning( - f'No good protein found for gene ({gene_id:8}). Return empty list.') - return [] - if len(protein_sets) > 1: - logger.warning( - f"- Gene: {gene_id:8}: " - "Non-unique other protein set found (select first): {}".format( - ', '.join(protein_sets) - )) - protein_id = protein_sets.pop() - peps_exact_cleaved = fasta_db[protein_id][KEY_PEPTIDES][0] - return peps_exact_cleaved - - -def calculate_completness_for_sample( - peps_exact_cleaved: Iterable[str], - peps_in_data: Iterable[str]): - """Calculate completeness for set of peptides. - - Parameters - ---------- - peps_exact_cleaved : Iterable[str] - Iterable of peptides exactly cleaved - peps_in_data : Iterable[str] - Iterable of peptides found during a run / in a sample. Check if peptides - overlap with any of the exact peptides. - - Returns - ------- - float - proportion of exact peptides for which some evidence was found. 
- """ - c = 0 - if not peps_exact_cleaved: - return 0 # no exact peptides - for i, _pep in enumerate(peps_exact_cleaved): - logger.debug(f"Check if exact peptide matches: {_pep}") - for _found_pep in peps_in_data: - logger.debug(f"Check for peptide: {_found_pep}") - if _pep in _found_pep: - c += 1 - break - if c == len(peps_in_data): - logger.debug(f"Last checked peptides in position {i:2}: {_pep}") - logger.debug( - f"Searched therfore {i+1:2} out of {len(peps_exact_cleaved)} peptides, " - f"i.e. a share of {(i+1)/len(peps_exact_cleaved):.3f}") - break - return c / len(peps_exact_cleaved) - - -class ExtractFromPeptidesTxt(): - """Strategy to extract Intensity measurements from MaxQuant txt output peptides.txt. - Creates dump of Training Data. - """ - - def __init__(self, - out_folder, - mq_output_object: MaxQuantOutput, - # Could be made a certain type -> ensure schema is met. - fasta_db: dict - ): - # # ToDo: make this check work - assert isinstance(mq_output_object, MaxQuantOutput) - self._mq_output = mq_output_object - self.out_folder = Path(out_folder) / mq_output_object.folder.stem - self.out_folder.mkdir(exist_ok=True, parents=True) - self.fname_template = '{gene}.json' - self.fasta_db = fasta_db - - def __call__(self): - """Dump valid cases to file. - - Returns: - collections.Counter - Counter with gene IDs as key and completeness as value. 
- """ - _counter = 0 - _genes = dict() - peptides_with_single_gene = get_peptides_with_single_gene( - peptides=self._mq_output.peptides) - for gene_names, data_gene in peptides_with_single_gene.groupby(mq_col.GENE_NAMES): - data_gene.columns.name = gene_names # ToDo: Find better solution - peps_exact_cleaved = find_exact_cleaved_peptides_for_razor_protein( - data_gene, fasta_db=self.fasta_db) - c = calculate_completness_for_sample(peps_exact_cleaved=peps_exact_cleaved, - peps_in_data=data_gene.index) - assert gene_names not in _genes - _genes[gene_names] = c - # ToDo check completeness for each shared protein in list - if c >= .6: - fname = self.out_folder / \ - self.fname_template.format(gene=gene_names) - with open(fname, 'w') as f: - data_gene.to_json(f) - _counter += 1 - logger.info( - f'Dumped {_counter} genes from {self._mq_output.folder.stem}') - fname = self.out_folder / '0_completeness_all_genes.json' - vaep.io.dump_json(_genes, fname) - logger.info(f'Dumped files to: {str(self.out_folder)}') - return _genes - - def __repr__(self): - return f"{self.__class__.__name__}(out_folder={self.out_folder}, mq_output_object={repr(self._mq_output)}, fasta_db)" - - -# so MaxQuantOutput could know which strategy to apply for which file-type? -STRATEGIES = {'peptides.txt': '', - 'evidence.txt': ''} diff --git a/vaep/io/rawfiles.py b/vaep/io/rawfiles.py deleted file mode 100644 index 7fd0b897b..000000000 --- a/vaep/io/rawfiles.py +++ /dev/null @@ -1,116 +0,0 @@ -import os -from pathlib import Path, PurePosixPath - -from IPython.display import display -import ipywidgets as widgets -import pandas as pd - - -queries = set() - - -def find_indices_containing_query(query, X): - mask = X.index.str.contains(query) - X_query = X.loc[mask].sort_index() - queries.add(query) - return X_query - - -def get_unique_stem(query, index: pd.Index): - """Gets stem filename, by splitting filename left of query and remove last underscore _. 
- - Fractionated samples seem to be named by fraction type. Last field indicates fraction. - """ - ret = index.str.split(query).str[0].str.rsplit("_", n=1).str[0] - # ret = index.str.rsplit('_', n=1).str[0] - return sorted(list(set(ret))) - - -def show_fractions(stub: str, df): - subset = df[df.index.str.contains(stub)] - print(repr(stub)) - display(subset) - display(f"N: {len(subset)}") - - -class RawFileViewer: - def __init__(self, df:pd.DataFrame, start_query: str="[Ff]rac", outputfolder: str='.', path_col='path'): - """Indices are used.""" - self.df = df - self.file_names = df.index - # self.queries = set() # add query button - - self.w_query = widgets.Text(start_query) - self.query = start_query - - self.save_button = widgets.Button(description='Save current files.') - self.save_button.on_click(self.save_current_files) - - self.w_data = widgets.Dropdown( - options=self.get_options(self.w_query.value), index=0 - ) - self.stub = None - self.folder = Path(outputfolder) - self.path_col = path_col - - def get_options(self, query): - # this needs to be clearer - try: - sub_df = self.find_indices_containing_query(query) - ret = get_unique_stem(query, sub_df.index) - return ret - except: - print(f"Not a valid query: {query} ") - return () - - def save_current_files(self, button): - """Save files in current views as txt file. 
- """ - folder = Path(self.folder) / self.query - folder.mkdir(exist_ok=True) - fname = folder / f"{self.stub}.txt" - files = self.subset[self.path_col] - line_template = "-get {remote_path} {local_path}" - with open(fname, 'w') as f: - f.write(f'-lmkdir {self.stub}\n') - for _path in files: - _local_path = PurePosixPath(self.stub)/_path.name - _remote_path = PurePosixPath(_path) - line = line_template.format(remote_path=_remote_path, local_path=_local_path) - f.write(f'{line}\n') - print(f"Saved file paths to: {fname}") - - def viewer(self, query, stub: str): - if query != self.query: - self.query = query - print(f"updated query to: {query}") - self.w_data.options = self.get_options(query) - if len(self.w_data.options): - stub = self.w_data.options[0] - else: - print(f"Nothing to display for QUERY: {query}") - stub = None - # find_indices_containing_query = partial(find_indices_containing_query, X=data_unique) - if stub and stub!=self.stub: - try: - subset = self.df[self.df.index.str.contains(stub)] - print('current stub: ', repr(stub)) - display(subset) - display(f"N: {len(subset)}") - self.subset = subset - except TypeError: - print(f"Nothing to display for query: {query}") - self.stub = stub - - def find_indices_containing_query(self, query): - mask = self.df.index.str.contains(query) - X_query = self.df.loc[mask].sort_index() - return X_query - - def view(self): - """Interactive viewer. Updates list of options based on query.""" - # widget for type of data: meta or not. 
might be combined - self.out_sel = widgets.interactive_output( - self.viewer, {"stub": self.w_data, "query": self.w_query} - ) - return widgets.VBox([self.w_query, self.w_data, self.out_sel, self.save_button]) # repr of class diff --git a/vaep/io/thermo_raw_files.py b/vaep/io/thermo_raw_files.py deleted file mode 100644 index 2238d5afc..000000000 --- a/vaep/io/thermo_raw_files.py +++ /dev/null @@ -1,26 +0,0 @@ -cols_instrument = ['Thermo Scientific instrument model', - 'instrument attribute', - 'instrument serial number', ] - - -meta_raw_selected = [ - 'Content Creation Date', - 'Thermo Scientific instrument model', - 'instrument serial number', - 'Software Version', - 'Number of MS1 spectra', - 'Number of MS2 spectra', - 'Number of scans', - 'MS max charge', - 'MS max RT', - 'MS min MZ', - 'MS max MZ', - 'MS scan range', - 'mass resolution', - 'Retention time range', - 'Mz range', - 'beam-type collision-induced dissociation', - 'injection volume setting', - 'dilution factor', -] - diff --git a/vaep/io/types.py b/vaep/io/types.py index b738d04a7..5f807113a 100644 --- a/vaep/io/types.py +++ b/vaep/io/types.py @@ -1,7 +1,9 @@ """ papermill strategy to determine type -see: https://github.com/nteract/papermill/blob/76906a882bb5b3e719ad113c7b2447e0ddffb2c7/papermill/cli.py#L275-L307 +see: https://github.com/nteract/papermill/blob/76906a882bb5b3e719ad113c7b2447e0ddffb2c7/papermill/cli.py#L275-L307 """ + + def resolve_type(value): if value == "True": return True @@ -34,4 +36,4 @@ def _is_float(value): except ValueError: return False else: - return True \ No newline at end of file + return True diff --git a/vaep/model.py b/vaep/model.py index 6d0e39c24..74e947643 100644 --- a/vaep/model.py +++ b/vaep/model.py @@ -13,17 +13,6 @@ logger = logging.getLogger(__name__) - - - - - - - - - - - def build_df_from_pred_batches(pred, scaler=None, index=None, columns=None): pred = np.vstack(pred) if scaler: @@ -32,10 +21,10 @@ def build_df_from_pred_batches(pred, scaler=None, 
index=None, columns=None): return pred -def get_latent_space(model_method_call:callable, - dl:torch.utils.data.DataLoader, - dl_index:pd.Index, - latent_tuple_pos:int=0) -> pd.DataFrame: +def get_latent_space(model_method_call: callable, + dl: torch.utils.data.DataLoader, + dl_index: pd.Index, + latent_tuple_pos: int = 0) -> pd.DataFrame: """Create a DataFrame of the latent space based on the model method call to be used (here: the model encoder or a latent space helper method) @@ -60,7 +49,7 @@ def get_latent_space(model_method_call:callable, for b in dl: model_input = b[1] res = model_method_call(model_input) - #if issubclass(type(res), torch.Tensor): + # if issubclass(type(res), torch.Tensor): if isinstance(res, tuple): res = res[latent_tuple_pos] res = res.detach().numpy() @@ -74,8 +63,6 @@ def get_latent_space(model_method_call:callable, return latent_space - - # # Defining the model manuelly # import torch.nn as nn diff --git a/vaep/models/__init__.py b/vaep/models/__init__.py index 55ac20670..c3c34de02 100644 --- a/vaep/models/__init__.py +++ b/vaep/models/__init__.py @@ -10,7 +10,7 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd -import torch +import torch from fastcore.foundation import L from fastai import learner import sklearn.metrics as sklm @@ -26,8 +26,8 @@ def plot_loss(recorder: learner.Recorder, - norm_train:np.int64=np.int64(1), - norm_val:np.int64=np.int64(1), + norm_train: np.int64 = np.int64(1), + norm_val: np.int64 = np.int64(1), skip_start: int = 5, with_valid: bool = True, ax: plt.Axes = None) -> plt.Axes: @@ -39,9 +39,9 @@ def plot_loss(recorder: learner.Recorder, recorder : learner.Recorder fastai Recorder object, learn.recorder norm_train: np.int64, optional - Normalize epoch loss by number of training samples, by default 1 + Normalize epoch loss by number of training samples, by default 1 norm_val: np.int64, optional - Normalize epoch loss by number of validation samples, by default 1 + Normalize epoch loss by 
number of validation samples, by default 1 skip_start : int, optional Skip N first batch metrics, by default 5 with_valid : bool, optional @@ -61,7 +61,7 @@ def plot_loss(recorder: learner.Recorder, if with_valid: idx = (np.array(recorder.iters) < skip_start).sum() ax.plot(recorder.iters[idx:], L( - recorder.values[idx:]).itemgot(1) / norm_val , label='valid') + recorder.values[idx:]).itemgot(1) / norm_val, label='valid') ax.legend() return ax @@ -70,7 +70,7 @@ def plot_training_losses(learner: learner.Learner, name: str, ax=None, save_recorder: bool = True, - norm_factors = np.array([1,1], dtype='int'), + norm_factors=np.array([1, 1], dtype='int'), folder='figures', figsize=(15, 8)): if ax is None: @@ -96,6 +96,7 @@ def calc_net_weight_count(model: torch.nn.modules.module.Module) -> int: weight_count += np.prod(param.size()) return int(weight_count) + class RecorderDump: """Simple Class to hold fastai Recorder Callback data for serialization using pickle. """ @@ -120,8 +121,6 @@ def load(cls, filepath, name): plot_loss = plot_loss - - def split_prediction_by_mask(pred: pd.DataFrame, mask: pd.DataFrame, check_keeps_all: bool = False) -> Tuple[pd.DataFrame, pd.DataFrame]: @@ -152,17 +151,17 @@ def split_prediction_by_mask(pred: pd.DataFrame, def compare_indices(first_index: pd.Index, second_index: pd.Index) -> pd.Index: """Show difference of indices in other index wrt. to first. First should be the larger collection wrt to the second. This is the set difference of two Index objects. - + If second index is a superset of indices of the first, the set will be empty, although there are differences (default behaviour in pandas). 
- + Parameters ---------- first_index : pd.Index Index, should be superset second_index : pd.Index Index, should be the subset - + Returns ------- pd.Index @@ -181,8 +180,7 @@ def compare_indices(first_index: pd.Index, second_index: pd.Index) -> pd.Index: ('MAE', sklm.mean_absolute_error)] - -def collect_metrics(metrics_jsons:List, key_fct: Callable) -> dict: +def collect_metrics(metrics_jsons: List, key_fct: Callable) -> dict: """Collect and aggregate a bunch of json metrics. Parameters @@ -207,7 +205,7 @@ def collect_metrics(metrics_jsons:List, key_fct: Callable) -> dict: fname = Path(fname) logger.info(f"Load file: {fname = }") - key = key_fct(fname) # level, repeat + key = key_fct(fname) # level, repeat logger.debug(f"{key = }") with open(fname) as f: diff --git a/vaep/models/ae.py b/vaep/models/ae.py index 29cc99dff..1c59157a2 100644 --- a/vaep/models/ae.py +++ b/vaep/models/ae.py @@ -28,8 +28,6 @@ logger = logging.getLogger(__name__) - - def get_preds_from_df(df: pd.DataFrame, learn: fastai.learner.Learner, transformer: vaep.transform.VaepPipeline, @@ -60,7 +58,7 @@ def get_preds_from_df(df: pd.DataFrame, dl = vaep.io.dataloaders.get_test_dl(df=df, transformer=transformer, dataset=dataset) - res = learn.get_preds(dl=dl) # -> dl could be int + res = learn.get_preds(dl=dl) # -> dl could be int if position_pred_tuple is not None and issubclass(type(res[0]), tuple): res = (res[0][position_pred_tuple], *res[1:]) res = L(res).map(lambda x: pd.DataFrame( @@ -69,9 +67,9 @@ def get_preds_from_df(df: pd.DataFrame, return res - leaky_relu_default = nn.LeakyReLU(.1) + class Autoencoder(nn.Module): """Autoencoder base class. 
@@ -116,8 +114,8 @@ def build_layer(in_feat, out_feat): # Encoder self.encoder = [] - for i in range(len(self.layers)-1): - in_feat, out_feat = self.layers[i:i+2] + for i in range(len(self.layers) - 1): + in_feat, out_feat = self.layers[i:i + 2] self.encoder.extend(build_layer(in_feat=in_feat, out_feat=out_feat)) self.encoder.append(nn.Linear(out_feat, dim_latent)) @@ -133,11 +131,11 @@ def build_layer(in_feat, out_feat): out_feat=out_feat) i = -1 # in case a single hidden layer is passed - for i in range(len(self.layers_decoder)-2): - in_feat, out_feat = self.layers_decoder[i:i+2] + for i in range(len(self.layers_decoder) - 2): + in_feat, out_feat = self.layers_decoder[i:i + 2] self.decoder.extend(build_layer(in_feat=in_feat, out_feat=out_feat)) - in_feat, out_feat = self.layers_decoder[i+1:i+3] + in_feat, out_feat = self.layers_decoder[i + 1:i + 3] self.decoder.append(nn.Linear(in_feat, out_feat)) if last_decoder_activation is not None: @@ -159,7 +157,7 @@ def get_missing_values(df_train_wide: pd.DataFrame, Parameters ---------- df_train_wide : pd.DataFrame - Training data in wide format. + Training data in wide format. val_idx : pd.Index Indices (MultiIndex of Sample and Feature) of validation split test_idx : pd.Index @@ -170,7 +168,7 @@ def get_missing_values(df_train_wide: pd.DataFrame, Returns ------- pd.Series - Multiindex series of missing values in training data which are not + Multiindex series of missing values in training data which are not in validiation and test split. 
""" # all idx missing in training data @@ -215,7 +213,7 @@ def get_missing_values(df_train_wide: pd.DataFrame, # assert self.layers_decoder is not self.layers # assert out_feat == self.layers_decoder[0] # self.decoder = [nn.Linear(self.dim_latent, out_feat), -# activation(), +# activation(), # nn.BatchNorm1d(out_feat)] # for i in range(len(self.layers_decoder)-1): # in_feat, out_feat = self.layers_decoder[i:i+2] @@ -278,9 +276,9 @@ def after_pred(self): class ModelAdapterFlatPred(DatasetWithTargetAdapter): - """Models forward only expects on input matrix. + """Models forward only expects on input matrix. Apply mask from dataloader to both pred and targets. - + Return only predictions and target for non NA inputs. """ @@ -302,9 +300,9 @@ def after_pred(self): class ModelAdapter(ModelAdapterFlatPred): - """Models forward only expects on input matrix. + """Models forward only expects on input matrix. Apply mask from dataloader to both pred and targets. - + Keep original dimension, i.e. also predictions for NA.""" def after_pred(self): @@ -321,7 +319,7 @@ def after_loss(self): class ModelAdapterVAEFlat(DatasetWithTargetAdapter): - """Models forward method only expects one input matrix. + """Models forward method only expects one input matrix. Apply mask from dataloader to both pred and targets.""" def before_batch(self): @@ -336,7 +334,7 @@ def after_pred(self): pred, mu, logvar = self.pred # return predictions self.learn.pred = (pred[self._mask], mu, logvar) # is this flat? elif len(self.pred) == 4: - x_mu,x_logvar, z_mu, z_logvar = self.pred + x_mu, x_logvar, z_mu, z_logvar = self.pred self.learn.pred = (x_mu[self._mask], x_logvar[self._mask], z_mu, z_logvar) # same as ModelAdapter. 
Inheritence is limiting composition here @@ -356,43 +354,39 @@ def after_loss(self): self.learn.yb = (self._all_y,) - - - class AutoEncoderAnalysis(analysis.ModelAnalysis): def __init__(self, - train_df:pd.DataFrame, - val_df:pd.DataFrame, # values to use for validation - model:torch.nn.modules.module.Module, - model_kwargs:dict, + train_df: pd.DataFrame, + val_df: pd.DataFrame, # values to use for validation + model: torch.nn.modules.module.Module, + model_kwargs: dict, transform: sklearn.pipeline.Pipeline, decode: List[str], bs=64 ): - self.transform = vaep.transform.VaepPipeline( - df_train=train_df, - encode=transform, - decode=decode) + self.transform = vaep.transform.VaepPipeline( + df_train=train_df, + encode=transform, + decode=decode) self.dls = vaep.io.dataloaders.get_dls( - train_X=train_df, - valid_X=val_df, - transformer=self.transform, bs=bs) + train_X=train_df, + valid_X=val_df, + transformer=self.transform, bs=bs) # M = data.train_X.shape[-1] self.kwargs_model = model_kwargs self.params = dict(self.kwargs_model) self.model = model(**self.kwargs_model) - + self.n_params_ae = vaep.models.calc_net_weight_count(self.model) self.params['n_parameters'] = self.n_params_ae self.learn = None - - def get_preds_from_df(self, df_wide:pd.DataFrame) -> pd.DataFrame: - if self.learn is None: raise ValueError("Assign Learner first as learn attribute.") - return get_preds_from_df(df=df_wide, learn=self.learn, transformer=self.transform) - - def get_test_dl(self, df_wide:pd.DataFrame, bs:int=64) -> pd.DataFrame: - return vaep.io.dataloaders.get_test_dl(df=df_wide, transformer=self.transform, bs=bs) + def get_preds_from_df(self, df_wide: pd.DataFrame) -> pd.DataFrame: + if self.learn is None: + raise ValueError("Assign Learner first as learn attribute.") + return get_preds_from_df(df=df_wide, learn=self.learn, transformer=self.transform) + def get_test_dl(self, df_wide: pd.DataFrame, bs: int = 64) -> pd.DataFrame: + return vaep.io.dataloaders.get_test_dl(df=df_wide, 
transformer=self.transform, bs=bs) diff --git a/vaep/models/analysis.py b/vaep/models/analysis.py index d0d6a1dbf..93d8a2aaa 100644 --- a/vaep/models/analysis.py +++ b/vaep/models/analysis.py @@ -5,11 +5,12 @@ from vaep.analyzers import Analysis + class ModelAnalysis(Analysis): """Class describing what an ModelAnalysis is supposed to have as attributes.""" model: torch.nn.Module dls: fastai.data.core.DataLoaders - learn: fastai.learner.Learner + learn: fastai.learner.Learner params: dict - transform: vaep.transform.VaepPipeline \ No newline at end of file + transform: vaep.transform.VaepPipeline diff --git a/vaep/models/cmd.py b/vaep/models/cmd.py index c0893401a..c31003c61 100644 --- a/vaep/models/cmd.py +++ b/vaep/models/cmd.py @@ -30,16 +30,16 @@ def create_argparser(): BATCH_SIZE = 16 EPOCHS = 600 + def get_args(batch_size=BATCH_SIZE, epochs=EPOCHS, log_interval=10, no_cuda=False): """Helper function to create arg.""" - args = ['--batch-size', str(batch_size), - '--seed', '43', - '--epochs', str(epochs), + args = ['--batch-size', str(batch_size), + '--seed', '43', + '--epochs', str(epochs), '--log-interval', str(log_interval)] if no_cuda: args.append('--no-cuda') args = parser.parse_args(args) args.cuda = torch.cuda.is_available() and not args.no_cuda return args - diff --git a/vaep/models/collab.py b/vaep/models/collab.py index 3d045a26c..ff9f461a1 100644 --- a/vaep/models/collab.py +++ b/vaep/models/collab.py @@ -38,7 +38,7 @@ def forward(self, x): def combine_data(train_df: pd.DataFrame, val_df: pd.DataFrame) -> Tuple[pd.DataFrame, float]: - """Helper function to combine training and validation data in long-format. The + """Helper function to combine training and validation data in long-format. The training and validation data will be mixed up in CF training as the sample embeddings have to be trained for all samples. The returned frac can be used to have the same number of (non-missing) validation samples as before. 
@@ -56,8 +56,8 @@ def combine_data(train_df: pd.DataFrame, val_df: pd.DataFrame) -> Tuple[pd.DataF Pandas DataFrame of concatenated samples of training and validation data. Fraction of samples originally in validation data. """ - X = train_df.append(val_df).reset_index() - frac = len(val_df) / (len(train_df)+len(val_df)) + X = pd.concat([train_df, val_df]).reset_index() + frac = len(val_df) / (len(train_df) + len(val_df)) return X, frac @@ -100,7 +100,7 @@ def collab_dot_product(sample_embeddings: torch.tensor, sample_bias: torch.tenso res = res.detach() if y_range is None: return res - return torch.sigmoid(res) * (y_range[1]-y_range[0]) + y_range[0] + return torch.sigmoid(res) * (y_range[1] - y_range[0]) + y_range[0] def collab_prediction(idx_samples: torch.tensor, @@ -112,20 +112,20 @@ def collab_prediction(idx_samples: torch.tensor, Parameters ---------- idx_samples : torch.tensor - An array containing the neighreast neighbors in the training data for + An array containing the neighreast neighbors in the training data for set of list of test samples. Normallay obtained from a sklearn KNN search. learn : fastai.learner.Learner The learner used for collab training index_samples : pd.Index, optional The pandas.Index for the training samples. If no index_samples is provided, the samples will just be numbered, - by default None + by default None Returns ------- pd.DataFrame predictions as DataFrame for all features encoded by the model for all samples. 
- + """ # Matrix multiplication way test_sample_embeddings = learn.u_weight( @@ -141,7 +141,7 @@ def collab_prediction(idx_samples: torch.tensor, res = res + feat_biases.T + test_sample_biases if learn.y_range is not None: - res = torch.sigmoid(res) * (learn.y_range[1]-learn.y_range[0] + res = torch.sigmoid(res) * (learn.y_range[1] - learn.y_range[0] ) + learn.y_range[0] res = pd.DataFrame(res, @@ -162,7 +162,7 @@ def __init__(self, batch_size=64): if datasplits.val_y is not None: self.X, self.frac = combine_data(datasplits.train_X, - datasplits.val_y) + datasplits.val_y) else: self.X, self.frac = datasplits.train_X.reset_index(), 0.0 self.batch_size = batch_size @@ -172,16 +172,23 @@ def __init__(self, item_name=item_column, rating_name=target_column, bs=self.batch_size) - user_name=sample_column - item_name=item_column - rating_name=target_column - cat_names = [user_name,item_name] + user_name = sample_column + item_name = item_column + rating_name = target_column + cat_names = [user_name, item_name] ratings = self.X splits = None if datasplits.val_y is not None: - idx_splitter = IndexSplitter(list(range(len(datasplits.train_X), len(datasplits.train_X)+ len(datasplits.val_y) ))) + idx_splitter = IndexSplitter( + list(range(len(datasplits.train_X), len(datasplits.train_X) + len(datasplits.val_y)))) splits = idx_splitter(self.X) - to = TabularCollab(ratings, [Categorify], cat_names, y_names=[rating_name], y_block=TransformBlock(), splits=splits) + to = TabularCollab( + ratings, + [Categorify], + cat_names, + y_names=[rating_name], + y_block=TransformBlock(), + splits=splits) self.dls = to.dataloaders(path='.', bs=self.batch_size) self.params = {} self.model_kwargs = model_kwargs diff --git a/vaep/models/collect_dumps.py b/vaep/models/collect_dumps.py index 1854bd8f2..d0359ac22 100644 --- a/vaep/models/collect_dumps.py +++ b/vaep/models/collect_dumps.py @@ -55,4 +55,4 @@ def collect(paths: Iterable, collect_configs = partial(collect, load_fn=load_config_file, ) 
-collect_configs = update_wrapper(collect_configs, collect) \ No newline at end of file +collect_configs = update_wrapper(collect_configs, collect) diff --git a/vaep/models/vae.py b/vaep/models/vae.py index e7e2e8401..6395773e7 100644 --- a/vaep/models/vae.py +++ b/vaep/models/vae.py @@ -15,6 +15,7 @@ leaky_relu_default = nn.LeakyReLU(.1) + class VAE(nn.Module): def __init__(self, n_features: int, @@ -39,11 +40,11 @@ def build_layer(in_feat, out_feat): # Encoder self.encoder = [] - for i in range(len(self.layers)-1): - in_feat, out_feat = self.layers[i:i+2] + for i in range(len(self.layers) - 1): + in_feat, out_feat = self.layers[i:i + 2] self.encoder.extend(build_layer(in_feat=in_feat, out_feat=out_feat)) - self.encoder.append(nn.Linear(out_feat, dim_latent*2)) + self.encoder.append(nn.Linear(out_feat, dim_latent * 2)) self.encoder = nn.Sequential(*self.encoder) @@ -56,13 +57,13 @@ def build_layer(in_feat, out_feat): out_feat=out_feat) i = -1 # in case a single hidden layer is passed - for i in range(len(self.layers_decoder)-2): - in_feat, out_feat = self.layers_decoder[i:i+2] + for i in range(len(self.layers_decoder) - 2): + in_feat, out_feat = self.layers_decoder[i:i + 2] self.decoder.extend(build_layer(in_feat=in_feat, out_feat=out_feat)) - in_feat, out_feat = self.layers_decoder[i+1:i+3] + in_feat, out_feat = self.layers_decoder[i + 1:i + 3] - self.decoder.append(nn.Linear(in_feat, out_feat*2)) + self.decoder.append(nn.Linear(in_feat, out_feat * 2)) if last_decoder_activation is not None: self.append(last_decoder_activation) @@ -84,7 +85,7 @@ def decode(self, z): return x_mu, x_logvar def reparameterize(self, mu, logvar): - std = torch.exp(0.5*logvar) + std = torch.exp(0.5 * logvar) return mu + torch.randn_like(std) * std def forward(self, x): @@ -95,11 +96,11 @@ def forward(self, x): def compute_kld(z_mu, z_logvar): - return 0.5*(z_mu**2 + torch.exp(z_logvar) - 1 - z_logvar) + return 0.5 * (z_mu**2 + torch.exp(z_logvar) - 1 - z_logvar) def 
gaussian_log_prob(z, mu, logvar): - return -0.5*(math.log(2*math.pi) + logvar + (z-mu)**2/torch.exp(logvar)) + return -0.5 * (math.log(2 * math.pi) + logvar + (z - mu)**2 / torch.exp(logvar)) def loss_fct(pred, y, reduction='sum', results: List = None, freebits=0.1): @@ -108,7 +109,7 @@ def loss_fct(pred, y, reduction='sum', results: List = None, freebits=0.1): l_rec = -torch.sum(gaussian_log_prob(batch, x_mu, x_logvar)) l_reg = torch.sum(F.relu(compute_kld(z_mu, z_logvar) - - freebits*math.log(2))+freebits*math.log(2), 1) + freebits * math.log(2)) + freebits * math.log(2), 1) if results is not None: results.append((l_rec.item(), torch.mean(l_reg).item())) diff --git a/vaep/nb.py b/vaep/nb.py index a8a4246e7..0d13104b7 100644 --- a/vaep/nb.py +++ b/vaep/nb.py @@ -9,14 +9,15 @@ class Config(): - """Config class with a setter enforcing that config entries cannot + """Config class with a setter enforcing that config entries cannot be overwritten. Can contain configs, which are itself configs: keys, paths, - + """ + def __setattr__(self, entry, value): """Set if attribute not in instance.""" if hasattr(self, entry) and getattr(self, entry) != value: @@ -45,7 +46,7 @@ def dump(self, fname=None): logger.info(f"Dumped config to: {fname}") @classmethod - def from_dict(cls, d:dict): + def from_dict(cls, d: dict): cfg = cls() for k, v in d.items(): setattr(cfg, k, v) @@ -57,17 +58,18 @@ def update_from_dict(self, params: dict): setattr(self, k, v) except AttributeError: logger.info(f"Already set attribute: {k} has value {v}") - + def keys(self): return vars(self).keys() def items(self): return vars(self).items() - + def values(self): return vars(self).values() -def get_params(args:dict.keys, globals, remove=True) -> dict: + +def get_params(args: dict.keys, globals, remove=True) -> dict: params = {k: v for k, v in globals.items() if k not in args and k[0] != '_'} if not remove: return params diff --git a/vaep/pandas/__init__.py b/vaep/pandas/__init__.py index 
bdbfa97f3..5557cbd58 100644 --- a/vaep/pandas/__init__.py +++ b/vaep/pandas/__init__.py @@ -1,9 +1,9 @@ import collections.abc from collections import namedtuple -import numbers + from types import SimpleNamespace -import typing + from typing import Iterable import numpy as np @@ -12,6 +12,7 @@ from .calc_errors import calc_errors_per_feat, get_absolute_error + def combine_value_counts(X: pd.DataFrame, dropna=True) -> pd.DataFrame: """Pass a selection of columns to combine it's value counts. @@ -39,20 +40,6 @@ def combine_value_counts(X: pd.DataFrame, dropna=True) -> pd.DataFrame: return freq_targets -def counts_with_proportion(s: pd.Series) -> pd.DataFrame: - """Counts with proportion of counts(!). - - Note: In case of missing values the proportion is not based on the total number of - rows in the DataFrame. - """ - s = s.value_counts() - s.index.name = 'value' - N = s.sum() - ret = s.to_frame('counts') - ret['prop.'] = s / N - return ret - - def unique_cols(s: pd.Series) -> bool: """Check all entries are equal in pandas.Series @@ -71,13 +58,6 @@ def unique_cols(s: pd.Series) -> bool: return (s.iloc[0] == s).all() -def show_columns_with_variation(df: pd.DataFrame) -> pd.DataFrame: - df_describe = df.describe(include='all', datetime_is_numeric=True) - col_mask = (df_describe.loc['unique'] > 1) | ( - df_describe.loc['std'] > 0.01) - return df.loc[:, col_mask] - - def get_unique_non_unique_columns(df: pd.DataFrame) -> SimpleNamespace: """Get back a namespace with an column.Index both of the unique and non-unique columns. 
@@ -111,11 +91,13 @@ def replace_with(string_key: str, replace: str = "()/", replace_with: str = '') string_key = string_key.replace(symbol, replace_with) return string_key -def index_to_dict(index:pd.Index) -> dict: + +def index_to_dict(index: pd.Index) -> dict: cols = {replace_with(col.replace(' ', '_').replace( '-', '_')): col for col in index} return cols + def get_columns_accessor(df: pd.DataFrame, all_lower_case=False) -> omegaconf.OmegaConf: if isinstance(df.columns, pd.MultiIndex): raise ValueError("MultiIndex not supported.") @@ -132,6 +114,7 @@ def get_columns_accessor_from_iterable(cols: Iterable[str], cols = {k.lower(): v for k, v in cols.items()} return omegaconf.OmegaConf.create(cols) + def select_max_by(df: pd.DataFrame, grouping_columns: list, selection_column: str) -> pd.DataFrame: df = df.sort_values(by=[*grouping_columns, selection_column], ascending=False) df = df.drop_duplicates(subset=grouping_columns, @@ -189,7 +172,7 @@ def _add_indices(array: np.array, original_df: pd.DataFrame, def interpolate(wide_df: pd.DataFrame, name='interpolated') -> pd.DataFrame: """Interpolate NA values with the values before and after. Uses n=3 replicates. - First rows replicates are the two following. + First rows replicates are the two following. Last rows replicates are the two preceding. Parameters @@ -219,35 +202,11 @@ def interpolate(wide_df: pd.DataFrame, name='interpolated') -> pd.DataFrame: ret.iloc[0] = first_row ret.iloc[-1] = last_row - ret = ret[mask].stack().dropna().squeeze() # does not work with MultiIndex columns + ret = ret[mask].stack().dropna().squeeze() # does not work with MultiIndex columns ret.rename(name, inplace=True) return ret -def create_dict_of_dicts(d: dict, verbose=False, - # maybe this should not be here... - transform_values: typing.Union[typing.Callable, numbers.Number] = None): - """Unpack a dictionary with tuple keys to a nested dictonary - of single tuple keys. 
- """ - ret = dict() - for keys, v in d.items(): - if verbose: - print(f"current key: {str(keys):90}: {len(v):>5}") - current_dict = ret - for k in keys[:-1]: - if not k in current_dict: - current_dict[k] = dict() - current_dict = current_dict[k] - last_key = keys[-1] - if last_key not in current_dict: - current_dict[last_key] = transform_values( - v) if transform_values else v - else: - raise KeyError(f"Key already in dict: {last_key}") - return ret - - def flatten_dict_of_dicts(d: dict, parent_key: str = '') -> dict: """Build tuples for nested dictionaries for use as `pandas.MultiIndex`. @@ -319,13 +278,13 @@ def length(x): Otherwise return length of list, pandas.Series, numpy.array, dict, etc.""" try: return len(x) - except: + except BaseException: return 0 def get_last_index_matching_proportion(df_counts: pd.DataFrame, - prop:float=0.25, - prop_col:str='proportion') -> object: + prop: float = 0.25, + prop_col: str = 'proportion') -> object: """df_counts needs to be sorted by "prop_col" (descending). 
Parameters @@ -349,8 +308,8 @@ def get_last_index_matching_proportion(df_counts: pd.DataFrame, return idx_cutoff -def get_lower_whiskers(df:pd.DataFrame, factor:float=1.5) -> pd.Series: +def get_lower_whiskers(df: pd.DataFrame, factor: float = 1.5) -> pd.Series: ret = df.describe() iqr = ret.loc['75%'] - ret.loc['25%'] - ret = ret.loc['25%'] - iqr*factor - return ret \ No newline at end of file + ret = ret.loc['25%'] - iqr * factor + return ret diff --git a/vaep/pandas/missing_data.py b/vaep/pandas/missing_data.py index b810895e5..7bd62e0ae 100644 --- a/vaep/pandas/missing_data.py +++ b/vaep/pandas/missing_data.py @@ -4,12 +4,15 @@ import pandas as pd + def percent_missing(df: pd.DataFrame) -> float: return df.isna().sum().sum() / math.prod(df.shape) + def percent_non_missing(df: pd.DataFrame) -> float: return df.notna().sum().sum() / math.prod(df.shape) + def list_files(folder='.') -> list[str]: return [f.as_posix() for f in Path(folder).iterdir()] @@ -29,4 +32,3 @@ def get_record(data: pd.DataFrame, columns_sample=False) -> dict: N_mis=int(N_mis), missing=float(missing), ) return record - diff --git a/vaep/plotting/__init__.py b/vaep/plotting/__init__.py index ebbb8def2..38e39c8d4 100644 --- a/vaep/plotting/__init__.py +++ b/vaep/plotting/__init__.py @@ -1,4 +1,6 @@ from __future__ import annotations + +from functools import partial import numpy as np import pandas as pd import matplotlib @@ -9,22 +11,22 @@ import vaep.pandas +from .errors import plot_rolling_error +from . import errors +from . import data +from . import plotly +from . defaults import order_categories, labels_dict, IDX_ORDER + seaborn.set_style("whitegrid") # seaborn.set_theme() -plt.rcParams['figure.figsize'] = [16.0, 7.0] # [4, 2], [4, 3] +plt.rcParams['figure.figsize'] = [16.0, 7.0] # [4, 2], [4, 3] plt.rcParams['pdf.fonttype'] = 42 plt.rcParams['ps.fonttype'] = 42 plt.rcParams['figure.dpi'] = 147 -from . defaults import order_categories, labels_dict, IDX_ORDER -from . import plotly -from . 
import data -from . import errors -from .errors import plot_rolling_error - logger = logging.getLogger(__name__) __all__ = ['plotly', @@ -40,6 +42,7 @@ 'plot_cutoffs', ] + def _savefig(fig, name, folder: pathlib.Path = '.', pdf=True, dpi=300 # default 'figure' @@ -107,9 +110,9 @@ def select_dates(date_series: pd.Series, max_ticks=30) -> np.array: def make_large_descriptors(size='xx-large'): - """Helper function to have very large titles, labes and tick texts for + """Helper function to have very large titles, labes and tick texts for matplotlib plots per default. - + size: str fontsize or allowed category. Change default if necessary, default 'xx-large' """ @@ -141,42 +144,56 @@ def add_prop_as_second_yaxis(ax: matplotlib.axes.Axes, n_samples: int, ax2 = ax.twinx() n_min, n_max = np.round(ax.get_ybound()) logger.info(f"{n_min = }, {n_max = }") - lower_prop = n_min/n_samples + (ax.get_ybound()[0] - n_min) / n_samples - upper_prop = n_max/n_samples + (ax.get_ybound()[1] - n_max) / n_samples + lower_prop = n_min / n_samples + (ax.get_ybound()[0] - n_min) / n_samples + upper_prop = n_max / n_samples + (ax.get_ybound()[1] - n_max) / n_samples logger.info(f'{lower_prop = }, {upper_prop = }') ax2.set_ybound(lower_prop, upper_prop) # _ = ax2.set_yticks(np.linspace(n_min/n_samples, # n_max /n_samples, len(ax.get_yticks())-2)) - _ = ax2.set_yticks(ax.get_yticks()[1:-1]/n_samples) + _ = ax2.set_yticks(ax.get_yticks()[1:-1] / n_samples) ax2.yaxis.set_major_formatter( matplotlib.ticker.StrMethodFormatter(format_str)) return ax2 -def add_height_to_barplot(ax, size=5): +def add_height_to_barplot(ax, size=5, rotated=False): + ax.annotate = partial(ax.annotate, text='NA', + xytext=(0, int(size / 2)), + ha='center', + size=size, + textcoords='offset points') + ax.annotate = partial(ax.annotate, + rotation=0, + va='center') + if rotated: + ax.annotate = partial(ax.annotate, + xytext=(1, int(size / 3)), + rotation=90, + va='bottom') for bar in ax.patches: if not bar.get_height(): + xy 
= (bar.get_x() + bar.get_width() / 2, + 0.0) + ax.annotate(text='NA', + xy=xy, + ) continue ax.annotate(text=format(bar.get_height(), '.2f'), xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()), - xytext=(0, int(size/2)), - ha='center', - va='center', - size=size, - textcoords='offset points') + ) return ax def add_text_to_barplot(ax, text, size=5): - for bar, text in zip(ax.patches, text): + for bar, text_ in zip(ax.patches, text): logger.debug(f"{bar = }, f{text = }, {bar.get_height() = }") if not bar.get_height(): continue - ax.annotate(text=text, + ax.annotate(text=text_, xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()), - xytext=(0, -5), + xytext=(1, -5), rotation=90, ha='center', va='top', @@ -208,7 +225,7 @@ def format_large_numbers(ax: matplotlib.axes.Axes, return ax -def plot_feat_counts(df_counts:pd.DataFrame, feat_name:str, n_samples:int, +def plot_feat_counts(df_counts: pd.DataFrame, feat_name: str, n_samples: int, ax=None, figsize=(15, 10), count_col='counts', **kwargs): @@ -218,7 +235,7 @@ def plot_feat_counts(df_counts:pd.DataFrame, feat_name:str, n_samples:int, title=f'Count and proportion of {len(df_counts):,d} {feat_name}s over {n_samples:,d} samples', ) args.update(kwargs) - + ax = df_counts[count_col].plot( figsize=figsize, @@ -236,8 +253,8 @@ def plot_feat_counts(df_counts:pd.DataFrame, feat_name:str, n_samples:int, def plot_counts(df_counts: pd.DataFrame, n_samples, - feat_col_name:str='count', - feature_name=None, + feat_col_name: str = 'count', + feature_name=None, ax=None, prop_feat=0.25, min_feat_prop=.01, **kwargs): """Plot counts based on get_df_counts.""" @@ -251,7 +268,7 @@ def plot_counts(df_counts: pd.DataFrame, n_samples, ax=ax, **kwargs) df_counts['prop'] = df_counts[feat_col_name] / n_samples n_feat_cutoff = vaep.pandas.get_last_index_matching_proportion( - df_counts=df_counts, prop=prop_feat, prop_col='prop') + df_counts=df_counts, prop=prop_feat, prop_col='prop') n_samples_cutoff = df_counts.loc[n_feat_cutoff, 
feat_col_name] logger.info(f'{n_feat_cutoff = }, {n_samples_cutoff = }') x_lim_max = vaep.pandas.get_last_index_matching_proportion( @@ -307,5 +324,3 @@ def plot_cutoffs(df: pd.DataFrame, if min_feat_in_sample is not None: ax.axhline(min_feat_in_sample) return fig, axes - - diff --git a/vaep/plotting/data.py b/vaep/plotting/data.py index 1fbd31d7a..f50c63606 100644 --- a/vaep/plotting/data.py +++ b/vaep/plotting/data.py @@ -1,4 +1,4 @@ -"""Plot data distribution based on pandas DataFrames or Series.""" +"""Plot data distribution based on pandas `DataFrames` or `Series`.""" from typing import Tuple, Iterable import matplotlib @@ -9,7 +9,19 @@ def min_max(s: pd.Series) -> Tuple[int]: - min_bin, max_bin = (int(s.min()), (int(s.max())+1)) + """Get the min and max as integer from a pandas.Series. + + Parameters + ---------- + s : pd.Series + Series of intensities. + + Returns + ------- + Tuple[int] + _description_ + """ + min_bin, max_bin = (int(s.min()), (int(s.max()) + 1)) return min_bin, max_bin @@ -27,10 +39,10 @@ def get_min_max_iterable(series: Iterable[pd.Series]) -> Tuple[int]: def plot_histogram_intensities(s: pd.Series, - interval_bins=1, - min_max=(15, 40), - ax=None, - **kwargs) -> Tuple[Axes, range]: + interval_bins=1, + min_max=(15, 40), + ax=None, + **kwargs) -> Tuple[Axes, range]: """Plot intensities in Series in a certain range and equally spaced intervals.""" min_bin, max_bin = min_max bins = range(min_bin, max_bin, interval_bins) @@ -90,8 +102,24 @@ def plot_observations(df: pd.DataFrame, def plot_missing_dist_highdim(data: pd.DataFrame, - min_feat_per_sample=None, - min_samples_per_feat=None) -> matplotlib.figure.Figure: + min_feat_per_sample: int = None, + min_samples_per_feat: int = None) -> matplotlib.figure.Figure: + """Plot missing distribution (cdf) in high dimensional data. + + Parameters + ---------- + data : pd.DataFrame + Intensity table with samples in rows and features in columns. 
+ min_feat_per_sample : int, optional + Show the minimum required features a sample has to have, by default None + min_samples_per_feat : int, optional + Show the minimum required number of samples a feature has to be found in, by default None + + Returns + ------- + matplotlib.figure.Figure + Figure with two plots (Axes). + """ fig, axes = plt.subplots(1, 2, figsize=(4, 2)) not_na = data.notna() name = 'features per sample' @@ -251,8 +279,8 @@ def plot_feat_median_over_prop_missing(data: pd.DataFrame, missing_by_median['bins'] = pd.cut( missing_by_median['median feat value'], bins=bins) missing_by_median['median feat value (floor)'] = (missing_by_median['median feat value'] - .astype(int) - ) + .astype(int) + ) _counts = (missing_by_median .groupby('median feat value (floor)')['median feat value'] .count() diff --git a/vaep/plotting/defaults.py b/vaep/plotting/defaults.py index db00b29b9..f4a470abe 100644 --- a/vaep/plotting/defaults.py +++ b/vaep/plotting/defaults.py @@ -1,4 +1,5 @@ import logging +import matplotlib as mpl import seaborn as sns logger = logging.getLogger(__name__) @@ -6,7 +7,6 @@ # ! 
default seaborn color map only has 10 colors # https://seaborn.pydata.org/tutorial/color_palettes.html # sns.color_palette("husl", N) to get N distinct colors -# color_model_mapping = { 'KNN': sns.color_palette()[0], 'KNN_IMPUTE': sns.color_palette()[1], @@ -18,10 +18,22 @@ 'None': sns.color_palette()[7], 'BPCA': sns.color_palette()[8], 'MICE-CART': sns.color_palette()[9], - 'SEQKNN': sns.color_palette()[6], - 'MICE-NORM': sns.color_palette()[1], + } -other_colors = sns.color_palette()[8:] +# other_colors = sns.color_palette()[8:] +other_colors = sns.color_palette("husl", 20) +color_model_mapping['IMPSEQ'] = other_colors[0] +color_model_mapping['QRILC'] = other_colors[1] +color_model_mapping['IMPSEQROB'] = other_colors[1] +color_model_mapping['MICE-NORM'] = other_colors[2] +color_model_mapping['SEQKNN'] = other_colors[3] +color_model_mapping['IMPSEQROB'] = other_colors[4] +color_model_mapping['GSIMP'] = other_colors[5] +color_model_mapping['MSIMPUTE'] = other_colors[6] +color_model_mapping['MSIMPUTE_MNAR'] = other_colors[7] +color_model_mapping['TRKNN'] = other_colors[8] +color_model_mapping['SVDMETHOD'] = other_colors[9] +other_colors = other_colors[10:] def assign_colors(models): @@ -39,6 +51,32 @@ def assign_colors(models): return ret_colors +class ModelColorVisualizer: + + def __init__(self, models, palette): + self.models = models + self.palette = map(mpl.colors.colorConverter.to_rgb, palette) + + def as_hex(self): + """Return a color palette with hex codes instead of RGB values.""" + hex = [mpl.colors.rgb2hex(rgb) for rgb in self.palette] + return hex + + def _repr_html_(self): + """Rich display of the color palette in an HTML frontend.""" + s = 55 + n = len(self.models) + html = f'' + for i, (m, c) in enumerate(zip(self.models, self.as_hex())): + html += ( + f'' + ) + html += f'{m}' + html += '' + return html + + labels_dict = {"NA not interpolated valid_collab collab MSE": 'MSE', 'batch_size': 'bs', 'n_hidden_layers': "No. 
of hidden layers", diff --git a/vaep/plotting/errors.py b/vaep/plotting/errors.py index 30d53cb0f..5326b9d86 100644 --- a/vaep/plotting/errors.py +++ b/vaep/plotting/errors.py @@ -24,7 +24,7 @@ def plot_errors_binned(pred: pd.DataFrame, target_col='observed', len_max_bin = len(str(int(errors_binned['bin'].max()))) n_obs = (errors_binned[meta_cols] .apply( - lambda x: f"{x.bin:0{len_max_bin}} (N={x.n_obs:,d})", axis=1 + lambda x: f"{x.bin:0{len_max_bin}}\n(N={x.n_obs:,d})", axis=1 ) .rename('intensity bin') .astype('category') @@ -43,7 +43,7 @@ def plot_errors_binned(pred: pd.DataFrame, target_col='observed', x='intensity bin', y=metric_name, hue='model', palette=palette, errwidth=errwidth,) - ax.xaxis.set_tick_params(rotation=-90) + ax.xaxis.set_tick_params(rotation=90) return ax, errors_binned @@ -52,6 +52,7 @@ def plot_errors_by_median(pred: pd.DataFrame, target_col='observed', ax: Axes = None, palette: dict = None, + feat_name: str = None, metric_name: Optional[str] = None, errwidth: float = 1.2) -> tuple[Axes, pd.DataFrame]: # calculate absolute errors @@ -74,16 +75,17 @@ def plot_errors_by_median(pred: pd.DataFrame, errors = errors.join(n_obs, on="bin") - feat_name = feat_medians.index.name - if not feat_name: - feat_name = 'feature' + if feat_name is None: + feat_name = feat_medians.index.name + if not feat_name: + feat_name = 'feature' x_axis_name = f'intensity binned by median of {feat_name}' len_max_bin = len(str(int(errors['bin'].max()))) errors[x_axis_name] = ( errors[['bin', 'n_obs']] .apply( - lambda x: f"{x.bin:0{len_max_bin}} (N={x.n_obs:,d})", axis=1 + lambda x: f"{x.bin:0{len_max_bin}}\n(N={x.n_obs:,d})", axis=1 ) .rename('intensity bin') .astype('category') @@ -98,7 +100,7 @@ def plot_errors_by_median(pred: pd.DataFrame, hue='model', palette=palette, errwidth=errwidth,) - ax.xaxis.set_tick_params(rotation=-90) + ax.xaxis.set_tick_params(rotation=90) return ax, errors diff --git a/vaep/sampling.py b/vaep/sampling.py index a5c5f5f29..9716c14f1 
100644 --- a/vaep/sampling.py +++ b/vaep/sampling.py @@ -16,11 +16,11 @@ def feature_frequency(df_wide: pd.DataFrame, measure_name: str = 'freq') -> pd.S Returns ------- pd.Series - Frequency on non-missing entries per feature (column). + Frequency on non-missing entries per feature (column). """ # if hasattr(df_wide.columns, "levels"): # is columns.names always set? # is listed as attribute: https://pandas.pydata.org/docs/reference/api/pandas.Index.html - _df_feat = df_wide.stack(df_wide.columns.names) # ensure that columns are named + _df_feat = df_wide.stack(df_wide.columns.names) # ensure that columns are named _df_feat = _df_feat.to_frame(measure_name) # implicit as stack puts column index in the last position (here: 1) @@ -61,8 +61,8 @@ def sample_data(series: pd.Series, sample_index_to_drop: Union[str, int], Parameters ---------- series : pd.Series - Long-format data in pd.Series. Index name is feature name. 2 dimensional - MultiIndex. + Long-format data in pd.Series. Index name is feature name. 2 dimensional + MultiIndex. sample_index_to_drop : Union[str, int] Sample index (as str or integer Index position). Unit to group by (i.e. Samples) frac : float, optional diff --git a/vaep/stats/__init__.py b/vaep/stats/__init__.py index 37992dbc5..3b7233754 100644 --- a/vaep/stats/__init__.py +++ b/vaep/stats/__init__.py @@ -1 +1 @@ -from . import diff_analysis \ No newline at end of file +from . 
import diff_analysis diff --git a/vaep/stats/diff_analysis.py b/vaep/stats/diff_analysis.py index d5ca991d2..13f411693 100644 --- a/vaep/stats/diff_analysis.py +++ b/vaep/stats/diff_analysis.py @@ -11,11 +11,11 @@ def ancova_pg(df_long: pd.DataFrame, feat_col: str, dv: str, between: str, - covar: list[str]|str, + covar: list[str] | str, fdr=0.05) -> pd.DataFrame: """ Analysis of covariance (ANCOVA) using pg.ancova https://pingouin-stats.org/generated/pingouin.ancova.html - + Adds multiple hypothesis testing correction by Benjamini-Hochberg (qvalue, rejected) @@ -64,7 +64,7 @@ def ancova_pg(df_long: pd.DataFrame, scores['-Log10 pvalue'] = -np.log10(scores['p-unc']) scores = scores[scores.Source != 'Residual'] - #FDR correction + # FDR correction scores = add_fdr_scores(scores, random_seed=123) return scores @@ -83,7 +83,7 @@ def analyze(df_proteomics: pd.DataFrame, df_clinic: pd.DataFrame, target: str, covar: list[str], - value_name: str='intensity') -> pd.DataFrame: + value_name: str = 'intensity') -> pd.DataFrame: """apply ancova and multiple test correction. 
Parameters diff --git a/vaep/tests/io/test_data_objects.py b/vaep/tests/io/test_data_objects.py index b36acf491..78ffa67cf 100644 --- a/vaep/tests/io/test_data_objects.py +++ b/vaep/tests/io/test_data_objects.py @@ -24,7 +24,7 @@ expected = """ Sequence,Charge,m/z,Protein group IDs,Intensity,Score YYYIPQYK,2,569.2844,3745,147680000.0,83.801 -YYVTIIDAPGHR,3,468.91386,2873,8630000000.0,131.83 +YYVTIIDAPGHR,3,468.91386,2873,8630000000.0,131.83 YYVTIIDAPGHR,2,702.867151,2873,2458400000.0,70.028 YYVLNALK,2,492.28166,3521,147430000.0,58.687 """ diff --git a/vaep/tests/io/test_dataloaders.py b/vaep/tests/io/test_dataloaders.py index 67206bd8b..a801beeba 100644 --- a/vaep/tests/io/test_dataloaders.py +++ b/vaep/tests/io/test_dataloaders.py @@ -10,7 +10,7 @@ def test_get_dls(): N, M = 23, 11 X_train = create_random_df(N, M) - N_valid = int(N*0.3) + N_valid = int(N * 0.3) X_valid = create_random_df( N_valid, M, prop_na=.1, start_idx=len(X_train)) @@ -18,13 +18,11 @@ def test_get_dls(): [('normalize', StandardScaler()), ('impute', SimpleImputer(add_indicator=False))]) transforms = VaepPipeline(df_train=X_train, - encode=dae_default_pipeline, - decode=['normalize']) + encode=dae_default_pipeline, + decode=['normalize']) BS = 4 dls = get_dls(train_X=X_train, valid_X=X_valid, transformer=transforms, bs=BS) assert len(dls.train_ds) == N assert len(dls.valid_ds) == N batch = dls.one_batch() assert batch[0].shape == (BS, M) - - diff --git a/vaep/tests/io/test_dataset.py b/vaep/tests/io/test_dataset.py index ae9da5a40..7fc12d431 100644 --- a/vaep/tests/io/test_dataset.py +++ b/vaep/tests/io/test_dataset.py @@ -6,7 +6,8 @@ from vaep.io.datasets import DatasetWithMaskAndNoTarget -def test_DatasetWithMaskAndNoTarget(): + +def test_DatasetWithMaskAndNoTarget(): with pytest.raises(ValueError): DatasetWithMaskAndNoTarget(df=np.random.rand(10, 5)) @@ -14,4 +15,4 @@ def test_DatasetWithMaskAndNoTarget(): data = helpers.create_DataFrame() ds = DatasetWithMaskAndNoTarget(df=data) assert 
all(ds[-1][1] == torch.tensor([95, 96, 97, 98, 99], dtype=torch.int32)) - assert all(ds[-1][0] == torch.tensor([False, False, False, False, False])) \ No newline at end of file + assert all(ds[-1][0] == torch.tensor([False, False, False, False, False])) diff --git a/vaep/tests/io/test_datasplits.py b/vaep/tests/io/test_datasplits.py index acae6f5c7..d9a4ebde0 100644 --- a/vaep/tests/io/test_datasplits.py +++ b/vaep/tests/io/test_datasplits.py @@ -8,14 +8,14 @@ X = np.random.rand(N, M) df = (pd.DataFrame(X, - index=[f'sample_{i}' for i in range(N)], - columns=(f'feat_{i}' for i in range(M))) - .rename_axis('Sample ID') - .rename_axis('Feature Name', axis=1)) + index=[f'sample_{i}' for i in range(N)], + columns=(f'feat_{i}' for i in range(M))) + .rename_axis('Sample ID') + .rename_axis('Feature Name', axis=1)) -_splits = {'train_X': df.iloc[:int(N*0.6)], - 'val_y': df.iloc[int(N*0.6):int(N*0.8)], - 'test_y': df.iloc[int(N*0.8):]} +_splits = {'train_X': df.iloc[:int(N * 0.6)], + 'val_y': df.iloc[int(N * 0.6):int(N * 0.8)], + 'test_y': df.iloc[int(N * 0.8):]} def test_DataSplits_iter(): @@ -54,11 +54,12 @@ def test_dump_load(tmp_path): splits = DataSplits(is_wide_format=None) splits.load(folder=tmp_path, use_wide_format=True) assert splits.train_X is not _splits['train_X'] - + npt.assert_almost_equal(_splits['train_X'].values, splits.train_X) # #ToDo: Index and Column names are not yet correctly set # assert splits.train_X.equals(_splits['train_X']) + def test_to_long_format(tmp_path): splits = DataSplits(**_splits, is_wide_format=True) splits.dump(folder=tmp_path) @@ -72,6 +73,7 @@ def test_to_long_format(tmp_path): assert splits.val_y is not expected assert splits.val_y.equals(expected) + def test_to_wide_format(tmp_path): splits = DataSplits(**_splits, is_wide_format=True) splits.dump(folder=tmp_path) @@ -85,9 +87,10 @@ def test_to_wide_format(tmp_path): assert splits.val_y is not expected assert splits.val_y.equals(expected) + def test_interpolate(): splits = 
DataSplits(**_splits, is_wide_format=True) - splits._is_wide = True # ToDo. Is not correctly set when init is called. + splits._is_wide = True # ToDo. Is not correctly set when init is called. with pytest.raises(AttributeError): _ = splits.interpolate('non-existing') diff --git a/vaep/tests/models/__pycache__/test_collect_dumps.py b/vaep/tests/models/__pycache__/test_collect_dumps.py index fd585f567..5dbc9b986 100644 --- a/vaep/tests/models/__pycache__/test_collect_dumps.py +++ b/vaep/tests/models/__pycache__/test_collect_dumps.py @@ -7,7 +7,3 @@ def test_select_content(): 'model_metrics_collab'] for test_case in test_cases: assert select_content(test_case, first_split='metrics_') == test_case.split('metrics_')[1] - - - - diff --git a/vaep/tests/pandas/test_calc_errors.py b/vaep/tests/pandas/test_calc_errors.py index c56e20bea..0749cd2d7 100644 --- a/vaep/tests/pandas/test_calc_errors.py +++ b/vaep/tests/pandas/test_calc_errors.py @@ -6,7 +6,7 @@ @fixture def example_data(): - """Example data with duplicated index values. Normally MulitIndex is used with + """Example data with duplicated index values. 
Normally MulitIndex is used with unique combination of sample and feat values.""" data = [[25.47317633, 27.23206642, 26.43510602, 28.40661375, 27.6536975], [30.57866718, 30.17035425, 30.22881888, 29.82725333, 30.1177242], @@ -21,7 +21,7 @@ def example_data(): data = pd.DataFrame(data, index=(f'feat_{i}' for i in [ 0, 0, 1, 1, 1, 2, 3, 4, 5, 6]), - columns=['observed'] + ['model_' + str(i+1) for i in range(4)]) + columns=['observed'] + ['model_' + str(i + 1) for i in range(4)]) data.columns.name = 'model' data.index.name = 'feat' data['freq_feat'] = [4, 5, 5, 4, 6, 7, 7, 9, 8, 6] diff --git a/vaep/tests/plotting/test_defaults.py b/vaep/tests/plotting/test_defaults.py new file mode 100644 index 000000000..086a1df63 --- /dev/null +++ b/vaep/tests/plotting/test_defaults.py @@ -0,0 +1,9 @@ +from vaep.plotting.defaults import assign_colors + + +def test_assign_colors(): + expected = [(0.8392156862745098, 0.15294117647058825, 0.1568627450980392), + (0.17254901960784313, 0.6274509803921569, 0.17254901960784313), + (0.21044753832183283, 0.6773105080456748, 0.6433941168468681)] + assigned = assign_colors(['DAE', 'CF', 'Test']) + assert assigned == expected diff --git a/vaep/tests/test_ae.py b/vaep/tests/test_ae.py index 1b30424f5..319f91e81 100644 --- a/vaep/tests/test_ae.py +++ b/vaep/tests/test_ae.py @@ -19,6 +19,7 @@ ) )""" + def test_basic_repr(): model = ae.Autoencoder(n_features=100, n_neurons=30) actual_repr = repr(model) @@ -26,6 +27,3 @@ def test_basic_repr(): assert model.dim_latent == 10 assert model.n_neurons == [30] assert model.n_features == 100 - - - diff --git a/vaep/tests/test_collab.py b/vaep/tests/test_collab.py index 42c4e4cc2..1e770d911 100644 --- a/vaep/tests/test_collab.py +++ b/vaep/tests/test_collab.py @@ -13,16 +13,17 @@ index=[f'sample_{i}' for i in range(N)], columns=(f'feat_{i}' for i in range(M))) -data = {'train_X': df.iloc[:int(N*0.6)], - 'val_y': df.iloc[int(N*0.6):int(N*0.8)], - 'test_y': df.iloc[int(N*0.8):]} +data = {'train_X': 
df.iloc[:int(N * 0.6)], + 'val_y': df.iloc[int(N * 0.6):int(N * 0.8)], + 'test_y': df.iloc[int(N * 0.8):]} data = DataSplits(**data, is_wide_format=True) -assert data._is_wide +assert data._is_wide data.to_long_format() + def test_combine_data(): N_train, N_val = len(data.train_X), len(data.val_y) X, frac = collab.combine_data(data.train_X, data.val_y) assert len(X) == N_train + N_val - npt.assert_almost_equal(frac, N_val / (N_train+N_val)) + npt.assert_almost_equal(frac, N_val / (N_train + N_val)) diff --git a/vaep/tests/test_fasta.py b/vaep/tests/test_fasta.py deleted file mode 100644 index 194636498..000000000 --- a/vaep/tests/test_fasta.py +++ /dev/null @@ -1,47 +0,0 @@ -from vaep.fasta import get_n_miscleaved -from vaep.fasta import find_rxk_peptides -from vaep.fasta import count_peptide_matches - - -def test_get_n_miscleaved_miss_1(): - pep_seqs = ['MSSHEGGK', 'K', 'K', 'ALK', 'QPK', 'K', 'QAK', - 'EMDEEEK', 'AFK', - 'QK', 'QK', 'EEQK', 'K', 'LEVLK', 'AK', - 'VVGK', 'GPLATGGIK', 'K', 'SGK', 'K'] - - true_result = set(['MSSHEGGKK', 'KK', 'KALK', 'ALKQPK', 'QPKK', 'KQAK', 'QAKEMDEEEK', - 'EMDEEEKAFK', 'AFKQK', - 'QKQK', 'QKEEQK', 'EEQKK', 'KLEVLK', 'LEVLKAK', 'AKVVGK', - 'VVGKGPLATGGIK', 'GPLATGGIKK', 'KSGK', 'SGKK']) - - result = get_n_miscleaved(pep_sequences=pep_seqs, num_missed=1) - - assert len(true_result.difference(result)) == 0, print( - true_result.difference(result)) - - -def test_find_rxk_peptides(): - - pep_seqs = ['MSSHEGGK', 'K', 'K', 'ALK', 'QPK', 'K', 'QAK', 'EMDEEEK', 'AFK', - 'QK', 'QK', 'EEQK', 'K', 'LEVLK', 'AK', 'VVGK', 'GPLATGGIK', 'K', 'SGK', 'K'] - - true_rdx_peps = ['KKALK', 'KALKQPK', 'KQAKEMDEEEK', 'QKQKEEQK', - 'QKEEQKK', 'KLEVLKAK', 'AKVVGKGPLATGGIK', 'KSGKK'] - - assert true_rdx_peps == find_rxk_peptides( - pep_seqs), 'Build sequence: {}'.format(repr(find_rxk_peptides(pep_seqs))) - - -def test_count_pep_mapped_to_gene(): - test_peptide_to_proteinID = {'LMHIQPPK': [ - 'A0JLT2', 'A0A2R8YDL4', 'A0A494C0G4', 'A0A494C0Y4', 'A0JLT2-2', 
'J3KR33']} - test_protein_to_gene = {'A0JLT2': 'MED19', 'J3KR33': 'MED19'} - - assert count_peptide_matches( - test_peptide_to_proteinID, test_protein_to_gene) == {'LMHIQPPK': 6} - - assert count_peptide_matches(test_peptide_to_proteinID, test_protein_to_gene, level='protein') == { - 'LMHIQPPK': 5}, 'Error for canonical protein level (combination of all isotopes in UniProt)' - - assert count_peptide_matches( - test_peptide_to_proteinID, test_protein_to_gene, level='gene') == {'LMHIQPPK': 1} diff --git a/vaep/tests/test_helpers.py b/vaep/tests/test_helpers.py index d860bc319..fcde11887 100644 --- a/vaep/tests/test_helpers.py +++ b/vaep/tests/test_helpers.py @@ -3,9 +3,9 @@ from vaep.utils import create_random_missing_data + def test_create_random_missing_data(): data = create_random_missing_data(N=43, M=13, prop_missing=0.2) assert data.shape == (43, 13) assert np.isnan(data).sum() - assert abs((float(np.isnan(data).sum()) / (43 * 13) ) - 0.2 ) < 0.05 - + assert abs((float(np.isnan(data).sum()) / (43 * 13)) - 0.2) < 0.05 diff --git a/vaep/tests/test_imputation.py b/vaep/tests/test_imputation.py index 7419705c8..747b71c80 100644 --- a/vaep/tests/test_imputation.py +++ b/vaep/tests/test_imputation.py @@ -9,7 +9,7 @@ fraction_missing = proteins.notna().mean() -data = data[data.columns[fraction_missing > 0.4]] +data = data[data.columns[fraction_missing > 0.4]] N_FEAT = 200 N_FEAT_digits = len(str(N_FEAT)) data = data.sample(N_FEAT, axis=1) @@ -56,30 +56,29 @@ def test_imputation_normal_dist(): # def test_imputation_mixed_norm_KNN(): # pass + + @pytest.mark.parametrize('axis', [0, 1]) def test_impute_shifted_normal(example_data, axis): - mean_shift=1.8 - # remove zeros as these lead to -inf + mean_shift = 1.8 + # remove zeros as these lead to -inf example_data = np.log2(example_data.replace({0.0: np.nan}) - ).dropna(thresh=10, axis=1-axis) + ).dropna(thresh=10, axis=1 - axis) N, M = example_data.shape mask_observed = example_data.notna() imputed = 
impute_shifted_normal(example_data, axis=axis, mean_shift=mean_shift) - assert len(imputed) == ((N*M) - len(example_data.stack())) - + assert len(imputed) == ((N * M) - len(example_data.stack())) + if axis == 1: min_N = int(len(example_data) * 0.6) selected = example_data.dropna(axis=1, thresh=min_N) elif axis == 0: min_M = int(example_data.shape[1] * 0.6) selected = example_data.dropna(axis=0, thresh=min_M) - + mean = selected.mean(axis=axis) std = selected.std(axis=axis) mean_shifted = mean - (std * mean_shift) mean_imputed = imputed.unstack().mean(axis=axis) assert (mean_shifted - mean_imputed).abs().max() < 0.35 - - - diff --git a/vaep/tests/test_io.py b/vaep/tests/test_io.py index 610746a4c..143d705c5 100644 --- a/vaep/tests/test_io.py +++ b/vaep/tests/test_io.py @@ -3,16 +3,17 @@ import numpy as np import numpy.testing as npt -import vaep.io +import vaep.io from vaep.io.datasets import PeptideDatasetInMemory -data = np.random.random(size=(10,5)) +data = np.random.random(size=(10, 5)) mask = ~(data < 0.1) data_w_na = np.where(mask, data, np.nan) assert (data != data_w_na).any() assert (~np.isnan(data_w_na) == mask).all() + def test_PeptideDatasetInMemory_wo_Mask(): train_ds = PeptideDatasetInMemory(data_w_na, fill_na=0.0) mask_isna = np.isnan(data_w_na) @@ -25,8 +26,8 @@ def test_PeptideDatasetInMemory_wo_Mask(): def test_relative_to(): fpath = Path('project/runs/experiment_name/run') - pwd = 'project/runs/' # per defaut '.' (the current working directory) - expected = Path('experiment_name/run') + pwd = 'project/runs/' # per defaut '.' (the current working directory) + expected = Path('experiment_name/run') acutal = vaep.io.resolve_path(fpath, pwd) assert expected == acutal @@ -36,4 +37,4 @@ def test_relative_to(): # pwd = 'root/home/project/runs/' # per defaut '.' 
(the current working directory) # expected = Path('root/home/project/data/file') # acutal = vaep.io.resolve_path(fpath, pwd) - # assert expected == acutal \ No newline at end of file + # assert expected == acutal diff --git a/vaep/tests/test_nb.py b/vaep/tests/test_nb.py index 906406edb..a6dddb8b5 100644 --- a/vaep/tests/test_nb.py +++ b/vaep/tests/test_nb.py @@ -6,4 +6,4 @@ def test_Config(): cfg = Config() cfg.test = 'test' with pytest.raises(AttributeError): - cfg.test = 'raise AttributeError' \ No newline at end of file + cfg.test = 'raise AttributeError' diff --git a/vaep/tests/test_pandas.py b/vaep/tests/test_pandas.py index 555772594..4d46cc2f7 100644 --- a/vaep/tests/test_pandas.py +++ b/vaep/tests/test_pandas.py @@ -5,12 +5,12 @@ def test_interpolate(): test_data = { - "pep1": {0: nan, 1: 27.8, 2: 28.9, 3: nan, 4: 28.7}, - "pep2": {0: 29.1, 1: nan, 2: 27.6, 3: 29.1, 4: nan}, + "pep1": {0: nan, 1: 27.8, 2: 28.9, 3: nan, 4: 28.7}, + "pep2": {0: 29.1, 1: nan, 2: 27.6, 3: 29.1, 4: nan}, # 4 values replace based on one (edge case): - "pep3": {0: nan, 1: nan, 2: 23.6, 3: nan, 4: nan}, - "pep4": {0: nan, 1: nan, 2: nan, 3: nan, 4: nan}, - "pep5": {0: 26.0, 1: 27.0, 2: nan, 3: nan, 4: nan}, + "pep3": {0: nan, 1: nan, 2: 23.6, 3: nan, 4: nan}, + "pep4": {0: nan, 1: nan, 2: nan, 3: nan, 4: nan}, + "pep5": {0: 26.0, 1: 27.0, 2: nan, 3: nan, 4: nan}, } df_test_data = pd.DataFrame(test_data) @@ -52,29 +52,6 @@ def test_flatten_dict_of_dicts(): assert expected == actual -def test_create_dict_of_dicts(): - data = {('a', 'a1', 'a2'): 1, - ('a', 'a1', 'a3'): 2, - ('b', 'b1', 'b2'): 3, - ('b', 'b1', 'b3'): 4} - expected = { - "a": {'a1': {'a2': 1, 'a3': 2}}, - "b": {'b1': {'b2': 3, 'b3': 4}} - } - actual = vaep.pandas.create_dict_of_dicts(data) - assert expected == actual - - data = {('a', 'a1', 'a2'): (1, 1), - ('a', 'a1', 'a3'): (2, 2), - ('b', 'b1', 'b2'): (3, 3), - ('b', 'b1', 'b3'): (4, 4)} - expected = { - "a": {'a1': {'a2': [1, 1], 'a3': [2, 2]}}, - "b": {'b1': 
{'b2': [3, 3], 'b3': [4, 4]}} - } - actual = vaep.pandas.create_dict_of_dicts(data, transform_values=list) - assert expected == actual - def test_key_map(): # Build a schema of dicts @@ -112,4 +89,4 @@ def test_key_map(): 'gamma': ('a', 'b'), 'delta': None}} actual = vaep.pandas.key_map(d) - assert expected == actual \ No newline at end of file + assert expected == actual diff --git a/vaep/tests/test_sampling.py b/vaep/tests/test_sampling.py index bed514892..813c79331 100644 --- a/vaep/tests/test_sampling.py +++ b/vaep/tests/test_sampling.py @@ -53,5 +53,7 @@ def test_sample_data(random_data): series_sampled) + len(series_not_sampled) assert X.index.difference( series_sampled.index.append(series_not_sampled.index)).empty - assert series_sampled.loc[pd.IndexSlice[:, excluded_feat]].empty - assert not series_not_sampled.loc[pd.IndexSlice[:, excluded_feat]].empty + idx_excluded = series_sampled.index.isin(excluded_feat, level=1) + assert series_sampled.loc[idx_excluded].empty + idx_excluded = series_not_sampled.index.isin(excluded_feat, level=1) + assert not series_not_sampled.loc[idx_excluded].empty diff --git a/vaep/tests/test_transfrom.py b/vaep/tests/test_transfrom.py index 0aa0176e8..2a1463580 100644 --- a/vaep/tests/test_transfrom.py +++ b/vaep/tests/test_transfrom.py @@ -4,7 +4,6 @@ import numpy.testing as npt - import sklearn from sklearn import preprocessing from sklearn import impute @@ -43,7 +42,7 @@ def test_Vaep_Pipeline(): dae_default_pipeline = sklearn.pipeline.Pipeline( [ ('normalize', preprocessing.StandardScaler()), - ('impute', impute.SimpleImputer(add_indicator=False)) # True won't work + ('impute', impute.SimpleImputer(add_indicator=False)) # True won't work ] ) from random_data import data @@ -52,20 +51,20 @@ def test_Vaep_Pipeline(): # new procs, transform equal encode, inverse_transform equals decode dae_transforms = VaepPipeline(df, encode=dae_default_pipeline) res = dae_transforms.transform(df) - assert type(res) == pd.DataFrame + assert 
isinstance(res, pd.DataFrame) with pytest.raises(ValueError): res = dae_transforms.inverse_transform(res) # pd.DataFrame - with pytest.raises(ValueError): + with pytest.raises(ValueError): _ = dae_transforms.inverse_transform(res.iloc[0]) # pd.DataFrame - with pytest.raises(ValueError): + with pytest.raises(ValueError): _ = dae_transforms.inverse_transform(res.loc['sample_156']) # pd.DataFrame - with pytest.raises(ValueError): + with pytest.raises(ValueError): _ = dae_transforms.inverse_transform(to_tensor(res)) # torch.Tensor - with pytest.raises(ValueError): + with pytest.raises(ValueError): _ = dae_transforms.inverse_transform(res.values) # numpy.array - with pytest.raises(ValueError): + with pytest.raises(ValueError): _ = dae_transforms.inverse_transform(res.values[0]) # single sample dae_transforms = VaepPipeline(df, encode=dae_default_pipeline, decode=['normalize']) res = dae_transforms.transform(df) res = dae_transforms.inverse_transform(res) - npt.assert_array_almost_equal(df.values[mask], res.values[mask]) \ No newline at end of file + npt.assert_array_almost_equal(df.values[mask], res.values[mask]) diff --git a/vaep/tests/test_utils.py b/vaep/tests/test_utils.py index 3191e9678..c6bad659f 100644 --- a/vaep/tests/test_utils.py +++ b/vaep/tests/test_utils.py @@ -1,8 +1,5 @@ import pathlib -from vaep.utils import append_to_filepath, sample_iterable - -# def test_sample_iterable(): -# pass +from vaep.utils import append_to_filepath def test_append_to_filepath(): diff --git a/vaep/tf_board.py b/vaep/tf_board.py index 90f5b6ecf..bf8959e09 100644 --- a/vaep/tf_board.py +++ b/vaep/tf_board.py @@ -5,9 +5,10 @@ class TensorboardModelNamer(): """PyTorch SummaryWriter helper class for experiments. 
- + Creates new SummaryWriter for an experiment """ + def __init__(self, prefix_folder, root_dir=Path('runs')): """[summary] @@ -30,7 +31,7 @@ def get_model_name(self, hidden_layers: int, name = 'model_' name += f'hl{hidden_layers:02d}' - if type(neurons) == str: + if isinstance(neurons, str): neurons = neurons.split() elif not type(neurons) in [list, tuple]: raise TypeError( @@ -39,7 +40,7 @@ def get_model_name(self, hidden_layers: int, for x in neurons: name += f'_{x}' - if type(scaler) == str: + if isinstance(scaler, str): name += f'_{scaler}' else: name += f'_{scaler!r}' diff --git a/vaep/transform.py b/vaep/transform.py index ee3b9ac87..533599161 100644 --- a/vaep/transform.py +++ b/vaep/transform.py @@ -73,6 +73,7 @@ def inverse_transform(self, X, copy=None): # # arguments, see https://fastcore.fast.ai/meta.html#Metaprogramming # # decorate() + def transform(self, X, **kwargs): res = super(self.__class__, self).transform(X, **kwargs) if isinstance(X, pd.DataFrame): @@ -140,8 +141,9 @@ def get_df_fitted_mean_std(self, index): class VaepPipeline(): """Custom Pipeline combining a pandas.DataFrame and a sklearn.pipeline.Pipleine.""" - def __init__(self, df_train:pd.DataFrame, encode:sklearn.pipeline.Pipeline, - decode:List[str] =None): + + def __init__(self, df_train: pd.DataFrame, encode: sklearn.pipeline.Pipeline, + decode: List[str] = None): """[summary] Parameters @@ -153,7 +155,7 @@ def __init__(self, df_train:pd.DataFrame, encode:sklearn.pipeline.Pipeline, decode : List[str], optional subset of transforms (their string name) as an Iterable, by default None, i.e. 
the same as encode - """ + """ self.columns = df_train.columns self.M = len(df_train.columns) self.encode = encode @@ -163,20 +165,18 @@ def __init__(self, df_train:pd.DataFrame, encode:sklearn.pipeline.Pipeline, for d in decode: self.decode.append( (d, self.encode.named_steps[d]) - ) + ) self.decode = sklearn.pipeline.Pipeline(self.decode) else: self.decode = self.encode - - - + def transform(self, X): res = self.encode.transform(X) if isinstance(X, pd.DataFrame): return pd.DataFrame(res, columns=X.columns, index=X.index) return res - + # Option: single-dispatch based on type of X def inverse_transform(self, X, index=None): columns = self.columns @@ -195,4 +195,4 @@ def inverse_transform(self, X, index=None): X = X.reshape(-1, self.M) res = self.decode.inverse_transform(X) res = pd.DataFrame(res, columns=columns, index=index) - return res \ No newline at end of file + return res diff --git a/vaep/utils.py b/vaep/utils.py index 0ddccb131..245ba5088 100644 --- a/vaep/utils.py +++ b/vaep/utils.py @@ -1,4 +1,3 @@ -from random import sample import pathlib from typing import Union import numpy as np @@ -7,20 +6,12 @@ from vaep.io.datasplits import long_format -def sample_iterable(iterable: dict, n=10) -> list: - """Sample some keys from a given dictionary.""" - n_examples_ = n if len(iterable) > n else len(iterable) - keys = list(iterable) - sample_ = sample(keys, n_examples_) - return sample_ - - def append_to_filepath(filepath: Union[pathlib.Path, str], to_append: str, sep: str = '_', new_suffix: str = None) -> pathlib.Path: - """Append filepath with specified to_append using a seperator. - + """Append filepath with specified to_append using a seperator. 
+ Example: `data.csv` to data_processed.csv """ filepath = pathlib.Path(filepath) @@ -59,11 +50,11 @@ def create_long_df(N: int, M: int, prop_missing=0.1): def create_random_df(N: int, M: int, - scaling_factor: float = 30.0, - prop_na: float = 0.0, - start_idx: int = 0, - name_index='Sample ID', - name_columns='peptide'): + scaling_factor: float = 30.0, + prop_na: float = 0.0, + start_idx: int = 0, + name_index='Sample ID', + name_columns='peptide'): X = np.random.rand(N, M) if prop_na > 0.0 and prop_na < 1.0: @@ -74,7 +65,7 @@ def create_random_df(N: int, M: int, X = pd.DataFrame(X, index=[f'sample_{i:0{len(str(N))}}' - for i in range(start_idx, start_idx+N)], + for i in range(start_idx, start_idx + N)], columns=(f'feat_{i:0{len(str(M))}}' for i in range(M))) X.index.name = name_index X.columns.name = name_columns diff --git a/workflows/README.md b/workflows/README.md deleted file mode 100644 index f7962031c..000000000 --- a/workflows/README.md +++ /dev/null @@ -1,73 +0,0 @@ -# Workflows - -## Snakemake -Snakemake is a framework for execution of workflows on UNIX based systems. -It is written in the line of thought of -[`GNU Makefiles`](https://www.opensourceforu.com/2012/06/gnu-make-in-detail-for-beginners/), -but as an extension to Python rather than `C/C++`. 
- -### Setup -``` -conda install -n snakemake snakemake pygraphviz -``` - -## Interacting with erda - -### Setup -In your `~/.ssh/config` define a target, here the `SharedFolderName` is called by `hela`: - -``` -Host hela -Hostname io.erda.dk -VerifyHostKeyDNS yes -User SharedFolderName -``` - -### Connect interactively - -Then you can connect to `hela` using the `sftp`-command, and copy files to your -local `data`-folder: - -``` -sftp -B 258048 hela # pw is SharedFolderName -get file1.raw data/ -get file2.raw data/ -``` - -### In Shell Script - -### In Python Script -Checkout snakemake's [SFTP](https://snakemake.readthedocs.io/en/stable/snakefiles/remote_files.html#file-transfer-over-ssh-sftp) -functionality which uses [`pysftp`](https://pysftp.readthedocs.io/en/release_0.2.8/pysftp.html#pysftp.Connection). - - -## Get file list from folder - -Once you have some files uploaded to erda, once in a while you could check which files -you already did store there. Assuming you followed the previous setup step, using the -hostname `io.erda.dk`, you can query the files in a `directory` and store the to a file -named `files_and_folders_in_dir.txt`: - -`sftp -q io.erda.dk:directory/ <<< "ls" | grep -v '^sftp>' > files_and_folders_in_dir.txt` - -### Examples and log - -``` -sftp -q hela <<< "ls" | grep -v '^sftp>' > hela_files.txt -sftp -q io.erda.dk:share_hela_raw/MNT_2019_Proteomics/MNT/ <<< "ls" | grep -v '^sftp>' > hela_mnt_2019.txt -sftp -q io.erda.dk:share_hela_raw/MNT_2020_MM/ <<< "ls" | grep -v '^sftp>' > hela_mnt_2020.txt -sftp -q io.erda.dk:share_hela_raw/MNT_2019_Clinical_Proteomics/MNT/ <<< "ls" | grep -v '^sftp>' > hela_mnt_2019_clinical_proteomics.txt -sftp -q io.erda.dk:share_hela_raw/MNT_2017_Proteomics/ <<< "ls" | grep -v '^sftp>' > hela_mnt_2017_proteomics.txt -sftp -q io.erda.dk:share_hela_raw/MNT_2017_Clinical_Proteomics/ <<< "ls" | grep -v '^sftp>' > hela_mnt_2017_clinical_proteomics.txt -sftp -q io.erda.dk:share_hela_raw/MNT_2013_Proteomics/ <<< "ls" | 
grep -v '^sftp>' > hela_mnt_2013_proteomics.txt -sftp -q io.erda.dk:share_hela_raw/MNT_2018_Proteomics/ <<< "ls" | grep -v '^sftp>' > hela_mnt_2018_proteomics.txt -sftp -q io.erda.dk:share_hela_raw/MNT_2018_Clinical_Proteomics/ <<< "ls" | grep -v '^sftp>' > hela_mnt_2018_clinical_proteomics.txt -sftp -q io.erda.dk:share_hela_raw/MNT_2016_Proteomics/ <<< "ls -S" | grep -v '^sftp>' > hela_mnt_2016_proteomics.txt -sftp -q io.erda.dk:share_hela_raw/MNT_2015_Proteomics/ <<< "ls -S" | grep -v '^sftp>' > hela_mnt_2015_proteomics.txt -``` - -get a list of all files in the `mq_out` folder on erda (the default folder for storing results): - -``` -sftp -q io.erda.dk:mq_out/ <<< "ls" | grep -v '^sftp>' > hela_processed.txt -``` \ No newline at end of file diff --git a/workflows/hela_files.txt b/workflows/hela_files.txt deleted file mode 100644 index 462f47fcb..000000000 --- a/workflows/hela_files.txt +++ /dev/null @@ -1,552 +0,0 @@ -20180721_QX7_IgPa_MA_HeLa_500ng_LC12_13.raw -20190416_QX2_ChDe_MA_HeLa_500ng_LC05_CTCDoff_190418180742.raw -20190416_QX3_MiWi_MA_Hela_500ng_LC15.raw -20190416_QX7_AnPi_MA_HeLa_500ng_LC01.raw -20190417_QX0_MaTa_MA_HeLa_500ng_LC07_1_too_much_for_a_cleaned_MS.raw -20190417_QX4_JoSw_MA_HeLa_500ng_BR13_standard.raw -20190417_QX4_JoSw_MA_HeLa_500ng_BR14_new.raw -20190417_QX7_JuSc_MA_HeLa_500ng_LC01.raw -20190418_QX1_JoMu_MA_HeLa_500ng_LC11.raw -20190418_QX8_JuSc_MA_HeLa_500ng_1.raw -20190422_QX3_MaTa_MA_Br14_Hela_500ng_LC15.raw -20190422_QX8_JuSc_MA_HeLa_500ng_1.raw -20190423_QX0_MaPe_MA_HeLa_500ng_LC07_1_high.raw -20190423_QX2_FlMe_MA_HeLa_500ng_LC05_CTCDoff.raw -20190423_QX6_MaTa_MA_HeLa_Br13_500ng_LC09.raw -20190423_QX6_MaTa_MA_HeLa_Br13_500ng_LC09_20190423174837.raw -20190423_QX6_MaTa_MA_HeLa_Br14_500ng_DIA_LC09.raw -20190423_QX7_JuSc_MA_HeLaBr14_500ng_LC02.raw -20190423_QX7_JuSc_MA_HeLa_500ng_LC01.raw -20190424_QX2_FlMe_MA_HeLa_500ng_LC05_CTCDoff.raw -20190424_QX8_JuSc_MA_HeLa_500ng_1.raw -20190425_QX3_MaTa_MA_Hela_500ng_LC15.raw 
-20190425_QX4_JoSw_MA_HeLa_500ng_BR13_standard.raw -20190425_QX4_JoSw_MA_HeLa_500ng_BR13_standard_190425181909.raw -20190425_QX4_JoSw_MA_HeLa_500ng_BR13_standard_190426221220.raw -20190425_QX7_ChDe_MA_HeLaBr14_500ng_LC02.raw -20190425_QX7_ChDe_MA_HeLa_500ng_LC01.raw -20190425_QX8_JuSc_MA_HeLa_500ng_1.raw -20190426_QX1_JoMu_MA_HeLa_500ng_LC11.raw -20190426_QX2_FlMe_MA_HeLa_500ng_LC05_CTCDoff_newcol.raw -20190426_QX2_FlMe_MA_HeLa_500ng_LC05_CTCDoff_newcol_190426220210.raw -20190428_QX1_JoMu_MA_HeLa_Easy11_uPAC_500ng.raw -20190428_QX1_JoMu_MA_HeLa_Easy11_uPAC_500ng_190428112836.raw -20190429_QX0_ChDe_MA_HeLa_500ng_LC07_1_BR13.raw -20190429_QX0_ChDe_MA_HeLa_500ng_LC07_1_BR13_190507121913.raw -20190429_QX0_ChDe_MA_HeLa_500ng_LC07_1_BR14.raw -20190429_QX2_ChDe_MA_HeLa_500ng_LC05_CTCDoff_BR14.raw -20190429_QX2_ChDe_MA_HeLa_500ng_LC05_CTCDoff_newcol.raw -20190429_QX3_ChDe_MA_Hela_500ng_LC15.raw -20190429_QX3_ChDe_MA_Hela_500ng_LC15_190429151336.raw -20190429_QX4_ChDe_MA_HeLa_500ng_BR13_standard.raw -20190429_QX4_ChDe_MA_HeLa_500ng_BR13_standard_190501203657.raw -20190429_QX4_ChDe_MA_HeLa_500ng_BR14_standard.raw -20190429_QX6_ChDe_MA_HeLa_Br13_500ng_LC09.raw -20190429_QX6_ChDe_MA_HeLa_Br14_500ng_LC09.raw -20190430_QX6_ChDe_MA_HeLa_Br13_500ng_LC09.raw -20190430_QX6_ChDe_MA_HeLa_Br14_500ng_LC09.raw -20190501_QX8_MiWi_MA_HeLa_500ng_new.raw -20190501_QX8_MiWi_MA_HeLa_500ng_old.raw -20190502_QX7_ChDe_MA_HeLaBr14_500ng.raw -20190502_QX7_ChDe_MA_HeLa_500ng.raw -20190502_QX8_MiWi_MA_HeLa_500ng_new.raw -20190502_QX8_MiWi_MA_HeLa_500ng_old.raw -20190503_QX1_LiSc_MA_HeLa_500ng_LC10.raw -20190506_QX2_FlMe_MA_HeLa_500ng_LC05_CTCDoff_1.raw -20190506_QX2_FlMe_MA_HeLa_500ng_LC05_CTCDoff_2.raw -20190506_QX3_ChDe_MA_Hela_500ng_LC15.raw -20190506_QX4_JiYu_MA_HeLa_500ng_BR13_standard.raw -20190506_QX6_ChDe_MA_HeLa_Br13_500ng_LC09.raw -20190506_QX7_ChDe_MA_HeLaBr14_500ng.raw -20190506_QX7_ChDe_MA_HeLa_500ng.raw -20190506_QX8_MiWi_MA_HeLa_500ng_new.raw -20190506_QX8_MiWi_MA_HeLa_500ng_old.raw 
-20190507_QX4_JiYu_MA_HeLa_500ng_BR14_standard.raw -20190507_QX6_ChDe_MA_HeLa_Br13_500ng_LC09.raw -20190508_QX2_FlMe_MA_HeLa_500ng_LC05_CTCDoff_1.raw -20190509_QX6_ChDe_MA_HeLa_Br14_500ng_LC09.raw -20190509_QX6_ChDe_MA_HeLa_Br14_500ng_LC09_20190509120700.raw -20190510_QX0_ChDe_MA_HeLa_500ng_LC07_1_BR14.raw -20190511_QX0_ChDe_MA_HeLa_500ng_LC07_1_BR14.raw -20190513_QX1_LiSc_MA_HeLa_500ng_LC10.raw -20190513_QX2_SeVW_MA_HeLa_500ng_LC05_CTCDoff_1.raw -20190513_QX3_ChDe_MA_Hela_500ng_LC15.raw -20190513_QX7_ChDe_MA_HeLaBr14_500ng.raw -20190513_QX8_MiWi_MA_HeLa_BR14_500ng.raw -20190514_QX0_MaPe_MA_HeLa_500ng_LC07_1_BR14.raw -20190514_QX1_JoMu_MA_HeLa_500ng_LC10_DMSO.raw -20190514_QX1_JoMu_MA_HeLa_500ng_LC10_DMSO_190514213034.raw -20190514_QX4_JiYu_MA_HeLa_500ng.raw -20190514_QX4_JiYu_MA_HeLa_500ng_BR14.raw -20190514_QX6_ChDe_MA_HeLa_Br13_500ng_LC09.raw -20190514_QX6_ChDe_MA_HeLa_Br14_500ng_LC09.raw -20190514_QX6_ChDe_MA_HeLa_Br14_500ng_LC09_20190515085753.raw -20190515_QX3_AsJa_MA_Hela_500ng_LC15.raw -20190515_QX4_JiYu_MA_HeLa_500ng_BR14.raw -20190515_QX6_ChDe_MA_HeLa_Br14_500ng_LC09.raw -20190515_QX8_MiWi_MA_HeLa_BR14_500ng.raw -20190515_QX8_MiWi_MA_HeLa_BR14_500ng_190516123056.raw -20190516_QX0_AlRe_MA_HeLa_500ng_LC07_1_BR14.raw -20190516_QX0_AlRe_MA_HeLa_500ng_LC07_1_BR14_190516181021.raw -20190516_QX4_JiYu_MA_HeLa_500ng_BR14.raw -20190516_QX4_JiYu_MA_HeLa_500ng_BR14_190518195426.raw -20190517_QX0_AlRe_MA_HeLa_500ng_LC07_1_BR14.raw -20190517_QX8_MiWi_MA_HeLa_BR14_500ng.raw -20190520_QX1_JoMu_MA_HeLa_500ng_LC10_DMSO.raw -20190520_QX2_SeVW_MA_HeLa_500ng_LC05_CTCDoff_1.raw -20190520_QX3_LiSc_MA_Hela_500ng_LC15.raw -20190520_QX4_JoSw_MA_HeLa_500ng.raw -20190521_QX0_MaPe_MA_HeLa_500ng_LC07_1_BR14.raw -20190521_QX0_MaPe_MA_HeLa_500ng_LC07_1_BR14_190521192614.raw -20190521_QX1_LiSc_MA_HeLa_500ng_LC14.raw -20190521_QX3_LiSc_MA_Hela_500ng_LC15.raw -20190521_QX4_JoSw_MA_HeLa_500ng.raw -20190521_QX6_AsJa_MA_HeLa_Br14_500ng_LC09.raw 
-20190521_QX6_AsJa_MA_HeLa_Br14_500ng_LC09_20190522134621.raw -20190521_QX7_MaMu_MA_HeLaBr14_500ng.raw -20190521_QX8_MiWi_MA_HeLa_BR14_500ng.raw -20190522_QX0_MaPe_MA_HeLa_500ng_LC07_1_BR14.raw -20190522_QX0_MaPe_MA_HeLa_500ng_LC07_1_BR14_190524170803.raw -20190523_QX3_LiSc_MA_Hela_500ng_LC15.raw -20190523_QX8_MiWi_MA_HeLa_BR14_500ng.raw -20190523_QX8_MiWi_MA_HeLa_BR14_500ng_08isolation.raw -20190524_QX4_JoSw_MA_HeLa_500ng.raw -20190525_QX1_PhGe_MA_HeLa_500ng_LC10.raw -20190525_QX1_PhGe_MA_HeLa_500ng_LC10_190525140733.raw -20190525_QX1_PhGe_MA_HeLa_500ng_LC10_190526143611.raw -20190525_QX1_PhGe_MA_HeLa_500ng_LC10_190526151124.raw -20190526_QX4_LiSc_MA_HeLa_500ng.raw -20190526_QX8_IgPa_MA_HeLa_BR14_500ng.raw -20190526_QX8_IgPa_MA_HeLa_BR14_500ng_08isolation.raw -20190527_QX0_MaPe_MA_HeLa_500ng_LC07_1_BR14.raw -20190527_QX1_PhGe_MA_HeLa_500ng_LC10.raw -20190527_QX3_LiSc_MA_Hela_500ng_LC15.raw -20190527_QX3_LiSc_MA_Hela_500ng_LC15_190527171650.raw -20190527_QX4_IgPa_MA_HeLa_500ng.raw -20190527_QX7_MaMu_MA_HeLa_Br14_500ng.raw -20190528_QX1_PhGe_MA_HeLa_500ng_LC10.raw -20190528_QX1_PhGe_MA_HeLa_DMSO_500ng_LC14.raw -20190528_QX1_PhGe_MA_HeLa_DMSO_500ng_LC14_190528164924.raw -20190528_QX1_PhGe_MA_HeLa_DMSO_500ng_LC14_190528191042.raw -20190528_QX2_SeVW_MA_HeLa_500ng_LC05_CTCDoff_1.raw -20190528_QX6_AsJa_MA_HeLa_Br14_500ng_LC09.raw -20190528_QX6_AsJa_MA_HeLa_Br14_500ng_LC09_2nd.raw -20190528_QX8_MiWi_MA_HeLa_BR14_500ng.raw -20190528_QX8_MiWi_MA_HeLa_BR14_500ng_190531131859.raw -20190530_QX2_SeVW_MA_HeLa_500ng_LC05_CTCDoff_1.raw -20190530_QX4_IgPa_MA_HeLa_500ng.raw -20190530_QX6_AsJa_MA_HeLa_Br14_500ng_LC09.raw -20190531_QX3_AnSe_MA_Hela_500ng_LC15.raw -20190531_QX8_MiWi_MA_HeLa_BR14_500ng.raw -20190601_QX1_JoMu_MA_HeLa_DMSO_500ng_LC14.raw -20190603_QX0_MePh_MA_HeLa_500ng_LC07_1_BR14.raw -20190603_QX3_AnSe_MA_Hela_500ng_LC15.raw -20190603_QX3_AnSe_MA_Hela_500ng_LC15_190603172414.raw -20190603_QX4_JiYu_MA_HeLa_500ng.raw -20190603_QX7_IgPa_MA_HeLa_Br14_500ng.raw 
-20190603_QX7_IgPa_MA_HeLa_Br14_500ng_190607132950.raw -20190604_QX8_MiWi_MA_HeLa_BR14_500ng.raw -20190605_QX0_MePh_MA_HeLa_500ng_LC07_1_BR14.raw -20190605_QX0_MePh_MA_HeLa_500ng_LC07_1_BR14_190610203402.raw -20190605_QX3_ChDe_MA_Hela_500ng_LC15.raw -20190606_QX4_JiYu_MA_HeLa_500ng.raw -20190606_QX8_MiWi_MA_HeLa_BR14_500ng.raw -20190607_QX4_JiYu_MA_HeLa_500ng.raw -20190609_QX8_MiWi_MA_HeLa_BR14_500ng.raw -20190609_QX8_MiWi_MA_HeLa_BR14_500ng_190609195817.raw -20190609_QX8_MiWi_MA_HeLa_BR14_500ng_190625163359.raw -20190609_QX8_MiWi_MA_HeLa_BR14_500ng_190625212127.raw -20190610_QX1_JoMu_MA_HeLa_DMSO_500ng_LC14.raw -20190611_QX0_AnBr_MA_HeLa_500ng_LC07_1.raw -20190611_QX0_AnBr_MA_HeLa_500ng_LC07_2.raw -20190611_QX0_MaTa_MA_HeLa_500ng_LC07_1.raw -20190611_QX0_MePh_MA_HeLa_500ng_LC07_1.raw -20190611_QX0_MePh_MA_HeLa_500ng_LC07_1_190618114751.raw -20190611_QX0_MePh_MA_HeLa_500ng_LC07_2.raw -20190611_QX0_MePh_MA_HeLa_500ng_LC07_3.raw -20190611_QX0_MePh_MA_HeLa_500ng_LC07_4.raw -20190611_QX0_MePh_MA_HeLa_500ng_LC07_5.raw -20190611_QX2_SeVW_MA_HeLa_500ng_LC05_CTCDoff_1.raw -20190611_QX3_LiSc_MA_Hela_500ng_LC15.raw -20190611_QX4_JiYu_MA_HeLa_500ng.raw -20190611_QX7_IgPa_MA_HeLa_Br14_500ng.raw -20190611_QX7_IgPa_MA_HeLa_Br14_500ng_190618134442.raw -20190612_QX2_SeVW_MA_HeLa_500ng_LC05_CTCDoff_1.raw -20190612_QX3_JoMu_MA_HeLa_500ng_LC15_uPAC200cm.raw -20190613_QX0_MePh_MA_HeLa_500ng_LC07_1.raw -20190613_QX0_MePh_MA_HeLa_500ng_LC07_Stab_01.raw -20190613_QX0_MePh_MA_HeLa_500ng_LC07_Stab_02.raw -20190613_QX0_MePh_MA_HeLa_500ng_LC07_Stab_03.raw -20190613_QX4_JiYu_MA_HeLa_500ng.raw -20190614_QX2_SeVW_MA_HeLa_500ng_LC05_CTCDoff_1.raw -20190614_QX3_JoSw_MA_Hela_500ng_LC15.raw -20190614_QX3_JoSw_MA_Hela_500ng_LC15_190709210149.raw -20190615_QX4_JiYu_MA_HeLa_500ng.raw -20190617_QX4_JiYu_MA_HeLa_500ng.raw -20190617_QX8_IgPa_MA_HeLa_500ng.raw -20190618_QX3_LiSc_MA_Hela_500ng_LC15.raw -20190618_QX3_LiSc_MA_Hela_500ng_LC15_190619053902.raw -20190618_QX4_JiYu_MA_HeLa_500ng.raw 
-20190618_QX4_JiYu_MA_HeLa_500ng_190618125902.raw -20190618_QX4_JiYu_MA_HeLa_500ng_190618174520.raw -20190618_QX4_JiYu_MA_HeLa_500ng_190619010035.raw -20190618_QX4_JiYu_MA_HeLa_500ng_centroid.raw -20190619_QX7_IgPa_MA_HeLa_Br14_500ng.raw -20190619_QX7_IgPa_MA_HeLa_Br14_500ng_190619192949.raw -20190620_QX1_JoMu_MA_HeLa__500ng_LC10.raw -20190621_QX2_SeVW_MA_HeLa_500ng_LC05.raw -20190621_QX3_MePh_MA_Hela_500ng_LC15.raw -20190621_QX3_MePh_MA_Hela_500ng_LC15_190621150413.raw -20190621_QX4_JoMu_MA_HeLa_500ng.raw -20190621_QX4_JoMu_MA_HeLa_500ng_190621161214.raw -20190624_QX3_MaMu_MA_Hela_500ng_LC15.raw -20190624_QX4_JiYu_MA_HeLa_500ng.raw -20190625_QX0_MaPe_MA_HeLa_500ng_LC07_1.raw -20190625_QX7_IgPa_MA_HeLa_Br14_500ng.raw -20190626_QX1_JoMu_MA_HeLa_500ng_LC10.raw -20190626_QX1_JoMu_MA_HeLa_500ng_LC10_190626135146.raw -20190626_QX6_ChDe_MA_HeLa_500ng_LC09.raw -20190626_QX7_IgPa_MA_HeLa_Br14_500ng.raw -20190626_QX8_ChDe_MA_HeLa_BR14_500ng.raw -20190626_QX8_ChDe_MA_HeLa_BR14_500ng_190626194235.raw -20190627_QX0_AnBr_MA_HeLa_500ng_LC07_01.raw -20190627_QX3_MaMu_MA_Hela_500ng_LC15.raw -20190627_QX6_JoMu_MA_HeLa_500ng_LC09.raw -20190627_QX7_IgPa_MA_HeLa_Br14_500ng.raw -20190627_QX8_AnPi_MA_HeLa_BR14_500ng.raw -20190627_QX8_AnPi_MA_HeLa_BR14_500ng_190627185125.raw -20190628_QX0_AnBr_MA_HeLa_500ng_LC07_01.raw -20190628_QX0_AnBr_MA_HeLa_500ng_LC07_02.raw -20190628_QX6_AnPi_MA_HeLa_500ng_LC09.raw -20190629_QX4_JiYu_MA_HeLa_500ng_MAX_ALLOWED.raw -20190630_QX3_ChMa_MA_Hela_500ng_LC15.raw -20190701_QX2_LiSc_MA_HeLa_500ng_LC05.raw -20190701_QX2_LiSc_MA_HeLa_500ng_LC05_without_columnoven.raw -20190701_QX3_MiWi_MA_Hela_500ng_LC15.raw -20190701_QX4_MePh_MA_HeLa_500ng_MAX_ALLOWED.raw -20190701_QX6_MaTa_MA_HeLa_500ng_LC09.raw -20190701_QX8_AnPi_MA_HeLa_BR14_500ng.raw -20190702_QX0_AnBr_MA_HeLa_500ng_LC07_01.raw -20190702_QX0_AnBr_MA_HeLa_500ng_LC07_01_190702180001.raw -20190703_QX4_MaTa_MA_HeLa_500ng_MAX_ALLOWED.raw -20190703_QX7_AnPi_MA_HeLa_Br14_500ng.raw 
-20190704_QX6_MaTa_MA_HeLa_500ng_LC09.raw -20190705_QX0_AnBr_MA_HeLa_500ng_LC07_01.raw -20190705_QX0_AnBr_MA_HeLa_500ng_LC07_01_190707104639.raw -20190705_QX6_ChDe_MA_HeLa_500ng_LC09.raw -20190706_QX4_MiWi_MA_HeLa_500ng.raw -20190706_QX4_MiWi_MA_HeLa_500ng_190707003046.raw -20190707_QX3_MaTa_MA_Hela_500ng_LC15.raw -20190708_QX7_MaMu_MA_HeLa_Br14_500ng.raw -20190708_QX8_AnPi_MA_HeLa_BR14_500ng.raw -20190709_QX1_JoMu_MA_HeLa_500ng_LC10.raw -20190709_QX2_JoMu_MA_HeLa_500ng_LC05.raw -20190709_QX2_JoMu_MA_HeLa_500ng_LC05_190709143552.raw -20190709_QX3_MaTa_MA_Hela_500ng_LC15.raw -20190709_QX6_MaTa_MA_HeLa_500ng_LC09.raw -20190709_QX6_MaTa_MA_HeLa_500ng_LC09_20190709143044.raw -20190709_QX6_MaTa_MA_HeLa_500ng_LC09_20190709155356.raw -20190709_QX7_MaMu_MA_HeLa_Br14_500ng.raw -20190716_QX6_MaTa_MA_HeLa_500ng_LC09.raw -20190716_QX8_AnPi_MA_HeLa_BR14_500ng_190716192109.raw -20190717_QX2_IgPa_MA_HeLa_500ng_CTCDoff_LC05.raw -20190717_QX2_IgPa_MA_HeLa_500ng_CTCDoff_LC05_190719190656.raw -20190717_QX3_OzKa_MA_Hela_500ng_LC15.raw -20190717_QX3_OzKa_MA_Hela_500ng_LC15_190720214645.raw -20190717_QX3_OzKa_MA_Hela_500ng_LC15_190721144939.raw -20190717_QX8_ChSc_MA_HeLa_500ng.raw -20190718_QX6_MaTa_MA_HeLa_500ng_LC09.raw -20190718_QX8_ChSc_MA_HeLa_500ng.raw -20190719_QX1_JoMu_MA_HeLa_500ng_LC10.raw -20190719_QX8_ChSc_MA_HeLa_500ng.raw -20190721_QX0_MePh_MA_HeLa_500ng_LC07_01.raw -20190722_QX2_IgPa_MA_HeLa_500ng_CTCDoff_LC05.raw -20190722_QX3_MiWi_MA_Hela_500ng_LC15.raw -20190722_QX4_StEb_MA_HeLa_500ng.raw -20190722_QX6_AsJa_MA_HeLa_500ng_LC09.raw -20190722_QX8_ChSc_MA_HeLa_500ng.raw -20190722_QX8_ChSc_MA_HeLa_500ng_190722174431.raw -20190723_QX1_JoMu_MA_HeLa_500ng_LC10_pack-2000bar.raw -20190723_QX1_JoMu_MA_HeLa_500ng_LC10_pack-2000bar_2.raw -20190723_QX1_JoMu_MA_HeLa_500ng_LC10_pack-2000bar_3.raw -20190723_QX3_MiWi_MA_Hela_500ng_LC15.raw -20190723_QX6_AsJa_MA_HeLa_500ng_LC09.raw -20190724_QX0_MePh_MA_HeLa_500ng_LC07_01.raw -20190724_QX3_MiWi_MA_Hela_500ng_LC15.raw 
-20190724_QX7_AlRe_MA_HeLa_Br14_500ng.raw -20190725_QX0_MePh_MA_HeLa_500ng_LC07_01.raw -20190725_QX2_AnBr_MA_HeLa_500ng_CTCDoff_LC05.raw -20190725_QX2_MePh_MA_HeLa_500ng.raw -20190726_QX8_ChSc_MA_HeLa_500ng.raw -20190728_QX3_MiWi_MA_Hela_500ng_LC15.raw -20190729_QX0_AsJa_MA_HeLa_500ng_LC07_01.raw -20190729_QX2_IgPa_MA_HeLa_500ng_CTCDoff_LC05.raw -20190729_QX3_MiWi_MA_Hela_500ng_LC15.raw -20190729_QX4_PhGe_MA_Hela_500ng_LC15.raw -20190730_QX2_IgPa_MA_HeLa_500ng_CTCDoff_LC05.raw -20190730_QX6_AsJa_MA_HeLa_500ng_LC09.raw -20190731_QX1_LiSc_MA_HeLa_500ng_LC10.raw -20190731_QX2_IgPa_MA_HeLa_500ng_CTCDoff_LC05.raw -20190731_QX6_AsJa_MA_HeLa_500ng_LC09.raw -20190731_QX8_ChSc_MA_HeLa_500ng.raw -20190801_QX3_StEb_MA_Hela_500ng_LC15.raw -20190801_QX6_AsJa_MA_HeLa_500ng_LC09.raw -20190802_QX2_OzKa_MA_HeLa_500ng_CTCDoff_LC05.raw -20190802_QX2_OzKa_MA_HeLa_500ng_CTCDoff_LC05_190802231901.raw -20190802_QX6_MaTa_MA_HeLa_500ng_LC09.raw -20190802_QX6_MaTa_MA_HeLa_500ng_LC09_20190803134200.raw -20190802_QX6_MaTa_MA_HeLa_500ng_LC09_20190803182900.raw -20190802_QX7_AlRe_MA_HeLa_Br14_500ng.raw -20190803_QX8_AnPi_MA_HeLa_BR14_500ng.raw -20190804_QX0_AsJa_MA_HeLa_500ng_LC07_01.raw -20190804_QX3_StEb_MA_Hela_500ng_LC15.raw -20190805_QX0_FyHa_MA_HeLa_500ng_LC07_01.raw -20190805_QX1_LiSc_MA_HeLa_500ng_LC10.raw -20190805_QX2_OzKa_MA_HeLa_500ng_CTCDoff_LC05.raw -20190805_QX2_SeVW_MA_HeLa_500ng_CTCDoff_LC05.raw -20190805_QX7_AlRe_MA_HeLa_Br14_500ng.raw -20190805_QX7_AlRe_MA_HeLa_Br14_500ng_190806072128.raw -20190805_QX8_AnPi_MA_HeLa_BR14_500ng.raw -20190806_QX2_SeVW_MA_HeLa_500ng_CTCDoff_LC05.raw -20190806_QX3_FyHa_MA_Hela_500ng_LC15.raw -20190806_QX3_StEb_MA_Hela_500ng_LC15.raw -20190806_QX4_StEb_MA_HeLa_500ng.raw -20190806_QX6_MaTa_MA_HeLa_500ng_LC09.raw -20190806_QX6_MaTa_MA_HeLa_500ng_LC09_20190806172817.raw -20190806_QX8_AnPi_MA_HeLa_BR14_500ng.raw -20190808_QX6_AsJa_MA_HeLa_500ng_LC09.raw -20190809_QX0_FyHa_MA_HeLa_500ng_LC07_01.raw -20190809_QX7_AlRe_MA_HeLa_Br14_500ng.raw 
-20190809_QX7_AlRe_MA_HeLa_Br14_500ng_190813124558.raw -20190809_QX8_MiWi_MA_HeLa_BR14_500ng.raw -20190811_QX0_AnBr_MA_HeLa_500ng_LC07_01.raw -20190812_QX4_PhGe_MA_Hela_500ng_LC1.raw -20190812_QX6_AsJa_MA_HeLa_500ng_LC09.raw -20190813_QX2_SeVW_MA_HeLa_500ng_CTCDoff_LC05.raw -20190813_QX3_LiSc_MA_Hela_500ng_LC15.raw -20190813_QX4_StEb_MA_HeLa_500ng.raw -20190813_QX6_AsJa_MA_HeLa_500ng_LC09.raw -20190814_QX6_AsJa_MA_HeLa_500ng_LC09.raw -20190814_QX6_AsJa_MA_HeLa_500ng_LC09_20190814155852.raw -20190814_QX8_JaBa_MA_HeLa_BR14_500ng.raw -20190814_QX8_JaBa_MA_HeLa_BR14_500ng_190818004141.raw -20190814_QX8_JaBa_MA_HeLa_BR14_500ng_190819030435.raw -20190815_QX0_AnBr_MA_HeLa_500ng_LC07_01.raw -20190816_QX0_AnBr_MA_HeLa_500ng_LC07_01.raw -20190816_QX0_AnBr_MA_HeLa_500ng_LC07_01_190820113117.raw -20190816_QX0_AnBr_MA_HeLa_500ng_LC07_02.raw -20190816_QX2_SeVW_MA_HeLa_500ng_CTCDoff_LC05.raw -20190816_QX6_AsJa_MA_HeLa_500ng_LC09.raw -20190816_QX7_AnBr_MA_HeLa_Br14_500ng_01.raw -20190816_QX7_AnBr_MA_HeLa_Br14_500ng_02.raw -20190816_QX7_AnBr_MA_HeLa_Br14_500ng_02_190827114153.raw -20190817_QX6_AsJa_MA_HeLa_500ng_LC09.raw -20190818_QX0_AnBr_MA_HeLa_500ng_LC07_CTCD_OFF_01.raw -20190818_QX0_AnBr_MA_HeLa_500ng_LC07_CTCD_OFF_02.raw -20190818_QX0_AnBr_MA_HeLa_500ng_LC07_Standard_01.raw -20190818_QX0_AnBr_MA_HeLa_500ng_LC07_Standard_02.raw -20190818_QX7_AnBr_MA_HeLa_Br14_500ng_01.raw -20190819_QX1_LiSc_MA_HeLa_500ng_LC10.raw -20190819_QX1_LiSc_MA_HeLa_500ng_LC10_2.raw -20190819_QX2_SeVW_MA_HeLa_500ng_CTCDoff_LC05.raw -20190819_QX3_LiSc_MA_Hela_500ng_LC15.raw -20190819_QX4_StEb_MA_HeLa_500ng.raw -20190819_QX6_AsJa_MA_HeLa_500ng_LC09.raw -20190819_QX7_AnBr_MA_HeLa_Br14_500ng_01.raw -20190819_QX8_JaBa_MA_HeLa_BR14_500ng.raw -20190820_QX2_SeVW_MA_HeLa_500ng_CTCDoff_LC10.raw -20190820_QX2_SeVW_MA_HeLa_500ng_CTCDoff_LC10_190820172227.raw -20190820_QX7_AnBr_MA_HeLa_Br14_500ng_01.raw -20190821_QX0_AnBr_MA_HeLa_500ng_LC07_01.raw -20190821_QX6_AsJa_MA_HeLa_500ng_LC09.raw 
-20190822_QX0_AnBr_MA_HeLa_500ng_LC07_01.raw -20190822_QX0_AnBr_MA_HeLa_500ng_LC07_02.raw -20190822_QX0_AnBr_MA_HeLa_500ng_LC07_03.raw -20190822_QX2_SeVW_MA_HeLa_500ng_CTCDoff_LC10.raw -20190822_QX6_AsJa_MA_HeLa_500ng_LC09.raw -20190823_QX0_AnBr_MA_HeLa_500ng_LC07_01.raw -20190823_QX0_AnBr_MA_HeLa_500ng_LC07_01_190824152456.raw -20190823_QX0_AnBr_MA_HeLa_500ng_LC07_01_190825070959.raw -20190823_QX0_AnBr_MA_HeLa_500ng_LC07_01_190825211633.raw -20190823_QX0_AnBr_MA_HeLa_500ng_LC07_1preDischarge.raw -20190823_QX4_LiSc_MA_HeLa_500ng.raw -20190823_QX7_AnBr_MA_HeLa_Br14_500ng_01.raw -20190825_QX6_AsJa_MA_HeLa_500ng_LC09.raw -20190826_QX0_ChDe_MA_HeLa_500ng_LC07_01.raw -20190826_QX0_ChDe_MA_HeLa_500ng_LC07_03.raw -20190826_QX0_ChDe_MA_HeLa_500ng_LC07_05.raw -20190826_QX1_LiSc_MA_HeLa_500ng_LC10.raw -20190826_QX1_LiSc_MA_HeLa_500ng_LC10_190827011213.raw -20190826_QX6_AsJa_MA_HeLa_500ng_LC09.raw -20190826_QX7_AnBr_MA_HeLa_Br14_500ng_01.raw -20190826_QX7_AnBr_MA_HeLa_Br14_500ng_01_190827114215.raw -20190826_QX8_JaBa_MA_HeLa_BR14_500ng.raw -20190827_QX0_ChDe_MA_HeLa_500ng_LC07_01.raw -20190827_QX0_ChDe_MA_HeLa_500ng_LC07_02.raw -20190827_QX0_ChDe_MA_HeLa_500ng_LC07_03.raw -20190827_QX0_ChDe_MA_HeLa_500ng_LC07_04.raw -20190827_QX0_ChDe_MA_HeLa_500ng_LC07_05.raw -20190827_QX0_ChDe_MA_HeLa_500ng_LC07_06.raw -20190827_QX2_SeVW_MA_HeLa_500ng_CTCDoff_LC10.raw -20190827_QX3_LiSc_MA_Hela_500ng_LC15.raw -20190827_QX3_LiSc_MA_Hela_500ng_LC15_190828161744.raw -20190827_QX7_AnBr_MA_HeLa_Br14_500ng_01.raw -20190828_QX1_LiSc_MA_HeLa_500ng_LC10.raw -20190828_QX8_AlRe_MA_HeLa_BR14_500ng.raw -20190829_QX4_LiSc_MA_HeLa_500ng.raw -20190830_QX6_AsJa_MA_HeLa_500ng_LC09.raw -20190902_QX2_SeVW_MA_HeLa_500ng_CTCDoff_LC05.raw -20190902_QX3_ChDe_MA_Hela_500ng_LC15.raw -20190902_QX4_JiYu_MA_HeLa_500ng.raw -20190902_QX4_JiYu_MA_HeLa_500ng_190902201537.raw -20190902_QX4_JiYu_MA_HeLa_500ng_190902223701.raw -20190903_QX0_OzKa_MA_HeLa_500ng_LC07_01.raw 
-20190903_QX0_OzKa_MA_HeLa_500ng_LC07_01_190904145103.raw -20190903_QX6_AsJa_MA_HeLa_500ng_LC09.raw -20190903_QX7_AnBr_MA_HeLa_Br14_500ng_01.raw -20190903_QX8_JuSc_MA_HeLa_BR14_500ng.raw -20190904_QX6_ChDe_MA_HeLa_500ng_LC09.raw -20190904_QX7_AlRe_MA_HeLa_Br14_500ng_01.raw -20190904_QX8_JuSc_MA_HeLa_BR14_500ng.raw -20190905_QX4_JiYu_MA_HeLa_500ng.raw -20190906_QX2_SeVW_MA_HeLa_500ng_CTCDoff_LC05.raw -20190906_QX2_SeVW_MA_HeLa_500ng_CTCDoff_LC05_190906190218.raw -20190909_QX0_OzKa_MA_HeLa_500ng_LC07_01.raw -20190909_QX3_ChDe_MA_Hela_500ng_LC15.raw -20190909_QX7_ChDe_MA_HeLa_Br14_500ng.raw -20190909_QX8_JuSc_MA_HeLa_BR14_500ng.raw -20190909_QX8_JuSc_MA_HeLa_BR14_500ng_190909141252.raw -20190910_QX1_LiSc_MA_HeLa_500ng_LC10_2.raw -20190910_QX4_JoSw_MA_HeLa_500ng.raw -20190911_QX0_ChDe_MA_HeLa_500ng_LC07_01.raw -20190911_QX6_ChDe_MA_HeLa_500ng_LC09.raw -20190911_QX7_ChDe_MA_HeLa_Br14_500ng.raw -20190911_QX8_JuSc_MA_HeLa_BR14_500ng.raw -20190912_QX1_LiSc_MA_HeLa_500ng_LC10_2.raw -20190912_QX2_SeVW_MA_HeLa_500ng_CTCDoff_LC05.raw -20190912_QX6_JuSc_MA_HeLa_500ng_LC09.raw -20190912_QX7_ChDe_MA_HeLa_Br14_500ng.raw -20190912_QX8_JuSc_MA_HeLa_BR14_500ng.raw -20190913_QX0_ChDe_MA_HeLa_500ng_LC07_05.raw -20190913_QX0_ChDe_MA_HeLa_500ng_LC07_07_CTCD_off.raw -20190913_QX0_ChDe_MA_HeLa_500ng_LC07_09.raw -20190913_QX0_ChDe_MA_HeLa_500ng_LC12_01.raw -20190913_QX0_ChDe_MA_HeLa_500ng_LC12_01_190913181030.raw -20190913_QX0_ChDe_MA_HeLa_500ng_LC12_03.raw -20190913_QX0_ChDe_MA_HeLa_500ng_LC12_03_190913230336.raw -20190913_QX4_MiWi_MA_HeLa_500ng.raw -20190913_QX6_ChDe_MA_HeLa_500ng_LC09.raw -20190913_QX7_ChDe_MA_HeLa_Br14_500ng.raw -20190913_QX7_ChDe_MA_HeLa_Br14_500ng_190924060155.raw -20190914_QX3_ChDe_MA_Hela_500ng_LC15.raw -20190914_QX8_JuSc_MA_HeLa_BR14_500ng.raw -20190916_QX3_LiSc_MA_Hela_500ng_LC15.raw -20190916_QX6_JuSc_MA_HeLa_500ng_LC09.raw -20190916_QX7_ChDe_MA_HeLa_Br14_500ng.raw -20190917_QX0_ChDe_MA_HeLa_500ng_LC12_01.raw -20190917_QX3_LiSc_MA_Hela_500ng_LC15.raw 
-20190917_QX4_StEb_MA_HeLa_500ng.raw -20190918_QX0_ChDe_MA_HeLa_500ng_LC12_01.raw -20190918_QX0_ChDe_MA_HeLa_500ng_LC12_01_190918155119.raw -20190918_QX0_ChDe_MA_HeLa_500ng_LC12_03.raw -20190918_QX0_ChDe_MA_HeLa_500ng_LC12_05.raw -20190918_QX0_ChDe_MA_HeLa_500ng_LC12_07.raw -20190918_QX2_SeVW_MA_HeLa_500ng_CTCDoff_LC05.raw -20190918_QX3_OzKa_MA_Hela_500ng_LC15.raw -20190918_QX6_JuSc_MA_HeLa_500ng_LC09.raw -20190918_QX8_JuSc_MA_HeLa_BR14_500ng.raw -20190919_QX1_LiSc_MA_HeLa_75umID_15cm_500ng_LC10_1.raw -20190919_QX1_LiSc_MA_HeLa_75umID_15cm_500ng_LC10_1_191007121002.raw -20190919_QX1_LiSc_MA_HeLa_75umID_15cm_500ng_LC10_2.raw -20190919_QX1_LiSc_MA_HeLa_75umID_15cm_500ng_LC10_3.raw -20190919_QX2_SeVW_MA_HeLa_500ng_CTCDoff_LC05.raw -20190919_QX6_JuSc_MA_HeLa_500ng_LC09.raw -20190919_QX6_JuSc_MA_HeLa_500ng_LC09_20190919173318.raw -20190920_QX3_ADB_MA_HeLa_500ng_DDA.raw -20190920_QX4_StEb_MA_HeLa_500ng.raw -20190920_QX4_StEb_MA_HeLa_500ng_190921074136.raw -20190922_QX8_JuSc_MA_HeLa_BR14_500ng.raw -20190923_QX2_ADB_MA_HeLa_500ng_CTCDoff_LC05_01.raw -20190923_QX2_ADB_MA_HeLa_500ng_CTCDoff_LC05_02.raw -20190923_QX2_ADB_MA_HeLa_500ng_CTCDoff_LC05_02_191015194030.raw -20190923_QX3_MiWi_MA_Hela_500ng_LC15.raw -20190923_QX4_StEb_MA_HeLa_500ng.raw -20190923_QX8_JuSc_MA_HeLa_BR14_500ng.raw -20190924_QX0_AsJa_MA_HeLa_500ng_LC12_01.raw -20190924_QX7_ChDe_MA_HeLa_Br14_500ng.raw -20190925_QX2_ADB_MA_HeLa_500ng_CTCDoff_LC05.raw -20190925_QX2_SeVW_MA_HeLa_500ng_CTCDoff_LC05.raw -20190925_QX6_JuSc_MA_HeLa_500ng_LC09.raw -20190925_QX6_JuSc_MA_HeLa_500ng_LC09_20190926214110.raw -20190925_QX6_JuSc_MA_HeLa_500ng_LC09_20190929121636.raw -20190926_QX4_StEb_MA_HeLa_500ng.raw -20190927_QX2_FlMe_MA_HeLa_500ng_CTCDoff_LC05_01.raw -20190927_QX2_FlMe_MA_HeLa_500ng_CTCDoff_LC05_02.raw -20190927_QX3_MiWi_MA_Hela_500ng_LC15.raw -20190927_QX8_JuSc_MA_HeLa_BR14_500ng.raw -20190929_QX8_JuSc_MA_HeLa_BR14_500ng.raw -20190930_QX0_AsJa_MA_HeLa_500ng_LC12_01.raw -20190930_QX3_MiWi_MA_Hela_500ng_LC15.raw 
-20190930_QX4_MePh_MA_HeLa_500ng.raw -20191001_QX6_JuSc_MA_HeLa_500ng_LC09.raw -20191001_QX7_IgPa_MA_HeLa_500ng_LC12_13.raw -20191001_QX8_JuSc_MA_HeLa_BR14_500ng.raw -20191002_QX4_MePh_MA_HeLa_500ng.raw -20191003_QX3_MiWi_MA_Hela_500ng_LC15.raw -20191004_QX6_JuSc_MA_HeLa_500ng_LC09.raw -20191007_QX1_JoMu_MA_HeLa_500ng_LC10.raw -20191007_QX2_LiSc_MA_HeLa_500ng_CTCDoff_LC05.raw -20191007_QX3_LiSc_MA_Hela_500ng_LC15.raw -20191007_QX3_LiSc_MA_Hela_500ng_LC15_191008004308.raw -20191007_QX3_LiSc_MA_Hela_500ng_LC15_191014174622.raw -20191007_QX4_JoSw_MA_HeLa_500ng.raw -20191007_QX6_JuSc_MA_HeLa_500ng_LC09.raw -20191007_QX7_ChDe_MA_HeLa_Br14_500ng.raw -20191007_QX8_JuSc_MA_HeLa_BR14_500ng.raw -20191008_QX0_AsJa_MA_HeLa_500ng_LC12_01.raw -20191008_QX0_AsJa_MA_HeLa_500ng_LC12_01_191008164424.raw -20191008_QX7_ChDe_MA_HeLa_Br14_500ng.raw -20191009_QX2_SeVW_MA_HeLa_500ng_CTCDoff_LC05.raw -20191009_QX8_JuSc_MA_HeLa_BR14_500ng.raw -20191010_QX7_ChDe_MA_HeLa_Br14_500ng.raw -20191011_QX0_ChDe_MA_HeLa_500ng_LC12_01.raw -20191011_QX8_JuSc_MA_HeLa_BR14_500ng.raw -20191014_QX0_AsJa_MA_HeLa_500ng_LC12_01.raw -20191014_QX1_JoMu_MA_HeLa_500ng_LC10.raw -20191014_QX6_JuSc_MA_HeLa_500ng_LC09.raw -20191014_QX6_JuSc_MA_HeLa_500ng_LC09_20191014160507.raw -20191014_QX8_JuSc_MA_HeLa_BR14_500ng.raw -20191015_QX3_AlRe_MA_Hela_500ng_LC15.raw -20191015_QX4_JiYu_MA_HeLa_500ng_MAX_ALLOWED.raw -20191015_QX6_JuSc_MA_HeLa_500ng_LC09.raw -20191015_QX7_ChDe_MA_HeLa_Br14_500ng.raw -20191016_QX7_ChDe_MA_HeLa_Br14_500ng.raw -20191017_QX1_teph_MA_HeLa_500ng_LC10.raw -20191017_QX1_teph_MA_HeLa_500ng_LC10_2.raw -20191017_QX2_ADB_MA_HeLa_500ng_CTCDoff_LC05.raw -20191019_QX8_JuSc_MA_HeLa_BR14_500ng.raw -20191020_QX2_ADB_MA_HeLa_500ng_CTCDoff_LC05.raw -20191021_QX3_AlRe_MA_Hela_500ng_LC15.raw -20191021_QX3_AlRe_MA_Hela_500ng_LC15_191023200659.raw -20191021_QX4_JoSw_MA_HeLa_500ng.raw -20191021_QX6_JuSc_MA_HeLa_500ng_LC09.raw -20191021_QX8_AlDa_MA_HeLa_BR14_500ng.raw -20191022_QX0_AsJa_MA_HeLa_500ng_LC12_01.raw 
-20191022_QX4_StEb_MA_HeLa_500ng.raw -20191022_QX7_ChDe_MA_HeLa_Br14_500ng.raw -20191023_QX0_AsJa_MA_HeLa_500ng_LC12_01.raw -20191023_QX2_ADB_MA_HeLa_500ng_CTCDoff_LC05.raw -20191024_QX1_JoMu_MA_HeLa_500ng_LC10_2.raw -20191025_QX0_AsJa_MA_HeLa_500ng_LC12_01.raw -20191025_QX0_AsJa_MA_HeLa_500ng_LC12_03.raw -20191025_QX0_AsJa_MA_HeLa_500ng_LC12_05.raw -20191025_QX6_JuSc_MA_HeLa_500ng_LC09.raw -20191028_QX3_LiSc_MA_Hela_500ng_LC15_1.raw -20191028_QX3_LiSc_MA_Hela_500ng_LC15_1_191029014900.raw -20191028_QX3_LiSc_MA_Hela_500ng_LC15_1_191029100827.raw -20191028_QX3_LiSc_MA_Hela_500ng_LC15_1_191029124318.raw -20191028_QX3_LiSc_MA_Hela_500ng_LC15_2.raw -20191028_QX3_LiSc_MA_Hela_500ng_LC15_2_191029042255.raw -20191028_QX4_StEb_MA_HeLa_500ng.raw -20191028_QX4_StEb_MA_HeLa_500ng_191029155608.raw -20191029_QX8_JuSc_MA_HeLa_BR14_500ng.raw \ No newline at end of file diff --git a/workflows/maxquant/README.md b/workflows/maxquant/README.md deleted file mode 100644 index e8cf9a4ff..000000000 --- a/workflows/maxquant/README.md +++ /dev/null @@ -1,218 +0,0 @@ -# MaxQuant Workflow - -## Setup on CR2 -If you already have a working version of snakemake, consider to skip this: -``` -module load tools -module load anaconda3/2019.10 -conda init bash -bash -conda env create -f vaep/workflows/maxquant/environment.yml -# ~/.conda/envs/snakemake -conda activate snakemake -# untrack changes to config -git update-index --assume-unchanged workflows/maxquant/config.yaml -``` - -## Example `config.yaml` for Workflow -Here the username is `henweb` and the group is Simon Rasmussen's group `cpr_10006`. 
- -``` -DATADIR: /home/projects/cpr_10006/people/henweb/hela/ -SCRIPTDIR: /home/projects/cpr_10006/people/henweb/vaep/workflows/maxquant/ - -#Either your own or (see below) -MAXQUANTEXE: /home/projects/cpr_10006/people/henweb/MaxQuant_1.6.12.0/MaxQuant/bin/MaxQuantCmd.exe -# MAXQUANTEXE: /services/tools/maxquant/1.6.7.0/MaxQuant.exe - -MQ_PAR_TEMP: /home/projects/cpr_10006/people/henweb/vaep/workflows/maxquant/mqpar_template_1.6.xml -THREATS_MQ: 8 - -REMOTE_OUT: io.erda.dk -REMOTE_FOLDER: mq_out - -# Remote name for fetching files and list of all files -REMOTE_IN: hela -FILES: ../hela_files.txt -FILES_EXCLUDED: log_excluded_files.txt -FILES_FAILED: log_failed.txt -``` - -> You have to specify the fasta file paths manually in the parameter template file -> referenced in MQ_PAR_TEMP, e.g. `/home/projects/cpr_10006/people/henweb/fasta/myfasta.fasta` - -If you specify passwords in your config file you might want to restrict permissions to your user - -``` -chmod 600 config.yaml -``` - -### Find MaxQuant exectuable -You can either use a pre-exisiting MaxQuant installation or a new one. -Once you know the path, you do not need to load the module explicitly -into your set of environment variables. -``` -module load mono/5.20.1.19 maxquant/1.6.7.0 -export | grep MAXQUANT # find path to MaxQuant executable -``` - -> It seems that also on minor version updates the parameter file of MaxQuant is -> not preserved. Make sure that your template parameter file is working together -> with your MaxQuant version - -## Test your Workflow - Dry-Run of Snakemake - -Make sure to be in the MaxQuant workflow folder `workflows/maxquant/` and -have a session which you can reconnect to (using e.g. `screen` or `tmux`). - -### Set `SSHPASS` -This workflow uses in the current implementation a password protected sftp -connection. In order to login your local environment in which you run -snakemake has to have a password, `` set. 
- -```bash -export SSHPASS= -``` - -If you don't, snakemake will remind you. -Howwever, snakemake cannot check if the password is correct -before execution, so best verify yourself that it works in the shell you execute. -The `REMOTE` is the same you specified in the `config.yml`: - -```bash -sshpass -e sftp -B 258048 REMOTE <<< "pwd" -``` - -If you set up a SSH connection for your `REMOTE_IN`, you can just set `SSHPASS` to -anything or comment the two line in the `Snakefile`. - -### Dry-RUN - -``` -snakemake -n -snakemake -n --report -``` - -### Run locally - -Either on your computer or in an interactive shell (`iqsub`) - -Running snakemake with many repeated sample which might fail, you can type: -``` -snakemake -k -``` - -### Run on cluster - -#### Using a separate script - -``` -qsub -V run_sm_on_cluster.sh -``` - -The `-V` options passes the current environment variables to the shell started by the -run, see [here](http://docs.adaptivecomputing.com/torque/4-0-2/Content/topics/commands/qsub.htm) - -The script itself contains the cluster execution. Please change the number of parallel jobs -in `run_sm_on_cluster.sh`: - -```bash -snakemake --jobs 6 -k --latency-wait 30 --use-envmodules \ ---cluster "qsub -l walltime={resources.walltime},nodes=1:ppn={threads},mem={resources.mem_mb}mb"\ -" -W group_list=cpr_10006 -A cpr_10006 -m f -V "\ -"-e {params.logdir} -o {params.logdir}" -n -``` - -> Once you are sure, remove the dryrun flag `-n`. Dry runs do not necessarily have to be -> sent to the queue. - -Alternatively invoked a profile defined from a [template](https://github.com/Snakemake-Profiles/pbs-torque). - -Using the profile, the configuration -defined in `config.yaml` and in the `Snakefile` will be used. - -``` -snakemake --profile pbs-torque --jobs 10 --latency-wait 10 -k -``` - -#### Logs - -All files resulting from executions are stored under the `.snakemake`. 
See the last file -in the `.snakemake/log` folder for inspecting the process of the currently executed -snakemake job. - - -## After running snakemake - -> The file names can be changed in the `config.yaml` - -After snakemake execution of the files in `[hela_files.txt](../hela_files.txt) -you should find new files in the workflow folder [maxquant](vaep/workflows/maxquant): - -``` -log_completed.txt -log_excluded_files.txt -log_failed.txt -sftp_commands -``` - -The `log_excluded_files.txt` will be discarded in further workflow runs -(due to being too small) and `log_failed.txt` holds -files which failed although their size is sufficient. The ladder are not automatically -excluded when you re-run snakemake, as the reason for the failure might be on the -server side. - -The `sftp_commands` file is the set of commands for batch-mode execution for -transferring files to erda.dk. Assuming the server was reachable when executing the -job, the files should have been transferred during the run. Otherwise you can re-run -the transfer again: - -If you set up access to your erda folder appropriatly -you should be able to connect to `erda `. I named it `erda io.erda.dk`. -If you can connect using this command, execute the sftp command in batch mode providing -`sftp_commands` as an argument in order to store the files in a `hela` folder on your -erda root directory. - -``` -sftp -B 258048 -b sftp_commands io.erda.dk -``` - -> afterwards one should rename the sftp_commands file or move it to an archive folder. 
- -## Find output files -> Look up only for now -Find MQ output files in `hela` folder and remove them by age: -``` -find ./hela/ -name '*txt*' -type d -print -find ./hela/ -path ./*/combined/txt -type d -ls ./hela/ -ltr # check for old files -find ./hela/ -mtime +2 -#find ./hela/ -mtime +2 -exec rm {} \; -#find ./hela/ -mtime +2 -exec rmdir {} \; -type d -#find ./hela/ -type d -empty -delete -#find ./hela/ -mtime +2 -exec rm -r {} \; -``` - -## Check files on server - -In order to see if a corresponding folder on exists on `erda.dk`, you can get a dump of the -files in the output folder. First get a list of all files in the `mq_out` folder on erda -(the default folder for storing results, but choose what is in `config.yaml`) : - -``` -sftp -q io.erda.dk:mq_out/ <<< "ls" | grep -v '^sftp>' > hela_processed.txt -``` - -> this could be integrated into snakemake _target_ rule. - -The `hela_processed.txt` is then the input of the small script `check_current_files.py`: - -``` -python check_current_files.py -f ../hela_processed.txt -v -``` - -which dumps the missing, not excluded or failed files into `current_files_to_do.txt`. -This comparison only checks it the folder for a file exists on the REMOTE if it should be -completed. `current_files_to_do.txt` can then itself be a new input file or used to remove -some output files. If you are sure set the `forceall` option in snakemake, -e.g. in `run_sm_on_cluster.sh`. 
diff --git a/workflows/maxquant/Snakefile b/workflows/maxquant/Snakefile deleted file mode 100644 index af23feed7..000000000 --- a/workflows/maxquant/Snakefile +++ /dev/null @@ -1,194 +0,0 @@ -""" -Incremental development: - -1) run on local machine in dry-run (iteratively adding functionality) -2) execute using `qsub` (or interactive shell on CR2) -3) add queuing system `qsub` -""" -import os -from pathlib import Path -configfile: 'config.yaml' # access using config['key'] -MIN_FILE_SIZE = 900_000_000 - -folder_logs = Path(config['LOGDIR']) -results_foldername = os.path.basename(config["RESULTSDIR"]) - -# only needed if login to remote is not set up using ssh keys -# move away from config into envvars -# envvars: -# "SSHPASS" - -# wildcard_mqpar = os.path.join(config['SCRIPTDIR'], 'mqpar_xmls', "mqpar_{file}.xml") -# mq_par_temp = os.path.join(config['SCRIPTDIR'], config['MQ_PAR_TEMP']) - -print(f"Use files from: {config['FILES']}") -with open(config['FILES'], encoding='utf-8') as f: - FILES = set(line.strip().split('.raw')[0] for line in f) - -# Order Files (set gives an unordered) -FILES = list(FILES) -FILES.sort() # otherwise execution order is not deterministic due to set -# local, excluded files previously identified as to small -fname_excluded = folder_logs / \ - f"{results_foldername}_excluded_files.txt" -print(f"Exclude files from: {fname_excluded}") -try: - with open(fname_excluded, encoding='utf-8') as f: - FILES_EXCLUDED = set(line.strip().split('.raw')[0] for line in f) -except FileNotFoundError: - print(f"No such file: {fname_excluded} - Creating one.") - with open(fname_excluded, 'w'): - pass - FILES_EXCLUDED = [] -for _file_to_remove in FILES_EXCLUDED: - try: - FILES.remove(_file_to_remove) - except ValueError as e: - print(f"WARNING: File to exclude not in inputs: {_file_to_remove}") - - -# local rules are excuted in the process (job) running snakemake -# allows for ssh-multiplexing, see https://blog.scottlowe.org/2015/12/11/using-ssh-multiplexing/ 
-localrules: target, mqpar, download_file, upload_file - - -# Thinnode resources sharing: 40 cores and 196 GB RAM (minus 2GB for snakemake) -# http://www.dewassoc.com/kbase/hard_drives/binary_v_decimal_measurement.htm -job_ram_mb = int(204_800 / 40 * config['THREATS_MQ']) - -# #Target Rule: -rule target: - input: - # mockfile approach: https://stackoverflow.com/a/53751654/9684872 - # replace? https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#directories-as-outputs - expand(os.path.join(config['RESULTSDIR'], - "{file}", "{file}.txt"), file=FILES) - -# File size could be done on remote before downloading, only security check afterwards?s -rule download_file: - output: - raw = temp(os.path.join(config['DATADIR'], "{file}", "{file}.raw")) - benchmark: - os.path.join(config['RESULTSDIR'], "{file}", "benchmark_download.tsv") - params: - file = "{file}.raw", - logdir = os.path.join(config['RESULTSDIR'], "{file}"), - remote = config['REMOTE_IN'], # SSHPASS migt be used here - datadir = os.path.join(config['DATADIR'], "{file}"), - min_file_size = MIN_FILE_SIZE - resources: - walltime = 300, nodes = 1, mem_mb = 2048 - threads: - 1 - shell: - """ - sftp -B 258048 {params.remote} <<< 'get {params.file} {output.raw}' && ( - FILESIZE=$(stat -c%s {output.raw}) && - [ $FILESIZE -le {params.min_file_size} ] && - echo '{params.file}' >> {fname_excluded} && - echo "{params.file} is too small: $FILESIZE" && - echo "File is too small: $FILESIZE" > {params.logdir}/download_too_small.log && - rm -R {params.datadir} && - exit 1 || exit 0 ) - """ - -# Is it possible to have relative paths for the Output? -# Path('mqpar_xmls') / 'mq_par_{file}.xml' -# -# should it be a temporary file, otherwise THREADS_MQ is fixed? 
-rule mqpar: - benchmark: - Path(config['RESULTSDIR']) / "{file}" / "benchmark_mqpar.tsv" - params: - raw = os.path.join(config['DATADIR'], "{file}", "{file}.raw"), - mq_par_temp = os.path.join(config['SCRIPTDIR'], config['MQ_PAR_TEMP']), - threads_mq = config['THREATS_MQ'], - logdir = os.path.join(config['RESULTSDIR'], "{file}"), - resources: - walltime = 240, nodes = 1, mem_mb = 1024 - threads: - 1 - output: - mq_par = Path(config['SCRIPTDIR']) / 'mqpar_xmls' / "mqpar_{file}.xml" - run: - # snakemake create folders itself if missing -> download job into folder - # - # multi-line with statements natively supported from Python 3.9: - # https://stackoverflow.com/questions/31039022/python-multi-line-with-statement - with open(file=params.mq_par_temp, encoding='utf-8') as infile, open(file=output.mq_par, mode='w', encoding='utf-8') as outfile: - for line in infile: - line = line.replace('PATH', params.raw) - line = line.replace('NUM_THREADS', str(params.threads_mq)) - outfile.write(line) - outfile.close() - infile.close() - - -rule maxquant: - input: - raw = os.path.join(config['DATADIR'], "{file}", "{file}.raw"), - mq_par = os.path.join(config['SCRIPTDIR'], - 'mqpar_xmls', "mqpar_{file}.xml"), - exe = config['MAXQUANTEXE'] - benchmark: - os.path.join(config['RESULTSDIR'], "{file}", "benchmark_MQ.tsv") - output: - out = os.path.join(config['RESULTSDIR'], - "{file}", "{file}_mq_done.txt") - resources: - mem_mb = job_ram_mb, nodes = 1, walltime = '5:00:00' - threads: - config['THREATS_MQ'] - envmodules: - "tools", - "mono/5.20.1.19" - params: - datadir = os.path.join(config['DATADIR'], "{file}"), - logdir = os.path.join(config['RESULTSDIR'], "{file}"), - log_failed = f"{results_foldername}_failed.txt", - log_completed = folder_logs / f"{results_foldername}_completed.txt", - remote = config['REMOTE_OUT'], - min_file_size = MIN_FILE_SIZE - shell: - # 
https://snakemake.readthedocs.io/en/stable/project_info/faq.html#i-don-t-want-snakemake-to-detect-an-error-if-my-shell-command-exits-with-an-exitcode-1-what-can-i-do - # write shell script? - # https: // stackoverflow.com/a/27301889/9684872 - # changing in presence of here-documents (<< EOF EOF below) - # https://snakemake.readthedocs.io/en/stable/project_info/faq.html#is-it-possible-to-pass-variable-values-to-the-workflow-via-the-command-line - # { have to be escaped as {{ - """ - mono --version && - FILESIZE=$(stat -c%s {input.raw}) && - [ $FILESIZE -ge {params.min_file_size} ] && - mono {input.exe} {input.mq_par} && - echo "$(date +"%F %T"): Finished run with MaxQuant version {input.exe}" >> {output.out} && - echo '-mkdir mq_out/{wildcards.file}' >> sftp_commands && - echo '-put {params.datadir}/combined/txt/* mq_out/{wildcards.file}' >> sftp_commands && - cp {input.mq_par} {params.datadir}/combined/txt/mqpar.xml && - echo '{wildcards.file}' >> {params.log_completed} || - ( echo '{wildcards.file}' >> {params.log_failed} && - echo "Size of {input.raw}: $FILESIZE" && - rm -R {params.datadir} && - exit 1 ) - """ - -rule upload_file: - input: - mq_done = os.path.join( - config['RESULTSDIR'], "{file}", "{file}_mq_done.txt") - output: - out = os.path.join(config['RESULTSDIR'], - "{file}", "{file}.txt") - resources: - mem_mb = job_ram_mb, nodes = 1, walltime = 900 - params: - datadir = os.path.join(config['DATADIR'], "{file}"), - remote = config['REMOTE_OUT'] - shell: - """ - sftp -B 258048 {params.remote} <<< '-mkdir mq_out/{wildcards.file}\nput {params.datadir}/combined/txt/* mq_out/{wildcards.file}' && - rm -R {params.datadir} && - echo 'Done uploading to mq_out/{wildcards.file} on {params.remote}' && - echo "$(date +"%F %T"): Finished upload to erda." 
>> {output.out} || - echo 'Failed upload for {wildcards.file}' - """ diff --git a/workflows/maxquant/bin/check_current_files.py b/workflows/maxquant/bin/check_current_files.py deleted file mode 100644 index c7a70dc35..000000000 --- a/workflows/maxquant/bin/check_current_files.py +++ /dev/null @@ -1,64 +0,0 @@ -''' -Create a diff view between files on server and files which should be completed. -The comparison is only based on folders. - -## hints -- reads config.yaml -- compare files on erda.io.dk with FILES -- exclude failed and excluded files -- outputs a file with diffs. -''' -import argparse -import yaml - -__author__ = 'Henry Webel' - -parser = argparse.ArgumentParser( - prog='FileCheckOnErda', - description='Check if current set of files which did not fail are uploaded to erda. ' - 'Does not check for empty folders.') - -parser.add_argument('-f', '--files_on_erda', - help='List of folders on erda.', required=True) -parser.add_argument('-v', '--verbose', help='Prinbt additional information', - action='count') -parser.add_argument('-o', '--outfile', default='current_files_to_do.txt') - -args = parser.parse_args() - -with open('config.yaml') as f: - config = yaml.safe_load(f) - -if args.verbose: - print(f"Load file: {config['FILES']}") - -with open(config['FILES'], encoding='utf-8') as f: - FILES = set(line.strip().split('.raw')[0] for line in f) - -if args.verbose: - print(f"Load file: {config['FILES_EXCLUDED']}") - -with open(config['FILES_EXCLUDED'], encoding='utf-8') as f: - FILES_EXCLUDED = set(line.strip().split('.raw')[0] for line in f) - -if args.verbose: - print(f"Load file: {config['FILES_FAILED']}") - -with open(config['FILES_FAILED'], encoding='utf-8') as f: - FILES_FAILED = set(line.strip().split('.raw')[0] for line in f) - -with open(args.files_on_erda, encoding='utf-8') as f: - # although no `.raw` ending, keep the default loading syntax - FILES_ON_ERDA = set(line.strip().split('.raw')[0] for line in f) - -files_to_do = FILES - FILES_EXCLUDED - 
FILES_FAILED - FILES_ON_ERDA -files_to_do = [f"{file}.raw" for file in files_to_do] - -if args.verbose: - print(f"In total {len(files_to_do)} are not processed:\n\t", - "\n\t".join(file for file in files_to_do)) - -with open(args.outfile, mode='w') as f: - f.writelines('\n'.join(file for file in files_to_do)) - -print(f'Saved difference between server and completed files: {args.outfile}') diff --git a/workflows/maxquant/bin/remove_duplicates.py b/workflows/maxquant/bin/remove_duplicates.py deleted file mode 100644 index 7fc0da2ea..000000000 --- a/workflows/maxquant/bin/remove_duplicates.py +++ /dev/null @@ -1,44 +0,0 @@ -''' -Create a copy of a file with duplicates removed. - -## hints -- exclude failed and excluded files -''' -import argparse -from pathlib import Path - -__author__ = 'Henry Webel' - -parser = argparse.ArgumentParser( - prog='Remove duplicate lines in file', - description='Remove duplicated lines and save a copy.') - -parser.add_argument('-f', '--file', - help='File with duplicates', required=True) -parser.add_argument('-v', '--verbose', help='Prinbt additional information', - action='count') -parser.add_argument('-o', '--outfile', required=False) - - -args = parser.parse_args() - -file = Path(args.file) - -try: - unique = set() - with open(file) as f: - for line in f: - unique.update((line.strip(),)) - unique = list(unique) - unique.sort() -except FileNotFoundError: - raise FileNotFoundError(f"No such file: {file} relative to {file.cwd()}.") - -out_file = args.outfile - -if not out_file: - out_file = file.parent / f"{file.stem}_wo_duplicates{file.suffix}" - -with open(out_file, 'w') as f: - f.writelines([f'{line}\n' for line in unique]) - diff --git a/workflows/maxquant/computerome2.yaml b/workflows/maxquant/computerome2.yaml deleted file mode 100644 index b44b8a0e7..000000000 --- a/workflows/maxquant/computerome2.yaml +++ /dev/null @@ -1,2 +0,0 @@ -cluster: qsub -jobs: 20 diff --git a/workflows/maxquant/config.yaml 
b/workflows/maxquant/config.yaml deleted file mode 100644 index 69e06afa9..000000000 --- a/workflows/maxquant/config.yaml +++ /dev/null @@ -1,5 +0,0 @@ -DATADIR: path/to/specify -SCRIPTDIR: path/to/specify -MAXQUANTEXE: path/to/specify -MQ_PAR_TEMP: path/to/specify -THREATS_MQ: 8 \ No newline at end of file diff --git a/workflows/maxquant/environment.yml b/workflows/maxquant/environment.yml deleted file mode 100644 index b6db284ea..000000000 --- a/workflows/maxquant/environment.yml +++ /dev/null @@ -1,11 +0,0 @@ -name: snakemake -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - snakemake=5.19.3 - - pygraphviz - #- cookiecutter -# #set individual environment location -# prefix: /home/.conda/envs/snakemake diff --git a/workflows/maxquant/mqpar_template_1.5.xml b/workflows/maxquant/mqpar_template_1.5.xml deleted file mode 100644 index 71e30cfd2..000000000 --- a/workflows/maxquant/mqpar_template_1.5.xml +++ /dev/null @@ -1,234 +0,0 @@ - - - Session1 - 1.5.6.8 - - 4 - false - - false - false - -1.7976931348623157E+308 - 1.7976931348623157E+308 - false - false - false - - PATH - - - FILE - - - 32767 - - - 0 - - - - 7 - 2 - false - false - true - 1 - NaN - NaN - MatchFromAndTo - 8 - true - 35 - true - 1.4 - 1.2 - false - 0 - 0 - 0 - - - - false - false - 3 - 6 - 100000 - 0 - 0 - false - false - true - false - 2 - true - 0 - 5 - 2 - 1 - - Trypsin/P - - - false - false - - Oxidation (M) - Acetyl (Protein N-term) - - false - - - - - false - - - true - 20 - 4.5 - true - 2 - true - 0.6 - 0.6 - true - false - 70 - false - - 0 - 0 - 0 - NaN - NaN - false - NaN - NaN - 0 - 0 - 0 - 0 - true - false - false - false - 0 - 0 - 0 - 0 - false - - PeptidesWithCleavedLinker - Standard - 0 - 0 - 0 - 0 - - - - Carbamidomethyl (C) - - - /home/projects/cpr_10006/people/henweb/fasta/UP000005640_9606.fasta - /home/projects/cpr_10006/people/henweb/fasta/UP000005640_9606_additional.fasta - - - - - 350000 - true - 0.005 - false - false - false - false - true - true - revert 
- all - true - 100 - 4600 - true - true - true - 0 - 6 - 0 - 40 - true - false - false - false - false - 0 - 0 - false - false - false - false - false - false - Species - false - 3 - false - true - false - true - false - 7 - 0.01 - 0.01 - 0.01 - 8 - 25 - true - 1 - 1 - 0 - false - true - false - - 2 - true - - Oxidation (M) - Acetyl (Protein N-term) - - 0 - 0 - 25 - 15 - - 200 - - - - 20 - 7 - 10 - - - 0.5 - 0.15 - 0.25 - - - 40 - 0.01 - 0.02 - - - 0.5 - 0.15 - 0.25 - - - 0 - 1 - none - diff --git a/workflows/maxquant/mqpar_template_1.6.xml b/workflows/maxquant/mqpar_template_1.6.xml deleted file mode 100644 index 930d718a9..000000000 --- a/workflows/maxquant/mqpar_template_1.6.xml +++ /dev/null @@ -1,473 +0,0 @@ - - - - - /home/projects/cpr_10006/people/henweb/fasta/UP000005640_9606.fasta - >.*\|(.*)\| - >.*\|(.*)\| - - - - 9606 - - - /home/projects/cpr_10006/people/henweb/fasta/UP000005640_9606_additional.fasta - >.*\|(.*)\| - >.*\|(.*)\| - - - - 9606 - - - - - - - - 350000 - True - 0.005 - False - False - True - True - revert - all - True - 4600 - True - True - True - 0 - 6 - 0 - 40 - True - False - False - False - False - 0 - 0 - False - False - False - False - 0 - False - False - False - False - False - False - Species - False - 3 - False - True - False - True - False - False - - - - 7 - 0.01 - 0.01 - 0.01 - 0.01 - 8 - 25 - True - 1 - 1 - 0 - False - True - False - - 2 - True - - Oxidation (M) - Acetyl (Protein N-term) - - 0 - 0 - 0 - 0 - 15 - 0 - 1 - - - 200 - False - True - True - True - True - True - True - False - False - False - True - False - 0 - 20 - - none - False - session1 - 1.6.12.0 - - - NUM_THREADS - 1 - - - - - -1.79589544172745E+308 - 1.79589544172745E+308 - False - False - False - False - False - False - False - - PATH - - - - - - 32767 - - - False - - - 0 - - - - - False - False - - - 0 - 7 - 2 - 1 - False - False - True - 1 - NaN - NaN - MatchFromAndTo - 0 - 8 - True - 35 - True - 1.4 - 1.2 - False - 0 - - - - Standard - False - 0 - 3 - 6 - 
100000 - 0 - 0 - False - False - True - False - 2 - 0 - 5 - 2 - 1 - 0 - 0 - 0 - 0 - - Carbamidomethyl (C) - - - Trypsin/P - - - - 0 - False - False - - Oxidation (M) - Acetyl (Protein N-term) - - False - - - - - - - - - False - - - - - True - 20 - 4.5 - True - 2 - True - 0.6 - 0.6 - True - True - False - 70 - False - - 0 - 0 - 0 - NaN - NaN - False - NaN - NaN - 0 - 0 - 0 - 0 - True - False - True - False - - 0 - 6 - False - 0 - 0 - 0 - 0 - - - - - - - - - - False - True - 0.75 - 0 - - - - - - - - 20 - 20 - 0.85 - 2 - 2 - 7 - 1.99 - 0.4 - 0.65 - 0 - 6 - 1 - 3 - 0 - 0.8 - 0 - 1 - 0.5 - 0 - 3 - 25 - 260 - True - - - - - FTMS - 20 - True - 7 - True - 10 - True - True - 12 - 100 - True - True - True - True - False - - - ITMS - 0.5 - False - 0.15 - False - 0.25 - False - False - 8 - 100 - True - True - True - True - False - - - TOF - 40 - True - 0.01 - False - 0.02 - False - True - 10 - 100 - True - True - True - True - False - - - Unknown - 20 - True - 7 - True - 10 - True - True - 12 - 100 - True - True - True - True - False - - - - - CID - False - 1 - 1 - 1 - False - 1 - KRH - - - HCD - False - 1 - 1 - 1 - False - 1 - KRH - - - ETD - False - 1 - 1 - 1 - False - 1 - KRH - - - PQD - False - 1 - 1 - 1 - False - 1 - KRH - - - ETHCD - False - 1 - 1 - 1 - False - 1 - KRH - - - ETCID - False - 1 - 1 - 1 - False - 1 - KRH - - - UVPD - False - 1 - 1 - 1 - False - 1 - KRH - - - Unknown - False - 1 - 1 - 1 - False - 1 - KRH - - - diff --git a/workflows/maxquant/template_scripts/README.md b/workflows/maxquant/template_scripts/README.md deleted file mode 100644 index 904dc0637..000000000 --- a/workflows/maxquant/template_scripts/README.md +++ /dev/null @@ -1,12 +0,0 @@ -## Python Template -> Provided by Annelaura Bach - - -In the `template-scripts` folder is the `mq_job_template.sh` and -the `run_mq.py`: - -```bash -mq_job_template.sh # sumbitted to the queue -run_mq.py # script executing MaxQuant -``` - diff --git a/workflows/maxquant/template_scripts/mq_job_template.sh 
b/workflows/maxquant/template_scripts/mq_job_template.sh deleted file mode 100644 index 1b305c121..000000000 --- a/workflows/maxquant/template_scripts/mq_job_template.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/sh -### Note: No commands may be executed until after the #PBS lines -### Account information -#PBS -W group_list=cpr_10006 -A cpr_10006 -### Job name (comment out the next line to get the name of the script used as the job name) -#PBS -N -### Output files (comment out the next 2 lines to get the job name used instead) -#PBS -e ${PBS_JOBNAME}.e${PBS_JOBID} -#PBS -o ${PBS_JOBNAME}.o${PBS_JOBID} -### Email notification: a=aborts, b=begins, e=ends, n=no notifications -#PBS -m an -M henry.webel@cpr.ku.dk -### Number of nodes -#PBS -l nodes=1:ppn=20,mem=40gb -### Requesting timeformat is ::: -#PBS -l walltime=12:00:00 - -# Go to the directory from where the job was submitted (initial directory is $HOME) -module load tools -module load mono/6.8.0.105 -module load maxquant/1.6.7.0 - - -mono --version -#mono /home/projects/cpr_man/people/s155016/denoms/MaxQuant/bin/MaxQuantCmd.exe /home/projects/cpr_man/people/s155016/denoms/script/mqparfiles/mqpar_NAME.xml - -rm /home/projects/cpr_man/people/s155016/denoms/data/NAME/RAWFILE -echo "rawfile deleted" - -mv /home/projects/cpr_man/people/s155016/denoms/data/NAME/combined/txt/proteinGroups.txt /home/projects/cpr_man/people/s155016/denoms/result/proteinGroups/NAME_proteinGroups.txt -mv /home/projects/cpr_man/people/s155016/denoms/data/NAME/combined/txt/peptides.txt /home/projects/cpr_man/people/s155016/denoms/result/peptides/NAME_peptides.txt -echo "result files moved" - -echo "Done" diff --git a/workflows/maxquant/template_scripts/run_mq.py b/workflows/maxquant/template_scripts/run_mq.py deleted file mode 100644 index 246fa2e64..000000000 --- a/workflows/maxquant/template_scripts/run_mq.py +++ /dev/null @@ -1,55 +0,0 @@ -import pandas as pd -import os -from shutil import copyfile -from subprocess import call - -### prepare 
data frame with 3 columns: -# 'path' is the full path to the .raw file -# 'file' is the .raw file name e.g. 20191210_*.raw -# 'name' is the .raw file name without '.raw' e.g. 20191210_* - -# from os.listdir() - -DATADIR = '/home/projects/cpr_man/people/s155016/denoms/data/' -SCRIPTDIR = '/home/projects/cpr_man/people/s155016/denoms/script/' - -MQ_JOB_TEMPLATE = os.path.join(SCRIPTDIR, 'mq_job_template') - -MQ_PARAMETERS = 'mqpar_template.xml' -MQ_PARAMETERS = os.path.join(SCRIPTDIR, MQ_PARAMETERS) - -# df = pd.read_csv('my_df.tsv', sep ='\t', header = True) - -# loop over all runs -for i in range(df.shape[0]): - path, file, name = df.iloc[i] #replace by named parameters? - # create new directory - os.mkdir(DATADIR+name) - # copy file #ToDo: Why copy files? - copyfile(path, DATADIR + name + '/' + file) - print('rawfile copied') - # create mqpar with the correct path and experiment - mq_parameters_out_file = os.path.join(SCRIPTDIR, 'mqparfiles/mqpar_' + name + '.xml' ) - with open(MQ_PARAMETERS) as infile, open(mq_parameters_out_file, 'w') as outfile: - for line in infile: - line = line.replace('PATH', os.path.join(DATADIR, name, file)) - line = line.replace('FILE', file) - outfile.write(line) - outfile.close() - infile.close() - print('mqpar created') - # create mq_job file with the correct paths etc. 
- mq_job_file = os.path.join(SCRIPTDIR, 'mqjobs/mq_job_' + name) - with open(MQ_JOB_TEMPLATE) as infile, open(mq_job_file, 'w') as outfile: - for line in infile: - line = line.replace('NAME', name) - line = line.replace('RAWFILE', file) - outfile.write(line) - outfile.close() - infile.close() - print('mqjob created') - # run mqjob - os.chdir(os.path.join(DATADIR, name)) - queue_command = 'qsub '+ mq_job_file - return_code = call(queue_command, shell=True) - print('job {} queued out of 528'.format(i+1)) diff --git a/workflows/metadata/.gitignore b/workflows/metadata/.gitignore deleted file mode 100644 index dfb926ad7..000000000 --- a/workflows/metadata/.gitignore +++ /dev/null @@ -1,7 +0,0 @@ -tmp -jsons -txts -metadata -config -!config/ald_study -rawfile_metadata*.json \ No newline at end of file diff --git a/workflows/metadata/README.md b/workflows/metadata/README.md deleted file mode 100644 index 7f12790de..000000000 --- a/workflows/metadata/README.md +++ /dev/null @@ -1,70 +0,0 @@ -# Metadata workflow - -Get metadata from ThermoFischer proteomics raw files using -[`ThermoRawFileParser`](https://github.com/compomics/ThermoRawFileParser) - -## Output - -- both json and txt data format into `jsons` and `txts` folder -- create combined `rawfile_metadata.json` (needs to be deleted if files are added) - -## Configfile - -add a `config/files.yaml` in [config](config): - -```yaml -remote_in: erda:folder/path -out_folder: metadata -thermo_raw_file_parser_exe: mono path/to/ThermoRawFileParser/ThermoRawFileParser.exe -files: -- remote/path/file1.raw -- remote/path/file2.raw -``` - -The list of files is fetched from [`project/00_2_hela_all_raw_files.ipynb`](../../project/00_2_hela_all_raw_files.ipynb) notebook. 
- - -Then invoke the workflow with the list of config files - -```bash -# dry-run -snakemake --configfiles config/ald_study/config.yaml config/ald_study/excluded.yaml -p -n -``` - - -### Excluded files - -Some files might be corrupted and not be processed by `ThermoRawFileParser`. These can be -excluded based on the `tmp` folder - -```bash -# check files -echo 'excluded:' > config/excluded_$(date +"%Y%m%d").yaml -find tmp -name '*.raw*' | awk 'sub(/^.{4}/," ? ")' >> config/excluded_$(date +"%Y%m%d").yaml - -# potentially add these to the workflow exclusion files: -find tmp -name '*.raw*' | awk 'sub(/^.{4}/," ? ")' >> config/excluded.yaml -# rm -r tmp/* # remove excluded files -``` - -these files are ignored in the workflow (configured as a python set). - -## Setup - -- download and unzip [`ThermoRawFileParser`](https://github.com/compomics/ThermoRawFileParser) -- add path to `exe` to config - -```bash -# sudo apt-get update -sudo apt install mono-complete -conda activate vaep # actually only snakemake needed -snakemake -n # see job listing -``` - -## zip outputs - - -```bash -# could be part of snakemake process -zip -r metadata.zip txt jsons -``` \ No newline at end of file diff --git a/workflows/metadata/Snakefile b/workflows/metadata/Snakefile deleted file mode 100644 index 45a7bcda3..000000000 --- a/workflows/metadata/Snakefile +++ /dev/null @@ -1,85 +0,0 @@ -from pathlib import Path, PurePosixPath -configfile: "config/other.yaml" -configfile: "config/excluded.yaml" - -files = [PurePosixPath(p) for p in config['files'] if p not in config['excluded']] -files = [p.parent/ p.stem for p in files] - -CSV_OUT = config['out_csv'] - -rule all: - input: - # config['out_folder'] + "/rawfile_metadata.json" - CSV_OUT - -rule build_csv: - input: - data = config['out_folder'] + "/rawfile_metadata.json", - nb = 'read_metadata.ipynb' - output: - csv = CSV_OUT, - nb = config['out_folder'] + '/read_metadata_executed.ipynb' - shell: - "papermill {input.nb} {output.nb}" - " -p 
rawfile_metadata_in {input.data}" - " -p rawfile_metadata_out {output.csv}" - - -rule combine_json: - input: - expand( - config['out_folder'] + "/jsons/{file}-metadata.json", - file=files - ) - output: - out = config['out_folder'] + "/rawfile_metadata.json" - threads: - 1 - resources: - load=1 # multiplexing - run: - import json - import yaml - from pathlib import Path - metadata = {} - for fname in input: - key = Path(fname).name.split('-metadata.json')[0] - with open(fname) as f: - metadata[key] = json.load(f) - # metadata[key] = yaml.safe_load(f) # yaml can also read json (but add. dependency) - with open(output.out, 'w') as f: - json.dump(metadata, f) - - -rule parse_rawfile_metadata: - input: - raw = config['folder_raw'] + '{file}.raw' - output: - json = config['out_folder'] + "/jsons/{file}-metadata.json", - txt = config['out_folder'] + "/txts/{file}-metadata.txt" - params: - thermo_exe = config['thermo_raw_file_parser_exe'] - threads: - 1 - resources: - load=1 - shell: - # multiline does not work on windows workstation. 
("""....""") - "{params.thermo_exe} -i {input.raw:q} -m 0 -b {output.json:q}" - " & {params.thermo_exe} -i {input.raw:q} -m 1 -b {output.txt:q}" - - -rule download: - output: - raw_local = temp(config['folder_raw'] + '/{file}.raw') - params: - remote = config['remote_in'], - raw_remote = '{file}.raw' - threads: - 1 - resources: - load=10 # erda: multiplexing allows 10 max concurrent sessions - shell: - """ - sftp -B 258048 {params.remote} <<< 'get "{params.raw_remote}" "{output.raw_local}"' - """ \ No newline at end of file diff --git a/workflows/metadata/config/README.md b/workflows/metadata/config/README.md deleted file mode 100644 index 6827dccd0..000000000 --- a/workflows/metadata/config/README.md +++ /dev/null @@ -1,12 +0,0 @@ -# Configfiles - -add a `config/example.yaml` - -```yaml -remote_in: erda:folder/path -out_folder: metadata -thermo_raw_file_parser_exe: mono path/to/ThermoRawFileParser/ThermoRawFileParser.exe -files: -- remote/path/file1.raw -- remote/path/file2.raw -``` \ No newline at end of file diff --git a/workflows/metadata/config/ald_study/config.yaml b/workflows/metadata/config/ald_study/config.yaml deleted file mode 100644 index 24a0bfe18..000000000 --- a/workflows/metadata/config/ald_study/config.yaml +++ /dev/null @@ -1,676 +0,0 @@ -remote_in: . 
-folder_raw: "E:\\data\\Lili\\ALD upgrade\\" -out_folder: metadata/ald -out_csv: ../../project/data/single_datasets/ald_metadata_rawfiles.csv -thermo_raw_file_parser_exe: D:\Users\clilniu\Documents\secure_cloud_tools\ThermoRawFileParser\ThermoRawFileParser.exe -files: - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_A1_20210920175431.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_A2.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_A3.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_A4.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_A5.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_A6.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_A7.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_A8.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_A9.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_A10.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_A11.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_A12.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_B1.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_B2.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_B3.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_B4.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_B5.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_B6.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_B7.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_B8.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_B9.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_B10.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_B11.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_B12.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_C1.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_C2.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_C3.raw - - 
20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_C4.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_C5.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_C6.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_C7.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_C8.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_C9.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_C10.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_C11.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_C12.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_D2.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_D4.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_D5.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_D6.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_D7.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_D8.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_D1.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_D9.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_D10.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_D11.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_D12.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_E3.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_E4.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_E5.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_E6.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_E7.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_E2.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_E8.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_E9.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_E10.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_E11.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_E12.raw - - 
20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_F1.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_F2.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_F3.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_F4.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_F5.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_F6.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_F7.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_F8.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_F9.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_F10.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_F11.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_F12.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_G1.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_G2.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_G3.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_G4.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_G5.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_G6.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_G7.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_G8.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_G9.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_G10.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_G11.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_G12.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_H1.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_H2.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_H3.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_H4.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_H5.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_H6.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_H7.raw - - 
20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_H8.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_H9.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_H10.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_H11.raw - - 20210919_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate1_H12.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_A1.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_A2.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_A3.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_A4.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_A5.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_A6.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_A7.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_A8.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_A9.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_A10.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_A11.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_A12.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_B1.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_B2.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_B3.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_B4.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_B5.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_B6.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_B7.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_B8.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_B9.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_B10.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_B11.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_B12.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_C2.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_C3.raw - - 
20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_C4.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_C5.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_C6.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_C7.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_C8.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_C9.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_C10.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_C11.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_C12.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_D1.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_D2.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_D3.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_D4.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_D5.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_D6.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_D7.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_D8.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_D9.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_D10.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_D11.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_D12.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_E1.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_E2.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_E3.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_E4.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_E5.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_E6.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_E7.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_E8.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_E9.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_E10.raw - - 
20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_E11.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_E12.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_F1.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_F2.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_F3.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_F4.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_F5.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_F6.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_F7.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_F8.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_F9.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_F10.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_F11.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_F12.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_G1.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_G2.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_G3.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_G4.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_G5.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_G6.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_G7.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_G8.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_G9.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_G10.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_G11.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_G12.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_H1.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_H2.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_H3.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_H4.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_H5.raw - - 
20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_H6.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_H7.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_H8.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_H9.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_H10.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_H11.raw - - 20210921_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_H12.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_A1.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_A2.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_A3.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_A4.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_A5.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_A6.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_A7.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_A8.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_A9.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_A10.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_A11.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_A12.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_B1.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_B2.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_B3.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_B4.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_B5.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_B6.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_B7.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_B8.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_B9.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_B10.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_B11.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_B12.raw - - 
20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_C1.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_C2.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_C3.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_C4.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_C5.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_C6.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_C7.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_C8.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_C9.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_C10.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_C11.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_C12.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_D1.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_D2.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_D3.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_D4.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_D5.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_D6.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_D7.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_D8.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_D9.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_D10.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_D11.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_D12.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_E1.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_E2.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_E3.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_E4.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_E5.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_E6.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_E7.raw - - 
20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_E8.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_E9.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_E10.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_E11.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_E12.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_F1.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_F2.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_F3.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_F4.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_F5.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_F6.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_F7.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_F8.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_F9.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_F10.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_F11.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_F12.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_G1.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_G2.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_G3.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_G4.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_G5.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_G6.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_G7.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_G8.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_G9.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_G10.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_G11.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_G12.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_H1.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_H3.raw - - 
20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_H4.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_H5.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_H6.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_H7.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_H8.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_H9.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_H10.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_H11.raw - - 20210923_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate3_H12.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_A1.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_A2.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_A3.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_A4.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_A5.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_A6.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_A7.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_A8.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_A9.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_A10.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_A11.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_A12.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_B1.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_B2.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_B3.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_B4.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_B5.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_B6.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_B7.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_B8.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_B9.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_B10.raw - - 
20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_B11.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_B12.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_C1.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_C2.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_C3.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_C4.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_C5.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_C6.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_C7.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_C8.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_C9.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_C10.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_C11.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_C12.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_D1.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_D2.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_D3.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_D4.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_D5.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_D6.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_D7.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_D8.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_D9.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_D10.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_D11.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_D12.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_E1.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_E2.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_E4.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_E5.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_E6.raw - - 
20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_E7.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_E8.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_E9.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_E10.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_E11.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_E12.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_F1.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_F2.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_F3.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_F4.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_F5.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_F6.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_F7.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_F8.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_F9.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_F10.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_F11.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_F12.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_G1.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_G2.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_G3.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_G4.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_G5.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_G6.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_G7.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_G9.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_G10.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_G11.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_G12.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_H1.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_H2.raw - - 
20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_H3.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_H4.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_H5.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_H6.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_H7.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_H8.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_H9.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_H10.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_H11.raw - - 20210925_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_H12.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_A1.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_A2.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_A3.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_A4.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_A5.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_A6.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_A7.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_A8.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_A9.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_A10.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_A11.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_A12.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_B1.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_B2.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_B3.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_B4.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_B5.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_B6.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_B8.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_B9.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_B10.raw - - 
20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_B11.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_B12.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_C2.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_C3.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_C4.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_C5.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_C6.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_C7.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_C8.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_C9.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_C10.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_C11.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_C12.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_D1.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_D2.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_D3.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_D4.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_D7.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_D8.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_D9.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_D10.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_D11.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_D12.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_E1.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_E2.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_E3.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_E4.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_E5.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_E6.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_E7.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_E8.raw - - 
20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_E9.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_E10.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_E11.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_E12.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_F1.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_F2.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_F3.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_F5.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_F6.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_F7.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_F8.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_F9.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_F10.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_F11.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_F12.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_G2.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_G3.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_G4.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_G5.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_G6.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_G7.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_G8.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_G9.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_G10.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_G11.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_G12.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_H1.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_H2.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_H3.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_H4.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_H5.raw - - 
20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_H6.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_H7.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_H8.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_H9.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_H10.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_H11.raw - - 20210927_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_H12.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_A1.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_A2.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_A3.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_A4.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_A5.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_A6.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_A7.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_A8.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_A9.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_A10.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_A11.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_A12.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_B1.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_B2.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_B3.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_B4.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_B5.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_B6.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_B7.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_B8.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_B9.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_B10.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_B11.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_B12.raw - - 
20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_C1.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_C2.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_C3.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_C4.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_C5.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_C6.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_C7.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_C8.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_C9.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_C10.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_C11.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_C12.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_D1.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_D2.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_D3.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_D4.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_D5.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_D6.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_D7.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_D8.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_D9.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_D10.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_D11.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_D12.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_E1.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_E2.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_E3.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_E4.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_E5.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_E6.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_E7.raw - - 
20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_E8.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_E9.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_E10.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_E11.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_E12.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_F1.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_F2.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_F3.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_F4.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_F5.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_F6.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_F7.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_F8.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_F9.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_F10.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_F11.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_F12.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_G1.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_G2.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_G3.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_G4.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_G5.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_G6.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_G7.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_G8.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_G9.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_G10.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_G11.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_G12.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_H1.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_H2.raw - - 
20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_H3.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_H4.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_H5.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_H6.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_H7.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_H8.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_H9.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_H10.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_H11.raw - - 20210929_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate6_H12.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_A1.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_A2.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_A3.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_A4.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_A5.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_A6.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_A7.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_A8.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_A9.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_A10.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_A11.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_A12.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_B1.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_B2.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_B3.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_B4.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_B5.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_B7.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_B8.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_B9.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_B10.raw - - 
20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_B11.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_B12.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_C1.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_C2.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_C3.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_C4.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_C5.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_C6.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_C7.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_C8.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_C9.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_C10.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_C11.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_C12.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_D1.raw - - 20211001_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate7_D2.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_A1.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_A2.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_A3.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_A4.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_A5.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_A6.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_A7.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_A8.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_A9.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_A10.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_A11.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_A12.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_B1.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_B2.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_B3.raw - - 
20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_B4.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_B5.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_B6.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_B7.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_B8.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_B9.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_B10.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_B11.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_B12.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_C1.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_C2.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_C3.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_C4.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_C5.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_C6.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_C7.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_C8.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_C9.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_C10.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_C11.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_C12.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_D1.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_D2.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_D3.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_D4.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_D5.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_D6.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_D7.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_D8.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_D9.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_D10.raw - - 
20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_D11.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_D12.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_E1.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_E2.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_E3.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_E4.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_E5.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_E7.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_E8.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_E9.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_E10.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_E11.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_E12.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_F1.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_F2.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_F3.raw - - 20211003_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_H12.raw - - 20211014_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_C1.raw - - 20211014_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate4_G8.raw - - 20211014_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_D5.raw - - 20211014_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_D6.raw - - 20211014_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate5_G1.raw - - 20211014_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate8_E6.raw diff --git a/workflows/metadata/config/ald_study/excluded.yaml b/workflows/metadata/config/ald_study/excluded.yaml deleted file mode 100644 index ed7b46aa9..000000000 --- a/workflows/metadata/config/ald_study/excluded.yaml +++ /dev/null @@ -1,3 +0,0 @@ -excluded: -# ERROR Unable to access the RAW file using the native Thermo library. - ? 
20211014_EXPL3_PRIEvosep2_LiNi_SA_Plasma_ALD_Plate2_C1.raw \ No newline at end of file diff --git a/workflows/metadata/config/example.yaml b/workflows/metadata/config/example.yaml deleted file mode 100644 index 0bb9713d3..000000000 --- a/workflows/metadata/config/example.yaml +++ /dev/null @@ -1,8 +0,0 @@ -# example for all config entires -# can be separated into several configfiles for snakemake -remote_in: erda:folder/path -out_folder: metadata -thermo_raw_file_parser_exe: mono path/to/ThermoRawFileParser/ThermoRawFileParser.exe -files: -- remote/path/file1.raw -- remote/path/file2.raw \ No newline at end of file diff --git a/workflows/metadata/config/excluded.yaml b/workflows/metadata/config/excluded.yaml deleted file mode 100644 index 43cf24f5d..000000000 --- a/workflows/metadata/config/excluded.yaml +++ /dev/null @@ -1,2 +0,0 @@ -excluded: - ? \ No newline at end of file diff --git a/workflows/metadata/read_metadata.ipynb b/workflows/metadata/read_metadata.ipynb deleted file mode 100644 index 1021c3fa4..000000000 --- a/workflows/metadata/read_metadata.ipynb +++ /dev/null @@ -1,314 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Machine metadata in pandas DataFrame" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import pathlib\n", - "import pprint\n", - "\n", - "import pandas as pd\n", - "pd.options.display.max_columns = 32\n", - "\n", - "import vaep.utils\n", - "import vaep.pandas" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "configs:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "DATA_PROJECT = pathlib.Path('../../project/data')\n", - "\n", - "rawfile_metadata_in = 'rawfile_metadata.json'\n", - "rawfile_metadata_out = DATA_PROJECT / 'rawfile_metadata.csv'\n", - "rawfile_metadata_in" - ] - }, - { - 
"cell_type": "markdown", - "metadata": {}, - "source": [ - "read raw file created by snakemake workflow" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with open(rawfile_metadata_in) as f:\n", - " data = json.load(f)\n", - "\n", - "key_sampled = vaep.utils.sample_iterable(data, 1)[0]\n", - "sample = data[key_sampled]\n", - "pprint.pprint(sample)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sample.keys()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- essentially the all data is a dictionary with for keys: `['accession', 'cvLabel', 'name', 'value']`\n", - "- pick for each type of entry in `['FileProperties', 'InstrumentProperties', 'MsData', 'ScanSettings', 'SampleData']` \n", - " - the `name` and `value` for a single in a list of entries" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "parsed = {}\n", - "for sample_id, meta_json in data.items():\n", - " selected = {}\n", - " for k, entries in meta_json.items():\n", - " for entry in entries:\n", - " selected[k, entry['name']] = entry['value']\n", - " parsed[sample_id] = selected\n", - "pprint.pprint(parsed[sample_id])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# simple strings (not as shown by pprint)\n", - "(parsed[sample_id][('InstrumentProperties', 'instrument serial number')],\n", - " parsed[sample_id][('InstrumentProperties', 'Thermo Scientific instrument model')])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Explicitly specifying the default dtypese here once. These are set when the data is read from the json file created in this script `rawfile_metadata_out`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# # df.dtypes.to_dict() needed parsing\n", - "# from numpy import dtype\n", - "# types = {\n", - "# ('FileProperties', 'Pathname'): dtype('O'),\n", - "# ('FileProperties', 'Version'): dtype('int64'),\n", - "# ('FileProperties', 'Content Creation Date'): dtype('O'),\n", - "# ('InstrumentProperties', 'Thermo Scientific instrument model'): dtype('O'),\n", - "# ('InstrumentProperties', 'instrument attribute'): dtype('O'),\n", - "# ('InstrumentProperties', 'instrument serial number'): dtype('O'),\n", - "# ('InstrumentProperties', 'Software Version'): dtype('O'),\n", - "# ('InstrumentProperties', 'firmware version'): dtype('O'),\n", - "# ('MsData', 'Number of MS1 spectra'): dtype('int64'),\n", - "# ('MsData', 'Number of MS2 spectra'): dtype('float64'),\n", - "# ('MsData', 'MS min charge'): dtype('int64'),\n", - "# ('MsData', 'MS max charge'): dtype('int64'),\n", - "# ('MsData', 'MS min RT'): dtype('float64'),\n", - "# ('MsData', 'MS max RT'): dtype('float64'),\n", - "# ('MsData', 'MS min MZ'): dtype('float64'),\n", - "# ('MsData', 'MS max MZ'): dtype('float64'),\n", - "# ('ScanSettings', 'scan start time'): dtype('float64'),\n", - "# ('ScanSettings', 'mass resolution'): dtype('float64'),\n", - "# ('ScanSettings', 'mass unit'): dtype('O'),\n", - "# ('ScanSettings', 'Number of scans'): dtype('int64'),\n", - "# ('ScanSettings', 'MS scan range'): dtype('O'),\n", - "# ('ScanSettings', 'Retention time range'): dtype('O'),\n", - "# ('ScanSettings', 'Mz range'): dtype('O'),\n", - "# ('ScanSettings', 'beam-type collision-induced dissociation'): dtype('O'),\n", - "# ('SampleData', 'sample number'): dtype('O'),\n", - "# ('SampleData', 'Type'): dtype('O'),\n", - "# ('SampleData', 'Vial'): dtype('O'),\n", - "# ('SampleData', 'injection volume setting'): dtype('float64'),\n", - "# ('SampleData', 'Row'): dtype('float64'),\n", - "# ('SampleData', 'dilution factor'): 
dtype('int64'), # fails with NA\n", - "# ('SampleData', 'sample name'): dtype('O'),\n", - "# ('SampleData', 'Comment'): dtype('O'),\n", - "# }" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.DataFrame.from_dict(parsed, orient='index')\n", - "df.columns.names = ['category', 'item']\n", - "df.index.name = 'Sample ID'\n", - "\n", - "# df = df.astype(types)\n", - "\n", - "# write and read to check that this works and convert dtypes directly\n", - "# jso n format cannot preserve multiindex columns\n", - "df.to_csv(rawfile_metadata_out)\n", - "df = pd.read_csv(rawfile_metadata_out, header=[0,1], index_col=0) # read data elsewhere, set dtypes automatically, multiindex headers\n", - "df.describe(include='all')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.dtypes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "columns_selected = [\n", - " # ('FileProperties', 'Pathname'),\n", - " # ('FileProperties', 'Version'),\n", - " # ('FileProperties', 'Content Creation Date'),\n", - " ('InstrumentProperties', 'Thermo Scientific instrument model'),\n", - " ('InstrumentProperties', 'instrument attribute'),\n", - " ('InstrumentProperties', 'instrument serial number'),\n", - " # ('InstrumentProperties', 'Software Version'),\n", - " # ('InstrumentProperties', 'firmware version'),\n", - " # ('MsData', 'Number of MS1 spectra'),\n", - " # ('MsData', 'Number of MS2 spectra')\n", - " ('MsData', 'MS min charge'),\n", - " ('MsData', 'MS max charge'),\n", - " ('MsData', 'MS min RT'),\n", - " ('MsData', 'MS max RT'),\n", - " ('MsData', 'MS min MZ'),\n", - " ('MsData', 'MS max MZ'),\n", - " # ('ScanSettings', 'scan start time'),\n", - " ('ScanSettings', 'mass 
resolution'),\n", - " # ('ScanSettings', 'mass unit'),\n", - " # ('ScanSettings', 'Number of scans'),\n", - " # ('ScanSettings', 'MS scan range'),\n", - " ('ScanSettings', 'Retention time range'),\n", - " ('ScanSettings', 'Mz range'),\n", - " ('ScanSettings', 'beam-type collision-induced dissociation'),\n", - " # ('SampleData', 'sample number'),\n", - " # ('SampleData', 'Type'),\n", - " ('SampleData', 'Vial'),\n", - " # ('SampleData', 'injection volume setting'),\n", - " # ('SampleData', 'Row'),\n", - " ('SampleData', 'dilution factor'),\n", - " # ('SampleData', 'sample name'),\n", - " # ('SampleData', 'Comment')\n", - "]\n", - "\n", - "df[columns_selected]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[columns_selected].describe(include='all')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Test that dtypes of reloaded data are the same. Documents how to load metadata" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# import numpy.testing as npt\n", - "\n", - "# desired = df.dtypes.to_dict()\n", - "# # read data elsewhere, set dtypes automatically\n", - "# df_new = pd.read_csv(rawfile_metadata_out, header=[0, 1], index_col=0)\n", - "# actual = df_new.dtypes.to_dict()\n", - "# npt.assert_equal(actual, desired)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.8.13 ('vaep')", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - }, - "vscode": { - "interpreter": { - "hash": 
"cf83e9cb890c7f96eb0ae04f39a82254555f56a1a0ed2f03b23a8b40fe6cd31c" - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -}