From ee2e6f74c39f349cb7e4e923b82f58c2c570aff1 Mon Sep 17 00:00:00 2001 From: RobbinBouwmeester Date: Thu, 19 Dec 2024 15:47:19 +0100 Subject: [PATCH 1/2] Update Server resubmission.ipynb --- jupyter_notebooks/Server resubmission.ipynb | 461 +++++++++++++++++--- 1 file changed, 408 insertions(+), 53 deletions(-) diff --git a/jupyter_notebooks/Server resubmission.ipynb b/jupyter_notebooks/Server resubmission.ipynb index 4179d6cd..057a336d 100644 --- a/jupyter_notebooks/Server resubmission.ipynb +++ b/jupyter_notebooks/Server resubmission.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 18, + "execution_count": 21, "id": "1affdd3a", "metadata": {}, "outputs": [ @@ -57,12 +57,20 @@ "from bs4 import BeautifulSoup\n", "import os\n", "import zipfile\n", - "from tqdm import tqdm" + "from tqdm import tqdm\n", + "from pathlib import Path\n", + "\n", + "import json\n", + "\n", + "from collections import defaultdict\n", + "import toml\n", + "\n", + "from proteobench.modules.quant.lfq.ion.DDA.quant_lfq_ion_DDA import DDAQuantIonModule" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 2, "id": "56b53d36", "metadata": {}, "outputs": [ @@ -272,12 +280,12 @@ "3 0.605836 0.213234 0.333518 \n", "4 0.514675 0.199873 0.272681 \n", "\n", - " results.1.variance_epsilon results.1.nr_prec results.1.CV_median \\\n", - "0 0.162885 51193 0.204522 \n", - "1 0.267728 59609 0.226330 \n", - "2 0.337984 82533 0.132900 \n", - "3 0.291254 51338 0.218630 \n", - "4 0.155889 51345 0.202474 \n", + " results.1.variance_epsilon results.1.nr_prec results.1.CV_median \\\n", + "0 0.162885 51193 0.204522 \n", + "1 0.267728 59609 0.226330 \n", + "2 0.337984 82533 0.132900 \n", + "3 0.291254 51338 0.218630 \n", + "4 0.155889 51345 0.202474 \n", "\n", " results.1.CV_q90 results.1.CV_q75 results.1.CV_q95 \n", "0 0.426579 0.295455 0.524623 \n", @@ -289,7 +297,7 @@ "[5 rows x 87 columns]" ] }, - "execution_count": 16, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -311,14 +319,14 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 3, "id": "90eec923", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "e25afbdf0b454e70be200328304947a8", + "model_id": "b078ebc7a5db4ad59ecebb05d7112052", "version_major": 2, "version_minor": 0 }, @@ -332,7 +340,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "2658f30c03824c1d88ee8381d75f04d1", + "model_id": "8879c41130a64a0dabef9b18ec8c85cf", "version_major": 2, "version_minor": 0 }, @@ -346,7 +354,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "387cecf52a67429aba2c69e330a84817", + "model_id": "6c86a298774b461e9fcd59d3bb7a8e15", "version_major": 2, "version_minor": 0 }, @@ -393,21 +401,19 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 5, "id": "c79455a0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "14 8cbc0bce20eee581ad10326e02a09dbc316c30e1\n", - "15 36b7b01b380f641722b3b34633bb53d72348eb80\n", - "16 0280a06fabdbe84746419d0810deae56e7ab2406\n", - "17 47db7ef37a0fb5fec79f3bedbfb4f67835774f10\n", + "20 1bfa914c771321b285a9ca40d4aa538cb9fdc42e\n", + "21 e8e80290fb48ff02de5ee54eb6b0114ff661bace\n", "Name: intermediate_hash, dtype: object" ] }, - "execution_count": 19, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -418,7 +424,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 6, "id": "093bd9d5", "metadata": {}, "outputs": [ @@ -426,65 +432,42 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing folder: https://proteobench.cubimed.rub.de/datasets/0280a06fabdbe84746419d0810deae56e7ab2406/\n", - "Downloading: https://proteobench.cubimed.rub.de/datasets/0280a06fabdbe84746419d0810deae56e7ab2406/0280a06fabdbe84746419d0810deae56e7ab2406_data.zip\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading 0280a06fabdbe84746419d0810deae56e7ab2406_data.zip: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 33.6M/33.6M [00:24<00:00, 1.42MB/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Extracted contents to: extracted_files/0280a06fabdbe84746419d0810deae56e7ab2406\n", - "Processing folder: https://proteobench.cubimed.rub.de/datasets/36b7b01b380f641722b3b34633bb53d72348eb80/\n", - "Downloading: https://proteobench.cubimed.rub.de/datasets/36b7b01b380f641722b3b34633bb53d72348eb80/36b7b01b380f641722b3b34633bb53d72348eb80_data.zip\n" + "Processing folder: https://proteobench.cubimed.rub.de/datasets/1bfa914c771321b285a9ca40d4aa538cb9fdc42e/\n", + "Downloading: https://proteobench.cubimed.rub.de/datasets/1bfa914c771321b285a9ca40d4aa538cb9fdc42e/1bfa914c771321b285a9ca40d4aa538cb9fdc42e_data.zip\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Downloading 36b7b01b380f641722b3b34633bb53d72348eb80_data.zip: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 25.4M/25.4M [00:14<00:00, 1.83MB/s]\n" + "Downloading 1bfa914c771321b285a9ca40d4aa538cb9fdc42e_data.zip: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 66.4M/66.4M [00:15<00:00, 4.50MB/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Extracted contents to: extracted_files/36b7b01b380f641722b3b34633bb53d72348eb80\n", - "Processing folder: https://proteobench.cubimed.rub.de/datasets/8cbc0bce20eee581ad10326e02a09dbc316c30e1/\n", - "Downloading: https://proteobench.cubimed.rub.de/datasets/8cbc0bce20eee581ad10326e02a09dbc316c30e1/8cbc0bce20eee581ad10326e02a09dbc316c30e1_data.zip\n" + "Extracted contents to: extracted_files/1bfa914c771321b285a9ca40d4aa538cb9fdc42e\n", + "Processing folder: https://proteobench.cubimed.rub.de/datasets/e8e80290fb48ff02de5ee54eb6b0114ff661bace/\n", + "Downloading: https://proteobench.cubimed.rub.de/datasets/e8e80290fb48ff02de5ee54eb6b0114ff661bace/e8e80290fb48ff02de5ee54eb6b0114ff661bace_data.zip\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Downloading 8cbc0bce20eee581ad10326e02a09dbc316c30e1_data.zip: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 33.2M/33.2M [00:18<00:00, 1.94MB/s]\n" + "Downloading e8e80290fb48ff02de5ee54eb6b0114ff661bace_data.zip: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 46.5M/46.5M [00:08<00:00, 5.71MB/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Extracted contents to: extracted_files/8cbc0bce20eee581ad10326e02a09dbc316c30e1\n" + "Extracted contents to: extracted_files/e8e80290fb48ff02de5ee54eb6b0114ff661bace\n" ] } ], "source": [ - "import pandas as pd\n", - "import requests\n", - "from bs4 import BeautifulSoup\n", - "import os\n", - "import zipfile\n", - "from tqdm import tqdm\n", - "\n", "# Step 1: Extract the hash list from the DataFrame\n", "hash_list = filtered_df[\"intermediate_hash\"].tolist()\n", "\n", @@ -549,10 +532,382 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, + "id": "593739e9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idold_newsoftware_namesoftware_versionsearch_enginesearch_engine_versionident_fdr_psmident_fdr_peptideident_fdr_proteinenable_match_between_runs...colorhover_textscatter_sizescan_windowquantification_method_DIANNsecond_passprotein_inferencepredictors_libraryquantification_methodmean_abs_epsilon
0MaxQuant_20241216_100704oldMaxQuant1.5.2.8AndromedaNoneNaN0.0100000.010000False...#377eb8ProteoBench ID: MaxQuant_20241216_100704<br>So...20NaNNaNNaNNaNNaNNaN0.265490
1ProlineStudio_20241216_103006oldProlineStudio2.3.0-SNAPSHOT_2024-09-11T06:45:20Z_jenkinsMascot2.8.30.010000NaNNaNTrue...#5f0f40ProteoBench ID: ProlineStudio_20241216_103006<...20NaNNaNNaNNaNNaNNaN0.319847
2i2MassChroQ_20241216_103323oldi2MassChroQ1.0.16X! TandemX! Tandem Alanine (2017.2.1.4)0.0089980.0119630.009873True...#984ea3ProteoBench ID: i2MassChroQ_20241216_103323<br...20NaNNaNNaNNaNNaNNaN0.369880
3MaxQuant_20241216_130203oldMaxQuant1.5.3.30AndromedaNoneNaN0.0100000.010000True...#377eb8ProteoBench ID: MaxQuant_20241216_130203<br>So...20NaNNaNNaNNaNNaNNaN0.322391
4MaxQuant_20241216_120735oldMaxQuant1.5.3.30AndromedaNoneNaN0.0100000.010000False...#377eb8ProteoBench ID: MaxQuant_20241216_120735<br>So...20NaNNaNNaNNaNNaNNaN0.259993
\n", + "

5 rows × 40 columns

\n", + "
" + ], + "text/plain": [ + " id old_new software_name \\\n", + "0 MaxQuant_20241216_100704 old MaxQuant \n", + "1 ProlineStudio_20241216_103006 old ProlineStudio \n", + "2 i2MassChroQ_20241216_103323 old i2MassChroQ \n", + "3 MaxQuant_20241216_130203 old MaxQuant \n", + "4 MaxQuant_20241216_120735 old MaxQuant \n", + "\n", + " software_version search_engine \\\n", + "0 1.5.2.8 Andromeda \n", + "1 2.3.0-SNAPSHOT_2024-09-11T06:45:20Z_jenkins Mascot \n", + "2 1.0.16 X! Tandem \n", + "3 1.5.3.30 Andromeda \n", + "4 1.5.3.30 Andromeda \n", + "\n", + " search_engine_version ident_fdr_psm ident_fdr_peptide \\\n", + "0 None NaN 0.010000 \n", + "1 2.8.3 0.010000 NaN \n", + "2 X! Tandem Alanine (2017.2.1.4) 0.008998 0.011963 \n", + "3 None NaN 0.010000 \n", + "4 None NaN 0.010000 \n", + "\n", + " ident_fdr_protein enable_match_between_runs ... color \\\n", + "0 0.010000 False ... #377eb8 \n", + "1 NaN True ... #5f0f40 \n", + "2 0.009873 True ... #984ea3 \n", + "3 0.010000 True ... #377eb8 \n", + "4 0.010000 False ... #377eb8 \n", + "\n", + " hover_text scatter_size \\\n", + "0 ProteoBench ID: MaxQuant_20241216_100704
So... 20 \n", + "1 ProteoBench ID: ProlineStudio_20241216_103006<... 20 \n", + "2 ProteoBench ID: i2MassChroQ_20241216_103323So... 20 \n", + "4 ProteoBench ID: MaxQuant_20241216_120735
So... 20 \n", + "\n", + " scan_window quantification_method_DIANN second_pass protein_inference \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "\n", + " predictors_library quantification_method mean_abs_epsilon \n", + "0 NaN NaN 0.265490 \n", + "1 NaN NaN 0.319847 \n", + "2 NaN NaN 0.369880 \n", + "3 NaN NaN 0.322391 \n", + "4 NaN NaN 0.259993 \n", + "\n", + "[5 rows x 40 columns]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "token = toml.load(\"../webinterface/.streamlit/secrets.toml\")[\"gh\"][\"token\"]\n", + "\n", + "# TODO change to the correct module\n", + "module_obj = DDAQuantIonModule(token=token)\n", + "results_df = module_obj.obtain_all_data_points(all_datapoints=None)\n", + "\n", + "results_df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, "id": "ecb9cf7c", "metadata": {}, "outputs": [], + "source": [ + "extra_path = Path(\"extracted_files\")\n", + "\n", + "# submission_files = [\n", + "# {\n", + "# \"input_file\" : \"../test/data/dda_quant/MaxQuant_evidence_sample.txt\",\n", + "# \"param_file\" : \"../test/params/mqpar_MQ1.6.3.3_MBR.xml\",\n", + "# \"input_type\" : \"MaxQuant\",\n", + "# \"default_cutoff_min_prec\" : 3,\n", + "# \"user_comments\" : \"Put comments here.\"\n", + "# }\n", + "#]\n", + "\n", + "submission_files = []\n", + "\n", + "for idx,row in filtered_df.iterrows():\n", + " base_path = extra_path / row[\"intermediate_hash\"]\n", + " comments = \"\\n\".join(open(base_path / \"comment.txt\").readlines())\n", + " input_file = base_path / \"input_file.txt\"\n", + " parameter_file = base_path / \"param_0.txt\"\n", + " \n", + " submission_files.append({\n", + " \"input_file\" : input_file,\n", + " \"param_file\" : parameter_file,\n", + " \"input_type\" : row[\"software_name\"],\n", + " \"default_cutoff_min_prec\" : 3,\n", + " \"user_comments\" : comments\n", + " })" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "a40c51ba", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Not all columns required for making the ion are available.\n", + "Load locally: extracted_files\\1bfa914c771321b285a9ca40d4aa538cb9fdc42e\\param_0.txt\n", + "ProteoBenchParameters(software_name='AlphaPept', software_version='0.5.0', search_engine='AlphaPept', search_engine_version='0.5.0', ident_fdr_psm=None, ident_fdr_peptide=0.01, ident_fdr_protein=0.01, enable_match_between_runs=True, precursor_mass_tolerance='[-20 ppm, 20 ppm]', fragment_mass_tolerance='[-50 ppm, 50 ppm]', enzyme='Trypsin', allowed_miscleavages=2, min_peptide_length=7, max_peptide_length=27, fixed_mods='cC', variable_mods='oxM', max_mods=3, min_precursor_charge=1, max_precursor_charge=6, scan_window=None, quantification_method=None, second_pass=None, protein_inference=None, predictors_library=None)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Following Github server redirection from /repos/Proteobot/Results_Module2_quant_DDA to /repositories/594032348\n", + "INFO:github.Requester:Following Github server redirection from /repos/Proteobot/Results_Module2_quant_DDA to /repositories/594032348\n", + "Following Github server redirection from /repos/Proteobot/Results_quant_ion_DDA/branches/master to /repos/Proteobot/Results_quant_ion_DDA/branches/main\n", + "INFO:github.Requester:Following Github server redirection from /repos/Proteobot/Results_quant_ion_DDA/branches/master to /repos/Proteobot/Results_quant_ion_DDA/branches/main\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Submitted: {'input_file': WindowsPath('extracted_files/1bfa914c771321b285a9ca40d4aa538cb9fdc42e/input_file.txt'), 'param_file': WindowsPath('extracted_files/1bfa914c771321b285a9ca40d4aa538cb9fdc42e/param_0.txt'), 'input_type': 'AlphaPept', 'default_cutoff_min_prec': 3, 'user_comments': 'Fixed mod of N-acetyl (N-term)'}\n", + "------------------------\n", + "Not all columns required for making the ion are available.\n", + "Load locally: extracted_files\\e8e80290fb48ff02de5ee54eb6b0114ff661bace\\param_0.txt\n", + "ProteoBenchParameters(software_name='AlphaPept', software_version='0.5.0', search_engine='AlphaPept', search_engine_version='0.5.0', ident_fdr_psm=None, ident_fdr_peptide=0.01, ident_fdr_protein=0.01, enable_match_between_runs=True, precursor_mass_tolerance='[-10 ppm, 10 ppm]', fragment_mass_tolerance='[-20 ppm, 20 ppm]', enzyme='Trypsin', allowed_miscleavages=1, min_peptide_length=7, max_peptide_length=27, fixed_mods='cC', variable_mods='oxM', max_mods=3, min_precursor_charge=1, max_precursor_charge=6, scan_window=None, quantification_method=None, second_pass=None, protein_inference=None, predictors_library=None)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Following Github server redirection from /repos/Proteobot/Results_Module2_quant_DDA to /repositories/594032348\n", + "INFO:github.Requester:Following Github server redirection from /repos/Proteobot/Results_Module2_quant_DDA to /repositories/594032348\n", + "Following Github server redirection from /repos/Proteobot/Results_quant_ion_DDA/branches/master to /repos/Proteobot/Results_quant_ion_DDA/branches/main\n", + "INFO:github.Requester:Following Github server redirection from /repos/Proteobot/Results_quant_ion_DDA/branches/master to /repos/Proteobot/Results_quant_ion_DDA/branches/main\n", + "ERROR:root:Error in PR: 422 {\"message\": \"Validation Failed\", \"errors\": [{\"resource\": \"PullRequest\", \"code\": \"custom\", \"message\": \"A pull request already exists for Proteobot:AlphaPept_20241217_084044.\"}], \"documentation_url\": \"https://docs.github.com/rest/pulls/pulls#create-a-pull-request\", \"status\": \"422\"}\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Submitted: {'input_file': WindowsPath('extracted_files/e8e80290fb48ff02de5ee54eb6b0114ff661bace/input_file.txt'), 'param_file': WindowsPath('extracted_files/e8e80290fb48ff02de5ee54eb6b0114ff661bace/param_0.txt'), 'input_type': 'AlphaPept', 'default_cutoff_min_prec': 3, 'user_comments': ''}\n", + "------------------------\n" + ] + } + ], + "source": [ + "for submission_settings in submission_files:\n", + " param_file = submission_settings[\"param_file\"]\n", + " input_file = submission_settings[\"input_file\"]\n", + " input_type = submission_settings[\"input_type\"]\n", + " default_cutoff_min_prec = submission_settings[\"default_cutoff_min_prec\"]\n", + " user_comments = submission_settings[\"user_comments\"]\n", + " \n", + " user_config = defaultdict(lambda: \"\")\n", + "\n", + " results_intermediates, results_df_new, parsed_input = module_obj.benchmarking(\n", + " input_file,\n", + " input_type,\n", + " user_config,\n", + " results_df,\n", + " default_cutoff_min_prec=default_cutoff_min_prec,\n", + " )\n", + "\n", + " results_df_new.tail(5)\n", + " \n", + " param_obj = module_obj.load_params_file(\n", + " [param_file], input_type\n", + " )\n", + " print(param_obj)\n", + "\n", + " pr_url = module_obj.clone_pr(\n", + " results_df_new,\n", + " param_obj,\n", + " remote_git=\"\",\n", + " submission_comments=user_comments,\n", + " )\n", + " \n", + " print(f\"Submitted: {submission_settings}\")\n", + " print(\"------------------------\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c8b7f6f", + "metadata": {}, + "outputs": [], "source": [] } ], From f8f5650248021af91831c3f6587aa8cb86cd9de3 Mon Sep 17 00:00:00 2001 From: RobbinBouwmeester Date: Thu, 19 Dec 2024 15:49:19 +0100 Subject: [PATCH 2/2] Update .gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 49c006d2..3a54c1d9 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,8 @@ build/ develop-eggs/ dist/ downloads/ +jupyter_notebooks/extracted_files/ +jupyter_notebooks/result_dir/ eggs/ .eggs/ lib/