diff --git a/.gitignore b/.gitignore index 49c006d2..3a54c1d9 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,8 @@ build/ develop-eggs/ dist/ downloads/ +jupyter_notebooks/extracted_files/ +jupyter_notebooks/result_dir/ eggs/ .eggs/ lib/ diff --git a/jupyter_notebooks/Server resubmission.ipynb b/jupyter_notebooks/Server resubmission.ipynb index 4179d6cd..057a336d 100644 --- a/jupyter_notebooks/Server resubmission.ipynb +++ b/jupyter_notebooks/Server resubmission.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 18, + "execution_count": 21, "id": "1affdd3a", "metadata": {}, "outputs": [ @@ -57,12 +57,20 @@ "from bs4 import BeautifulSoup\n", "import os\n", "import zipfile\n", - "from tqdm import tqdm" + "from tqdm import tqdm\n", + "from pathlib import Path\n", + "\n", + "import json\n", + "\n", + "from collections import defaultdict\n", + "import toml\n", + "\n", + "from proteobench.modules.quant.lfq.ion.DDA.quant_lfq_ion_DDA import DDAQuantIonModule" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 2, "id": "56b53d36", "metadata": {}, "outputs": [ @@ -272,12 +280,12 @@ "3 0.605836 0.213234 0.333518 \n", "4 0.514675 0.199873 0.272681 \n", "\n", - " results.1.variance_epsilon results.1.nr_prec results.1.CV_median \\\n", - "0 0.162885 51193 0.204522 \n", - "1 0.267728 59609 0.226330 \n", - "2 0.337984 82533 0.132900 \n", - "3 0.291254 51338 0.218630 \n", - "4 0.155889 51345 0.202474 \n", + " results.1.variance_epsilon results.1.nr_prec results.1.CV_median \\\n", + "0 0.162885 51193 0.204522 \n", + "1 0.267728 59609 0.226330 \n", + "2 0.337984 82533 0.132900 \n", + "3 0.291254 51338 0.218630 \n", + "4 0.155889 51345 0.202474 \n", "\n", " results.1.CV_q90 results.1.CV_q75 results.1.CV_q95 \n", "0 0.426579 0.295455 0.524623 \n", @@ -289,7 +297,7 @@ "[5 rows x 87 columns]" ] }, - "execution_count": 16, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -311,14 +319,14 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 3, "id": "90eec923", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "e25afbdf0b454e70be200328304947a8", + "model_id": "b078ebc7a5db4ad59ecebb05d7112052", "version_major": 2, "version_minor": 0 }, @@ -332,7 +340,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "2658f30c03824c1d88ee8381d75f04d1", + "model_id": "8879c41130a64a0dabef9b18ec8c85cf", "version_major": 2, "version_minor": 0 }, @@ -346,7 +354,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "387cecf52a67429aba2c69e330a84817", + "model_id": "6c86a298774b461e9fcd59d3bb7a8e15", "version_major": 2, "version_minor": 0 }, @@ -393,21 +401,19 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 5, "id": "c79455a0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "14 8cbc0bce20eee581ad10326e02a09dbc316c30e1\n", - "15 36b7b01b380f641722b3b34633bb53d72348eb80\n", - "16 0280a06fabdbe84746419d0810deae56e7ab2406\n", - "17 47db7ef37a0fb5fec79f3bedbfb4f67835774f10\n", + "20 1bfa914c771321b285a9ca40d4aa538cb9fdc42e\n", + "21 e8e80290fb48ff02de5ee54eb6b0114ff661bace\n", "Name: intermediate_hash, dtype: object" ] }, - "execution_count": 19, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -418,7 +424,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 6, "id": "093bd9d5", "metadata": {}, "outputs": [ @@ -426,65 +432,42 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing folder: https://proteobench.cubimed.rub.de/datasets/0280a06fabdbe84746419d0810deae56e7ab2406/\n", - "Downloading: https://proteobench.cubimed.rub.de/datasets/0280a06fabdbe84746419d0810deae56e7ab2406/0280a06fabdbe84746419d0810deae56e7ab2406_data.zip\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading 0280a06fabdbe84746419d0810deae56e7ab2406_data.zip: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 33.6M/33.6M [00:24<00:00, 1.42MB/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Extracted contents to: extracted_files/0280a06fabdbe84746419d0810deae56e7ab2406\n", - "Processing folder: https://proteobench.cubimed.rub.de/datasets/36b7b01b380f641722b3b34633bb53d72348eb80/\n", - "Downloading: https://proteobench.cubimed.rub.de/datasets/36b7b01b380f641722b3b34633bb53d72348eb80/36b7b01b380f641722b3b34633bb53d72348eb80_data.zip\n" + "Processing folder: https://proteobench.cubimed.rub.de/datasets/1bfa914c771321b285a9ca40d4aa538cb9fdc42e/\n", + "Downloading: https://proteobench.cubimed.rub.de/datasets/1bfa914c771321b285a9ca40d4aa538cb9fdc42e/1bfa914c771321b285a9ca40d4aa538cb9fdc42e_data.zip\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Downloading 36b7b01b380f641722b3b34633bb53d72348eb80_data.zip: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 25.4M/25.4M [00:14<00:00, 1.83MB/s]\n" + "Downloading 1bfa914c771321b285a9ca40d4aa538cb9fdc42e_data.zip: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 66.4M/66.4M [00:15<00:00, 4.50MB/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Extracted contents to: extracted_files/36b7b01b380f641722b3b34633bb53d72348eb80\n", - "Processing folder: https://proteobench.cubimed.rub.de/datasets/8cbc0bce20eee581ad10326e02a09dbc316c30e1/\n", - "Downloading: https://proteobench.cubimed.rub.de/datasets/8cbc0bce20eee581ad10326e02a09dbc316c30e1/8cbc0bce20eee581ad10326e02a09dbc316c30e1_data.zip\n" + "Extracted contents to: extracted_files/1bfa914c771321b285a9ca40d4aa538cb9fdc42e\n", + "Processing folder: https://proteobench.cubimed.rub.de/datasets/e8e80290fb48ff02de5ee54eb6b0114ff661bace/\n", + "Downloading: https://proteobench.cubimed.rub.de/datasets/e8e80290fb48ff02de5ee54eb6b0114ff661bace/e8e80290fb48ff02de5ee54eb6b0114ff661bace_data.zip\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Downloading 8cbc0bce20eee581ad10326e02a09dbc316c30e1_data.zip: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 33.2M/33.2M [00:18<00:00, 1.94MB/s]\n" + "Downloading e8e80290fb48ff02de5ee54eb6b0114ff661bace_data.zip: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 46.5M/46.5M [00:08<00:00, 5.71MB/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Extracted contents to: extracted_files/8cbc0bce20eee581ad10326e02a09dbc316c30e1\n" + "Extracted contents to: extracted_files/e8e80290fb48ff02de5ee54eb6b0114ff661bace\n" ] } ], "source": [ - "import pandas as pd\n", - "import requests\n", - "from bs4 import BeautifulSoup\n", - "import os\n", - "import zipfile\n", - "from tqdm import tqdm\n", - "\n", "# Step 1: Extract the hash list from the DataFrame\n", "hash_list = filtered_df[\"intermediate_hash\"].tolist()\n", "\n", @@ -549,10 +532,382 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, + "id": "593739e9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | id | \n", + "old_new | \n", + "software_name | \n", + "software_version | \n", + "search_engine | \n", + "search_engine_version | \n", + "ident_fdr_psm | \n", + "ident_fdr_peptide | \n", + "ident_fdr_protein | \n", + "enable_match_between_runs | \n", + "... | \n", + "color | \n", + "hover_text | \n", + "scatter_size | \n", + "scan_window | \n", + "quantification_method_DIANN | \n", + "second_pass | \n", + "protein_inference | \n", + "predictors_library | \n", + "quantification_method | \n", + "mean_abs_epsilon | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "MaxQuant_20241216_100704 | \n", + "old | \n", + "MaxQuant | \n", + "1.5.2.8 | \n", + "Andromeda | \n", + "None | \n", + "NaN | \n", + "0.010000 | \n", + "0.010000 | \n", + "False | \n", + "... | \n", + "#377eb8 | \n", + "ProteoBench ID: MaxQuant_20241216_100704<br>So... | \n", + "20 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "0.265490 | \n", + "
1 | \n", + "ProlineStudio_20241216_103006 | \n", + "old | \n", + "ProlineStudio | \n", + "2.3.0-SNAPSHOT_2024-09-11T06:45:20Z_jenkins | \n", + "Mascot | \n", + "2.8.3 | \n", + "0.010000 | \n", + "NaN | \n", + "NaN | \n", + "True | \n", + "... | \n", + "#5f0f40 | \n", + "ProteoBench ID: ProlineStudio_20241216_103006<... | \n", + "20 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "0.319847 | \n", + "
2 | \n", + "i2MassChroQ_20241216_103323 | \n", + "old | \n", + "i2MassChroQ | \n", + "1.0.16 | \n", + "X! Tandem | \n", + "X! Tandem Alanine (2017.2.1.4) | \n", + "0.008998 | \n", + "0.011963 | \n", + "0.009873 | \n", + "True | \n", + "... | \n", + "#984ea3 | \n", + "ProteoBench ID: i2MassChroQ_20241216_103323<br... | \n", + "20 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "0.369880 | \n", + "
3 | \n", + "MaxQuant_20241216_130203 | \n", + "old | \n", + "MaxQuant | \n", + "1.5.3.30 | \n", + "Andromeda | \n", + "None | \n", + "NaN | \n", + "0.010000 | \n", + "0.010000 | \n", + "True | \n", + "... | \n", + "#377eb8 | \n", + "ProteoBench ID: MaxQuant_20241216_130203<br>So... | \n", + "20 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "0.322391 | \n", + "
4 | \n", + "MaxQuant_20241216_120735 | \n", + "old | \n", + "MaxQuant | \n", + "1.5.3.30 | \n", + "Andromeda | \n", + "None | \n", + "NaN | \n", + "0.010000 | \n", + "0.010000 | \n", + "False | \n", + "... | \n", + "#377eb8 | \n", + "ProteoBench ID: MaxQuant_20241216_120735<br>So... | \n", + "20 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "0.259993 | \n", + "
5 rows × 40 columns
\n", + "