From 49d628ba788db6038093760fa2e2365363d29df3 Mon Sep 17 00:00:00 2001 From: Henry Date: Thu, 7 Dec 2023 13:33:17 +0100 Subject: [PATCH] :art::bug: update overfitting analysis (25MNAR) - :bug: remove metadata fpath from train_X.yaml - also run KNN comp. with workflow v2 with a share of 25MNAR --- project/03_1_best_models_comparison.ipynb | 17 ++++++++-- project/03_1_best_models_comparison.py | 10 ++++-- .../knn_comparison/hela_pgs_large/config.yaml | 32 +++++++++---------- .../knn_comparison/hela_pgs_large/split.yaml | 2 +- project/config/permuted_dataset/config.yaml | 18 +++++------ project/config/permuted_dataset/split.yaml | 1 + project/config/repeat_best/split.yaml | 15 +++++---- project/config/repeat_best/train.yaml | 15 +++++---- .../proteinGroups/train_CF.yaml | 1 - .../proteinGroups/train_DAE.yaml | 1 - .../proteinGroups/train_KNN.yaml | 3 +- .../proteinGroups/train_Median.yaml | 3 +- .../proteinGroups/train_VAE.yaml | 1 - project/workflow/Snakefile | 4 +-- .../Snakefile_best_repeated_split.smk | 2 ++ .../Snakefile_best_repeated_train.smk | 2 ++ 16 files changed, 73 insertions(+), 54 deletions(-) diff --git a/project/03_1_best_models_comparison.ipynb b/project/03_1_best_models_comparison.ipynb index 0a02134a1..6a6f6f6fe 100644 --- a/project/03_1_best_models_comparison.ipynb +++ b/project/03_1_best_models_comparison.ipynb @@ -21,7 +21,7 @@ "logger = setup_logger(logger=logging.getLogger('vaep'), level=10)\n", "\n", "plt.rcParams['figure.figsize'] = [4.0, 2.0]\n", - "vaep.plotting.make_large_descriptors(5)" + "vaep.plotting.make_large_descriptors(7)" ] }, { @@ -93,7 +93,10 @@ "min_max_MAE = (selected\n", " .loc[pd.IndexSlice[:, 'MAE', :]]\n", " .groupby('model')\n", - " .agg(['min', 'max']))\n", + " .agg(['min', 'max'])\n", + " .stack()\n", + " .T\n", + " .loc[IDX[0]])\n", "min_max_MAE.to_excel(writer, sheet_name='min_max_MAE')\n", "min_max_MAE" ] @@ -182,6 +185,16 @@ "vaep.savefig(fig, FOLDER / \"model_performance_repeated_runs.pdf\")" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "0813889a", + "metadata": {}, + "outputs": [], + "source": [ + "writer.close()" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/project/03_1_best_models_comparison.py b/project/03_1_best_models_comparison.py index 00bcb8ada..97c54d8b5 100644 --- a/project/03_1_best_models_comparison.py +++ b/project/03_1_best_models_comparison.py @@ -29,7 +29,7 @@ logger = setup_logger(logger=logging.getLogger('vaep'), level=10) plt.rcParams['figure.figsize'] = [4.0, 2.0] -vaep.plotting.make_large_descriptors(5) +vaep.plotting.make_large_descriptors(7) # %% IDX = [['proteinGroups', 'peptides', 'evidence'], @@ -63,7 +63,10 @@ min_max_MAE = (selected .loc[pd.IndexSlice[:, 'MAE', :]] .groupby('model') - .agg(['min', 'max'])) + .agg(['min', 'max']) + .stack() + .T + .loc[IDX[0]]) min_max_MAE.to_excel(writer, sheet_name='min_max_MAE') min_max_MAE @@ -114,3 +117,6 @@ vaep.savefig(fig, FOLDER / "model_performance_repeated_runs.pdf") # %% +writer.close() + +# %% diff --git a/project/config/knn_comparison/hela_pgs_large/config.yaml b/project/config/knn_comparison/hela_pgs_large/config.yaml index e618a31a3..671a9c222 100644 --- a/project/config/knn_comparison/hela_pgs_large/config.yaml +++ b/project/config/knn_comparison/hela_pgs_large/config.yaml @@ -2,23 +2,23 @@ config_split: config/knn_comparison/hela_pgs_large/split.yaml config_train: runs/knn_comparison/hela_pgs_large/configs_train/train_{model}.yaml folder_experiment: runs/knn_comparison/hela_pgs_large -fn_rawfile_metadata: data/dev_datasets/df_intensities_proteinGroups_long/metadata.csv +fn_rawfile_metadata: None file_format: csv cuda: False models: - - Median: - model: Median - - 3NN: - neighbors: 3 - model: KNN - - 5NN: - neighbors: 5 - model: KNN - - 10NN: - neighbors: 10 - model: KNN - - 15NN: - neighbors: 15 - model: KNN + - Median: + model: Median + - 3NN: + neighbors: 3 + model: KNN + - 5NN: + neighbors: 5 + model: KNN + - 10NN: + neighbors: 10 + model: KNN + - 15NN: + neighbors: 15 + model: KNN NAGuideR_methods: - - KNN_IMPUTE + - KNN_IMPUTE diff --git a/project/config/knn_comparison/hela_pgs_large/split.yaml b/project/config/knn_comparison/hela_pgs_large/split.yaml index 675f1b785..efb1f12d0 100644 --- a/project/config/knn_comparison/hela_pgs_large/split.yaml +++ b/project/config/knn_comparison/hela_pgs_large/split.yaml @@ -1,3 +1,3 @@ FN_INTENSITIES: data/dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070.pkl sample_completeness: 0.5 -min_RT_time: 120 \ No newline at end of file +frac_mnar: 0.25 diff --git a/project/config/permuted_dataset/config.yaml b/project/config/permuted_dataset/config.yaml index 3d5657868..0b338744f 100644 --- a/project/config/permuted_dataset/config.yaml +++ b/project/config/permuted_dataset/config.yaml @@ -1,18 +1,16 @@ # config for Snakefile_v1 # fit permuted data to the same model as the original data -config_split: config/permuted_dataset/split.yaml # proteinGroups +config_split: config/permuted_dataset/split.yaml # proteinGroups config_train: config/single_dev_dataset/proteinGroups/train_{model}.yaml folder_experiment: runs/permuted #/proteinGroups fn_rawfile_metadata: # no metadata for permuted data cuda: False models: - - Median - - CF - - DAE - - VAE - - KNN + - Median + - CF + - DAE + - VAE + - KNN NAGuideR_methods: - - lls - - knnmethod - - rf - # - impseq # fails \ No newline at end of file + - KNN_IMPUTE + # - RF diff --git a/project/config/permuted_dataset/split.yaml b/project/config/permuted_dataset/split.yaml index 2bc14b229..69441fd23 100644 --- a/project/config/permuted_dataset/split.yaml +++ b/project/config/permuted_dataset/split.yaml @@ -1,2 +1,3 @@ FN_INTENSITIES: data/dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070_permuted.pkl sample_completeness: 0.5 +frac_mnar: 0.25 diff --git a/project/config/repeat_best/split.yaml b/project/config/repeat_best/split.yaml index d47b203f9..766aa05c7 100644 --- a/project/config/repeat_best/split.yaml +++ b/project/config/repeat_best/split.yaml @@ -1,13 +1,14 @@ epochs_max: -- 100 + - 100 repeats: 5 folder: "runs/repeat_best_split" levels: -- proteinGroups -- peptides -- evidence + - proteinGroups + - peptides + - evidence fn_rawfile_metadata: data/dev_datasets/df_intensities_{level}_long/metadata.csv -config_split: 'config/single_dev_dataset/{level}/split.yaml' -config_train: 'config/single_dev_dataset/{level}/train_{model}.yaml' -repitition_name: 'repeat' +config_split: "config/single_dev_dataset/{level}/split.yaml" +config_train: "config/single_dev_dataset/{level}/train_{model}.yaml" +repitition_name: "repeat" file_format: pkl +cuda: True diff --git a/project/config/repeat_best/train.yaml b/project/config/repeat_best/train.yaml index b8cf4be09..976284b60 100644 --- a/project/config/repeat_best/train.yaml +++ b/project/config/repeat_best/train.yaml @@ -1,13 +1,14 @@ epochs_max: -- 100 + - 100 repeats: 5 folder: "runs/repeat_best_train" levels: -- proteinGroups -- peptides -- evidence + - proteinGroups + - peptides + - evidence fn_rawfile_metadata: data/dev_datasets/df_intensities_{level}_long/metadata.csv -config_split: 'config/single_dev_dataset/{level}/split.yaml' -config_train: 'config/single_dev_dataset/{level}/train_{model}.yaml' -repitition_name: 'repeat' +config_split: "config/single_dev_dataset/{level}/split.yaml" +config_train: "config/single_dev_dataset/{level}/train_{model}.yaml" +repitition_name: "repeat" file_format: pkl +cuda: True diff --git a/project/config/single_dev_dataset/proteinGroups/train_CF.yaml b/project/config/single_dev_dataset/proteinGroups/train_CF.yaml index 58370bd9c..2068cc158 100644 --- a/project/config/single_dev_dataset/proteinGroups/train_CF.yaml +++ b/project/config/single_dev_dataset/proteinGroups/train_CF.yaml @@ -1,5 +1,4 @@ file_format: csv -fn_rawfile_metadata: data/dev_datasets/df_intensities_proteinGroups_long/metadata.csv latent_dim: 50 batch_size: 32768 epochs_max: 100 diff --git a/project/config/single_dev_dataset/proteinGroups/train_DAE.yaml b/project/config/single_dev_dataset/proteinGroups/train_DAE.yaml index cab0c8246..b3a22c8bb 100644 --- a/project/config/single_dev_dataset/proteinGroups/train_DAE.yaml +++ b/project/config/single_dev_dataset/proteinGroups/train_DAE.yaml @@ -1,5 +1,4 @@ file_format: csv -fn_rawfile_metadata: data/dev_datasets/df_intensities_proteinGroups_long/metadata.csv latent_dim: 25 batch_size: 64 epochs_max: 100 diff --git a/project/config/single_dev_dataset/proteinGroups/train_KNN.yaml b/project/config/single_dev_dataset/proteinGroups/train_KNN.yaml index baa99b732..2d056f335 100644 --- a/project/config/single_dev_dataset/proteinGroups/train_KNN.yaml +++ b/project/config/single_dev_dataset/proteinGroups/train_KNN.yaml @@ -1,3 +1,2 @@ neighbors: 3 -file_format: csv -fn_rawfile_metadata: data/dev_datasets/df_intensities_proteinGroups_long/metadata.csv \ No newline at end of file +file_format: csv \ No newline at end of file diff --git a/project/config/single_dev_dataset/proteinGroups/train_Median.yaml b/project/config/single_dev_dataset/proteinGroups/train_Median.yaml index 340efcd69..745cca2c5 100644 --- a/project/config/single_dev_dataset/proteinGroups/train_Median.yaml +++ b/project/config/single_dev_dataset/proteinGroups/train_Median.yaml @@ -1,2 +1 @@ -file_format: csv -fn_rawfile_metadata: data/dev_datasets/df_intensities_proteinGroups_long/metadata.csv \ No newline at end of file +file_format: csv \ No newline at end of file diff --git a/project/config/single_dev_dataset/proteinGroups/train_VAE.yaml b/project/config/single_dev_dataset/proteinGroups/train_VAE.yaml index 2dc70a4ae..7caad9dab 100644 --- a/project/config/single_dev_dataset/proteinGroups/train_VAE.yaml +++ b/project/config/single_dev_dataset/proteinGroups/train_VAE.yaml @@ -1,6 +1,5 @@ # models_training: file_format: csv -fn_rawfile_metadata: data/dev_datasets/df_intensities_proteinGroups_long/metadata.csv latent_dim: 25 batch_size: 64 epochs_max: 50 diff --git a/project/workflow/Snakefile b/project/workflow/Snakefile index 5d44661ae..96b8e7a23 100644 --- a/project/workflow/Snakefile +++ b/project/workflow/Snakefile @@ -58,7 +58,7 @@ rule comparison: out=f"{{folder_experiment}}/{nb_stem}.o", shell: "papermill {input.nb} {output.nb:q}" - " -r fn_rawfile_metadata {params.meta_data:q}" + " -p fn_rawfile_metadata {params.meta_data:q}" " -r folder_experiment {wildcards.folder_experiment:q}" " -r models {params.models:q}" " && jupyter nbconvert --to html {output.nb:q}" @@ -179,7 +179,7 @@ rule train_models: "papermill {input.nb:q} {output.nb:q}" " -f {input.configfile:q}" " -r folder_experiment {params.folder_experiment:q}" - " -r fn_rawfile_metadata {params.meta_data:q}" + " -p fn_rawfile_metadata {params.meta_data:q}" " -r model_key {wildcards.model:q}" " 2> {log.err}" " && jupyter nbconvert --to html {output.nb:q}" diff --git a/project/workflow/Snakefile_best_repeated_split.smk b/project/workflow/Snakefile_best_repeated_split.smk index 279cc031f..b0c709114 100644 --- a/project/workflow/Snakefile_best_repeated_split.smk +++ b/project/workflow/Snakefile_best_repeated_split.smk @@ -97,6 +97,7 @@ rule train_models: model_key="{model}", meta_data=config["fn_rawfile_metadata"], file_format=config["file_format"], + cuda=config["cuda"], shell: "papermill {input.nb} {output.nb}" " -f {input.configfile}" @@ -104,4 +105,5 @@ rule train_models: " -r fn_rawfile_metadata {params.meta_data}" " -r file_format {params.file_format}" " -r model_key {params.model_key}" + " -p cuda {params.cuda}" " && jupyter nbconvert --to html {output.nb}" diff --git a/project/workflow/Snakefile_best_repeated_train.smk b/project/workflow/Snakefile_best_repeated_train.smk index a76c89351..2bbac4f86 100644 --- a/project/workflow/Snakefile_best_repeated_train.smk +++ b/project/workflow/Snakefile_best_repeated_train.smk @@ -91,6 +91,7 @@ rule train_models: model_key="{model}_{repeat}", meta_data=config["fn_rawfile_metadata"], file_format=config["file_format"], + cuda=config['cuda'], shell: "papermill {input.nb} {output.nb}" " -f {input.configfile}" @@ -98,4 +99,5 @@ rule train_models: " -r fn_rawfile_metadata {params.meta_data}" " -r file_format {params.file_format}" " -r model_key {params.model_key}" + " -p cuda {params.cuda}" " && jupyter nbconvert --to html {output.nb}"