From 8cd018720ad640086ec7380d888c5a7e4039c94e Mon Sep 17 00:00:00 2001 From: ValeSora Date: Thu, 18 Jul 2024 17:04:30 +0200 Subject: [PATCH] recount3 sub-package updated API-breaking changes: * The output data frames produced by the 'dgd_get_recount3_data' executable now contain both gene expression data and metadata, unless otherwise filtered (see below). Other changes: * Now the 'experiment_attributes' column, if present in the metadata columns of an SRA study, will be split into its constituent components when writing the output data frames for the 'dgd_get_recount3_data' executable (as is already the case with the 'sample_attributes' column). * The user can now pass a YAML file to 'dgd_get_recount3_data' to download data from the Recount3 platform in bulk and filter them. * The user can now pass 'metadata_to_keep' and 'metadata_to_drop' lists of metadata columns in the input file to 'dgd_get_recount3_data' to keep or drop specific metadata columns in the output data frames. These can be passed either as columns, if the input file is a CSV file, or as dedicated fields, if the input file is a YAML file. * The 'recount3.util.get_metadata' function now returns the metadata data frame with the 'recount3_project_name' and 'recount3_samples_category' columns added. * The 'model_untrained.yaml' configuration file was added to the examples of configuration files available within the package. Internal changes (for contributors): * Two new internal functions in the 'bulkDGD.recount3.util' module ('_load_samples_batches_csv' and '_load_samples_batches_yaml') were introduced to parse the input files to 'dgd_get_recount3_data'. The public function 'load_samples_batches' simply calls one of them depending on the file's extension. * The 'bulkDGD.util.get_handlers' function now accepts two new arguments, 'log_console_level' and 'log_file_level', instead of the old 'log_level' to allow more fine-grained control over the log level of the handlers. * The log level of the console handler for the '_dgd_get_recount3_data_single_batch' executable was changed to ERROR so as not to clutter the console with all the INFO messages from the subprocesses (which get logged to their own log files anyway if the overall log level is INFO or below). * The header of the 'bulkDGD/recount3/data/sra_metadata_fields.txt' file was changed to better describe the metadata fields included in it. Documentation: * The documentation was updated to reflect the user-facing changes. * The readme files for the configurations were removed because of the redundancy with the content of the documentation and of the configuration files themselves. * The 'model_config_options.rst' file was removed from the documentation because it was empty and not referenced anywhere.
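For reference, a minimal sketch of a YAML input file for 'dgd_get_recount3_data' is shown below. The study code, query string, and metadata column names are taken from the examples in the updated documentation and are illustrative only:

sra:
  all:
    metadata_to_drop:
    - age
  SRP179061:
    query_string: diagnosis == 'Control'
    metadata_to_keep:
    - tissue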
--- .../_dgd_get_recount3_data_single_batch.py | 182 ++++-- bulkDGD/execs/dgd_get_probability_density.py | 3 +- bulkDGD/execs/dgd_get_recount3_data.py | 69 ++- bulkDGD/execs/dgd_get_representations.py | 3 +- bulkDGD/execs/dgd_perform_dea.py | 3 +- bulkDGD/execs/dgd_perform_pca.py | 3 +- bulkDGD/execs/dgd_preprocess_samples.py | 3 +- bulkDGD/execs/dgd_train.py | 3 +- .../model/{readme.md => model_untrained.yaml} | 77 +-- bulkDGD/ioutil/configs/plot/readme.md | 23 - .../ioutil/configs/representations/readme.md | 209 ------- bulkDGD/recount3/data/sra_metadata_fields.txt | 6 +- bulkDGD/recount3/util.py | 561 ++++++++++++++---- doc/source/dgd_get_recount3_data.md | 111 +++- setup.py | 8 +- 15 files changed, 752 insertions(+), 512 deletions(-) rename bulkDGD/ioutil/configs/model/{readme.md => model_untrained.yaml} (63%) delete mode 100644 bulkDGD/ioutil/configs/plot/readme.md delete mode 100644 bulkDGD/ioutil/configs/representations/readme.md diff --git a/bulkDGD/execs/_dgd_get_recount3_data_single_batch.py b/bulkDGD/execs/_dgd_get_recount3_data_single_batch.py index b75a00d..cffcb66 100644 --- a/bulkDGD/execs/_dgd_get_recount3_data_single_batch.py +++ b/bulkDGD/execs/_dgd_get_recount3_data_single_batch.py @@ -65,7 +65,6 @@ def main(): "The name of the Recount3 project for which samples will " \ f"be retrieved. The available projects are: {ip_choices_str}." parser.add_argument("-ip", "--input-project-name", - type = str, required = True, choices = ip_choices, help = ip_help) @@ -80,7 +79,6 @@ def main(): "associated with." \ "For SRA data, this is the code associated with the project." parser.add_argument("-is", "--input-samples-category", - type = str, required = True, help = is_help) @@ -92,7 +90,6 @@ def main(): "written in the working directory. The default file name is " \ "'{input_project_name}_{input_samples_category}.csv'." parser.add_argument("-o", "--output-csv", - type = str, default = None, help = o_help) @@ -102,7 +99,6 @@ def main(): "The working directory. The default is the current " \ "working directory." parser.add_argument("-d", "--work-dir", - type = str, default = os.getcwd(), help = d_help) @@ -139,19 +135,44 @@ def main(): "accepts a plain text file containing the string " \ "since it can be long for complex queries." parser.add_argument("-qs", "--query-string", - type = str, default = None, help = qs_help) #-----------------------------------------------------------------# + mk_help = \ + "A vertical line (|)-separated list of names of metadata " \ + "columns to keep in the final data frame. All the other " \ + "metadata columns will be dropped from the data frame. If " \ + "neither this option nor the '-md', '--metadata-to-drop' " \ + "option is passed, all metadata columns are kept in the " \ + "final data frame." + parser.add_argument("-mk", "--metadata-to-keep", + default = None, + help = mk_help) + + #-----------------------------------------------------------------# + + md_help = \ + "A vertical line (|)-separated list of names of metadata " \ + "columns to drop from the final data frame. All the other " \ + "metadata columns will be kept in the final data frame. Use " \ + "the '_all_' reserved keyword to drop all metadata columns " \ + "from the data frame. If neither this option nor the '-mk', " \ + "'--metadata-to-keep' option is passed, all metadata " \ + "columns are kept in the final data frame."
+ parser.add_argument("-md", "--metadata-to-drop", + default = None, + help = md_help) + + #-----------------------------------------------------------------# + lf_default = "_dgd_get_recount3_data_single_batch.log" lf_help = \ "The name of the log file. The file will be written " \ "in the working directory. The default file name is " \ f"'{lf_default}'." parser.add_argument("-lf", "--log-file", - type = str, default = lf_default, help = lf_help) @@ -187,6 +208,8 @@ def main(): output_csv = args.output_csv wd = os.path.abspath(args.work_dir) query_string = args.query_string + metadata_to_keep = args.metadata_to_keep + metadata_to_drop = args.metadata_to_drop save_gene_sums = args.save_gene_sums save_metadata = args.save_metadata log_file = args.log_file @@ -197,7 +220,7 @@ def main(): #-----------------------------------------------------------------# # Get the module's logger. - logger = log.getLogger(__name__) + logger = log.getLogger("dgd_get_recount3_data") # Set WARNING logging level by default. log_level = log.WARNING @@ -219,9 +242,11 @@ def main(): handlers = \ util.get_handlers(\ log_console = log_console, + log_console_level = log.ERROR, log_file_class = log.FileHandler, - log_file_options = {"filename" : log_file}, - log_level = log_level) + log_file_options = {"filename" : log_file, + "mode" : "w"}, + log_file_level = log_level) # Set the logging configuration. log.basicConfig(level = log_level, @@ -247,10 +272,8 @@ def main(): # Log it an exit. errstr = \ - "It was not possible to get the RNA-seq data for " \ - f"project '{input_project_name}', samples " \ - f"'{input_samples_category}' from Recount3. " \ - f"Error: {e}" + "It was not possible to get the RNA-seq data from " \ + f"Recount3. Error: {e}" logger.exception(errstr) sys.exit(errstr) @@ -271,14 +294,32 @@ def main(): # Log it an exit. errstr = \ - "It was not possible to get the metadata for " \ - f"project '{input_project_name}', samples " \ - f"'{input_samples_category}' from Recount3. " \ + "It was not possible to get the metadata from Recount3. " \ f"Error: {e}" logger.exception(errstr) sys.exit(errstr) #-----------------------------------------------------------------# + + # Try to merge the RNA-seq data frame and the metadata data frame. + try: + + df_final = \ + recount3.merge_gene_sums_and_metadata(\ + df_gene_sums = df_gene_sums, + df_metadata = df_metadata) + + # If something went wrong + except Exception as e: + + # Log it and exit. + errstr = \ + "It was not possible to combine the RNA-seq data " \ + f"with the metadata. Error: {e}" + logger.exception(errstr) + sys.exit(errstr) + + #-----------------------------------------------------------------# # If the user has passed a query string or a file containing the # query string @@ -303,20 +344,12 @@ def main(): # Try to add the metadata to the RNA-seq data frame. try: - - # Merge the RNA-seq data frame and the metadata data frame. - df_merged = \ - recount3.merge_gene_sums_and_metadata(\ - df_gene_sums = df_gene_sums, - df_metadata = df_metadata, - project_name = input_project_name) # Filter the samples. 
df_final = \ recount3.filter_by_metadata(\ - df = df_merged, - query_string = query_string, - project_name = input_project_name) + df = df_final, + query_string = query_string) # If something went wrong except Exception as e: @@ -328,12 +361,95 @@ def main(): logger.exception(errstr) sys.exit(errstr) - # Otherwise - else: + #-----------------------------------------------------------------# + + # If the user passed a list of metadata columns to keep in the + # final data frame + if metadata_to_keep is not None: - # The final data frame will be the one containing the gene - # expression data. - df_final = df_gene_sums + # Get the list of metadata columns to keep. + metadata_to_keep = \ + [m.lstrip("'").rstrip("'") for m \ + in metadata_to_keep.rstrip().split("|")] + + # Get the columns to keep in the final data frame. + columns_to_keep = \ + [col for col in df_final.columns \ + if col.startswith("ENSG")] + \ + [col for col in df_final.columns \ + if not col.startswith("ENSG") \ + and col in metadata_to_keep] + + # Try to keep only the selected columns. + try: + + df_final = df_final.loc[:, columns_to_keep] + + # If something went wrong + except Exception as e: + + # Get a string representing the metadata columns to keep. + metadata_to_keep_str = \ + ", ".join([f"'{m}'" for m in metadata_to_keep]) + + # Log it and exit. + errstr = \ + "It was not possible to keep only the following " \ + "metadata columns in the final data frame: " \ + f"{metadata_to_keep_str}. Error: {e}" + logger.exception(errstr) + sys.exit(errstr) + + #-----------------------------------------------------------------# + + # If the user passed a list of metadata columns to drop from + # the final data frame + if metadata_to_drop is not None: + + # If the user wants to drop all metadata columns + if metadata_to_drop == "_all_": + + # Get the columns to keep in the final data frame. + columns_to_keep = \ + [col for col in df_final.columns \ + if col.startswith("ENSG")] + + # Otherwise + else: + + # Get the list of metadata columns to drop. + metadata_to_drop = \ + [m.lstrip("'").rstrip("'") for m \ + in metadata_to_drop.rstrip().split("|")] + + # Get the columns to keep in the final data frame. + columns_to_keep = \ + [col for col in df_final.columns \ + if col.startswith("ENSG")] + \ + [col for col in df_final.columns \ + if not col.startswith("ENSG") \ + and col not in metadata_to_drop] + + # Try to keep only the selected columns. + try: + + df_final = df_final.loc[:, columns_to_keep] + + # If something went wrong + except Exception as e: + + # Get a string representing the metadata columns to + # drop. + metadata_to_drop_str = \ + ", ".join([f"'{m}'" for m in metadata_to_drop]) + + # Log it and exit. + errstr = \ + "It was not possible to drop the following " \ + "metadata columns from the final data frame: " \ + f"{metadata_to_drop_str}. Error: {e}" + logger.exception(errstr) + sys.exit(errstr) #-----------------------------------------------------------------# @@ -364,7 +480,7 @@ def main(): # Log it and exit. errstr = \ - "It was not possible to save the RNA-seq data " \ - f"in '{output_csv_path}'. Error: {e}" + "It was not possible to save the final data frame in " \ + f"'{output_csv_path}'.
Error: {e}" logger.exception(errstr) sys.exit(errstr) diff --git a/bulkDGD/execs/dgd_get_probability_density.py b/bulkDGD/execs/dgd_get_probability_density.py index 6b6e4ac..8f5dfba 100644 --- a/bulkDGD/execs/dgd_get_probability_density.py +++ b/bulkDGD/execs/dgd_get_probability_density.py @@ -198,10 +198,11 @@ def main(): handlers = \ util.get_handlers(\ log_console = log_console, + log_console_level = log_level, log_file_class = log.FileHandler, log_file_options = {"filename" : log_file, "mode" : "w"}, - log_level = log_level) + log_file_level = log_level) # Set the logging configuration. log.basicConfig(level = log_level, diff --git a/bulkDGD/execs/dgd_get_recount3_data.py b/bulkDGD/execs/dgd_get_recount3_data.py index 89abe01..1609bad 100755 --- a/bulkDGD/execs/dgd_get_recount3_data.py +++ b/bulkDGD/execs/dgd_get_recount3_data.py @@ -62,12 +62,7 @@ def main(): # Add the arguments. ib_help = \ - "A CSV file to download samples' data in bulk. The file " \ - "must contain at least two columns: 'input_project_name' " \ - "with the name of the project the samples belong to and " \ - "'input_samples_category' with the samples' category. " \ - "A third column, 'query_string', may specify the query " \ - "string used to filter each batch of samples." + "A CSV or a YAML file used to download samples' data in bulk." parser.add_argument("-i", "--input-samples-batches", type = str, default = None, @@ -100,11 +95,11 @@ def main(): "Save the original GZ file containing the RNA-seq " \ "data for the samples. For each batch of samples, "\ "the corresponding file will be saved in the " \ - "working directory and named '{input_project_name}_" \ - "{input_samples_category}_gene_sums.gz'. This file " \ + "working directory and named '{recount3_project_name}_" \ + "{recount3_samples_category}_gene_sums.gz'. This file " \ "will be written only once if more than one batch " \ - "refers to the same 'input_project_name' " \ - "and 'input_samples_category'." + "refers to the same 'recount3_project_name' " \ + "and 'recount3_samples_category'." parser.add_argument("-sg", "--save-gene-sums", action = "store_true", help = sg_help) @@ -115,11 +110,11 @@ def main(): "Save the original GZ file containing the metadata " \ "for the samples. For each batch of samples, "\ "the corresponding file will be saved in the " \ - "working directory and named '{input_project_name}_" \ - "{input_samples_category}_metadata.gz'. This file will " \ + "working directory and named '{recount3_project_name}_" \ + "{recount3_samples_category}_metadata.gz'. This file will " \ "be written only once if more than one batch refers " \ - "to the same 'input_project_name' and " \ - "'input_samples_category'." + "to the same 'recount3_project_name' and " \ + "'recount3_samples_category'." parser.add_argument("-sm", "--save-metadata", action = "store_true", help = sm_help) @@ -203,9 +198,11 @@ def main(): handlers = \ util.get_handlers(\ log_console = log_console, + log_console_level = log_level, log_file_class = loghandlers.RotatingFileHandler, - log_file_options = {"filename" : log_file}, - log_level = log_level) + log_file_options = {"filename" : log_file, + "mode" : "w"}, + log_file_level = log_level) # Set the logging configuration. 
log.basicConfig(level = log_level, @@ -242,7 +239,7 @@ def main(): try: df = recount3.load_samples_batches(\ - csv_file = input_samples_batches) + samples_file = input_samples_batches) # If something went wrong except Exception as e: @@ -269,13 +266,25 @@ def main(): for num_batch, row in enumerate(df.itertuples(index = False), 1): # Get the name of the project. - project_name = row.input_project_name + project_name = row.recount3_project_name # Get the samples' category. - samples_category = row.input_samples_category + samples_category = row.recount3_samples_category - # Get the query string. - query_string = row.query_string + # Get the query string, if provided. + query_string = \ + row.query_string \ + if hasattr(row, "query_string") else None + + # Get the columns to keep, if provided. + metadata_to_keep = \ + row.metadata_to_keep \ + if hasattr(row, "metadata_to_keep") else None + + # Get the columns to drop, if provided. + metadata_to_drop = \ + row.metadata_to_drop \ + if hasattr(row, "metadata_to_drop") else None #-------------------------------------------------------------# @@ -322,10 +331,24 @@ def main(): "-lf", log_file_path] # If the user passed a query string for the current batch - if not pd.isna(query_string): + if query_string is not None and query_string != "": # Add the query string option to the list of arguments. - args.extend(["-qs", query_string]) + args.extend(["-qs", str(query_string)]) + + # If the user passed the metadata columns to keep + if metadata_to_keep is not None and metadata_to_keep != "": + + # Add the option to keep only selected metadata columns to + # the list of arguments. + args.extend(["-mk", str(metadata_to_keep)]) + + # If the user passed the metadata columns to drop + if metadata_to_drop is not None and metadata_to_drop != "": + + # Add the option to drop selected metadata columns to the + # list of arguments. + args.extend(["-md", str(metadata_to_drop)]) # If the user wants to save the original 'gene_sums' files if save_gene_sums: diff --git a/bulkDGD/execs/dgd_get_representations.py b/bulkDGD/execs/dgd_get_representations.py index 7e21e93..71f7c8e 100644 --- a/bulkDGD/execs/dgd_get_representations.py +++ b/bulkDGD/execs/dgd_get_representations.py @@ -216,10 +216,11 @@ def main(): handlers = \ util.get_handlers(\ log_console = log_console, + log_console_level = log_level, log_file_class = log.FileHandler, log_file_options = {"filename" : log_file, "mode" : "w"}, - log_level = log_level) + log_file_level = log_level) # Set the logging configuration. log.basicConfig(level = log_level, diff --git a/bulkDGD/execs/dgd_perform_dea.py b/bulkDGD/execs/dgd_perform_dea.py index d5bd0a5..d47d407 100644 --- a/bulkDGD/execs/dgd_perform_dea.py +++ b/bulkDGD/execs/dgd_perform_dea.py @@ -255,9 +255,10 @@ def main(): handlers = \ util.get_handlers(\ log_console = log_console, + log_console_level = log_level, log_file_class = loghandlers.RotatingFileHandler, log_file_options = {"filename" : log_file}, - log_level = log_level) + log_file_level = log_level) # Set the logging configuration. 
log.basicConfig(level = log_level, diff --git a/bulkDGD/execs/dgd_perform_pca.py b/bulkDGD/execs/dgd_perform_pca.py index 3dcd641..ee85c6e 100644 --- a/bulkDGD/execs/dgd_perform_pca.py +++ b/bulkDGD/execs/dgd_perform_pca.py @@ -202,10 +202,11 @@ def main(): handlers = \ util.get_handlers(\ log_console = log_console, + log_console_level = log_level, log_file_class = log.FileHandler, log_file_options = {"filename" : log_file, "mode" : "w"}, - log_level = log_level) + log_file_level = log_level) # Set the logging configuration. log.basicConfig(level = log_level, diff --git a/bulkDGD/execs/dgd_preprocess_samples.py b/bulkDGD/execs/dgd_preprocess_samples.py index dea69ed..c181607 100644 --- a/bulkDGD/execs/dgd_preprocess_samples.py +++ b/bulkDGD/execs/dgd_preprocess_samples.py @@ -186,10 +186,11 @@ def main(): handlers = \ util.get_handlers(\ log_console = log_console, + log_console_level = log_level, log_file_class = log.FileHandler, log_file_options = {"filename" : log_file, "mode" : "w"}, - log_level = log_level) + log_file_level = log_level) # Set the logging configuration. log.basicConfig(level = log_level, diff --git a/bulkDGD/execs/dgd_train.py b/bulkDGD/execs/dgd_train.py index 5ea5e41..71bf8a0 100644 --- a/bulkDGD/execs/dgd_train.py +++ b/bulkDGD/execs/dgd_train.py @@ -235,10 +235,11 @@ def main(): handlers = \ util.get_handlers(\ log_console = log_console, + log_console_level = log_level, log_file_class = log.FileHandler, log_file_options = {"filename" : log_file, "mode" : "w"}, - log_level = log_level) + log_file_level = log_level) # Set the logging configuration. log.basicConfig(level = log_level, diff --git a/bulkDGD/ioutil/configs/model/readme.md b/bulkDGD/ioutil/configs/model/model_untrained.yaml similarity index 63% rename from bulkDGD/ioutil/configs/model/readme.md rename to bulkDGD/ioutil/configs/model/model_untrained.yaml index 76646d7..af65422 100644 --- a/bulkDGD/ioutil/configs/model/readme.md +++ b/bulkDGD/ioutil/configs/model/model_untrained.yaml @@ -1,20 +1,3 @@ -# `configs/model` - -Last updated: 12/05/2024 - -## `model.yaml` - -This is an example of a YAML configuration file containing the configuration for the DGD model and used by `dgd_get_representations`, `dgd_perform_dea`, and `dgd_get_probability_desity` (`-cm`, `--config-file-model` option). - -The provided configuration file is compatible with the parameters used in the trained deep generative decoder (not uploaded on GitHub because of its size; it can be found [here](https://drive.google.com/file/d/1SZaoazkvqZ6DBF-adMQ3KRcy4Itxsz77/view?usp=sharing)) and the Gaussian mixture model (`bulkDGD/ioutil/data/gmm.pth`). - -Suppose you want to change the architectures of the model components. In that case, you need to retrain the different components of the model, provide the corresponding PyTorch files, and update the given configuration file accordingly. - -The configuration can be loaded using the `bulkDGD.ioutil.load_config_model` function. - -The configuration file has the following structure: - -```yaml # Configuration file containing the configuration for the full DGD # model. @@ -22,23 +5,9 @@ The configuration file has the following structure: ####################### GAUSSIAN MIXTURE MODEL ######################## -# Set the PyTorch file containing the parameters of the trained GMM. -# -# Make sure that the file contains a GMM whose parameters fit the -# architecture specified in the 'options' section. -# -# Type: str. -# -# Default: 'default'. 
-gmm_pth_file: default - -#---------------------------------------------------------------------# - # Set the dimensionality of the Gaussian mixture model. # # Type: int. -# -# Default: 50. dim: 50 #---------------------------------------------------------------------# @@ -46,8 +15,6 @@ dim: 50 # Set the number of components in the Gaussian mixture model. # # Type: int. -# -# Default: 45. n_comp: 45 #---------------------------------------------------------------------# @@ -60,8 +27,6 @@ n_comp: 45 # - 'fixed' for a fixed covariance matrix. # - 'isotropic' for an isotropic covariance matrix. # - 'diagonal' for a diagonal covariance matrix. -# -# Default: 'diagonal'. cm_type: diagonal #---------------------------------------------------------------------# @@ -73,8 +38,6 @@ cm_type: diagonal # # Options: # - 'softball' for a softball distribution. -# -# Default: 'softball'. means_prior_name: softball # Set the options to set up the prior distribution (they vary according @@ -86,15 +49,11 @@ means_prior_options: # Set the radius of the soft ball. # # Type: int. - # - # Default: 7. radius: 7 # Set the sharpness of the soft boundary of the ball. # # Type: int. - # - # Default: 10. sharpness: 10 #---------------------------------------------------------------------# @@ -106,8 +65,6 @@ means_prior_options: # # Options: # - 'dirichlet' for a Dirichlet distribution. -# -# Default: 'dirichlet'. weights_prior_name: dirichlet # Set the options to set up the prior (they vary according to the prior @@ -121,8 +78,6 @@ weights_prior_options: # model. # # Type: int. - # - # Default: 5. alpha: 5 #---------------------------------------------------------------------# @@ -134,8 +89,6 @@ weights_prior_options: # # Options: # - 'gaussian' for a Gaussian distribution. -# -# Default: 'gaussian'. log_var_prior_name: gaussian # Set the options to set up the prior (they vary according to the prior @@ -148,39 +101,20 @@ log_var_prior_options: # 2 * log(mean). # # Type: float. - # - # Default: 0.1. mean : 0.1 # Set the standard deviation of the Gaussian distribution. # # Type: float. - # - # Default: 1.0. stddev: 1.0 ############################### DECODER ############################### -# Set the PyTorch file containing the parameters of the trained -# decoder. -# -# Make sure that the file contains a decoder whose parameters -# fit the architecture specified in the 'options' section. -# -# Type: str. -# -# Default: 'default'. -dec_pth_file: default - -#---------------------------------------------------------------------# - # Set the number of units in the hidden layers. # # Type: list of int. -# -# Default: [500, 8000]. n_units_hidden_layers: [500, 8000] #---------------------------------------------------------------------# @@ -189,8 +123,6 @@ n_units_hidden_layers: [500, 8000] # modeling the output layer. # # Type: int. -# -# Default: 2. r_init: 2 #---------------------------------------------------------------------# @@ -203,8 +135,6 @@ r_init: 2 # Options: # - 'sigmoid' for a sigmoid function. # - 'softplus' for a softplus function. -# -# Default: 'softplus'. activation_output: softplus @@ -215,9 +145,4 @@ activation_output: softplus # DGD model. # # Type: str. -# -# Default: 'default'. 
-genes_txt_file: default - -``` - +genes_txt_file: custom_genes.txt diff --git a/bulkDGD/ioutil/configs/plot/readme.md b/bulkDGD/ioutil/configs/plot/readme.md deleted file mode 100644 index b3685a5..0000000 --- a/bulkDGD/ioutil/configs/plot/readme.md +++ /dev/null @@ -1,23 +0,0 @@ -# `configs/plot` - -Last updated: 12/05/2023 - -## `r_values_hist` - -This is an example of a configuration file describing the aesthetics of a histogram representing the distribution of a set of r-values corresponding to different negative binomial distributions. The file also contains the options specifying the output file format where the plot will be saved. A comment line above each option describes it. - -The configuration in this file can be loaded with the `bulkDGD.ioutil.get_config_plot` function and then passed to the `bulkDGD.plotting.plot_r_values_hist` function, which generates the histogram. - -## `pca_scatter` - -This is an example of a configuration file describing the aesthetics of a scatter plot displaying the results of a two-dimensional principal component analysis (PCA). The file also contains the options specifying the output file format where the plot will be saved. A comment line above each option describes it. - -The configuration in this file can be loaded with the `bulkDGD.ioutil.get_config_plot` function and then passed to the `bulkDGD.plotting.plot_2d_pca` function, which generates the scatter plot. - -This is the default configuration file used by `dgd_perform_pca`. - -## `time_line.yaml` - -This is an example of a configuration file describing the aesthetics of a line plot displaying the CPU/wall clock time spent in each epoch of each round of optimization performed when finding the best representations for a set of samples. The file also contains the options specifying the output file format where the plot will be saved. A comment line above each option describes it. - -The configuration in this file can be loaded with the `bulkDGD.ioutil.get_config_plot` function and then passed to the `bulkDGD.plotting.plot_get_representations_time` function, which generates the line plot. \ No newline at end of file diff --git a/bulkDGD/ioutil/configs/representations/readme.md b/bulkDGD/ioutil/configs/representations/readme.md deleted file mode 100644 index b20e20d..0000000 --- a/bulkDGD/ioutil/configs/representations/readme.md +++ /dev/null @@ -1,209 +0,0 @@ -# `configs/representations` - -Last updated: 12/05/2024 - -## `one_opt.yaml` - -This is an example of a YAML configuration file used for `dgd_get_representations` (`-cr`, `--config-file-rep` option) to get the representations using only one round of optimization. - -The configuration can be loaded using the `bulkDGD.ioutil.load_config_rep` function. - -The configuration file has the following structure: - -```yaml -# Configuration file specifying the options to get the representations -# with only one round of optimization. - - -####################################################################### - - -# Set how many representations to initialize per sample per component -# of the Gaussian mixture model. -# -# Type: int. -# -# Default: 1. -n_rep_per_comp: 1 - - -####################################################################### - - -# Set the options for the optimization. -optimization: - - # Set the number of epochs. - # - # Type: int. - # - # Default: 60. - epochs: 60 - - #-------------------------------------------------------------------# - - # Set the optimizer to be used. - # - # Type: str. 
- # - # Options: - # - 'adam' for the Adam optimizer. - # - # Default: 'adam'. - optimizer_name: adam - - # Set the options for the optimizer (they vary according to the - # optimizer defined by 'optimizer_name'). - optimizer_options: - - # Set these options if 'optimizer_name' is 'adam'. - - # Set the learning rate. - # - # Type: float. - # - # Default: 0.01. - lr: 0.01 - - # Set the weight decay. - # - # Type: float. - # - # Default: 0.0. - weight_decay: 0.0 - - # Set the betas. - # - # Type: list of float. - # - # Default: [0.5, 0.9]. - betas: [0.5, 0.9] -``` - -## `two_opt.yaml` - -This is an example of a YAML configuration file used for `dgd_get_representations` (`-cr`, `--config-file-rep` option) to get the representations using two rounds of optimization. - -The configuration can be loaded using the `bulkDGD.ioutil.load_config_rep` function. - -The configuration file has the following structure: - -```yaml -# Configuration file specifying the options to get the representations -# with two rounds of optimization. - - -####################################################################### - - -# Set how many representations to initialize per sample per component -# of the Gaussian mixture model. -# -# Type: int. -# -# Default: 1. -n_rep_per_comp: 1 - - -####################################################################### - - -# Set the options for the optimizations. -optimization: - - # Set the options for the first optimization round. - opt1: - - # Set the number of epochs. - # - # Type: int. - # - # Default: 10. - epochs: 10 - - # Set the optimizer to be used. - # - # Type: str. - # - # Options: - # - 'adam' for the Adam optimizer. - # - # Default: 'adam'. - optimizer_name: adam - - # Set the options for the optimizer (they vary according to the - # optimizer defined by 'optimizer_name'). - optimizer_options: - - # Set these options if 'optimizer_name' is 'adam'. - - # Set the learning rate. - # - # Type: float. - # - # Default: 0.01. - lr: 0.01 - - # Set the weight decay. - # - # Type: float. - # - # Default: 0.0 - weight_decay: 0.0 - - # Set the betas. - # - # Type: list of float. - # - # Default: [0.5, 0.9]. - betas: [0.5, 0.9] - - #-------------------------------------------------------------------# - - # Set the options for the second optimization round. - opt2: - - # Set the number of epochs. - # - # Type: int. - # - # Default: 50. - epochs: 50 - - # Set the optimizer to be used. - # - # Type: str. - # - # Options: - # - 'adam' for the Adam optimizer. - # - # Default: 'adam'. - optimizer_name: adam - - # Set the options for the optimizer (they vary according to the - # optimizer defined by 'optimizer_name'). - optimizer_options: - - # Set these options if 'optimizer_name' is 'adam'. - - # Set the learning rate. - # - # Type: float. - # - # Default: 0.01. - lr: 0.01 - - # Set the weight decay. - # - # Type: float. - # - # Default: 0.0 - weight_decay: 0.0 - - # Set the betas. - # - # Type: list of float. - # - # Default: [0.5, 0.9]. - betas: [0.5, 0.9] -``` diff --git a/bulkDGD/recount3/data/sra_metadata_fields.txt b/bulkDGD/recount3/data/sra_metadata_fields.txt index a631efc..2d0fdb7 100644 --- a/bulkDGD/recount3/data/sra_metadata_fields.txt +++ b/bulkDGD/recount3/data/sra_metadata_fields.txt @@ -1,4 +1,6 @@ -# Fields found in the SRA metadata files downloaded from the Recount3 platform. +# Recount3-specific metadata fields found in the SRA metadata files +# downloaded from the Recount3 platform. Other study-specific metadata +# fields may be present in SRA studies. 
rail_id external_id @@ -40,4 +42,4 @@ read_info run_alias run_center_name run_broker_name -run_center \ No newline at end of file +run_center diff --git a/bulkDGD/recount3/util.py b/bulkDGD/recount3/util.py index 55772d2..db0ba8c 100755 --- a/bulkDGD/recount3/util.py +++ b/bulkDGD/recount3/util.py @@ -45,6 +45,7 @@ # Import from third-party packages. import pandas as pd import requests as rq +import yaml # Import from 'bulkDGD'. from . import defaults @@ -99,66 +100,63 @@ def _get_metadata_fields(project_name, #-------------------------------------------------------------# - # Initialize an empty set to store each sample's attributes. - samples_attributes = set() + # For each entity that may have attributes + for entity in ["sample", "experiment"]: - # If: - # - The project is 'sra'. - # - A data frame was passed. - # - There is a 'sample_attributes' column in the data frame. - if project_name == "sra" \ - and df is not None \ - and "sample_attributes" in df.columns: + # Initialize an empty set to store the entity's attributes. + attributes = set() - # Add the names of the samples' attributes to the set. - samples_attributes.add(\ - [item.split(";;")[0].replace(" ", "_") for item \ - in df["sample_attributes"].split("|")]) + # Set the name of the column that may contain the + # attributes. + column_attrs = f"{entity}_attributes" - #-------------------------------------------------------------# - - # Return all the metadata fields found. - return metadata_fields + sorted(samples_attributes) + # If: + # - The project is 'sra'. + # - A data frame was passed. + # - There is an '{entity}_attributes' column in the data + # frame. + if project_name == "sra" \ + and df is not None \ + and column_attrs in df.columns: + # Add the names of the entity's attributes to the set. + for attr_str in df[column_attrs].dropna(): + attributes.update(\ + item.split(";;")[0].replace(" ", "_") \ + for item in str(attr_str).split("|")) -########################## PUBLIC FUNCTIONS ########################### + # Add the attributes to the list of metadata fields + # found. + metadata_fields += sorted(attributes) #-------------------------------------------------------------# -def load_samples_batches(csv_file): - """Load a comma-separated CSV file containing a data frame with - information about the batches of samples to be downloaded from - Recount3. - - The data frame is expected to have at least two columns: - - * ``"input_project_name"``, containing the name of the project - the samples belong to. - * ``"input_samples_category"``, containing the name of the - category the samples belong to. + # Return all the metadata fields found. + return metadata_fields - A third column, ``"query_string"``, may be present. This should - contain the query string that should be used to filter each batch - of samples by their metadata. - If no ``"query_string"`` column is present, the samples will not - be filtered. +def _load_samples_batches_csv(csv_file): + """Load the information for batches of samples to be downloaded + from the Recount3 platform from a CSV file. Parameters ---------- - csv_file : ``str`` - The input CSV file. + csv_file : ``str`` + The CSV file. Returns ------- df : ``pandas.DataFrame`` - The data frame parsed from the CSV file. + A data frame containing the information for the batches of + samples. """ # Set the columns taken into consideration in the data frame.
supported_columns = \ - ["input_project_name", - "input_samples_category", - "query_string"] + ["recount3_project_name", + "recount3_samples_category", + "query_string", + "metadata_to_keep", + "metadata_to_drop"] #-----------------------------------------------------------------# @@ -167,12 +165,12 @@ def load_samples_batches(csv_file): sep = ",", header = 0, comment = "#", - index_col = False) + index_col = False).fillna("") #-----------------------------------------------------------------# # For each required column - for col in ["input_project_name", "input_samples_category"]: + for col in ["recount3_project_name", "recount3_samples_category"]: # If it does not exist if col not in df.columns: @@ -186,13 +184,13 @@ def load_samples_batches(csv_file): #-----------------------------------------------------------------# # For each project found in the data frame - for project_name in df["input_project_name"].unique(): + for project_name in df["recount3_project_name"].unique(): # Get the unique samples' categories found for that project # in the data frame. unique_samples_categories = \ - df.loc[df["input_project_name"] == project_name][\ - "input_samples_category"].unique() + df.loc[df["recount3_project_name"] == project_name][\ + "recount3_samples_category"].unique() # For each samples' category for samples_category in unique_samples_categories: @@ -231,6 +229,291 @@ def load_samples_batches(csv_file): return df +def _load_samples_batches_yaml(yaml_file): + """Load the information for batches of samples to be downloaded + from the Recount3 platform from a YAML file. + + Parameters + ---------- + yaml_file : ``str`` + The YAML file. + + Returns + ------- + df : ``pandas.DataFrame`` + A data frame containing the information for the batches of + samples. + """ + + # Set the columns that the final data frame will have. + columns = \ + ["recount3_project_name", + "recount3_samples_category", + "query_string", + "metadata_to_keep", + "metadata_to_drop"] + + #-----------------------------------------------------------------# + + # Set an empty list to store the data for the final data frame. + data = [] + + #-----------------------------------------------------------------# + + # Load the batches of samples. + samples_batches = yaml.safe_load(open(yaml_file, "r")) + + #-----------------------------------------------------------------# + + # For each Recount3 project's name. + for project_name in samples_batches: + + # Get the conditions that apply to all samples' categories + # for the project. + conditions_all = samples_batches[project_name].pop("all", {}) + + # Get the query string to filter all samples belonging to + # the project. + qs = conditions_all.get("query_string", "") + + # Get the metadata columns to be kept in all samples belonging + # to the project. + mtk = \ + "|".join(conditions_all.get("metadata_to_keep", [])) + + # Get the metadata columns to be dropped from all samples + # belonging to the project. + mtd = \ + "|".join(conditions_all.get("metadata_to_drop", [])) + + # For each category of samples in the project + for samples_category in samples_batches[project_name]: + + # Get the data for the samples belonging to the category. + samples_data = \ + samples_batches[project_name][samples_category] + + # Add each piece of data to the final list. 
+ data.append(\ + {"recount3_project_name" : \ + project_name, + "recount3_samples_category" : \ + samples_category, + "query_string" : \ + samples_data.get("query_string", qs), + "metadata_to_keep" : \ + "|".join(filter(None, + [mtk, + "|".join(samples_data.get(\ + "metadata_to_keep", []))])), + "metadata_to_drop" : \ + "|".join(filter(None, + [mtd, + "|".join(samples_data.get(\ + "metadata_to_drop", []))]))}) + + #-----------------------------------------------------------------# + + # Create the final data frame from the list. + df = pd.DataFrame(data, + columns = columns) + + #-----------------------------------------------------------------# + + # Return the data frame. + return df + + +########################## PUBLIC FUNCTIONS ########################### + + +def load_samples_batches(samples_file): + """Load a file with information about the batches of samples to be + downloaded from Recount3. + + The file can be either a CSV file or a YAML file. + + See the Notes section below for more details about their format. + + Parameters + ---------- + samples_file : ``str`` + The input file. + + Returns + ------- + df : ``pandas.DataFrame`` + A data frame containing the information parsed from the + file. + + Notes + ----- + **CSV file** + + If the input file is a CSV file, it should contain a + comma-separated data frame. + + The data frame is expected to have at least two columns: + + * ``"recount3_project_name"``, containing the name of the project + the samples belong to. + * ``"recount3_samples_category"``, containing the name of the + category the samples belong to (it is a tissue type for GTEx + data, a cancer type for TCGA data, and a project code for SRA + data). + + Three additional columns may also be present: + + * ``"query_string"``, containing the query string that should be + used to filter each batch of samples by their metadata. The + query string is passed to the ``pandas.DataFrame.query()`` + method. + + If no ``"query_string"`` column is present, the samples will not + be filtered. + + * ``metadata_to_keep``, containing a vertical line (|)-separated + list of names of metadata columns that will be kept in the + final data frames, together with the columns containing gene + expression data. + + ``"recount3_project_name"`` and ``"recount3_samples_category"`` + are valid column names, and, if passed, the final data frames + will also include them (each data frame will, of course, contain + only one repeated value for each of these columns, since it + contains samples from a single category of a single project). + + By default, all metadata columns (plus the + ``"recount3_project_name"`` and ``"recount3_samples_category"`` + columns) are kept in the final data frames. + + * ``metadata_to_drop``, containing a vertical line (|)-separated + list of names of metadata columns that will be dropped from the + final data frames. + + The reserved keyword ``'_all_'`` can be used to drop all metadata + columns from the final data frame of a specific batch of samples. + + ``"recount3_project_name"`` and ``"recount3_samples_category"`` + are valid column names and, if passed, will result in these + columns being dropped. + + **YAML file** + + If the file is a YAML file, it should have the format exemplified + below. We recommend using a YAML file over a CSV file when you have + several studies for which different filtering conditions should be + applied. + + .. code-block:: yaml + + # SRA studies - it can be omitted if no SRA studies are + # included. + sra: + + # Conditions applied to all SRA studies.
+ all: + + # Which metadata to keep in all studies (if found). It is + # a list of names of metadata columns that will be kept in + # the final data frames, together with the columns + # containing gene expression data. + # + # "recount3_project_name" and "recount3_samples_category" + # are valid column names, and, if passed, the final data + # frames will also include them (each data frame will, of + # course, contain only one repeated value for each of these + # columns, since it contains samples from a single category + # of a single project). + # + # By default, all metadata columns (plus the + # "recount3_project_name" and "recount3_samples_category" + # columns) are kept in the final data frames. + metadata_to_keep: + + # Keep in all studies. + - source_name + ... + + # Which metadata to drop from all studies (if found). It is + # a list of names of metadata columns that will be dropped + # from the final data frames. + # + # The reserved keyword '_all_' can be used to drop all + # metadata columns from the data frames. + # + # "recount3_project_name" and "recount3_samples_category" + # are valid column names and, if passed, will result in + # these columns being dropped. + metadata_to_drop: + + # Found in all studies. + - age + ... + + # Conditions applied to SRA study SRP179061. + SRP179061: + + # The query string that should be used to filter each batch + # of samples by their metadata. The query string is passed + # to the 'pandas.DataFrame.query()' method for filtering. + + # If no query string is present, the samples will not + # be filtered. + query_string: diagnosis == 'Control' + + # Which metadata to keep in this study (if found). It + # follows the same rules as the 'metadata_to_keep' field + # in the 'all' section. + metadata_to_keep: + - tissue + + # Which metadata to drop from this study (if found). It + # follows the same rules as the 'metadata_to_drop' field + # in the 'all' section. + metadata_to_drop: + - Sex + + # GTEx studies - it can be omitted if no GTEx studies are + # included. + gtex: + + # Same format as for SRA studies - single studies are + # identified by the tissue type each study refers to. + ... + + # TCGA studies - it can be omitted if no TCGA studies are + # included. + tcga: + + # Same format as for SRA studies - single studies are + # identified by the cancer type each study refers to. + ... + + """ + + # Get the extension of the input file. + _, samples_file_ext = os.path.splitext(samples_file) + + # If the file is a CSV file + if samples_file_ext == ".csv": + + # Create the data frame from the file and return it. + return _load_samples_batches_csv(csv_file = samples_file) + + # If the file is a YAML file + elif samples_file_ext == ".yaml": + + # Create the data frame from the file and return it. + return _load_samples_batches_yaml(yaml_file = samples_file) + + # Otherwise + else: + + # Raise an error. + errstr = \ + f"The file '{samples_file}' must be either a CSV file " \ + "('.csv' extension) or a YAML file ('.yaml' extension)." + raise ValueError(errstr) + + def check_samples_category(samples_category, project_name): """Check that the category of samples requested by the user is @@ -533,6 +816,14 @@ def get_metadata(project_name, df_metadata : ``pandas.DataFrame`` A data frame containing the metadata for the samples associated with the given category.
+ + Notes + ----- + The ``"recount3_project_name"`` and the + ``"recount3_samples_category"`` columns are automatically added to + the metadata returned by the function and contain the + ``project_name`` and ``samples_category`` of the samples, + respectively. """ # Set the name of the file that will contain the metadata. @@ -571,6 +862,16 @@ def get_metadata(project_name, compression = "gzip", low_memory = False) + # Add the column containing the project's name. + df_metadata.insert(loc = 0, + column = "recount3_project_name", + value = project_name) + + # Add the column containing the samples' category. + df_metadata.insert(loc = 1, + column = "recount3_samples_category", + value = samples_category) + # Return the data frame. return df_metadata @@ -652,67 +953,93 @@ def get_metadata(project_name, #-----------------------------------------------------------------# - # If there is no 'sample_attributes' column in the data frame - if "sample_attributes" not in df_metadata.columns: + # Add the column containing the project's name. + df_metadata.insert(loc = 0, + column = "recount3_project_name", + value = project_name) - # Simply return the data frame as it is. - return df_metadata + # Add the column containing the samples' category. + df_metadata.insert(loc = 1, + column = "recount3_samples_category", + value = samples_category) #-----------------------------------------------------------------# - - # Inform the string that samples' attributes were found. - infostr = \ - "Samples' attributes were found in the metadata (see below)." - logger.info(infostr) - # Define a function to parse the 'sample_attribute' column in the - # metadata. - parse_sample_attributes = \ - lambda attr_str: dict(\ - (item.split(";;")[0].replace(" ", "_"), - item.split(";;")[1]) \ - for item in attr_str.split("|")) - - # Parse the samples' attributes from the data frame and covert - # them into a DataFrame. - df_sample_attrs = \ - df_metadata["sample_attributes"].apply(\ - parse_sample_attributes).apply(pd.Series) - - # For each attribute - for col in df_sample_attrs.columns: - - # Get a string representing the unique values found in the - # column. - unique_values_str = \ - ", ".join([f"'{val}'" \ - for val in df_sample_attrs[col].unique()]) - - # Log the attribute and its unique values. - infostr = \ - f"Sample attribute '{col}' found. Unique values: " \ - f"{unique_values_str}." - logger.info(infostr) + # For each entity that may have attributes + for entity in ["sample", "experiment"]: - #-----------------------------------------------------------------# + # Set the name of the column that may contain the attributes. + column_attrs = f"{entity}_attributes" - # Get the standard metadata fields. - metadata_fields = \ - _get_metadata_fields(project_name = project_name) + # If the column exists in the data frame containing the + # metadata + if column_attrs in df_metadata.columns: - # Get any attributes that are already found in metadata. - attrs_to_drop = \ - df_sample_attrs.columns[\ - [col_name in df_metadata.columns \ - for col_name in df_sample_attrs.columns]] + #---------------------------------------------------------# + + # Inform the user that attributes were found. + infostr = \ + f"{entity.capitalize()}s' attributes were found in " \ + "the metadata (see below)." + logger.info(infostr) - # Drop them from the data frame of attributes. 
- df_sample_attrs = df_sample_attrs.drop(attrs_to_drop, + #---------------------------------------------------------# + + # Define a function to parse the attributes column in the + # metadata. + parse_attributes = \ + lambda attr_str: dict(\ + (item.split(";;")[0].replace(" ", "_"), + item.split(";;")[1]) \ + for item in str(attr_str).split("|") \ + if item != "nan") + + # Parse the attributes from the data frame and convert + # them into a data frame. + df_attrs = \ + df_metadata[column_attrs].apply(\ + parse_attributes).apply(pd.Series) + + # For each attribute + for col in df_attrs.columns: + + # Get a string representing the unique values found in + # the column. + unique_values_str = \ + ", ".join(\ + [f"'{v}'" for v in df_attrs[col].unique()]) + + # Log the attribute and its unique values. + infostr = \ + f"{entity.capitalize()} attribute '{col}' found. " \ + f"Unique values: {unique_values_str}." + logger.info(infostr) + + #---------------------------------------------------------# + + # Get the standard metadata fields. + metadata_fields = \ + _get_metadata_fields(project_name = project_name) + + # Get any attributes that are already found in the + # metadata. + attrs_to_drop = \ + df_attrs.columns[\ + [col_name in df_metadata.columns \ + for col_name in df_attrs.columns]] + + # Drop them from the data frame of attributes. + df_attrs = df_attrs.drop(labels = attrs_to_drop, + axis = 1) + + # Add the metadata columns to the data frame of metadata. + df_metadata = df_metadata.join(df_attrs) + + # Drop the original column from the data frame containing + # the metadata. + df_metadata = df_metadata.drop(labels = [column_attrs], axis = 1) - # Add the metadata columns to the data frame of metadata. - df_metadata = df_metadata.join(df_sample_attrs) - #-----------------------------------------------------------------# # If the user wants to save the metadata @@ -721,8 +1048,9 @@ def get_metadata(project_name, # Inform the user that the updated metadata will be saved # in a separate file. infostr = \ - "The metadata with the 'sample_attributes' split into " \ - "different columns will be saved in a separate file." + "The metadata with the sample/experiment attributes " \ + "split into different columns will be saved in a " \ + "separate file." logger.info(infostr) # Set the name of the file that will contain the metadata. @@ -750,7 +1078,7 @@ # Inform the user that the file was written. infostr = \ - "The metadata with the 'sample_attributes' column " \ + "The metadata with the sample/experiment attributes " \ "split into different columns were successfully " \ f"written in '{f_metadata_path}'." logger.info(infostr) @@ -762,8 +1090,7 @@ def merge_gene_sums_and_metadata(df_gene_sums, - df_metadata, - project_name): + df_metadata): """Add the metadata for samples deposited in the Recount3 platform. Parameters ---------- @@ -774,9 +1101,6 @@ def merge_gene_sums_and_metadata(df_gene_sums, df_metadata : ``pandas.DataFrame`` The data frame containing the metadata for the samples. - project_name : ``str``, {``"gtex"``, ``"tcga"``, ``"sra"``} - The name of the project of interest. - Returns ------- df_merged : ``pandas.DataFrame`` The data frame containing the RNA-seq counts and the metadata for the samples. """ - # Add the metadata to the original data frame. + # Add the metadata to the original data frame. Drop duplicated + # columns.
df_final = pd.concat(objs = [df_gene_sums, df_metadata], axis = 1) @@ -795,8 +1120,7 @@ def filter_by_metadata(df, - query_string, - project_name): + query_string): """Filter samples using the associated metadata. Parameters ---------- @@ -808,14 +1132,10 @@ def filter_by_metadata(df, query_string : ``str`` A string to query the data frame with. - project_name : ``str``, {``"gtex"``, ``"tcga"``, ``"sra"``} - The name of the project of interest. - Returns ------- df_filtered : ``pandas.DataFrame`` - The filtered data frame. This data frame will only contain the - RNA-seq counts (no metadata). + The filtered data frame. """ # Filter the data frame based on the query string. @@ -823,22 +1143,5 @@ #-----------------------------------------------------------------# - # Get the fields containing metadata. - metadata_fields = \ - [col for col in df.columns \ - if not col.startswith("ENSG")] - - # If the 'external_id' column is in the metadata - if "external_id" in metadata_fields: - - # Remove the index column from the fields containing metadata. - metadata_fields.remove("external_id") - - # Drop these columns from the data frame. - df = df.drop(metadata_fields, - axis = 1) - - #-----------------------------------------------------------------# - # Return the data frame. return df diff --git a/doc/source/dgd_get_recount3_data.md b/doc/source/dgd_get_recount3_data.md index bcea008..2d90fe5 100644 --- a/doc/source/dgd_get_recount3_data.md +++ b/doc/source/dgd_get_recount3_data.md @@ -6,9 +6,110 @@ So far, the program supports retrieving data for samples from the [GTEx](https:/ The executable allows samples to be selected for a single tissue (for GTEx data), cancer type (for TCGA), or project code (for SRA) and to filter them according to the associated metadata. The filtering is performed using a query string in the format supported by the [`pandas.DataFrame.query()`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html) method. -Metadata fields on which it is possible to filter the samples differ between GTEx, TCGA, and SRA samples. A list of the available fields is available in `bulkDGD/recount3/data/gtex_metadata_fields.txt` for GTEx samples, `bulkDGD/recount3/data/tcga_metadata_fields.txt` for TCGA samples, and `bulkDGD/recount3/data/sra_metadata_fields.txt` for SRA samples. +A list of the available metadata fields/columns can be found in `bulkDGD/recount3/data/gtex_metadata_fields.txt` for GTEx samples, `bulkDGD/recount3/data/tcga_metadata_fields.txt` for TCGA samples, and `bulkDGD/recount3/data/sra_metadata_fields.txt` for SRA samples. More metadata fields may be available for SRA samples depending on the study they refer to. In this case, you can inspect the study's available metadata fields using the [NCBI SRA Run Selector tool](https://www.ncbi.nlm.nih.gov/Traces/study/). However, remember that the SRA Run Selector does not report Recount3-specific metadata, which can be found in the `sra_metadata_fields.txt` file. -The main output of `dgd_get_recount3_data` is a CSV file containing the RNA-seq data retrieved from Recount3 for the samples of interest. The rows represent the samples, while the columns contain the genes, identified by their Ensembl IDs. +`dgd_get_recount3_data` accepts two types of input files specifying the batches of samples to be downloaded from Recount3: + +* A CSV file with a comma-separated data frame.
The data frame is expected to have at least two columns: + + * `"recount3_project_name"`, containing the name of the project (`"gtex"`, `"tcga"`, or `"sra"`) the samples belong to. + * `"recount3_samples_category"`, containing the name of the category the samples belong to (it is a tissue type for GTEx data, a cancer type for TCGA data, and a project code for SRA data). + + Three additional columns may also be present: + + * `"query_string"`, containing the query string that should be used to filter each batch of samples by their metadata. The query string is passed to the [`pandas.DataFrame.query()`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html) method. If no `"query_string"` column is present, the samples will not be filtered. + * `metadata_to_keep`, containing a vertical line (`|`)-separated list of names of metadata columns that will be kept in the final data frames, together with the columns containing gene expression data. `"recount3_project_name"` and `"recount3_samples_category"` are valid column names, and, if passed, the final data frames will also include them (each data frame will, of course, contain only one repeated value for each of these columns, since it contains samples from a single category of a single project). By default, all metadata columns (plus the `"recount3_project_name"` and `"recount3_samples_category"` columns) are kept in the final data frames. + * `metadata_to_drop`, containing a vertical line (`|`)-separated list of names of metadata columns that will be dropped from the final data frames. The reserved keyword `_all_` can be used to drop all metadata columns from the final data frame of a specific batch of samples. `"recount3_project_name"` and `"recount3_samples_category"` are valid column names, and, if passed, will result in these columns being dropped. + +* A YAML file with the format exemplified below. We recommend using a YAML file over a CSV file when you have several studies for which different filtering conditions should be applied. + + ```yaml + # SRA studies - it can be omitted if no SRA studies are + # included. + sra: + + # Conditions applied to all SRA studies. + all: + + # Which metadata to keep in all studies (if found). It is + # a list of names of metadata columns that will be kept in + # the final data frames, together with the columns + # containing gene expression data. + # + # "recount3_project_name" and "recount3_samples_category" + # are valid column names, and, if passed, the final data + # frames will also include them (each data frame will, of + # course, contain only one repeated value for each of these + # columns, since it contains samples from a single category + # of a single project). + # + # By default, all metadata columns (plus the + # "recount3_project_name" and "recount3_samples_category" + # columns) are kept in the final data frames. + metadata_to_keep: + + # Keep in all studies. + - source_name + ... + + # Which metadata to drop from all studies (if found). It is + # a list of names of metadata columns that will be dropped + # from the final data frames. + # + # The reserved keyword '_all_' can be used to drop all + # metadata columns from the data frames. + # + # "recount3_project_name" and "recount3_samples_category" + # are valid column names and, if passed, will result in + # these columns being dropped. + metadata_to_drop: + + # Found in all studies. + - age + ... + + # Conditions applied to SRA study SRP179061.
+
+The main output of `dgd_get_recount3_data` consists of several CSV files (one per batch of samples) containing the RNA-seq data retrieved from Recount3 for the samples of interest. The rows represent the samples, while the columns contain either the genes, identified by their Ensembl IDs, or the samples' metadata.
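+
+Since gene columns are named by Ensembl IDs, they can be told apart from the metadata columns by their prefix. Here is a minimal sketch of how one of the output data frames could be split back into expression data and metadata, assuming a hypothetical output file name and that the samples' IDs are stored in the first column:
+
+```python
+import pandas as pd
+
+# Load the output data frame for one batch of samples (the file
+# name is hypothetical).
+df = pd.read_csv("sra_SRP179061.csv", index_col = 0)
+
+# Gene columns are identified by Ensembl IDs, which start with
+# "ENSG"; every other column contains metadata.
+genes_columns = \
+    [col for col in df.columns if col.startswith("ENSG")]
+metadata_columns = \
+    [col for col in df.columns if not col.startswith("ENSG")]
+
+# Split the data frame into expression data and metadata.
+df_expression = df[genes_columns]
+df_metadata = df[metadata_columns]
+```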
 
 The user also has the option to save the original compressed (`.gz`) files containing the RNA-seq data and the metadata associated with the samples. If these files are found in the working directory for a specific project and sample category, they will not be downloaded again.
@@ -25,11 +126,11 @@ dgd_get_recount3_data [-h] [-is INPUT_SAMPLES_BATCHES] [-d WORK_DIR] [-n N_PROC]
 
 | Option                          | Description                                                  |
 | ------------------------------- | ------------------------------------------------------------ |
 | `-h`, `--help`                  | Show the help message and exit.                              |
-| `-i`, `--input-samples-batches` | A CSV file to download samples' data in bulk. The file must contain at least two columns: `input_project_name` with the name of the project the samples belong to and `input_samples_category` with the samples' category. A third column, `query_string`, may specify the query string used to filter each batch of samples. |
+| `-i`, `--input-samples-batches` | A CSV file or a YAML file used to download samples' data in bulk (see above for the accepted formats). |
 | `-d`, `--work-dir`              | The working directory. The default is the current working directory. |
 | `-n`, `--n-proc`                | The number of processes to start. The default number of processes started is 1. |
-| `-sg`, `--save-gene-sums`       | Save the original GZ file containing the RNA-seq data for the samples. For each batch of samples, the corresponding file will be saved in the working directory and named`{input_project_name}_{input_samples_category}_gene_sums.gz`. This file will be written only once if more than one batch refers to the same `input_project_name` and `input_samples_category`. |
-| `-sm`, `--save-metadata`        | Save the original GZ file containing the metadata for the samples. For each batch of samples, the corresponding file will be saved in the working directory and named `{input_project_name}_{input_samples_category}_metadata.gz`. This file will be written only once if more than one batch refers to the same `input_project_name` and `input_samples_category` |
+| `-sg`, `--save-gene-sums`       | Save the original GZ file containing the RNA-seq data for the samples. For each batch of samples, the corresponding file will be saved in the working directory and named `{recount3_project_name}_{recount3_samples_category}_gene_sums.gz`. This file will be written only once if more than one batch refers to the same `recount3_project_name` and `recount3_samples_category`. |
+| `-sm`, `--save-metadata`        | Save the original GZ file containing the metadata for the samples. For each batch of samples, the corresponding file will be saved in the working directory and named `{recount3_project_name}_{recount3_samples_category}_metadata.gz`. This file will be written only once if more than one batch refers to the same `recount3_project_name` and `recount3_samples_category`. |
 | `-lf`, `--log-file`             | The name of the log file. The file will be written in the working directory. The default file name is `dgd_get_recount3_data.log`. |
 | `-lc`, `--log-console`          | Show log messages also on the console.                       |
 | `-v`, `--logging-verbose`       | Enable verbose logging (INFO level).                         |
diff --git a/setup.py b/setup.py
index aa02563..f6c958c 100644
--- a/setup.py
+++ b/setup.py
@@ -46,12 +46,12 @@
     "Anders Lykkebo-Valløe, Andreas Bjerregaard, Anders Krogh"
 
 # Set the project's version.
-version = "1.0.0"
+version = "1.0.1"
 
 # Set a brief description of the project.
 description = \
     "A generative model for human gene expression from bulk " \
-    "RNA-Seq data."
+    "RNA-seq data."
 
 # Set which packages are included.
 packages = \
@@ -66,13 +66,9 @@
 
 # Set which package data to include.
 package_data = \
     {"bulkDGD.ioutil" : ["configs/model/*.yaml",
-                         "configs/model/*.md",
                          "configs/plot/*.yaml",
-                         "configs/plot/*.md",
                          "configs/representations/*.yaml",
-                         "configs/representations/*.md",
                          "configs/training/*.yaml",
-                         "configs/training/*.md",
                          "data/*.pth",
                          "data/*.txt",
                          "data/*.md",],