From f681cded5712f1b36730068ac68e47cc3a25ee55 Mon Sep 17 00:00:00 2001 From: Erick Armingol Date: Fri, 12 Jul 2024 12:11:10 +0100 Subject: [PATCH] Changed way of subsetting to keep rxns only if all genes present --- sccellfie/preprocessing/prepare_inputs.py | 60 +++++++++++++------ .../tests/test_prepare_inputs.py | 33 +++++++++- 2 files changed, 75 insertions(+), 18 deletions(-) diff --git a/sccellfie/preprocessing/prepare_inputs.py b/sccellfie/preprocessing/prepare_inputs.py index 2f2eb56..637c18e 100644 --- a/sccellfie/preprocessing/prepare_inputs.py +++ b/sccellfie/preprocessing/prepare_inputs.py @@ -1,4 +1,5 @@ import cobra +import re def preprocess_inputs(adata, gpr_info, task_by_gene, rxn_by_gene, task_by_rxn, verbose=True): @@ -56,31 +57,43 @@ def preprocess_inputs(adata, gpr_info, task_by_gene, rxn_by_gene, task_by_rxn, v reactions. Each cell contains ones or zeros, indicating whether a reaction is involved in a metabolic task. ''' + # Initialize GPRs gpr_rules = gpr_info.dropna().set_index('Reaction').to_dict()['GPR-symbol'] - genes = [g for g in rxn_by_gene.columns if (g in task_by_gene.columns) & (g in adata.var_names)] + valid_genes = set() + valid_reactions = set() - task_by_gene = task_by_gene.loc[:, genes] - task_by_gene = task_by_gene.loc[(task_by_gene != 0).any(axis=1)] + # Iterate through GPR rules + for reaction, gpr_rule in gpr_rules.items(): + if reaction in rxn_by_gene.index and reaction in task_by_rxn.columns: + clean_gpr_rule = clean_gene_names(gpr_rule) + genes_in_rule = find_genes_gpr(clean_gpr_rule) + if all(gene in adata.var_names for gene in genes_in_rule): + valid_genes.update(genes_in_rule) + valid_reactions.add(reaction) - rxn_by_gene = rxn_by_gene.loc[:, genes] - rxn_by_gene = rxn_by_gene.loc[(rxn_by_gene != 0).any(axis=1)] + # Filter adata + adata2 = adata[:, sorted(valid_genes)] - gpr_rules = {k: v.replace(' AND ', ' and ').replace(' OR ', ' or ') for k, v in gpr_rules.items() if k in rxn_by_gene.index.tolist()} + # Filter tables + task_by_gene = task_by_gene.loc[:, sorted(valid_genes)] + rxn_by_gene = rxn_by_gene.loc[sorted(valid_reactions), sorted(valid_genes)] + task_by_rxn = task_by_rxn.loc[:, sorted(valid_reactions)] - # Initialize GPRs - gpr_rules = {k: cobra.core.gene.GPR().from_string(gpr) for k, gpr in gpr_rules.items()} + # Remove tasks with no non-zero values + task_by_gene = task_by_gene.loc[(task_by_gene != 0).any(axis=1)] + task_by_rxn = task_by_rxn.loc[(task_by_rxn != 0).any(axis=1)] - tasks = task_by_gene.index.tolist() - rxns = list(gpr_rules.keys()) + # Ensure consistency across all tables + common_tasks = set(task_by_gene.index) & set(task_by_rxn.index) + task_by_gene = task_by_gene.loc[sorted(common_tasks)] + task_by_rxn = task_by_rxn.loc[sorted(common_tasks)] - task_by_rxn = task_by_rxn.loc[tasks, rxns] - task_by_rxn = task_by_rxn.loc[(task_by_rxn != 0).any(axis=1)] + # Update GPR rules + gpr_rules = {k: v for k, v in gpr_rules.items() if k in valid_reactions} - adata2 = adata[:, [True if g in genes else False for g in adata.var_names]] - if hasattr(adata, 'raw'): - if adata.raw is not None: - adata2.raw = adata.raw.to_adata()[:, [True if g in genes else False for g in adata.var_names]] + # Initialize GPRs + gpr_rules = {k: cobra.core.gene.GPR().from_string(gpr) for k, gpr in gpr_rules.items()} if verbose: print(f'Shape of new adata object: {adata2.shape}\n' @@ -89,4 +102,17 @@ def preprocess_inputs(adata, gpr_info, task_by_gene, rxn_by_gene, task_by_rxn, v f'Shape of reactions by genes: {rxn_by_gene.shape}\n' f'Shape of tasks by reactions: {task_by_rxn.shape}') - return adata2, gpr_rules, task_by_gene, rxn_by_gene, task_by_rxn \ No newline at end of file + return adata2, gpr_rules, task_by_gene, rxn_by_gene, task_by_rxn + + +def clean_gene_names(gpr_rule): + # Regular expression pattern to match spaces between numbers and parentheses + pattern = r'(\()\s*(\d+)\s*(\))' + # Replace the matched pattern with parentheses directly around the numbers + cleaned_rule = re.sub(pattern, r'(\2)', gpr_rule) + return cleaned_rule + + +def find_genes_gpr(gpr_rule): + elements = re.findall(r'\b[^\s(),]+\b', gpr_rule) + return [e for e in elements if e.lower() not in ('and', 'or')] \ No newline at end of file diff --git a/sccellfie/preprocessing/tests/test_prepare_inputs.py b/sccellfie/preprocessing/tests/test_prepare_inputs.py index 17e501e..6c429a0 100644 --- a/sccellfie/preprocessing/tests/test_prepare_inputs.py +++ b/sccellfie/preprocessing/tests/test_prepare_inputs.py @@ -1,6 +1,6 @@ import pandas as pd -from sccellfie.preprocessing import preprocess_inputs +from sccellfie.preprocessing.prepare_inputs import preprocess_inputs, clean_gene_names, find_genes_gpr from sccellfie.tests.toy_inputs import create_random_adata, create_controlled_adata, create_controlled_gpr_dict, create_controlled_task_by_rxn, create_controlled_task_by_gene, create_controlled_rxn_by_gene @@ -56,3 +56,34 @@ def test_shapes(): assert task_by_gene.shape != task_by_gene2.shape, "task_by_gene2 has the same shape as task_by_gene" assert rxn_by_gene.shape != rxn_by_gene2.shape, "rxn_by_gene2 has the same shape as rxn_by_gene" assert task_by_rxn.shape != task_by_rxn2.shape, "task_by_rxn2 has the same shape as task_by_rxn" + + +def test_clean_gene_names(): + test_cases = [ + ("(1 ) AND (2)", "(1) AND (2)"), + ("( 3 ) OR ( 4 )", "(3) OR (4)"), + ("(5) AND ( 6 ) OR (7 )", "(5) AND (6) OR (7)"), + ("gene1 AND (8 )", "gene1 AND (8)"), + ("( 9 ) OR gene2", "(9) OR gene2"), + ("No parentheses here", "No parentheses here") + ] + + for input_rule, expected_output in test_cases: + result = clean_gene_names(input_rule) + assert result == expected_output, f"For input '{input_rule}', expected '{expected_output}', but got '{result}'" + + +def test_find_genes_gpr(): + test_cases = [ + ("gene1 AND gene2", ["gene1", "gene2"]), + ("gene3 OR (gene4 AND gene5)", ["gene3", "gene4", "gene5"]), + ("(gene6 OR gene7) AND gene8", ["gene6", "gene7", "gene8"]), + ("gene9 AND gene10 OR gene11", ["gene9", "gene10", "gene11"]), + ("gene12", ["gene12"]), + ("gene13 AND (gene14 OR (gene15 AND gene16))", ["gene13", "gene14", "gene15", "gene16"]), + ("NO_GENES_HERE", ["NO_GENES_HERE"]) + ] + + for input_rule, expected_output in test_cases: + result = find_genes_gpr(input_rule) + assert result == expected_output, f"For input '{input_rule}', expected {expected_output}, but got {result}" \ No newline at end of file