From f681cded5712f1b36730068ac68e47cc3a25ee55 Mon Sep 17 00:00:00 2001
From: Erick Armingol <earmingol14@gmail.com>
Date: Fri, 12 Jul 2024 12:11:10 +0100
Subject: [PATCH] Changed way of subsetting to keep rxns only if all genes
 present

---
 sccellfie/preprocessing/prepare_inputs.py     | 60 +++++++++++++------
 .../tests/test_prepare_inputs.py              | 33 +++++++++-
 2 files changed, 75 insertions(+), 18 deletions(-)

diff --git a/sccellfie/preprocessing/prepare_inputs.py b/sccellfie/preprocessing/prepare_inputs.py
index 2f2eb56..637c18e 100644
--- a/sccellfie/preprocessing/prepare_inputs.py
+++ b/sccellfie/preprocessing/prepare_inputs.py
@@ -1,4 +1,5 @@
 import cobra
+import re
 
 
 def preprocess_inputs(adata, gpr_info, task_by_gene, rxn_by_gene, task_by_rxn, verbose=True):
@@ -56,31 +57,43 @@ def preprocess_inputs(adata, gpr_info, task_by_gene, rxn_by_gene, task_by_rxn, v
         reactions. Each cell contains ones or zeros, indicating whether a reaction
         is involved in a metabolic task.
     '''
+    # Initialize GPRs
     gpr_rules = gpr_info.dropna().set_index('Reaction').to_dict()['GPR-symbol']
 
-    genes = [g for g in rxn_by_gene.columns if (g in task_by_gene.columns) & (g in adata.var_names)]
+    valid_genes = set()
+    valid_reactions = set()
 
-    task_by_gene = task_by_gene.loc[:, genes]
-    task_by_gene = task_by_gene.loc[(task_by_gene != 0).any(axis=1)]
+    # Iterate through GPR rules
+    for reaction, gpr_rule in gpr_rules.items():
+        if reaction in rxn_by_gene.index and reaction in task_by_rxn.columns:
+            clean_gpr_rule = clean_gene_names(gpr_rule)
+            genes_in_rule = find_genes_gpr(clean_gpr_rule)
+            if all(gene in adata.var_names for gene in genes_in_rule):
+                valid_genes.update(genes_in_rule)
+                valid_reactions.add(reaction)
 
-    rxn_by_gene = rxn_by_gene.loc[:, genes]
-    rxn_by_gene = rxn_by_gene.loc[(rxn_by_gene != 0).any(axis=1)]
+    # Filter adata
+    adata2 = adata[:, sorted(valid_genes)]
 
-    gpr_rules = {k: v.replace(' AND ', ' and ').replace(' OR ', ' or ') for k, v in gpr_rules.items() if k in rxn_by_gene.index.tolist()}
+    # Filter tables
+    task_by_gene = task_by_gene.loc[:, sorted(valid_genes)]
+    rxn_by_gene = rxn_by_gene.loc[sorted(valid_reactions), sorted(valid_genes)]
+    task_by_rxn = task_by_rxn.loc[:, sorted(valid_reactions)]
 
-    # Initialize GPRs
-    gpr_rules = {k: cobra.core.gene.GPR().from_string(gpr) for k, gpr in gpr_rules.items()}
+    # Remove tasks with no non-zero values
+    task_by_gene = task_by_gene.loc[(task_by_gene != 0).any(axis=1)]
+    task_by_rxn = task_by_rxn.loc[(task_by_rxn != 0).any(axis=1)]
 
-    tasks = task_by_gene.index.tolist()
-    rxns = list(gpr_rules.keys())
+    # Ensure consistency across all tables
+    common_tasks = set(task_by_gene.index) & set(task_by_rxn.index)
+    task_by_gene = task_by_gene.loc[sorted(common_tasks)]
+    task_by_rxn = task_by_rxn.loc[sorted(common_tasks)]
 
-    task_by_rxn = task_by_rxn.loc[tasks, rxns]
-    task_by_rxn = task_by_rxn.loc[(task_by_rxn != 0).any(axis=1)]
+    # Update GPR rules
+    gpr_rules = {k: v for k, v in gpr_rules.items() if k in valid_reactions}
 
-    adata2 = adata[:, [True if g in genes else False for g in adata.var_names]]
-    if hasattr(adata, 'raw'):
-        if adata.raw is not None:
-            adata2.raw = adata.raw.to_adata()[:, [True if g in genes else False for g in adata.var_names]]
+    # Initialize GPRs
+    gpr_rules = {k: cobra.core.gene.GPR().from_string(gpr) for k, gpr in gpr_rules.items()}
 
     if verbose:
         print(f'Shape of new adata object: {adata2.shape}\n'
@@ -89,4 +102,17 @@ def preprocess_inputs(adata, gpr_info, task_by_gene, rxn_by_gene, task_by_rxn, v
               f'Shape of reactions by genes: {rxn_by_gene.shape}\n'
               f'Shape of tasks by reactions: {task_by_rxn.shape}')
 
-    return adata2, gpr_rules, task_by_gene, rxn_by_gene, task_by_rxn
\ No newline at end of file
+    return adata2, gpr_rules, task_by_gene, rxn_by_gene, task_by_rxn
+
+
+def clean_gene_names(gpr_rule):
+    # Regular expression pattern to match spaces between numbers and parentheses
+    pattern = r'(\()\s*(\d+)\s*(\))'
+    # Replace the matched pattern with parentheses directly around the numbers
+    cleaned_rule = re.sub(pattern, r'(\2)', gpr_rule)
+    return cleaned_rule
+
+
+def find_genes_gpr(gpr_rule):
+    elements = re.findall(r'\b[^\s(),]+\b', gpr_rule)
+    return [e for e in elements if e.lower() not in ('and', 'or')]
\ No newline at end of file
diff --git a/sccellfie/preprocessing/tests/test_prepare_inputs.py b/sccellfie/preprocessing/tests/test_prepare_inputs.py
index 17e501e..6c429a0 100644
--- a/sccellfie/preprocessing/tests/test_prepare_inputs.py
+++ b/sccellfie/preprocessing/tests/test_prepare_inputs.py
@@ -1,6 +1,6 @@
 import pandas as pd
 
-from sccellfie.preprocessing import preprocess_inputs
+from sccellfie.preprocessing.prepare_inputs import preprocess_inputs, clean_gene_names, find_genes_gpr
 from sccellfie.tests.toy_inputs import create_random_adata, create_controlled_adata, create_controlled_gpr_dict, create_controlled_task_by_rxn, create_controlled_task_by_gene, create_controlled_rxn_by_gene
 
 
@@ -56,3 +56,34 @@ def test_shapes():
     assert task_by_gene.shape != task_by_gene2.shape, "task_by_gene2 has the same shape as task_by_gene"
     assert rxn_by_gene.shape != rxn_by_gene2.shape, "rxn_by_gene2 has the same shape as rxn_by_gene"
     assert task_by_rxn.shape != task_by_rxn2.shape, "task_by_rxn2 has the same shape as task_by_rxn"
+
+
+def test_clean_gene_names():
+    test_cases = [
+        ("(1 ) AND (2)", "(1) AND (2)"),
+        ("( 3 ) OR ( 4 )", "(3) OR (4)"),
+        ("(5) AND ( 6 ) OR (7 )", "(5) AND (6) OR (7)"),
+        ("gene1 AND (8 )", "gene1 AND (8)"),
+        ("( 9 ) OR gene2", "(9) OR gene2"),
+        ("No parentheses here", "No parentheses here")
+    ]
+
+    for input_rule, expected_output in test_cases:
+        result = clean_gene_names(input_rule)
+        assert result == expected_output, f"For input '{input_rule}', expected '{expected_output}', but got '{result}'"
+
+
+def test_find_genes_gpr():
+    test_cases = [
+        ("gene1 AND gene2", ["gene1", "gene2"]),
+        ("gene3 OR (gene4 AND gene5)", ["gene3", "gene4", "gene5"]),
+        ("(gene6 OR gene7) AND gene8", ["gene6", "gene7", "gene8"]),
+        ("gene9 AND gene10 OR gene11", ["gene9", "gene10", "gene11"]),
+        ("gene12", ["gene12"]),
+        ("gene13 AND (gene14 OR (gene15 AND gene16))", ["gene13", "gene14", "gene15", "gene16"]),
+        ("NO_GENES_HERE", ["NO_GENES_HERE"])
+    ]
+
+    for input_rule, expected_output in test_cases:
+        result = find_genes_gpr(input_rule)
+        assert result == expected_output, f"For input '{input_rule}', expected {expected_output}, but got {result}"
\ No newline at end of file