Skip to content

Commit

Permalink
Changed way of subsetting to keep rxns only if all genes present
Browse files Browse the repository at this point in the history
  • Loading branch information
earmingol committed Jul 12, 2024
1 parent 44dc71f commit f681cde
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 18 deletions.
60 changes: 43 additions & 17 deletions sccellfie/preprocessing/prepare_inputs.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import cobra
import re


def preprocess_inputs(adata, gpr_info, task_by_gene, rxn_by_gene, task_by_rxn, verbose=True):
Expand Down Expand Up @@ -56,31 +57,43 @@ def preprocess_inputs(adata, gpr_info, task_by_gene, rxn_by_gene, task_by_rxn, v
reactions. Each cell contains ones or zeros, indicating whether a reaction
is involved in a metabolic task.
'''
# Initialize GPRs
gpr_rules = gpr_info.dropna().set_index('Reaction').to_dict()['GPR-symbol']

genes = [g for g in rxn_by_gene.columns if (g in task_by_gene.columns) & (g in adata.var_names)]
valid_genes = set()
valid_reactions = set()

task_by_gene = task_by_gene.loc[:, genes]
task_by_gene = task_by_gene.loc[(task_by_gene != 0).any(axis=1)]
# Iterate through GPR rules
for reaction, gpr_rule in gpr_rules.items():
if reaction in rxn_by_gene.index and reaction in task_by_rxn.columns:
clean_gpr_rule = clean_gene_names(gpr_rule)
genes_in_rule = find_genes_gpr(clean_gpr_rule)
if all(gene in adata.var_names for gene in genes_in_rule):
valid_genes.update(genes_in_rule)
valid_reactions.add(reaction)

rxn_by_gene = rxn_by_gene.loc[:, genes]
rxn_by_gene = rxn_by_gene.loc[(rxn_by_gene != 0).any(axis=1)]
# Filter adata
adata2 = adata[:, sorted(valid_genes)]

gpr_rules = {k: v.replace(' AND ', ' and ').replace(' OR ', ' or ') for k, v in gpr_rules.items() if k in rxn_by_gene.index.tolist()}
# Filter tables
task_by_gene = task_by_gene.loc[:, sorted(valid_genes)]
rxn_by_gene = rxn_by_gene.loc[sorted(valid_reactions), sorted(valid_genes)]
task_by_rxn = task_by_rxn.loc[:, sorted(valid_reactions)]

# Initialize GPRs
gpr_rules = {k: cobra.core.gene.GPR().from_string(gpr) for k, gpr in gpr_rules.items()}
# Remove tasks with no non-zero values
task_by_gene = task_by_gene.loc[(task_by_gene != 0).any(axis=1)]
task_by_rxn = task_by_rxn.loc[(task_by_rxn != 0).any(axis=1)]

tasks = task_by_gene.index.tolist()
rxns = list(gpr_rules.keys())
# Ensure consistency across all tables
common_tasks = set(task_by_gene.index) & set(task_by_rxn.index)
task_by_gene = task_by_gene.loc[sorted(common_tasks)]
task_by_rxn = task_by_rxn.loc[sorted(common_tasks)]

task_by_rxn = task_by_rxn.loc[tasks, rxns]
task_by_rxn = task_by_rxn.loc[(task_by_rxn != 0).any(axis=1)]
# Update GPR rules
gpr_rules = {k: v for k, v in gpr_rules.items() if k in valid_reactions}

adata2 = adata[:, [True if g in genes else False for g in adata.var_names]]
if hasattr(adata, 'raw'):
if adata.raw is not None:
adata2.raw = adata.raw.to_adata()[:, [True if g in genes else False for g in adata.var_names]]
# Initialize GPRs
gpr_rules = {k: cobra.core.gene.GPR().from_string(gpr) for k, gpr in gpr_rules.items()}

if verbose:
print(f'Shape of new adata object: {adata2.shape}\n'
Expand All @@ -89,4 +102,17 @@ def preprocess_inputs(adata, gpr_info, task_by_gene, rxn_by_gene, task_by_rxn, v
f'Shape of reactions by genes: {rxn_by_gene.shape}\n'
f'Shape of tasks by reactions: {task_by_rxn.shape}')

return adata2, gpr_rules, task_by_gene, rxn_by_gene, task_by_rxn
return adata2, gpr_rules, task_by_gene, rxn_by_gene, task_by_rxn


def clean_gene_names(gpr_rule):
# Regular expression pattern to match spaces between numbers and parentheses
pattern = r'(\()\s*(\d+)\s*(\))'
# Replace the matched pattern with parentheses directly around the numbers
cleaned_rule = re.sub(pattern, r'(\2)', gpr_rule)
return cleaned_rule


def find_genes_gpr(gpr_rule):
elements = re.findall(r'\b[^\s(),]+\b', gpr_rule)
return [e for e in elements if e.lower() not in ('and', 'or')]
33 changes: 32 additions & 1 deletion sccellfie/preprocessing/tests/test_prepare_inputs.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pandas as pd

from sccellfie.preprocessing import preprocess_inputs
from sccellfie.preprocessing.prepare_inputs import preprocess_inputs, clean_gene_names, find_genes_gpr
from sccellfie.tests.toy_inputs import create_random_adata, create_controlled_adata, create_controlled_gpr_dict, create_controlled_task_by_rxn, create_controlled_task_by_gene, create_controlled_rxn_by_gene


Expand Down Expand Up @@ -56,3 +56,34 @@ def test_shapes():
assert task_by_gene.shape != task_by_gene2.shape, "task_by_gene2 has the same shape as task_by_gene"
assert rxn_by_gene.shape != rxn_by_gene2.shape, "rxn_by_gene2 has the same shape as rxn_by_gene"
assert task_by_rxn.shape != task_by_rxn2.shape, "task_by_rxn2 has the same shape as task_by_rxn"


def test_clean_gene_names():
test_cases = [
("(1 ) AND (2)", "(1) AND (2)"),
("( 3 ) OR ( 4 )", "(3) OR (4)"),
("(5) AND ( 6 ) OR (7 )", "(5) AND (6) OR (7)"),
("gene1 AND (8 )", "gene1 AND (8)"),
("( 9 ) OR gene2", "(9) OR gene2"),
("No parentheses here", "No parentheses here")
]

for input_rule, expected_output in test_cases:
result = clean_gene_names(input_rule)
assert result == expected_output, f"For input '{input_rule}', expected '{expected_output}', but got '{result}'"


def test_find_genes_gpr():
test_cases = [
("gene1 AND gene2", ["gene1", "gene2"]),
("gene3 OR (gene4 AND gene5)", ["gene3", "gene4", "gene5"]),
("(gene6 OR gene7) AND gene8", ["gene6", "gene7", "gene8"]),
("gene9 AND gene10 OR gene11", ["gene9", "gene10", "gene11"]),
("gene12", ["gene12"]),
("gene13 AND (gene14 OR (gene15 AND gene16))", ["gene13", "gene14", "gene15", "gene16"]),
("NO_GENES_HERE", ["NO_GENES_HERE"])
]

for input_rule, expected_output in test_cases:
result = find_genes_gpr(input_rule)
assert result == expected_output, f"For input '{input_rule}', expected {expected_output}, but got {result}"

0 comments on commit f681cde

Please sign in to comment.