Skip to content

Commit

Permalink
feat(metacyc): add annotation of compounds
Browse files Browse the repository at this point in the history
Comments in Compound nodes require parsing. DBLINKS could be added to RDF nodes.

fix #16
  • Loading branch information
y1zhou committed Jan 20, 2022
1 parent ec76534 commit fc7c749
Showing 1 changed file with 99 additions and 1 deletion.
100 changes: 99 additions & 1 deletion metabolike/parser/metacyc.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ def __init__(
db_name: Optional[str] = None,
reactions: Optional[Union[str, Path]] = None,
pathways: Optional[Union[str, Path]] = None,
compounds: Optional[Union[str, Path]] = None,
publications: Optional[Union[str, Path]] = None,
classes: Optional[Union[str, Path]] = None,
):
Expand All @@ -80,6 +81,9 @@ def __init__(
classes: The path to the ``class.dat`` file. If given, the file
will be parsed and annotations on ``Compartment``, ``Taxa``,
and ``Compound`` nodes will be added.
compounds: The path to the ``compound.dat`` file. If given, the
file will be parsed and annotations on ``Compound`` nodes will
be added.
"""
# Neo4j driver
self.neo4j_driver = neo4j_driver
Expand All @@ -91,6 +95,7 @@ def __init__(
"pathways": self._validate_path(pathways),
"publications": self._validate_path(publications),
"classes": self._validate_path(classes),
"compounds": self._validate_path(compounds),
}

# Misc variables
Expand Down Expand Up @@ -137,6 +142,26 @@ def setup(self, force: bool = False):
self.pathway_to_graph(pw, pw_dat, session)
logger.debug(f"Added pathway annotation for {pw}")

# Compounds in compounds.dat
if self.input_files["compounds"]:
logger.info("Annotating Compound nodes")
cpd_dat = self._read_dat_file(self.input_files["compounds"])

# All Compound node with RDF have BioCyc IDs
# The META: prefix in BioCyc IDs need to be stripped
all_cpds = [
(cpd["c.displayName"], cpd["r.Biocyc"][5:])
for cpd in session.run(
"""
MATCH (c:Compound)-[:is]->(r:RDF)
RETURN c.displayName, r.Biocyc;
"""
)
] # TODO: 38 POLYMER nodes don't have BioCyc IDs
for cpd, biocyc in all_cpds:
self.compounds_to_graph(cpd, biocyc, cpd_dat, session)
logger.debug(f"Added annotation for compound {cpd} with {biocyc}")

# Read publications file if given
if self.input_files["publications"]:
logger.info("Annotating publications")
Expand Down Expand Up @@ -536,6 +561,75 @@ def pathway_to_graph(
)
)

def compounds_to_graph(
    self,
    cpd: str,
    biocyc: str,
    cpd_dat: Dict[str, List[List[str]]],
    session: db.Session,
):
    """Annotate a compound node with data from the compound.dat file.

    Attribute lines of the compound are routed as follows:

    * numeric attributes and ``COMMENT`` become single-valued properties
      of the ``Compound`` node;
    * ``SYNONYMS`` is collected as a list property of the ``Compound``
      node;
    * ``SMILES`` and ``INCHI`` become properties of the ``RDF`` node
      linked to the compound;
    * ``CITATIONS`` are linked through ``_link_node_to_citation``;
    * ``DBLINKS`` are recognized but not parsed yet.

    All other attributes are ignored. If ``biocyc`` has no entry in
    ``cpd_dat``, a warning is logged and the compound is left untouched
    instead of aborting the whole annotation run with a ``KeyError``.

    Args:
        cpd: The ``displayName`` of the compound.
        biocyc: The biocyc id of the compound.
        cpd_dat: The compound.dat data, keyed by biocyc id. Each value is
            a list of ``[attribute, value]`` pairs.
        session: The neo4j session.
    """
    # Single source of truth for the numeric attributes: used both to
    # route lines below and to tell ``_clean_props`` which fields should
    # be converted to numbers.
    num_keys = (
        "GIBBS-0",
        "LOGP",
        "MOLECULAR-WEIGHT",
        "MONOISOTOPIC-MW",
        "POLAR-SURFACE-AREA",
        "PKA1",
        "PKA2",
        "PKA3",
    )
    scalar_keys = {*num_keys, "COMMENT"}

    lines = cpd_dat.get(biocyc)
    if lines is None:
        # The graph may contain compounds that are absent from the given
        # compound.dat file; skip them rather than crash.
        logger.warning(
            f"No compound.dat entry for compound {cpd} with {biocyc}; skipping"
        )
        return

    c_props: Dict[str, Union[str, List[str]]] = {}
    rdf_props: Dict[str, Union[str, List[str]]] = {}
    for k, v in lines:
        if k in scalar_keys:
            _add_kv_to_dict(c_props, k, v, as_list=False)
        elif k == "SYNONYMS":
            _add_kv_to_dict(c_props, k, v, as_list=True)
        elif k in {"SMILES", "INCHI"}:
            _add_kv_to_dict(rdf_props, k, v, as_list=False)
        elif k == "DBLINKS":
            pass  # TODO: parse DBLINKS
        elif k == "CITATIONS":
            self._link_node_to_citation(session, "Compound", cpd, v)

    # NOTE(review): presumably ``_add_kv_to_dict`` stores keys in the same
    # form ``_snake_to_camel`` produces, so these names match the keys in
    # ``c_props`` -- confirm against the helper definitions.
    c_props = self._clean_props(
        c_props,
        num_fields=[_snake_to_camel(x) for x in num_keys],
        enum_fields=[],
    )
    session.write_transaction(
        lambda tx: tx.run(
            """
            MATCH (c:Compound {displayName: $cpd_id})-[:is]->(r:RDF)
            SET c += $c_props, r += $rdf_props;
            """,
            cpd_id=cpd,
            c_props=c_props,
            rdf_props=rdf_props,
        )
    )

def citation_to_graph(
self, cit_id: str, pub_dat: Dict[str, List[List[str]]], session: db.Session
):
Expand Down Expand Up @@ -924,7 +1018,7 @@ def _link_node_to_citation(
Args:
session: Neo4j session.
node_type: Type of the node (Reaction or Pathway).
node_type: Type of the node (Reaction, Pathway, or Compound).
node_display_name: ``displayName`` of the node.
citation_id: ``mcId`` of the ``Citation`` node.
"""
Expand All @@ -948,7 +1042,11 @@ def _clean_props(
Args:
props: Properties to normalize.
num_fields: Fields that should be converted to float numbers.
enum_fields: Fields that should be converted to alphanumerical strings.
Returns:
A dictionary with normalized properties.
"""
for f in num_fields:
if f in props:
Expand Down

0 comments on commit fc7c749

Please sign in to comment.