Begin using curies.Converter in more places (#397)
Part of #363 

This PR does the following:

1. Adds a minimum version of `curies` that has the strict compress and expand functions (see the sketch after this list)
2. Rewrites the SPARQL utils and RDF utils to use `curies` functionality
3. Updates the custom `curie_from_uri` helper to use `curies` (a follow-up PR will replace it completely)
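Below is a minimal sketch of the strict API that this version constraint provides; it is not code from this PR, and the prefix map is a made-up example.

```python
# Minimal sketch of the strict curies API this PR starts relying on.
# The prefix map below is invented for illustration.
from curies import Converter

converter = Converter.from_prefix_map(
    {"CHEBI": "http://purl.obolibrary.org/obo/CHEBI_"}
)

# The non-strict method returns None when it cannot convert.
assert converter.compress("http://example.org/not-in-the-map") is None

# The strict variants raise instead, so bad URIs and CURIEs surface immediately.
curie = converter.compress_strict("http://purl.obolibrary.org/obo/CHEBI_16236")
uri = converter.expand_strict("CHEBI:16236")
print(curie, uri)  # CHEBI:16236 http://purl.obolibrary.org/obo/CHEBI_16236
```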

---------

Co-authored-by: Harshad Hegde <[email protected]>
Co-authored-by: Nico Matentzoglu <[email protected]>
3 people authored Jul 27, 2023
1 parent dbb490d commit f3be757
Showing 9 changed files with 103 additions and 164 deletions.
pyproject.toml: 5 changes (3 additions & 2 deletions)
@@ -6,7 +6,7 @@ authors = [
"Chris Mungall <[email protected]>",
"Nicolas Matentzoglu <[email protected]>",
"Harshad Hegde <[email protected]>"
]
]
license = "MIT"
readme = "README.md"

@@ -19,12 +19,13 @@ bioregistry = ">=0.9.43"
deprecation = ">=2.1.0"
linkml-runtime = ">=1.5.3"
networkx = ">=3.1"
curies = ">=0.5.7"
pandas = ">=2.0.2"
pansql = "^0.0.1"
pyyaml = ">=6.0"
rdflib = ">=6.3.2"
sparqlwrapper = ">=2.0.0"
sssom-schema = ">=0.13.0"
sssom-schema = ">=0.14.0"
validators = ">=0.20.0"
scipy = {version = "*", extras = ["scipy"]}

src/sssom/cli.py: 7 changes (2 additions & 5 deletions)
@@ -356,11 +356,8 @@ def sparql(
endpoint.limit = limit
if object_labels is not None:
endpoint.include_object_labels = object_labels
if prefix is not None:
if endpoint.prefix_map is None:
endpoint.prefix_map = {}
for k, v in prefix:
endpoint.prefix_map[k] = v
for k, v in prefix or []:
endpoint.prefix_map[k] = v
msdf = query_mappings(endpoint)
write_table(msdf, output)

src/sssom/context.py: 14 changes (13 additions & 1 deletion)
@@ -44,7 +44,7 @@ def get_extended_prefix_map():
:return: Prefix map.
"""
converter = Converter.from_extended_prefix_map(EXTENDED_PREFIX_MAP)
return converter.prefix_map
return {record.prefix: record.uri_prefix for record in converter.records}


def get_built_in_prefix_map() -> PrefixMap:
@@ -108,15 +108,27 @@ def get_default_metadata() -> Metadata:
if "@id" in v and "@prefix" in v:
if v["@prefix"]:
prefix_map[key] = v["@id"]
del prefix_map["@vocab"]

prefix_map.update({(k, v) for k, v in contxt_external.items() if k not in prefix_map})
_raise_on_invalid_prefix_map(prefix_map)

metadata = Metadata(prefix_map=prefix_map, metadata=metadata_dict)
metadata.metadata["mapping_set_id"] = DEFAULT_MAPPING_SET_ID
metadata.metadata["license"] = DEFAULT_LICENSE
return metadata


def _raise_on_invalid_prefix_map(prefix_map):
"""Raise an exception if the prefix map is not bijective.
This uses :meth:`curies.Converter.from_prefix_map` to try and load a
prefix map. If there are any duplicate values (i.e., it is _not_ bijective)
then it throws a value error.
"""
Converter.from_prefix_map(prefix_map)


def set_default_mapping_set_id(meta: Metadata) -> Metadata:
"""Provide a default mapping_set_id if absent in the MappingSetDataFrame.
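For context on the new `_raise_on_invalid_prefix_map` helper added above: per its docstring, loading the map with `curies.Converter.from_prefix_map` raises a `ValueError` when the map is not bijective. A rough sketch, with a colliding prefix map invented for illustration:

```python
# Rough sketch of the check that _raise_on_invalid_prefix_map delegates to.
# The duplicated URI prefix below is an invented example.
from curies import Converter

non_bijective = {
    "CHEBI": "http://purl.obolibrary.org/obo/CHEBI_",
    "chebi": "http://purl.obolibrary.org/obo/CHEBI_",  # duplicate value
}

try:
    Converter.from_prefix_map(non_bijective)
except ValueError as e:
    print(f"rejected non-bijective prefix map: {e}")
```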
src/sssom/parsers.py: 55 changes (27 additions & 28 deletions)
@@ -15,6 +15,7 @@
import pandas as pd
import requests
import yaml
from curies import Converter
from deprecation import deprecated
from linkml_runtime.loaders.json_loader import JSONLoader
from pandas.errors import EmptyDataError
@@ -64,11 +64,10 @@
SSSOM_DEFAULT_RDF_SERIALISATION,
URI_SSSOM_MAPPINGS,
MappingSetDataFrame,
NoCURIEException,
curie_from_uri,
get_file_extension,
is_multivalued_slot,
raise_for_bad_path,
safe_compress,
to_mapping_set_dataframe,
)

@@ -506,6 +506,7 @@ def from_sssom_rdf(
:return: MappingSetDataFrame object
"""
prefix_map = _ensure_prefix_map(prefix_map)
converter = Converter.from_prefix_map(prefix_map)

ms = _init_mapping_set(meta)
mlist: List[Mapping] = []
@@ -515,7 +516,7 @@
for _s, p, o in g.triples((ox, None, None)):
if isinstance(p, URIRef):
try:
p_id = curie_from_uri(p, prefix_map)
p_id = safe_compress(p, converter)
k = None

if p_id.startswith("sssom:"):
@@ -529,14 +530,14 @@

if isinstance(o, URIRef):
v: Any
v = curie_from_uri(o, prefix_map)
v = safe_compress(o, converter)
else:
v = o.toPython()
if k:
v = _address_multivalued_slot(k, v)
mdict[k] = v

except NoCURIEException as e:
except ValueError as e:
logging.warning(e)
if mdict:
m = _prepare_mapping(Mapping(**mdict))
@@ -596,6 +597,7 @@ def from_alignment_minidom(
"""
# FIXME: should be prefix_map = _check_prefix_map(prefix_map)
_ensure_prefix_map(prefix_map)
converter = Converter.from_prefix_map(prefix_map)
ms = _init_mapping_set(meta)
mlist: List[Mapping] = []
# bad_attrs = {}
@@ -612,7 +614,7 @@
cell = e.getElementsByTagName("Cell")
for c_node in cell:
mdict = _cell_element_values(
c_node, prefix_map, mapping_predicates=mapping_predicates
c_node, converter, mapping_predicates=mapping_predicates
)
if mdict:
m = _prepare_mapping(mdict)
@@ -665,6 +667,7 @@ def from_obographs(
:return: An SSSOM data frame (MappingSetDataFrame)
"""
_ensure_prefix_map(prefix_map)
converter = Converter.from_prefix_map(prefix_map)
ms = _init_mapping_set(meta)
mlist: List[Mapping] = []
# bad_attrs = {}
@@ -705,13 +708,13 @@
xref_id = xref["val"]
mdict: Dict[str, Any] = {}
try:
mdict[SUBJECT_ID] = curie_from_uri(nid, prefix_map)
mdict[OBJECT_ID] = curie_from_uri(xref_id, prefix_map)
mdict[SUBJECT_ID] = safe_compress(nid, converter)
mdict[OBJECT_ID] = safe_compress(xref_id, converter)
mdict[SUBJECT_LABEL] = label
mdict[PREDICATE_ID] = "oboInOwl:hasDbXref"
mdict[MAPPING_JUSTIFICATION] = MAPPING_JUSTIFICATION_UNSPECIFIED
mlist.append(Mapping(**mdict))
except NoCURIEException as e:
except ValueError as e:
# FIXME this will cause all sorts of ragged Mappings
logging.warning(e)
if "basicPropertyValues" in n["meta"]:
@@ -721,15 +724,15 @@
xref_id = value["val"]
mdict = {}
try:
mdict[SUBJECT_ID] = curie_from_uri(nid, prefix_map)
mdict[OBJECT_ID] = curie_from_uri(xref_id, prefix_map)
mdict[SUBJECT_ID] = safe_compress(nid, converter)
mdict[OBJECT_ID] = safe_compress(xref_id, converter)
mdict[SUBJECT_LABEL] = label
mdict[PREDICATE_ID] = curie_from_uri(pred, prefix_map)
mdict[PREDICATE_ID] = safe_compress(pred, converter)
mdict[
MAPPING_JUSTIFICATION
] = MAPPING_JUSTIFICATION_UNSPECIFIED
mlist.append(Mapping(**mdict))
except NoCURIEException as e:
except ValueError as e:
# FIXME this will cause ragged mappings
logging.warning(e)
if "edges" in g:
Expand All @@ -739,15 +742,15 @@ def from_obographs(
predicate_id = _get_obographs_predicate_id(edge["pred"])
object_id = edge["obj"]
if predicate_id in mapping_predicates:
mdict[SUBJECT_ID] = curie_from_uri(subject_id, prefix_map)
mdict[OBJECT_ID] = curie_from_uri(object_id, prefix_map)
mdict[SUBJECT_ID] = safe_compress(subject_id, converter)
mdict[OBJECT_ID] = safe_compress(object_id, converter)
mdict[SUBJECT_LABEL] = (
labels[subject_id] if subject_id in labels.keys() else ""
)
mdict[OBJECT_LABEL] = (
labels[object_id] if object_id in labels.keys() else ""
)
mdict[PREDICATE_ID] = curie_from_uri(predicate_id, prefix_map)
mdict[PREDICATE_ID] = safe_compress(predicate_id, converter)
mdict[MAPPING_JUSTIFICATION] = MAPPING_JUSTIFICATION_UNSPECIFIED
mlist.append(Mapping(**mdict))
if "equivalentNodesSets" in g and OWL_EQUIV_CLASS_URI in mapping_predicates:
Expand All @@ -757,10 +760,10 @@ def from_obographs(
for ec2 in equivalents["nodeIds"]:
if ec1 != ec2:
mdict = {}
mdict[SUBJECT_ID] = curie_from_uri(ec1, prefix_map)
mdict[OBJECT_ID] = curie_from_uri(ec2, prefix_map)
mdict[PREDICATE_ID] = curie_from_uri(
OWL_EQUIV_CLASS_URI, prefix_map
mdict[SUBJECT_ID] = safe_compress(ec1, converter)
mdict[OBJECT_ID] = safe_compress(ec2, converter)
mdict[PREDICATE_ID] = safe_compress(
OWL_EQUIV_CLASS_URI, converter
)
mdict[MAPPING_JUSTIFICATION] = MAPPING_JUSTIFICATION_UNSPECIFIED
mdict[SUBJECT_LABEL] = (
@@ -868,19 +871,15 @@ def _set_metadata_in_mapping_set(
mapping_set[k] = v


def _cell_element_values(cell_node, prefix_map: PrefixMap, mapping_predicates) -> Optional[Mapping]:
def _cell_element_values(cell_node, converter: Converter, mapping_predicates) -> Optional[Mapping]:
mdict: Dict[str, Any] = {}
for child in cell_node.childNodes:
if child.nodeType == Node.ELEMENT_NODE:
try:
if child.nodeName == "entity1":
mdict[SUBJECT_ID] = curie_from_uri(
child.getAttribute("rdf:resource"), prefix_map
)
mdict[SUBJECT_ID] = safe_compress(child.getAttribute("rdf:resource"), converter)
elif child.nodeName == "entity2":
mdict[OBJECT_ID] = curie_from_uri(
child.getAttribute("rdf:resource"), prefix_map
)
mdict[OBJECT_ID] = safe_compress(child.getAttribute("rdf:resource"), converter)
elif child.nodeName == "measure":
mdict[CONFIDENCE] = child.firstChild.nodeValue
elif child.nodeName == "relation":
@@ -902,7 +901,7 @@ def _cell_element_values(cell_node, prefix_map: PrefixMap, mapping_predicates) -
logging.warning(f"{relation} not a recognised relation type.")
else:
logging.warning(f"Unsupported alignment api element: {child.nodeName}")
except NoCURIEException as e:
except ValueError as e:
logging.warning(e)

mdict[MAPPING_JUSTIFICATION] = MAPPING_JUSTIFICATION_UNSPECIFIED
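The recurring pattern in the parser changes above: build a `Converter` once from the prefix map, compress each URI with `safe_compress`, and catch `ValueError` where the old code caught `NoCURIEException`. A small sketch with illustrative inputs:

```python
# Sketch of the post-commit parsing pattern; the prefix map and URIs are
# illustrative, not taken from a real mapping set.
import logging

from curies import Converter
from sssom.util import safe_compress

converter = Converter.from_prefix_map(
    {"skos": "http://www.w3.org/2004/02/skos/core#"}
)

for uri in [
    "http://www.w3.org/2004/02/skos/core#exactMatch",
    "http://example.org/not-in-the-map",
]:
    try:
        print(safe_compress(uri, converter))
    except ValueError as e:  # previously NoCURIEException was caught here
        logging.warning(e)
```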
src/sssom/rdf_util.py: 21 changes (8 additions & 13 deletions)
@@ -3,10 +3,9 @@
import logging
from typing import Any, Dict, List, Optional

from curies import Converter
from linkml_runtime.utils.metamodelcore import URIorCURIE
from rdflib import Graph, URIRef

# from .sssom_datamodel import EntityReference, Mapping
from sssom_schema import EntityReference, Mapping

from .parsers import to_mapping_set_document
@@ -24,17 +23,12 @@ def rewire_graph(
precedence: Optional[List[str]] = None,
) -> int:
"""Rewire an RDF Graph replacing using equivalence mappings."""
pm = mset.prefix_map
mdoc = to_mapping_set_document(mset)
rewire_map: Dict[URIorCURIE, URIorCURIE] = {}

def expand_curie(curie: str) -> URIRef:
"""Expand CURIE into URIRef."""
pfx, local = curie.split(":")
return URIRef(f"{pm[pfx]}{local}")

if mdoc.mapping_set.mappings is None:
raise TypeError

converter = Converter.from_prefix_map(mdoc.prefix_map)
rewire_map: Dict[URIorCURIE, URIorCURIE] = {}
for m in mdoc.mapping_set.mappings:
if not isinstance(m, Mapping):
continue
@@ -49,8 +43,8 @@ def expand_curie(curie: str) -> URIRef:
curr_tgt = rewire_map[src]
logging.info(f"Ambiguous: {src} -> {tgt} vs {curr_tgt}")
if precedence:
curr_pfx, _ = curr_tgt.split(":")
tgt_pfx, _ = tgt.split(":")
curr_pfx, _ = converter.parse_curie(curr_tgt)
tgt_pfx, _ = converter.parse_curie(tgt)
if tgt_pfx in precedence:
if curr_pfx not in precedence or precedence.index(
tgt_pfx
@@ -63,7 +57,8 @@ def expand_curie(curie: str) -> URIRef:
rewire_map[src] = tgt

uri_ref_rewire_map: Dict[URIRef, URIRef] = {
expand_curie(k): expand_curie(v) for k, v in rewire_map.items()
URIRef(converter.expand_strict(k)): URIRef(converter.expand_strict(v))
for k, v in rewire_map.items()
}

def rewire_node(n: Any):
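The rewiring change above swaps the hand-rolled `expand_curie` helper and the naive `split(":")` calls for the converter's own methods. A sketch of the equivalent calls, using an illustrative prefix map and CURIE:

```python
# Sketch of the converter calls that replace the hand-rolled helpers in
# rewire_graph; the prefix map and CURIE are illustrative.
from curies import Converter
from rdflib import URIRef

converter = Converter.from_prefix_map(
    {"CHEBI": "http://purl.obolibrary.org/obo/CHEBI_"}
)

prefix, identifier = converter.parse_curie("CHEBI:16236")  # ("CHEBI", "16236")
node = URIRef(converter.expand_strict("CHEBI:16236"))
print(prefix, identifier, node)
```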
