Skip to content

Commit

Permalink
Generate SPARQL Queries from SHACL shapes (#273)
Browse files Browse the repository at this point in the history
* add method to generate sparql query from shacl shape

* add some tests

* fix typo in query generation

* add sh:node, sh:or

* fixup tests

* handle or inside qualifiedvalue shape, complex property paths

* add test file

* update docs with image and guide on query generation

* augment docstring with more features

* add test case

* add section on supported SHACL features

* add algorithm documentation

* support other target definitions

* add more tests
  • Loading branch information
gtfierro committed May 3, 2024
1 parent a00fa6d commit 41de422
Show file tree
Hide file tree
Showing 9 changed files with 574 additions and 4 deletions.
180 changes: 177 additions & 3 deletions buildingmotif/dataclasses/shape_collection.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,18 @@
import logging
import random
import string
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union
from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union

import rdflib
from rdflib import RDF, RDFS, URIRef
from rdflib import RDF, RDFS, Graph, URIRef
from rdflib.paths import ZeroOrMore, ZeroOrOne
from rdflib.term import Node

from buildingmotif import get_building_motif
from buildingmotif.namespaces import BMOTIF, OWL
from buildingmotif.namespaces import BMOTIF, OWL, SH
from buildingmotif.utils import Triple, copy_graph

if TYPE_CHECKING:
Expand Down Expand Up @@ -239,6 +244,8 @@ def get_shapes_about_class(
rows = graph.query(
f"""
PREFIX sh: <http://www.w3.org/ns/shacl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT ?shape WHERE {{
?shape a sh:NodeShape .
{rdf_type.n3()} rdfs:subClassOf* ?class .
Expand All @@ -249,6 +256,173 @@ def get_shapes_about_class(
)
return [row[0] for row in rows] # type: ignore

def shape_to_query(self, shape: URIRef) -> str:
    """
    Build a SPARQL SELECT query that retrieves the data a SHACL shape talks about.

    The WHERE body and the list of projected variables are both produced by
    `_shape_to_where` from the definition of `shape` in this collection's graph;
    this method only wraps them with the `sh:`, `rdf:` and `rdfs:` prefix
    declarations and the SELECT/WHERE scaffolding. Examples of the translation:

    - `<shape> sh:targetClass <class>` -> `?target rdf:type/rdfs:subClassOf* <class>`
    - `<shape> sh:property [ sh:path <path>; sh:hasValue <value> ]` ->
      `?target <path> <value>`

    :param shape: URI of the SHACL shape to translate
    :return: the text of the generated SPARQL query
    """
    where_body, variables = _shape_to_where(self.graph, shape)
    prefixes = (
        "PREFIX sh: <http://www.w3.org/ns/shacl#>\n"
        "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n"
        "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\n"
    )
    projection = " ".join(variables)
    return f"{prefixes} SELECT {projection} WHERE {{\n{where_body}\n}}"


def _is_list(graph: Graph, node: Node):
    """
    Report whether `node` is the head of an RDF list in `graph`, i.e. whether
    the graph contains at least one `rdf:first` triple with `node` as subject.

    :param graph: graph to inspect
    :param node: candidate list head; `None` is accepted and is never a list
    :return: True if `node` has an `rdf:first` edge, False otherwise
    """
    # A None subject acts as a wildcard in an rdflib triple pattern, so without
    # this guard a missing node would "match" any list anywhere in the graph.
    if node is None:
        return False
    return (node, RDF.first, None) in graph


def _sh_path_to_path(graph: Graph, sh_path_value: Node) -> str:
    """
    Render the value of a SHACL `sh:path` as SPARQL property path syntax.

    Supported forms:

    - an IRI                               -> `<iri>`
    - an RDF list (sequence path)          -> `p1/p2/...`
    - `[ sh:oneOrMorePath p ]`             -> `p+`
    - `[ sh:zeroOrMorePath p ]`            -> `p*`
    - `[ sh:zeroOrOnePath p ]`             -> `p?`
    - `[ sh:inversePath p ]`               -> `^p`
    - `[ sh:alternativePath ( p1 p2 ) ]`   -> `p1|p2`

    Nested paths are parenthesized only where SPARQL operator precedence
    requires it, e.g. `[ sh:oneOrMorePath ( a b ) ]` becomes `(a/b)+`.

    :param graph: graph containing the path definition
    :param sh_path_value: the object of an `sh:path` triple
    :return: the equivalent SPARQL property path expression
    :raises ValueError: if the path is missing (`None`) or an
        `sh:alternativePath` has no members
    """
    rendered, _ = _render_sh_path(graph, sh_path_value)
    return rendered


# Binding strength of the SPARQL property path forms, weakest first. An operand
# must bind at least as tightly as its operator requires, otherwise it has to be
# wrapped in parentheses.
_PATH_ALT, _PATH_SEQ, _PATH_INV, _PATH_MOD, _PATH_ATOM = range(5)


def _render_sh_path(graph: Graph, node: Node) -> Tuple[str, int]:
    """Render one SHACL path node; returns (SPARQL text, binding strength)."""
    if node is None:
        # None would otherwise be treated as a wildcard by the graph lookups below
        raise ValueError("Cannot convert a missing sh:path value to a SPARQL path")

    def operand(child: Node, required: int) -> str:
        # render a sub-path, adding parentheses if it binds too loosely
        text, strength = _render_sh_path(graph, child)
        return text if strength >= required else f"({text})"

    def members(list_head: Node) -> List[Node]:
        return list(
            graph.objects(list_head, (RDF.rest * ZeroOrMore) / RDF.first)  # type: ignore
        )

    # sequence path: sh:path points at an RDF list
    if _is_list(graph, node):
        steps = members(node)
        if len(steps) == 1:
            return _render_sh_path(graph, steps[0])
        # only alternatives bind more loosely than '/'
        return "/".join(operand(step, _PATH_SEQ) for step in steps), _PATH_SEQ

    # postfix modifiers apply to a primary only, so anything but an IRI is wrapped
    modifiers = (
        (SH.oneOrMorePath, "+"),
        (SH.zeroOrMorePath, "*"),
        (SH.zeroOrOnePath, "?"),
    )
    for predicate, suffix in modifiers:
        part = graph.value(node, predicate)
        if part is not None:
            return f"{operand(part, _PATH_ATOM)}{suffix}", _PATH_MOD

    part = graph.value(node, SH.inversePath)
    if part is not None:
        # '^' accepts an IRI or a modified IRI without parentheses
        return f"^{operand(part, _PATH_MOD)}", _PATH_INV

    part = graph.value(node, SH.alternativePath)
    if part is not None:
        options = members(part)
        if not options:
            raise ValueError("sh:alternativePath must list at least one path")
        if len(options) == 1:
            return _render_sh_path(graph, options[0])
        return "|".join(operand(option, _PATH_ALT) for option in options), _PATH_ALT

    # plain predicate
    return node.n3(), _PATH_ATOM


def _shape_to_where(
    graph: Graph, shape: URIRef, _gensym=None
) -> Tuple[str, List[str]]:
    """
    Translate a SHACL shape into the body of a SPARQL WHERE clause.

    Handled on the shape itself: `sh:targetClass` / `sh:class`,
    `sh:targetSubjectsOf`, `sh:targetObjectsOf`, `sh:targetNode`, `sh:node`
    and `sh:or`. Handled on each `sh:property`: `sh:class`, `sh:node` and
    `sh:or` (directly or inside `sh:qualifiedValueShape`) and `sh:hasValue`.

    Property shapes without a `sh:qualifiedValueShape` that share the same path
    share one generated variable. Other property shapes use their `sh:name`
    (spaces replaced by `_`) or, failing that, a generated variable. Class and
    node patterns are wrapped in OPTIONAL unless `sh:qualifiedMinCount` is
    non-zero.

    :param graph: graph containing the shape definition
    :param shape: the shape to translate; the focus node is always `?target`
    :param _gensym: internal; variable-name generator shared with recursive
        calls so that nested shapes never reuse a generated name. Callers
        should leave this unset.
    :return: the WHERE-clause text and the variables to project, with
        `?target` first and the remaining variables in sorted order
    :raises ValueError: if the shape has more than one `sh:targetNode`
    """
    # One generator per top-level call. Drawing a fresh random prefix in every
    # recursive call could give two nested shapes the same prefix and therefore
    # identical variable names.
    gensym = _gensym if _gensym is not None else _new_gensym()

    # we will build the query as a string
    clauses: str = ""
    # build up the SELECT clause as a set of vars
    project: Set[str] = {"?target"}

    # `<shape> sh:targetClass <class>` -> `?target rdf:type/rdfs:subClassOf* <class>`
    clauses += _union_of(
        [
            f"?target rdf:type/rdfs:subClassOf* {tc.n3()} .\n"
            for tc in graph.objects(shape, SH.targetClass | SH["class"])  # type: ignore
        ]
    )
    # handle targetSubjectsOf
    clauses += _union_of(
        [
            f"?target {tso.n3()} ?ignore .\n"  # type: ignore
            for tso in graph.objects(shape, SH.targetSubjectsOf)
        ]
    )
    # handle targetObjectsOf
    clauses += _union_of(
        [
            f"?ignore {too.n3()} ?target .\n"  # type: ignore
            for too in graph.objects(shape, SH.targetObjectsOf)
        ]
    )

    # handle targetNode
    target_nodes = list(graph.objects(shape, SH.targetNode))
    if len(target_nodes) > 1:
        raise ValueError(
            "More than one targetNode found. This is not currently supported"
        )
    if target_nodes:
        clauses += f"BIND({target_nodes[0].n3()} AS ?target) .\n"  # type: ignore

    # Render every property shape's path once, and group the non-qualified
    # property shapes by path: all of them will use the same variable for all
    # uses of the same sh:path value.
    pshapes = list(graph.objects(shape, SH.property))
    pshape_paths: Dict[Node, str] = {}
    pshapes_by_path: Dict[str, List[Node]] = defaultdict(list)
    for pshape in pshapes:
        path = _sh_path_to_path(graph, graph.value(pshape, SH.path))  # type: ignore
        pshape_paths[pshape] = path
        if not graph.value(pshape, SH.qualifiedValueShape):
            pshapes_by_path[path].append(pshape)

    # shapes the target must also conform to contribute their clauses as-is
    for dep_shape in graph.objects(shape, SH.node):
        dep_clause, dep_project = _shape_to_where(graph, dep_shape, gensym)  # type: ignore
        clauses += dep_clause
        project.update(dep_project)

    for or_list in graph.objects(shape, SH["or"]):
        or_clause, or_project = _or_to_where(graph, or_list, gensym)
        clauses += or_clause
        project.update(or_project)

    # assign a unique variable for each sh:path w/o a qualified shape
    pshape_vars: Dict[Node, str] = {}
    for pshape_list in pshapes_by_path.values():
        varname = f"?{gensym()}"
        for pshape in pshape_list:
            pshape_vars[pshape] = varname

    for pshape in pshapes:
        # get the varname if we've already assigned one for this pshape above,
        # or generate a new one. When generating a name, use the SH.name field
        # in the PropertyShape or generate a unique one
        name = pshape_vars.get(pshape)
        if name is None:
            name = f"?{graph.value(pshape, SH.name) or gensym()}".replace(" ", "_")
        path = pshape_paths[pshape]
        # a missing or zero qualifiedMinCount means the values are not required
        qMinCount = graph.value(pshape, SH.qualifiedMinCount) or 0
        optional = qMinCount == 0

        pclass = graph.value(
            pshape, (SH["qualifiedValueShape"] * ZeroOrOne / SH["class"])  # type: ignore
        )
        if pclass:
            clause = f"?target {path} {name} .\n {name} rdf:type/rdfs:subClassOf* {pclass.n3()} .\n"
            if optional:
                clause = f"OPTIONAL {{ {clause} }} .\n"
            clauses += clause
            project.add(name)

        pnode = graph.value(
            pshape, (SH["qualifiedValueShape"] * ZeroOrOne / SH["node"])  # type: ignore
        )
        if pnode:
            # the nested shape is written about ?target; re-point it at this
            # property's value variable
            node_clauses, node_project = _shape_to_where(graph, pnode, gensym)  # type: ignore
            clause = f"?target {path} {name} .\n"
            clause += node_clauses.replace("?target", name)
            if optional:
                clause = f"OPTIONAL {{ {clause} }}"
            clauses += clause
            project.update({p.replace("?target", name) for p in node_project})

        or_values = graph.value(
            pshape, (SH["qualifiedValueShape"] * ZeroOrOne / SH["or"])  # type: ignore
        )
        if or_values:
            # NOTE(review): the alternatives are rendered about ?target rather
            # than about this property's value variable -- confirm this is intended
            or_clause, or_project = _or_to_where(graph, or_values, gensym)
            clauses += or_clause
            project.update(or_project)

        pvalue = graph.value(pshape, SH.hasValue)
        # compare against None: literals such as false, 0 or "" are falsy but
        # are still values the query has to match
        if pvalue is not None:
            clauses += f"?target {path} {pvalue.n3()} .\n"

    # iterating a set gives an arbitrary order; return a stable one instead
    return clauses, ["?target"] + sorted(project - {"?target"})


def _new_gensym():
    """
    Create a generator of fresh SPARQL variable names (without the leading `?`).

    Names consist of a random two-letter lowercase prefix followed by an
    increasing counter, so every name produced by one generator is distinct.
    """
    prefix = "".join(random.choice(string.ascii_lowercase) for _ in range(2))
    counter = 0

    def gensym() -> str:
        nonlocal counter
        varname = f"{prefix}{counter}"
        counter += 1
        return varname

    return gensym


def _union_of(patterns: List[str]) -> str:
    """
    Combine alternative graph patterns into one SPARQL fragment.

    Zero or one pattern is returned unchanged. Several patterns are each
    wrapped in braces before being joined, because SPARQL's UNION only accepts
    group graph patterns as operands.
    """
    if len(patterns) < 2:
        return "".join(patterns)
    return " UNION ".join(f"{{ {pattern} }}" for pattern in patterns) + "\n"


def _or_to_where(graph: Graph, list_head: Node, gensym) -> Tuple[str, List[str]]:
    """
    Translate the RDF list of shapes under an `sh:or` into a UNION of the
    shapes' WHERE bodies, returning the text and every variable they project.
    """
    bodies: List[str] = []
    projected: List[str] = []
    for item in graph.objects(list_head, (RDF.rest * ZeroOrMore) / RDF.first):  # type: ignore
        body, item_project = _shape_to_where(graph, item, gensym)  # type: ignore
        bodies.append(body)
        projected.extend(item_project)
    return " UNION ".join(f"{{ {body} }}" for body in bodies), projected


def _resolve_imports(
graph: rdflib.Graph,
Expand Down
6 changes: 6 additions & 0 deletions buildingmotif/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,12 @@ def get_template_parts_from_shape(
deps.append({"template": str(otype), "args": {"name": param}})
body.add((param, RDF.type, otype))

# add 'hasValue'

pvalue = shape_graph.value(pshape, SH["hasValue"])
if pvalue:
body.add((root_param, path, pvalue))

if (shape_name, RDF.type, OWL.Class) in shape_graph:
body.add((root_param, RDF.type, shape_name))

Expand Down
2 changes: 2 additions & 0 deletions docs/_toc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,12 @@ parts:
chapters:
- file: guides/csv-import.md
- file: guides/ingress-bacnet-to-brick.md
- file: guides/generating-queries.md
- caption: Explanations
chapters:
- file: explanations/ingresses.md
- file: explanations/shapes-and-templates.md
- file: explanations/shacl_to_sparql.md
- caption: Appendix
chapters:
- file: bibliography.md
52 changes: 52 additions & 0 deletions docs/explanations/shacl_to_sparql.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# SHACL to SPARQL Conversion

BuildingMOTIF uses SHACL shapes to ensure that a metadata model contains the required metadata to support a given set of applications.
SHACL validation only yields whether or not a given node in the model passes or fails validation.
To aid in the execution of applications dependent on a SHACL shape, BuildingMOTIF provides functionality to extract from the model the nodes/edges that were used to validate the shape.

See [](../guides/generating-queries.md) for how to use the `shape_to_query` function. This page gives an overview of the algorithm.

## Shape-to-query Algorithm

The main method, `shape_to_query`, takes a SHACL shape represented as a URI and generates a SPARQL query to select information from an RDF graph that satisfies the constraints defined by the SHACL shape.
At a high level, the method works by first transforming the SHACL shape into a set of WHERE clauses, and then assembling these clauses into a complete SPARQL query.

The shape-to-query algorithm takes as input a definition of a SHACL Node Shape.

### `SELECT` clause generation

The `SELECT` clause of the resulting SPARQL query is generated as follows.
Each query has at least a `?target` variable in the generated `SELECT` clause.
This variable represents a target node of the SHACL shape.

The algorithm adds one variable to the `SELECT` clause for each Property Shape associated
with the Node Shape through `sh:property`.
The variable name is pulled from a `sh:name` annotation on the Property Shape if one exists;
otherwise, it is assigned a generated variable name.

If a `UNION` clause exists within the SPARQL query, the algorithm generates variable names independently for each branch of the `UNION` clause.
The resulting `SELECT` clause contains the union of the different sets of variable names.

### `WHERE` clause generation

The `WHERE` clause of the resulting SPARQL query is generated from each of the Property Shapes associated with the input Node Shape, and a few annotations directly on the NodeShape.

The Node Shape target definition is converted to a SPARQL query clause as follows:

| Target Definition | Query Pattern |
|-------------------|---------------|
| `sh:targetClass <c>` | `?target rdf:type/rdfs:subClassOf* <c>` |
| `sh:targetSubjectsOf <p>` | `?target <p> ?ignore ` |
| `sh:targetObjectsOf <p>` | `?ignore <p> ?target` |
| `sh:targetNode <n>` | `BIND(<n> AS ?target)` |

Additionally, any `sh:class <c>` constraint on the Node Shape is also transformed into `?target rdf:type/rdfs:subClassOf* <c>`.
Except for `sh:targetNode`, if more than one of the target clauses exists (e.g., `sh:targetClass brick:VAV, brick:RVAV`) then the algorithm uses a `UNION` clause to combine the independent query patterns.

The algorithm currently interprets a set of Property Shape-based constraint components into SPARQL query patterns.
At this stage, only the following clauses are supported:

| Property Shape pattern | SPARQL query pattern |
|------------------------|----------------------|
|`<shape> sh:property [ sh:path <path>; sh:class <class>; sh:name <name> ]` | `?target <path> ?name . ?name rdf:type/rdfs:subClassOf* <class>` |
|`<shape> sh:property [ sh:path <path>; sh:hasValue <value>]` | `?target <path> <value>` |
Binary file added docs/guides/Query-Generation-Flow.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit 41de422

Please sign in to comment.