From 2bbfaf3fe21b0a17b3016fc9cd762fc7f25b9ffb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20=C5=BB=C3=B3=C5=82tak?= Date: Tue, 27 Feb 2024 17:53:08 +0100 Subject: [PATCH] TriGParser: distinguish empty entities from no-entity being read See https://github.com/pietercolpaert/hardf/issues/37 (closes #37) --- README.md | 27 +++++++++++++++++++++++++++ src/TriGParser.php | 17 +++++++++++++---- test/TriGParserTest.php | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 7d514d6..0a977b6 100644 --- a/README.md +++ b/README.md @@ -183,6 +183,33 @@ $parser->end(); //Needs to be called * `end(): array` * `explicitQuantifiers` - [...] +#### Empty document base IRI + +Some Turtle and N3 documents may use relative-to-the-base-IRI IRI syntax (see [here](https://www.w3.org/TR/turtle/#sec-iri) and [here](https://www.w3.org/TR/turtle/#sec-iri-references)), e.g. + +``` +<> <http://example.org/predicate> "some value" . +``` + +To properly parse such documents the document base IRI must be known. +Otherwise we might end up with empty IRIs (e.g. for the subject in the example above). + +Sometimes the base IRI is encoded in the document, e.g. + +``` +@base <http://some.base/iri/> . +<> <http://example.org/predicate> "some value" . +``` + +but sometimes it is missing. +In such a case the [Turtle specification](https://www.w3.org/TR/turtle/#in-html-parsing) requires us to follow section 5.1.1 of the [RFC3986](http://www.ietf.org/rfc/rfc3986.txt) which says that if the base IRI is not encapsulated in the document, it should be assumed to be the document retrieval URI (e.g. the URL you downloaded the document from or a file path converted to a URL). Unfortunately, this cannot be guessed by the hardf parser and has to be provided by you using the `documentIRI` parser creation option, e.g. 
+ ```php +$parser = new TriGParser(["documentIRI" => "http://some.base/iri/"]); +``` + +Long story short if you run into the `subject/predicate/object on line X can not be parsed without knowing the document base IRI.(...)` error, please initialize the parser with the `documentIRI` option. + ### Utility ```php use pietercolpaert\hardf\Util; diff --git a/src/TriGParser.php b/src/TriGParser.php index 012471b..2c1141d 100644 --- a/src/TriGParser.php +++ b/src/TriGParser.php @@ -354,7 +354,7 @@ private function initReaders() // Read the subject entity $this->subject = \call_user_func($this->readEntity, $token); if (null == $this->subject) { - return; + throw $this->getNoBaseUriException('subject', $token['line']); } // In N3 mode, the subject might be a path if ($this->n3Mode) { @@ -398,7 +398,7 @@ private function initReaders() default: $this->predicate = \call_user_func($this->readEntity, $token); if (null == $this->predicate) { - return; + throw $this->getNoBaseUriException('predicate', $token['line']); } } // The next token must be an object @@ -437,7 +437,7 @@ private function initReaders() // Read the object entity $this->object = \call_user_func($this->readEntity, $token); if (null == $this->object) { - return; + throw $this->getNoBaseUriException('object', $token['line']); } // In N3 mode, the object might be a path if ($this->n3Mode) { @@ -575,7 +575,7 @@ private function initReaders() default: $item = \call_user_func($this->readEntity, $token); if (null == $item) { - return; + throw $this->getNoBaseUriException('list item', $token['line']); } } @@ -1218,4 +1218,13 @@ public function end() { return $this->parseChunk('', true); } + + private function getNoBaseUriException($location, $line) + { + return new \Exception( + "$location on line $line can not be parsed without knowing the document base IRI.\n". + "Please set the document base IRI using the documentIRI parser configuration option.\n". 
+ "See https://github.com/pietercolpaert/hardf/#empty-document-base-iri ." + ); + } } diff --git a/test/TriGParserTest.php b/test/TriGParserTest.php index ab482c8..d5072f0 100644 --- a/test/TriGParserTest.php +++ b/test/TriGParserTest.php @@ -69,6 +69,9 @@ public function shouldNotParse($createParser, $input, $expectedError = null): vo $this->fail("Expected this error to be thrown (but it wasn't): ".$expectedError); } }); + if (!$errorReceived) { + $this->fail("Expected this error to be thrown (but it wasn't): ".$expectedError); + } } /** @@ -2067,4 +2070,34 @@ public function testResolve(): void $this->itShouldResolve('http://abc/def/ghi?q=xx/yyy/z', 'jjj', 'http://abc/def/jjj'); $this->itShouldResolve('http://abc/def/ghi?q=xx/y?y/z', 'jjj', 'http://abc/def/jjj'); } + + // https://github.com/pietercolpaert/hardf/issues/37 + public function testIssue37(): void { + // should throw an error on empty subject/predicate/object + $errSuffix = " on line 1 can not be parsed without knowing the document base IRI.\n". + "Please set the document base IRI using the documentIRI parser configuration option.\n". + "See https://github.com/pietercolpaert/hardf/#empty-document-base-iri ."; + $this->shouldNotParse('<> <b> <c> .', 'subject' . $errSuffix); + $this->shouldNotParse('<a> <> <c> .', 'predicate' . $errSuffix); + $this->shouldNotParse('<a> <b> <> .', 'object' . $errSuffix); + + // but should manage with documentIRI being set or @base in the Turtle + $this->shouldParse( + "@base <http://base/> .\n". + "<> <b> <c> .\n". + "<a> <> <c> .\n". + "<a> <b> <> .", + ['http://base/', 'http://base/b', 'http://base/c'], + ['http://base/a', 'http://base/', 'http://base/c'], + ['http://base/a', 'http://base/b', 'http://base/']); + + $parser = function () { return new TriGParser(['documentIRI' => 'http://base/']); }; + $this->shouldParse($parser, + "<> <b> <c> .\n". + "<a> <> <c> .\n". 
+ "<a> <b> <> .", + ['http://base/', 'http://base/b', 'http://base/c'], + ['http://base/a', 'http://base/', 'http://base/c'], + ['http://base/a', 'http://base/b', 'http://base/']); + } }