From 01bf5564c80bd5d449e217f3bd5ca89b2ac6f711 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20=C5=BB=C3=B3=C5=82tak?= Date: Wed, 28 Feb 2024 13:03:20 +0100 Subject: [PATCH] TriGParser: distinguish empty entities from no-entity being read (#42) * TriGParser: distinguish empty entities from no-entity being read See https://github.com/pietercolpaert/hardf/issues/37 (closes #37) * TriGParserTest::testBlankNodes() adjusted * removed the prefixed-only IRIs input line from the first test scenario as this does not belong to the testBlankNodes() tests and is tested already in testIssue37() * turned the empty prefixed IRIs test scenario into two - first, where an error is expected due to unknown document base IRI and second, where parsing succeeds thanks to `documentIRI` parser option being set * Update test/TriGParserTest.php * Update test/TriGParserTest.php --------- Co-authored-by: Konrad Abicht --- README.md | 27 ++++++++++++++++++++++ src/TriGParser.php | 17 ++++++++++---- test/TriGParserTest.php | 51 ++++++++++++++++++++++++++++++++++++----- 3 files changed, 85 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index b9f9ad9..0a1b2de 100644 --- a/README.md +++ b/README.md @@ -182,6 +182,33 @@ $parser->end(); //Needs to be called * `end(): array` * `explicitQuantifiers` - [...] +#### Empty document base IRI + +Some Turtle and N3 documents may use relative-to-the-base-IRI IRI syntax (see [here](https://www.w3.org/TR/turtle/#sec-iri) and [here](https://www.w3.org/TR/turtle/#sec-iri-references)), e.g. + +``` +<> "some value" . +``` + +To properly parse such documents the document base IRI must be known. +Otherwise we might end up with empty IRIs (e.g. for the subject in the example above). + +Sometimes the base IRI is encoded in the document, e.g. + +``` +@base . +<> "some value" . +``` + +but sometimes it is missing. 
+In such a case the [Turtle specification](https://www.w3.org/TR/turtle/#in-html-parsing) requires us to follow section 5.1.1 of the [RFC3986](http://www.ietf.org/rfc/rfc3986.txt) which says that if the base IRI is not encapsulated in the document, it should be assumed to be the document retrieval URI (e.g. the URL you downloaded the document from or a file path converted to a URL). Unfortunately, this cannot be guessed by the hardf parser and has to be provided by you using the `documentIRI` parser creation option, e.g. + +```php +$parser = new TriGParser(["documentIRI" => "http://some.base/iri/"]); +``` + +Long story short, if you run into the `subject/predicate/object on line X can not be parsed without knowing the the document base IRI.(...)` error, please initialize the parser with the `documentIRI` option. + ### Utility ```php use pietercolpaert\hardf\Util; diff --git a/src/TriGParser.php b/src/TriGParser.php index 012471b..2c1141d 100644 --- a/src/TriGParser.php +++ b/src/TriGParser.php @@ -354,7 +354,7 @@ private function initReaders() // Read the subject entity $this->subject = \call_user_func($this->readEntity, $token); if (null == $this->subject) { - return; + throw $this->getNoBaseUriException('subject', $token['line']); } // In N3 mode, the subject might be a path if ($this->n3Mode) { @@ -398,7 +398,7 @@ private function initReaders() default: $this->predicate = \call_user_func($this->readEntity, $token); if (null == $this->predicate) { - return; + throw $this->getNoBaseUriException('predicate', $token['line']); } } // The next token must be an object @@ -437,7 +437,7 @@ private function initReaders() // Read the object entity $this->object = \call_user_func($this->readEntity, $token); if (null == $this->object) { - return; + throw $this->getNoBaseUriException('object', $token['line']); } // In N3 mode, the object might be a path if ($this->n3Mode) { @@ -575,7 +575,7 @@ private function initReaders() default: $item = \call_user_func($this->readEntity, 
$token); if (null == $item) { - return; + throw $this->getNoBaseUriException('list item', $token['line']); } } @@ -1218,4 +1218,13 @@ public function end() { return $this->parseChunk('', true); } + + private function getNoBaseUriException($location, $line) + { + return new \Exception( + "$location on line $line can not be parsed without knowing the the document base IRI.\n". + "Please set the document base IRI using the documentIRI parser configuration option.\n". + "See https://github.com/pietercolpaert/hardf/#empty-document-base-IRI ." + ); + } } diff --git a/test/TriGParserTest.php b/test/TriGParserTest.php index ab482c8..6d31541 100644 --- a/test/TriGParserTest.php +++ b/test/TriGParserTest.php @@ -69,6 +69,9 @@ public function shouldNotParse($createParser, $input, $expectedError = null): vo $this->fail("Expected this error to be thrown (but it wasn't): ".$expectedError); } }); + if (false === $errorReceived) { + $this->fail("Expected this error to be thrown (but it wasn't): ".$expectedError); + } } /** @@ -270,13 +273,19 @@ public function testZeroOrMoreTriples(): void public function testBlankNodes(): void { - // should parse diamonds - $this->shouldParse("<> <> <> <>.\n(<>) <> (<>) <>.", - ['', '', '', ''], - ['_:b0', '', '_:b1', ''], - ['_:b0', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#first', ''], + // should throw an error on empty list item with lacking document base IRI + $this->shouldNotParse("(<>) <> (<>) <>.", + "list item on line 1 can not be parsed without knowing the the document base IRI.\n". + "Please set the document base IRI using the documentIRI parser configuration option.\n". 
+ "See https://github.com/pietercolpaert/hardf/#empty-document-base-IRI ."); + + // but should manage if the parser has documentIRI set + $this->shouldParse(function () { return new TriGParser(['documentIRI' => 'http://base/']); }, + "(<>) <> (<>) <>.", + ['_:b0', 'http://base/', '_:b1', 'http://base/'], + ['_:b0', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#first', 'http://base/'], ['_:b0', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#rest', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#nil'], - ['_:b1', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#first', ''], + ['_:b1', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#first', 'http://base/'], ['_:b1', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#rest', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#nil']); // should parse statements with named blank nodes @@ -2067,4 +2076,34 @@ public function testResolve(): void $this->itShouldResolve('http://abc/def/ghi?q=xx/yyy/z', 'jjj', 'http://abc/def/jjj'); $this->itShouldResolve('http://abc/def/ghi?q=xx/y?y/z', 'jjj', 'http://abc/def/jjj'); } + + // https://github.com/pietercolpaert/hardf/issues/37 + public function testIssue37(): void { + // should throw an error on empty subject/predicate/object + $errSuffix = " on line 1 can not be parsed without knowing the the document base IRI.\n". + "Please set the document base IRI using the documentIRI parser configuration option.\n". + "See https://github.com/pietercolpaert/hardf/#empty-document-base-IRI ."; + $this->shouldNotParse('<> .', 'subject' . $errSuffix); + $this->shouldNotParse(' <> .', 'predicate' . $errSuffix); + $this->shouldNotParse(' <> .', 'object' . $errSuffix); + + // but should manage with documentIRI being set or @base in the turle + $this->shouldParse( + "@base .\n". + "<> .\n". + " <> .\n". 
+ " <> .", + ['http://base/', 'http://base/b', 'http://base/c'], + ['http://base/a', 'http://base/', 'http://base/c'], + ['http://base/a', 'http://base/b', 'http://base/']); + + $parser = function () { return new TriGParser(['documentIRI' => 'http://base/']); }; + $this->shouldParse($parser, + "<> .\n". + " <> .\n". + " <> .", + ['http://base/', 'http://base/b', 'http://base/c'], + ['http://base/a', 'http://base/', 'http://base/c'], + ['http://base/a', 'http://base/b', 'http://base/']); + } }