Load html #347

Open · wants to merge 18 commits into base: main
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -63,6 +63,8 @@
- `omitGraph` based on processingMode.
- Replaced `removePreserve` with `cleanupPreserve` and `cleanupNulls`.
- Remove unused framing `graphStack` code that was removed from the spec.
- Update calls to `documentLoader` to pass options.
- Pass `requestProfile` in `Accept` header when loading documents.

### Added
- Support for `"@import"`.
@@ -74,6 +76,7 @@
- Top level `@graph` omitted if `omitGraph` is `true`.
- Check for invalid values of `@embed`.
- Support default values for `@type` when framing.
- Support for extracting JSON-LD from HTML, when the xmldom package is loaded.

## 2.0.2 - 2020-01-17

2 changes: 1 addition & 1 deletion lib/ContextResolver.js
@@ -163,7 +163,7 @@ module.exports = class ContextResolver {
let remoteDoc;

try {
remoteDoc = await documentLoader(url);
remoteDoc = await documentLoader(url, {});
context = remoteDoc.document || null;
// parse string context as JSON
if(_isString(context)) {
34 changes: 27 additions & 7 deletions lib/documentLoaders/node.js
@@ -3,7 +3,12 @@
*/
'use strict';

const {parseLinkHeader, buildHeaders} = require('../util');
const contentType = require('content-type');

const {
parseLinkHeader,
buildHeaders
} = require('../util');
const {LINK_HEADER_CONTEXT} = require('../constants');
const JsonLdError = require('../JsonLdError');
const RequestQueue = require('../RequestQueue');
@@ -38,11 +43,11 @@ module.exports = ({
const http = require('http');

const queue = new RequestQueue();
return queue.wrapLoader(function(url) {
return loadDocument(url, []);
return queue.wrapLoader(function(url, options) {
return loadDocument(url, options, []);
});

async function loadDocument(url, redirects) {
async function loadDocument(url, options, redirects) {
if(url.indexOf('http:') !== 0 && url.indexOf('https:') !== 0) {
throw new JsonLdError(
'URL could not be dereferenced; only "http" and "https" URLs are ' +
@@ -61,6 +66,12 @@
return doc;
}

// add any optional requestProfile
if(options.requestProfile) {
headers.Accept =
headers.Accept + `, application/ld+json;profile=${options.requestProfile}`;
}
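// For illustration, with the default Accept header from lib/util.js and a
// hypothetical profile IRI, the resulting request header would look like
// (sketch, not part of this diff):
//   Accept: application/ld+json, application/json,
//     application/ld+json;profile=http://www.w3.org/ns/json-ld#context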

let result;
let alternate = null;
try {
@@ -78,8 +89,17 @@
}

const {res, body} = result;
const {type, parameters} = contentType.parse(res);

doc = {
contextUrl: null,
documentUrl: url,
document: body || null,
contentType: type,
profile: parameters.profile
};

doc = {contextUrl: null, documentUrl: url, document: body || null};
// separate profile from content-type

// handle error
const statusText = http.STATUS_CODES[res.statusCode];
@@ -95,7 +115,7 @@

// handle Link Header
if(res.headers.link &&
res.headers['content-type'] !== 'application/ld+json') {
contentType !== 'application/ld+json') {
// only 1 related link header permitted
const linkHeaders = parseLinkHeader(res.headers.link);
const linkedContext = linkHeaders[LINK_HEADER_CONTEXT];
@@ -144,7 +164,7 @@ });
});
}
redirects.push(url);
return loadDocument(res.headers.location, redirects);
return loadDocument(res.headers.location, options, redirects);
}

// cache for each redirected URL
29 changes: 24 additions & 5 deletions lib/documentLoaders/xhr.js
@@ -3,7 +3,12 @@
*/
'use strict';

const {parseLinkHeader, buildHeaders} = require('../util');
const contentType = require('content-type');

const {
parseLinkHeader,
buildHeaders
} = require('../util');
const {LINK_HEADER_CONTEXT} = require('../constants');
const JsonLdError = require('../JsonLdError');
const RequestQueue = require('../RequestQueue');
@@ -31,7 +36,7 @@ module.exports = ({
const queue = new RequestQueue();
return queue.wrapLoader(loader);

async function loader(url) {
async function loader(url, options) {
Comment (Collaborator Author): There don't seem to be standalone XHR documentLoader tests, as there are node tests.

if(url.indexOf('http:') !== 0 && url.indexOf('https:') !== 0) {
throw new JsonLdError(
'URL could not be dereferenced; only "http" and "https" URLs are ' +
@@ -45,6 +50,12 @@
'jsonld.InvalidUrl', {code: 'loading document failed', url});
}

// add any optional requestProfile
if(options.requestProfile) {
headers.Accept =
headers.Accept + `, application/ld+json;profile=${options.requestProfile}`;
}

let req;
try {
req = await _get(xhr, url, headers);
@@ -65,13 +76,21 @@
});
}

let doc = {contextUrl: null, documentUrl: url, document: req.response};
const {type, parameters} = contentType.parse(req);

let doc = {
contextUrl: null,
documentUrl: url,
document: req.response,
contentType: type,
profile: parameters.profile
};
let alternate = null;

// handle Link Header (avoid unsafe header warning by existence testing)
const contentType = req.getResponseHeader('Content-Type');
let linkHeader;
if(REGEX_LINK_HEADER.test(req.getAllResponseHeaders())) {
if(contentType !== 'application/ld+json' &&
REGEX_LINK_HEADER.test(req.getAllResponseHeaders())) {
linkHeader = req.getResponseHeader('Link');
}
if(linkHeader && contentType !== 'application/ld+json') {
7 changes: 4 additions & 3 deletions lib/frame.js
@@ -231,9 +231,10 @@ api.frame = (state, subjects, frame, parent, property = null) => {

// recurse into list
if(graphTypes.isList(o)) {
const subframe = (frame[prop] && frame[prop][0] && frame[prop][0]['@list']) ?
frame[prop][0]['@list'] :
_createImplicitFrame(flags);
const subframe =
(frame[prop] && frame[prop][0] && frame[prop][0]['@list']) ?
frame[prop][0]['@list'] :
_createImplicitFrame(flags);

// add empty list
const list = {'@list': []};
100 changes: 98 additions & 2 deletions lib/jsonld.js
@@ -34,6 +34,7 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
const canonize = require('rdf-canonize');
const contentType = require('content-type');
const util = require('./util');
const ContextResolver = require('./ContextResolver');
const IdentifierIssuer = util.IdentifierIssuer;
@@ -42,6 +43,7 @@ const LRU = require('lru-cache');
const NQuads = require('./NQuads');
const Rdfa = require('./Rdfa');

const {prependBase: _prependBase} = require('./url');
const {expand: _expand} = require('./expand');
const {flatten: _flatten} = require('./flatten');
const {fromRDF: _fromRDF} = require('./fromRdf');
@@ -378,6 +380,7 @@ jsonld.flatten = async function(input, ctx, options) {
// set default options
options = _setDefaults(options, {
base: _isString(input) ? input : '',
extractAllScripts: true,
contextResolver: new ContextResolver(
{sharedCache: _resolvedContextCache})
});
@@ -663,6 +666,7 @@ jsonld.toRDF = async function(input, options) {
// set default options
options = _setDefaults(options, {
base: _isString(input) ? input : '',
extractAllScripts: true,
skipExpansion: false,
contextResolver: new ContextResolver(
{sharedCache: _resolvedContextCache})
@@ -862,6 +866,9 @@ jsonld.documentLoader = async url => {
* @param url the URL to fetch.
* @param [options] the options to use:
* [documentLoader] the document loader to use.
* [extractAllScripts] concatenates all matching script elements.
* [profile] used when selecting from HTML script elements.
* [requestProfile] one or more profile IRIs to use in the request.
*
* @return a Promise that resolves to the retrieved remote document.
*/
@@ -873,7 +880,10 @@ jsonld.get = async function(url, options) {
load = jsonld.documentLoader;
}

const remoteDoc = await load(url);
// FIXME: unescape frag?
Comment (Collaborator Author): Not sure if this is necessary; it's not to pass tests.
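If unescaping turns out to be needed, a minimal sketch (an assumption, not part of this diff) might be:

```js
// decode a percent-encoded fragment so it can match raw script element ids
const [reference, rawFrag] = url.split('#', 2);
const frag = rawFrag === undefined ? rawFrag : decodeURIComponent(rawFrag);
```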

const [reference, frag] = url.split('#', 2);

const remoteDoc = await load(reference, options);

try {
if(!remoteDoc.document) {
@@ -882,9 +892,74 @@
'jsonld.NullRemoteDocument');
}
if(_isString(remoteDoc.document)) {
remoteDoc.document = JSON.parse(remoteDoc.document);
if(remoteDoc.contentType && remoteDoc.contentType.includes('text/html')) {
const domParser = new jsonld.domParser();
const dom = domParser.parseFromString(remoteDoc.document);

// Use any document base
const baseElem = dom.getElementsByTagName('base');
if(baseElem.length > 0) {
const href = baseElem[0].getAttribute('href');
options.base = _prependBase(options.base || reference, href);
}

const scripts = dom.getElementsByTagName('script');
remoteDoc.document = [];

for(let i = 0; i < scripts.length; i++) {
const script = scripts[i];
// only application/ld+json
const {type} = contentType.parse(script.getAttribute('type'));
if(type !== 'application/ld+json') {
continue;
}
if(!script.getAttribute('type').startsWith('application/ld+json')) {
continue;
}
// If url has a fragment identifier, only matching scripts
if(frag && script.getAttribute('id') !== frag) {
continue;
}
try {
remoteDoc.document.push(JSON.parse(script.textContent));
Comment (Collaborator Author): Note that textContent always decodes entities, so there are tests we can't pass. Any ideas?

Comment (Contributor): FWLIW, I threw together a JSBin that uses example 10 to test parsing via jsonld.js 2.0.2: https://jsbin.com/rewaxiquki/edit?html,console. There aren't any entity decoding issues, so I think this issue is probably a bug in xmldom's implementation. Here's another one which works using jsdom: https://runkit.com/embed/9qrv8dl5g2bs

Comment: FYI, I use htmlparser2 in my implementation, and it doesn't seem to be decoding entities, so it allows all tests to pass. So I also think that this is a bug/feature in xmldom.
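A small sketch of the behavior under discussion, assuming the xmldom package as wired up by this PR; per HTML, script element content is raw text, so the entity should survive untouched (the HTML string is an illustrative example):

```js
const {DOMParser} = require('xmldom');

const html =
  '<script type="application/ld+json">{"name": "A &amp; B"}</script>';
const dom = new DOMParser().parseFromString(html, 'text/html');
const script = dom.getElementsByTagName('script')[0];

// expected raw text: {"name": "A &amp; B"}
// an implementation that entity-decodes here returns {"name": "A & B"}
// instead, which is what breaks the affected tests
console.log(script.textContent);
```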

} catch(e) {
throw new JsonLdError(
'Illegal script content.',
'jsonld.InvalidScriptElement', {
code: 'invalid script element',
remoteDoc
});
}
}
if(frag && remoteDoc.document.length === 0) {
throw new JsonLdError(
`No script tag found with id=${frag}.`,
'jsonld.InvalidScriptElement', {
code: 'loading document failed',
remoteDoc
});
}
if(frag || !options.extractAllScripts) {
if(!remoteDoc.document[0]) {
throw new JsonLdError(
`No script tag found.`,
'jsonld.InvalidScriptElement', {
code: 'loading document failed',
remoteDoc
});
}
remoteDoc.document = remoteDoc.document[0];
}
} else {
remoteDoc.document = JSON.parse(remoteDoc.document);
}
}
} catch(e) {
if(e.name === 'jsonld.InvalidScriptElement') {
// pass error detected in HTML decode
throw (e);
}
// otherwise, general loading error
throw new JsonLdError(
'Could not retrieve a JSON-LD document from the URL.',
'jsonld.LoadDocumentError', {
@@ -942,6 +1017,27 @@ jsonld.documentLoaders = {};
jsonld.documentLoaders.node = require('./documentLoaders/node');
jsonld.documentLoaders.xhr = require('./documentLoaders/xhr');

// Optional DOM parser
try {
jsonld.domParser = require('xmldom').DOMParser || class NoDOMParser {
Comment (Member): @davidlehn -- can you comment here? I don't think we can easily support this pattern with webpack. Can you suggest an alternative path forward? Instead of a require here, the user may need to have installed another package themselves that registered a DOM parser with jsonld in a similar way we do with RDF parsers. If so -- we should copy that pattern since it's already used.

Comment (Contributor): Update: PR #341 removes xmldom as a dependency. This PR here (#347) should be updated to take that into account (specifically line 1022 here).
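A sketch of the suggested alternative, mirroring how RDF parsers are registered: the application installs the HTML parser itself and hands it to jsonld, so the library carries no xmldom dependency. The assignment below reuses this PR's jsonld.domParser hook; a dedicated registration function would be an additional assumption.

```js
// application-level opt-in: wire up a DOM parser only where HTML extraction
// is actually needed, instead of jsonld.js requiring xmldom unconditionally
const jsonld = require('jsonld');
const {DOMParser} = require('xmldom');

jsonld.domParser = DOMParser;
```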

parseFromString() {
throw new JsonLdError(
'Could not parse HTML document. ' +
'HTML parsing not implemented.', 'jsonld.LoadDocumentError',
{code: 'loading document failed'});
}
};
} catch(e) {
jsonld.domParser = class NoDOMParser {
parseFromString() {
throw new JsonLdError(
'Could not parse HTML document. ' +
'HTML parsing not implemented.', 'jsonld.LoadDocumentError',
{code: 'loading document failed'});
}
};
}

/**
* Assigns the default document loader for external document URLs to a built-in
* default. Supported types currently include: 'xhr' and 'node'.
1 change: 1 addition & 0 deletions lib/util.js
@@ -15,6 +15,7 @@ const REGEX_LINK_HEADER = /\s*<([^>]*?)>\s*(?:;\s*(.*))?/;
const REGEX_LINK_HEADER_PARAMS =
/(.*?)=(?:(?:"([^"]*?)")|([^"]*?))\s*(?:(?:;\s*)|$)/g;

// FIXME: conditionally support text/html
Comment (Collaborator Author): With HTML support, we should include text/html and application/ld+json, but not sure best way to do that. It will also mess up some documentLoader tests?
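One possible shape for the default header if HTML support is included, with q-values keeping JSON-LD preferred; this is a sketch of an option, not what the PR currently does:

```js
const DEFAULTS = {
  headers: {
    accept: 'application/ld+json, application/json;q=0.9, ' +
      'text/html;q=0.8, application/xhtml+xml;q=0.8'
  }
};
```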

const DEFAULTS = {
headers: {
accept: 'application/ld+json, application/json'
1 change: 1 addition & 0 deletions package.json
@@ -31,6 +31,7 @@
],
"dependencies": {
"canonicalize": "^1.0.1",
"content-type": "^1.0.4",
"lru-cache": "^5.1.1",
"object.fromentries": "^2.0.2",
"rdf-canonize": "^1.0.2",