Load html #347

Open · wants to merge 18 commits into base: main
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -63,6 +63,8 @@
- `omitGraph` based on processingMode.
- Replaced `removePreserve` with `cleanupPreserve` and `cleanupNulls`.
- Remove unused framing `graphStack` code that was removed from the spec.
- Update calls to `documentLoader` to pass options.
- Pass `requestProfile` in `Accept` header when loading documents.

### Added
- Support for `"@import"`.
@@ -74,6 +76,7 @@
- Top level `@graph` omitted if `omitGraph` is `true`.
- Check for invalid values of `@embed`.
- Support default values for `@type` when framing.
- Support for extracting JSON-LD from HTML, when the xmldom package is loaded.

## 2.0.2 - 2020-01-17

2 changes: 1 addition & 1 deletion lib/ContextResolver.js
@@ -163,7 +163,7 @@ module.exports = class ContextResolver {
let remoteDoc;

try {
remoteDoc = await documentLoader(url);
remoteDoc = await documentLoader(url, {});
context = remoteDoc.document || null;
// parse string context as JSON
if(_isString(context)) {
34 changes: 27 additions & 7 deletions lib/documentLoaders/node.js
@@ -3,7 +3,12 @@
*/
'use strict';

const {parseLinkHeader, buildHeaders} = require('../util');
const contentType = require('content-type');

const {
parseLinkHeader,
buildHeaders
} = require('../util');
const {LINK_HEADER_CONTEXT} = require('../constants');
const JsonLdError = require('../JsonLdError');
const RequestQueue = require('../RequestQueue');
@@ -38,11 +43,11 @@ module.exports = ({
const http = require('http');

const queue = new RequestQueue();
return queue.wrapLoader(function(url) {
return loadDocument(url, []);
return queue.wrapLoader(function(url, options) {
return loadDocument(url, options, []);
});

async function loadDocument(url, redirects) {
async function loadDocument(url, options, redirects) {
if(url.indexOf('http:') !== 0 && url.indexOf('https:') !== 0) {
throw new JsonLdError(
'URL could not be dereferenced; only "http" and "https" URLs are ' +
@@ -61,6 +66,12 @@
return doc;
}

// add any optional requestProfile
if(options.requestProfile) {
headers.Accept =
headers.Accept + `, application/ld+json;profile=${options.requestProfile}`;
}
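// For illustration, with the default Accept header from lib/util.js and a
// hypothetical profile IRI, the resulting request header would look like
// (sketch, not part of this diff):
//   Accept: application/ld+json, application/json,
//     application/ld+json;profile=http://www.w3.org/ns/json-ld#context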

let result;
let alternate = null;
try {
@@ -78,8 +89,17 @@
}

const {res, body} = result;
const {type, parameters} = contentType.parse(res);

doc = {
contextUrl: null,
documentUrl: url,
document: body || null,
contentType: type,
profile: parameters.profile
};

doc = {contextUrl: null, documentUrl: url, document: body || null};
// separate profile from content-type

// handle error
const statusText = http.STATUS_CODES[res.statusCode];
@@ -95,7 +115,7 @@

// handle Link Header
if(res.headers.link &&
res.headers['content-type'] !== 'application/ld+json') {
contentType !== 'application/ld+json') {
// only 1 related link header permitted
const linkHeaders = parseLinkHeader(res.headers.link);
const linkedContext = linkHeaders[LINK_HEADER_CONTEXT];
@@ -144,7 +164,7 @@ });
});
}
redirects.push(url);
return loadDocument(res.headers.location, redirects);
return loadDocument(res.headers.location, options, redirects);
}

// cache for each redirected URL
29 changes: 24 additions & 5 deletions lib/documentLoaders/xhr.js
@@ -3,7 +3,12 @@
*/
'use strict';

const {parseLinkHeader, buildHeaders} = require('../util');
const contentType = require('content-type');

const {
parseLinkHeader,
buildHeaders
} = require('../util');
const {LINK_HEADER_CONTEXT} = require('../constants');
const JsonLdError = require('../JsonLdError');
const RequestQueue = require('../RequestQueue');
@@ -31,7 +36,7 @@ module.exports = ({
const queue = new RequestQueue();
return queue.wrapLoader(loader);

async function loader(url) {
async function loader(url, options) {
Comment (Collaborator Author): There don't seem to be standalone XHR documentLoader tests, as there are node tests.

if(url.indexOf('http:') !== 0 && url.indexOf('https:') !== 0) {
throw new JsonLdError(
'URL could not be dereferenced; only "http" and "https" URLs are ' +
@@ -45,6 +50,12 @@
'jsonld.InvalidUrl', {code: 'loading document failed', url});
}

// add any optional requestProfile
if(options.requestProfile) {
headers.Accept =
headers.Accept + `, application/ld+json;profile=${options.requestProfile}`;
}

let req;
try {
req = await _get(xhr, url, headers);
@@ -65,13 +76,21 @@
});
}

let doc = {contextUrl: null, documentUrl: url, document: req.response};
const {type, parameters} = contentType.parse(req);

let doc = {
contextUrl: null,
documentUrl: url,
document: req.response,
contentType: type,
profile: parameters.profile
};
let alternate = null;

// handle Link Header (avoid unsafe header warning by existence testing)
const contentType = req.getResponseHeader('Content-Type');
let linkHeader;
if(REGEX_LINK_HEADER.test(req.getAllResponseHeaders())) {
if(contentType !== 'application/ld+json' &&
REGEX_LINK_HEADER.test(req.getAllResponseHeaders())) {
linkHeader = req.getResponseHeader('Link');
}
if(linkHeader && contentType !== 'application/ld+json') {
7 changes: 4 additions & 3 deletions lib/frame.js
@@ -231,9 +231,10 @@ api.frame = (state, subjects, frame, parent, property = null) => {

// recurse into list
if(graphTypes.isList(o)) {
const subframe = (frame[prop] && frame[prop][0] && frame[prop][0]['@list']) ?
frame[prop][0]['@list'] :
_createImplicitFrame(flags);
const subframe =
(frame[prop] && frame[prop][0] && frame[prop][0]['@list']) ?
frame[prop][0]['@list'] :
_createImplicitFrame(flags);

// add empty list
const list = {'@list': []};
100 changes: 98 additions & 2 deletions lib/jsonld.js
@@ -34,6 +34,7 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
const canonize = require('rdf-canonize');
const contentType = require('content-type');
const util = require('./util');
const ContextResolver = require('./ContextResolver');
const IdentifierIssuer = util.IdentifierIssuer;
@@ -42,6 +43,7 @@ const LRU = require('lru-cache');
const NQuads = require('./NQuads');
const Rdfa = require('./Rdfa');

const {prependBase: _prependBase} = require('./url');
const {expand: _expand} = require('./expand');
const {flatten: _flatten} = require('./flatten');
const {fromRDF: _fromRDF} = require('./fromRdf');
@@ -378,6 +380,7 @@ jsonld.flatten = async function(input, ctx, options) {
// set default options
options = _setDefaults(options, {
base: _isString(input) ? input : '',
extractAllScripts: true,
contextResolver: new ContextResolver(
{sharedCache: _resolvedContextCache})
});
@@ -663,6 +666,7 @@ jsonld.toRDF = async function(input, options) {
// set default options
options = _setDefaults(options, {
base: _isString(input) ? input : '',
extractAllScripts: true,
skipExpansion: false,
contextResolver: new ContextResolver(
{sharedCache: _resolvedContextCache})
@@ -862,6 +866,9 @@ jsonld.documentLoader = async url => {
* @param url the URL to fetch.
* @param [options] the options to use:
* [documentLoader] the document loader to use.
* [extractAllScripts] concatenates all matching script elements.
* [profile] used when selecting from HTML script elements.
* [requestProfile] one or more profile IRIs to use in the request.
*
* @return a Promise that resolves to the retrieved remote document.
*/
@@ -873,7 +880,10 @@ jsonld.get = async function(url, options) {
load = jsonld.documentLoader;
}

const remoteDoc = await load(url);
// FIXME: unescape frag?
Comment (Collaborator Author): Not sure if this is necessary; it's not to pass tests.
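If unescaping turns out to be needed, a minimal sketch (an assumption, not part of this diff) might be:

```js
// decode a percent-encoded fragment so it can match raw script element ids
const [reference, rawFrag] = url.split('#', 2);
const frag = rawFrag === undefined ? rawFrag : decodeURIComponent(rawFrag);
```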

const [reference, frag] = url.split('#', 2);

const remoteDoc = await load(reference, options);

try {
if(!remoteDoc.document) {
@@ -882,9 +892,74 @@
'jsonld.NullRemoteDocument');
}
if(_isString(remoteDoc.document)) {
remoteDoc.document = JSON.parse(remoteDoc.document);
if(remoteDoc.contentType && remoteDoc.contentType.includes('text/html')) {
const domParser = new jsonld.domParser();
const dom = domParser.parseFromString(remoteDoc.document);

// Use any document base
const baseElem = dom.getElementsByTagName('base');
if(baseElem.length > 0) {
const href = baseElem[0].getAttribute('href');
options.base = _prependBase(options.base || reference, href);
}

const scripts = dom.getElementsByTagName('script');
remoteDoc.document = [];

for(let i = 0; i < scripts.length; i++) {
const script = scripts[i];
// only application/ld+json
const {type} = contentType.parse(script.getAttribute('type'));
if(type !== 'application/ld+json') {
continue;
}
if(!script.getAttribute('type').startsWith('application/ld+json')) {
continue;
}
// If url has a fragment identifier, only matching scripts
if(frag && script.getAttribute('id') !== frag) {
continue;
}
try {
remoteDoc.document.push(JSON.parse(script.textContent));
Comment (Collaborator Author): Note that textContent always decodes entities, so there are tests we can't pass. Any ideas?

Comment (Contributor): FWLIW, I threw together a JSBin that uses example 10 to test parsing via jsonld.js 2.0.2: https://jsbin.com/rewaxiquki/edit?html,console. There aren't any entity decoding issues, so I think this issue is probably a bug in xmldom's implementation. Here's another one which works using jsdom: https://runkit.com/embed/9qrv8dl5g2bs

Comment: FYI, I use htmlparser2 in my implementation, and it doesn't seem to be decoding entities, so it allows all tests to pass. So I also think that this is a bug/feature in xmldom.
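A small sketch of the behavior under discussion, assuming the xmldom package as wired up by this PR; per HTML, script element content is raw text, so the entity should survive untouched (the HTML string is an illustrative example):

```js
const {DOMParser} = require('xmldom');

const html =
  '<script type="application/ld+json">{"name": "A &amp; B"}</script>';
const dom = new DOMParser().parseFromString(html, 'text/html');
const script = dom.getElementsByTagName('script')[0];

// expected raw text: {"name": "A &amp; B"}
// an implementation that entity-decodes here returns {"name": "A & B"}
// instead, which is what breaks the affected tests
console.log(script.textContent);
```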

} catch(e) {
throw new JsonLdError(
'Illegal script content.',
'jsonld.InvalidScriptElement', {
code: 'invalid script element',
remoteDoc
});
}
}
if(frag && remoteDoc.document.length === 0) {
throw new JsonLdError(
`No script tag found with id=${frag}.`,
'jsonld.InvalidScriptElement', {
code: 'loading document failed',
remoteDoc
});
}
if(frag || !options.extractAllScripts) {
if(!remoteDoc.document[0]) {
throw new JsonLdError(
`No script tag found.`,
'jsonld.InvalidScriptElement', {
code: 'loading document failed',
remoteDoc
});
}
remoteDoc.document = remoteDoc.document[0];
}
} else {
remoteDoc.document = JSON.parse(remoteDoc.document);
}
}
} catch(e) {
if(e.name === 'jsonld.InvalidScriptElement') {
// pass error detected in HTML decode
throw (e);
}
// otherwise, general loading error
throw new JsonLdError(
'Could not retrieve a JSON-LD document from the URL.',
'jsonld.LoadDocumentError', {
@@ -942,6 +1017,27 @@ jsonld.documentLoaders = {};
jsonld.documentLoaders.node = require('./documentLoaders/node');
jsonld.documentLoaders.xhr = require('./documentLoaders/xhr');

// Optional DOM parser
try {
jsonld.domParser = require('xmldom').DOMParser || class NoDOMParser {
Comment (Member): @davidlehn -- can you comment here? I don't think we can easily support this pattern with webpack. Can you suggest an alternative path forward? Instead of a require here, the user may need to have installed another package themselves that registered a DOM parser with jsonld in a similar way we do with RDF parsers. If so -- we should copy that pattern since it's already used.

Comment (Contributor): Update: PR #341 removes xmldom as a dependency. This PR here (#347) should be updated to take that into account (specifically line 1022 here).
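A sketch of the suggested alternative, mirroring how RDF parsers are registered: the application installs the HTML parser itself and hands it to jsonld, so the library carries no xmldom dependency. The assignment below reuses this PR's jsonld.domParser hook; a dedicated registration function would be an additional assumption.

```js
// application-level opt-in: wire up a DOM parser only where HTML extraction
// is actually needed, instead of jsonld.js requiring xmldom unconditionally
const jsonld = require('jsonld');
const {DOMParser} = require('xmldom');

jsonld.domParser = DOMParser;
```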

parseFromString() {
throw new JsonLdError(
'Could not parse HTML document. ' +
'HTML parsing not implemented.', 'jsonld.LoadDocumentError',
{code: 'loading document failed'});
}
};
} catch(e) {
jsonld.domParser = class NoDOMParser {
parseFromString() {
throw new JsonLdError(
'Could not parse HTML document. ' +
'HTML parsing not implemented.', 'jsonld.LoadDocumentError',
{code: 'loading document failed'});
}
};
}

/**
* Assigns the default document loader for external document URLs to a built-in
* default. Supported types currently include: 'xhr' and 'node'.
1 change: 1 addition & 0 deletions lib/util.js
@@ -15,6 +15,7 @@ const REGEX_LINK_HEADER = /\s*<([^>]*?)>\s*(?:;\s*(.*))?/;
const REGEX_LINK_HEADER_PARAMS =
/(.*?)=(?:(?:"([^"]*?)")|([^"]*?))\s*(?:(?:;\s*)|$)/g;

// FIXME: conditionally support text/html
Comment (Collaborator Author): With HTML support, we should include text/html and application/ld+json, but not sure best way to do that. It will also mess up some documentLoader tests?
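One possible shape for the default header if HTML support is included, with q-values keeping JSON-LD preferred; this is a sketch of an option, not what the PR currently does:

```js
const DEFAULTS = {
  headers: {
    accept: 'application/ld+json, application/json;q=0.9, ' +
      'text/html;q=0.8, application/xhtml+xml;q=0.8'
  }
};
```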

const DEFAULTS = {
headers: {
accept: 'application/ld+json, application/json'
1 change: 1 addition & 0 deletions package.json
@@ -31,6 +31,7 @@
],
"dependencies": {
"canonicalize": "^1.0.1",
"content-type": "^1.0.4",
"lru-cache": "^5.1.1",
"object.fromentries": "^2.0.2",
"rdf-canonize": "^1.0.2",