diff --git a/configs/bucketizer_configs.ttl b/configs/bucketizer_configs.ttl index 1cd5fcc..f65f37e 100644 --- a/configs/bucketizer_configs.ttl +++ b/configs/bucketizer_configs.ttl @@ -108,3 +108,25 @@ sh:minCount 1; ]. +[ ] a sh:NodeShape; + sh:targetClass tree:HourFragmentation; + sh:property [ + sh:name "path"; + sh:path tree:timestampPath; + sh:class rdfl:PathLens; + sh:maxCount 1; + sh:minCount 1; + ], [ + sh:name "pathQuads"; + sh:path tree:timestampPath; + sh:class ; + sh:maxCount 1; + sh:minCount 1; + ], [ + sh:name "unorderedRelations"; + sh:path tree:unorderedRelations; + sh:datatype xsd:boolean; + sh:maxCount 1; + sh:minCount 0; + ]. + diff --git a/package-lock.json b/package-lock.json index 4d27936..104f1f6 100644 --- a/package-lock.json +++ b/package-lock.json @@ -515,6 +515,7 @@ }, "node_modules/@eslint/config-array": { "version": "0.18.0", + "dev": true, "license": "Apache-2.0", "dependencies": { "@eslint/object-schema": "^2.1.4", @@ -527,6 +528,7 @@ }, "node_modules/@eslint/config-array/node_modules/brace-expansion": { "version": "1.1.11", + "dev": true, "license": "MIT", "dependencies": { "balanced-match": "^1.0.0", @@ -535,6 +537,7 @@ }, "node_modules/@eslint/config-array/node_modules/minimatch": { "version": "3.1.2", + "dev": true, "license": "ISC", "dependencies": { "brace-expansion": "^1.1.7" @@ -545,6 +548,7 @@ }, "node_modules/@eslint/eslintrc": { "version": "3.1.0", + "dev": true, "license": "MIT", "dependencies": { "ajv": "^6.12.4", @@ -566,6 +570,7 @@ }, "node_modules/@eslint/eslintrc/node_modules/brace-expansion": { "version": "1.1.11", + "dev": true, "license": "MIT", "dependencies": { "balanced-match": "^1.0.0", @@ -574,6 +579,7 @@ }, "node_modules/@eslint/eslintrc/node_modules/minimatch": { "version": "3.1.2", + "dev": true, "license": "ISC", "dependencies": { "brace-expansion": "^1.1.7" @@ -584,6 +590,7 @@ }, "node_modules/@eslint/js": { "version": "9.9.1", + "dev": true, "license": "MIT", "engines": { "node": "^18.18.0 || ^20.9.0 || 
>=21.1.0" @@ -591,6 +598,7 @@ }, "node_modules/@eslint/object-schema": { "version": "2.1.4", + "dev": true, "license": "Apache-2.0", "engines": { "node": "^18.18.0 || ^20.9.0 || >=21.1.0" @@ -660,6 +668,7 @@ }, "node_modules/@humanwhocodes/retry": { "version": "0.3.0", + "dev": true, "license": "Apache-2.0", "engines": { "node": ">=18.18" @@ -2179,6 +2188,7 @@ }, "node_modules/eslint": { "version": "9.9.1", + "dev": true, "license": "MIT", "dependencies": { "@eslint-community/eslint-utils": "^4.2.0", @@ -2246,6 +2256,7 @@ }, "node_modules/eslint-scope": { "version": "8.0.2", + "dev": true, "license": "BSD-2-Clause", "dependencies": { "esrecurse": "^4.3.0", @@ -2270,6 +2281,7 @@ }, "node_modules/eslint/node_modules/brace-expansion": { "version": "1.1.11", + "dev": true, "license": "MIT", "dependencies": { "balanced-match": "^1.0.0", @@ -2278,6 +2290,7 @@ }, "node_modules/eslint/node_modules/eslint-visitor-keys": { "version": "4.0.0", + "dev": true, "license": "Apache-2.0", "engines": { "node": "^18.18.0 || ^20.9.0 || >=21.1.0" @@ -2288,6 +2301,7 @@ }, "node_modules/eslint/node_modules/minimatch": { "version": "3.1.2", + "dev": true, "license": "ISC", "dependencies": { "brace-expansion": "^1.1.7" @@ -2298,6 +2312,7 @@ }, "node_modules/espree": { "version": "10.1.0", + "dev": true, "license": "BSD-2-Clause", "dependencies": { "acorn": "^8.12.0", @@ -2313,6 +2328,7 @@ }, "node_modules/espree/node_modules/eslint-visitor-keys": { "version": "4.0.0", + "dev": true, "license": "Apache-2.0", "engines": { "node": "^18.18.0 || ^20.9.0 || >=21.1.0" @@ -2487,6 +2503,7 @@ }, "node_modules/file-entry-cache": { "version": "8.0.0", + "dev": true, "license": "MIT", "dependencies": { "flat-cache": "^4.0.0" @@ -2539,6 +2556,7 @@ }, "node_modules/flat-cache": { "version": "4.0.1", + "dev": true, "license": "MIT", "dependencies": { "flatted": "^3.2.9", @@ -2689,6 +2707,7 @@ }, "node_modules/globals": { "version": "14.0.0", + "dev": true, "license": "MIT", "engines": { "node": ">=18" @@ 
/**
 * Bucketizer that places records in one bucket per clock hour.
 *
 * Every timestamp found through the configured path is truncated to the start
 * of its hour in UTC; the ISO-8601 form of that instant is the bucket key.
 * When a record falls in a different hour than the current bucket, a new
 * bucket is requested, relations are added in both directions between the
 * previous and the new bucket, and the previous bucket is marked immutable.
 *
 * The current hour and the "current bucket is the root" flag can be
 * serialised with {@link save} and handed back to the constructor.
 */
export default class HourBucketizer implements Bucketizer {
    /** Number of milliseconds in one hour. */
    private static readonly MS_PER_HOUR = 60 * 60 * 1000;

    protected readonly logger = getLoggerFor(this);

    private readonly path: BasicLensM<Cont, { value: string; literal?: Term }>;
    private readonly pathQuads: RdfThing;
    private readonly unorderedRelations: boolean;

    /** Start of the hour of the current bucket; unset until the first record. */
    private hour?: Date;
    /** Whether the current bucket is the root bucket. */
    private root = true;

    /**
     * @param config Fragmentation settings: the timestamp lens, the quads
     *   describing that path, and whether relations should be untyped.
     * @param save Optional state previously produced by {@link save}.
     */
    constructor(config: HourFragmentation, save?: string) {
        this.path = config.path.mapAll((x) => ({
            value: x.id.value,
            literal: x.id,
        }));
        this.pathQuads = config.pathQuads;
        this.unorderedRelations = config.unorderedRelations ?? false;

        if (save) {
            const parsed = JSON.parse(save) as {
                hour?: unknown;
                root?: unknown;
            };

            // `save()` omits `hour` when no record was seen yet, and an
            // invalid Date serialises to `null`. Only accept a string that
            // parses to a real instant, otherwise start from scratch.
            if (typeof parsed.hour === "string") {
                const restored = new Date(parsed.hour);
                if (!Number.isNaN(restored.getTime())) {
                    this.hour = restored;
                }
            }
            this.root = typeof parsed.root === "boolean" ? parsed.root : true;
        }
    }

    /**
     * Determines the bucket(s) a record belongs to.
     *
     * @param record Record whose data is inspected through the timestamp path.
     * @param getBucket Callback returning the bucket for a key.
     * @param addRelation Callback registering a relation between two buckets.
     * @returns One bucket per distinct, parsable timestamp value of the record.
     */
    bucketize(
        record: Record,
        getBucket: (key: string, root?: boolean) => Bucket,
        addRelation: AddRelation,
    ): Bucket[] {
        // Keep only the first occurrence of each distinct timestamp value.
        const seen = new Set<string>();
        const values = this.path.execute(record.data).filter((x) => {
            if (seen.has(x.value)) {
                return false;
            }
            seen.add(x.value);
            return true;
        });

        const out: Bucket[] = [];

        for (const value of values) {
            if (!value.literal) {
                // The record does not have a timestamp value.
                this.logger.warn(
                    `Received records without timestamp values. Ignoring record '${record.data.id.value}'.`,
                );
                continue;
            }

            const recordDate = new Date(value.literal.value);
            if (Number.isNaN(recordDate.getTime())) {
                // `toISOString()` throws a RangeError on an invalid date, so
                // skip the value instead of aborting the whole record.
                this.logger.warn(
                    `Ignoring unparsable timestamp '${value.literal.value}' of record '${record.data.id.value}'.`,
                );
                continue;
            }

            const recordHour = HourBucketizer.startOfHour(recordDate);

            if (!this.hour) {
                // Create the first (root) bucket.
                this.root = true;
                this.hour = recordHour;
                out.push(getBucket(this.hour.toISOString(), this.root));

                this.logger.debug(
                    `Created root hour bucket ${this.hour.toISOString()}`,
                );
            } else if (recordHour.getTime() !== this.hour.getTime()) {
                // Compare full instants, not only the hour-of-day: a record
                // from the same hour on another day needs its own bucket.
                const newBucket = getBucket(recordHour.toISOString(), false);

                // Add a relation from and to the previous bucket.
                const oldBucket = getBucket(
                    this.hour.toISOString(),
                    this.root,
                );
                this.root = false;
                if (this.unorderedRelations) {
                    addRelation(oldBucket, newBucket, TREE.terms.Relation);
                    addRelation(newBucket, oldBucket, TREE.terms.Relation);
                } else {
                    // The relation value bounds the members of the TARGET
                    // bucket: the new bucket holds values >= its own start,
                    // the old bucket holds values < the end of its hour.
                    const oldHourEnd = new Date(
                        this.hour.getTime() + HourBucketizer.MS_PER_HOUR,
                    );
                    addRelation(
                        oldBucket,
                        newBucket,
                        TREE.terms.GreaterThanOrEqualToRelation,
                        literal(
                            recordHour.toISOString(),
                            namedNode(XSD.dateTime),
                        ),
                        this.pathQuads,
                    );
                    addRelation(
                        newBucket,
                        oldBucket,
                        TREE.terms.LessThanRelation,
                        literal(
                            oldHourEnd.toISOString(),
                            namedNode(XSD.dateTime),
                        ),
                        this.pathQuads,
                    );
                }

                // Mark the old bucket as immutable.
                oldBucket.immutable = true;

                out.push(newBucket);
                this.hour = recordHour;

                this.logger.debug(
                    `Created new hour bucket ${this.hour.toISOString()}`,
                );
            } else {
                // The record belongs to the current bucket.
                out.push(getBucket(this.hour.toISOString(), this.root));
            }
        }

        return out;
    }

    /**
     * Serialises the bucketizer state.
     *
     * @returns JSON with the current hour (ISO string, omitted when no record
     *   was processed yet) and the root flag.
     */
    save(): string {
        return JSON.stringify({
            hour: this.hour,
            root: this.root,
        });
    }

    /**
     * Truncates an instant to the start of its hour, computed on the epoch
     * value so the result does not depend on the host time zone.
     */
    private static startOfHour(date: Date): Date {
        const ms = HourBucketizer.MS_PER_HOUR;
        return new Date(Math.floor(date.getTime() / ms) * ms);
    }
}
save); } throw "Unknown bucketizer " + config.type.value; }