diff --git a/packages/shared/package.json b/packages/shared/package.json index a60460618..8a2d42adc 100644 --- a/packages/shared/package.json +++ b/packages/shared/package.json @@ -30,6 +30,7 @@ "react": "^17.0.2", "react-dom": "^17.0.2", "react-i18next": "^11.18.0", + "react-json-view": "^1.21.3", "ts-endpoint": "^2.0.0", "ts-endpoint-express": "^2.0.0", "ts-io-error": "^2.0.0", diff --git a/packages/shared/src/extension/app.ts b/packages/shared/src/extension/app.ts index 06bb19f15..270cb23d8 100644 --- a/packages/shared/src/extension/app.ts +++ b/packages/shared/src/extension/app.ts @@ -1,4 +1,5 @@ import _ from 'lodash'; +import { ParserFn } from '../providers/parser.provider'; import { HandshakeResponse } from '../models/HandshakeBody'; import { clearCache } from '../providers/dataDonation.provider'; import { FIXED_USER_NAME, initializeKey } from './background/account'; @@ -16,6 +17,7 @@ import log from './logger'; import HubEvent from './models/HubEvent'; import { ServerLookup } from './models/Message'; import UserSettings from './models/UserSettings'; +import { renderUI, RenderUIProps } from './ui'; import { bo } from './utils/browser.utils'; // instantiate a proper logger @@ -67,7 +69,12 @@ interface SetupObserverOpts { onLocationChange: (oldLocation: string, newLocation: string) => void; } -export interface BootOpts { +export interface BootOpts< + S = any, + M = any, + C = any, + PP extends Record> = any +> { payload: ServerLookup['payload']; mapLocalConfig: ( c: UserSettings, @@ -79,6 +86,7 @@ export interface BootOpts { onRegister: (h: Hub, config: UserSettings) => void; }; onAuthenticated: (res: any) => void; + ui?: Omit, 'hub'>; } /** @@ -227,7 +235,12 @@ const serverHandshakeP = ( let loading = false; let app: App | undefined; -export async function boot(opts: BootOpts): Promise { +export async function boot< + S = any, + M = any, + C = any, + PP extends Record> = any +>(opts: BootOpts): Promise { if (app) { appLog.debug('App already booted!'); return app; @@ -316,12 +329,12 @@ export async function boot(opts: BootOpts): Promise { // register platform specific event handlers opts.hub.onRegister(opts.hub.hub, config); - // emergency button should be used when a supported with - // UX hack in place didn't see any UX change, so they - // can report the problem and we can handle it. - // initializeEmergencyButton(); + // render shared ui if configuration is given + if (opts.ui) { + renderUI({ hub: opts.hub.hub, ...opts.ui }); + } - // because the URL has been for sure reloaded, be sure to also + // because the URL has been for sure reloaded, be sure to also clear cache clearCache(); // send the configuration to the server to register the extension diff --git a/packages/shared/src/extension/tooltip/index.tsx b/packages/shared/src/extension/tooltip/index.tsx deleted file mode 100644 index a460f19de..000000000 --- a/packages/shared/src/extension/tooltip/index.tsx +++ /dev/null @@ -1,23 +0,0 @@ -import React from 'react'; -import ReactDOM from 'react-dom'; - -import { createTheme, ThemeProvider } from '@material-ui/core/styles'; - -import Tooltip from './components/tooltip'; - -const theme = createTheme({ - typography: { - fontFamily: 'Trex-Regular', - }, -}); - -function main(): void { - ReactDOM.render( - - - , - document.getElementById('yttrex--tooltip') - ); -} - -main(); diff --git a/packages/shared/src/extension/ui/components/MetadataLogger.tsx b/packages/shared/src/extension/ui/components/MetadataLogger.tsx new file mode 100644 index 000000000..d6e11a7d6 --- /dev/null +++ b/packages/shared/src/extension/ui/components/MetadataLogger.tsx @@ -0,0 +1,438 @@ +import { + Accordion, + AccordionDetails, + AccordionSummary, + Box, + Typography, +} from '@material-ui/core'; +import * as E from 'fp-ts/lib/Either'; +import { pipe } from 'fp-ts/lib/function'; +import * as M from 'fp-ts/lib/Map'; +import * as O from 'fp-ts/lib/Option'; +import * as S from 'fp-ts/lib/string'; +import * as t from 'io-ts'; +import { PathReporter } from 'io-ts/lib/PathReporter'; +import * as React from 'react'; +import ReactJSON from 'react-json-view'; +import { v4 as uuid } from 'uuid'; +import { + executionLoop, + ParserConfiguration, + ParserContext, + ParserFn, +} from '../../../providers/parser.provider'; +import { Hub } from '../../hub'; +import trexLogger from '../../logger'; + +const log = trexLogger.extend('metadata-logger'); + +export interface MetadataLoggerProps< + S, + M, + C extends ParserConfiguration, + PP extends Record> +> { + hub: Hub; + mapEvent: (id: string, e: unknown) => S | null; + decode: t.Decode; + parser: Omit< + ParserContext, + | 'db' + | 'codecs' + | 'getLastContributions' + | 'getMetadata' + | 'saveResults' + | 'getContributions' + >; +} + +interface MetadataLoggerEntry { + source: S; + metadata?: M; + errors: { + decode: string[]; + parser: string[]; + }; +} + +type MetadataLoggerState = Map>; + +const ParserErrorsHeader: React.FC<{ count: number }> = (props) => ( + + Parser Errors {props.count} + +); + +const MetadataHeader: React.FC<{ count: number }> = (props) => ( + + Metadata {props.count} + +); + +const DecodeErrorsHeader: React.FC<{ count: number }> = (props) => ( + + Decode Errors {props.count} + +); + +export const MetadataLogger = < + S extends any, + M extends any, + C extends ParserConfiguration, + PP extends Record> +>({ + hub, + mapEvent, + decode, + parser, +}: MetadataLoggerProps): JSX.Element => { + const [contributions, setContributions] = React.useState< + MetadataLoggerState + >(new Map()); + + const contributionEntries = pipe(contributions, M.toArray(S.Ord)); + + const getContributions = React.useCallback( + (f, s, a) => { + const sources = contributionEntries.map(([key, c]) => c.source); + log.debug('Get last sources %O', sources); + return Promise.resolve({ sources, errors: 0, overflow: false }); + }, + [contributionEntries] + ); + + const getMetadata = React.useCallback( + async (e: any): Promise => { + log.debug('Store %O', contributions); + const entryId = parser.getEntryId(e); + + const m = pipe( + contributions, + M.lookup(S.Eq)(entryId), + O.filterMap((m) => (m.metadata ? O.some(m.metadata) : O.none)), + O.toNullable + ); + + log.debug('Get metadata from entry id %s => %O', entryId, m); + + return m; + }, + [contributionEntries] + ); + + const saveResults = React.useCallback( + (s: any, m: any) => { + log.debug('Save results: source %O, metadata %O', s, m); + const metadataUpsert = pipe( + contributions, + M.lookup(S.Eq)(s.id), + O.fold( + (): MetadataLoggerEntry => ({ + errors: { + parser: [], + decode: [], + }, + source: s, + metadata: m, + }), + (c) => ({ + ...c, + metadata: m, + }) + ) + ); + log.debug('Old contributions %O', contributions); + pipe( + contributions, + M.upsertAt(S.Eq)(parser.getEntryId(s), metadataUpsert), + (cc) => { + log.debug('New contributions %s', cc); + setContributions(cc); + } + ); + + return Promise.resolve({ + metadata: m, + source: s, + count: { metadata: 1 }, + }); + }, + [contributionEntries] + ); + + const parserCtx: ParserContext = { + ...parser, + log, + db: { api: {} as any, read: {} as any, write: {} as any }, + codecs: { metadata: t.any, contribution: t.any }, + getMetadata, + getContributions, + saveResults, + }; + + React.useEffect(() => { + log.debug('Start parsing %O', M.toArray(S.Ord)(contributions)); + const unprocessedContributions = pipe( + contributions, + // once the parser has finished the entry should have either `metadata` or some `errors.parser` + M.filter((s) => s.metadata === undefined && s.errors.parser.length === 0) + ); + + log.debug( + 'Unprocessed contributions %O', + M.toArray(S.Ord)(unprocessedContributions) + ); + + if (unprocessedContributions.size >= 1) { + void executionLoop(parserCtx)({ + singleUse: true, + htmlAmount: 1, + stop: 1, + }).then((r) => { + log.debug('Parser output %O', r); + if (r.type === 'Success') { + const outputContributions = r.payload.reduce((acc, p) => { + const sourceId = parser.getEntryId(p.source); + + let parserErrors: any[] = []; + if (p.failures) { + parserErrors = Object.entries(p.failures).reduce( + (acc, [key, value]) => { + if (value instanceof Error) { + return acc.concat({ + [key]: value.message, + }); + } + + return acc.concat({ + [key]: value, + }); + }, + [] + ); + } + + const entry = pipe( + acc, + M.lookup(S.Eq)(sourceId), + O.fold( + (): MetadataLoggerEntry => ({ + source: p.source, + metadata: undefined, + errors: { + decode: [], + parser: parserErrors, + }, + }), + (m) => ({ + ...m, + metadata: p.metadata, + errors: { + ...m.errors, + parser: parserErrors, + }, + }) + ) + ); + + return pipe(acc, M.upsertAt(S.Eq)(sourceId, entry)); + }, contributions); + + log.debug('Update contributions %O', outputContributions); + setContributions(outputContributions); + } + }); + } + }, [contributionEntries]); + + const onAnyEvent = (event: any): void => { + // log.debug('event received', event); + const eventId = uuid(); + const datum = mapEvent(eventId, event); + + if (datum === null) { + log.debug('Avoid parsing event type %s', event.type); + return; + } + + log.debug('Decoding event %O', datum); + // log.debug('datum mapped %O', datum); + const contribution = decode(datum); + // when the received event decode fails + // we update the `decodeErrors` map + // to render the errors + + if (E.isLeft(contribution)) { + const errors = PathReporter.report(contribution); + + log.debug('Decode errors %O', errors); + + const decodeErrorUpsert = pipe( + contributions, + M.lookup(S.Eq)(eventId), + O.fold( + (): MetadataLoggerEntry => ({ + errors: { parser: [], decode: errors }, + metadata: undefined, + source: datum, + }), + (c) => ({ + ...c, + errors: { + ...c.errors, + decode: errors, + }, + }) + ) + ); + + pipe( + contributions, + M.upsertAt(S.Eq)(eventId, decodeErrorUpsert), + setContributions + ); + } else { + log.debug('Contribution id %s', eventId); + + pipe( + contributions, + M.lookup(S.Eq)(eventId), + O.fold( + () => ({ + metadata: undefined, + source: contribution.right, + errors: { + decode: [], + parser: [], + }, + }), + (c) => ({ ...c, source: contribution.right }) + ), + (c) => { + log.debug('Contribution %O', c); + pipe(contributions, M.upsertAt(S.Eq)(eventId, c), setContributions); + } + ); + } + }; + + React.useEffect(() => { + hub.onAnyEvent(onAnyEvent); + }, []); + + log.debug('contributions', contributions); + + const { summary, details } = React.useMemo(() => { + const metadata = pipe( + contributions, + M.filter((c) => !!c.metadata) + ); + + const decodeErrors = pipe( + contributions, + M.filter((c) => c.errors.decode.length > 0) + ); + + const parserErrors = pipe( + contributions, + M.filter((c) => c.errors.parser.length > 0) + ); + + return { + summary: ( +
+ + + +
+ ), + details: ( +
+
    + {pipe( + contributions, + M.mapWithIndex((key, data) => ( +
  • +
    + + Metadata {(data.metadata as any)?.type} + + + + +
    + + {data.errors.parser.map((l) => ( + + {JSON.stringify(l)} + + ))} + + + + + + {data.errors.decode.map((l) => ( + + {l} + + ))} +
  • + )), + M.toArray(S.Ord), + (entries) => entries.map(([k, e]) => e) + )} +
+
+ ), + }; + }, [contributionEntries]); + + return ( + + + {summary} + {details} + + + ); +}; diff --git a/packages/shared/src/extension/tooltip/components/tooltip.tsx b/packages/shared/src/extension/ui/components/tooltip.tsx similarity index 100% rename from packages/shared/src/extension/tooltip/components/tooltip.tsx rename to packages/shared/src/extension/ui/components/tooltip.tsx diff --git a/packages/shared/src/extension/ui/index.tsx b/packages/shared/src/extension/ui/index.tsx new file mode 100644 index 000000000..3cb66badc --- /dev/null +++ b/packages/shared/src/extension/ui/index.tsx @@ -0,0 +1,69 @@ +import { createTheme, ThemeProvider } from '@material-ui/core/styles'; +import React from 'react'; +import ReactDOM from 'react-dom'; +// import Tooltip from './components/tooltip'; +import { ErrorBoundary } from '../../components/Error/ErrorBoundary'; +import { ParserConfiguration, ParserFn } from '../../providers/parser.provider'; +import { Hub } from '../hub'; +import { + MetadataLogger, + MetadataLoggerProps +} from './components/MetadataLogger'; + +const theme = createTheme({ + typography: { + fontFamily: 'Trex-Regular', + }, +}); + +// const TOOLTIP_ID = 'trex--tooltip'; +const METADATA_LOGGER_ID = 'trex--metadata-logger'; + +export interface RenderUIProps< + S, + M, + C extends ParserConfiguration, + PP extends Record> +> { + hub: Hub; + metadataLogger: Omit, 'hub'>; +} + +export function renderUI< + S, + M, + C extends ParserConfiguration, + PP extends Record> +>({ hub, metadataLogger }: RenderUIProps): void { + // const tooltipNode = document.getElementById(TOOLTIP_ID); + // if (!tooltipNode) { + // const node = document.createElement('div'); + + // node.id = TOOLTIP_ID; + // document.body.append(node); + // ReactDOM.render( + // + // + // , + // node + // ); + // } + + const metadataLoggerNode = document.getElementById(METADATA_LOGGER_ID); + if (!metadataLoggerNode) { + const node = document.createElement('div'); + + node.id = METADATA_LOGGER_ID; + document.body.append(node); + ReactDOM.render( + + + + + + + , + node + ); + } +} diff --git a/packages/shared/src/providers/parser.provider.ts b/packages/shared/src/providers/parser.provider.ts index 47470f080..5a9e4960e 100644 --- a/packages/shared/src/providers/parser.provider.ts +++ b/packages/shared/src/providers/parser.provider.ts @@ -1,22 +1,18 @@ import { formatDistance } from 'date-fns'; import differenceInMinutes from 'date-fns/differenceInMinutes'; import subMinutes from 'date-fns/subMinutes'; -import fs from 'fs'; import * as t from 'io-ts'; import _ from 'lodash'; import { MongoClient } from 'mongodb'; -import path from 'path'; import { Logger, trexLogger } from '../logger'; import { sleep } from '../utils/promise.utils'; -import * as mongo3 from './mongo.provider'; +import type * as mongo3 from './mongo.provider'; /** * The parser configuration */ export interface ParserConfiguration { - errorReporter?: { - basePath: string; - }; + errorReporter?: (e: unknown) => void; } /** @@ -491,24 +487,7 @@ export const parseContributions = results.push({ ...result, ...m }); } else { if (ctx.config.errorReporter) { - const entryNature = ctx.getEntryNatureType(entry) ?? 'failed'; - const fixturePath = path.resolve( - ctx.config.errorReporter?.basePath, - ctx.name, - entryNature, - `${entryId}.json` - ); - ctx.log.debug( - 'Saving problematic contribution as fixture in %s', - fixturePath - ); - fs.writeFileSync( - fixturePath, - JSON.stringify({ - sources: [entry.html], - metadata: {}, - }) - ); + ctx.config.errorReporter(entry); } results.push({ @@ -609,6 +588,7 @@ export const executionLoop = computedFrequency = FREQUENCY; } else { ctx.log.debug('Data to process %d', envelops.sources.length); + lastExecution = new Date(); const currentResult = await parseContributions(ctx)(envelops); ctx.log.debug( 'Processed sources %O', @@ -628,6 +608,7 @@ export const executionLoop = ctx.log.debug('Sleep for %dms', sleepTime); await sleep(sleepTime); } + return { type: 'Success', payload: results }; } catch (e: any) { ctx.log.error('Error in filterChecker', e.message, e.stack); diff --git a/platforms/tktrex/backend/bin/parser.ts b/platforms/tktrex/backend/bin/parser.ts index 04e00ed0e..f5322fcf2 100644 --- a/platforms/tktrex/backend/bin/parser.ts +++ b/platforms/tktrex/backend/bin/parser.ts @@ -3,7 +3,7 @@ import * as mongo3 from '@shared/providers/mongo.provider'; import { GetParserProvider } from '@shared/providers/parser.provider'; import { TKMetadata } from '@tktrex/shared/models/Metadata'; -import { parsers } from '@tktrex/shared/parser'; +import { parsers } from '@tktrex/shared/parser/parsers'; import { HTMLSource } from '@tktrex/shared/parser/source'; import fs from 'fs'; import _ from 'lodash'; @@ -12,10 +12,8 @@ import path from 'path'; import { addDom, buildMetadata, - getLastHTMLs, - updateMetadataAndMarkHTML, - getMetadata, - parserConfig, + getLastHTMLs, getMetadata, + parserConfig, updateMetadataAndMarkHTML } from '../lib/parser'; nconf.argv().env().file({ file: 'config/settings.json' }); @@ -90,8 +88,21 @@ const run = async (): Promise => { getEntryNatureType: (e) => e.html.type, config: { ...parserConfig, - errorReporter: { - basePath: path.resolve(process.cwd(), 'parsers/__tests__/fixtures'), + errorReporter: (e: any) => { + const entryNature = e.html.nature.type ?? 'failed'; + const fixturePath = path.resolve( + path.resolve(process.cwd(), 'parsers/__tests__/fixtures'), + entryNature, + `${e.html.id}.json` + ); + + fs.writeFileSync( + fixturePath, + JSON.stringify({ + sources: [e.html], + metadata: {}, + }) + ); }, }, }).run({ diff --git a/platforms/tktrex/backend/lib/parser.ts b/platforms/tktrex/backend/lib/parser.ts index c614806a3..e4484ae46 100644 --- a/platforms/tktrex/backend/lib/parser.ts +++ b/platforms/tktrex/backend/lib/parser.ts @@ -8,7 +8,7 @@ import { } from '@shared/providers/parser.provider'; import { sanitizeHTML } from '@shared/utils/html.utils'; import { TKMetadata } from '@tktrex/shared/models/Metadata'; -import { TKParsers } from '@tktrex/shared/parser'; +import { TKParsers } from '@tktrex/shared/parser/parsers'; import { TKParserConfig } from '@tktrex/shared/parser/config'; import { HTMLSource } from '@tktrex/shared/parser/source'; import { isValid } from 'date-fns'; diff --git a/platforms/tktrex/extension/src/app/index.ts b/platforms/tktrex/extension/src/app/index.ts index e8b85b3da..ab97823e2 100644 --- a/platforms/tktrex/extension/src/app/index.ts +++ b/platforms/tktrex/extension/src/app/index.ts @@ -3,7 +3,13 @@ import { tiktokDomainRegExp } from '@tktrex/parser/v2/constant'; import { registerTkHandlers } from './handlers'; import { feedId, onLocationChange, tkHandlers, tkTrexActions } from './app'; import tkHub from '../handlers/hub'; +import { HTMLSource } from '@tktrex/parser/source'; +import { toMetadata } from '@tktrex/parser/metadata'; +import { trexLogger } from '@shared/logger'; +import { parsers } from '@tktrex/parser/parsers'; +import { v4 as uuid } from 'uuid'; +const metadataLogger = trexLogger.extend('tk:parser'); // Boot the app script. This is the first function called. void boot({ payload: { @@ -23,4 +29,56 @@ void boot({ onLocationChange, }, onAuthenticated: tkTrexActions, + ui: { + metadataLogger: { + parser: { + name: 'tk:parser', + log: metadataLogger, + parsers, + addDom: (h) => ({ + ...h, + jsdom: new DOMParser().parseFromString(h.html.html, 'text/html'), + }), + getEntryId: (e) => e.html.id, + getEntryDate: (e) => e.html.savingTime, + getEntryNatureType: (e) => e.html.nature.type, + buildMetadata: toMetadata, + config: {}, + }, + decode: HTMLSource.decode, + mapEvent(id, e) { + metadataLogger.debug('Map event %O', e); + const ev: any = e; + + if (ev.type === 'NewVideo') { + return { + html: { + ...ev.payload, + id, + metadataId: uuid(), + blang: '', + counters: [], + clientTime: new Date(), + savingTime: new Date(), + publicKey: '', + type: 'video', + videoId: '', + authorId: '', + nature: { type: 'video', videoId: '', authorId: '' }, + processed: false, + }, + supporter: { + publicKey: '', + version: '', + lastActivity: new Date(), + creationTime: new Date(), + p: '', + }, + jsdom: {}, + }; + } + return null; + }, + }, + }, }); diff --git a/platforms/tktrex/shared/src/parser/index.ts b/platforms/tktrex/shared/src/parser/index.ts deleted file mode 100644 index a28acb0be..000000000 --- a/platforms/tktrex/shared/src/parser/index.ts +++ /dev/null @@ -1,27 +0,0 @@ -import nature from './parsers/nature'; -import description from './parsers/description'; -import music from './parsers/music'; -import hashtags from './parsers/hashtags'; -import metrics from './parsers/metrics'; -import stitch from './parsers/stitch'; -import author from './parsers/author'; -import search from './parsers/search'; -import profile from './parsers/profile'; -import native from './parsers/native'; -import downloader from './parsers/downloader'; - -export const parsers = { - nature, - description, - music, - hashtags, - metrics, - stitch, - author, - search, - profile, - native, - downloader, -}; - -export type TKParsers = typeof parsers; diff --git a/platforms/tktrex/shared/src/parser/metadata.ts b/platforms/tktrex/shared/src/parser/metadata.ts new file mode 100644 index 000000000..9270afde8 --- /dev/null +++ b/platforms/tktrex/shared/src/parser/metadata.ts @@ -0,0 +1,124 @@ +import { BuildMetadataFn } from '@shared/providers/parser.provider'; +import { TKMetadata } from '../models/Metadata'; +import { TKParsers } from './parsers'; +import { HTMLSource } from './source'; +import { isValid } from 'date-fns'; +import _ from 'lodash'; + +export const toMetadata: BuildMetadataFn = ( + entry +) => { + // this contains the original .source (html, impression, timeline), the .findings and .failures + // the metadata is aggregated by unit and not unrolled in any way + if (!entry?.findings?.nature) return null; + + let metadata: any = { + clientTime: entry.source.html.clientTime, + }; + + switch (entry.findings.nature.type) { + case 'foryou': { + const { + nature, + author, + description, + hashtags, + metrics, + music, + downloader, + } = entry.findings; + metadata = { + ...metadata, + ...nature, + nature, + ...description, + author, + metrics, + music, + hashtags, + ...downloader, + }; + break; + } + case 'search': { + const { nature, downloader, search } = entry.findings; + metadata = { + ...metadata, + ...nature, + nature, + ...downloader, + ...search, + }; + metadata.query = _.toLower(metadata.query); + metadata.nature.query = metadata.query; + break; + } + case 'profile': { + const { nature, profile, downloader } = entry.findings; + metadata = { + ...metadata, + nature, + ...nature, + ...downloader, + ...profile, + }; + break; + } + case 'video': + case 'native': { + const { + nature, + description, + music, + hashtags, + metrics, + stitch, + author, + downloader, + native, + } = entry.findings; + metadata = { + ...nature, + nature, + ...description, + music, + hashtags, + metrics, + stitch, + author, + ...downloader, + ...native, + }; + break; + } + default: { + metadata = { + ...metadata, + ...entry.findings, + ...entry.findings.nature, + }; + } + } + + /* fixed fields */ + metadata.savingTime = isValid(entry.source.html.savingTime) + ? entry.source.html.savingTime.toISOString() + : entry.source.html.savingTime; + metadata.clientTime = isValid(entry.source.html.clientTime) + ? entry.source.html.clientTime.toISOString() + : entry.source.html.clientTime; + metadata.id = entry.source.html.id; + metadata.publicKey = entry.source.html.publicKey; + metadata.timelineId = entry.source.html.timelineId; + metadata.order = entry.source.html.n?.[0]; + + /* optional fields */ + if (entry.source.html.geoip?.length === 2) + metadata.geoip = entry.source.html.geoip; + if (entry.source.html.researchTag?.length) + metadata.researchTag = entry.source.html.researchTag; + if (entry.source.html.experimentId?.length) + metadata.experimentId = entry.source.html.experimentId; + + return metadata; +}; diff --git a/platforms/tktrex/shared/src/parser/parsers/index.ts b/platforms/tktrex/shared/src/parser/parsers/index.ts new file mode 100644 index 000000000..f063158c2 --- /dev/null +++ b/platforms/tktrex/shared/src/parser/parsers/index.ts @@ -0,0 +1,28 @@ +import nature from './nature'; +import description from './description'; +import music from './music'; +import hashtags from './hashtags'; +import metrics from './metrics'; +import stitch from './stitch'; +import author from './author'; +import search from './search'; +import profile from './profile'; +import native from './native'; +// import downloader from './downloader'; + +export const parsers = { + nature, + description, + music, + hashtags, + metrics, + stitch, + author, + search, + profile, + native, + downloader: () => Promise.resolve({}), + // downloader, +}; + +export type TKParsers = typeof parsers; diff --git a/platforms/yttrex/backend/__tests__/parser/html/parseHome.e2e.ts b/platforms/yttrex/backend/__tests__/parser/html/parseHome.e2e.ts index 82b80554d..98dd9f4ea 100644 --- a/platforms/yttrex/backend/__tests__/parser/html/parseHome.e2e.ts +++ b/platforms/yttrex/backend/__tests__/parser/html/parseHome.e2e.ts @@ -16,10 +16,10 @@ import { getMetadata, getMetadataSchema, getSourceSchema, - toMetadata, updateMetadataAndMarkHTML, } from '../../../lib/parser/html'; import processHome from '@yttrex/shared/parser/parsers/home'; +import { toMetadata } from '@yttrex/shared/parser/metadata'; import { GetTest, Test } from '../../../tests/Test'; describe('Parser: home', () => { diff --git a/platforms/yttrex/backend/__tests__/parser/html/parseSearch.e2e.ts b/platforms/yttrex/backend/__tests__/parser/html/parseSearch.e2e.ts index c6243fc5a..ad43b0204 100644 --- a/platforms/yttrex/backend/__tests__/parser/html/parseSearch.e2e.ts +++ b/platforms/yttrex/backend/__tests__/parser/html/parseSearch.e2e.ts @@ -7,11 +7,11 @@ import { getMetadata, getMetadataSchema, getSourceSchema, - toMetadata, updateMetadataAndMarkHTML, } from '../../../lib/parser/html'; import { SearchMetadata } from '@yttrex/shared/models/Metadata'; import { HTMLSource, parsers } from '@yttrex/shared/parser'; +import { toMetadata } from '@yttrex/shared/parser/metadata'; import { GetTest, Test } from '../../../tests/Test'; import { readFixtureJSON, diff --git a/platforms/yttrex/backend/__tests__/parser/html/parseVideo.e2e.ts b/platforms/yttrex/backend/__tests__/parser/html/parseVideo.e2e.ts index b452034d0..0b51f9c7d 100644 --- a/platforms/yttrex/backend/__tests__/parser/html/parseVideo.e2e.ts +++ b/platforms/yttrex/backend/__tests__/parser/html/parseVideo.e2e.ts @@ -8,6 +8,7 @@ import { } from '@shared/test/utils/parser.utils'; import { VideoMetadata } from '@yttrex/shared/models/Metadata'; import { HTMLSource, parsers } from '@yttrex/shared/parser'; +import { toMetadata } from '@yttrex/shared/parser/metadata'; import base58 from 'bs58'; import { addMinutes, parseISO } from 'date-fns'; import path from 'path'; @@ -19,7 +20,6 @@ import { getMetadata, getMetadataSchema, getSourceSchema, - toMetadata, updateMetadataAndMarkHTML, } from '../../../lib/parser/html'; diff --git a/platforms/yttrex/backend/bin/leaves-parser.ts b/platforms/yttrex/backend/bin/leaves-parser.ts index a1baafb4b..5f774f387 100644 --- a/platforms/yttrex/backend/bin/leaves-parser.ts +++ b/platforms/yttrex/backend/bin/leaves-parser.ts @@ -9,6 +9,7 @@ import { LeafParsers, leafParsers } from '@yttrex/shared/parser/parsers'; import fs from 'fs'; import _ from 'lodash'; import nconf from 'nconf'; +import path from 'path'; import { addDom, getLastLeaves, @@ -88,7 +89,25 @@ const run = async (): Promise => { getEntryNatureType: (e) => e.html.nature.type, buildMetadata: toMetadata, saveResults: updateAdvertisingAndMetadata(db), - config: parserConfig, + config: { + ...parserConfig, + errorReporter: (e: LeafSource) => { + const entryNature = e.html.nature.type ?? 'failed'; + const fixturePath = path.resolve( + path.resolve(process.cwd(), '__tests__/fixtures/htmls'), + entryNature, + `${e.html.id}.json` + ); + + fs.writeFileSync( + fixturePath, + JSON.stringify({ + sources: [e.html], + metadata: {}, + }) + ); + }, + }, }).run({ singleUse: typeof id === 'string' ? id : false, filter, diff --git a/platforms/yttrex/backend/bin/parser.ts b/platforms/yttrex/backend/bin/parser.ts index 63cddc1cc..3d0c8daad 100644 --- a/platforms/yttrex/backend/bin/parser.ts +++ b/platforms/yttrex/backend/bin/parser.ts @@ -6,7 +6,6 @@ import _ from 'lodash'; import nconf from 'nconf'; import { getLastHTMLs, - toMetadata, updateMetadataAndMarkHTML, addDom, getMetadata, @@ -14,6 +13,7 @@ import { import { parserConfig } from '@yttrex/shared/parser/config'; import { GetParserProvider } from '@shared/providers/parser.provider'; import { HTMLSource } from '@yttrex/shared/parser/source'; +import { toMetadata } from '@yttrex/shared/parser/metadata'; import { Metadata } from '@yttrex/shared/models/Metadata'; import { parsers } from '@yttrex/shared/parser/parsers'; import path from 'path'; @@ -90,8 +90,21 @@ const run = async (): Promise => { saveResults: updateMetadataAndMarkHTML(db), config: { ...parserConfig, - errorReporter: { - basePath: path.resolve(process.cwd(), './__tests__/fixtures'), + errorReporter: (e: HTMLSource) => { + const entryNature = e.html.nature.type ?? 'failed'; + const fixturePath = path.resolve( + path.resolve(process.cwd(), '__tests__/fixtures/htmls'), + entryNature, + `${e.html.id}.json` + ); + + fs.writeFileSync( + fixturePath, + JSON.stringify({ + sources: [e.html], + metadata: {}, + }) + ); }, }, }) diff --git a/platforms/yttrex/backend/lib/parser/html.ts b/platforms/yttrex/backend/lib/parser/html.ts index 8b95d123b..250b81e28 100644 --- a/platforms/yttrex/backend/lib/parser/html.ts +++ b/platforms/yttrex/backend/lib/parser/html.ts @@ -1,16 +1,14 @@ import { - BuildMetadataFn, ContributionAndDOMFn, GetContributionsFn, GetMetadataFn, ParserProviderContextDB, - SaveResults, + SaveResults } from '@shared/providers/parser.provider'; import { sanitizeHTML } from '@shared/utils/html.utils'; import { Metadata } from '@yttrex/shared/models/Metadata'; import { Supporter } from '@yttrex/shared/models/Supporter'; -import { HTMLSource, Parsers } from '@yttrex/shared/parser'; -import { isValid } from 'date-fns'; +import { HTMLSource } from '@yttrex/shared/parser'; import { JSDOM } from 'jsdom'; import _ from 'lodash'; import nconf from 'nconf'; @@ -87,91 +85,6 @@ export const getMetadata = ); }; -export const toMetadata: BuildMetadataFn = ( - entry, - oldMetadata -) => { - // this contains the original .source (html, impression, timeline), the .findings and .failures - // the metadata is aggregated by unit and not unrolled in any way - if (!entry?.findings?.nature) return null; - - let metadata: any = {}; - metadata.savingTime = isValid(entry.source.html.savingTime) - ? entry.source.html.savingTime - : new Date(entry.source.html.savingTime); - metadata.clientTime = isValid(entry.source.html.clientTime) - ? entry.source.html.clientTime - : new Date(entry.source.html.clientTime); - metadata.id = entry.source.html.metadataId; - metadata.publicKey = entry.source.html.publicKey; - - if ( - entry.source.html.experimentId && - entry.source.html.experimentId.length > 0 - ) { - metadata.experimentId = entry.source.html.experimentId; - } - - if ( - entry.source.html.researchTag && - entry.source.html.researchTag.length > 0 - ) { - metadata.researchTag = entry.source.html.researchTag; - } - - if (entry.findings.nature.type === 'search') { - metadata = { - ...entry.findings.nature, - ...entry.findings.search, - ...metadata, - }; - - return metadata; - } - if (entry.findings.nature.type === 'video') { - const videoNature: any = entry.findings.nature; - const related = videoNature.related - .reduce((acc: any[], m) => { - const index = acc.findIndex( - (r) => - r.videoId === m.videoId || (r.params?.v && r.params.v === m.videoId) - ); - if (index > -1) { - acc[index] = { - ...acc[index], - ...m, - index: acc[index].index, - }; - return acc; - } - - return acc.concat(m); - }, (oldMetadata as any)?.related ?? []); - - metadata = { - href: entry.source.html.href, - ...entry.findings.nature, - ...metadata, - related, - }; - return metadata; - } - - /* else ... */ - metadata = { - ...entry.findings.nature, - href: entry.source.html.href, - ...metadata, - }; - - if (Array.isArray(entry.source.html.n)) { - metadata.order = entry.source.html.n[0]; - } - - // from routes/events.js the 0 is videoCounter, client side - return metadata; -}; - export const updateMetadataAndMarkHTML = (db: ParserProviderContextDB): SaveResults => async (source, metadata) => { diff --git a/platforms/yttrex/backend/routes/__tests__/personal.e2e.ts b/platforms/yttrex/backend/routes/__tests__/personal.e2e.ts index ab97b58d2..ba601fd8d 100644 --- a/platforms/yttrex/backend/routes/__tests__/personal.e2e.ts +++ b/platforms/yttrex/backend/routes/__tests__/personal.e2e.ts @@ -14,12 +14,12 @@ import { Metadata } from '@yttrex/shared/models/Metadata'; import { pipe } from 'fp-ts/lib/function'; import * as fs from 'fs'; import { parserConfig } from '@yttrex/shared/parser/config'; +import { toMetadata as toHTMLMetadata } from '@yttrex/shared/parser/metadata'; import * as path from 'path'; import { addDom as addDOMToHTML, getLastHTMLs, getMetadata as getHTMLMetadata, - toMetadata as toHTMLMetadata, updateMetadataAndMarkHTML, } from '../../lib/parser/html'; import { diff --git a/platforms/yttrex/extension/src/app/app.ts b/platforms/yttrex/extension/src/app/app.ts index 87b46c1a1..4824aa1ed 100644 --- a/platforms/yttrex/extension/src/app/app.ts +++ b/platforms/yttrex/extension/src/app/app.ts @@ -34,6 +34,8 @@ import { import logger from '@shared/extension/logger'; import UserSettings from '@shared/extension/models/UserSettings'; import { sizeCheck } from '@shared/providers/dataDonation.provider'; +import { hasVideosInBody } from '@yttrex/shared/parser/parsers/searches'; +import { getVideoTitle } from '@yttrex/shared/parser/parsers/video'; import { consideredURLs, leafSelectors, @@ -280,16 +282,33 @@ export const handleRoute = ( return; } - hub.dispatch({ - type: 'NewVideo', - payload: { - type: urlkind, - element: sendableNode.outerHTML, - size: sendableNode.outerHTML.length, - href: window.location.href, - randomUUID: feedId, - }, - }); + if (hasVideosInBody(document)) { + hub.dispatch({ + type: 'NewVideo', + payload: { + type: urlkind, + element: sendableNode.outerHTML, + size: sendableNode.outerHTML.length, + href: window.location.href, + randomUUID: feedId, + }, + }); + return; + } + + if (getVideoTitle(document)) { + hub.dispatch({ + type: 'NewVideo', + payload: { + type: urlkind, + element: sendableNode.outerHTML, + size: sendableNode.outerHTML.length, + href: window.location.href, + randomUUID: feedId, + }, + }); + } + updateUI('video.send'); }; diff --git a/platforms/yttrex/extension/src/app/index.ts b/platforms/yttrex/extension/src/app/index.ts index 453c6d5b0..c1627153f 100644 --- a/platforms/yttrex/extension/src/app/index.ts +++ b/platforms/yttrex/extension/src/app/index.ts @@ -3,10 +3,17 @@ import { boot } from '@shared/extension/app'; import { bo } from '@shared/extension/utils/browser.utils'; +import { toMetadata } from '@yttrex/shared/parser/metadata'; +import nature, { getNatureFromURL } from '@yttrex/shared/parser/parsers/nature'; +import search from '@yttrex/shared/parser/parsers/searches'; import { youtubeDomainRegExp } from '@yttrex/shared/parser/selectors'; +import { HTMLSource } from '@yttrex/shared/parser/source'; import * as hubHandlers from '../handlers/events'; import ytHub from '../handlers/hub'; import { onLocationChange, watchedPaths, ytLogger, ytTrexActions } from './app'; +import { v4 as uuid } from 'uuid'; +import { trexLogger } from '@shared/logger'; +const metadataParserLogger = trexLogger.extend('parser'); bo.runtime.sendMessage({ type: 'chromeConfig' }, (config) => { ytLogger.info('Booting app with config %O', config); @@ -36,6 +43,62 @@ bo.runtime.sendMessage({ type: 'chromeConfig' }, (config) => { }, }, onAuthenticated: ytTrexActions, + ui: { + metadataLogger: { + parser: { + name: 'yt-metadata', + log: metadataParserLogger, + addDom: (h) => ({ + ...h, + jsdom: new DOMParser().parseFromString(h.html.html, 'text/html'), + }), + buildMetadata: toMetadata as any, + parsers: { + nature, + search, + }, + getEntryDate: (e) => e.html.clientTime, + getEntryNatureType: (e) => e.html.nature.type, + getEntryId: (e) => { + metadataParserLogger.debug('Get entry %O', e); + return e.html.id; + }, + config: {}, + }, + decode: HTMLSource.decode, + mapEvent: (id, e) => { + const ev: any = e; + if (ev.type === 'NewVideo') { + const { element, ...payload } = ev.payload; + return { + html: { + id, + metadataId: uuid(), + blang: '', + publicKey: '', + clientTime: new Date(), + savingTime: new Date(), + nature: getNatureFromURL(payload.href), + counters: [], + ...payload, + html: element, + processed: false, + }, + supporter: { + publicKey: '', + lastActivity: new Date(), + version: '', + creationTime: new Date(), + p: '', + }, + jsdom: {}, + }; + } + + return null; + }, + }, + }, }); } catch (e) { // eslint-disable-next-line diff --git a/platforms/yttrex/extension/tsconfig.json b/platforms/yttrex/extension/tsconfig.json index e1fb8c8d7..5e393a57b 100644 --- a/platforms/yttrex/extension/tsconfig.json +++ b/platforms/yttrex/extension/tsconfig.json @@ -29,7 +29,7 @@ "path": "../../../packages/shared" } ], - "include": ["./"], + "include": ["./src"], "exclude": [ "./jest.config.js", "./jest.setup.ts", diff --git a/platforms/yttrex/shared/src/models/Nature.ts b/platforms/yttrex/shared/src/models/Nature.ts index f09a80880..d6e27e6a2 100644 --- a/platforms/yttrex/shared/src/models/Nature.ts +++ b/platforms/yttrex/shared/src/models/Nature.ts @@ -31,5 +31,14 @@ export const ChannelN = t.strict( 'ChannelN' ); -export const Nature = t.union([HomeN, SearchN, VideoN, ChannelN], 'Nature'); +export const HashtagN = t.strict( + { + type: t.literal('hashtag'), + hashtag: t.string + + }, + 'ChannelN' +); + +export const Nature = t.union([HomeN, SearchN, VideoN, ChannelN, HashtagN], 'Nature'); export type Nature = t.TypeOf; diff --git a/platforms/yttrex/shared/src/parser/metadata.ts b/platforms/yttrex/shared/src/parser/metadata.ts new file mode 100644 index 000000000..525a4d6a3 --- /dev/null +++ b/platforms/yttrex/shared/src/parser/metadata.ts @@ -0,0 +1,88 @@ +import { BuildMetadataFn } from '@shared/providers/parser.provider'; +import { isValid } from 'date-fns'; +import { Parsers } from './parsers'; +import { HTMLSource } from './source'; + +export const toMetadata: BuildMetadataFn = ( + entry, + oldMetadata +) => { + // this contains the original .source (html, impression, timeline), the .findings and .failures + // the metadata is aggregated by unit and not unrolled in any way + if (!entry?.findings?.nature) return null; + + let metadata: any = {}; + metadata.savingTime = isValid(entry.source.html.savingTime) + ? entry.source.html.savingTime + : new Date(entry.source.html.savingTime); + metadata.clientTime = isValid(entry.source.html.clientTime) + ? entry.source.html.clientTime + : new Date(entry.source.html.clientTime); + metadata.id = entry.source.html.metadataId; + metadata.publicKey = entry.source.html.publicKey; + + if ( + entry.source.html.experimentId && + entry.source.html.experimentId.length > 0 + ) { + metadata.experimentId = entry.source.html.experimentId; + } + + if ( + entry.source.html.researchTag && + entry.source.html.researchTag.length > 0 + ) { + metadata.researchTag = entry.source.html.researchTag; + } + + if (entry.findings.nature.type === 'search') { + metadata = { + ...entry.findings.nature, + ...entry.findings.search, + ...metadata, + }; + + return metadata; + } + if (entry.findings.nature.type === 'video') { + const videoNature: any = entry.findings.nature; + const related = videoNature.related.reduce((acc: any[], m) => { + const index = acc.findIndex( + (r) => + r.videoId === m.videoId || (r.params?.v && r.params.v === m.videoId) + ); + if (index > -1) { + acc[index] = { + ...acc[index], + ...m, + index: acc[index].index, + }; + return acc; + } + + return acc.concat(m); + }, (oldMetadata as any)?.related ?? []); + + metadata = { + href: entry.source.html.href, + ...entry.findings.nature, + ...metadata, + related, + }; + return metadata; + } + + /* else ... */ + metadata = { + ...entry.findings.nature, + href: entry.source.html.href, + ...metadata, + }; + + if (Array.isArray(entry.source.html.n)) { + metadata.order = entry.source.html.n[0]; + } + + // from routes/events.js the 0 is videoCounter, client side + return metadata; +}; diff --git a/platforms/yttrex/shared/src/parser/parsers/nature.ts b/platforms/yttrex/shared/src/parser/parsers/nature.ts index eff78a42a..3981caaca 100644 --- a/platforms/yttrex/shared/src/parser/parsers/nature.ts +++ b/platforms/yttrex/shared/src/parser/parsers/nature.ts @@ -6,9 +6,53 @@ import processHome from './home'; import { processSearch } from './searches'; import parseVideo from './video'; import { trexLogger } from '@shared/logger'; +import * as _ from 'lodash'; const natureLogger = trexLogger.extend('parser:nature'); +export function getNatureFromURL(href: string): Nature | null { + // this function MUST support the different URLs + // format specify in ../../extension/src/consideredURLs.js + const uq = new URL(href); + if (uq.pathname === '/results') { + const searchTerms = uq.searchParams.get('search_query'); + return { + type: 'search', + query: searchTerms as any, + }; + } else if (uq.pathname === '/watch') { + const videoId = uq.searchParams.get('v') as any; + return { + type: 'video', + videoId, + }; + } else if (uq.pathname === '/') { + return { + type: 'home', + }; + } else if (_.startsWith(uq.pathname, '/hashtag')) { + const hashtag = uq.pathname.split('/').pop() as any; + return { + type: 'hashtag', + hashtag, + }; + } else if ( + _.startsWith(uq.pathname, '/channel') || + _.startsWith(uq.pathname, '/user') || + _.startsWith(uq.pathname, '/c') + ) { + const authorSource = uq.pathname.split('/').pop() as any; + return { + type: 'channel', + authorSource, + }; + } else { + natureLogger.debug('Unknow condition: %s', uq.href); + + return null; + } +} + const processNature = (type: Nature['type']): ParserFn => (e, findings, ctx) => { diff --git a/platforms/yttrex/shared/src/parser/parsers/searches.ts b/platforms/yttrex/shared/src/parser/parsers/searches.ts index 01d22b645..d8f978796 100644 --- a/platforms/yttrex/shared/src/parser/parsers/searches.ts +++ b/platforms/yttrex/shared/src/parser/parsers/searches.ts @@ -125,6 +125,9 @@ function unpackCorrection(corelem): string[] { return _.compact(_.flatten(_.map(corelem.children, unpackCorrection))); } +export const hasVideosInBody = (D: Document): NodeListOf => { + return D.querySelectorAll('ytd-video-renderer') +} export const processSearch: ParserFn< HTMLSource, any | null, @@ -136,14 +139,16 @@ export const processSearch: ParserFn< until you figure if belong to a 'macrosection' ("For You", "People Also Watched") or not. By using ytd-video-renderer the order is attributed */ - const videos = envelop.jsdom.querySelectorAll('ytd-video-renderer'); + const videos = hasVideosInBody(envelop.jsdom); if (!videos.length) { debuge( "Search result of %s doesn't seem having any video!", envelop.html.id ); - return null; + throw new Error( + `Search result of ${envelop.html.id} doesn't seem to have any video` + ); } const dissected = _.map(videos, dissectVideoAndParents); const results = _.compact(dissected); diff --git a/platforms/yttrex/shared/src/parser/parsers/uxlang.js b/platforms/yttrex/shared/src/parser/parsers/uxlang.js index 3e3cdbf84..f1b731492 100644 --- a/platforms/yttrex/shared/src/parser/parsers/uxlang.js +++ b/platforms/yttrex/shared/src/parser/parsers/uxlang.js @@ -268,6 +268,11 @@ const localizedFirstButton = [ first: 'Suchen', iso2: 'de', }, + { + type: 'video', + first: 'Salta link di navigazione', + iso2: 'it', + }, ]; function findLanguage(type, chunks) { diff --git a/platforms/yttrex/shared/src/parser/parsers/video.ts b/platforms/yttrex/shared/src/parser/parsers/video.ts index cc2a738e0..275803855 100644 --- a/platforms/yttrex/shared/src/parser/parsers/video.ts +++ b/platforms/yttrex/shared/src/parser/parsers/video.ts @@ -68,8 +68,8 @@ function parseLikes(D: Document): likesParser.Likes { const nodes = D.querySelectorAll( '.ytd-toggle-button-renderer > yt-formatted-string' ); - const likes = nodes[0].getAttribute('aria-label'); - const dislikes = nodes[1].getAttribute('aria-label'); + const likes = nodes[0]?.getAttribute('aria-label'); + const dislikes = nodes[1]?.getAttribute('aria-label'); const likeInfo = { likes, dislikes, @@ -303,7 +303,7 @@ export function parseSingleTry(D: Document, memo: any, spec: any): any { } } -function manyTries(D: Document, opportunities): any { +function manyTries(D: Document, opportunities: any[]): string | null { const r = _.reduce(opportunities, _.partial(parseSingleTry, D), null); videoLog.debug('manyTries: %j: %s', _.map(opportunities, 'name'), r); return r; @@ -381,15 +381,8 @@ export function simpleTitlePicker(D: Document, blang: string): string | null { return null; } -export function processVideo( - D: Document, - blang: string, - clientTime: Date, - urlinfo?: URL -): VideoMetadata { - /* this method to extract title was a nice experiment - * and/but should be refactored and upgraded */ - let title = manyTries(D, [ +export const getVideoTitle = (D: Document): string | null => { + return manyTries(D, [ { name: 'title h1', selector: 'h1 > yt-formatted-string', @@ -404,7 +397,18 @@ export function processVideo( selected: null, func: 'textContent', }, - ]); + ]) +} + +export function processVideo( + D: Document, + blang: string, + clientTime: Date, + urlinfo?: URL +): VideoMetadata { + /* this method to extract title was a nice experiment + * and/but should be refactored and upgraded */ + let title = getVideoTitle(D); if (!title) title = simpleTitlePicker(D, blang); diff --git a/yarn.lock b/yarn.lock index 4aa9af370..0ecc4c90a 100644 --- a/yarn.lock +++ b/yarn.lock @@ -4480,6 +4480,7 @@ __metadata: react: ^17.0.2 react-dom: ^17.0.2 react-i18next: ^11.18.0 + react-json-view: ^1.21.3 react-refresh: ^0.12.0 react-refresh-typescript: ^2.0.7 sass-loader: ^12.6.0