updated document scripts #2382

Draft · wants to merge 1 commit into master
65 changes: 65 additions & 0 deletions scripts/ai/collect-mods-documents.ts
@@ -0,0 +1,65 @@
import { mkdir, copyFile, readFile, writeFile } from "fs/promises";
import fsPath from "path";
import stringify from "json-stringify-pretty-compact";

import { processFiles } from "../lib/process-files.js";
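
// Run from the scripts/ai directory like the other scripts here
// (assumed usage):
// $ npx tsx collect-mods-documents.ts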

// This file was generated from a spreadsheet that included the
// offeringId, classHash, and classId columns.
// A similar file can be created using the offering-json-to-csv.ts
// script.
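// Expected format: a header row followed by one tab-separated row per
// offering, for example (hypothetical values):
//
//   offeringId  classHash   classId
//   101         q1w2e3r4t5  2042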
const modsOfferingsFile = "mods-offerings.csv";
const modsOfferings = await readFile(modsOfferingsFile, "utf8");
const offeringRows = modsOfferings.split('\n');
// remove the header row
offeringRows.splice(0, 1);
const offeringObjects = offeringRows
  // skip blank rows, such as one produced by a trailing newline
  .filter(row => row.trim().length > 0)
  .map(offering => {
    const [offeringId, classHash, classId] = offering.split('\t');
    return { offeringId, classHash, classId };
  });

const classHashes = offeringObjects.map(offering => offering.classHash);
const classHashSet = new Set(classHashes);

const datasetFolder = "/Users/scytacki/Development/ai/dataset1720819925834/";
const targetFolder = "/Users/scytacki/Development/ai/dataset1720819925834-mods/";

// Create the target directories
await mkdir(targetFolder);
await mkdir(fsPath.join(targetFolder, "documentInfos"));
await mkdir(fsPath.join(targetFolder, "documents"));

// Copy the documentInfo and document files for every non-empty document
// that belongs to one of the classes in the offerings file
async function processFile(file: string, path: string) {
  const content = await readFile(path, "utf8");
  const parsedContent = JSON.parse(content);

  if (!classHashSet.has(parsedContent.classId)) return;

  if (!parsedContent.documentContent) return;

  const { tileMap } = parsedContent.documentContent;
  if (!tileMap) return;

  // Treat a document as empty if every tile is a placeholder
  let emptyDocument = true;
  const tiles = Object.values<any>(tileMap);
  for (const tile of tiles) {
    if (tile.content.type !== "Placeholder") {
      emptyDocument = false;
      break;
    }
  }
  if (emptyDocument) return;

  await copyFile(path, fsPath.join(targetFolder, "documentInfos", file));

  const documentFile = file.replace("documentInfo-", "document-");
  await writeFile(fsPath.join(targetFolder, "documents", documentFile),
    stringify(parsedContent.documentContent));
}

const stats = await processFiles({
  sourcePath: datasetFolder,
  processFile,
  fileNamePrefix: "documentInfo"
});

console.log(stats);
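The processFiles helper that this script and the two scripts below rely on is not part of this diff. The following is a minimal sketch of the interface its call sites assume; the names below are inferred from those call sites, and the real implementation in scripts/lib/process-files.ts may differ:

// Sketch of the process-files helper's assumed interface, inferred from the
// call sites in this PR; not the actual implementation.
import { readdir } from "fs/promises";
import fsPath from "path";

export interface IProcessFilesStats {
  startTime: number;    // Date.now() when processing began
  duration: number;     // total elapsed milliseconds
  checkedFiles: number;
  processedFiles: number;
}

interface IProcessFilesOptions {
  sourcePath: string;
  processFile: (file: string, path: string) => Promise<void>;
  // Only process files whose names start with this prefix
  fileNamePrefix?: string;
  // Called after each batch of files finishes
  batchComplete?: (stats: IProcessFilesStats) => void;
  // Stop after this many files
  fileLimit?: number;
  // Number of files to process in parallel
  batchSize?: number;
}

export async function processFiles(options: IProcessFilesOptions): Promise<IProcessFilesStats> {
  const { sourcePath, processFile, fileNamePrefix, batchComplete, fileLimit, batchSize = 8 } = options;
  const stats: IProcessFilesStats = { startTime: Date.now(), duration: 0, checkedFiles: 0, processedFiles: 0 };

  let files = await readdir(sourcePath);
  if (fileNamePrefix) files = files.filter(f => f.startsWith(fileNamePrefix));
  if (fileLimit) files = files.slice(0, fileLimit);

  // Process the files in parallel batches
  for (let i = 0; i < files.length; i += batchSize) {
    const batch = files.slice(i, i + batchSize);
    await Promise.all(batch.map(async file => {
      stats.checkedFiles++;
      await processFile(file, fsPath.join(sourcePath, file));
      stats.processedFiles++;
    }));
    stats.duration = Date.now() - stats.startTime;
    batchComplete?.(stats);
  }

  stats.duration = Date.now() - stats.startTime;
  return stats;
}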
88 changes: 21 additions & 67 deletions scripts/ai/count-docs.ts
@@ -12,90 +12,44 @@
 // Set aiService to be whichever service you're interested in. This will determine the format of the output file.
 // $ cd scripts/ai
 // $ npx tsx count-docs.ts
 
-import fs from "fs";
-
+import { readFile } from "fs/promises";
 import { datasetPath } from "./script-constants.js";
 import { prettyDuration } from "../lib/script-utils.js";
+import { processFiles } from "../lib/process-files.js";
 
-const sourceDirectory = "dataset1706677550897";
-
-// The number of files to process in parallel
-const fileBatchSize = 8;
+const sourceDirectory = "dataset1721072285516";
 
 const sourcePath = `${datasetPath}${sourceDirectory}`;
 
 console.log(`*** Starting Document Count ***`);
 
-const startTime = Date.now();
-let checkedFiles = 0;
-let processedFiles = 0;
 const typeCounts: Record<string, number> = {};
 let titles = 0;
 
 // Processes a file, counting the relevant tiles in it if it's a document
-async function processFile(file: string) {
-  const path = `${sourcePath}/${file}`;
-  if (file.startsWith("documentInfo")) {
-    // For files named like documentXXX.txt, read the file
-    const content = fs.readFileSync(path, "utf8");
-    const parsedContent = JSON.parse(content);
-    const { documentContent, ...documentIds } = parsedContent;
-
-    if (!Object.keys(typeCounts).includes(documentIds.documentType)) {
-      typeCounts[documentIds.documentType] = 0;
-    }
-    typeCounts[documentIds.documentType]++;
-
-    if (documentIds.documentTitle) titles++;
-
-    processedFiles++;
-  }
-}
-
-let fileBatch: string[] = [];
-// Process a batch of files
-async function processBatch() {
-  await Promise.all(fileBatch.map(async f => processFile(f)));
-  fileBatch = [];
-
-  const currentDuration = Date.now() - startTime;
-  console.log(`*** Time to count tiles in ${processedFiles} documents`, prettyDuration(currentDuration));
-}
-
-// Process every file in the source directory
-fs.readdir(sourcePath, async (_error, files) => {
-  for (const file of files) {
-    checkedFiles++;
-    fileBatch.push(file);
-
-    // We're finished if we've made it through all of the files
-    const finished = checkedFiles >= files.length;
-    if (fileBatch.length >= fileBatchSize || finished) {
-      await processBatch();
-
-      if (finished) {
-        // Write to an output file when all of the files have been processed
-        // const fileName = `${annotationTypes.join("-")}.json`;
-        // const filePath = `${sourcePath}/${fileName}`;
-        // console.log(`**** Writing annotation info to ${filePath} ****`);
-        // fs.writeFileSync(filePath, stringify(documentInfo, { maxLength: 100 }));
-        // console.log(`**** Annotation info saved to ${filePath} ****`);
-        // const outputFileProps = { documentInfo, fileName, sourceDirectory, azureMetadata };
-        // const outputFunctions = { azure: outputAzureFile, vertexAI: outputVertexAIFile };
-        // outputFunctions[aiService](outputFileProps);
-
-        const endTime = Date.now();
-        const finalDuration = endTime - startTime;
-        console.log(`***** Finished in ${prettyDuration(finalDuration)} *****`);
-        console.log(`*** Found ${titles} documents with titles ***`);
-        console.log(`*** Document types ***`);
-        Object.keys(typeCounts).forEach(type => {
-          console.log(`${type}: ${typeCounts[type]}`);
-        });
-
-        process.exit(0);
-      }
-    }
-  }
-});
+async function processFile(file: string, path: string) {
+  const content = await readFile(path, "utf8");
+  const parsedContent = JSON.parse(content);
+
+  const { documentContent, ...documentIds } = parsedContent;
+
+  if (!Object.keys(typeCounts).includes(documentIds.documentType)) {
+    typeCounts[documentIds.documentType] = 0;
+  }
+  typeCounts[documentIds.documentType]++;
+
+  if (documentIds.documentTitle) titles++;
+}
+
+const stats = await processFiles({
+  sourcePath,
+  processFile,
+  fileNamePrefix: "documentInfo"
+});
+
+console.log(`***** Finished in ${prettyDuration(stats.duration)} *****`);
+console.log(`*** Found ${titles} documents with titles ***`);
+console.log(`*** Document types ***`);
+Object.keys(typeCounts).forEach(type => {
+  console.log(`${type}: ${typeCounts[type]}`);
+});
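Both collect-mods-documents.ts and count-docs.ts read documentInfo-* files. A hypothetical example of the shape they assume, using only the fields the code above references (the values are made up):

{
  "documentType": "problem",
  "documentTitle": "1.2 Exploring Ratios",
  "classId": "q1w2e3r4t5",
  "documentContent": {
    "tileMap": {
      "tile1": { "content": { "type": "Placeholder" } },
      "tile2": { "content": { "type": "Text" } }
    }
  }
}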
108 changes: 26 additions & 82 deletions scripts/ai/document-screenshots.ts
@@ -1,8 +1,9 @@
 import fs from "fs";
 import readline from "readline";
-import puppeteer from "puppeteer";
 
 import { prettyDuration } from "../lib/script-utils.js";
+import { makeCLUEScreenshot } from "../lib/screenshot.js";
+import { IProcessFilesStats, processFiles } from "../lib/process-files.js";
 
 // This script saves images of all the documents in a folder and updates its tags.csv with new folder and file names.
 // It's intended to be used on the output of count-document-tiles.ts.
@@ -19,23 +20,13 @@ import { prettyDuration } from "../lib/script-utils.js";
 const documentDirectory = "dataset1697150265495";
 // const documentDirectory = "dataset1";
 
-// Make falsy to include all documents
-const documentLimit = false;
-
-// Number of files to process in parallel
-const fileBatchSize = 8;
-
-// The width of the browser window. The height is determined dynamically.
-const windowWidth = 1920 / 2;
-
 const publicRoot = "ai";
 const rootPath = `../../src/public/${publicRoot}`;
 const documentPath = `${rootPath}/${documentDirectory}`;
 const publicPath = `${publicRoot}/${documentDirectory}`;
 const tagFileName = "tags.csv";
 
 const startTime = Date.now();
-let checkedFiles = 0;
 let totalSnapshots = 0;
 const targetDir = `screenshotDataset${startTime}`;
 const targetPath = `${rootPath}/${targetDir}`;
@@ -59,53 +50,22 @@ function newFileName(oldFileName: string) {
-// makeSnapshot loads document content at path in a CLUE standalone document editor, takes a snapshot of it,
-// then saves it in the output directory as fileName
 const urlRoot = `http://localhost:8080/editor/?appMode=dev&unit=example&document=`;
-async function makeSnapshot(path: string, fileName: string) {
-  console.log(`* Processing snapshot`, path);
-  const targetFile = `${targetPath}/${fileName}`;
-
-  // View the document in the document editor
-  const browser = await puppeteer.launch({ headless: "new" });
-  const page = await browser.newPage();
-  const url = `${urlRoot}${path}`;
-  try {
-    await page.goto(url, {
-      timeout: 60000, // 60 seconds
-      waitUntil: 'networkidle0'
-    });
-  } catch (error) {
-    console.log(`!!!!! Failed to load file ${url}`, error);
-    failedFiles.push(path);
-    await page.close();
-    await browser.close();
-    return;
-  }
-
-  // Approximate the height of the document by adding up the heights of the rows and make the viewport that tall
-  let pageHeight = 30;
-  const rowElements = await page.$$(".tile-row");
-  for (const rowElement of rowElements) {
-    const boundingBox = await rowElement.boundingBox();
-    pageHeight += boundingBox?.height ?? 0;
-  }
-  await page.setViewport({ width: windowWidth, height: Math.round(pageHeight) });
-
-  // Take a screenshot and save it to a file
-  const buffer = await page.screenshot({ fullPage: true, type: 'png' });
-  await page.close();
-  await browser.close();
-  fs.writeFileSync(targetFile, buffer);
-
-  totalSnapshots++;
-}
-
 // Processes a file, usually making a screenshot but updating tags.csv when that file is encountered
-async function processFile(file: string) {
-  const path = `${documentPath}/${file}`;
+async function processFile(file: string, path: string) {
   if (file.startsWith("document")) {
     // For files named like documentXXX.txt, make a snapshot and save it
     const docEditorPath = `${publicPath}/${file}`;
     const screenshotFileName = newFileName(file);
-    await makeSnapshot(docEditorPath, screenshotFileName);
+    try {
+      await makeCLUEScreenshot({
+        url: `${urlRoot}${docEditorPath}`,
+        outputFile: `${targetPath}/${screenshotFileName}`
+      });
+      totalSnapshots++;
+    } catch (error) {
+      failedFiles.push(docEditorPath);
+    }
   } else if (file === tagFileName) {
     // For the tags.csv file, duplicate the file, modifying the directory and file names
     // Based on the top answer at https://stackoverflow.com/questions/6156501/read-a-file-one-line-at-a-time-in-node-js
@@ -122,38 +82,22 @@ async function processFile(file: string) {
   }
 }
 
-let fileBatch: string[] = [];
-// Process a batch of files
-async function processBatch() {
-  await Promise.all(fileBatch.map(async f => processFile(f)));
-  fileBatch = [];
-
-  const currentDuration = Date.now() - startTime;
+function batchComplete(stats: IProcessFilesStats) {
+  const currentDuration = Date.now() - stats.startTime;
   console.log(`*** Time to process ${totalSnapshots} snapshots`, prettyDuration(currentDuration));
 }
 
-// Process every file in the source directory
-fs.readdir(documentPath, async (_error, files) => {
-  for (const file of files) {
-    if (documentLimit && checkedFiles >= documentLimit) break;
-
-    checkedFiles++;
-    fileBatch.push(file);
-
-    // We're finished if we've made it through all of the files or we've hit our limit
-    const finished = checkedFiles >= files.length || (documentLimit && checkedFiles >= documentLimit);
-    if (fileBatch.length >= fileBatchSize || finished) {
-      await processBatch();
-
-      if (finished) {
-        const endTime = Date.now();
-        const finalDuration = endTime - startTime;
-        console.log(`***** Finished in ${prettyDuration(finalDuration)}`);
-        if (failedFiles.length > 0) {
-          console.log(`Failed to get snapshots for the following files:`);
-          console.log(failedFiles);
-        }
-      }
-    }
-  }
-});
+const resultStats = await processFiles({
+  sourcePath: documentPath,
+  processFile,
+  batchComplete,
+  // Uncomment to limit the number of files processed
+  // fileLimit: 100
+});
+
+console.log(`***** Finished in ${prettyDuration(resultStats.duration)}`);
+if (failedFiles.length > 0) {
+  console.log(`Failed to get snapshots for the following files:`);
+  console.log(failedFiles);
+}
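makeCLUEScreenshot replaces the inline puppeteer code deleted above. Here is a sketch of what the helper presumably does, reconstructed from the deleted lines and the call site; the actual implementation in scripts/lib/screenshot.ts may differ:

// Sketch of the screenshot helper, assuming it keeps the deleted puppeteer
// logic; the option names are inferred from the call site above.
import fs from "fs";
import puppeteer from "puppeteer";

interface IScreenshotOptions {
  url: string;
  outputFile: string;
  // The width of the browser window; the height is computed from the rows
  width?: number;
}

export async function makeCLUEScreenshot({ url, outputFile, width = 960 }: IScreenshotOptions) {
  const browser = await puppeteer.launch({ headless: "new" });
  const page = await browser.newPage();
  try {
    // Load the document in the standalone document editor
    await page.goto(url, { timeout: 60000, waitUntil: "networkidle0" });

    // Approximate the document height by summing the heights of the rows
    let pageHeight = 30;
    for (const rowElement of await page.$$(".tile-row")) {
      const boundingBox = await rowElement.boundingBox();
      pageHeight += boundingBox?.height ?? 0;
    }
    await page.setViewport({ width, height: Math.round(pageHeight) });

    // Save the screenshot; the caller catches any error thrown above
    const buffer = await page.screenshot({ fullPage: true, type: "png" });
    fs.writeFileSync(outputFile, buffer);
  } finally {
    await page.close();
    await browser.close();
  }
}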
48 changes: 48 additions & 0 deletions scripts/ai/offering-json-to-csv.ts
@@ -0,0 +1,48 @@
import fs from "fs";

const offeringInfoFile = `/Users/scytacki/Development/ai/dataset1720819925834/offering-info.json`;
const offeringInfo = JSON.parse(fs.readFileSync(offeringInfoFile, "utf8"));

// eslint-disable-next-line prefer-regex-literals
const clueBranchRegExp = new RegExp("^https://[^/]*(/[^?]*)");
function getClueBranch(activityUrl: string) {
  return clueBranchRegExp.exec(activityUrl)?.[1];
}

// eslint-disable-next-line prefer-regex-literals
const unitParamRegExp = new RegExp("unit=([^&]*)");
function getUnitParam(activityUrl: string) {
  return unitParamRegExp.exec(activityUrl)?.[1];
}

// eslint-disable-next-line prefer-regex-literals
const unitBranchRegExp = new RegExp("/branch/[^/]*");
function getUnitBranch(unitParam: string | undefined) {
  if (unitParam?.startsWith("https://")) {
    return unitBranchRegExp.exec(unitParam)?.[0];
  } else {
    return "";
  }
}

// eslint-disable-next-line prefer-regex-literals
const unitCodeRegExp = new RegExp("/([^/]*)/content.json");
function getUnitCode(unitParam: string | undefined) {
  if (unitParam?.startsWith("https://")) {
    return unitCodeRegExp.exec(unitParam)?.[1];
  } else {
    return unitParam;
  }
}

console.log("offering_id, activity_url, clazz_id, clazz_hash, clue_branch, unit_param, unit_branch, unit_code");
Object.entries(offeringInfo).forEach(([offering_id, offering]) => {
  const { activity_url, clazz_id, clazz_hash } = offering as any;
  const clueBranch = getClueBranch(activity_url);
  const unitParam = getUnitParam(activity_url);
  const unitBranch = getUnitBranch(unitParam);
  const unitCode = getUnitCode(unitParam);
  console.log(
    `${offering_id}, ${activity_url}, ${clazz_id}, ${clazz_hash}, ` +
    `${clueBranch}, ${unitParam}, ${unitBranch}, ${unitCode}`);
});
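To make the four regular expressions concrete, here is a trace with a hypothetical activity_url; the real values come from offering-info.json:

// Hypothetical activity_url, made up for illustration only.
const exampleUrl = "https://collaborative-learning.concord.org/branch/master/" +
  "?unit=https://models-resources.concord.org/clue-curriculum/branch/main/msa/content.json";

getClueBranch(exampleUrl);   // => "/branch/master/"
const exampleUnit = getUnitParam(exampleUrl);
// exampleUnit => "https://models-resources.concord.org/clue-curriculum/branch/main/msa/content.json"
getUnitBranch(exampleUnit);  // => "/branch/main"
getUnitCode(exampleUnit);    // => "msa"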