updated document scripts #2382

Draft · wants to merge 1 commit into master
65 changes: 65 additions & 0 deletions scripts/ai/collect-mods-documents.ts
@@ -0,0 +1,65 @@
import { mkdir, copyFile, readFile, writeFile } from "fs/promises";
import fsPath from "path";
import stringify from "json-stringify-pretty-compact";

import { processFiles } from "../lib/process-files.js";
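
// Run from the scripts/ai directory like the other scripts here
// (assumed usage):
// $ npx tsx collect-mods-documents.ts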

// This file was generated from a spreadsheet that included the
// offeringId, classHash, and classId columns.
// A similar file can be created using the offering-json-to-csv.ts
// script.
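// Expected format: a header row followed by one tab-separated row per
// offering, for example (hypothetical values):
//
//   offeringId  classHash   classId
//   101         q1w2e3r4t5  2042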
const modsOfferingsFile = "mods-offerings.csv";
const modsOfferings = await readFile(modsOfferingsFile, "utf8");
const offeringRows = modsOfferings.split('\n');
// remove the header row
offeringRows.splice(0, 1);
const offeringObjects = offeringRows
  // skip blank rows, such as one produced by a trailing newline
  .filter(row => row.trim().length > 0)
  .map(offering => {
    const [offeringId, classHash, classId] = offering.split('\t');
    return { offeringId, classHash, classId };
  });

const classHashes = offeringObjects.map(offering => offering.classHash);
const classHashSet = new Set(classHashes);

const datasetFolder = "/Users/scytacki/Development/ai/dataset1720819925834/";
const targetFolder = "/Users/scytacki/Development/ai/dataset1720819925834-mods/";

// Create the target directories
await mkdir(targetFolder);
await mkdir(fsPath.join(targetFolder, "documentInfos"));
await mkdir(fsPath.join(targetFolder, "documents"));

// Copy the documentInfo and document files for every non-empty document
// that belongs to one of the classes in the offerings file
async function processFile(file: string, path: string) {
  const content = await readFile(path, "utf8");
  const parsedContent = JSON.parse(content);

  if (!classHashSet.has(parsedContent.classId)) return;

  if (!parsedContent.documentContent) return;

  const { tileMap } = parsedContent.documentContent;
  if (!tileMap) return;

  // Treat a document as empty if every tile is a placeholder
  let emptyDocument = true;
  const tiles = Object.values<any>(tileMap);
  for (const tile of tiles) {
    if (tile.content.type !== "Placeholder") {
      emptyDocument = false;
      break;
    }
  }
  if (emptyDocument) return;

  await copyFile(path, fsPath.join(targetFolder, "documentInfos", file));

  const documentFile = file.replace("documentInfo-", "document-");
  await writeFile(fsPath.join(targetFolder, "documents", documentFile),
    stringify(parsedContent.documentContent));
}

const stats = await processFiles({
  sourcePath: datasetFolder,
  processFile,
  fileNamePrefix: "documentInfo"
});

console.log(stats);
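The processFiles helper that this script and the two scripts below rely on is not part of this diff. The following is a minimal sketch of the interface its call sites assume; the names below are inferred from those call sites, and the real implementation in scripts/lib/process-files.ts may differ:

// Sketch of the process-files helper's assumed interface, inferred from the
// call sites in this PR; not the actual implementation.
import { readdir } from "fs/promises";
import fsPath from "path";

export interface IProcessFilesStats {
  startTime: number;    // Date.now() when processing began
  duration: number;     // total elapsed milliseconds
  checkedFiles: number;
  processedFiles: number;
}

interface IProcessFilesOptions {
  sourcePath: string;
  processFile: (file: string, path: string) => Promise<void>;
  // Only process files whose names start with this prefix
  fileNamePrefix?: string;
  // Called after each batch of files finishes
  batchComplete?: (stats: IProcessFilesStats) => void;
  // Stop after this many files
  fileLimit?: number;
  // Number of files to process in parallel
  batchSize?: number;
}

export async function processFiles(options: IProcessFilesOptions): Promise<IProcessFilesStats> {
  const { sourcePath, processFile, fileNamePrefix, batchComplete, fileLimit, batchSize = 8 } = options;
  const stats: IProcessFilesStats = { startTime: Date.now(), duration: 0, checkedFiles: 0, processedFiles: 0 };

  let files = await readdir(sourcePath);
  if (fileNamePrefix) files = files.filter(f => f.startsWith(fileNamePrefix));
  if (fileLimit) files = files.slice(0, fileLimit);

  // Process the files in parallel batches
  for (let i = 0; i < files.length; i += batchSize) {
    const batch = files.slice(i, i + batchSize);
    await Promise.all(batch.map(async file => {
      stats.checkedFiles++;
      await processFile(file, fsPath.join(sourcePath, file));
      stats.processedFiles++;
    }));
    stats.duration = Date.now() - stats.startTime;
    batchComplete?.(stats);
  }

  stats.duration = Date.now() - stats.startTime;
  return stats;
}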
88 changes: 21 additions & 67 deletions scripts/ai/count-docs.ts
@@ -12,90 +12,44 @@
 // Set aiService to be whichever service you're interested in. This will determine the format of the output file.
 // $ cd scripts/ai
 // $ npx tsx count-docs.ts
 
-import fs from "fs";
-
+import { readFile } from "fs/promises";
 import { datasetPath } from "./script-constants.js";
 import { prettyDuration } from "../lib/script-utils.js";
+import { processFiles } from "../lib/process-files.js";
 
-const sourceDirectory = "dataset1706677550897";
-
-// The number of files to process in parallel
-const fileBatchSize = 8;
+const sourceDirectory = "dataset1721072285516";
 
 const sourcePath = `${datasetPath}${sourceDirectory}`;
 
 console.log(`*** Starting Document Count ***`);
 
-const startTime = Date.now();
-let checkedFiles = 0;
-let processedFiles = 0;
 const typeCounts: Record<string, number> = {};
 let titles = 0;
 
 // Processes a file, counting the relevant tiles in it if it's a document
-async function processFile(file: string) {
-  const path = `${sourcePath}/${file}`;
-  if (file.startsWith("documentInfo")) {
-    // For files named like documentXXX.txt, read the file
-    const content = fs.readFileSync(path, "utf8");
-    const parsedContent = JSON.parse(content);
-    const { documentContent, ...documentIds } = parsedContent;
-
-    if (!Object.keys(typeCounts).includes(documentIds.documentType)) {
-      typeCounts[documentIds.documentType] = 0;
-    }
-    typeCounts[documentIds.documentType]++;
-
-    if (documentIds.documentTitle) titles++;
-
-    processedFiles++;
-  }
-}
-
-let fileBatch: string[] = [];
-// Process a batch of files
-async function processBatch() {
-  await Promise.all(fileBatch.map(async f => processFile(f)));
-  fileBatch = [];
-
-  const currentDuration = Date.now() - startTime;
-  console.log(`*** Time to count tiles in ${processedFiles} documents`, prettyDuration(currentDuration));
-}
-
-// Process every file in the source directory
-fs.readdir(sourcePath, async (_error, files) => {
-  for (const file of files) {
-    checkedFiles++;
-    fileBatch.push(file);
-
-    // We're finished if we've made it through all of the files
-    const finished = checkedFiles >= files.length;
-    if (fileBatch.length >= fileBatchSize || finished) {
-      await processBatch();
-
-      if (finished) {
-        // Write to an output file when all of the files have been processed
-        // const fileName = `${annotationTypes.join("-")}.json`;
-        // const filePath = `${sourcePath}/${fileName}`;
-        // console.log(`**** Writing annotation info to ${filePath} ****`);
-        // fs.writeFileSync(filePath, stringify(documentInfo, { maxLength: 100 }));
-        // console.log(`**** Annotation info saved to ${filePath} ****`);
-        // const outputFileProps = { documentInfo, fileName, sourceDirectory, azureMetadata };
-        // const outputFunctions = { azure: outputAzureFile, vertexAI: outputVertexAIFile };
-        // outputFunctions[aiService](outputFileProps);
-
-        const endTime = Date.now();
-        const finalDuration = endTime - startTime;
-        console.log(`***** Finished in ${prettyDuration(finalDuration)} *****`);
-        console.log(`*** Found ${titles} documents with titles ***`);
-        console.log(`*** Document types ***`);
-        Object.keys(typeCounts).forEach(type => {
-          console.log(`${type}: ${typeCounts[type]}`);
-        });
-
-        process.exit(0);
-      }
-    }
-  }
-});
+async function processFile(file: string, path: string) {
+  const content = await readFile(path, "utf8");
+  const parsedContent = JSON.parse(content);
+
+  const { documentContent, ...documentIds } = parsedContent;
+
+  if (!Object.keys(typeCounts).includes(documentIds.documentType)) {
+    typeCounts[documentIds.documentType] = 0;
+  }
+  typeCounts[documentIds.documentType]++;
+
+  if (documentIds.documentTitle) titles++;
+}
+
+const stats = await processFiles({
+  sourcePath,
+  processFile,
+  fileNamePrefix: "documentInfo"
+});
+
+console.log(`***** Finished in ${prettyDuration(stats.duration)} *****`);
+console.log(`*** Found ${titles} documents with titles ***`);
+console.log(`*** Document types ***`);
+Object.keys(typeCounts).forEach(type => {
+  console.log(`${type}: ${typeCounts[type]}`);
+});
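Both collect-mods-documents.ts and count-docs.ts read documentInfo-* files. A hypothetical example of the shape they assume, using only the fields the code above references (the values are made up):

{
  "documentType": "problem",
  "documentTitle": "1.2 Exploring Ratios",
  "classId": "q1w2e3r4t5",
  "documentContent": {
    "tileMap": {
      "tile1": { "content": { "type": "Placeholder" } },
      "tile2": { "content": { "type": "Text" } }
    }
  }
}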
108 changes: 26 additions & 82 deletions scripts/ai/document-screenshots.ts
@@ -1,8 +1,9 @@
 import fs from "fs";
 import readline from "readline";
-import puppeteer from "puppeteer";
 
 import { prettyDuration } from "../lib/script-utils.js";
+import { makeCLUEScreenshot } from "../lib/screenshot.js";
+import { IProcessFilesStats, processFiles } from "../lib/process-files.js";
 
 // This script saves images of all the documents in a folder and updates its tags.csv with new folder and file names.
 // It's intended to be used on the output of count-document-tiles.ts.
@@ -19,23 +20,13 @@ import { prettyDuration } from "../lib/script-utils.js";
 const documentDirectory = "dataset1697150265495";
 // const documentDirectory = "dataset1";
 
-// Make falsy to include all documents
-const documentLimit = false;
-
-// Number of files to process in parallel
-const fileBatchSize = 8;
-
-// The width of the browser window. The height is determined dynamically.
-const windowWidth = 1920 / 2;
-
 const publicRoot = "ai";
 const rootPath = `../../src/public/${publicRoot}`;
 const documentPath = `${rootPath}/${documentDirectory}`;
 const publicPath = `${publicRoot}/${documentDirectory}`;
 const tagFileName = "tags.csv";
 
 const startTime = Date.now();
-let checkedFiles = 0;
 let totalSnapshots = 0;
 const targetDir = `screenshotDataset${startTime}`;
 const targetPath = `${rootPath}/${targetDir}`;
@@ -59,53 +50,22 @@ function newFileName(oldFileName: string) {
-// makeSnapshot loads document content at path in a CLUE standalone document editor, takes a snapshot of it,
-// then saves it in the output directory as fileName
 const urlRoot = `http://localhost:8080/editor/?appMode=dev&unit=example&document=`;
-async function makeSnapshot(path: string, fileName: string) {
-  console.log(`* Processing snapshot`, path);
-  const targetFile = `${targetPath}/${fileName}`;
-
-  // View the document in the document editor
-  const browser = await puppeteer.launch({ headless: "new" });
-  const page = await browser.newPage();
-  const url = `${urlRoot}${path}`;
-  try {
-    await page.goto(url, {
-      timeout: 60000, // 60 seconds
-      waitUntil: 'networkidle0'
-    });
-  } catch (error) {
-    console.log(`!!!!! Failed to load file ${url}`, error);
-    failedFiles.push(path);
-    await page.close();
-    await browser.close();
-    return;
-  }
-
-  // Approximate the height of the document by adding up the heights of the rows and make the viewport that tall
-  let pageHeight = 30;
-  const rowElements = await page.$$(".tile-row");
-  for (const rowElement of rowElements) {
-    const boundingBox = await rowElement.boundingBox();
-    pageHeight += boundingBox?.height ?? 0;
-  }
-  await page.setViewport({ width: windowWidth, height: Math.round(pageHeight) });
-
-  // Take a screenshot and save it to a file
-  const buffer = await page.screenshot({ fullPage: true, type: 'png' });
-  await page.close();
-  await browser.close();
-  fs.writeFileSync(targetFile, buffer);
-
-  totalSnapshots++;
-}
-
 // Processes a file, usually making a screenshot but updating tags.csv when that file is encountered
-async function processFile(file: string) {
-  const path = `${documentPath}/${file}`;
+async function processFile(file: string, path: string) {
   if (file.startsWith("document")) {
     // For files named like documentXXX.txt, make a snapshot and save it
     const docEditorPath = `${publicPath}/${file}`;
     const screenshotFileName = newFileName(file);
-    await makeSnapshot(docEditorPath, screenshotFileName);
+    try {
+      await makeCLUEScreenshot({
+        url: `${urlRoot}${docEditorPath}`,
+        outputFile: `${targetPath}/${screenshotFileName}`
+      });
+      totalSnapshots++;
+    } catch (error) {
+      failedFiles.push(docEditorPath);
+    }
   } else if (file === tagFileName) {
     // For the tags.csv file, duplicate the file, modifying the directory and file names
     // Based on the top answer at https://stackoverflow.com/questions/6156501/read-a-file-one-line-at-a-time-in-node-js
@@ -122,38 +82,22 @@ async function processFile(file: string) {
   }
 }
 
-let fileBatch: string[] = [];
-// Process a batch of files
-async function processBatch() {
-  await Promise.all(fileBatch.map(async f => processFile(f)));
-  fileBatch = [];
-
-  const currentDuration = Date.now() - startTime;
+function batchComplete(stats: IProcessFilesStats) {
+  const currentDuration = Date.now() - stats.startTime;
   console.log(`*** Time to process ${totalSnapshots} snapshots`, prettyDuration(currentDuration));
 }
 
-// Process every file in the source directory
-fs.readdir(documentPath, async (_error, files) => {
-  for (const file of files) {
-    if (documentLimit && checkedFiles >= documentLimit) break;
-
-    checkedFiles++;
-    fileBatch.push(file);
-
-    // We're finished if we've made it through all of the files or we've hit our limit
-    const finished = checkedFiles >= files.length || (documentLimit && checkedFiles >= documentLimit);
-    if (fileBatch.length >= fileBatchSize || finished) {
-      await processBatch();
-
-      if (finished) {
-        const endTime = Date.now();
-        const finalDuration = endTime - startTime;
-        console.log(`***** Finished in ${prettyDuration(finalDuration)}`);
-        if (failedFiles.length > 0) {
-          console.log(`Failed to get snapshots for the following files:`);
-          console.log(failedFiles);
-        }
-      }
-    }
-  }
-});
+const resultStats = await processFiles({
+  sourcePath: documentPath,
+  processFile,
+  batchComplete,
+  // Uncomment to limit the number of files processed
+  // fileLimit: 100
+});
+
+console.log(`***** Finished in ${prettyDuration(resultStats.duration)}`);
+if (failedFiles.length > 0) {
+  console.log(`Failed to get snapshots for the following files:`);
+  console.log(failedFiles);
+}
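makeCLUEScreenshot replaces the inline puppeteer code deleted above. Here is a sketch of what the helper presumably does, reconstructed from the deleted lines and the call site; the actual implementation in scripts/lib/screenshot.ts may differ:

// Sketch of the screenshot helper, assuming it keeps the deleted puppeteer
// logic; the option names are inferred from the call site above.
import fs from "fs";
import puppeteer from "puppeteer";

interface IScreenshotOptions {
  url: string;
  outputFile: string;
  // The width of the browser window; the height is computed from the rows
  width?: number;
}

export async function makeCLUEScreenshot({ url, outputFile, width = 960 }: IScreenshotOptions) {
  const browser = await puppeteer.launch({ headless: "new" });
  const page = await browser.newPage();
  try {
    // Load the document in the standalone document editor
    await page.goto(url, { timeout: 60000, waitUntil: "networkidle0" });

    // Approximate the document height by summing the heights of the rows
    let pageHeight = 30;
    for (const rowElement of await page.$$(".tile-row")) {
      const boundingBox = await rowElement.boundingBox();
      pageHeight += boundingBox?.height ?? 0;
    }
    await page.setViewport({ width, height: Math.round(pageHeight) });

    // Save the screenshot; the caller catches any error thrown above
    const buffer = await page.screenshot({ fullPage: true, type: "png" });
    fs.writeFileSync(outputFile, buffer);
  } finally {
    await page.close();
    await browser.close();
  }
}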
48 changes: 48 additions & 0 deletions scripts/ai/offering-json-to-csv.ts
@@ -0,0 +1,48 @@
import fs from "fs";

const offeringInfoFile = `/Users/scytacki/Development/ai/dataset1720819925834/offering-info.json`;
const offeringInfo = JSON.parse(fs.readFileSync(offeringInfoFile, "utf8"));

// eslint-disable-next-line prefer-regex-literals
const clueBranchRegExp = new RegExp("^https://[^/]*(/[^?]*)");
function getClueBranch(activityUrl: string) {
  return clueBranchRegExp.exec(activityUrl)?.[1];
}

// eslint-disable-next-line prefer-regex-literals
const unitParamRegExp = new RegExp("unit=([^&]*)");
function getUnitParam(activityUrl: string) {
  return unitParamRegExp.exec(activityUrl)?.[1];
}

// eslint-disable-next-line prefer-regex-literals
const unitBranchRegExp = new RegExp("/branch/[^/]*");
function getUnitBranch(unitParam: string | undefined) {
  if (unitParam?.startsWith("https://")) {
    return unitBranchRegExp.exec(unitParam)?.[0];
  } else {
    return "";
  }
}

// eslint-disable-next-line prefer-regex-literals
const unitCodeRegExp = new RegExp("/([^/]*)/content.json");
function getUnitCode(unitParam: string | undefined) {
  if (unitParam?.startsWith("https://")) {
    return unitCodeRegExp.exec(unitParam)?.[1];
  } else {
    return unitParam;
  }
}

console.log("offering_id, activity_url, clazz_id, clazz_hash, clue_branch, unit_param, unit_branch, unit_code");
Object.entries(offeringInfo).forEach(([offering_id, offering]) => {
  const { activity_url, clazz_id, clazz_hash } = offering as any;
  const clueBranch = getClueBranch(activity_url);
  const unitParam = getUnitParam(activity_url);
  const unitBranch = getUnitBranch(unitParam);
  const unitCode = getUnitCode(unitParam);
  console.log(
    `${offering_id}, ${activity_url}, ${clazz_id}, ${clazz_hash}, ` +
    `${clueBranch}, ${unitParam}, ${unitBranch}, ${unitCode}`);
});
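To make the four regular expressions concrete, here is a trace with a hypothetical activity_url; the real values come from offering-info.json:

// Hypothetical activity_url, made up for illustration only.
const exampleUrl = "https://collaborative-learning.concord.org/branch/master/" +
  "?unit=https://models-resources.concord.org/clue-curriculum/branch/main/msa/content.json";

getClueBranch(exampleUrl);   // => "/branch/master/"
const exampleUnit = getUnitParam(exampleUrl);
// exampleUnit => "https://models-resources.concord.org/clue-curriculum/branch/main/msa/content.json"
getUnitBranch(exampleUnit);  // => "/branch/main"
getUnitCode(exampleUnit);    // => "msa"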