From e551ec61c352211cfe5d46926e4ba27154937344 Mon Sep 17 00:00:00 2001
From: Piotr Karwatka
Date: Mon, 30 Dec 2024 13:55:16 +0100
Subject: [PATCH 1/2] feat: pandoc implementation

---
 packages/tools/src/pandoc.ts | 237 +++++++++++++++++++++++++++++++++++
 1 file changed, 237 insertions(+)
 create mode 100644 packages/tools/src/pandoc.ts

diff --git a/packages/tools/src/pandoc.ts b/packages/tools/src/pandoc.ts
new file mode 100644
index 0000000..dc712e5
--- /dev/null
+++ b/packages/tools/src/pandoc.ts
@@ -0,0 +1,237 @@
+import { execFile, execFileSync } from 'child_process'
+import { tool } from 'fabrice-ai/tool'
+import fs from 'fs'
+import path from 'path'
+import { promisify } from 'util'
+import { z } from 'zod'
+
+const execFileAsync = promisify(execFile)
+
+/**
+ * Configuration options for creating the Pandoc Tools.
+ */
+interface PandocToolOptions {
+  /**
+   * Path to the pandoc binary (default: 'pandoc' in PATH)
+   */
+  pandocPath?: string
+
+  /**
+   * Directory in which to operate for file-based conversions.
+   * (e.g., '/tmp', ensure it exists and is writable)
+   */
+  workingDir?: string
+}
+
+/**
+ * Default values for the Pandoc Tool options.
+ */
+const defaults: Required<PandocToolOptions> = {
+  pandocPath: 'pandoc',
+  workingDir: process.cwd(),
+}
+
+/**
+ * Utility to check if Pandoc is actually installed & working.
+ * Throws an error if not found.
+ */
+function ensurePandocExists(pandocPath: string) {
+  try {
+    // Just try running `pandoc --version` synchronously.
+    // If it fails, it will throw.
+    execFileSync(pandocPath, ['--version'], { stdio: 'ignore' })
+  } catch (error) {
+    throw new Error(`Pandoc not found or not executable at path: ${pandocPath}`)
+  }
+}
+
+/**
+ * Safely resolve a filename within workingDir, ensuring it doesn't escape.
+ * Throws if the resolved path lies outside workingDir.
+ */
+function resolveInWorkingDir(workingDir: string, fileName: string): string {
+  const root = path.resolve(workingDir)
+  const resolved = path.resolve(root, fileName)
+  const relative = path.relative(root, resolved)
+  // Compare path segments rather than string prefixes: a plain `startsWith` check
+  // would accept sibling directories such as "/tmp/work-evil" for "/tmp/work".
+  if (relative === '..' || relative.startsWith(`..${path.sep}`) || path.isAbsolute(relative)) {
+    throw new Error(`File path "${fileName}" is outside the working directory: ${workingDir}`)
+  }
+  return resolved
+}
+
+/**
+ * Shared function to call Pandoc. We allow passing either
+ * (A) input via file path (as part of `args`) or
+ * (B) input via a Buffer/string (`inputData`, written to stdin).
+ *
+ * Output goes wherever `args` direct it (e.g. `-o <file>`); whatever pandoc
+ * prints to stdout is captured and returned.
+ */
+async function runPandoc({
+  pandocPath,
+  args,
+  workingDir,
+  inputData,
+}: {
+  pandocPath: string
+  args: string[]
+  workingDir: string
+  inputData?: string | Buffer
+}): Promise<{ stdout: string | Buffer }> {
+  const pending = execFileAsync(pandocPath, args, {
+    cwd: workingDir,
+    maxBuffer: 50 * 1024 * 1024, // 50 MB
+    // Read stdout as a Buffer for content-based calls so binary output is not corrupted.
+    encoding: inputData === undefined ? 'utf8' : 'buffer',
+  })
+
+  // Unlike `execFileSync`, the async `execFile` has no `input` option, so the data has to be
+  // written to the child's stdin explicitly. stdin is always closed so pandoc never waits on it.
+  const { stdin } = pending.child
+  if (stdin) {
+    // Swallow EPIPE and friends: a failing pandoc run is reported through the awaited promise.
+    stdin.on('error', () => {})
+    stdin.end(inputData)
+  }
+
+  const result = await pending
+  return { stdout: result.stdout }
+}
+
+/**
+ * Factory function that returns two tools:
+ * 1) convertFileWithPandoc (file-based I/O)
+ * 2) convertContentWithPandoc (content-based I/O)
+ */
+export function createPandocTool(options?: PandocToolOptions) {
+  const config = {
+    ...defaults,
+    ...options,
+  }
+
+  // Ensure pandoc is installed at initialization:
+  ensurePandocExists(config.pandocPath)
+
+  return {
+    /**
+     * Tool #1: convertFileWithPandoc
+     *
+     * Converts a file from one format to another using Pandoc.
+     * - fromFormat: "markdown", "docx", "html", etc.
+     * - toFormat: "pdf", "docx", "html", etc.
+     * - inputFileName: relative to workingDir
+     * - outputFileName: relative to workingDir
+     */
+    convertFileWithPandoc: tool({
+      description:
+        'Converts a file from one format to another (via Pandoc). Requires inputFileName & outputFileName in workingDir. No direct content is handled.',
+      parameters: z.object({
+        fromFormat: z.string().describe('E.g. "markdown", "html", "docx"'),
+        toFormat: z.string().describe('E.g. "pdf", "docx", "html"'),
+        inputFileName: z.string().describe('File in workingDir to read from'),
+        outputFileName: z.string().describe('File in workingDir to write to'),
+      }),
+      execute: async ({ fromFormat, toFormat, inputFileName, outputFileName }) => {
+        try {
+          // 1. Resolve the paths
+          const inputPath = resolveInWorkingDir(config.workingDir, inputFileName)
+          const outputPath = resolveInWorkingDir(config.workingDir, outputFileName)
+
+          // 2. Check input file
+          if (!fs.existsSync(inputPath)) {
+            throw new Error(`Input file does not exist: ${inputPath}`)
+          }
+
+          // 3. Build Pandoc arguments
+          const args = [inputPath, '-f', fromFormat, '-t', toFormat, '-o', outputPath]
+
+          // 4. Call the shared runPandoc
+          await runPandoc({
+            pandocPath: config.pandocPath,
+            args,
+            workingDir: config.workingDir,
+          })
+
+          // 5. Check if output file was created
+          if (!fs.existsSync(outputPath)) {
+            throw new Error(`Output file not created: ${outputPath}`)
+          }
+
+          // 6. Return success
+          return JSON.stringify({
+            success: true,
+            fromFormat,
+            toFormat,
+            inputPath,
+            outputPath,
+          })
+        } catch (error) {
+          throw new Error(`Pandoc file-based conversion failed: ${error}`)
+        }
+      },
+    }),
+
+    /**
+     * Tool #2: convertContentWithPandoc
+     *
+     * Operates on raw string content. (e.g., from markdown to docx)
+     * By default, if the output is expected to be binary (docx, pdf, etc.),
+     * we return base64. Otherwise, return plain text.
+     */
+    convertContentWithPandoc: tool({
+      description:
+        'Converts raw string content from one format to another (via Pandoc). Returns output as text or base64-encoded.',
+      parameters: z.object({
+        fromFormat: z.string().describe('E.g. "markdown", "html"'),
+        toFormat: z.string().describe('E.g. "docx", "pdf", "html"'),
+        content: z.string().describe('Raw content to convert.'),
+        returnAsBase64: z
+          .boolean()
+          .default(true)
+          .describe(
+            'If true (default), returns base64 (useful for docx/pdf). Otherwise, returns plain text.'
+          ),
+      }),
+      execute: async ({ fromFormat, toFormat, content, returnAsBase64 }) => {
+        try {
+          // We do not write to an output file; the result is captured from stdout.
+
+          // 1. Build Pandoc arguments.
+          // "-o -" forces stdout even for binary formats (docx, odt, epub), which pandoc otherwise refuses.
+          const args = ['-f', fromFormat, '-t', toFormat, '-o', '-']
+
+          // 2. Call the shared runPandoc with `inputData`
+          const { stdout } = await runPandoc({
+            pandocPath: config.pandocPath,
+            args,
+            workingDir: config.workingDir,
+            inputData: content, // pass content via stdin
+          })
+
+          // 3. Decide how to return it
+          // `stdout` might be a Buffer if we set `encoding: 'buffer'`
+          const outputBuffer = Buffer.isBuffer(stdout) ? stdout : Buffer.from(stdout, 'utf8')
+
+          let output: string
+          if (returnAsBase64) {
+            output = outputBuffer.toString('base64')
+          } else {
+            output = outputBuffer.toString('utf8')
+          }
+
+          return JSON.stringify({
+            success: true,
+            fromFormat,
+            toFormat,
+            returnAsBase64,
+            output,
+          })
+        } catch (error) {
+          throw new Error(`Pandoc content-based conversion failed: ${error}`)
+        }
+      },
+    }),
+  }
+}

From 7ff7e9494fbee4708c3edd328f9da308bb236c5d Mon Sep 17 00:00:00 2001
From: Piotr Karwatka
Date: Mon, 30 Dec 2024 14:38:42 +0100
Subject: [PATCH 2/2] feat: unit tests

---
 example/src/github_docx_report.config.ts | 78 ++++++++++++++++++++++++
 example/src/github_docx_report.test.ts   | 53 ++++++++++++++++
 example/src/github_docx_report.ts        | 10 +++
 packages/tools/src/pandoc.ts             |  7 +-
 4 files changed, 145 insertions(+), 3 deletions(-)
 create mode 100644 example/src/github_docx_report.config.ts
 create mode 100644 example/src/github_docx_report.test.ts
 create mode 100644 example/src/github_docx_report.ts

diff --git a/example/src/github_docx_report.config.ts b/example/src/github_docx_report.config.ts
new file mode 100644
index 0000000..47a7bce
--- /dev/null
+++ b/example/src/github_docx_report.config.ts
@@ -0,0 +1,78 @@
+import 'dotenv/config'
+
+import { createFileSystemTools } from '@fabrice-ai/tools/filesystem'
+import { httpTool } from '@fabrice-ai/tools/http'
+import { createPandocTool } from '@fabrice-ai/tools/pandoc'
+import { agent } from 'fabrice-ai/agent'
+import { logger } from 'fabrice-ai/telemetry'
+import { workflow } from 'fabrice-ai/workflow'
+import fs from 'fs'
+import path from 'path'
+
+import { askUser } from './tools/askUser.js'
+
+export const workingDir = path.resolve(import.meta.dirname, '../assets/')
+const filesToCleanup = ['project-summary.docx', 'project-summary.md']
+for (const file of filesToCleanup) {
+  if (fs.existsSync(path.join(workingDir, file))) fs.rmSync(path.join(workingDir, file))
+}
+
+export const outputPath = path.join(workingDir, 'project-summary.docx')
+
+const human = agent({
+  description: `
+    Use askUser tool to get the required input information for other agents`,
+  tools: {
+    askUser,
+  },
+})
+
+const browser = agent({
+  description: `
+    You are skilled at browsing Web with specified URLs,
+    methods, params etc.
+    You are using "httpTool" to get the data from the API and/or Web pages.
+  `,
+  tools: {
+    httpTool,
+  },
+})
+
+const fsTools = createFileSystemTools({
+  workingDir,
+})
+
+const reportCreator = agent({
+  description: `
+    Your role is to create a project report and save it in Microsoft Office, "docx" file.
+    I am able to read, save and convert documents and files using my toolkit.
+  `,
+  tools: {
+    convertFileWithPandoc: createPandocTool({
+      workingDir,
+    }).convertFileWithPandoc,
+    saveFile: fsTools.saveFile,
+    readFile: fsTools.readFile,
+  },
+})
+
+export const githubProjectReport = workflow({
+  team: { human, browser, reportCreator },
+  description: `
+    Ask human for the Github project locator: "<owner>/<repo>".
+    Browse the following URL: "https://api.github.com/repos/<owner>/<repo>".
+
+    Create a Markdown report about the most important project information.
+    Convert this report to "docx" - Word format - and save in the "${outputPath}" file.
+  `,
+  knowledge: `
+    Save files in the ${workingDir} only.
+  `,
+  output: `
+    Comprehensive Github project report:
+    - Returned in the Markdown format,
+    - Saved, in "docx" format in the "${outputPath}",
+    - Keep strict to output file name: "${outputPath}"
+  `,
+  snapshot: logger,
+})
diff --git a/example/src/github_docx_report.test.ts b/example/src/github_docx_report.test.ts
new file mode 100644
index 0000000..6f747ef
--- /dev/null
+++ b/example/src/github_docx_report.test.ts
@@ -0,0 +1,53 @@
+import 'dotenv/config'
+
+import { suite, test } from '@fabrice-ai/bdd/suite'
+import { testwork } from '@fabrice-ai/bdd/testwork'
+import fs from 'fs'
+
+import { githubProjectReport, outputPath, workingDir } from './github_docx_report.config.js'
+
+const testResults = await testwork(
+  githubProjectReport,
+  suite({
+    description: 'Black box testing suite',
+    team: {
+      browser: [
+        test(
+          '0_github_check',
+          'Browser agent should use the "httpTool" to browse Github for project details'
+        ),
+      ],
+      reportCreator: [
+        test(
+          '1_file_operations',
+          `The reportCreator agent is using saveFile, readFile or convertFileWithPandoc tools to operate only within the ${workingDir} directory`
+        ),
+      ],
+    },
+    workflow: [
+      test('2_finalOutput', `Final report saved to ${outputPath} file`, async (workflow, state) => {
+        if (!fs.existsSync(outputPath)) {
+          return {
+            passed: false,
+            reasoning: `Output file ${outputPath} does not exist`,
+            id: '2_finalOutput',
+          }
+        } else {
+          return {
+            passed: true,
+            reasoning: 'Output file saved correctly',
+            id: '2_finalOutput',
+          }
+        }
+      }),
+    ],
+  })
+)
+
+if (!testResults.passed) {
+  console.log('🚨 Test suite failed')
+  process.exit(-1)
+} else {
+  console.log('✅ Test suite passed')
+  process.exit(0)
+}
diff --git a/example/src/github_docx_report.ts b/example/src/github_docx_report.ts
new file mode 100644
index 0000000..1c916f9
--- /dev/null
+++ b/example/src/github_docx_report.ts
@@ -0,0 +1,10 @@
+import 'dotenv/config'
+
+import { solution } from 'fabrice-ai/solution'
+import { teamwork } from 'fabrice-ai/teamwork'
+
+import { githubProjectReport } from './github_docx_report.config.js'
+
+const result = await teamwork(githubProjectReport)
+
+console.log(solution(result))
diff --git a/packages/tools/src/pandoc.ts b/packages/tools/src/pandoc.ts
index dc712e5..f87ac0d 100644
--- a/packages/tools/src/pandoc.ts
+++ b/packages/tools/src/pandoc.ts
@@ -41,7 +41,9 @@ function ensurePandocExists(pandocPath: string) {
     // If it fails, it will throw.
     execFileSync(pandocPath, ['--version'], { stdio: 'ignore' })
   } catch (error) {
-    throw new Error(`Pandoc not found or not executable at path: ${pandocPath}`)
+    throw new Error(
+      `Pandoc not found or not executable at path: ${pandocPath}. Go to https://pandoc.org for installation details. Use "brew install pandoc" if you are on macOS and using Homebrew.`
+    )
   }
 }
 
@@ -189,7 +191,6 @@ export function createPandocTool(options?: PandocToolOptions) {
         content: z.string().describe('Raw content to convert.'),
         returnAsBase64: z
           .boolean()
-          .default(true)
           .describe(
-            'If true (default), returns base64 (useful for docx/pdf). Otherwise, returns plain text.'
+            'If true, returns base64 (useful for docx/pdf). Otherwise, returns plain text.'
           ),