From 0e3f7069133bde5ae0eea03b2d41a3c8c22edd7a Mon Sep 17 00:00:00 2001
From: Joe McIlvain
Date: Tue, 25 Jun 2024 16:41:44 -0700
Subject: [PATCH] feat: add optional `metadata` to `KurtResult` type

This metadata can give information like how many tokens were used
(useful for cost tracking) and/or a system fingerprint (useful in
conjunction with LLM providers that let you seed the pseudo-random
number generator used during token sampling).
---
 packages/kurt/src/KurtStream.ts | 58 +++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/packages/kurt/src/KurtStream.ts b/packages/kurt/src/KurtStream.ts
index 7f851c9..883d036 100644
--- a/packages/kurt/src/KurtStream.ts
+++ b/packages/kurt/src/KurtStream.ts
@@ -68,6 +68,64 @@ export type KurtResult = {
    * easily ignore this field and only look at the `data` field.
    */
   additionalData?: D[]
+
+  /**
+   * Metadata about the result and how it was generated.
+   */
+  metadata?: {
+    /**
+     * The total number of tokens used to represent the original request,
+     * including all prompt messages, tool definitions, and other input data.
+     *
+     * LLMs deal with text one token at a time, so this is a good indicator of
+     * how many computation steps have been expended to ingest the input.
+     *
+     * Paid LLM services often have pricing models based on the number of
+     * tokens used (in both the request and response). Therefore, to calculate
+     * overall cost, both `totalInputTokens` and `totalOutputTokens`
+     * must be taken into account alongside the service's pricing details.
+     *
+     * Also, LLMs have maximum context windows measured in tokens, so this
+     * same sum can be used to evaluate how much of the available capacity
+     * was used by this request, and possibly set up an alert to fire if
+     * the application is coming too close to the maximum context window.
+     */
+    totalInputTokens?: number
+
+    /**
+     * The total number of tokens used across all of the response chunks,
+     * as reported by the underlying LLM or service.
+     *
+     * LLMs deal with text one token at a time, so this is a good indicator of
+     * how many computation steps have been expended to create the output.
+     *
+     * Paid LLM services often have pricing models based on the number of
+     * tokens used (in both the request and response). Therefore, to calculate
+     * overall cost, both `totalInputTokens` and `totalOutputTokens`
+     * must be taken into account alongside the service's pricing details.
+     *
+     * Also, LLMs have maximum context windows measured in tokens, so this
+     * same sum can be used to evaluate how much of the available capacity
+     * was used by this request, and possibly set up an alert to fire if
+     * the application is coming too close to the maximum context window.
+     */
+    totalOutputTokens?: number
+
+    /**
+     * If present, this opaque string can be compared across responses to check
+     * whether they were generated by the same version of the underlying LLM.
+     *
+     * Some LLM providers support choosing the seed for the pseudo-random
+     * number generator used during token sampling, as a way of reproducing
+     * earlier results (by selecting the same seed), but this will only
+     * produce deterministic results if the underlying LLM and overall system
+     * are the same across identically-seeded requests.
+     *
+     * Therefore, the `systemFingerprint` is provided by some LLM providers
+     * as a hint for when non-determinism should be expected.
+     */
+    systemFingerprint?: string
+  }
 }
 
 /**
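
As a usage sketch (not part of the patch itself), here is one way calling code might consume the new `metadata` field for cost tracking. The `TokenMetadata` type below is a local structural mirror of the `metadata` shape added in this diff, and the per-token prices and the `estimateCostUSD` helper name are illustrative assumptions only; real prices come from the LLM provider's pricing details.

// Minimal sketch of consuming the new `metadata` field for cost tracking.
// The `TokenMetadata` type mirrors the shape added by this patch; the prices
// below are placeholder assumptions, not real provider pricing.
type TokenMetadata = {
  totalInputTokens?: number
  totalOutputTokens?: number
  systemFingerprint?: string
}

// Hypothetical USD prices per token (providers often quote per million tokens).
const USD_PER_INPUT_TOKEN = 0.5 / 1_000_000
const USD_PER_OUTPUT_TOKEN = 1.5 / 1_000_000

function estimateCostUSD(metadata?: TokenMetadata): number | undefined {
  if (metadata === undefined) return undefined
  const { totalInputTokens, totalOutputTokens } = metadata
  // Both token counts are optional, so only estimate when both were reported.
  if (totalInputTokens === undefined || totalOutputTokens === undefined)
    return undefined
  return (
    totalInputTokens * USD_PER_INPUT_TOKEN +
    totalOutputTokens * USD_PER_OUTPUT_TOKEN
  )
}

// e.g. const cost = estimateCostUSD(result.metadata), where `result` is a
// KurtResult produced by a Kurt generation call.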