From 0e3f7069133bde5ae0eea03b2d41a3c8c22edd7a Mon Sep 17 00:00:00 2001
From: Joe McIlvain
Date: Tue, 25 Jun 2024 16:41:44 -0700
Subject: [PATCH] feat: add optional `metadata` to `KurtResult` type

This metadata can give information like how many tokens were used
(useful for cost tracking) and/or a system fingerprint (useful in
conjunction with LLM providers that let you seed the pseudo-random
number generator used during token sampling).
---
 packages/kurt/src/KurtStream.ts | 58 +++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/packages/kurt/src/KurtStream.ts b/packages/kurt/src/KurtStream.ts
index 7f851c9..883d036 100644
--- a/packages/kurt/src/KurtStream.ts
+++ b/packages/kurt/src/KurtStream.ts
@@ -68,6 +68,64 @@ export type KurtResult = {
    * easily ignore this field and only look at the `data` field.
    */
   additionalData?: D[]
+
+  /**
+   * Metadata about the result and how it was generated.
+   */
+  metadata?: {
+    /**
+     * The total number of tokens used to represent the original request,
+     * including all prompt messages, tool definitions, and other input data.
+     *
+     * LLMs deal with text one token at a time, so this is a good indicator of
+     * how many computation steps have been expended to ingest the input.
+     *
+     * Paid LLM services often have pricing models based on the number of
+     * tokens used (in both the request and response). Therefore, to calculate
+     * overall cost, both `totalInputTokens` and `totalOutputTokens`
+     * must be taken into account alongside the service's pricing details.
+     *
+     * Also, LLMs have maximum context windows measured in tokens, so this
+     * same sum can be used to evaluate how much of the available capacity
+     * was used by this request, and possibly set up an alert to fire if
+     * the application is coming too close to the maximum context window.
+     */
+    totalInputTokens?: number
+
+    /**
+     * The total number of tokens used across all of the response chunks,
+     * as reported by the underlying LLM or service.
+     *
+     * LLMs deal with text one token at a time, so this is a good indicator of
+     * how many computation steps have been expended to create the output.
+     *
+     * Paid LLM services often have pricing models based on the number of
+     * tokens used (in both the request and response). Therefore, to calculate
+     * overall cost, both `totalInputTokens` and `totalOutputTokens`
+     * must be taken into account alongside the service's pricing details.
+     *
+     * Also, LLMs have maximum context windows measured in tokens, so this
+     * same sum can be used to evaluate how much of the available capacity
+     * was used by this request, and possibly set up an alert to fire if
+     * the application is coming too close to the maximum context window.
+     */
+    totalOutputTokens?: number
+
+    /**
+     * If present, this opaque string can be compared across responses to check
+     * whether they were generated by the same version of the underlying LLM.
+     *
+     * Some LLM providers support choosing the seed for the pseudo-random
+     * number generator used during token sampling, as a way of reproducing
+     * earlier results (by selecting the same seed), but this will only
+     * produce deterministic results if the underlying LLM and overall system
+     * are the same across identically-seeded requests.
+     *
+     * Therefore, the `systemFingerprint` is provided by some LLM providers
+     * as a hint for when non-determinism should be expected.
+     */
+    systemFingerprint?: string
+  }
 }
 
 /**
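
As a usage sketch (not part of the patch itself), here is one way calling code might consume the new `metadata` field for cost tracking. The `TokenMetadata` type below is a local structural mirror of the `metadata` shape added in this diff, and the per-token prices and the `estimateCostUSD` helper name are illustrative assumptions only; real prices come from the LLM provider's pricing details.

// Minimal sketch of consuming the new `metadata` field for cost tracking.
// The `TokenMetadata` type mirrors the shape added by this patch; the prices
// below are placeholder assumptions, not real provider pricing.
type TokenMetadata = {
  totalInputTokens?: number
  totalOutputTokens?: number
  systemFingerprint?: string
}

// Hypothetical USD prices per token (providers often quote per million tokens).
const USD_PER_INPUT_TOKEN = 0.5 / 1_000_000
const USD_PER_OUTPUT_TOKEN = 1.5 / 1_000_000

function estimateCostUSD(metadata?: TokenMetadata): number | undefined {
  if (metadata === undefined) return undefined
  const { totalInputTokens, totalOutputTokens } = metadata
  // Both token counts are optional, so only estimate when both were reported.
  if (totalInputTokens === undefined || totalOutputTokens === undefined)
    return undefined
  return (
    totalInputTokens * USD_PER_INPUT_TOKEN +
    totalOutputTokens * USD_PER_OUTPUT_TOKEN
  )
}

// e.g. const cost = estimateCostUSD(result.metadata), where `result` is a
// KurtResult produced by a Kurt generation call.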