discojs*,cli*: rename blockSize and maxSequenceLength to contextLength
JulienVig committed Nov 14, 2024
1 parent cb806c0 commit 8cbc96e
Showing 12 changed files with 41 additions and 41 deletions.
6 changes: 3 additions & 3 deletions cli/src/benchmark_gpt.ts
@@ -69,18 +69,18 @@ async function main(args: Required<CLIArguments>): Promise<void> {
   const config: models.GPTConfig = {
     modelType: modelType as models.GPTConfig['modelType'],
     maxIter: iterationsPerEpoch,
-    blockSize: contextLength,
     lr: 0.0001,
+    contextLength,
   }
 
   // Load the dataset after setting the Task batch size and max sequence length
   // to make sure the dataset is batched and tokenized correctly
   task.trainingInformation.batchSize = batchSize
-  task.trainingInformation.maxSequenceLength = contextLength
+  task.trainingInformation.contextLength = contextLength
   const dataset = loadText('../datasets/wikitext/wiki.train.tokens')
     .map(text => processing.tokenize(tokenizer, text))
     .flat()
-    .batchWithOverlap(config.blockSize)
+    .batchWithOverlap(config.contextLength)
 
   const preprocessedDataset = dataset
     .map((tokens) => [tokens.pop(), tokens.last()] as [List<number>, number])
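A note on the `.map((tokens) => [tokens.pop(), tokens.last()])` step above: with the immutable.js `List` used throughout discojs, `pop()` returns a new list without its last element and `last()` returns that element, so each window of contextLength + 1 tokens splits into the model inputs and a next-token label. A minimal sketch (the token values are illustrative):

import { List } from "immutable";

// A window produced by batchWithOverlap carries contextLength + 1 tokens:
// the first contextLength are model inputs, the final one is the label.
const window = List([10, 11, 12, 13, 14]); // contextLength = 4, plus one label

const inputs = window.pop();  // List [10, 11, 12, 13]: pop() drops the last element
const label = window.last();  // 14: the next token the model must predict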
4 changes: 2 additions & 2 deletions cli/src/train_gpt.ts
@@ -13,7 +13,7 @@ async function main(): Promise<void> {
     maxIter: 50,
     evaluateEvery:50,
     maxEvalBatches: 10,
-    blockSize: 16,
+    contextLength: 16,
     seed
   }
 
@@ -22,7 +22,7 @@ async function main(): Promise<void> {
   const tokenDataset = new Dataset([data])
     .map((text: string) => processing.tokenize(tokenizer, text))
     .flat()
-    .batchWithOverlap(config.blockSize)
+    .batchWithOverlap(config.contextLength)
     .map((tokens) => [tokens.pop(), tokens.last()] as [List<number>, number])
     .repeat()
     .batch(8);
18 changes: 9 additions & 9 deletions discojs/src/dataset/dataset.spec.ts
@@ -152,29 +152,29 @@ describe("dataset", () => {
 
   it("batchWithOverlap yields correct batches", async () => {
     const expectedTokens = Range(0, 53).toList()
-    const blockSize = 4
+    const contextLength = 4
 
     const parsed = new Dataset([expectedTokens])
       .flat()
-      .batchWithOverlap(blockSize)
+      .batchWithOverlap(contextLength)
 
     // -1 because the last sequence is dropped as there is no next token label
-    const expectedLength = Math.ceil(expectedTokens.size / blockSize) - 1
+    const expectedLength = Math.ceil(expectedTokens.size / contextLength) - 1
     expect(await parsed.size()).to.equal(expectedLength);
 
     // exclude the last sequence because it has been padded
     let sequences = List(await arrayFromAsync(parsed))
-    // we expect the last sequence to have blockSize + 1 tokens via padding
-    expect(sequences.last()?.size).to.equal(blockSize + 1)
+    // we expect the last sequence to have contextLength + 1 tokens via padding
+    expect(sequences.last()?.size).to.equal(contextLength + 1)
     sequences = sequences.pop()
     let i = 0
     for await (const tokens of sequences) {
-      // each sequence has length blockSize + 1 (for the label)
+      // each sequence has length contextLength + 1 (for the label)
       expect(tokens.toArray()).to.deep.equal(
-        expectedTokens.slice(i, i + blockSize + 1).toArray()
+        expectedTokens.slice(i, i + contextLength + 1).toArray()
      );
-      // but the window should move by blockSize only
-      i += blockSize
+      // but the window should move by contextLength only
+      i += contextLength
     }
   })
 
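The test above pins down the windowing arithmetic. The following sketch illustrates the same overlap scheme in isolation; it is not the actual `batchWithOverlap` implementation (in particular, the padding of the final partial window is omitted):

// Each window holds contextLength + 1 tokens but advances by contextLength,
// so consecutive windows overlap by exactly one token.
function* overlappingWindows(tokens: number[], contextLength: number) {
  for (let i = 0; i + contextLength < tokens.length; i += contextLength) {
    yield tokens.slice(i, i + contextLength + 1);
  }
}

// 53 tokens with contextLength = 4 yield ceil(53 / 4) - 1 = 13 windows,
// [0..4], [4..8], ..., [48..52], matching expectedLength in the test.
const windows = [...overlappingWindows([...Array(53).keys()], 4)];
console.log(windows.length); // 13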
4 changes: 2 additions & 2 deletions discojs/src/default_tasks/wikitext.ts
@@ -35,15 +35,15 @@ export const wikitext: TaskProvider<'text'> = {
         roundDuration: 2,
         batchSize: 8, // If set too high firefox raises a WebGL error
         tokenizer: 'Xenova/gpt2',
-        maxSequenceLength: 64,
+        contextLength: 64,
         tensorBackend: 'gpt'
       }
     }
   },
 
   getModel(): Promise<Model<'text'>> {
     return Promise.resolve(new models.GPT({
-      blockSize: this.getTask().trainingInformation.maxSequenceLength,
+      contextLength: this.getTask().trainingInformation.contextLength,
     }))
   }
 }
4 changes: 2 additions & 2 deletions discojs/src/models/gpt/config.ts
@@ -9,7 +9,7 @@ type GPTModelType =
 
 export interface GPTConfig {
   lr: number
-  blockSize: number
+  contextLength: number
   vocabSize?: number
   modelType: GPTModelType
   name?: string,
@@ -39,7 +39,7 @@ export const DefaultGPTConfig: Required<GPTConfig> = {
   evaluate: true,
   maxEvalBatches: 12,
   evaluateEvery: 100,
-  blockSize: 128,
+  contextLength: 128,
   vocabSize: 50257,
   debug: false,
   dropout: 0.2,
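After the rename, callers pass `contextLength` where they previously wrote `blockSize`. A hedged usage sketch; the import path is an assumption, and the field values are taken from the cli/src/train_gpt.ts change above:

import { models } from "@epfml/discojs"; // import path assumed

// The GPT constructor accepts a partial config (see discojs/src/models/gpt/index.ts
// below); omitted fields fall back to DefaultGPTConfig, e.g. contextLength = 128.
const model = new models.GPT({
  lr: 0.001,
  contextLength: 16, // formerly `blockSize: 16`
  maxIter: 50,
});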
2 changes: 1 addition & 1 deletion discojs/src/models/gpt/gpt.spec.ts
@@ -25,7 +25,7 @@ describe("gpt-tfjs", function () {
     maxIter: 10,
     evaluateEvery: 50,
     maxEvalBatches: 10,
-    blockSize: 8,
+    contextLength: 8,
     seed
   });
   for (let i = 0; i < 5; i++)
8 changes: 4 additions & 4 deletions discojs/src/models/gpt/index.ts
@@ -27,7 +27,7 @@ export type GPTSerialization = {
 export class GPT extends Model<"text"> {
   private readonly model: GPTModel;
 
-  readonly #blockSize: number;
+  readonly #contextLength: number;
   readonly #maxBatchCount: number;
   readonly #vocabSize: number;
 
@@ -38,7 +38,7 @@ export class GPT extends Model<"text"> {
     model.compile();
     this.model = model;
 
-    this.#blockSize = partialConfig?.blockSize ?? DefaultGPTConfig.blockSize;
+    this.#contextLength = partialConfig?.contextLength ?? DefaultGPTConfig.contextLength;
     this.#maxBatchCount = partialConfig?.maxIter ?? DefaultGPTConfig.maxIter;
     this.#vocabSize = partialConfig?.vocabSize ?? DefaultGPTConfig.vocabSize;
   }
@@ -157,7 +157,7 @@ export class GPT extends Model<"text"> {
    * Generate the next token after the input sequence.
    * In other words, takes an input tensor of shape (prompt length T) and returns a tensor of shape (T+1)
    *
-   * @param token input tokens of shape (T,). T is truncated to the model's block size
+   * @param token input tokens of shape (T,). T is truncated to the model's context length
    * @param config generation config: temperature, doSample, topk
    * @returns the next token predicted by the model
    */
@@ -166,7 +166,7 @@
     config: GenerationConfig,
   ): Promise<DataFormat.ModelEncoded["text"][1]> {
     // slice input tokens if longer than context length
-    tokens = tokens.slice(-this.#blockSize);
+    tokens = tokens.slice(-this.#contextLength);
 
     const input = tf.tidy(() =>
       tf.tensor1d(tokens.toArray(), "int32").expandDims<tf.Tensor2D>(0),
14 changes: 7 additions & 7 deletions discojs/src/models/gpt/layers.ts
@@ -67,7 +67,7 @@ tf.serialization.registerClass(LogLayer)
 
 type CausalSelfAttentionConfig =
   ConstructorParameters<typeof tf.layers.Layer>[0]
-  & Record<'blockSize' | 'nHead' | 'nEmbd' | 'dropout' | 'nLayer' | 'seed', number>
+  & Record<'contextLength' | 'nHead' | 'nEmbd' | 'dropout' | 'nLayer' | 'seed', number>
 
 class CausalSelfAttention extends tf.layers.Layer {
   static readonly className = 'CausalSelfAttention'
@@ -97,7 +97,7 @@ class CausalSelfAttention extends tf.layers.Layer {
     // mask is a lower triangular matrix filled with 1
     // calling bandPart zero out the upper triangular part of the all-ones matrix
     // from the doc: tf.linalg.band_part(input, -1, 0) ==> Lower triangular part
-    this.mask = tf.linalg.bandPart(tf.ones([config.blockSize, config.blockSize]), -1, 0)
+    this.mask = tf.linalg.bandPart(tf.ones([config.contextLength, config.contextLength]), -1, 0)
   }
 
   override build (): void {
@@ -266,15 +266,15 @@ class GELU extends tf.layers.Layer {
 tf.serialization.registerClass(GELU)
 
 type MLPConfig = ConstructorParameters<typeof tf.layers.Layer>[0] &
-  Required<ModelSize> & Record<'blockSize' | 'residDrop' | 'nLayer' | 'seed', number>
+  Required<ModelSize> & Record<'contextLength' | 'residDrop' | 'nLayer' | 'seed', number>
 
 function MLP(config: MLPConfig): tf.LayersModel {
   return tf.sequential({ layers: [
     tf.layers.dense({
       name: config.name + `.mlp.c_fc`,
       units: 4 * config.nEmbd,
       inputDim: config.nEmbd,
-      inputShape: [config.blockSize, config.nEmbd],
+      inputShape: [config.contextLength, config.nEmbd],
       kernelInitializer: tf.initializers.randomNormal({
         mean: 0, stddev: 0.02, seed: config.seed
       }),
@@ -284,7 +284,7 @@ function MLP(config: MLPConfig): tf.LayersModel {
       name: config.name + '.mlp.c_proj',
       units: config.nEmbd,
       inputDim: 4 * config.nEmbd,
-      inputShape: [config.blockSize, 4 * config.nEmbd],
+      inputShape: [config.contextLength, 4 * config.nEmbd],
       kernelInitializer: tf.initializers.randomNormal({
         mean: 0, stddev: 0.02 * Math.sqrt(2 * config.nLayer), seed: config.seed
       }),
@@ -306,7 +306,7 @@ type BlockConfig = CausalSelfAttentionConfig & MLPConfig & { debug: boolean }
  */
 function TransformerBlock (conf: BlockConfig): tf.LayersModel {
   const config = Object.assign({ name: '.h' }, conf)
-  const inputs = tf.input({ shape: [config.blockSize, config.nEmbd] })
+  const inputs = tf.input({ shape: [config.contextLength, config.nEmbd] })
   let x1, x2
   // input normalization
   x1 = tf.layers.layerNormalization({
@@ -469,7 +469,7 @@ export function GPTArchitecture(config: Required<GPTConfig>): tf.LayersModel {
   const range = new Range({}).apply(inputs)
   let posEmb = tf.layers.embedding({
     name: config.name + '.wpe',
-    inputDim: config.blockSize,
+    inputDim: config.contextLength,
     outputDim: config.nEmbd,
     embeddingsInitializer: tf.initializers.randomNormal({
       mean: 0, stddev: 0.02, seed: config.seed
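For readers unfamiliar with the mask built in `CausalSelfAttention` above: `tf.linalg.bandPart(x, -1, 0)` keeps the lower triangle of `x` and zeroes everything above the diagonal, which is what makes attention causal. A standalone illustration (a context length of 4 is chosen arbitrarily):

import * as tf from "@tensorflow/tfjs";

// Reduce an all-ones square matrix to its lower triangle: position i may
// attend to positions 0..i only, never to future tokens.
const contextLength = 4;
const mask = tf.linalg.bandPart(tf.ones([contextLength, contextLength]), -1, 0);
mask.print();
// [[1, 0, 0, 0],
//  [1, 1, 0, 0],
//  [1, 1, 1, 0],
//  [1, 1, 1, 1]]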
8 changes: 4 additions & 4 deletions discojs/src/processing/index.ts
@@ -56,12 +56,12 @@ export async function preprocess<D extends DataType>(
       // cast as typescript doesn't reduce generic type
       const d = dataset as Dataset<DataFormat.Raw["text"]>;
       const t = task as Task<"text">;
-      const blockSize = task.trainingInformation.maxSequenceLength
+      const contextLength = task.trainingInformation.contextLength
 
       const tokenizer = await models.getTaskTokenizer(t);
       return d.map(text => processing.tokenize(tokenizer, text))
         .flat()
-        .batchWithOverlap(blockSize)
+        .batchWithOverlap(contextLength)
         .map((tokens) => [tokens.pop(), tokens.last()]) as
         Dataset<DataFormat.ModelEncoded[D]>;
     }
@@ -97,12 +97,12 @@ export async function preprocessWithoutLabel<D extends DataType>(
       // cast as typescript doesn't reduce generic type
       const d = dataset as Dataset<DataFormat.Raw["text"]>;
       const t = task as Task<"text">;
-      const blockSize = task.trainingInformation.maxSequenceLength
+      const contextLength = task.trainingInformation.contextLength
       const tokenizer = await models.getTaskTokenizer(t);
 
       return d.map(text => processing.tokenize(tokenizer, text))
         .flat()
-        .batch(blockSize)
+        .batch(contextLength)
     }
   }
 }
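The two functions above differ only in how they window the token stream. A condensed sketch of the contrast, assuming `dataset`, `tokenizer`, and `contextLength` are already in scope:

// Training: overlapping windows of contextLength + 1 tokens, split into
// contextLength inputs plus one next-token label per window.
const withLabels = dataset
  .map((text) => processing.tokenize(tokenizer, text))
  .flat()
  .batchWithOverlap(contextLength)
  .map((tokens) => [tokens.pop(), tokens.last()]);

// Inference: no label is needed, so disjoint windows of exactly
// contextLength tokens suffice.
const withoutLabels = dataset
  .map((text) => processing.tokenize(tokenizer, text))
  .flat()
  .batch(contextLength);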
2 changes: 1 addition & 1 deletion discojs/src/serialization/model.spec.ts
@@ -51,7 +51,7 @@ describe('serialization', () => {
       maxIter: 10,
       evaluateEvery:10,
       maxEvalBatches: 10,
-      blockSize: 8,
+      contextLength: 8,
     }
     const model = new models.GPT(config)
 
10 changes: 5 additions & 5 deletions discojs/src/task/training_information.ts
@@ -65,9 +65,9 @@ interface DataTypeToTrainingInformation {
     // When the tokenizer is first called, the actual object will be initialized and loaded into this field for the subsequent tokenizations.
     tokenizer: string | PreTrainedTokenizer;
 
-    // maxSequenceLength: the maximum length of a input string used as input to a GPT model. It is used during preprocessing to
+    // contextLength: the maximum length of a input string used as input to a GPT model. It is used during preprocessing to
     // truncate strings to a maximum length. The default value is tokenizer.model_max_length
-    maxSequenceLength: number;
+    contextLength: number;
   };
 }
 
@@ -224,7 +224,7 @@ export function isTrainingInformation(
     }
     case "text": {
       const {
-        maxSequenceLength,
+        contextLength,
         tokenizer,
       }: Partial<
         Omit<TrainingInformation<"text">,
@@ -234,14 +234,14 @@
       if (
         (typeof tokenizer !== "string" &&
           !(tokenizer instanceof PreTrainedTokenizer)) ||
-        (typeof maxSequenceLength !== "number")
+        (typeof contextLength !== "number")
       )
         return false;
 
       const _: TrainingInformation<"text"> = {
         ...repack,
         dataType,
-        maxSequenceLength,
+        contextLength,
         tokenizer,
       } satisfies Record<keyof TrainingInformation<"text">, unknown>;
 
2 changes: 1 addition & 1 deletion webapp/src/components/testing/__tests__/Testing.spec.ts
@@ -28,7 +28,7 @@ const TASK: Task<"text"> = {
     batchSize: 1,
     roundDuration: 1,
     validationSplit: 0,
-    maxSequenceLength: 64,
+    contextLength: 64,
   },
 };
 
