From 445904e6e4a0b792aa2220878c87f24fe233bb25 Mon Sep 17 00:00:00 2001
From: Colin Kealty <ckealty1182@gmail.com>
Date: Tue, 26 Nov 2024 16:46:41 -0500
Subject: [PATCH] Add olmo2 pre-tokenizer type

---
 include/llama.h | 1 +
 src/llama.cpp   | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/include/llama.h b/include/llama.h
index ab5e376e6c7f2..1d285b40da011 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -104,6 +104,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH   = 24,
         LLAMA_VOCAB_PRE_TYPE_EXAONE         = 25,
         LLAMA_VOCAB_PRE_TYPE_CHAMELEON      = 26,
+        LLAMA_VOCAB_PRE_TYPE_OLMO2          = 27,
     };
 
     enum llama_rope_type {
diff --git a/src/llama.cpp b/src/llama.cpp
index af5e686e07eda..d157e024e19d3 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6406,6 +6406,9 @@ static void llm_load_vocab(
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
                 vocab.tokenizer_add_bos = true;
                 vocab.tokenizer_clean_spaces = false;
+            } else if (
+                tokenizer_pre == "olmo2") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO2;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }