From 445904e6e4a0b792aa2220878c87f24fe233bb25 Mon Sep 17 00:00:00 2001
From: Colin Kealty
Date: Tue, 26 Nov 2024 16:46:41 -0500
Subject: [PATCH] Add tokenizer type

---
 include/llama.h | 1 +
 src/llama.cpp | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/include/llama.h b/include/llama.h
index ab5e376e6c7f2..1d285b40da011 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -104,6 +104,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
         LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
         LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
+        LLAMA_VOCAB_PRE_TYPE_OLMO2 = 27,
     };
 
     enum llama_rope_type {
diff --git a/src/llama.cpp b/src/llama.cpp
index af5e686e07eda..d157e024e19d3 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6406,6 +6406,9 @@ static void llm_load_vocab(
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
                 vocab.tokenizer_add_bos = true;
                 vocab.tokenizer_clean_spaces = false;
+            } else if (
+                tokenizer_pre == "olmo2") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO2;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }