Skip to content

Commit

Permalink
Fix gemma2 tokenizer convert (#8244)
Browse files Browse the repository at this point in the history
* fix gemma2 tokenizer convert

* remove scores

* improve code, fix new line issue
  • Loading branch information
ngxson authored Jul 1, 2024
1 parent cb5fad4 commit 5fac350
Showing 1 changed file with 27 additions and 10 deletions.
37 changes: 27 additions & 10 deletions convert-hf-to-gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -576,7 +576,19 @@ def _set_vocab_qwen(self):
special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
special_vocab.add_to_gguf(self.gguf_writer)

def _set_vocab_sentencepiece(self):
def _set_vocab_sentencepiece(self, add_to_gguf=True):
tokens, scores, toktypes = self._create_vocab_sentencepiece()

self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)

def _create_vocab_sentencepiece(self):
from sentencepiece import SentencePieceProcessor

tokenizer_path = self.dir_model / 'tokenizer.model'
Expand Down Expand Up @@ -638,14 +650,7 @@ def _set_vocab_sentencepiece(self):
scores.append(-1000.0)
toktypes.append(SentencePieceTokenTypes.UNUSED)

self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)
return tokens, scores, toktypes

def _set_vocab_llama_hf(self):
vocab = gguf.LlamaHfVocab(self.dir_model)
Expand Down Expand Up @@ -2345,7 +2350,19 @@ class Gemma2Model(Model):
model_arch = gguf.MODEL_ARCH.GEMMA2

def set_vocab(self):
self._set_vocab_llama_hf()
tokens, scores, toktypes = self._create_vocab_sentencepiece()
# hack: This is required so that we can properly use start/end-of-turn for chat template
for i in range(108):
# including <unusedX>, <start_of_turn>, <end_of_turn>
toktypes[i] = SentencePieceTokenTypes.CONTROL
self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)
self.gguf_writer.add_add_space_prefix(False)

def set_gguf_parameters(self):
Expand Down

0 comments on commit 5fac350

Please sign in to comment.