From 62dc4a92cde8b937449fb91c7924f06b0da20149 Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 1 Jul 2024 23:07:26 +0200 Subject: [PATCH 1/3] fix gemma2 tokenizer convert --- convert-hf-to-gguf.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 4a7f500ff7d..1bd6b6d42f3 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -576,7 +576,7 @@ def _set_vocab_qwen(self): special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"]) special_vocab.add_to_gguf(self.gguf_writer) - def _set_vocab_sentencepiece(self): + def _set_vocab_sentencepiece(self, add_to_gguf=True): from sentencepiece import SentencePieceProcessor tokenizer_path = self.dir_model / 'tokenizer.model' @@ -640,12 +640,15 @@ def _set_vocab_sentencepiece(self): self.gguf_writer.add_tokenizer_model("llama") self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) + + if add_to_gguf: + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + special_vocab.add_to_gguf(self.gguf_writer) + + return tokens, scores, toktypes, special_vocab def _set_vocab_llama_hf(self): vocab = gguf.LlamaHfVocab(self.dir_model) @@ -2345,7 +2348,15 @@ class Gemma2Model(Model): model_arch = gguf.MODEL_ARCH.GEMMA2 def set_vocab(self): - self._set_vocab_llama_hf() + tokens, scores, toktypes, special_vocab = self._set_vocab_sentencepiece(add_to_gguf=False) + # hack: This is required so that we can properly use start/end-of-turn for chat template + for i in range(216): # 216 -> last special token + scores[i] = -1000.0 + toktypes[i] = SentencePieceTokenTypes.CONTROL + 
self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + special_vocab.add_to_gguf(self.gguf_writer) self.gguf_writer.add_add_space_prefix(False) def set_gguf_parameters(self): From 1a99b5ec6e462bcf82befe92059bd769a83f1656 Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 1 Jul 2024 23:14:06 +0200 Subject: [PATCH 2/3] remove scores --- convert-hf-to-gguf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 1bd6b6d42f3..b6d212f4e01 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -2351,7 +2351,6 @@ def set_vocab(self): tokens, scores, toktypes, special_vocab = self._set_vocab_sentencepiece(add_to_gguf=False) # hack: This is required so that we can properly use start/end-of-turn for chat template for i in range(216): # 216 -> last special token - scores[i] = -1000.0 toktypes[i] = SentencePieceTokenTypes.CONTROL self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_scores(scores) From 922a5e2939655b9e38afa1ab46f8db1ce5450cf6 Mon Sep 17 00:00:00 2001 From: ngxson Date: Tue, 2 Jul 2024 00:49:06 +0200 Subject: [PATCH 3/3] improve code, fix new line issue --- convert-hf-to-gguf.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index b6d212f4e01..6833e943765 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -577,6 +577,18 @@ def _set_vocab_qwen(self): special_vocab.add_to_gguf(self.gguf_writer) def _set_vocab_sentencepiece(self, add_to_gguf=True): + tokens, scores, toktypes = self._create_vocab_sentencepiece() + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, 
n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) + def _create_vocab_sentencepiece(self): from sentencepiece import SentencePieceProcessor tokenizer_path = self.dir_model / 'tokenizer.model' @@ -638,17 +650,7 @@ def _set_vocab_sentencepiece(self, add_to_gguf=True): scores.append(-1000.0) toktypes.append(SentencePieceTokenTypes.UNUSED) - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - - if add_to_gguf: - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - special_vocab.add_to_gguf(self.gguf_writer) - - return tokens, scores, toktypes, special_vocab + return tokens, scores, toktypes def _set_vocab_llama_hf(self): vocab = gguf.LlamaHfVocab(self.dir_model) @@ -2348,13 +2350,18 @@ class Gemma2Model(Model): model_arch = gguf.MODEL_ARCH.GEMMA2 def set_vocab(self): - tokens, scores, toktypes, special_vocab = self._set_vocab_sentencepiece(add_to_gguf=False) + tokens, scores, toktypes = self._create_vocab_sentencepiece() # hack: This is required so that we can properly use start/end-of-turn for chat template - for i in range(216): # 216 -> last special token + for i in range(108): + # including <unusedX>, <start_of_turn>, <end_of_turn> toktypes[i] = SentencePieceTokenTypes.CONTROL + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_scores(scores) self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) self.gguf_writer.add_add_space_prefix(False)