Commit fa79495

llama : fix pre-tokenization of non-special added tokens (#8228)
* llama : fix mpt and olmo pre-tokenizer
* llama : pre-tokenize non-special user-defined tokens first
* llama : fix detection of control-like user-defined tokens
* convert_hf : identify which user-defined tokens are control tokens
  Only used in _set_vocab_gpt2() for now.
* convert_hf : identify more added control tokens for SPM tokenizers
  This makes Gemma and Gemma-2 tokenize pretty much EVERYTHING correctly,
  including HTML tags and consecutive spaces, but it unfortunately requires
  model re-conversion.
  There seems to be a weird behavior of the HF tokenizer for Gemma, which
  prefers to use the 16-space token over more lengthy space tokens, while
  using the SentencePiece tokenizer does not do this.
  (the implementation in llama.cpp has the same behavior as SentencePiece)
* llama : fix wrong pre-tokenization of byte tokens
* llama : fix Viking pre-tokenizer regex
  The order was previously wrong, which caused errors in some tests.
* llama : fix command-r detokenization
* convert_hf : reduce usages of the UNKNOWN token type
* llama : add UNKNOWN tokens in the special tokens cache
* convert_hf : reduce usages of UNKNOWN for InternLM2
  This makes the changes from #8321 more consistent with the other changes made here.
* test-tokenizer-random : reduce potential conflicts with #8379
* test-tokenizer-random : add a failing edge case for falcon
1 parent 17eb6aa commit fa79495
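
The change that ties most of these fixes together is the pre-tokenization order: user-defined (non-special) added tokens are now split out of the input before the regex pre-tokenizer runs, while control and unknown tokens are only matched when special-token parsing is requested (see tokenizer_st_partition in src/llama.cpp below). The following is a minimal Python sketch of that partitioning, with made-up token strings; it is an illustration, not the llama.cpp implementation:

import re

# Toy model of the new partitioning order (illustration only; not llama.cpp code).
# "<br>" and "<|endoftext|>" are hypothetical added tokens.
USER_DEFINED = ["<br>"]            # non-special added token
CONTROL      = ["<|endoftext|>"]   # control/special token

def partition(text: str, parse_special: bool) -> list[str]:
    # user-defined tokens are always split out first; control tokens only when parse_special is True
    specials = USER_DEFINED + (CONTROL if parse_special else [])
    pattern = "(" + "|".join(map(re.escape, specials)) + ")"
    return [frag for frag in re.split(pattern, text) if frag]

print(partition("a<br>b<|endoftext|>", parse_special=True))   # ['a', '<br>', 'b', '<|endoftext|>']
print(partition("a<br>b<|endoftext|>", parse_special=False))  # ['a', '<br>', 'b<|endoftext|>']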

File tree

4 files changed, +91 -61 lines changed


convert_hf_to_gguf.py

Lines changed: 71 additions & 41 deletions
@@ -373,6 +373,29 @@ def from_model_architecture(cls, arch: str) -> type[Model]:
         except KeyError:
             raise NotImplementedError(f'Architecture {arch!r} not supported!') from None

+    def does_token_look_special(self, token: str | bytes) -> bool:
+        if isinstance(token, (bytes, bytearray)):
+            token_text = token.decode(encoding="utf-8")
+        elif isinstance(token, memoryview):
+            token_text = token.tobytes().decode(encoding="utf-8")
+        else:
+            token_text = token
+
+        # Some models mark some added tokens which ought to be control tokens as not special.
+        # (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2})
+        seems_special = token_text in (
+            "<pad>", # deepseek-coder
+            "<mask>", "<2mass>", "[@BOS@]", # gemma{,-2}
+        )
+
+        seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>"))
+        seems_special = seems_special or (token_text.startswith("<｜") and token_text.endswith("｜>")) # deepseek-coder
+
+        # TODO: should these be marked as UNUSED instead? (maybe not)
+        seems_special = seems_special or (token_text.startswith("<unused") and token_text.endswith(">")) # gemma{,-2}
+
+        return seems_special
+
     # used for GPT-2 BPE and WordPiece vocabs
     def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         tokens: list[str] = []
@@ -391,16 +414,18 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.USER_DEFINED)
-            elif reverse_vocab[i] in added_vocab:
-                tokens.append(reverse_vocab[i])
-                if tokenizer.added_tokens_decoder[i].special:
-                    toktypes.append(gguf.TokenType.CONTROL)
-                else:
-                    toktypes.append(gguf.TokenType.USER_DEFINED)
+                toktypes.append(gguf.TokenType.UNUSED)
             else:
-                tokens.append(reverse_vocab[i])
-                toktypes.append(gguf.TokenType.NORMAL)
+                token: str = reverse_vocab[i]
+                if token in added_vocab:
+                    if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+                tokens.append(token)

         return tokens, toktypes, tokpre

@@ -559,7 +584,7 @@ def _set_vocab_qwen(self):
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.USER_DEFINED)
+                toktypes.append(gguf.TokenType.UNUSED)
             elif reverse_vocab[i] in added_vocab:
                 tokens.append(reverse_vocab[i])
                 toktypes.append(gguf.TokenType.CONTROL)
@@ -609,7 +634,7 @@ def _create_vocab_sentencepiece(self):

         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size

         for token_id in range(tokenizer.vocab_size()):
             piece = tokenizer.IdToPiece(token_id)
@@ -644,6 +669,25 @@ def _create_vocab_sentencepiece(self):
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED

+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+                for token_id, token_data in added_tokens_decoder.items():
+                    token_id = int(token_id)
+                    token: str = token_data["content"]
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
+                        assert tokens[token_id] == token.encode("utf-8")
+                    if token_data.get("special") or self.does_token_look_special(token):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+                    else:
+                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces
+                        toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+                    scores[token_id] = -1000.0
+                    tokens[token_id] = token.encode("utf-8")
+
         if vocab_size > len(tokens):
             pad_count = vocab_size - len(tokens)
             logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
@@ -1266,7 +1310,7 @@ def set_vocab(self):
         if (self.dir_model / "tokenizer.json").is_file():
             self._set_vocab_gpt2()
         else:
-            # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab
+            # StableLM 2 1.6B used to have a vocab in a similar format to Qwen's vocab
             self._set_vocab_qwen()

     def set_gguf_parameters(self):
@@ -1578,7 +1622,6 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])

         self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
-        self.gguf_writer.add_file_type(self.ftype)

         self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
         self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
@@ -1872,7 +1915,7 @@ def set_vocab(self):

         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size

         for token_id in range(tokenizer.vocab_size()):

@@ -1917,7 +1960,7 @@ def set_vocab(self):
                 for token_id, foken_data in added_tokens_decoder.items():
                     token_id = int(token_id)
                     token = foken_data["content"].encode("utf-8")
-                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                         assert tokens[token_id] == token
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
@@ -1933,7 +1976,7 @@ def set_vocab(self):
                 for foken_data in added_tokens:
                     token_id = int(foken_data["id"])
                     token = foken_data["content"].encode("utf-8")
-                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                         assert tokens[token_id] == token
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
@@ -2145,7 +2188,7 @@ def set_vocab(self):
                 toktype = SentencePieceTokenTypes.BYTE
             # take care of ununsed raw token
             if piece.startswith('[UNUSED'):
-                toktype = SentencePieceTokenTypes.UNKNOWN
+                toktype = SentencePieceTokenTypes.UNUSED

             tokens.append(text)
             scores.append(score)
@@ -2175,7 +2218,7 @@ def set_vocab(self):
                     if token == chat_eos_token:
                         chat_eos_token_id = token_id
                     token = token.encode("utf-8")
-                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                         assert(tokens[token_id] == token)
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
@@ -2194,7 +2237,7 @@ def set_vocab(self):
                     if token == chat_eos_token:
                         chat_eos_token_id = token_id
                     token = token.encode("utf-8")
-                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                         assert(tokens[token_id] == token)
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
@@ -2434,19 +2477,7 @@ class Gemma2Model(Model):
     model_arch = gguf.MODEL_ARCH.GEMMA2

     def set_vocab(self):
-        tokens, scores, toktypes = self._create_vocab_sentencepiece()
-        # hack: This is required so that we can properly use start/end-of-turn for chat template
-        for i in range(108):
-            # including <unusedX>, <start_of_turn>, <end_of_turn>
-            toktypes[i] = SentencePieceTokenTypes.CONTROL
-        self.gguf_writer.add_tokenizer_model("llama")
-        self.gguf_writer.add_tokenizer_pre("default")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
+        self._set_vocab_sentencepiece()

         self.gguf_writer.add_add_space_prefix(False)

@@ -2770,7 +2801,7 @@ def set_vocab(self):

         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size

         for token_id in range(tokenizer.vocab_size()):

@@ -3025,7 +3056,7 @@ def set_vocab(self):

         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size

         for token_id in range(tokenizer.vocab_size()):
             piece = tokenizer.IdToPiece(token_id)
@@ -3243,15 +3274,14 @@ def set_vocab_chatglm3(self):
             if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
                 score = tokenizer.tokenizer.sp_model.get_score(token_id)

-            if len(piece) == 0:
-                text = f"[PAD{token_id}]".encode("utf-8")
-
             if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
                 if piece in special_tokens:
-                    # show special tokens in prompt
-                    toktype = SentencePieceTokenTypes.USER_DEFINED
+                    toktype = SentencePieceTokenTypes.CONTROL
+                elif len(piece) == 0:
+                    text = f"[PAD{token_id}]".encode("utf-8")
+                    toktype = SentencePieceTokenTypes.UNUSED
                 else:
-                    toktype = SentencePieceTokenTypes.UNKNOWN
+                    toktype = SentencePieceTokenTypes.USER_DEFINED
                 tokens.append(text)
                 scores.append(score)
                 toktypes.append(toktype)
@@ -3340,7 +3370,7 @@ def set_vocab(self):
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.USER_DEFINED)
+                toktypes.append(gguf.TokenType.UNUSED)
             elif reverse_vocab[i] in added_vocab:
                 tokens.append(reverse_vocab[i])
                 if tokenizer.added_tokens_decoder[i].special:
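
For reference, the does_token_look_special() heuristic added above can be condensed into a stand-alone function. The sketch below is an illustration only (the <|im_start|> and <unused42> strings are just examples); it also shows the U+2581 pre-normalization applied to the remaining user-defined tokens:

# Condensed re-implementation of does_token_look_special(), for illustration only.
def looks_special(token_text: str) -> bool:
    special = token_text in ("<pad>", "<mask>", "<2mass>", "[@BOS@]")
    special = special or (token_text.startswith("<|") and token_text.endswith("|>"))
    special = special or (token_text.startswith("<unused") and token_text.endswith(">"))
    return special

assert looks_special("<|im_start|>")       # hypothetical <|...|> token -> CONTROL
assert looks_special("<unused42>")         # gemma-style unused slot -> CONTROL
assert not looks_special("\u2581\u2581")   # a multi-space token stays USER_DEFINED ...
# ... and its SentencePiece space marker (U+2581) is pre-normalized to plain spaces:
assert "\u2581\u2581".replace(b"\xe2\x96\x81".decode("utf-8"), " ") == "  "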

src/llama.cpp

Lines changed: 14 additions & 15 deletions
@@ -5419,6 +5419,7 @@ static void llm_load_vocab(
             } else if (
                     tokenizer_pre == "command-r") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
+                vocab.tokenizer_clean_spaces = false;
             } else if (
                     tokenizer_pre == "qwen2") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
@@ -5652,7 +5653,7 @@ static void llm_load_vocab(
     // build special tokens cache
     {
         for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
-            if (!(vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL)) {
+            if (vocab.id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)) {
                 vocab.cache_special_tokens.push_back(id);
             }
         }
@@ -15411,17 +15412,6 @@ struct llm_tokenizer_bpe {
                     "[0-9][0-9][0-9]",
                 };
                 break;
-            case LLAMA_VOCAB_PRE_TYPE_MPT:
-                // TODO: MPT pre-tokenization regexes are unknown
-                //       the following are close, but not exact. run the following:
-                //       ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
-                GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
-                regex_exprs = {
-                    "\\s?\\p{L}+",
-                    "\\s?\\p{P}+",
-                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
-                };
-                break;
             case LLAMA_VOCAB_PRE_TYPE_STARCODER:
             case LLAMA_VOCAB_PRE_TYPE_REFACT:
            case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
@@ -15431,6 +15421,7 @@ struct llm_tokenizer_bpe {
                 };
                 break;
             case LLAMA_VOCAB_PRE_TYPE_GPT2:
+            case LLAMA_VOCAB_PRE_TYPE_MPT:
             case LLAMA_VOCAB_PRE_TYPE_OLMO:
             case LLAMA_VOCAB_PRE_TYPE_JAIS:
                 regex_exprs = {
@@ -15457,8 +15448,8 @@ struct llm_tokenizer_bpe {
                 break;
             case LLAMA_VOCAB_PRE_TYPE_VIKING:
                 regex_exprs = {
-                    "\\p{N}",
                     " ?[^(\\s|.,!?…。,、।۔،)]+",
+                    "\\p{N}",
                 };
                 break;
             default:
@@ -16178,12 +16169,20 @@ struct fragment_buffer_variant {

 // #define PRETOKENIZERDEBUG

-static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
+static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) {
     // for each special token
     for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
         const auto & data = vocab.id_to_token[special_id];
         const auto & special_token = data.text;

+        if (!parse_special && (data.attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_UNKNOWN))) {
+            // Ignore control and unknown tokens when parse_special == false
+            continue;
+            // User-defined tokens are still pre-tokenized before everything else
+            // ref: https://github.com/huggingface/tokenizers/blob/fdd26ba9a3f0c133427aab0423888cbde91362d7/tokenizers/src/tokenizer/mod.rs#L726
+            // This is mostly relevant for neox-style tokenizers (mpt, olmo, stablelm, etc.)
+        }
+
         // for each text fragment
         std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
         while (it != buffer.end()) {
@@ -16296,7 +16295,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &

     if (!raw_text.empty()) {
         fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
-        if (parse_special) tokenizer_st_partition(vocab, fragment_buffer);
+        tokenizer_st_partition(vocab, fragment_buffer, parse_special);
     }

     switch (vocab.type) {
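
The special-token cache change in llm_load_vocab above means byte and unused tokens no longer take part in special-token matching; only control, user-defined, and unknown tokens do, which lines up with the "fix wrong pre-tokenization of byte tokens" item in the commit message. A toy model of the new filter (illustration only; the flag values below are arbitrary, not the real llama_token_attr constants):

from enum import Flag, auto

# Toy stand-in for the llama_token_attr bit-flags (values here are arbitrary).
class Attr(Flag):
    UNKNOWN      = auto()
    UNUSED       = auto()
    NORMAL       = auto()
    CONTROL      = auto()
    USER_DEFINED = auto()
    BYTE         = auto()

def cached_as_special(attr: Attr) -> bool:
    # mirrors the new test: attr & (CONTROL | USER_DEFINED | UNKNOWN)
    return bool(attr & (Attr.CONTROL | Attr.USER_DEFINED | Attr.UNKNOWN))

assert cached_as_special(Attr.CONTROL)
assert cached_as_special(Attr.USER_DEFINED)
assert not cached_as_special(Attr.BYTE)    # would have matched the old "not NORMAL" test
assert not cached_as_special(Attr.UNUSED)  # likewise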

tests/test-tokenizer-0.cpp

Lines changed: 2 additions & 2 deletions
@@ -195,7 +195,7 @@ int main(int argc, char **argv) {
     const bool add_special = false;

     for (const auto & test_kv : k_tests) {
-        const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, add_special, true);
+        const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, add_special, false);

         printf("\n");
         printf("src: '%s'\n", test_kv.first.c_str());
@@ -253,7 +253,7 @@ int main(int argc, char **argv) {
     {
         const auto t_start = ggml_time_us();

-        res = llama_tokenize(ctx, text, add_special, true);
+        res = llama_tokenize(ctx, text, add_special, false);

         const auto t_end = ggml_time_us();

tests/test-tokenizer-random.py

Lines changed: 4 additions & 3 deletions
@@ -20,7 +20,7 @@
 from typing_extensions import Buffer

 import cffi
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, PreTrainedTokenizer


 logger = logging.getLogger("test-tokenizer-random")
@@ -129,7 +129,7 @@ def decode(self, ids: list[int]) -> str:
 class TokenizerGroundtruth (Tokenizer):

     def __init__(self, dir_tokenizer: str):
-        self.model = AutoTokenizer.from_pretrained(dir_tokenizer)
+        self.model: PreTrainedTokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
         # guess BOS and EOS
         ids = self.encode("a")
         assert 1 <= len(ids) <= 3
@@ -143,7 +143,7 @@ def __init__(self, dir_tokenizer: str):
         self.vocab = list(sorted(self.vocab))
         # tokens and lists
         self.special_tokens = list(self.model.all_special_tokens)
-        self.added_tokens = list(self.model.added_tokens_encoder)
+        self.added_tokens = self.model.batch_decode(self.model.added_tokens_encoder.values(), skip_special_tokens=False)
         self.bos_token = self.model.bos_token
         self.eos_token = self.model.eos_token

@@ -232,6 +232,7 @@ def generator_custom_text_edge_cases() -> Iterator[str]:
         'a\na', # bert fail
         '"`', # falcon
         ' \u2e4e', # falcon
+        '\n\x0b ', # falcon
         'a\xa0\xa0\x00b', # jina-v2-es
         'one <mask>', # jina-v2-es <mask> lstrip=true
         'a </s> b', # rstrip phi-3

0 commit comments
