@@ -373,6 +373,29 @@ def from_model_architecture(cls, arch: str) -> type[Model]:
         except KeyError:
             raise NotImplementedError(f'Architecture {arch!r} not supported!') from None
 
+    def does_token_look_special(self, token: str | bytes) -> bool:
+        if isinstance(token, (bytes, bytearray)):
+            token_text = token.decode(encoding="utf-8")
+        elif isinstance(token, memoryview):
+            token_text = token.tobytes().decode(encoding="utf-8")
+        else:
+            token_text = token
+
+        # Some models mark some added tokens which ought to be control tokens as not special.
+        # (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2})
+        seems_special = token_text in (
+            "<pad>",  # deepseek-coder
+            "<mask>", "<2mass>", "[@BOS@]",  # gemma{,-2}
+        )
+
+        seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>"))
+        seems_special = seems_special or (token_text.startswith("<｜") and token_text.endswith("｜>"))  # deepseek-coder
+
+        # TODO: should these be marked as UNUSED instead? (maybe not)
+        seems_special = seems_special or (token_text.startswith("<unused") and token_text.endswith(">"))  # gemma{,-2}
+
+        return seems_special
+
     # used for GPT-2 BPE and WordPiece vocabs
     def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         tokens: list[str] = []
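For reference, a minimal standalone sketch (not part of the patch) of the heuristic that `does_token_look_special()` implements; the helper name `looks_special` and the sample tokens are hypothetical:

# Hypothetical standalone version of the does_token_look_special() heuristic.
def looks_special(token_text: str) -> bool:
    known_control = ("<pad>", "<mask>", "<2mass>", "[@BOS@]")
    return (
        token_text in known_control
        or (token_text.startswith("<|") and token_text.endswith("|>"))
        or (token_text.startswith("<｜") and token_text.endswith("｜>"))
        or (token_text.startswith("<unused") and token_text.endswith(">"))
    )

assert looks_special("<|im_start|>")  # chat-style control token
assert looks_special("<unused42>")    # gemma-style reserved token
assert not looks_special("hello")     # ordinary vocabulary entry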
@@ -391,16 +414,18 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.USER_DEFINED)
-            elif reverse_vocab[i] in added_vocab:
-                tokens.append(reverse_vocab[i])
-                if tokenizer.added_tokens_decoder[i].special:
-                    toktypes.append(gguf.TokenType.CONTROL)
-                else:
-                    toktypes.append(gguf.TokenType.USER_DEFINED)
+                toktypes.append(gguf.TokenType.UNUSED)
             else:
-                tokens.append(reverse_vocab[i])
-                toktypes.append(gguf.TokenType.NORMAL)
+                token: str = reverse_vocab[i]
+                if token in added_vocab:
+                    if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+                tokens.append(token)
 
         return tokens, toktypes, tokpre
 
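The byte string `b"\xe2\x96\x81"` decodes to "▁" (U+2581), the marker SentencePiece-style pre-tokenizers use for a leading space. A small sketch of the pre-normalization applied to non-special added tokens above; the example token is made up:

# "\xe2\x96\x81" is UTF-8 for "▁" (U+2581), the SentencePiece space marker.
sp_space = b"\xe2\x96\x81".decode("utf-8")

token = "▁my_added_token"                  # hypothetical non-special added token
normalized = token.replace(sp_space, " ")  # pre-normalize user-defined spaces
assert normalized == " my_added_token"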
@@ -559,7 +584,7 @@ def _set_vocab_qwen(self):
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.USER_DEFINED)
+                toktypes.append(gguf.TokenType.UNUSED)
             elif reverse_vocab[i] in added_vocab:
                 tokens.append(reverse_vocab[i])
                 toktypes.append(gguf.TokenType.CONTROL)
@@ -609,7 +634,7 @@ def _create_vocab_sentencepiece(self):
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
 
         for token_id in range(tokenizer.vocab_size()):
             piece = tokenizer.IdToPiece(token_id)
@@ -644,6 +669,25 @@ def _create_vocab_sentencepiece(self):
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
 
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+                for token_id, token_data in added_tokens_decoder.items():
+                    token_id = int(token_id)
+                    token: str = token_data["content"]
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
+                        assert tokens[token_id] == token.encode("utf-8")
+                    if token_data.get("special") or self.does_token_look_special(token):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+                    else:
+                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
+                        toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+                    scores[token_id] = -1000.0
+                    tokens[token_id] = token.encode("utf-8")
+
         if vocab_size > len(tokens):
             pad_count = vocab_size - len(tokens)
             logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
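As a rough illustration of the data this new pass reads, a hypothetical `added_tokens_decoder` section from a `tokenizer_config.json` and how its entries would be classified (ids and contents are made up; the real logic above also consults `does_token_look_special()`):

# Hypothetical tokenizer_config.json "added_tokens_decoder" excerpt.
added_tokens_decoder = {
    "32000": {"content": "<|im_start|>", "special": True},  # -> CONTROL
    "32001": {"content": "▁new_word", "special": False},    # -> USER_DEFINED, "▁" pre-normalized to " "
}

for token_id, token_data in added_tokens_decoder.items():
    kind = "CONTROL" if token_data.get("special") else "USER_DEFINED"
    print(int(token_id), repr(token_data["content"]), kind)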
@@ -1266,7 +1310,7 @@ def set_vocab(self):
         if (self.dir_model / "tokenizer.json").is_file():
             self._set_vocab_gpt2()
         else:
-            # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab
+            # StableLM 2 1.6B used to have a vocab in a similar format to Qwen's vocab
             self._set_vocab_qwen()
 
     def set_gguf_parameters(self):
@@ -1578,7 +1622,6 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
 
         self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
-        self.gguf_writer.add_file_type(self.ftype)
 
         self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
         self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
@@ -1872,7 +1915,7 @@ def set_vocab(self):
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
 
         for token_id in range(tokenizer.vocab_size()):
 
@@ -1917,7 +1960,7 @@ def set_vocab(self):
                 for token_id, foken_data in added_tokens_decoder.items():
                     token_id = int(token_id)
                     token = foken_data["content"].encode("utf-8")
-                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                         assert tokens[token_id] == token
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
@@ -1933,7 +1976,7 @@ def set_vocab(self):
                 for foken_data in added_tokens:
                     token_id = int(foken_data["id"])
                     token = foken_data["content"].encode("utf-8")
-                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                         assert tokens[token_id] == token
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
@@ -2145,7 +2188,7 @@ def set_vocab(self):
                 toktype = SentencePieceTokenTypes.BYTE
             # take care of ununsed raw token
             if piece.startswith('[UNUSED'):
-                toktype = SentencePieceTokenTypes.UNKNOWN
+                toktype = SentencePieceTokenTypes.UNUSED
 
             tokens.append(text)
             scores.append(score)
@@ -2175,7 +2218,7 @@ def set_vocab(self):
                     if token == chat_eos_token:
                         chat_eos_token_id = token_id
                     token = token.encode("utf-8")
-                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                         assert(tokens[token_id] == token)
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
@@ -2194,7 +2237,7 @@ def set_vocab(self):
                     if token == chat_eos_token:
                         chat_eos_token_id = token_id
                     token = token.encode("utf-8")
-                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                         assert(tokens[token_id] == token)
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
@@ -2434,19 +2477,7 @@ class Gemma2Model(Model):
     model_arch = gguf.MODEL_ARCH.GEMMA2
 
     def set_vocab(self):
-        tokens, scores, toktypes = self._create_vocab_sentencepiece()
-        # hack: This is required so that we can properly use start/end-of-turn for chat template
-        for i in range(108):
-            # including <unusedX>, <start_of_turn>, <end_of_turn>
-            toktypes[i] = SentencePieceTokenTypes.CONTROL
-        self.gguf_writer.add_tokenizer_model("llama")
-        self.gguf_writer.add_tokenizer_pre("default")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
+        self._set_vocab_sentencepiece()
 
         self.gguf_writer.add_add_space_prefix(False)
 
@@ -2770,7 +2801,7 @@ def set_vocab(self):
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
 
         for token_id in range(tokenizer.vocab_size()):
 
@@ -3025,7 +3056,7 @@ def set_vocab(self):
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
 
         for token_id in range(tokenizer.vocab_size()):
             piece = tokenizer.IdToPiece(token_id)
@@ -3243,15 +3274,14 @@ def set_vocab_chatglm3(self):
             if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
                 score = tokenizer.tokenizer.sp_model.get_score(token_id)
 
-            if len(piece) == 0:
-                text = f"[PAD{token_id}]".encode("utf-8")
-
             if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
                 if piece in special_tokens:
-                    # show special tokens in prompt
-                    toktype = SentencePieceTokenTypes.USER_DEFINED
+                    toktype = SentencePieceTokenTypes.CONTROL
+                elif len(piece) == 0:
+                    text = f"[PAD{token_id}]".encode("utf-8")
+                    toktype = SentencePieceTokenTypes.UNUSED
                 else:
-                    toktype = SentencePieceTokenTypes.UNKNOWN
+                    toktype = SentencePieceTokenTypes.USER_DEFINED
             tokens.append(text)
             scores.append(score)
             toktypes.append(toktype)
@@ -3340,7 +3370,7 @@ def set_vocab(self):
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.USER_DEFINED)
+                toktypes.append(gguf.TokenType.UNUSED)
             elif reverse_vocab[i] in added_vocab:
                 tokens.append(reverse_vocab[i])
                 if tokenizer.added_tokens_decoder[i].special: