hamishivi committed on
Commit
7a3dfb3
1 Parent(s): 5fd88e4

fix tokenizer

Browse files
special_tokens_map.json CHANGED
@@ -12,5 +12,12 @@
12
  "normalized": false,
13
  "rstrip": false,
14
  "single_word": false
 
 
 
 
 
 
 
15
  }
16
  }
 
12
  "normalized": false,
13
  "rstrip": false,
14
  "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<pad>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
  }
23
  }
tokenizer.json CHANGED
@@ -2300,7 +2300,7 @@
2300
  },
2301
  {
2302
  "id": 128255,
2303
- "content": "<|reserved_special_token_250|>",
2304
  "single_word": false,
2305
  "lstrip": false,
2306
  "rstrip": false,
@@ -2329,69 +2329,10 @@
2329
  ]
2330
  },
2331
  "post_processor": {
2332
- "type": "Sequence",
2333
- "processors": [
2334
- {
2335
- "type": "ByteLevel",
2336
- "add_prefix_space": true,
2337
- "trim_offsets": false,
2338
- "use_regex": true
2339
- },
2340
- {
2341
- "type": "TemplateProcessing",
2342
- "single": [
2343
- {
2344
- "SpecialToken": {
2345
- "id": "<|begin_of_text|>",
2346
- "type_id": 0
2347
- }
2348
- },
2349
- {
2350
- "Sequence": {
2351
- "id": "A",
2352
- "type_id": 0
2353
- }
2354
- }
2355
- ],
2356
- "pair": [
2357
- {
2358
- "SpecialToken": {
2359
- "id": "<|begin_of_text|>",
2360
- "type_id": 0
2361
- }
2362
- },
2363
- {
2364
- "Sequence": {
2365
- "id": "A",
2366
- "type_id": 0
2367
- }
2368
- },
2369
- {
2370
- "SpecialToken": {
2371
- "id": "<|begin_of_text|>",
2372
- "type_id": 1
2373
- }
2374
- },
2375
- {
2376
- "Sequence": {
2377
- "id": "B",
2378
- "type_id": 1
2379
- }
2380
- }
2381
- ],
2382
- "special_tokens": {
2383
- "<|begin_of_text|>": {
2384
- "id": "<|begin_of_text|>",
2385
- "ids": [
2386
- 128000
2387
- ],
2388
- "tokens": [
2389
- "<|begin_of_text|>"
2390
- ]
2391
- }
2392
- }
2393
- }
2394
- ]
2395
  },
2396
  "decoder": {
2397
  "type": "ByteLevel",
@@ -2407,7 +2348,7 @@
2407
  "end_of_word_suffix": null,
2408
  "fuse_unk": false,
2409
  "byte_fallback": false,
2410
- "ignore_merges": true,
2411
  "vocab": {
2412
  "!": 0,
2413
  "\"": 1,
@@ -410560,4 +410501,4 @@
410560
  "éĶ ¦"
410561
  ]
410562
  }
410563
- }
 
2300
  },
2301
  {
2302
  "id": 128255,
2303
+ "content": "<pad>",
2304
  "single_word": false,
2305
  "lstrip": false,
2306
  "rstrip": false,
 
2329
  ]
2330
  },
2331
  "post_processor": {
2332
+ "type": "ByteLevel",
2333
+ "add_prefix_space": true,
2334
+ "trim_offsets": false,
2335
+ "use_regex": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2336
  },
2337
  "decoder": {
2338
  "type": "ByteLevel",
 
2348
  "end_of_word_suffix": null,
2349
  "fuse_unk": false,
2350
  "byte_fallback": false,
2351
+ "ignore_merges": false,
2352
  "vocab": {
2353
  "!": 0,
2354
  "\"": 1,
 
410501
  "éĶ ¦"
410502
  ]
410503
  }
410504
+ }
tokenizer_config.json CHANGED
@@ -2041,7 +2041,7 @@
2041
  "special": true
2042
  },
2043
  "128255": {
2044
- "content": "<|reserved_special_token_250|>",
2045
  "lstrip": false,
2046
  "normalized": false,
2047
  "rstrip": false,
@@ -2050,6 +2050,7 @@
2050
  }
2051
  },
2052
  "bos_token": "<|begin_of_text|>",
 
2053
  "clean_up_tokenization_spaces": true,
2054
  "eos_token": "<|end_of_text|>",
2055
  "model_input_names": [
@@ -2057,6 +2058,6 @@
2057
  "attention_mask"
2058
  ],
2059
  "model_max_length": 1000000000000000019884624838656,
2060
- "tokenizer_class": "PreTrainedTokenizerFast",
2061
- "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
2062
  }
 
2041
  "special": true
2042
  },
2043
  "128255": {
2044
+ "content": "<pad>",
2045
  "lstrip": false,
2046
  "normalized": false,
2047
  "rstrip": false,
 
2050
  }
2051
  },
2052
  "bos_token": "<|begin_of_text|>",
2053
+ "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
2054
  "clean_up_tokenization_spaces": true,
2055
  "eos_token": "<|end_of_text|>",
2056
  "model_input_names": [
 
2058
  "attention_mask"
2059
  ],
2060
  "model_max_length": 1000000000000000019884624838656,
2061
+ "pad_token": "<pad>",
2062
+ "tokenizer_class": "PreTrainedTokenizerFast"
2063
  }