Committed by Now-or-Never
Commit: e8bec2a
Parent: b947980

Upload 5 files

added_tokens.json ADDED
@@ -0,0 +1,5 @@
+{
+  "</s>": 2,
+  "<s>": 1,
+  "<unk>": 0
+}
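
added_tokens.json records the special-token-to-ID mapping that the tokenizer registers on top of the SentencePiece vocabulary. A quick sanity check is sketched below; the local path and the use of trust_remote_code are assumptions for illustration, not part of this commit.

from transformers import AutoTokenizer

# Load the tokenizer from a local clone of this repo (path is a placeholder).
tok = AutoTokenizer.from_pretrained("./internlm-tokenizer", trust_remote_code=True)

# The IDs should line up with added_tokens.json: <unk> -> 0, <s> -> 1, </s> -> 2
print(tok.convert_tokens_to_ids(["<unk>", "<s>", "</s>"]))  # expected: [0, 1, 2]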
tokenization_internlm.py CHANGED
@@ -65,6 +65,13 @@ class InternLMTokenizer(PreTrainedTokenizer):
         **kwargs,
     ):
         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        self.vocab_file = vocab_file
+        self.add_bos_token = add_bos_token
+        self.add_eos_token = add_eos_token
+        self.decode_with_prefix_space = decode_with_prefix_space
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(vocab_file)
+        self._no_prefix_space_tokens = None
         super().__init__(
             bos_token=bos_token,
             eos_token=eos_token,
@@ -73,14 +80,6 @@ class InternLMTokenizer(PreTrainedTokenizer):
             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
             **kwargs,
         )
-        self.vocab_file = vocab_file
-        self.add_bos_token = add_bos_token
-        self.add_eos_token = add_eos_token
-        self.decode_with_prefix_space = decode_with_prefix_space
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(vocab_file)
-        self._no_prefix_space_tokens = None
-
         """ Initialisation"""

         @property
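
The change above moves the SentencePiece setup ahead of super().__init__(). The likely motivation (an inference, not stated in the commit message): recent transformers releases have PreTrainedTokenizer.__init__ call back into subclass methods such as get_vocab() and the token-conversion hooks while registering added tokens, so self.sp_model must already exist at that point or loading fails with an AttributeError. A minimal sketch of the pattern, not the actual InternLM code:

import sentencepiece as spm
from transformers import PreTrainedTokenizer


class ToySpTokenizer(PreTrainedTokenizer):
    """Hypothetical tokenizer illustrating the required initialization order."""

    def __init__(self, vocab_file, **kwargs):
        # Everything the vocab callbacks rely on is created first ...
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file)
        # ... and only then does the base-class constructor run, which may
        # immediately call vocab_size / get_vocab / _convert_token_to_id.
        super().__init__(**kwargs)

    @property
    def vocab_size(self):
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        return {self.sp_model.id_to_piece(i): i for i in range(self.vocab_size)}

    def _convert_token_to_id(self, token):
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        return self.sp_model.id_to_piece(index)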
tokenizer_config.json CHANGED
@@ -1,4 +1,31 @@
 {
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
   "auto_map": {
     "AutoTokenizer": [
       "tokenization_internlm.InternLMTokenizer",
@@ -11,5 +38,6 @@
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "</s>",
   "tokenizer_class": "InternLMTokenizer",
+  "tokenizer_file": null,
   "unk_token": "<unk>"
 }
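
"added_tokens_decoder" is the serialized form of the tokenizer's special tokens that newer transformers versions (roughly 4.34 and later; the exact version bound is an approximation) read back as AddedToken objects keyed by token ID. A rough illustration of that correspondence, with values copied from the config above; the `special=` argument requires a recent tokenizers/transformers, so treat this as a sketch:

from transformers import AddedToken

# Each entry in "added_tokens_decoder" becomes an AddedToken keyed by its ID.
added_tokens_decoder = {
    0: AddedToken("<unk>", lstrip=False, rstrip=False, normalized=False,
                  single_word=False, special=True),
    1: AddedToken("<s>", lstrip=False, rstrip=False, normalized=False,
                  single_word=False, special=True),
    2: AddedToken("</s>", lstrip=False, rstrip=False, normalized=False,
                  single_word=False, special=True),
}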