Upload tokenizer
Browse files- tokenizer.json +54 -54
- tokenizer_config.json +2 -2
- vocab.txt +32 -32
tokenizer.json
CHANGED
@@ -217,40 +217,40 @@
|
|
217 |
"|": 69,
|
218 |
"}": 70,
|
219 |
"~": 71,
|
220 |
-
"##
|
221 |
-
"##
|
222 |
-
"##
|
223 |
-
"##
|
224 |
-
"##
|
225 |
-
"##
|
226 |
-
"##
|
227 |
-
"##
|
228 |
-
"##
|
229 |
-
"##
|
230 |
-
"##
|
231 |
-
"##
|
232 |
-
"##
|
233 |
-
"##
|
234 |
-
"##
|
235 |
-
"##
|
236 |
-
"##
|
237 |
-
"##
|
238 |
-
"##
|
239 |
-
"##
|
240 |
-
"##
|
241 |
-
"##
|
242 |
-
"##
|
243 |
-
"##
|
244 |
-
"##
|
245 |
-
"##
|
246 |
-
"##
|
247 |
-
"##
|
248 |
"##q": 100,
|
249 |
-
"##
|
250 |
-
"##
|
251 |
-
"##
|
252 |
-
"##
|
253 |
-
"##
|
254 |
"##6": 106,
|
255 |
"##7": 107,
|
256 |
"##he": 108,
|
@@ -9740,8 +9740,8 @@
|
|
9740 |
"simpl": 9592,
|
9741 |
"thrusting": 9593,
|
9742 |
"intact": 9594,
|
9743 |
-
"##
|
9744 |
-
"##
|
9745 |
"cellar": 9597,
|
9746 |
"borrow": 9598,
|
9747 |
"tomb": 9599,
|
@@ -21890,8 +21890,8 @@
|
|
21890 |
"mellow": 21742,
|
21891 |
"recesses": 21743,
|
21892 |
"nondescript": 21744,
|
21893 |
-
"##
|
21894 |
-
"##
|
21895 |
"##idences": 21747,
|
21896 |
"altering": 21748,
|
21897 |
"ordinarily": 21749,
|
@@ -23289,8 +23289,8 @@
|
|
23289 |
"unwitting": 23141,
|
23290 |
"ogling": 23142,
|
23291 |
"horrendous": 23143,
|
23292 |
-
"##
|
23293 |
-
"##
|
23294 |
"undressing": 23146,
|
23295 |
"overdue": 23147,
|
23296 |
"trask": 23148,
|
@@ -25866,8 +25866,8 @@
|
|
25866 |
"casserole": 25718,
|
25867 |
"resilient": 25719,
|
25868 |
"latham": 25720,
|
25869 |
-
"##
|
25870 |
-
"##
|
25871 |
"watt": 25723,
|
25872 |
"lightness": 25724,
|
25873 |
"visage": 25725,
|
@@ -26792,9 +26792,9 @@
|
|
26792 |
"demolished": 26644,
|
26793 |
"jus": 26645,
|
26794 |
"zem": 26646,
|
26795 |
-
"##
|
26796 |
-
"##
|
26797 |
-
"##
|
26798 |
"blub": 26650,
|
26799 |
"talbot": 26651,
|
26800 |
"departments": 26652,
|
@@ -28185,8 +28185,8 @@
|
|
28185 |
"governess": 28037,
|
28186 |
"implacable": 28038,
|
28187 |
"vampaneze": 28039,
|
28188 |
-
"##
|
28189 |
-
"##
|
28190 |
"desari": 28042,
|
28191 |
"drax": 28043,
|
28192 |
"machi": 28044,
|
@@ -29352,8 +29352,8 @@
|
|
29352 |
"hinged": 29204,
|
29353 |
"nile": 29205,
|
29354 |
"##glasses": 29206,
|
29355 |
-
"##
|
29356 |
-
"##
|
29357 |
"stumped": 29209,
|
29358 |
"carrick": 29210,
|
29359 |
"##utha": 29211,
|
@@ -29611,8 +29611,8 @@
|
|
29611 |
"pleasured": 29463,
|
29612 |
"intermittent": 29464,
|
29613 |
"unscrewed": 29465,
|
29614 |
-
"##
|
29615 |
-
"##
|
29616 |
"##oreal": 29468,
|
29617 |
"disheart": 29469,
|
29618 |
"emban": 29470,
|
@@ -29845,8 +29845,8 @@
|
|
29845 |
"pf": 29697,
|
29846 |
"siber": 29698,
|
29847 |
"vie": 29699,
|
29848 |
-
"##
|
29849 |
-
"##
|
29850 |
"doomb": 29702,
|
29851 |
"atro": 29703,
|
29852 |
"insom": 29704,
|
@@ -30016,8 +30016,8 @@
|
|
30016 |
"imitate": 29868,
|
30017 |
"drinkers": 29869,
|
30018 |
"ethans": 29870,
|
30019 |
-
"##
|
30020 |
-
"##
|
30021 |
"spiel": 29873,
|
30022 |
"vehem": 29874,
|
30023 |
"besie": 29875,
|
|
|
217 |
"|": 69,
|
218 |
"}": 70,
|
219 |
"~": 71,
|
220 |
+
"##e": 72,
|
221 |
+
"##r": 73,
|
222 |
+
"##t": 74,
|
223 |
+
"##a": 75,
|
224 |
+
"##n": 76,
|
225 |
+
"##i": 77,
|
226 |
+
"##g": 78,
|
227 |
+
"##s": 79,
|
228 |
+
"##d": 80,
|
229 |
+
"##o": 81,
|
230 |
+
"##m": 82,
|
231 |
+
"##b": 83,
|
232 |
+
"##p": 84,
|
233 |
+
"##u": 85,
|
234 |
+
"##l": 86,
|
235 |
+
"##c": 87,
|
236 |
+
"##y": 88,
|
237 |
+
"##z": 89,
|
238 |
+
"##h": 90,
|
239 |
+
"##w": 91,
|
240 |
+
"##f": 92,
|
241 |
+
"##k": 93,
|
242 |
+
"##4": 94,
|
243 |
+
"##j": 95,
|
244 |
+
"##v": 96,
|
245 |
+
"##0": 97,
|
246 |
+
"##8": 98,
|
247 |
+
"##1": 99,
|
248 |
"##q": 100,
|
249 |
+
"##x": 101,
|
250 |
+
"##3": 102,
|
251 |
+
"##5": 103,
|
252 |
+
"##2": 104,
|
253 |
+
"##9": 105,
|
254 |
"##6": 106,
|
255 |
"##7": 107,
|
256 |
"##he": 108,
|
|
|
9740 |
"simpl": 9592,
|
9741 |
"thrusting": 9593,
|
9742 |
"intact": 9594,
|
9743 |
+
"##rus": 9595,
|
9744 |
+
"##phe": 9596,
|
9745 |
"cellar": 9597,
|
9746 |
"borrow": 9598,
|
9747 |
"tomb": 9599,
|
|
|
21890 |
"mellow": 21742,
|
21891 |
"recesses": 21743,
|
21892 |
"nondescript": 21744,
|
21893 |
+
"##nuts": 21745,
|
21894 |
+
"##oom": 21746,
|
21895 |
"##idences": 21747,
|
21896 |
"altering": 21748,
|
21897 |
"ordinarily": 21749,
|
|
|
23289 |
"unwitting": 23141,
|
23290 |
"ogling": 23142,
|
23291 |
"horrendous": 23143,
|
23292 |
+
"##din": 23144,
|
23293 |
+
"##uzz": 23145,
|
23294 |
"undressing": 23146,
|
23295 |
"overdue": 23147,
|
23296 |
"trask": 23148,
|
|
|
25866 |
"casserole": 25718,
|
25867 |
"resilient": 25719,
|
25868 |
"latham": 25720,
|
25869 |
+
"##grove": 25721,
|
25870 |
+
"##pace": 25722,
|
25871 |
"watt": 25723,
|
25872 |
"lightness": 25724,
|
25873 |
"visage": 25725,
|
|
|
26792 |
"demolished": 26644,
|
26793 |
"jus": 26645,
|
26794 |
"zem": 26646,
|
26795 |
+
"##eep": 26647,
|
26796 |
+
"##tus": 26648,
|
26797 |
+
"##mies": 26649,
|
26798 |
"blub": 26650,
|
26799 |
"talbot": 26651,
|
26800 |
"departments": 26652,
|
|
|
28185 |
"governess": 28037,
|
28186 |
"implacable": 28038,
|
28187 |
"vampaneze": 28039,
|
28188 |
+
"##rically": 28040,
|
28189 |
+
"##ugged": 28041,
|
28190 |
"desari": 28042,
|
28191 |
"drax": 28043,
|
28192 |
"machi": 28044,
|
|
|
29352 |
"hinged": 29204,
|
29353 |
"nile": 29205,
|
29354 |
"##glasses": 29206,
|
29355 |
+
"##yran": 29207,
|
29356 |
+
"##has": 29208,
|
29357 |
"stumped": 29209,
|
29358 |
"carrick": 29210,
|
29359 |
"##utha": 29211,
|
|
|
29611 |
"pleasured": 29463,
|
29612 |
"intermittent": 29464,
|
29613 |
"unscrewed": 29465,
|
29614 |
+
"##nick": 29466,
|
29615 |
+
"##pread": 29467,
|
29616 |
"##oreal": 29468,
|
29617 |
"disheart": 29469,
|
29618 |
"emban": 29470,
|
|
|
29845 |
"pf": 29697,
|
29846 |
"siber": 29698,
|
29847 |
"vie": 29699,
|
29848 |
+
"##sin": 29700,
|
29849 |
+
"##hinking": 29701,
|
29850 |
"doomb": 29702,
|
29851 |
"atro": 29703,
|
29852 |
"insom": 29704,
|
|
|
30016 |
"imitate": 29868,
|
30017 |
"drinkers": 29869,
|
30018 |
"ethans": 29870,
|
30019 |
+
"##ritch": 29871,
|
30020 |
+
"##uke": 29872,
|
30021 |
"spiel": 29873,
|
30022 |
"vehem": 29874,
|
30023 |
"besie": 29875,
|
tokenizer_config.json
CHANGED
@@ -1,11 +1,11 @@
|
|
1 |
{
|
|
|
2 |
"cls_token": "[CLS]",
|
3 |
"do_lower_case": true,
|
4 |
"mask_token": "[MASK]",
|
5 |
-
"model_max_length":
|
6 |
"pad_token": "[PAD]",
|
7 |
"sep_token": "[SEP]",
|
8 |
-
"special_tokens_map_file": null,
|
9 |
"strip_accents": null,
|
10 |
"tokenize_chinese_chars": true,
|
11 |
"tokenizer_class": "BertTokenizer",
|
|
|
1 |
{
|
2 |
+
"clean_up_tokenization_spaces": true,
|
3 |
"cls_token": "[CLS]",
|
4 |
"do_lower_case": true,
|
5 |
"mask_token": "[MASK]",
|
6 |
+
"model_max_length": 48,
|
7 |
"pad_token": "[PAD]",
|
8 |
"sep_token": "[SEP]",
|
|
|
9 |
"strip_accents": null,
|
10 |
"tokenize_chinese_chars": true,
|
11 |
"tokenizer_class": "BertTokenizer",
|
vocab.txt
CHANGED
@@ -70,40 +70,40 @@ z
|
|
70 |
|
|
71 |
}
|
72 |
~
|
73 |
-
##u
|
74 |
-
##m
|
75 |
-
##p
|
76 |
-
##i
|
77 |
##e
|
|
|
|
|
78 |
##a
|
79 |
-
##l
|
80 |
-
##d
|
81 |
-
##o
|
82 |
##n
|
|
|
83 |
##g
|
84 |
-
##h
|
85 |
-
##t
|
86 |
-
##0
|
87 |
-
##5
|
88 |
-
##r
|
89 |
##s
|
90 |
-
##
|
91 |
-
##
|
92 |
-
##
|
93 |
-
##
|
|
|
|
|
|
|
94 |
##c
|
|
|
95 |
##z
|
96 |
-
##
|
|
|
|
|
|
|
97 |
##4
|
|
|
|
|
|
|
98 |
##8
|
99 |
##1
|
100 |
-
##b
|
101 |
##q
|
102 |
-
##
|
103 |
-
##v
|
104 |
-
##k
|
105 |
##3
|
106 |
-
##
|
|
|
|
|
107 |
##6
|
108 |
##7
|
109 |
##he
|
@@ -9593,8 +9593,8 @@ meals
|
|
9593 |
simpl
|
9594 |
thrusting
|
9595 |
intact
|
9596 |
-
##phe
|
9597 |
##rus
|
|
|
9598 |
cellar
|
9599 |
borrow
|
9600 |
tomb
|
@@ -21743,8 +21743,8 @@ prophecies
|
|
21743 |
mellow
|
21744 |
recesses
|
21745 |
nondescript
|
21746 |
-
##oom
|
21747 |
##nuts
|
|
|
21748 |
##idences
|
21749 |
altering
|
21750 |
ordinarily
|
@@ -23142,8 +23142,8 @@ mounts
|
|
23142 |
unwitting
|
23143 |
ogling
|
23144 |
horrendous
|
23145 |
-
##uzz
|
23146 |
##din
|
|
|
23147 |
undressing
|
23148 |
overdue
|
23149 |
trask
|
@@ -25719,8 +25719,8 @@ rivulets
|
|
25719 |
casserole
|
25720 |
resilient
|
25721 |
latham
|
25722 |
-
##pace
|
25723 |
##grove
|
|
|
25724 |
watt
|
25725 |
lightness
|
25726 |
visage
|
@@ -26645,9 +26645,9 @@ impersonal
|
|
26645 |
demolished
|
26646 |
jus
|
26647 |
zem
|
26648 |
-
##mies
|
26649 |
##eep
|
26650 |
##tus
|
|
|
26651 |
blub
|
26652 |
talbot
|
26653 |
departments
|
@@ -28038,8 +28038,8 @@ dabbing
|
|
28038 |
governess
|
28039 |
implacable
|
28040 |
vampaneze
|
28041 |
-
##ugged
|
28042 |
##rically
|
|
|
28043 |
desari
|
28044 |
drax
|
28045 |
machi
|
@@ -29205,8 +29205,8 @@ recycling
|
|
29205 |
hinged
|
29206 |
nile
|
29207 |
##glasses
|
29208 |
-
##has
|
29209 |
##yran
|
|
|
29210 |
stumped
|
29211 |
carrick
|
29212 |
##utha
|
@@ -29464,8 +29464,8 @@ refocus
|
|
29464 |
pleasured
|
29465 |
intermittent
|
29466 |
unscrewed
|
29467 |
-
##pread
|
29468 |
##nick
|
|
|
29469 |
##oreal
|
29470 |
disheart
|
29471 |
emban
|
@@ -29698,8 +29698,8 @@ alder
|
|
29698 |
pf
|
29699 |
siber
|
29700 |
vie
|
29701 |
-
##hinking
|
29702 |
##sin
|
|
|
29703 |
doomb
|
29704 |
atro
|
29705 |
insom
|
@@ -29869,8 +29869,8 @@ aggie
|
|
29869 |
imitate
|
29870 |
drinkers
|
29871 |
ethans
|
29872 |
-
##uke
|
29873 |
##ritch
|
|
|
29874 |
spiel
|
29875 |
vehem
|
29876 |
besie
|
|
|
70 |
|
|
71 |
}
|
72 |
~
|
|
|
|
|
|
|
|
|
73 |
##e
|
74 |
+
##r
|
75 |
+
##t
|
76 |
##a
|
|
|
|
|
|
|
77 |
##n
|
78 |
+
##i
|
79 |
##g
|
|
|
|
|
|
|
|
|
|
|
80 |
##s
|
81 |
+
##d
|
82 |
+
##o
|
83 |
+
##m
|
84 |
+
##b
|
85 |
+
##p
|
86 |
+
##u
|
87 |
+
##l
|
88 |
##c
|
89 |
+
##y
|
90 |
##z
|
91 |
+
##h
|
92 |
+
##w
|
93 |
+
##f
|
94 |
+
##k
|
95 |
##4
|
96 |
+
##j
|
97 |
+
##v
|
98 |
+
##0
|
99 |
##8
|
100 |
##1
|
|
|
101 |
##q
|
102 |
+
##x
|
|
|
|
|
103 |
##3
|
104 |
+
##5
|
105 |
+
##2
|
106 |
+
##9
|
107 |
##6
|
108 |
##7
|
109 |
##he
|
|
|
9593 |
simpl
|
9594 |
thrusting
|
9595 |
intact
|
|
|
9596 |
##rus
|
9597 |
+
##phe
|
9598 |
cellar
|
9599 |
borrow
|
9600 |
tomb
|
|
|
21743 |
mellow
|
21744 |
recesses
|
21745 |
nondescript
|
|
|
21746 |
##nuts
|
21747 |
+
##oom
|
21748 |
##idences
|
21749 |
altering
|
21750 |
ordinarily
|
|
|
23142 |
unwitting
|
23143 |
ogling
|
23144 |
horrendous
|
|
|
23145 |
##din
|
23146 |
+
##uzz
|
23147 |
undressing
|
23148 |
overdue
|
23149 |
trask
|
|
|
25719 |
casserole
|
25720 |
resilient
|
25721 |
latham
|
|
|
25722 |
##grove
|
25723 |
+
##pace
|
25724 |
watt
|
25725 |
lightness
|
25726 |
visage
|
|
|
26645 |
demolished
|
26646 |
jus
|
26647 |
zem
|
|
|
26648 |
##eep
|
26649 |
##tus
|
26650 |
+
##mies
|
26651 |
blub
|
26652 |
talbot
|
26653 |
departments
|
|
|
28038 |
governess
|
28039 |
implacable
|
28040 |
vampaneze
|
|
|
28041 |
##rically
|
28042 |
+
##ugged
|
28043 |
desari
|
28044 |
drax
|
28045 |
machi
|
|
|
29205 |
hinged
|
29206 |
nile
|
29207 |
##glasses
|
|
|
29208 |
##yran
|
29209 |
+
##has
|
29210 |
stumped
|
29211 |
carrick
|
29212 |
##utha
|
|
|
29464 |
pleasured
|
29465 |
intermittent
|
29466 |
unscrewed
|
|
|
29467 |
##nick
|
29468 |
+
##pread
|
29469 |
##oreal
|
29470 |
disheart
|
29471 |
emban
|
|
|
29698 |
pf
|
29699 |
siber
|
29700 |
vie
|
|
|
29701 |
##sin
|
29702 |
+
##hinking
|
29703 |
doomb
|
29704 |
atro
|
29705 |
insom
|
|
|
29869 |
imitate
|
29870 |
drinkers
|
29871 |
ethans
|
|
|
29872 |
##ritch
|
29873 |
+
##uke
|
29874 |
spiel
|
29875 |
vehem
|
29876 |
besie
|