if001 committed on
Commit e254bed
Parent: da15cde
Files changed (3)
  1. README.md +24 -0
  2. special_tokens_map.json +7 -1
  3. tokenizer_config.json +6 -0
README.md CHANGED
@@ -7,3 +7,27 @@ tags:
  - tokenizer
  - sentencepiece
  ---
+
+ A sentencepiece unigram model trained on Japanese
+ https://github.com/huggingface/tokenizers
+
+ ## sample
+
+ ```
+ from transformers import AutoTokenizer
+ tokenizer = AutoTokenizer.from_pretrained("if001/sentencepiece_ja", trust_remote_code=True)
+ print(tokenizer("hello world"))
+
+ >> {'input_ids': [158, 8418, 1427, 15930, 866, 13782, 44, 15034, 1719, 16655, 8, 115, 5, 280, 17635, 94, 818, 2748, 1168, 1114], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
+
+ print(tokenizer.tokenize('それは九月初旬のある蒸し暑い晩のことであった。私は、D坂の大通りの中程にある'))
+ >> ['それは', '九月', '初', '旬', 'のある', '蒸', 'し', '暑い', '晩', 'のことであった', '。', '私は', '、', 'D', '坂の', '大', '通り', 'の中', '程', 'にある']
+
+ ```
+
+ ## Datasets
+ https://huggingface.co/datasets/izumi-lab/wikinews-ja-20230728
+ https://huggingface.co/datasets/izumi-lab/wikinews-en-20230728
+ https://huggingface.co/datasets/if001/aozorabunko-clean-sin
+
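The dataset links added above can be pulled with the Hugging Face `datasets` library; a minimal sketch, assuming each repository loads with its default configuration:

```python
# Minimal sketch: load the three training corpora listed in the README.
# Assumes each dataset repository loads with its default configuration.
from datasets import load_dataset

wikinews_ja = load_dataset("izumi-lab/wikinews-ja-20230728")
wikinews_en = load_dataset("izumi-lab/wikinews-en-20230728")
aozora = load_dataset("if001/aozorabunko-clean-sin")

print(wikinews_ja)  # inspect available splits and columns
```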
special_tokens_map.json CHANGED
@@ -1 +1,7 @@
- {"eos_token": "<EOS>", "unk_token": "<UNK>", "pad_token": "<PAD>", "bos_token": "<BOS>", "additional_special_tokens": []}
+ {
+ "bos_token": "<BOS>",
+ "eos_token": "<EOS>",
+ "mask_token": "<MASK>",
+ "pad_token": "<PAD>",
+ "unk_token": "<UNK>"
+ }
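This file now registers `<MASK>` alongside the existing special tokens. A minimal sketch of how these entries surface on a loaded tokenizer (same `from_pretrained` call as the README sample; attribute names follow the standard `transformers` special-token API):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("if001/sentencepiece_ja", trust_remote_code=True)

# Each key in special_tokens_map.json is exposed as a tokenizer attribute.
print(tokenizer.bos_token, tokenizer.eos_token)  # <BOS> <EOS>
print(tokenizer.mask_token)                      # <MASK>, newly added in this commit
print(tokenizer.pad_token, tokenizer.unk_token)  # <PAD> <UNK>
```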
tokenizer_config.json CHANGED
@@ -1,4 +1,10 @@
  {
+ "bos_token": "<BOS>",
+ "eos_token": "<EOS>",
+ "mask_token": "<MASK>",
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "<PAD>",
+ "unk_token": "<UNK>",
  "clean_up_tokenization_spaces": true,
  "tokenizer_class": "SentencePieceJA",
  "auto_map": {