File size: 551 Bytes
1101a21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17

# Run preprocess_data.py on the JSON file named by INPUT, with output written
# under the "my-bert" prefix and the options listed below.
#
# To process a different file, change INPUT. The relative-path variant is kept
# here for reference:
#INPUT="roberta_train_data_raw/valid.json"
INPUT="/mnt/nvme0/ouyangxuan/project_pretrain/make_pretrain_data/roberta_train_data_raw/valid.json"

# The expansion is quoted so that a path containing whitespace or glob
# characters reaches the script as one argument instead of being word-split.
# NOTE: no comments may be placed between the continuation lines below; a
# comment would terminate the command early.
python preprocess_data.py \
       --input "${INPUT}" \
       --output-prefix my-bert \
       --vocab bert-vocab.txt \
       --dataset-impl mmap \
       --worker 1 \
       --chunk-size 1 \
       --tokenizer-type BertWordPieceLowerCase \
       --split-sentences


#--input /mnt/nvme1/ouyangxuan/project_pretrain/find_framework/tmp_data/data.json \
       #--input roberta_train_data_raw/train_1g.json \