#!/usr/bin/env bash
#
# Run preprocess_data.py over a raw JSON corpus using the lower-case BERT
# WordPiece tokenizer, with sentence splitting enabled and the "mmap"
# dataset implementation. Results are written under the output prefix
# "my-bert".
#
# Usage: bash <this-script> [INPUT_JSON]
#   INPUT_JSON  Optional path to the input file. When omitted, the
#               validation split configured below is used.
#
# preprocess_data.py and bert-vocab.txt are referenced by relative path, so
# invoke this script from the directory that contains them.
# NOTE(review): the flags look like those of Megatron-LM's
# tools/preprocess_data.py -- confirm against the copy actually in use.
set -euo pipefail

# Inputs that were used previously; pass one as the first argument to
# select it instead of the default:
#   roberta_train_data_raw/valid.json
#   roberta_train_data_raw/train_1g.json
#   /mnt/nvme1/ouyangxuan/project_pretrain/find_framework/tmp_data/data.json
INPUT="${1:-/mnt/nvme0/ouyangxuan/project_pretrain/make_pretrain_data/roberta_train_data_raw/valid.json}"

# The expansion is quoted so that a path containing spaces or glob
# characters reaches the program as a single argument.
python preprocess_data.py \
  --input "${INPUT}" \
  --output-prefix my-bert \
  --vocab bert-vocab.txt \
  --dataset-impl mmap \
  --worker 1 \
  --chunk-size 1 \
  --tokenizer-type BertWordPieceLowerCase \
  --split-sentences