#!/usr/bin/env bash
# Launches preprocess_data.py on a single JSON input file.
#
# The flags below select the BertWordPieceLowerCase tokenizer with the
# vocabulary file bert-vocab.txt, enable --split-sentences, request the
# "mmap" dataset implementation, and pass "my-bert" as the output prefix.
# NOTE(review): the exact meaning of each flag is defined by
# preprocess_data.py, which is not part of this file — confirm there.
#
# Alternative inputs that were used previously (edit INPUT to switch):
#   roberta_train_data_raw/valid.json
#   roberta_train_data_raw/train_1g.json
#   /mnt/nvme1/ouyangxuan/project_pretrain/find_framework/tmp_data/data.json
INPUT="/mnt/nvme0/ouyangxuan/project_pretrain/make_pretrain_data/roberta_train_data_raw/valid.json"

# INPUT is quoted so a path containing spaces or glob characters is passed
# to the script as exactly one argument.
python preprocess_data.py \
  --input "${INPUT}" \
  --output-prefix my-bert \
  --vocab bert-vocab.txt \
  --dataset-impl mmap \
  --worker 1 \
  --chunk-size 1 \
  --tokenizer-type BertWordPieceLowerCase \
  --split-sentences