File size: 551 Bytes
1101a21 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 |
# Run preprocess_data.py on a single raw JSON file using the BERT vocabulary
# and tokenizer options listed below.
#
# preprocess_data.py and bert-vocab.txt are referenced by relative path, so
# this script must be started from the directory that contains them.
#
# Commented-out alternative inputs kept from earlier runs; assign one of
# them to INPUT to switch data sets:
#   roberta_train_data_raw/valid.json
#   roberta_train_data_raw/train_1g.json
#   /mnt/nvme1/ouyangxuan/project_pretrain/find_framework/tmp_data/data.json
INPUT="/mnt/nvme0/ouyangxuan/project_pretrain/make_pretrain_data/roberta_train_data_raw/valid.json"

# The expansion is quoted so that a path containing spaces or glob
# characters still reaches --input as exactly one argument.
python preprocess_data.py \
  --input "${INPUT}" \
  --output-prefix my-bert \
  --vocab bert-vocab.txt \
  --dataset-impl mmap \
  --worker 1 \
  --chunk-size 1 \
  --tokenizer-type BertWordPieceLowerCase \
  --split-sentences
|