# Source: yuyan-10b/tools/run_build_data.sh
# Uploaded by Shawn001 ("Upload 21 files"), commit 1101a21, 551 bytes.
# Run preprocess_data.py over a JSON input file using the bert-vocab.txt
# vocabulary and the BertWordPieceLowerCase tokenizer, with sentence
# splitting enabled. Output files are named with the prefix "my-bert".
#
# Usage: run_build_data.sh [INPUT_JSON]
#   INPUT_JSON  Optional path to the input JSON file. When omitted, the
#               default validation file below is used.
#
# Other inputs that have been used with this script:
#   roberta_train_data_raw/valid.json
#   roberta_train_data_raw/train_1g.json
#   /mnt/nvme1/ouyangxuan/project_pretrain/find_framework/tmp_data/data.json
INPUT="${1:-/mnt/nvme0/ouyangxuan/project_pretrain/make_pretrain_data/roberta_train_data_raw/valid.json}"

# The expansion is quoted so that a path containing spaces or glob
# characters reaches --input as a single, unmodified argument.
python preprocess_data.py \
  --input "${INPUT}" \
  --output-prefix my-bert \
  --vocab bert-vocab.txt \
  --dataset-impl mmap \
  --worker 1 \
  --chunk-size 1 \
  --tokenizer-type BertWordPieceLowerCase \
  --split-sentences