#!/bin/bash
# Fine-tune the yuyan-10b checkpoint on the CLUE AFQMC task, grid-searching
# learning rate, batch size, and epoch count. Each run writes its results and
# log to a timestamped output directory under outputs/AFQMC/.

WORLD_SIZE=8  # number of GPUs on this single node

DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6003"

TASK="AFQMC"
TRAIN_DATA="clue_data/afqmc/train.json"
VALID_DATA="clue_data/afqmc/dev.json"
TEST_DATA="clue_data/afqmc/test.json"
PRETRAINED_CHECKPOINT="./yuyan-10b"
VOCAB_FILE="bert-vocab.txt"

for lr in 1e-5 2e-5 3e-5; do
  for bs in 32 16; do
    for ep in 3 5 8; do
      ct=$(date +"%m%d%H%M%S")  # timestamp so repeated runs never collide
      OUTPUTS_PATH="outputs/${TASK}/yuyan_bs_${bs}_lr_${lr}_ep_${ep}_${ct}"
      if [ ! -d "${OUTPUTS_PATH}" ]; then
        mkdir -p "${OUTPUTS_PATH}"
      else
        echo "dir exists, not mkdir"
      fi

      # Launch one worker per GPU; the model itself is split 8 ways
      # (--tensor-model-parallel-size 8), so the 8 GPUs serve a single replica.
      python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
        --task $TASK \
        --seed 1234 \
        --pretrained-checkpoint $PRETRAINED_CHECKPOINT \
        --train-data $TRAIN_DATA \
        --valid-data $VALID_DATA \
        --test-data $TEST_DATA \
        --tokenizer-type BertWordPieceLowerCase \
        --vocab-file $VOCAB_FILE \
        --epochs $ep \
        --tensor-model-parallel-size 8 \
        --num-layers 48 \
        --hidden-size 4096 \
        --num-attention-heads 64 \
        --micro-batch-size $bs \
        --lr $lr \
        --lr-decay-style linear \
        --lr-warmup-fraction 0.065 \
        --seq-length 128 \
        --max-position-embeddings 512 \
        --log-interval 10 \
        --eval-interval 800 \
        --eval-iters 50 \
        --weight-decay 1.0e-1 \
        --res-path ${OUTPUTS_PATH} \
        --fp16 | tee ${OUTPUTS_PATH}/job.log
        # optional, if activations do not fit in GPU memory:
        # --activations-checkpoint-method uniform
    done
  done
done
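
# Note: torch.distributed.launch is deprecated on recent PyTorch releases
# (>= 1.10) in favor of torchrun. A minimal sketch of the equivalent launch,
# assuming ./tasks/main.py reads LOCAL_RANK from the environment rather than
# a --local_rank argument (untested against this repo):
#
#   torchrun --nproc_per_node $WORLD_SIZE --nnodes 1 --node_rank 0 \
#            --master_addr localhost --master_port 6003 \
#            ./tasks/main.py ... (same task arguments as above)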