Shawn001 committed on
Commit 7a8763c
1 Parent(s): 1101a21

Upload 8 files

README.md CHANGED
@@ -1,3 +1,45 @@
 ---
 license: apache-2.0
+
+language: zh
+inference: false
+tags:
+- bert
+- pytorch
 ---
+
+# YuYan-10b
+
+YuYan is a series of natural language processing models developed by the Fuxi AI Lab at NetEase, Inc., covering text generation, natural language understanding, and more. YuYan-10b is a natural language understanding model trained on a high-quality Chinese corpus.
+
+Like BERT, YuYan-10b is pre-trained on a large-scale corpus with unsupervised learning. It differs in that, in addition to the MLM objective, training incorporates auxiliary tasks such as sentence-order prediction and word deletion, which strengthen the model's semantic representations and improve its understanding of Chinese.
+
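To make the auxiliary objectives concrete, below is a minimal sketch of how sentence-order and word-deletion corruptions can be generated alongside MLM. This is illustrative only, not the authors' implementation (which lives in the Megatron-based training code rather than in this commit); the whitespace tokenization and the `deletion_rate` parameter are assumptions.

```python
import random

def make_corrupted_example(sentences, deletion_rate=0.1):
    """Illustrative corruption for the auxiliary objectives described above."""
    # Sentence-order task: permute the sentences; the permutation is the
    # label the model must recover.
    order = list(range(len(sentences)))
    random.shuffle(order)
    shuffled = [sentences[i] for i in order]

    # Word-deletion task: drop a random fraction of tokens; the model is
    # trained to detect (or restore) the deleted positions.
    corrupted = []
    for sent in shuffled:
        tokens = sent.split()  # assumption: whitespace tokenization for brevity
        kept = [t for t in tokens if random.random() >= deletion_rate]
        corrupted.append(" ".join(kept))
    return corrupted, order
```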
+# CLUE result
+
+|                | Score  | AFQMC | TNEWS1.1 | IFLYTEK | OCNLI_50k | WSC1.1 | CSL  |
+| -------------- | ------ | ----- | -------- | ------- | --------- | ------ | ---- |
+| YuYan-10b      |        |       |          |         |           |        |      |
+| HUMAN          | 84.1   | 81    | 71       | 80.3    | 90.3      | 98     | 84   |
+| HunYuan-NLP 1T | 83.632 | 85.11 | 70.44    | 67.54   | 86.5      | 96     | 96.2 |
+
+## How to use
+
+Our model is trained with [Megatron](https://github.com/NVIDIA/Megatron-LM); as a result, both inference and finetuning depend on it.
+
+Below is the installation tutorial. We have packaged all the dependencies the model requires; use the following command to set up the model's runtime environment.
+
+```
+pip install -r requirements.txt
+```
+
+## Finetuning script
+
+We provide scripts for finetuning on the CLUE benchmark, a Chinese language understanding evaluation leaderboard covering tasks such as text classification, semantic matching, and reading comprehension. For any given CLUE task, start finetuning with the corresponding script, e.g.:
+```
+# finetune on the AFQMC task
+sh finetune_afqmc_distributed.sh
+
+# finetune on the CSL task
+sh finetune_csl_distributed.sh
+```
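The commit actually ships five such scripts (AFQMC, CSL, IFLYTEK, OCNLI, WSC). A hypothetical driver for running every sweep back to back, assuming the scripts sit in the repository root, might look like this:

```python
import subprocess

# Run each provided CLUE finetuning sweep in sequence; each script performs
# its own hyperparameter grid search and writes results under outputs/<TASK>/.
for task in ["afqmc", "csl", "iflytek", "ocnli", "wsc"]:
    subprocess.run(["sh", f"finetune_{task}_distributed.sh"], check=True)
```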
bert-vocab.txt ADDED
The diff for this file is too large to render.
finetune_afqmc_distributed.sh ADDED
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+WORLD_SIZE=8
+
+DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr localhost \
+                  --master_port 6003"
+
+TASK="AFQMC"
+TRAIN_DATA="clue_data/afqmc/train.json"
+VALID_DATA="clue_data/afqmc/dev.json"
+TEST_DATA="clue_data/afqmc/test.json"
+PRETRAINED_CHECKPOINT="./yuyan-10b"
+
+VOCAB_FILE=bert-vocab.txt
+
+# Grid search over learning rate, micro-batch size, and number of epochs.
+for lr in 1e-5 2e-5 3e-5; do
+for bs in 32 16; do
+for ep in 3 5 8; do
+    ct=$(date +"%m%d%H%M%S")  # timestamp keeps output dirs of repeated runs distinct
+    OUTPUTS_PATH="outputs/${TASK}/yuyan_bs_${bs}_lr_${lr}_ep_${ep}_${ct}"
+    if [ ! -d "${OUTPUTS_PATH}" ]; then
+        mkdir -p "${OUTPUTS_PATH}"
+    else
+        echo "directory exists, skipping mkdir"
+    fi
+    python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
+        --task $TASK \
+        --seed 1234 \
+        --pretrained-checkpoint $PRETRAINED_CHECKPOINT \
+        --train-data $TRAIN_DATA \
+        --valid-data $VALID_DATA \
+        --test-data $TEST_DATA \
+        --tokenizer-type BertWordPieceLowerCase \
+        --vocab-file $VOCAB_FILE \
+        --epochs $ep \
+        --tensor-model-parallel-size 8 \
+        --num-layers 48 \
+        --hidden-size 4096 \
+        --num-attention-heads 64 \
+        --micro-batch-size $bs \
+        --lr $lr \
+        --lr-decay-style linear \
+        --lr-warmup-fraction 0.065 \
+        --seq-length 128 \
+        --max-position-embeddings 512 \
+        --log-interval 10 \
+        --eval-interval 800 \
+        --eval-iters 50 \
+        --weight-decay 1.0e-1 \
+        --res-path "${OUTPUTS_PATH}" \
+        --fp16 | tee "${OUTPUTS_PATH}/job.log"
+        # --activations-checkpoint-method uniform \
+done
+done
+done
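As a sanity check on the "10b" in the model name, the flags above (`--num-layers 48`, `--hidden-size 4096`) support a quick back-of-the-envelope parameter estimate. The vocabulary size below is an assumption (a typical Chinese BERT vocabulary; see bert-vocab.txt), and biases, layernorms, and position embeddings are ignored:

```python
# Rough parameter count for --num-layers 48, --hidden-size 4096.
num_layers, hidden = 48, 4096
vocab_size = 21128  # assumption: typical Chinese BERT vocab size

per_layer = 12 * hidden**2  # ~4h^2 attention + ~8h^2 MLP weight matrices
total = num_layers * per_layer + vocab_size * hidden
print(f"~{total / 1e9:.2f}B parameters")  # ~9.75B, i.e. roughly 10B
```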
finetune_csl_distributed.sh ADDED
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+WORLD_SIZE=8
+
+DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr localhost \
+                  --master_port 6000"
+
+TASK="CSL"
+TRAIN_DATA="clue_data/csl/train.json"
+VALID_DATA="clue_data/csl/dev.json"
+TEST_DATA="clue_data/csl/test.json"
+PRETRAINED_CHECKPOINT="./yuyan-10b"
+
+VOCAB_FILE=bert-vocab.txt
+
+# Grid search over learning rate, micro-batch size, and number of epochs.
+for lr in 4e-6 7e-6; do
+for bs in 4 2; do
+for ep in 7 10; do
+    ct=$(date +"%m%d%H%M%S")  # timestamp keeps output dirs of repeated runs distinct
+    OUTPUTS_PATH="outputs/${TASK}/yuyan_bs_${bs}_lr_${lr}_ep_${ep}_${ct}"
+    if [ ! -d "${OUTPUTS_PATH}" ]; then
+        mkdir -p "${OUTPUTS_PATH}"
+    else
+        echo "directory exists, skipping mkdir"
+    fi
+    python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
+        --task $TASK \
+        --seed 1234 \
+        --pretrained-checkpoint $PRETRAINED_CHECKPOINT \
+        --train-data $TRAIN_DATA \
+        --valid-data $VALID_DATA \
+        --test-data $TEST_DATA \
+        --tokenizer-type BertWordPieceLowerCase \
+        --vocab-file $VOCAB_FILE \
+        --epochs $ep \
+        --tensor-model-parallel-size 8 \
+        --num-layers 48 \
+        --hidden-size 4096 \
+        --num-attention-heads 64 \
+        --micro-batch-size $bs \
+        --lr $lr \
+        --lr-decay-style linear \
+        --lr-warmup-fraction 0.1 \
+        --seq-length 512 \
+        --max-position-embeddings 512 \
+        --log-interval 10 \
+        --eval-interval 3000 \
+        --eval-iters 50 \
+        --weight-decay 1.0e-1 \
+        --res-path "${OUTPUTS_PATH}" \
+        --fp16 | tee "${OUTPUTS_PATH}/job.log"
+        # --activations-checkpoint-method uniform \
+done
+done
+done
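Since every run writes to a timestamped directory under `outputs/<TASK>/`, comparing sweep results means walking those directories. A minimal sketch for listing them (parsing `job.log` is left out because Megatron's log format is not reproduced in this commit):

```python
import glob

task = "CSL"  # or AFQMC, IFLYTEK, OCNLI, WSC
# Each directory matches the OUTPUTS_PATH pattern from the script above and
# contains the job.log captured via tee.
for run_dir in sorted(glob.glob(f"outputs/{task}/yuyan_bs_*_lr_*_ep_*_*")):
    print(run_dir)
```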
finetune_iflytek_distributed.sh ADDED
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+WORLD_SIZE=8
+
+DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr localhost \
+                  --master_port 6000"
+
+TASK="IFLYTEK"
+TRAIN_DATA="clue_data/iflytek/train.json"
+VALID_DATA="clue_data/iflytek/dev.json"
+TEST_DATA="clue_data/iflytek/test.json"
+PRETRAINED_CHECKPOINT="./yuyan-10b"
+
+VOCAB_FILE=bert-vocab.txt
+
+# Grid search over learning rate, micro-batch size, and number of epochs.
+for lr in 7e-6 1e-5 2e-5; do
+for bs in 24 16 8; do
+for ep in 2 3 5 7 15; do
+    ct=$(date +"%m%d%H%M%S")  # timestamp keeps output dirs of repeated runs distinct
+    OUTPUTS_PATH="outputs/${TASK}/yuyan_bs_${bs}_lr_${lr}_ep_${ep}_${ct}"
+    if [ ! -d "${OUTPUTS_PATH}" ]; then
+        mkdir -p "${OUTPUTS_PATH}"
+    else
+        echo "directory exists, skipping mkdir"
+    fi
+    python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
+        --task $TASK \
+        --seed 1242 \
+        --pretrained-checkpoint $PRETRAINED_CHECKPOINT \
+        --train-data $TRAIN_DATA \
+        --valid-data $VALID_DATA \
+        --test-data $TEST_DATA \
+        --tokenizer-type BertWordPieceLowerCase \
+        --vocab-file $VOCAB_FILE \
+        --epochs $ep \
+        --tensor-model-parallel-size 8 \
+        --num-layers 48 \
+        --hidden-size 4096 \
+        --num-attention-heads 64 \
+        --micro-batch-size $bs \
+        --lr $lr \
+        --lr-decay-style linear \
+        --lr-warmup-fraction 0.1 \
+        --seq-length 512 \
+        --max-position-embeddings 512 \
+        --log-interval 10 \
+        --eval-interval 600 \
+        --eval-iters 20 \
+        --weight-decay 1.0e-1 \
+        --res-path "${OUTPUTS_PATH}" \
+        --fp16 | tee "${OUTPUTS_PATH}/job.log"
+        # --activations-checkpoint-method uniform \
+done
+done
+done
finetune_ocnli_distributed.sh ADDED
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+WORLD_SIZE=8
+
+DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr localhost \
+                  --master_port 6000"
+
+TASK="OCNLI"
+TRAIN_DATA="clue_data/ocnli/train.json"
+VALID_DATA="clue_data/ocnli/dev.json"
+TEST_DATA="clue_data/ocnli/test.json"
+PRETRAINED_CHECKPOINT="./yuyan-10b"
+
+VOCAB_FILE=bert-vocab.txt
+
+# Grid search over learning rate, micro-batch size, and number of epochs.
+for lr in 2e-5 1e-5 7e-6; do
+for bs in 32 16; do
+for ep in 3 5 10 100; do
+    ct=$(date +"%m%d%H%M%S")  # timestamp keeps output dirs of repeated runs distinct
+    OUTPUTS_PATH="outputs/${TASK}/yuyan_bs_${bs}_lr_${lr}_ep_${ep}_${ct}"
+    if [ ! -d "${OUTPUTS_PATH}" ]; then
+        mkdir -p "${OUTPUTS_PATH}"
+    else
+        echo "directory exists, skipping mkdir"
+    fi
+    python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
+        --task $TASK \
+        --seed 1236 \
+        --pretrained-checkpoint $PRETRAINED_CHECKPOINT \
+        --train-data $TRAIN_DATA \
+        --valid-data $VALID_DATA \
+        --test-data $TEST_DATA \
+        --tokenizer-type BertWordPieceLowerCase \
+        --vocab-file $VOCAB_FILE \
+        --epochs $ep \
+        --tensor-model-parallel-size 8 \
+        --num-layers 48 \
+        --hidden-size 4096 \
+        --num-attention-heads 64 \
+        --micro-batch-size $bs \
+        --lr $lr \
+        --lr-decay-style linear \
+        --lr-warmup-fraction 0.1 \
+        --seq-length 128 \
+        --max-position-embeddings 512 \
+        --log-interval 10 \
+        --eval-interval 800 \
+        --eval-iters 50 \
+        --weight-decay 1.0e-1 \
+        --res-path "${OUTPUTS_PATH}" \
+        --fp16 | tee "${OUTPUTS_PATH}/job.log"
+        # --activations-checkpoint-method uniform \
+done
+done
+done
finetune_wsc_distributed.sh ADDED
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+WORLD_SIZE=8
+
+DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr localhost \
+                  --master_port 6000"
+
+TASK="WSC"
+TRAIN_DATA="clue_data/wsc/train.json"
+VALID_DATA="clue_data/wsc/dev.json"
+TEST_DATA="clue_data/wsc/test.json"
+PRETRAINED_CHECKPOINT="./yuyan-10b"
+
+VOCAB_FILE=bert-vocab.txt
+
+# Grid search over learning rate, micro-batch size, and number of epochs.
+for lr in 3e-6 5e-6 1e-5; do
+for bs in 8 16 32; do
+for ep in 10 20 30; do
+    ct=$(date +"%m%d%H%M%S")  # timestamp keeps output dirs of repeated runs distinct
+    OUTPUTS_PATH="outputs/${TASK}/yuyan_bs_${bs}_lr_${lr}_ep_${ep}_${ct}"
+    if [ ! -d "${OUTPUTS_PATH}" ]; then
+        mkdir -p "${OUTPUTS_PATH}"
+    else
+        echo "directory exists, skipping mkdir"
+    fi
+    python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
+        --task $TASK \
+        --seed 1238 \
+        --pretrained-checkpoint $PRETRAINED_CHECKPOINT \
+        --train-data $TRAIN_DATA \
+        --valid-data $VALID_DATA \
+        --test-data $TEST_DATA \
+        --tokenizer-type BertWordPieceLowerCase \
+        --vocab-file $VOCAB_FILE \
+        --epochs $ep \
+        --tensor-model-parallel-size 8 \
+        --num-layers 48 \
+        --hidden-size 4096 \
+        --num-attention-heads 64 \
+        --micro-batch-size $bs \
+        --lr $lr \
+        --lr-decay-style linear \
+        --lr-warmup-fraction 0.1 \
+        --seq-length 128 \
+        --max-position-embeddings 512 \
+        --log-interval 5 \
+        --eval-interval 50 \
+        --eval-iters 25 \
+        --weight-decay 1.0e-1 \
+        --res-path "${OUTPUTS_PATH}" \
+        --fp16 | tee "${OUTPUTS_PATH}/job.log"
+        # --activations-checkpoint-method uniform \
+done
+done
+done
requirements.txt ADDED
@@ -0,0 +1,25 @@
+apex==0.1
+autopep8==2.0.2
+einops==0.6.1
+faiss==1.5.3
+file_utils==0.0.1
+Flask==1.1.2
+flask_restful==0.3.10
+ftfy==6.1.1
+jieba_fast==0.53
+langdetect==1.0.9
+lsh==0.1.2
+mmcv==2.0.1
+nltk==3.5
+numpy==1.19.2
+Pillow==10.0.0
+regex==2020.11.13
+Requests==2.31.0
+six==1.15.0
+spacy==2.3.2
+timm==0.9.2
+tldextract==3.4.4
+torch==1.8.0a0+1606899
+torchvision==0.9.0a0
+tqdm==4.53.0
+transformers==4.21.1