# Provenance: uploaded by yuekai via huggingface_hub (commit 7576105, verified).
# Build a TensorRT-LLM engine from a merged Qwen2-1.5B-Instruct checkpoint.
#
# Prerequisite: merge Qwen2-1.5B-Instruct with the fine-tuned LLM LoRA first
# (see merge_lora.py), then convert the merged model for trt-llm:
#   python3 convert_checkpoint.py --model_dir ${model_dir} \
#                                 --output_dir ${checkpoint_dir} \
#                                 --dtype float16
# The checkpoint below is the already-merged-and-converted result.

# Fail fast: abort on any command error, unset variable, or pipeline failure,
# so a failed build cannot be silently ignored.
set -euo pipefail

# Input: converted trt-llm checkpoint directory (output of convert_checkpoint.py).
readonly checkpoint_dir=tllm_checkpoint_1gpu_fp16_qwen2_1.5B_instruct_merged
# Output: directory where the built engine will be written.
readonly engine_dir=qwen2_1.5B_instruct_fp16_merged

# NOTE: max_prompt_embedding_table_size must be
# >= max_batch_size * speech_embedding_seq_length, otherwise prompt-table
# lookups overflow at runtime. 4096 = 16 (batch) * 256 (embedding seq length)
# — presumably; confirm speech_embedding_seq_length against the caller.
trtllm-build --checkpoint_dir "${checkpoint_dir}" \
  --output_dir "${engine_dir}" \
  --max_prompt_embedding_table_size 4096 \
  --max_batch_size 16 \
  --gemm_plugin float16