#!/usr/bin/env bash
# Build a TensorRT-LLM engine from a merged Qwen2-1.5B-Instruct checkpoint.
#
# Prerequisite: merge Qwen2-1.5B-Instruct with the fine-tuned LoRA adapter
# first (see merge_lora.py), then convert the merged model into a TRT-LLM
# checkpoint:
#   python3 convert_checkpoint.py --model_dir ${model_dir} \
#       --output_dir ${checkpoint_dir} \
#       --dtype float16
set -euo pipefail

# TRT-LLM checkpoint produced from the merged (base + LoRA) model.
readonly checkpoint_dir=tllm_checkpoint_1gpu_fp16_qwen2_1.5B_instruct_merged
# Output engine directory.
readonly engine_dir=qwen2_1.5B_instruct_fp16_merged

# NOTE: max_prompt_embedding_table_size should be
# >= max_batch_size * speech_embedding_seq_length.
trtllm-build --checkpoint_dir "${checkpoint_dir}" \
  --output_dir "${engine_dir}" \
  --max_prompt_embedding_table_size 4096 \
  --max_batch_size 16 \
  --gemm_plugin float16