import os
import sys

# Make the local yuan_moe_hf_model module importable before importing from it.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))

import torch
from transformers import LlamaTokenizer

from yuan_moe_hf_model import YuanForCausalLM
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Path to the local int4-quantized Yuan2-M32 checkpoint.
quantized_model_dir = "/temp_data/LLM_test/MOE/Yuan2-M32-int4-hf"
# Load the tokenizer (Yuan uses <eod> as its end-of-sequence token).
tokenizer = LlamaTokenizer.from_pretrained(quantized_model_dir, add_eos_token=False, add_bos_token=False, eos_token='<eod>')
# Load the quantized model in fp16 and move it to the target device.
model = YuanForCausalLM.from_pretrained(quantized_model_dir, trust_remote_code=True, use_safetensors=True, torch_dtype=torch.float16).to(device)
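# Optional alternative (a sketch, assuming the `accelerate` package is installed):
# let transformers place the weights across available devices automatically
# instead of a manual .to(device).
# model = YuanForCausalLM.from_pretrained(
#     quantized_model_dir,
#     trust_remote_code=True,
#     use_safetensors=True,
#     torch_dtype=torch.float16,
#     device_map="auto",
# )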
# Debug option (normally left disabled): re-initialize all non-quantized parameters.
#for name, param in model.named_parameters():
#    if "quantized" not in name:
#        param.data.normal_(mean=0.0, std=0.02)  # or use another appropriate initialization
# Run inference: tokenize the prompt and generate a completion.
input_text = "北京是中国的"  # prompt: "Beijing is China's ..."
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
output_ids = model.generate(input_ids, max_new_tokens=256)
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(output_text)
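
# Optional: sampling-based decoding instead of the greedy default above
# (a sketch; the sampling parameters are illustrative, not tuned for this model).
# sampled_ids = model.generate(
#     input_ids,
#     max_new_tokens=256,
#     do_sample=True,
#     temperature=0.7,
#     top_p=0.9,
# )
# print(tokenizer.decode(sampled_ids[0], skip_special_tokens=True))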