zerogpudevmode / app.py
0num4 · 9ef92c0: "Aiming for about 5 s once this runs on CUDA."
import gradio as gr # type: ignore
import spaces # type: ignore
import torch
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct", trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct", trust_remote_code=True
)
model.to("cuda")  # ZeroGPU: place weights on CUDA at startup; a GPU is attached when @spaces.GPU functions run
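
# Note (assumption, not in the original file): loading the weights in float16
# roughly halves memory traffic on CUDA, which would help the ~5 s latency
# target from the commit message. A possible variant:
#   model = AutoModelForCausalLM.from_pretrained(
#       "microsoft/Phi-3-mini-4k-instruct", trust_remote_code=True,
#       torch_dtype=torch.float16,
#   )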

def greet(name, sliderint):
    return "Hellonyaaaaa " + name + "!!" + str(sliderint)

# Custom Jinja chat template: each turn renders as "<|role|>content\n",
# with a trailing "<|model|>\n" when a generation prompt is requested.
chat_template = (
    "{% for message in messages %}"
    "{{'<|' + message['role'] + '|>' + message['content'] + '\n'}}"
    "{% endfor %}"
    "{% if add_generation_prompt %}"
    "{{ '<|model|>\n' }}"
    "{% endif %}"
)
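# Illustration (not executed): for messages=[{"role": "user", "content": "hi"}]
# and add_generation_prompt=True, the template above renders:
#   <|user|>hi
#   <|model|>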

# @spaces.GPU(duration=45)  # left disabled while testing in ZeroGPU dev mode
def chatinterface_fn(message, history):
    # Rebuild the conversation as role/content dicts for apply_chat_template
    prompt = []
    for human, assistant in history:
        prompt.append({"role": "user", "content": human})
        prompt.append({"role": "model", "content": assistant})
    prompt.append({"role": "user", "content": message})
    token_ids = tokenizer.apply_chat_template(
        prompt,
        tokenize=True,
        add_generation_prompt=True,
        chat_template=chat_template,
        return_tensors="pt",
    )
print("token_ids:", token_ids) # デバッグ用に追加
    output_ids = model.generate(
        token_ids.to(model.device),
        temperature=0.1,
        do_sample=True,
        top_p=0.95,
        top_k=40,
        max_new_tokens=256,
    )
    # Decode only the newly generated tokens so the reply does not echo the prompt
    text = tokenizer.decode(output_ids[0][token_ids.shape[-1]:], skip_special_tokens=True)
    print(text)
    return text
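
# Sketch (assumption, not wired into the UI below): a streaming variant of
# chatinterface_fn using transformers' TextIteratorStreamer, so partial text is
# yielded to gr.ChatInterface as it is generated. It reuses the same
# tokenizer/model/template as above; the function name is hypothetical.
def chatinterface_fn_stream(message, history):
    from threading import Thread
    from transformers import TextIteratorStreamer

    prompt = []
    for human, assistant in history:
        prompt.append({"role": "user", "content": human})
        prompt.append({"role": "model", "content": assistant})
    prompt.append({"role": "user", "content": message})
    token_ids = tokenizer.apply_chat_template(
        prompt,
        tokenize=True,
        add_generation_prompt=True,
        chat_template=chat_template,
        return_tensors="pt",
    )
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    # generate() blocks, so run it on a worker thread and consume the streamer here
    Thread(
        target=model.generate,
        kwargs=dict(inputs=token_ids.to(model.device), streamer=streamer, max_new_tokens=256),
    ).start()
    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial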

@spaces.GPU(duration=45)
def infer(message: str) -> str:
    # Smoke test: the prompt is hardcoded, so `message` is intentionally unused
    input_ids = tokenizer.encode(
        "hello, this is", add_special_tokens=False, return_tensors="pt"
    ).to(model.device)
    print(model.device)  # should report cuda inside the @spaces.GPU allocation
    outputs = model.generate(input_ids)
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return text
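
# Sketch (assumption): a rough latency probe for the ~5 s CUDA target noted in
# the commit message. `timed_infer` is a hypothetical helper, not part of the app.
def timed_infer(message: str) -> str:
    import time
    start = time.perf_counter()
    text = infer(message)
    print(f"infer took {time.perf_counter() - start:.2f} s")  # includes any ZeroGPU attach overhead
    return text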

with gr.Blocks() as demo:
    name = gr.Textbox(label="name")
    output = gr.Interface(fn=greet, inputs=["text", "slider"], outputs="text")
    a = gr.ChatInterface(chatinterface_fn, title="microsoft/Phi-3-mini-4k-instruct")
    b = gr.Interface(fn=infer, inputs="text", outputs="text")

demo.launch()