Files changed (1)
  1. app.py +7 -17
app.py CHANGED
@@ -4,21 +4,19 @@ from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
     TextIteratorStreamer,
-    BitsAndBytesConfig,
 )
 import os
 from threading import Thread
 import spaces
 import time
+import subprocess
+subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
 token = os.environ["HF_TOKEN"]
 
-quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16
-)
 
 model = AutoModelForCausalLM.from_pretrained(
-    "microsoft/Phi-3-mini-4k-instruct", quantization_config=quantization_config, token=token,trust_remote_code=True
+    "microsoft/Phi-3-mini-4k-instruct", token=token,trust_remote_code=True
 )
 tok = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct", token=token)
 terminators = [
@@ -32,13 +30,12 @@ else:
     device = torch.device("cpu")
     print("Using CPU")
 
-# model = model.to(device)
+model = model.to(device)
 # Dispatch Errors
 
 
 @spaces.GPU(duration=60)
 def chat(message, history, temperature,do_sample, max_tokens):
-    start_time = time.time()
     chat = []
     for item in history:
         chat.append({"role": "user", "content": item[0]})
@@ -66,19 +63,12 @@ def chat(message, history, temperature,do_sample, max_tokens):
     t.start()
 
     partial_text = ""
-    first_token_time = None
     for new_text in streamer:
-        if not first_token_time:
-            first_token_time = time.time() - start_time
         partial_text += new_text
         yield partial_text
 
-    total_time = time.time() - start_time
-    tokens = len(tok.tokenize(partial_text))
-    tokens_per_second = tokens / total_time if total_time > 0 else 0
 
-    timing_info = f"\n\nTime taken to first token: {first_token_time:.2f} seconds\nTokens per second: {tokens_per_second:.2f}"
-    yield partial_text + timing_info
+    yield partial_text
 
 
 demo = gr.ChatInterface(
@@ -104,6 +94,6 @@ demo = gr.ChatInterface(
     ],
     stop_btn="Stop Generation",
     title="Chat With LLMs",
-    description="Now Running [microsoft/Phi-3-mini-4k-instruct](https://huggingface.com/microsoft/Phi-3-mini-4k-instruct) in 4bit"
+    description="Now Running [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct)"
 )
-demo.launch()
+demo.launch()
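
The updated app installs flash-attn at startup but leaves the attention backend choice to the model's remote code, since from_pretrained is not passed an attn_implementation argument. A purely illustrative sketch of opting in explicitly, not part of this commit, assuming transformers >= 4.36, a CUDA GPU, and fp16/bf16 weights:

import os
import torch
from transformers import AutoModelForCausalLM

# Hypothetical variant of the load in this commit: request flash attention explicitly.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    token=os.environ["HF_TOKEN"],
    torch_dtype=torch.bfloat16,               # flash-attn supports only fp16/bf16
    attn_implementation="flash_attention_2",  # assumption: opt in rather than rely on the remote code's default
    trust_remote_code=True,
)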