Files changed (1)
  1. app.py +7 -17
app.py CHANGED
@@ -4,21 +4,19 @@ from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
     TextIteratorStreamer,
-    BitsAndBytesConfig,
 )
 import os
 from threading import Thread
 import spaces
 import time
+import subprocess
+subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
 token = os.environ["HF_TOKEN"]
 
-quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16
-)
 
 model = AutoModelForCausalLM.from_pretrained(
-    "microsoft/Phi-3-mini-4k-instruct", quantization_config=quantization_config, token=token,trust_remote_code=True
+    "microsoft/Phi-3-mini-4k-instruct", token=token,trust_remote_code=True
 )
 tok = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct", token=token)
 terminators = [
@@ -32,13 +30,12 @@ else:
     device = torch.device("cpu")
     print("Using CPU")
 
-# model = model.to(device)
+model = model.to(device)
 # Dispatch Errors
 
 
 @spaces.GPU(duration=60)
 def chat(message, history, temperature,do_sample, max_tokens):
-    start_time = time.time()
     chat = []
     for item in history:
         chat.append({"role": "user", "content": item[0]})
@@ -66,19 +63,12 @@ def chat(message, history, temperature,do_sample, max_tokens):
     t.start()
 
     partial_text = ""
-    first_token_time = None
     for new_text in streamer:
-        if not first_token_time:
-            first_token_time = time.time() - start_time
         partial_text += new_text
         yield partial_text
 
-    total_time = time.time() - start_time
-    tokens = len(tok.tokenize(partial_text))
-    tokens_per_second = tokens / total_time if total_time > 0 else 0
 
-    timing_info = f"\n\nTime taken to first token: {first_token_time:.2f} seconds\nTokens per second: {tokens_per_second:.2f}"
-    yield partial_text + timing_info
+    yield partial_text
 
 
 demo = gr.ChatInterface(
@@ -104,6 +94,6 @@ demo = gr.ChatInterface(
     ],
     stop_btn="Stop Generation",
     title="Chat With LLMs",
-    description="Now Running [microsoft/Phi-3-mini-4k-instruct](https://huggingface.com/microsoft/Phi-3-mini-4k-instruct) in 4bit"
+    description="Now Running [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct)"
 )
-demo.launch()
+demo.launch()
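
The updated app installs flash-attn at startup but leaves the attention backend choice to the model's remote code, since from_pretrained is not passed an attn_implementation argument. A purely illustrative sketch of opting in explicitly, not part of this commit, assuming transformers >= 4.36, a CUDA GPU, and fp16/bf16 weights:

import os
import torch
from transformers import AutoModelForCausalLM

# Hypothetical variant of the load in this commit: request flash attention explicitly.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    token=os.environ["HF_TOKEN"],
    torch_dtype=torch.bfloat16,               # flash-attn supports only fp16/bf16
    attn_implementation="flash_attention_2",  # assumption: opt in rather than rely on the remote code's default
    trust_remote_code=True,
)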