from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from transformers.utils import is_flash_attn_2_available
import torch
import gradio as gr
import matplotlib.pyplot as plt
import time
import os

BATCH_SIZE = 16
TOKEN = os.environ.get("HF_TOKEN", None)

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
use_flash_attention_2 = is_flash_attn_2_available()
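
# Load the teacher (Whisper large-v2) and the distilled student checkpoints in
# half precision on GPU, with Flash Attention 2 when the kernel is available.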
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    "openai/whisper-large-v2",
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
    use_flash_attention_2=use_flash_attention_2,
)
distilled_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    "sanchit-gandhi/distil-large-v2-private",
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
    use_flash_attention_2=use_flash_attention_2,
    token=TOKEN,
)
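
# Flash Attention 2 requires the `flash-attn` package and an Ampere-or-newer
# GPU; when it is unavailable, BetterTransformer's fused SDPA attention kernels
# are the next-fastest option.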
if not use_flash_attention_2:
    model = model.to_bettertransformer()
    distilled_model = distilled_model.to_bettertransformer()

# Distil-Whisper shares Whisper's tokenizer and feature extractor, so a single
# processor loaded from the large-v2 checkpoint serves both pipelines.
processor = AutoProcessor.from_pretrained("openai/whisper-large-v2")

model.to(device)
distilled_model.to(device)
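
# Long-form audio is handled by the pipeline's chunking algorithm: the input is
# sliced into overlapping windows of `chunk_length_s` seconds, the windows are
# batched through the model, and the decoded text is stitched back together.
# Distil-Whisper is trained on 15 s windows, the original Whisper on 30 s.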
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    torch_dtype=torch_dtype,
    device=device,
    generate_kwargs={"language": "en", "task": "transcribe"},
)
# Keep a handle on the original `_forward` so it can be wrapped with a timer.
pipe_forward = pipe._forward

distil_pipe = pipeline(
    "automatic-speech-recognition",
    model=distilled_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=15,
    torch_dtype=torch_dtype,
    device=device,
    generate_kwargs={"language": "en", "task": "transcribe"},
)
distil_pipe_forward = distil_pipe._forward
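

# `transcribe` is a generator: it first yields the Distil-Whisper result so the
# UI updates immediately, then yields again once Whisper finishes. Each
# pipeline's private `_forward` is monkey-patched with a timing wrapper so that
# only model inference (not audio pre/post-processing) is measured.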
def transcribe(inputs):
    if inputs is None:
        raise gr.Error("No audio file submitted! Please record or upload an audio file before submitting your request.")

    # Time only the forward/generate step by wrapping the original `_forward`.
    def _forward_distil_time(*args, **kwargs):
        global distil_runtime
        start_time = time.time()
        result = distil_pipe_forward(*args, **kwargs)
        distil_runtime = time.time() - start_time
        return result

    distil_pipe._forward = _forward_distil_time
    distil_text = distil_pipe(inputs, batch_size=BATCH_SIZE)["text"]
    # Yield the Distil-Whisper result first, while Whisper is still to run.
    yield distil_text, distil_runtime, None, None, None

    def _forward_time(*args, **kwargs):
        global runtime
        start_time = time.time()
        result = pipe_forward(*args, **kwargs)
        runtime = time.time() - start_time
        return result

    pipe._forward = _forward_time
    text = pipe(inputs, batch_size=BATCH_SIZE)["text"]

    # How many times faster the distilled model ran on this input.
    relative_latency = runtime / distil_runtime

    # Bar chart comparing the two transcription times.
    fig, ax = plt.subplots(figsize=(5, 5))

    bar_width = 0.1
    positions = [0, 0.1]

    ax.bar(positions[0], distil_runtime, bar_width, edgecolor="black")
    ax.bar(positions[1], runtime, bar_width, edgecolor="black")

    ax.set_ylabel("Transcription time (s)")
    ax.set_xticks(positions)
    ax.set_xticklabels(["Distil-Whisper", "Whisper"])
    ax.set_title(f"Distil-Whisper: {relative_latency:.1f}x faster")

    ax.grid(which="major", axis="y", linestyle="--", linewidth=0.5)

    plt.tight_layout()

    # Yield the figure object (rather than the pyplot module) so the plot does
    # not depend on global pyplot state.
    yield distil_text, distil_runtime, text, runtime, fig
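

# Build the Gradio Blocks UI: an audio input and a transcribe button on top,
# then the latency bar chart, the two runtimes, and the two transcriptions.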
if __name__ == "__main__":
    with gr.Blocks() as demo:
        gr.HTML(
            """
            <div style="text-align: center; max-width: 700px; margin: 0 auto;">
              <div
                style="
                  display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;
                "
              >
                <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
                  Distil-Whisper vs Whisper
                </h1>
              </div>
            </div>
            """
        )
        gr.HTML(
            """
            This demo compares the <a href="https://huggingface.co/distil-whisper/distil-large-v2"> Distil-Whisper </a> model
            against the <a href="https://huggingface.co/openai/whisper-large-v2"> Whisper </a> model.
            """
        )
        audio = gr.components.Audio(source="upload", type="filepath", label="Audio file")
        button = gr.Button("Transcribe")
        plot = gr.components.Plot()
        with gr.Row():
            # Named `*_box` to avoid shadowing the module-level `distil_runtime`
            # and `runtime` globals that the timing wrappers write to.
            distil_runtime_box = gr.components.Textbox(label="Distil-Whisper Transcription Time (s)")
            runtime_box = gr.components.Textbox(label="Whisper Transcription Time (s)")
        with gr.Row():
            distil_transcription = gr.components.Textbox(label="Distil-Whisper Transcription").style(show_copy_button=True)
            transcription = gr.components.Textbox(label="Whisper Transcription").style(show_copy_button=True)

        button.click(
            fn=transcribe,
            inputs=audio,
            outputs=[distil_transcription, distil_runtime_box, transcription, runtime_box, plot],
        )

    demo.queue().launch()
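
# To launch locally (the file name `app.py` and the token value below are
# placeholders; requires torch, transformers, gradio 3.x, matplotlib and,
# optionally, flash-attn):
#   HF_TOKEN=hf_xxx python app.py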