Spaces:

sanchit-gandhi
/

whisper-jax

Running

sanchit-gandhi HF staff committed on Apr 7, 2023

Commit

d3e0df2

•

1 Parent(s): f9dc7b0

use base64 encoding for faster file transfer

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,6 +1,9 @@
 import gradio as gr
 import requests
 from transformers.models.whisper.tokenization_whisper import TO_LANGUAGE_CODE
 title = "Whisper JAX: The Fastest Whisper API ⚡️"
@@ -10,7 +13,7 @@ description = "Whisper JAX is an optimised implementation of the [Whisper model]
 API_URL = "https://whisper-jax.ngrok.io/generate/"
-article = "Whisper large-v2 model by OpenAI. Backend running JAX on a TPU v4-8 through the generous support of the [TRC](https://sites.research.google/trc/about/) programme."
 language_names = sorted(TO_LANGUAGE_CODE.keys())
 SAMPLING_RATE = 16000
@@ -56,7 +59,11 @@ def transcribe_audio(microphone, file_upload, task, return_timestamps):
     inputs = microphone if microphone is not None else file_upload
-    inputs = {"array": inputs[1].tolist(), "sampling_rate": inputs[0]}
     text, timestamps = inference(inputs=inputs, task=task, return_timestamps=return_timestamps)
@@ -83,8 +90,8 @@ def transcribe_youtube(yt_url, task, return_timestamps):
 audio = gr.Interface(
     fn=transcribe_audio,
     inputs=[
-        gr.inputs.Audio(source="microphone", optional=True),
-        gr.inputs.Audio(source="upload", optional=True),
         gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
         gr.inputs.Checkbox(default=False, label="Return timestamps"),
     ],

+import base64
 import gradio as gr
 import requests
 from transformers.models.whisper.tokenization_whisper import TO_LANGUAGE_CODE
+from transformers.pipelines.audio_utils import ffmpeg_read
 title = "Whisper JAX: The Fastest Whisper API ⚡️"
 API_URL = "https://whisper-jax.ngrok.io/generate/"
+article = "Whisper large-v2 model by OpenAI. Backend running JAX on a TPU v4-8 through the generous support of the [TRC](https://sites.research.google/trc/about/) programme. Whisper JAX code and Gradio demo by 🤗 Hugging Face."
 language_names = sorted(TO_LANGUAGE_CODE.keys())
 SAMPLING_RATE = 16000
     inputs = microphone if microphone is not None else file_upload
+    with open(inputs, "rb") as f:
+        inputs = f.read()
+    inputs = ffmpeg_read(inputs, SAMPLING_RATE)
+    inputs = {"array": base64.b64encode(inputs.tobytes()), "sampling_rate": SAMPLING_RATE}
     text, timestamps = inference(inputs=inputs, task=task, return_timestamps=return_timestamps)
 audio = gr.Interface(
     fn=transcribe_audio,
     inputs=[
+        gr.inputs.Audio(source="microphone", optional=True, type="filepath"),
+        gr.inputs.Audio(source="upload", optional=True, type="filepath"),
         gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
         gr.inputs.Checkbox(default=False, label="Return timestamps"),
     ],