# asr/app.py
import logging
import sys
import gradio as gr
import vosk
import json
import subprocess
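
# Log to stdout so messages show up in the hosting environment's container logs.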
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
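
# UI language name -> Vosk model identifier; vosk.Model downloads the named model
# on first use if it is not already cached locally.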
LARGE_MODEL_BY_LANGUAGE = {
"Russian": {"model_id": "vosk-model-ru-0.42"},
"Chinese": {"model_id": "vosk-model-cn-0.22"},
"English": {"model_id": "vosk-model-en-us-0.22"},
"French": {"model_id": "vosk-model-fr-0.22"},
"German": {"model_id": "vosk-model-de-0.22"},
"Italian": {"model_id": "vosk-model-it-0.22"},
"Japanese": {"model_id": "vosk-model-ja-0.22"},
"Hindi": {"model_id": "vosk-model-hi-0.22"},
"Persian": {"model_id": "vosk-model-fa-0.5"},
"Uzbek": {"model_id": "vosk-model-small-uz-0.22"},
}
LANGUAGES = sorted(LARGE_MODEL_BY_LANGUAGE.keys())
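# Loaded vosk.Model instances keyed by model_id, so each model is loaded only once per process.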
CACHED_MODELS_BY_ID = {}
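
# Transcribe an audio file: stream 16 kHz mono PCM from ffmpeg into a KaldiRecognizer
# and join the recognized utterances into a single string.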
def asr(model, input_file):
rec = vosk.KaldiRecognizer(model, 16000.0)
results = []
    # Decode input to 16 kHz mono s16le PCM with ffmpeg; pass args as a list so paths with spaces work.
    process = subprocess.Popen(["ffmpeg", "-loglevel", "quiet", "-i", input_file,
                                "-ar", "16000", "-ac", "1", "-f", "s16le", "-"],
                               stdout=subprocess.PIPE)
while True:
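        # Feed raw PCM to the recognizer in 4000-byte chunks; AcceptWaveform returns
        # True whenever a complete utterance result is available.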
data = process.stdout.read(4000)
if len(data) == 0:
break
if rec.AcceptWaveform(data):
jres = json.loads(rec.Result())
results.append(jres['text'])
jres = json.loads(rec.FinalResult())
results.append(jres['text'])
return " ".join(results)
def run(input_file, language, history):
logger.info(f"Running ASR for {language} for {input_file}")
history = history or []
model = LARGE_MODEL_BY_LANGUAGE.get(language, None)
if model is None:
history.append({
"error_message": f"Failed to find a model for {language} language :("
})
elif input_file is None:
history.append({
"error_message": f"Record input audio first"
})
else:
model_instance = CACHED_MODELS_BY_ID.get(model["model_id"], None)
if model_instance is None:
model_instance = vosk.Model(model_name=model["model_id"])
CACHED_MODELS_BY_ID[model["model_id"]] = model_instance
transcription = asr(model_instance, input_file)
logger.info(f"Transcription for {input_file}: {transcription}")
history.append({
"model_id": model["model_id"],
"language": language,
"transcription": transcription,
"error_message": None
})
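
    # Render the accumulated history as a list of HTML result blocks.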
html_output = "<div class='result'>"
for item in history:
if item["error_message"] is not None:
html_output += f"<div class='result_item result_item_error'>{item['error_message']}</div>"
else:
html_output += "<div class='result_item result_item_success'>"
html_output += f'{item["transcription"]}<br/>'
html_output += "</div>"
html_output += "</div>"
return html_output, history
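
# Wire up the Gradio interface: microphone audio and a language radio selector in,
# rendered HTML out, with "state" passing the result history between calls.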
gr.Interface(
run,
inputs=[
gr.inputs.Audio(source="microphone", type="filepath", label="Record something..."),
gr.inputs.Radio(label="Language", choices=LANGUAGES),
"state"
],
outputs=[
gr.outputs.HTML(label="Outputs"),
"state"
],
title="Automatic Speech Recognition",
description="",
css="""
.result {display:flex;flex-direction:column}
.result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
.result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
.result_item_error {background-color:#ff7070;color:white;align-self:start}
""",
allow_flagging="never",
theme="default"
).launch(enable_queue=True)