File size: 802 Bytes
e076ae8
 
 
21d1ad5
e076ae8
db3dea6
e076ae8
5ab552b
e076ae8
db3dea6
 
2ad5e4b
db3dea6
 
 
 
 
21d1ad5
db3dea6
21d1ad5
 
 
 
 
d83e804
5ab552b
e076ae8
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import gradio as gr
import numpy as np
from utils import load_model, normalize_text
import time

# Load the model once at import time; text_to_speech() below reads this
# module-level object on every request instead of reloading it.
# NOTE(review): load_model comes from the project-local `utils` module; the
# code here relies on the result exposing .tokenizer, .config and
# .inference_onnx -- confirm against utils.
vits = load_model()


def text_to_speech(text):
    """Synthesize speech for ``text`` using the module-level ``vits`` model.

    The input is passed through ``normalize_text``, converted to token ids
    with ``vits.tokenizer.text_to_ids`` and fed to ``vits.inference_onnx`` as
    a batch of one. The inference time and the real-time factor
    (inference time divided by audio duration) are printed to stdout.

    Args:
        text: The string to synthesize.

    Returns:
        A ``(sample_rate, waveform)`` tuple. ``sample_rate`` is taken from
        ``vits.config.audio.sample_rate`` and ``waveform`` is the first item
        of the batch returned by the model.
    """
    # Use the model's configured rate everywhere: the duration below and the
    # value handed back to the UI must agree, otherwise playback speed/pitch
    # is wrong whenever the model is not a 16 kHz one.
    sample_rate = int(vits.config.audio.sample_rate)

    text = normalize_text(text)
    # [None, :] adds a leading batch axis so the model sees a batch of one.
    text_inputs = np.asarray(
        vits.tokenizer.text_to_ids(text),
        dtype=np.int64,
    )[None, :]

    start = time.perf_counter()
    audio = vits.inference_onnx(text_inputs)
    inference_time = time.perf_counter() - start

    audio_length = audio.shape[1] / sample_rate
    print('Inference time: {}'.format(inference_time))
    # Guard the ratio: an empty waveform would otherwise raise
    # ZeroDivisionError and fail the whole request over a log line.
    if audio_length > 0:
        print('Real time factor: {}'.format(inference_time / audio_length))
    return sample_rate, audio[0]


# Build the web UI: a single text input wired to text_to_speech, with the
# returned audio shown in an audio output component.
demo = gr.Interface(
    fn=text_to_speech,
    inputs="text",
    outputs="audio",
    theme="default",
)
# Start serving the interface (debug mode off).
demo.launch(debug=False)