Simonlob committed on
Commit
99e8a66
1 Parent(s): bc6b593

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -81
app.py CHANGED
@@ -24,7 +24,6 @@ args = Namespace(
24
  cpu=True,
25
  model="akyl_ai",
26
  vocoder="hifigan_T2_v1",
27
- spk=0,
28
  )
29
 
30
  CURRENTLY_LOADED_MODEL = args.model
@@ -40,11 +39,8 @@ def VOCODER_LOC(x):
40
 
41
  LOGO_URL = "https://github.com/simonlobgromov/Matcha-TTS/blob/main/photo_2024-04-07_15-59-52.png"
42
  RADIO_OPTIONS = {
43
- "Multi Speaker (VCTK)": {
44
- "model": "matcha_vctk",
45
- "vocoder": "hifigan_univ_v1",
46
- },
47
- "Single Speaker (Akyl_AI)": {
48
  "model": "akyl_ai",
49
  "vocoder": "hifigan_T2_v1",
50
  },
@@ -53,8 +49,7 @@ RADIO_OPTIONS = {
53
  # Ensure all the required models are downloaded
54
  assert_model_downloaded(MATCHA_TTS_LOC("akyl_ai"), MATCHA_URLS["akyl_ai"])
55
  assert_model_downloaded(VOCODER_LOC("hifigan_T2_v1"), VOCODER_URLS["hifigan_T2_v1"])
56
- assert_model_downloaded(MATCHA_TTS_LOC("matcha_vctk"), MATCHA_URLS["matcha_vctk"])
57
- assert_model_downloaded(VOCODER_LOC("hifigan_univ_v1"), VOCODER_URLS["hifigan_univ_v1"])
58
 
59
  device = get_device(args)
60
 
@@ -78,12 +73,10 @@ def load_model_ui(model_type, textbox):
78
  CURRENTLY_LOADED_MODEL = model_name
79
 
80
  if model_name == "akyl_ai":
81
- spk_slider = gr.update(visible=False, value=-1)
82
  single_speaker_examples = gr.update(visible=True)
83
  multi_speaker_examples = gr.update(visible=False)
84
  length_scale = gr.update(value=0.95)
85
  else:
86
- spk_slider = gr.update(visible=True, value=0)
87
  single_speaker_examples = gr.update(visible=False)
88
  multi_speaker_examples = gr.update(visible=True)
89
  length_scale = gr.update(value=0.85)
@@ -91,7 +84,6 @@ def load_model_ui(model_type, textbox):
91
  return (
92
  textbox,
93
  gr.update(interactive=True),
94
- spk_slider,
95
  single_speaker_examples,
96
  multi_speaker_examples,
97
  length_scale,
@@ -105,7 +97,7 @@ def process_text_gradio(text):
105
 
106
 
107
  @torch.inference_mode()
108
- def synthesise_mel(text, text_length, n_timesteps, temperature, length_scale, spk):
109
  spk = torch.tensor([spk], device=device, dtype=torch.long) if spk >= 0 else None
110
  output = model.synthesise(
111
  text,
@@ -122,21 +114,9 @@ def synthesise_mel(text, text_length, n_timesteps, temperature, length_scale, sp
122
  return fp.name, plot_tensor(output["mel"].squeeze().cpu().numpy())
123
 
124
 
125
- def multispeaker_example_cacher(text, n_timesteps, mel_temp, length_scale, spk):
126
- global CURRENTLY_LOADED_MODEL # pylint: disable=global-statement
127
- if CURRENTLY_LOADED_MODEL != "matcha_vctk":
128
- global model, vocoder, denoiser # pylint: disable=global-statement
129
- model, vocoder, denoiser = load_model("matcha_vctk", "hifigan_univ_v1")
130
- CURRENTLY_LOADED_MODEL = "matcha_vctk"
131
-
132
- phones, text, text_lengths = process_text_gradio(text)
133
- audio, mel_spectrogram = synthesise_mel(text, text_lengths, n_timesteps, mel_temp, length_scale, spk)
134
- return phones, audio, mel_spectrogram
135
-
136
-
137
  def ljspeech_example_cacher(text, n_timesteps, mel_temp, length_scale, spk=-1):
138
  global CURRENTLY_LOADED_MODEL # pylint: disable=global-statement
139
- if CURRENTLY_LOADED_MODEL != "akyl_ai":
140
  global model, vocoder, denoiser # pylint: disable=global-statement
141
  model, vocoder, denoiser = load_model("akyl_ai", "hifigan_T2_v1")
142
  CURRENTLY_LOADED_MODEL = "akyl_ai"
@@ -147,51 +127,46 @@ def ljspeech_example_cacher(text, n_timesteps, mel_temp, length_scale, spk=-1):
147
 
148
 
149
  def main():
150
- description = """# 🍵 Matcha-TTS: A fast TTS architecture with conditional flow matching
151
- ### [Shivam Mehta](https://www.kth.se/profile/smehta), [Ruibo Tu](https://www.kth.se/profile/ruibo), [Jonas Beskow](https://www.kth.se/profile/beskow), [Éva Székely](https://www.kth.se/profile/szekely), and [Gustav Eje Henter](https://people.kth.se/~ghe/)
152
- We propose 🍵 Matcha-TTS, a new approach to non-autoregressive neural TTS, that uses conditional flow matching (similar to rectified flows) to speed up ODE-based speech synthesis. Our method:
153
 
 
 
 
154
 
155
  * Is probabilistic
156
  * Has compact memory footprint
157
  * Sounds highly natural
158
  * Is very fast to synthesise from
159
-
160
-
161
- Check out our [demo page](https://shivammehta25.github.io/Matcha-TTS). Read our [arXiv preprint for more details](https://arxiv.org/abs/2309.03199).
162
- Code is available in our [GitHub repository](https://github.com/shivammehta25/Matcha-TTS), along with pre-trained models.
163
-
164
- Cached examples are available at the bottom of the page.
165
  """
166
 
167
- with gr.Blocks(title="🍵 Matcha-TTS: A fast TTS architecture with conditional flow matching") as demo:
168
  processed_text = gr.State(value=None)
169
  processed_text_len = gr.State(value=None)
170
 
171
  with gr.Box():
172
- with gr.Row():
173
- gr.Markdown(description, scale=3)
174
- with gr.Column():
175
- gr.Image(LOGO_URL, label="Matcha-TTS logo", height=50, width=50, scale=1, show_label=False)
176
- html = '<br><iframe width="560" height="315" src="https://www.youtube.com/embed/xmvJkz3bqw0?si=jN7ILyDsbPwJCGoa" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen></iframe>'
177
- gr.HTML(html)
178
 
179
  with gr.Box():
180
  radio_options = list(RADIO_OPTIONS.keys())
181
  model_type = gr.Radio(
182
- radio_options, value=radio_options[0], label="Choose a Model", interactive=True, container=False
183
  )
184
 
185
  with gr.Row():
186
- gr.Markdown("# Text Input")
187
  with gr.Row():
188
- text = gr.Textbox(value="", lines=2, label="Text to synthesise", scale=3)
189
- spk_slider = gr.Slider(
190
- minimum=0, maximum=107, step=1, value=args.spk, label="Speaker ID", interactive=True, scale=1
191
- )
192
 
193
  with gr.Row():
194
- gr.Markdown("### Hyper parameters")
 
 
195
  with gr.Row():
196
  n_timesteps = gr.Slider(
197
  label="Number of ODE steps",
@@ -200,9 +175,10 @@ def main():
200
  step=1,
201
  value=10,
202
  interactive=True,
 
203
  )
204
  length_scale = gr.Slider(
205
- label="Length scale (Speaking rate)",
206
  minimum=0.5,
207
  maximum=1.5,
208
  step=0.05,
@@ -216,14 +192,16 @@ def main():
216
  step=0.16675,
217
  value=0.667,
218
  interactive=True,
 
219
  )
220
 
221
- synth_btn = gr.Button("Synthesise")
222
 
223
  with gr.Box():
224
  with gr.Row():
225
- gr.Markdown("### Phonetised text")
226
- phonetised_text = gr.Textbox(interactive=False, scale=10, label="Phonetised text")
 
227
 
228
  with gr.Box():
229
  with gr.Row():
@@ -232,7 +210,7 @@ def main():
232
  # with gr.Row():
233
  audio = gr.Audio(interactive=False, label="Audio")
234
 
235
- with gr.Row(visible=False) as example_row_lj_speech:
236
  examples = gr.Examples( # pylint: disable=unused-variable
237
  examples=[
238
  [
@@ -256,36 +234,11 @@ def main():
256
  cache_examples=True,
257
  )
258
 
259
- with gr.Row() as example_row_multispeaker:
260
- multi_speaker_examples = gr.Examples( # pylint: disable=unused-variable
261
- examples=[
262
- [
263
- "Hello everyone! I am speaker 0 and I am here to tell you that Matcha-TTS is amazing!",
264
- 10,
265
- 0.677,
266
- 0.85,
267
- 0,
268
- ],
269
- [
270
- "Hello everyone! I am speaker 16 and I am here to tell you that Matcha-TTS is amazing!",
271
- 10,
272
- 0.677,
273
- 0.85,
274
- 16,
275
- ],
276
-
277
- ],
278
- fn=multispeaker_example_cacher,
279
- inputs=[text, n_timesteps, mel_temp, length_scale, spk_slider],
280
- outputs=[phonetised_text, audio, mel_spectrogram],
281
- cache_examples=True,
282
- label="Multi Speaker Examples",
283
- )
284
-
285
  model_type.change(lambda x: gr.update(interactive=False), inputs=[synth_btn], outputs=[synth_btn]).then(
286
  load_model_ui,
287
  inputs=[model_type, text],
288
- outputs=[text, synth_btn, spk_slider, example_row_lj_speech, example_row_multispeaker, length_scale],
289
  )
290
 
291
  synth_btn.click(
@@ -298,7 +251,7 @@ def main():
298
  queue=True,
299
  ).then(
300
  fn=synthesise_mel,
301
- inputs=[processed_text, processed_text_len, n_timesteps, mel_temp, length_scale, spk_slider],
302
  outputs=[audio, mel_spectrogram],
303
  )
304
 
 
24
  cpu=True,
25
  model="akyl_ai",
26
  vocoder="hifigan_T2_v1",
 
27
  )
28
 
29
  CURRENTLY_LOADED_MODEL = args.model
 
39
 
40
  LOGO_URL = "https://github.com/simonlobgromov/Matcha-TTS/blob/main/photo_2024-04-07_15-59-52.png"
41
  RADIO_OPTIONS = {
42
+
43
+ "Akyl_AI": {
 
 
 
44
  "model": "akyl_ai",
45
  "vocoder": "hifigan_T2_v1",
46
  },
 
49
  # Ensure all the required models are downloaded
50
  assert_model_downloaded(MATCHA_TTS_LOC("akyl_ai"), MATCHA_URLS["akyl_ai"])
51
  assert_model_downloaded(VOCODER_LOC("hifigan_T2_v1"), VOCODER_URLS["hifigan_T2_v1"])
52
+
 
53
 
54
  device = get_device(args)
55
 
 
73
  CURRENTLY_LOADED_MODEL = model_name
74
 
75
  if model_name == "akyl_ai":
 
76
  single_speaker_examples = gr.update(visible=True)
77
  multi_speaker_examples = gr.update(visible=False)
78
  length_scale = gr.update(value=0.95)
79
  else:
 
80
  single_speaker_examples = gr.update(visible=False)
81
  multi_speaker_examples = gr.update(visible=True)
82
  length_scale = gr.update(value=0.85)
 
84
  return (
85
  textbox,
86
  gr.update(interactive=True),
 
87
  single_speaker_examples,
88
  multi_speaker_examples,
89
  length_scale,
 
97
 
98
 
99
  @torch.inference_mode()
100
+ def synthesise_mel(text, text_length, n_timesteps, temperature, length_scale, spk=-1):
101
  spk = torch.tensor([spk], device=device, dtype=torch.long) if spk >= 0 else None
102
  output = model.synthesise(
103
  text,
 
114
  return fp.name, plot_tensor(output["mel"].squeeze().cpu().numpy())
115
 
116
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  def ljspeech_example_cacher(text, n_timesteps, mel_temp, length_scale, spk=-1):
118
  global CURRENTLY_LOADED_MODEL # pylint: disable=global-statement
119
+ if CURRENTLY_LOADED_MODEL == "akyl_ai":
120
  global model, vocoder, denoiser # pylint: disable=global-statement
121
  model, vocoder, denoiser = load_model("akyl_ai", "hifigan_T2_v1")
122
  CURRENTLY_LOADED_MODEL = "akyl_ai"
 
127
 
128
 
129
  def main():
130
+ description = """# AkylAI TTS mini
131
+ We present to you a fast speech synthesis model in the Kyrgyz language.
 
132
 
133
+
134
+ This is a new approach to non-autoregressive neural TTS that uses conditional stream matching (similar to rectified streams) to speed up ODE-based speech synthesis.
135
+ Method:
136
 
137
  * Is probabilistic
138
  * Has compact memory footprint
139
  * Sounds highly natural
140
  * Is very fast to synthesise from
141
+
 
 
 
 
 
142
  """
143
 
144
+ with gr.Blocks(title="AkylAI TTS") as demo:
145
  processed_text = gr.State(value=None)
146
  processed_text_len = gr.State(value=None)
147
 
148
  with gr.Box():
149
+ with gr.Row():
150
+ gr.Markdown(description, scale=3)
151
+ with gr.Column():
152
+ image_url = "https://github.com/simonlobgromov/Matcha-TTS/blob/main/photo_2024-04-07_15-59-52.png?raw=true"
153
+ gr.Image(image_url, label="Matcha-TTS logo", width=560, height=315)
 
154
 
155
  with gr.Box():
156
  radio_options = list(RADIO_OPTIONS.keys())
157
  model_type = gr.Radio(
158
+ radio_options, value=radio_options[0], label="Choose a Model", interactive=True, container=False, visible=False,
159
  )
160
 
161
  with gr.Row():
162
+ gr.Markdown("## Текстти кыргыз тилинде жазыңыз\n### Text Input")
163
  with gr.Row():
164
+ text = gr.Textbox(value="", lines=2, label=None, scale=3)
 
 
 
165
 
166
  with gr.Row():
167
+ gr.Markdown("## Сүйлөө ылдамдыгы\n### Speaking rate")
168
+ # gr.Markdown("")
169
+
170
  with gr.Row():
171
  n_timesteps = gr.Slider(
172
  label="Number of ODE steps",
 
175
  step=1,
176
  value=10,
177
  interactive=True,
178
+ visible=False
179
  )
180
  length_scale = gr.Slider(
181
+ label=None,
182
  minimum=0.5,
183
  maximum=1.5,
184
  step=0.05,
 
192
  step=0.16675,
193
  value=0.667,
194
  interactive=True,
195
+ visible=False
196
  )
197
 
198
+ synth_btn = gr.Button("БАШТОО | RUN")
199
 
200
  with gr.Box():
201
  with gr.Row():
202
+ gr.Markdown("## Фонетизацияланган текст\n### Phonetised text")
203
+ with gr.Row():
204
+ phonetised_text = gr.Textbox(interactive=False, scale=10, label=None)
205
 
206
  with gr.Box():
207
  with gr.Row():
 
210
  # with gr.Row():
211
  audio = gr.Audio(interactive=False, label="Audio")
212
 
213
+ with gr.Row(visible=True) as example_row_lj_speech:
214
  examples = gr.Examples( # pylint: disable=unused-variable
215
  examples=[
216
  [
 
234
  cache_examples=True,
235
  )
236
 
237
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  model_type.change(lambda x: gr.update(interactive=False), inputs=[synth_btn], outputs=[synth_btn]).then(
239
  load_model_ui,
240
  inputs=[model_type, text],
241
+ outputs=[text, synth_btn, example_row_lj_speech, length_scale],
242
  )
243
 
244
  synth_btn.click(
 
251
  queue=True,
252
  ).then(
253
  fn=synthesise_mel,
254
+ inputs=[processed_text, processed_text_len, n_timesteps, mel_temp, length_scale],
255
  outputs=[audio, mel_spectrogram],
256
  )
257