Kit-Lemonfoot commited on
Commit
d848e9a
1 Parent(s): 926eb83

Added Amelia Watson, Mint Fantôme and Tenma Maemi voice models; reworked style-preset handling (skip missing presets), replaced the no-text-reference checkbox with a blank-prompt convention, and added a reference-audio language dropdown

Browse files
.gitattributes CHANGED
@@ -59,3 +59,13 @@ referenceaudio/Pippa/A2.wav filter=lfs diff=lfs merge=lfs -text
59
  referenceaudio/Pippa/A3.wav filter=lfs diff=lfs merge=lfs -text
60
  referenceaudio/Pippa/A4.wav filter=lfs diff=lfs merge=lfs -text
61
  referenceaudio/Pippa/A5.wav filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
59
  referenceaudio/Pippa/A3.wav filter=lfs diff=lfs merge=lfs -text
60
  referenceaudio/Pippa/A4.wav filter=lfs diff=lfs merge=lfs -text
61
  referenceaudio/Pippa/A5.wav filter=lfs diff=lfs merge=lfs -text
62
+ referenceaudio/Mint/A1.wav filter=lfs diff=lfs merge=lfs -text
63
+ referenceaudio/Mint/A2.wav filter=lfs diff=lfs merge=lfs -text
64
+ referenceaudio/Mint/A3.wav filter=lfs diff=lfs merge=lfs -text
65
+ referenceaudio/Mint/A4.wav filter=lfs diff=lfs merge=lfs -text
66
+ referenceaudio/Mint/A5.wav filter=lfs diff=lfs merge=lfs -text
67
+ referenceaudio/Mint/A6.wav filter=lfs diff=lfs merge=lfs -text
68
+ referenceaudio/Tenma/A1.wav filter=lfs diff=lfs merge=lfs -text
69
+ referenceaudio/Tenma/A2.wav filter=lfs diff=lfs merge=lfs -text
70
+ referenceaudio/Tenma/A3.wav filter=lfs diff=lfs merge=lfs -text
71
+ referenceaudio/Tenma/A4.wav filter=lfs diff=lfs merge=lfs -text
GPT_SoVITS/GPT_weights/AmeliaWatson_GPT.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3069216bbe38824e8fa1ba2dd9e1c48a133f6a26714cc4837d1e423cd27b931a
3
+ size 155087286
GPT_SoVITS/GPT_weights/MintFantome_GPT.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dae9cc650512175057d2690f0124dfb6781f011e039f24123280b038e9adf495
3
+ size 155087286
GPT_SoVITS/GPT_weights/TenmaMaemi_GPT.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2942e8e31b4c9195baaf5b33d360f60de9e6974dcaedd209864e81ecb6b9c9f9
3
+ size 155087222
GPT_SoVITS/SoVITS_weights/AmeliaWatson_SoVITS.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a58edf0253460d726fae913a91c6aba1956df6f2c86419d8ed1bb380e66313a
3
+ size 84885457
GPT_SoVITS/SoVITS_weights/MintFantome_SoVITS.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09ac61e6f1c8739ac4b49ad61c26861f05225c3395fe7499662ac70e474935da
3
+ size 84885455
GPT_SoVITS/SoVITS_weights/TenmaMaemi_SoVITS.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:066a43e3e5b0b5ad7c2fa732b79d35bf00fee6f84a9d194a80bdc8d4e0ccf1d1
3
+ size 84885452
GPT_SoVITS/inference_webui.py CHANGED
@@ -81,7 +81,7 @@ def inference(name, gptmp, svmp, sty, text, text_lang,
81
  prompt_lang, top_k,
82
  top_p, temperature,
83
  text_split_method, batch_size,
84
- speed_factor, ref_text_free,
85
  split_bucket,fragment_interval,
86
  seed, keep_random, parallel_infer,
87
  repetition_penalty
@@ -98,14 +98,14 @@ def inference(name, gptmp, svmp, sty, text, text_lang,
98
  tts_pipeline.init_t2s_weights(gptmp)
99
  tts_pipeline.init_vits_weights(svmp)
100
 
101
-
102
  seed = -1 if keep_random else seed
103
  actual_seed = seed if seed not in [-1, "", None] else random.randrange(1 << 32)
 
104
  inputs={
105
  "text": text,
106
  "text_lang": dict_language[text_lang],
107
  "ref_audio_path": ref_audio_path,
108
- "prompt_text": prompt_text if not ref_text_free else "",
109
  "prompt_lang": dict_language[prompt_lang],
110
  "top_k": top_k,
111
  "top_p": top_p,
@@ -140,8 +140,6 @@ pretrained_sovits_name = "GPT_SoVITS/pretrained_models/s2G488k.pth"
140
  pretrained_gpt_name = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
141
  SoVITS_weight_root = "GPT_SoVITS/SoVITS_weights/"
142
  GPT_weight_root = "GPT_SoVITS/GPT_weights/"
143
- #os.makedirs(SoVITS_weight_root, exist_ok=True)
144
- #os.makedirs(GPT_weight_root, exist_ok=True)
145
 
146
  def get_weights_names():
147
  SoVITS_names = [pretrained_sovits_name]
@@ -162,13 +160,17 @@ def load_models():
162
  if not info['enable']:
163
  continue
164
  title= info['title']
165
- #gptmodelpath= info['gpt_model_path']
166
- #sovitsmodelpath= info['sovits_model_path']
167
  gptmodelpath= "%s/%s" % (GPT_weight_root, info['gpt_model_path'])
168
  sovitsmodelpath= "%s/%s" % (SoVITS_weight_root, info['sovits_model_path'])
169
  author= info['modelauthor']
170
  image = info['cover']
171
  styles = info['styles']
 
 
 
 
 
 
172
  styletrans = info['styletrans']
173
  st=[styles, styletrans]
174
  voices.append((name, title, gptmodelpath, sovitsmodelpath, author, image))
@@ -178,11 +180,6 @@ def load_models():
178
 
179
  modeldata, referencedata = load_models()
180
 
181
- #print(os.getcwd())
182
- #for r, _, f in os.walk(os.getcwd()):
183
- # for n in f:
184
- # print(os.path.join(r, n))
185
-
186
  #Gradio preload
187
  text = gr.TextArea(label="Input Text", value="Hello there! This is test audio of a new text to speech tool.")
188
  text_language = gr.Dropdown(label="Language", choices=["EN", "JP", "ZH", "ZH/EN", "JP/EN", "Automatic"], value="EN")
@@ -193,7 +190,7 @@ how_to_cut = gr.Dropdown(label="Slicing Method",
193
  )
194
  top_k = gr.Slider(minimum=1,maximum=100,step=1,label="Top_k",value=5,interactive=True)
195
  top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label="Top_p",value=1,interactive=True)
196
- temperature = gr.Slider(minimum=0,maximum=1,step=0.05,label="Temperature",value=1,interactive=True)
197
  batch_size = gr.Slider(minimum=1,maximum=200,step=1,label="Batch Size",value=20,interactive=True)
198
  fragment_interval = gr.Slider(minimum=0.01,maximum=1,step=0.01,label="Fragment Interval",value=0.3,interactive=True)
199
  speed_factor = gr.Slider(minimum=0.50,maximum=2,step=0.05,label="Speed Factor",value=1.0,interactive=True)
@@ -221,26 +218,30 @@ with gr.Blocks(title="Lemonfoot GPT-SoVITS") as app:
221
  gr.Markdown(f"**{title}**\n\n Dataset author: {author}")
222
  gr.Image(f"images/{image}", label=None, show_label=False, width=300, show_download_button=False, container=False, show_share_button=False)
223
  with gr.Column():
224
- with gr.TabItem("Style using a preset"):
225
- sty = gr.Dropdown(
226
- label="Current style",
227
- choices=referencedata[name][0].keys(),
228
- value="Neutral",
229
- interactive=True
230
- )
 
 
 
 
 
231
  with gr.TabItem("Style using a different audio"):
232
  with gr.Column():
233
  ref_audio_path = gr.Audio(label="Reference Audio", type="filepath")
234
- ref_text_free = gr.Checkbox(label="Enables no text-reference mode.", value=False, interactive=True)
235
- prompt_text = gr.Textbox(label="Reference Audio Text", interactive=True)
236
- prompt_language = gr.Textbox(value="EN", visible=False, interactive=False)
237
  with gr.Column():
238
  inference_button = gr.Button("Synthesize", variant="primary")
239
  output = gr.Audio(label="Output")
240
 
241
  inference_button.click(
242
  inference,
243
- inputs=[n, gptmp, svmp, sty, text, text_language, ref_audio_path, prompt_text, prompt_language, top_k, top_p, temperature, how_to_cut, batch_size, speed_factor, ref_text_free, split_bucket, fragment_interval, seed, keep_random, parallel_infer, repetition_penalty],
244
  outputs=[output, seed]
245
  )
246
 
 
81
  prompt_lang, top_k,
82
  top_p, temperature,
83
  text_split_method, batch_size,
84
+ speed_factor,
85
  split_bucket,fragment_interval,
86
  seed, keep_random, parallel_infer,
87
  repetition_penalty
 
98
  tts_pipeline.init_t2s_weights(gptmp)
99
  tts_pipeline.init_vits_weights(svmp)
100
 
 
101
  seed = -1 if keep_random else seed
102
  actual_seed = seed if seed not in [-1, "", None] else random.randrange(1 << 32)
103
+ print(f"TMP: {temperature} | SPDFCT: {speed_factor} | STY: {sty} | LANG: {text_lang}")
104
  inputs={
105
  "text": text,
106
  "text_lang": dict_language[text_lang],
107
  "ref_audio_path": ref_audio_path,
108
+ "prompt_text": prompt_text,
109
  "prompt_lang": dict_language[prompt_lang],
110
  "top_k": top_k,
111
  "top_p": top_p,
 
140
  pretrained_gpt_name = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
141
  SoVITS_weight_root = "GPT_SoVITS/SoVITS_weights/"
142
  GPT_weight_root = "GPT_SoVITS/GPT_weights/"
 
 
143
 
144
  def get_weights_names():
145
  SoVITS_names = [pretrained_sovits_name]
 
160
  if not info['enable']:
161
  continue
162
  title= info['title']
 
 
163
  gptmodelpath= "%s/%s" % (GPT_weight_root, info['gpt_model_path'])
164
  sovitsmodelpath= "%s/%s" % (SoVITS_weight_root, info['sovits_model_path'])
165
  author= info['modelauthor']
166
  image = info['cover']
167
  styles = info['styles']
168
+ #check that all styles properly exist
169
+ for s in styles.values():
170
+ if(not os.path.exists(f"referenceaudio/{name}/{s}")):
171
+ print(f"WARNING : Some defined preset styles do not exist for model {name}, skipping")
172
+ styles=None
173
+ break
174
  styletrans = info['styletrans']
175
  st=[styles, styletrans]
176
  voices.append((name, title, gptmodelpath, sovitsmodelpath, author, image))
 
180
 
181
  modeldata, referencedata = load_models()
182
 
 
 
 
 
 
183
  #Gradio preload
184
  text = gr.TextArea(label="Input Text", value="Hello there! This is test audio of a new text to speech tool.")
185
  text_language = gr.Dropdown(label="Language", choices=["EN", "JP", "ZH", "ZH/EN", "JP/EN", "Automatic"], value="EN")
 
190
  )
191
  top_k = gr.Slider(minimum=1,maximum=100,step=1,label="Top_k",value=5,interactive=True)
192
  top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label="Top_p",value=1,interactive=True)
193
+ temperature = gr.Slider(minimum=0,maximum=1,step=0.05,label="Temperature",value=0.7,interactive=True)
194
  batch_size = gr.Slider(minimum=1,maximum=200,step=1,label="Batch Size",value=20,interactive=True)
195
  fragment_interval = gr.Slider(minimum=0.01,maximum=1,step=0.01,label="Fragment Interval",value=0.3,interactive=True)
196
  speed_factor = gr.Slider(minimum=0.50,maximum=2,step=0.05,label="Speed Factor",value=1.0,interactive=True)
 
218
  gr.Markdown(f"**{title}**\n\n Dataset author: {author}")
219
  gr.Image(f"images/{image}", label=None, show_label=False, width=300, show_download_button=False, container=False, show_share_button=False)
220
  with gr.Column():
221
+ #if there aren't any styles, don't bother rendering the style window
222
+ if(not referencedata[name][0]==None):
223
+ rd = list(referencedata[name][0].keys())
224
+ with gr.TabItem("Style using a preset"):
225
+ sty = gr.Dropdown(
226
+ label="Current style",
227
+ choices=rd,
228
+ value=rd[0],
229
+ interactive=True
230
+ )
231
+ else:
232
+ sty=gr.Textbox(value="none", visible=False, interactive=False)
233
  with gr.TabItem("Style using a different audio"):
234
  with gr.Column():
235
  ref_audio_path = gr.Audio(label="Reference Audio", type="filepath")
236
+ prompt_text = gr.Textbox(label="Reference Audio Text", interactive=True, placeholder="Leave blank to use no-text reference mode.")
237
+ prompt_language = gr.Dropdown(label="Reference Audio Language", choices=["EN", "JP", "ZH", "ZH/EN", "JP/EN", "Automatic"], value="EN")
 
238
  with gr.Column():
239
  inference_button = gr.Button("Synthesize", variant="primary")
240
  output = gr.Audio(label="Output")
241
 
242
  inference_button.click(
243
  inference,
244
+ inputs=[n, gptmp, svmp, sty, text, text_language, ref_audio_path, prompt_text, prompt_language, top_k, top_p, temperature, how_to_cut, batch_size, speed_factor, split_bucket, fragment_interval, seed, keep_random, parallel_infer, repetition_penalty],
245
  outputs=[output, seed]
246
  )
247
 
images/amelia.png ADDED
images/mint.png ADDED
images/tenma.png ADDED
referenceaudio/Amelia/A1.wav ADDED
Binary file (672 kB). View file
 
referenceaudio/Amelia/A2.wav ADDED
Binary file (825 kB). View file
 
referenceaudio/Amelia/A3.wav ADDED
Binary file (622 kB). View file
 
referenceaudio/Amelia/A4.wav ADDED
Binary file (602 kB). View file
 
referenceaudio/Amelia/A5.wav ADDED
Binary file (809 kB). View file
 
referenceaudio/Mint/A1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0bdbedb07a9024190463a940a8b4a1ed3dabc0f1031d765262f881585908a504
3
+ size 1497732
referenceaudio/Mint/A2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48bdef64c6569cd949e77f47c683b499acc7afbea573ffe71cb290c5ed082da1
3
+ size 1515006
referenceaudio/Mint/A3.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:074c3b4ae81abc6a905d6bc82a5e1ad5d40af2fc0bf3cddb1f8cc6c51f27bf0e
3
+ size 1597566
referenceaudio/Mint/A4.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae4d7cbd69443a09595dd01ecdc11aade36bb10607233d664eb68bfb6fab5959
3
+ size 1480444
referenceaudio/Mint/A5.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:717bd5ac5cece6ead0ccfb0f80fd67dc98c9a452b78c5506db307b6a3c7d2d4e
3
+ size 1678204
referenceaudio/Mint/A6.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dcc7d460127ab78b5ac94996d4a0431be1eb189f0e276ab9e7885c74008591fb
3
+ size 1893246
referenceaudio/Tenma/A1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3733afa17d7adf2e3c75b45e1c6c816ee358ef6c7ade428b8a1b6c6468544d2f
3
+ size 1559166
referenceaudio/Tenma/A2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:101f321ec072d758a9dfbef0c46f563a496950d88733f79cbdb7f035399389b0
3
+ size 1751166
referenceaudio/Tenma/A3.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b306cf0edf1c63de596057fe51afdb2a0cace7a77b0f1643966c5209e418498a
3
+ size 1593732
referenceaudio/Tenma/A4.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85c98f0ec0326ba1bc205a554780a6932294a47cadcd60b07eb027d774a6b2f0
3
+ size 1578372
voicelist.json CHANGED
@@ -23,6 +23,28 @@
23
  },
24
  "cover": "calli.png"
25
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  "Shiori": {
27
  "enable": true,
28
  "gpt_model_path": "ShioriNovella_GPT.ckpt",
@@ -113,6 +135,24 @@
113
  },
114
  "cover": "pippa.png"
115
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  "Lia": {
117
  "enable": true,
118
  "gpt_model_path": "AsheliaRinkou_GPT.ckpt",
@@ -163,6 +203,30 @@
163
  },
164
  "cover": "dokibird.png"
165
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  "Template": {
167
  "enable": false,
168
  "gpt_model_path": "model.ckpt",
 
23
  },
24
  "cover": "calli.png"
25
  },
26
+ "Amelia": {
27
+ "enable": true,
28
+ "gpt_model_path": "AmeliaWatson_GPT.ckpt",
29
+ "sovits_model_path": "AmeliaWatson_SoVITS.pth",
30
+ "title": "Amelia Watson",
31
+ "modelauthor": "Kit Lemonfoot",
32
+ "styles":{
33
+ "Neutral": "A1.wav",
34
+ "Punctual": "A2.wav",
35
+ "Thinking": "A3.wav",
36
+ "Humored": "A4.wav",
37
+ "Explaining": "A5.wav"
38
+ },
39
+ "styletrans":{
40
+ "Neutral": "You don't like that one either It's, it's uh, I didn't like it, but I guess when I saw it at the time I was like, this is pretty believable.",
41
+ "Punctual": "Okay, I'm gonna count it down just in case there's anybody new to watch alongs This is how it's gonna work I'm gonna go three two one go and then on go everybody press play but not yet because that was a test!",
42
+ "Thinking": "No I'm probably not allergic to oranges I'm still I'm touching it, I'm eating it, I'm gonna rub it all over my face and then I'm gonna see if it gives me a rash.",
43
+ "Humored": "It would be interesting though sometimes I like going to I M D B and, uh, reading like hold on I'm gonna, I'll read it to you guys.",
44
+ "Explaining": "Had reservations on her next project, which was Speed, 1994 I've never seen it actually It says, one of the most, critically and financially successful movies of the year."
45
+ },
46
+ "cover": "amelia.png"
47
+ },
48
  "Shiori": {
49
  "enable": true,
50
  "gpt_model_path": "ShioriNovella_GPT.ckpt",
 
135
  },
136
  "cover": "pippa.png"
137
  },
138
+ "Tenma": {
139
+ "enable": true,
140
+ "gpt_model_path": "TenmaMaemi_GPT.ckpt",
141
+ "sovits_model_path": "TenmaMaemi_SoVITS.pth",
142
+ "title": "Tenma Maemi",
143
+ "modelauthor": "Kit Lemonfoot",
144
+ "styles":{
145
+ "Neutral": "A1.wav",
146
+ "Questioning": "A2.wav",
147
+ "Preset 4": "A4.wav"
148
+ },
149
+ "styletrans":{
150
+ "Neutral": "They were talking about this scene in the movie I'm not pausing, but you see those five five five, cups of organs?",
151
+ "Questioning": "Actually why should I apologize? I love makeup and nails and fashion and gyaru fashion so I don't apologize. But this one artist, makeup artist who...",
152
+ "Preset 4": "Even if many want me and Pippa chan ship, me and Pippa chan have not more than, maybe sister, poi, neesan poi, relationship."
153
+ },
154
+ "cover": "tenma.png"
155
+ },
156
  "Lia": {
157
  "enable": true,
158
  "gpt_model_path": "AsheliaRinkou_GPT.ckpt",
 
203
  },
204
  "cover": "dokibird.png"
205
  },
206
+ "Mint": {
207
+ "enable": true,
208
+ "gpt_model_path": "MintFantome_GPT.ckpt",
209
+ "sovits_model_path": "MintFantome_SoVITS.pth",
210
+ "title": "Mint Fantôme",
211
+ "modelauthor": "Kit Lemonfoot",
212
+ "styles":{
213
+ "Neutral": "A1.wav",
214
+ "Soft": "A2.wav",
215
+ "Thinking": "A3.wav",
216
+ "Explaining": "A4.wav",
217
+ "Preset 5": "A5.wav",
218
+ "Preset 6": "A6.wav"
219
+ },
220
+ "styletrans":{
221
+ "Neutral": "And like creating the chaos and, portraying it so well, when you are literally stuck in a sound booth the entire movie.",
222
+ "Soft": "Where we should start? Hold on give me one second Let me make sure that you guys can't hear this. Ah ba ba ba ba ba.",
223
+ "Thinking": "Let's see Okay Uh, Let me go back 10 seconds cause the, boop be boop ba starts really abruptly So let me go back.",
224
+ "Explaining": "I'm going to count down from five okay So I'll go five, four three two one and then I'll say start And that's when I'll press the button at start okay?",
225
+ "Preset 5": "Typing this, uh this movie as Ponypool. Every time. I always forget the T. I'm just like, Po, po.",
226
+ "Preset 6": "And I was I, I had to, I stopped. I was like I can't. I can't, I- No, No! No, What is what No. No no no. No."
227
+ },
228
+ "cover": "mint.png"
229
+ },
230
  "Template": {
231
  "enable": false,
232
  "gpt_model_path": "model.ckpt",