voice-cloning / app.py
nateraw's picture
Update app.py
95688f8
raw
history blame
4.17 kB
import json
from pathlib import Path
import gradio as gr
import librosa
import numpy as np
import torch
from huggingface_hub import hf_hub_download, list_repo_files
from so_vits_svc_fork.hparams import HParams
from so_vits_svc_fork.inference.core import Svc
##########################################################
# REPLACE THESE VALUES TO CHANGE THE MODEL REPO/CKPT NAME
##########################################################
repo_id = "dog/theovon"
ckpt_name = None  # or specify a ckpt. ex. "G_1257.pth"
##########################################################


def _generator_step(filename):
    """Return the integer step encoded in a ``G_<step>.pth`` filename.

    Returns ``None`` for any name that is not exactly ``G_`` + decimal
    digits + ``.pth`` so that unrelated files are skipped instead of
    crashing the ``int()`` conversion.
    """
    prefix, suffix = "G_", ".pth"
    if not (filename.startswith(prefix) and filename.endswith(suffix)):
        return None
    digits = filename[len(prefix):-len(suffix)]
    return int(digits) if digits.isdecimal() else None


# Figure out the latest generator by taking the one with the highest step.
# Ex. if the repo has: G_0.pth, G_100.pth, G_200.pth, we'd use G_200.pth
if ckpt_name is None:
    _steps_by_name = {
        name: _generator_step(name) for name in list_repo_files(repo_id)
    }
    _candidates = [
        name for name, step in _steps_by_name.items() if step is not None
    ]
    if not _candidates:
        raise FileNotFoundError(
            f"No generator checkpoint matching 'G_<step>.pth' found in repo "
            f"'{repo_id}'. Set `ckpt_name` explicitly."
        )
    # Keep the real filename from the repo rather than rebuilding it from the
    # parsed integer, so zero-padded names such as "G_0100.pth" still resolve.
    ckpt_name = max(_candidates, key=_steps_by_name.__getitem__)
# Fetch the selected generator checkpoint and the config file from the repo.
generator_path = hf_hub_download(repo_id=repo_id, filename=ckpt_name)
config_path = hf_hub_download(repo_id=repo_id, filename="config.json")

# Parse the config JSON into HParams; the keys of its `spk` entry are the
# speaker names offered in the UI dropdowns.
with open(config_path) as _config_file:
    hparams = HParams(**json.load(_config_file))
speakers = [*hparams.spk.keys()]

# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Build the inference wrapper once at import time; no cluster model is used.
model = Svc(
    net_g_path=generator_path,
    config_path=config_path,
    device=device,
    cluster_model_path=None,
)
def predict(
    speaker,
    audio,
    transpose: int = 0,
    auto_predict_f0: bool = False,
    cluster_infer_ratio: float = 0,
    noise_scale: float = 0.4,
    f0_method: str = "crepe",
    db_thresh: int = -40,
    pad_seconds: float = 0.5,
    chunk_seconds: float = 0.5,
    absolute_thresh: bool = False,
):
    """Convert the source audio to the selected target speaker.

    The audio file is loaded at ``model.target_sample`` and passed, together
    with every remaining argument, straight through to
    ``model.infer_silence``.

    Args:
        speaker: Target speaker; the UI offers the entries of ``speakers``.
        audio: Path to the source audio file as delivered by the
            ``gr.Audio(type="filepath")`` input, or ``None`` when the user
            submitted without recording/uploading anything.
        transpose: Pitch shift; the UI labels this as semitones.
        auto_predict_f0: Forwarded unchanged to ``infer_silence``.
        cluster_infer_ratio: Forwarded unchanged to ``infer_silence``.
        noise_scale: Forwarded unchanged to ``infer_silence``.
        f0_method: F0 method name; the UI offers "crepe", "crepe-tiny",
            "parselmouth", "dio" and "harvest".
        db_thresh: Forwarded unchanged to ``infer_silence``.
        pad_seconds: Forwarded unchanged to ``infer_silence``.
        chunk_seconds: Forwarded unchanged to ``infer_silence``.
        absolute_thresh: Forwarded unchanged to ``infer_silence``.

    Returns:
        A ``(sample_rate, audio)`` tuple where ``sample_rate`` is
        ``model.target_sample`` and ``audio`` is whatever
        ``model.infer_silence`` returned.

    Raises:
        gr.Error: If no source audio was provided.
    """
    # Without this guard librosa.load(None) fails with an opaque error; a
    # gr.Error surfaces a readable message in the UI instead.
    if audio is None:
        raise gr.Error(
            "No source audio provided. Record or upload audio and try again."
        )

    # Load (and resample) the file at the rate the model works with.
    waveform, _ = librosa.load(audio, sr=model.target_sample)
    converted = model.infer_silence(
        waveform.astype(np.float32),
        speaker=speaker,
        transpose=transpose,
        auto_predict_f0=auto_predict_f0,
        cluster_infer_ratio=cluster_infer_ratio,
        noise_scale=noise_scale,
        f0_method=f0_method,
        db_thresh=db_thresh,
        pad_seconds=pad_seconds,
        chunk_seconds=chunk_seconds,
        absolute_thresh=absolute_thresh,
    )
    return model.target_sample, converted
# Intro text passed as `description=` to each gr.Interface below. It embeds
# `repo_id` so the page links to the model repo currently being served;
# `.strip()` drops the leading/trailing newlines of the triple-quoted literal.
description=f"""
This app uses models trained with so-vits-svc-fork to clone your voice. Model currently being used is https://hf.co/{repo_id}.
To change the model being served, duplicate the space and update the `repo_id` in `app.py`.
""".strip()
# HTML snippet passed as `article=` to each gr.Interface below: a centered
# link to the so-vits-svc-fork GitHub repository, opened in a new tab.
article="""
<p style='text-align: center'>
<a href='https://github.com/voicepaw/so-vits-svc-fork' target='_blank'>Github Repo</a>
</p>
""".strip()
def _build_interface(source):
    """Build one voice-cloning ``gr.Interface`` for the given audio source.

    The microphone and file-upload interfaces are identical except for the
    ``source`` of their audio input, so both are produced here to keep them
    from drifting apart. Fresh component instances are created on every
    call, so the two interfaces never share components.

    Args:
        source: Value for ``gr.Audio(source=...)``; this file uses
            "microphone" and "upload".

    Returns:
        A ``gr.Interface`` wired to ``predict``. Only the first seven
        ``predict`` parameters are exposed as inputs; the rest keep their
        defaults.
    """
    return gr.Interface(
        predict,
        inputs=[
            gr.Dropdown(speakers, value=speakers[0], label="Target Speaker"),
            gr.Audio(type="filepath", source=source, label="Source Audio"),
            gr.Slider(-12, 12, value=0, step=1, label="Transpose (Semitones)"),
            gr.Checkbox(False, label="Auto Predict F0"),
            gr.Slider(0.0, 1.0, value=0.0, step=0.1, label="cluster infer ratio"),
            gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale"),
            gr.Dropdown(
                choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
                value="crepe",
                label="f0 method",
            ),
        ],
        outputs="audio",
        title="Voice Cloning",
        description=description,
        article=article,
    )


# Same UI twice: one fed by the microphone, one by an uploaded file.
interface_mic = _build_interface("microphone")
interface_file = _build_interface("upload")
# Combine the microphone and file-upload interfaces into a single tabbed app;
# the tab names are listed in the same order as the interfaces.
interface = gr.TabbedInterface(
    interface_list=[interface_mic, interface_file],
    tab_names=["Clone From Mic", "Clone From File"],
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()