"""
Used to transcribe all audio files in one folder into another folder.

e.g.
Directory structure:
--pre_data_root
----SP_1
------01.wav
------02.wav
------......
----SP_2
------01.wav
------02.wav
------......

Use
    python tools/whisper_asr.py --audio-dir pre_data_root/SP_1 --save-dir data/SP_1
to transcribe the first speaker.

Use
    python tools/whisper_asr.py --audio-dir pre_data_root/SP_2 --save-dir data/SP_2
to transcribe the second speaker.

Note: Be aware of your audio sample rate, which defaults to 44.1kHz.
"""

import re
from pathlib import Path

import click
import soundfile as sf
from faster_whisper import WhisperModel
from loguru import logger
from pydub import AudioSegment
from tqdm import tqdm

from tools.file import AUDIO_EXTENSIONS, list_files


@click.command()
@click.option("--model-size", default="large-v3", help="Size of the Whisper model")
@click.option(
    "--compute-type",
    default="float16",
    help="Computation Precision of the Whisper model [float16 / int8_float16 / int8]",
)
@click.option("--audio-dir", required=True, help="Directory containing audio files")
@click.option(
    "--save-dir", required=True, help="Directory to save processed audio files"
)
# NOTE(review): --sample-rate is accepted for CLI compatibility, but main()
# never references it, so no resampling step is applied before export.
@click.option(
    "--sample-rate",
    default=44100,
    type=int,
    help="Output sample rate, default to input sample rate",
)
@click.option("--device", default="cuda", help="Device to use [cuda / cpu]")
@click.option("--language", default="auto", help="Language of the transcription")
@click.option("--initial-prompt", default=None, help="Initial prompt for transcribing")
def main(
    model_size,
    compute_type,
    audio_dir,
    save_dir,
    sample_rate,
    device,
    language,
    initial_prompt,
):
    """Transcribe a directory of audio files with Faster Whisper.

    \f
    The form feed above makes click show only the summary line in ``--help``.

    Every audio file found recursively under ``audio_dir`` is transcribed.
    The audio is re-exported to the mirrored relative location under
    ``save_dir`` and the transcript is written next to it as ``<stem>.lab``.

    Args:
        model_size: Whisper model name passed to ``WhisperModel``.
        compute_type: Precision passed to ``WhisperModel``.
        audio_dir: Root directory that is scanned for audio files.
        save_dir: Root directory that receives audio and ``.lab`` files.
        sample_rate: Currently unused by this function.
        device: Device passed to ``WhisperModel``.
        language: Language code for transcription; ``"auto"`` is forwarded
            as ``None``.
        initial_prompt: Optional prompt forwarded to ``model.transcribe``.
    """
    logger.info("Loading / Downloading Faster Whisper model...")
    model = WhisperModel(
        model_size,
        device=device,
        compute_type=compute_type,
        download_root="faster_whisper",
    )
    logger.info("Model loaded.")

    save_path = Path(save_dir)
    save_path.mkdir(parents=True, exist_ok=True)

    audio_files = list_files(
        path=audio_dir, extensions=AUDIO_EXTENSIONS, recursive=True
    )

    # Loop-invariant: "auto" on the command line is forwarded as None.
    whisper_language = None if language == "auto" else language

    for file_path in tqdm(audio_files, desc="Processing audio file"):
        file_stem = file_path.stem
        file_suffix = file_path.suffix

        # Mirror the input's sub-directory layout below the save directory.
        rel_path = Path(file_path).relative_to(audio_dir)
        out_dir = save_path / rel_path.parent
        out_dir.mkdir(parents=True, exist_ok=True)

        audio = AudioSegment.from_file(file_path)

        segments, info = model.transcribe(
            # Pass a plain string path rather than a Path object; this is
            # safe regardless of whether the installed version accepts
            # path-like inputs.
            str(file_path),
            beam_size=5,
            language=whisper_language,
            initial_prompt=initial_prompt,
        )

        print(
            "Detected language '%s' with probability %f"
            % (info.language, info.language_probability)
        )
        print("Total len(ms): ", len(audio))

        texts = []
        for segment in segments:
            print(
                "Segment %03d [%.2fs -> %.2fs] %s"
                % (segment.id, segment.start, segment.end, segment.text)
            )
            # Empty segment texts would only add stray ", " separators.
            if segment.text:
                texts.append(segment.text)

        if not texts:
            # Nothing was transcribed (e.g. silence). Skip the file instead of
            # failing on an empty transcript or writing a label of just ".".
            logger.warning("No speech transcribed for {}; skipping.", file_path)
            continue

        whole_text = ", ".join(texts) + "."

        audio_save_path = out_dir / f"{file_stem}{file_suffix}"
        # The export format is the file extension without its leading dot.
        audio.export(audio_save_path, format=file_suffix[1:])
        print(f"Exported {audio_save_path}")

        transcript_save_path = out_dir / f"{file_stem}.lab"
        with open(transcript_save_path, "w", encoding="utf-8") as f:
            f.write(whole_text)


if __name__ == "__main__":
    main()