| """ | |
| Used to transcribe all audio files in one folder into another folder. | |
| e.g. | |
| Directory structure: | |
| --pre_data_root | |
| ----SP_1 | |
| ------01.wav | |
| ------02.wav | |
| ------...... | |
| ----SP_2 | |
| ------01.wav | |
| ------02.wav | |
| ------...... | |
| Use | |
| python tools/whisper_asr.py --audio-dir pre_data_root/SP_1 --save-dir data/SP_1 | |
| to transcribe the first speaker. | |
| Use | |
| python tools/whisper_asr.py --audio-dir pre_data_root/SP_2 --save-dir data/SP_2 | |
| to transcribe the second speaker. | |
| Note: Be aware of your audio sample rate, which defaults to 44.1kHz. | |
| """ | |

from pathlib import Path

import click
from faster_whisper import WhisperModel
from loguru import logger
from pydub import AudioSegment
from tqdm import tqdm

from tools.file import AUDIO_EXTENSIONS, list_files
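

# Click wiring for the CLI described in the docstring. Option names mirror
# the parameters of main(); defaults are inferred from this file (the 44.1 kHz
# note above, the "auto" language check, and the model settings used in the
# scratch block at the bottom).
@click.command()
@click.option("--model-size", default="large-v3", help="Faster Whisper model size")
@click.option("--compute-type", default="float16", help="Computation precision")
@click.option("--audio-dir", required=True, help="Directory containing audio files")
@click.option("--save-dir", required=True, help="Directory to save output files")
@click.option("--sample-rate", default=44100, type=int, help="Output sample rate")
@click.option("--device", default="cuda", help="Device to run inference on")
@click.option("--language", default="auto", help="Transcription language, or 'auto'")
@click.option("--initial-prompt", default=None, help="Initial prompt for transcribing")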
def main(
    model_size,
    compute_type,
    audio_dir,
    save_dir,
    sample_rate,
    device,
    language,
    initial_prompt,
):
| logger.info("Loading / Downloading Faster Whisper model...") | |
| model = WhisperModel( | |
| model_size, | |
| device=device, | |
| compute_type=compute_type, | |
| download_root="faster_whisper", | |
| ) | |
| logger.info("Model loaded.") | |
| save_path = Path(save_dir) | |
| save_path.mkdir(parents=True, exist_ok=True) | |
| audio_files = list_files( | |
| path=audio_dir, extensions=AUDIO_EXTENSIONS, recursive=True | |
| ) | |
    for file_path in tqdm(audio_files, desc="Processing audio file"):
        file_stem = file_path.stem
        file_suffix = file_path.suffix

        # Mirror the input directory layout under save_dir.
        rel_path = Path(file_path).relative_to(audio_dir)
        (save_path / rel_path.parent).mkdir(parents=True, exist_ok=True)

        audio = AudioSegment.from_file(file_path)
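        # Apply --sample-rate before export (44.1 kHz default, per the
        # docstring note); without this the exported audio would silently
        # keep its original rate.
        audio = audio.set_frame_rate(sample_rate)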

        segments, info = model.transcribe(
            file_path,
            beam_size=5,
            language=None if language == "auto" else language,
            initial_prompt=initial_prompt,
        )
        print(
            "Detected language '%s' with probability %f"
            % (info.language, info.language_probability)
        )
        print("Total len(ms): ", len(audio))

        # Join all segment texts into a single transcript line.
        whole_text = None
        for segment in segments:
            id, start, end, text = (
                segment.id,
                segment.start,
                segment.end,
                segment.text,
            )
            print("Segment %03d [%.2fs -> %.2fs] %s" % (id, start, end, text))
            if not whole_text:
                whole_text = text
            else:
                whole_text += ", " + text

        # Guard against clips where Whisper finds no speech; appending "."
        # to None would otherwise raise a TypeError.
        whole_text = (whole_text + ".") if whole_text else ""

        audio_save_path = save_path / rel_path.parent / f"{file_stem}{file_suffix}"
        audio.export(audio_save_path, format=file_suffix[1:])
        print(f"Exported {audio_save_path}")

        transcript_save_path = save_path / rel_path.parent / f"{file_stem}.lab"
        with open(
            transcript_save_path,
            "w",
            encoding="utf-8",
        ) as f:
            f.write(whole_text)


if __name__ == "__main__":
    main()
    exit(0)
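
    # Everything below is unreachable scratch code kept from development:
    # it transcribes a single file and exports each Whisper segment as its
    # own WAV clip, which is handy for manually checking segment boundaries.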
    audio = AudioSegment.from_wav(
        r"D:\PythonProject\原神语音中文\胡桃\vo_hutao_draw_appear.wav"
    )

    model_size = "large-v3"
    model = WhisperModel(
        model_size,
        device="cuda",
        compute_type="float16",
        download_root="faster_whisper",
    )
    segments, info = model.transcribe(
        r"D:\PythonProject\原神语音中文\胡桃\vo_hutao_draw_appear.wav",
        beam_size=5,
    )
    print(
        "Detected language '%s' with probability %f"
        % (info.language, info.language_probability)
    )
    print("Total len(ms): ", len(audio))

    for i, segment in enumerate(segments):
        print(
            "Segment %03d [%.2fs -> %.2fs] %s"
            % (i, segment.start, segment.end, segment.text)
        )
        start_ms = int(segment.start * 1000)
        end_ms = int(segment.end * 1000)
        segment_audio = audio[start_ms:end_ms]
        segment_audio.export(f"segment_{i:03d}.wav", format="wav")
        print(f"Exported segment_{i:03d}.wav")

    print("All segments have been exported.")