Yazael committed on
Commit
2733e01
·
verified ·
1 Parent(s): 4ee761c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +134 -0
app.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import warnings
4
+ from pathlib import Path
5
+
6
+ import gradio as gr
7
+ import librosa
8
+ import spaces
9
+ import torch
10
+ from loguru import logger
11
+ from transformers import pipeline
12
+
13
# Silence every Python warning for the lifetime of the process.
warnings.filterwarnings("ignore")

# True only when the SYSTEM environment variable is exactly "spaces".
is_hf = os.getenv("SYSTEM") == "spaces"

# Options forwarded as `generate_kwargs` on every pipeline call.
generate_kwargs = dict(
    language="Japanese",
    do_sample=False,
    num_beams=1,
    no_repeat_ngram_size=5,
    max_new_tokens=64,
)

# Short display name -> model identifier handed to `pipeline(model=...)`.
model_dict = {
    "whisper-large-v3-turbo": "openai/whisper-large-v3-turbo",
    "kotoba-whisper-v2.0": "kotoba-tech/kotoba-whisper-v2.0",
    "anime-whisper": "litagin/anime-whisper",
}
31
+
32
logger.info("Initializing pipelines...")

# Build one speech-recognition pipeline per entry of `model_dict`, keyed by
# the same short name. With CUDA available the pipelines use the GPU in
# float16; otherwise they use the CPU in float32.
_has_cuda = torch.cuda.is_available()
_device, _dtype = ("cuda", torch.float16) if _has_cuda else ("cpu", torch.float32)
pipe_dict = {
    name: pipeline(
        "automatic-speech-recognition",
        model=repo_id,
        device=_device,
        torch_dtype=_dtype,
    )
    for name, repo_id in model_dict.items()
}

logger.success("Pipelines initialized!")
43
+
44
+
45
def _load_audio_16k(path: str):
    """Load the file at *path* as mono audio resampled to 16 kHz.

    ``librosa.load`` is tried first. If it raises, the file is converted to
    WAV with pydub and the converted copy is loaded instead.

    Returns:
        The ``(samples, sample_rate)`` pair returned by ``librosa.load``.
    """
    try:
        return librosa.load(path, mono=True, sr=16000)
    except Exception as e:
        # Deliberately broad: any read failure triggers the pydub fallback.
        logger.error(f"Error reading file: {e}")
        import tempfile

        from pydub import AudioSegment

        segment = AudioSegment.from_file(path)
        # Use a private temporary directory instead of a fixed "temp.wav" in
        # the working directory: concurrent requests can no longer overwrite
        # each other's file, and the file is removed even when loading fails.
        with tempfile.TemporaryDirectory() as tmp_dir:
            wav_path = str(Path(tmp_dir) / "temp.wav")
            # export() returns the output file object; close it so the data
            # is flushed to disk before it is read back.
            segment.export(wav_path, format="wav").close()
            return librosa.load(wav_path, mono=True, sr=16000)


@spaces.GPU
def transcribe_common(audio: str, model: str) -> str:
    """Transcribe an audio file with one of the preloaded pipelines.

    Args:
        audio: Path of the audio file. A falsy value (``None`` or ``""``)
            yields the message ``"No audio file"``.
        model: Key into ``pipe_dict`` selecting the pipeline to run.

    Returns:
        The recognised text, or a human-readable message when no audio was
        given or the clip is longer than 15 seconds.

    Raises:
        KeyError: If *model* is not a key of ``pipe_dict``.
    """
    if not audio:
        return "No audio file"
    filename = Path(audio).name
    logger.info(f"Model: {model}")
    logger.info(f"Audio: {filename}")

    # Read and resample audio to 16kHz
    y, sr = _load_audio_16k(audio)

    # Reject clips above the demo's 15-second limit before running inference.
    duration = librosa.get_duration(y=y, sr=sr)
    logger.info(f"Duration: {duration:.2f}s")
    if duration > 15:
        logger.error(f"Audio too long, limit is 15 seconds, got {duration:.2f}s")
        return f"Audio too long, limit is 15 seconds, got {duration:.2f}s"

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # wall-clock adjustments.
    start_time = time.perf_counter()
    result = pipe_dict[model](y, generate_kwargs=generate_kwargs)["text"]
    elapsed = time.perf_counter() - start_time
    logger.success(f"Finished in {elapsed:.2f}s\n{result}")
    return result
75
+
76
+
77
def transcribe_others(audio) -> tuple[str, str]:
    """Transcribe *audio* with the two comparison models.

    Returns:
        ``(whisper-large-v3-turbo text, kotoba-whisper-v2.0 text)``, produced
        in that order by ``transcribe_common``.
    """
    comparison_models = ("whisper-large-v3-turbo", "kotoba-whisper-v2.0")
    turbo_text, kotoba_text = [
        transcribe_common(audio, model_name) for model_name in comparison_models
    ]
    return turbo_text, kotoba_text
81
+
82
+
83
def transcribe_anime_whisper(audio) -> str:
    """Transcribe *audio* with the "anime-whisper" pipeline."""
    anime_text = transcribe_common(audio, "anime-whisper")
    return anime_text
85
+
86
+
87
# Markdown shown at the top of the demo page. The fenced sample near the end
# repeats the module-level `generate_kwargs` by hand, so the two must be kept
# in sync manually.
initial_md = """
# Anime-Whisper Demo
[**Anime Whisper**](https://huggingface.co/litagin/anime-whisper): 5千時間以上のアニメ調セリフと台本でファインチューニングされた日本語音声認識モデルのデモです。句読点や感嘆符がリズムや感情に合わせて自然に付き、NSFW含む非言語発話もうまく台本調に書き起こされます。
- デモでは**音声は15秒まで**しか受け付けません
- 日本語のみ対応 (Japanese only)
- 比較のために [openai/whisper-large-v3-turbo](https://huggingface.co/openai/whisper-large-v3-turbo) と [kotoba-tech/kotoba-whisper-v2.0](https://huggingface.co/kotoba-tech/kotoba-whisper-v2.0) も用意しています
pipeに渡しているkwargsは以下:
```python
generate_kwargs = {
"language": "Japanese",
"do_sample": False,
"num_beams": 1,
"no_repeat_ngram_size": 5,
"max_new_tokens": 64, # 結果が長いときは途中で打ち切られる
}
```
"""
104
+
105
# Gradio UI: one shared audio input, an Anime-Whisper section, and a
# comparison section with one result box per comparison model.
# NOTE(review): the nesting of the `with` blocks below was reconstructed from
# context — confirm it against the intended layout.
with gr.Blocks() as app:
    gr.Markdown(initial_md)
    # type="filepath": presumably the handlers receive the audio as a path
    # string, matching `transcribe_common(audio: str, ...)`.
    audio = gr.Audio(type="filepath")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Anime-Whisper")
            button_galgame = gr.Button("Transcribe with Anime-Whisper")
            output_galgame = gr.Textbox(label="Result")
    gr.Markdown("### Comparison")
    button_others = gr.Button("Transcribe with other models")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Whisper-Large-V3-Turbo")
            output_v3 = gr.Textbox(label="Result")
        with gr.Column():
            gr.Markdown("### Kotoba-Whisper-V2.0")
            output_kotoba_v2 = gr.Textbox(label="Result")

    # Anime-Whisper button -> single result box.
    button_galgame.click(
        transcribe_anime_whisper,
        inputs=[audio],
        outputs=[output_galgame],
    )
    # Comparison button -> both result boxes, in the order returned by
    # transcribe_others (v3-turbo first, kotoba second).
    button_others.click(
        transcribe_others,
        inputs=[audio],
        outputs=[output_v3, output_kotoba_v2],
    )

# Start the app at import/run time; inbrowser=True asks Gradio to open it in
# a browser tab.
app.launch(inbrowser=True)