terry-li-hm committed
Commit 9ecefd1 · Parent: 09b9a40
Update

app.py CHANGED
```diff
@@ -4,14 +4,17 @@ import base64
 import io
 import os
 import re
+import tempfile
 
 import gradio as gr
 import librosa
 import numpy as np
+import soundfile as sf
 import spaces
 import torch
 import torchaudio
 from funasr import AutoModel
+from sv import clean_and_emoji_annotate_speech, process_audio
 
 model = "FunAudioLLM/SenseVoiceSmall"
 model = AutoModel(
@@ -145,7 +148,6 @@ def format_str_v3(s):
 
 @spaces.GPU
 def model_inference(input_wav, language, fs=16000):
-    # task_abbr = {"Speech Recognition": "ASR", "Rich Text Transcription": ("ASR", "AED", "SER")}
     language_abbr = {
         "auto": "auto",
         "zh": "zh",
@@ -156,42 +158,33 @@ def model_inference(input_wav, language, fs=16000):
         "nospeech": "nospeech",
     }
 
-    # task = "Speech Recognition" if task is None else task
     language = "auto" if len(language) < 1 else language
     selected_language = language_abbr[language]
-    # selected_task = task_abbr.get(task)
-
-    # print(f"input_wav: {type(input_wav)}, {input_wav[1].shape}, {input_wav}")
 
+    # Handle input_wav format
     if isinstance(input_wav, tuple):
         fs, input_wav = input_wav
         input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
         if len(input_wav.shape) > 1:
             input_wav = input_wav.mean(-1)
         if fs != 16000:
-            print(f"audio_fs: {fs}")
             resampler = torchaudio.transforms.Resample(fs, 16000)
             input_wav_t = torch.from_numpy(input_wav).to(torch.float32)
             input_wav = resampler(input_wav_t[None, :])[0, :].numpy()
 
-
-
-
-
-        cache={},
-        language=language,
-        use_itn=True,
-        batch_size_s=500,
-        merge_vad=merge_vad,
-    )
-
-    print(text)
-    text = text[0]["text"]
-    text = format_str_v3(text)
+    # Save the input audio to a temporary file
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
+        sf.write(temp_audio.name, input_wav, 16000)
+        temp_audio_path = temp_audio.name
 
-
+    try:
+        # Process the audio using the function from sv.py
+        result = process_audio(temp_audio_path, language=selected_language)
+    finally:
+        # Remove the temporary audio file
+        os.remove(temp_audio_path)
 
-    return
+    return result
 
 
 audio_examples = [
```
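The new `model_inference` body hands transcription off to `process_audio` from `sv.py`, a file that is not part of this commit. For orientation, here is a minimal sketch of what that helper presumably does, reconstructed from the inline logic removed above (a `model.generate` call with `cache`, `use_itn`, `batch_size_s`, and `merge_vad` arguments, followed by text post-processing). The function signature, the module-level model handle, and the behavior of `clean_and_emoji_annotate_speech` are assumptions, not something this diff confirms.

```python
"""Hypothetical sketch of sv.process_audio, inferred from the inline code this
commit removes from app.py. The real sv.py is not shown in this diff, so the
names, parameters, and defaults below are assumptions."""

from funasr import AutoModel

# SenseVoice model handle; app.py builds an equivalent AutoModel at import time.
_model = AutoModel(model="FunAudioLLM/SenseVoiceSmall")


def clean_and_emoji_annotate_speech(text: str) -> str:
    """Placeholder for the post-processing step app.py imports from sv.py,
    assumed to play roughly the role format_str_v3 played before this commit."""
    return text.strip()


def process_audio(audio_path: str, language: str = "auto") -> str:
    """Transcribe a 16 kHz WAV file and return cleaned, annotated text."""
    # Keyword arguments mirror the model.generate(...) call removed from app.py;
    # merge_vad=True is assumed (the removed code read it from a local variable).
    result = _model.generate(
        input=audio_path,
        cache={},
        language=language,
        use_itn=True,
        batch_size_s=500,
        merge_vad=True,
    )
    text = result[0]["text"]
    return clean_and_emoji_annotate_speech(text)
```

One design note visible in the diff itself: the temporary WAV is created with `delete=False` and removed in a `finally` block, so the file outlives the `with` statement (letting `process_audio` read it by path) while cleanup is still guaranteed even if inference raises.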