Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -5,6 +5,7 @@ import soundfile as sf
|
|
5 |
from xcodec2.modeling_xcodec2 import XCodec2Model
|
6 |
import torchaudio
|
7 |
import gradio as gr
|
|
|
8 |
|
9 |
llasa_model_id = 'OmniAICreator/Llasa-1B-run1'
|
10 |
|
@@ -28,6 +29,68 @@ whisper_turbo_pipe = pipeline(
|
|
28 |
device='cuda',
|
29 |
)
|
30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
def ids_to_speech_tokens(speech_ids):
|
32 |
|
33 |
speech_tokens_str = []
|
@@ -56,6 +119,7 @@ def infer(sample_audio_path, target_text, temperature, top_p, progress=gr.Progr
|
|
56 |
if len(target_text) > 300:
|
57 |
gr.Warning("Text is too long. Please keep it under 300 characters.")
|
58 |
target_text = target_text[:300]
|
|
|
59 |
with torch.no_grad():
|
60 |
if sample_audio_path:
|
61 |
progress(0, 'Loading and trimming audio...')
|
|
|
5 |
from xcodec2.modeling_xcodec2 import XCodec2Model
|
6 |
import torchaudio
|
7 |
import gradio as gr
|
8 |
+
import re
import unicodedata
|
9 |
|
10 |
llasa_model_id = 'OmniAICreator/Llasa-1B-run1'
|
11 |
|
|
|
29 |
device='cuda',
|
30 |
)
|
31 |
|
32 |
+
# Regex pattern -> replacement, applied in insertion order by normalize().
#
# NOTE(review): every non-ASCII character in this section was recovered from a
# mis-decoded copy of the file (UTF-8 bytes read as ISO-8859-7), so it is
# written as a \u escape to survive future re-encoding. Please confirm the
# recovered characters against the upstream source.
# NOTE(review): the removal class below originally held one more character
# (apparently a private-use code point, U+Exxx) between U+226B and U+2460 that
# could not be recovered; it is omitted here -- restore it from upstream.
REPLACE_MAP: dict[str, str] = {
    r"\t": "",
    r"\[n\]": "",  # the literal text "[n]"
    r" ": "",
    r"\u3000": "",  # ideographic (fullwidth) space
    # ';', U+25BC, U+2640, U+2642, U+300A, U+300B, U+226A, U+226B and the
    # circled digits one to six (U+2460..U+2465)
    r"[;\u25bc\u2640\u2642\u300a\u300b\u226a\u226b\u2460-\u2465]": "",
    # assorted hyphens, dashes and horizontal rules
    r"[\u02d7\u2010-\u2015\u2043\u2212\u23af\u23e4\u2500\u2501\u2e3a\u2e3b]": "",
    r"[\uff5e\u301C]": "\u30fc",  # wave dashes -> prolonged sound mark
    r"\uff1f": "?",  # fullwidth question mark
    r"\uff01": "!",  # fullwidth exclamation mark
    r"[\u25cf\u25ef\u3007]": "\u25cb",  # circle variants -> white circle
    r"\u2665": "\u2661",  # black heart -> white heart
}

# Fullwidth Latin letters (U+FF21..U+FF3A, U+FF41..U+FF5A) -> ASCII letters.
FULLWIDTH_ALPHA_TO_HALFWIDTH = str.maketrans(
    {
        chr(full): chr(half)
        for full, half in zip(
            list(range(0xFF21, 0xFF3B)) + list(range(0xFF41, 0xFF5B)),
            list(range(0x41, 0x5B)) + list(range(0x61, 0x7B)),
        )
    }
)

# Halfwidth katakana and punctuation (U+FF61..U+FF9F) -> fullwidth forms.
# The halfwidth block is NOT laid out in the same order as U+30A1.., so the
# mapping comes from each character's Unicode compatibility decomposition
# instead of pairing the two ranges positionally.
HALFWIDTH_KATAKANA_TO_FULLWIDTH = str.maketrans(
    {
        chr(half): unicodedata.normalize("NFKC", chr(half))
        for half in range(0xFF61, 0xFFA0)
    }
)

# Fullwidth digits (U+FF10..U+FF19) -> ASCII digits.
FULLWIDTH_DIGITS_TO_HALFWIDTH = str.maketrans(
    {
        chr(full): chr(half)
        for full, half in zip(range(0xFF10, 0xFF1A), range(0x30, 0x3A))
    }
)

# Matches any single character outside: hiragana, katakana, CJK unified
# ideographs (+ extension A), U+3005, ASCII letters and digits, and the
# punctuation U+3001 U+3002 ! ? U+2026 U+266A U+2661 U+25CB.
# normalize() does not apply this pattern itself.
INVALID_PATTERN = re.compile(
    r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF\u3005"
    r"\u0041-\u005A\u0061-\u007A"
    r"\u0030-\u0039"
    r"\u3001\u3002!?\u2026\u266a\u2661\u25cb]"
)

# Kana whose runs of three or more are shortened to two by normalize():
# small tsu, the five vowels, n, and the small vowels / ya-yu-yo, in both
# hiragana and katakana.
_REPEATABLE_KANA = (
    "\u30c3\u3063"
    "\u3042\u3044\u3046\u3048\u304a\u3093"
    "\u3041\u3043\u3045\u3047\u3049\u3083\u3085\u3087"
    "\u30a2\u30a4\u30a6\u30a8\u30aa\u30f3"
    "\u30a1\u30a3\u30a5\u30a7\u30a9\u30e3\u30e5\u30e7"
)

# Compiled once at import time instead of on every normalize() call.
_HALFWIDTH_KATAKANA_RUN = re.compile(r"[\uFF61-\uFF9F]+")
_ELLIPSIS_RUN = re.compile("\u2026{2,}")
_LONG_VOWEL_RUN = re.compile("\u30fc{2,}")
_MARK_RUN = re.compile("[!?\u266a\u2661]{2,}")
_REPEATED_KANA_RUN = re.compile(f"([{_REPEATABLE_KANA}])\\1{{2,}}")


def _widen_katakana(match: "re.Match") -> str:
    """Convert one run of halfwidth katakana to fullwidth.

    NFKC is applied to the whole run so that a halfwidth (semi-)voiced sound
    mark composes with the kana before it (e.g. U+FF76 U+FF9E -> U+30AC).
    """
    return unicodedata.normalize("NFKC", match.group(0))


def _collapse_mark_run(match: "re.Match") -> str:
    """Shrink a run of ! ? U+266A U+2661 to one or two characters.

    A run of one repeated mark becomes that single mark; a mixed run becomes
    its first character followed by its last character.
    """
    run = match.group(0)
    return run[0] if len(set(run)) == 1 else run[0] + run[-1]


def normalize(text: str) -> str:
    """Return *text* with its characters canonicalised.

    Steps, in order:

    1. Apply every ``REPLACE_MAP`` regex substitution.
    2. Convert fullwidth Latin letters and digits to ASCII.
    3. Convert halfwidth katakana to fullwidth, composing voiced marks.
    4. Collapse runs of U+2026 (ellipsis) and U+30FC (prolonged sound mark)
       to a single character.
    5. Collapse runs of ``! ? U+266A U+2661`` (see ``_collapse_mark_run``).
    6. Shorten three or more repeats of the same vowel-like kana to two.

    Characters matched by ``INVALID_PATTERN`` are not removed here.
    """
    for pattern, replacement in REPLACE_MAP.items():
        text = re.sub(pattern, replacement, text)

    text = text.translate(FULLWIDTH_ALPHA_TO_HALFWIDTH)
    text = text.translate(FULLWIDTH_DIGITS_TO_HALFWIDTH)
    text = _HALFWIDTH_KATAKANA_RUN.sub(_widen_katakana, text)

    text = _ELLIPSIS_RUN.sub("\u2026", text)
    text = _LONG_VOWEL_RUN.sub("\u30fc", text)
    text = _MARK_RUN.sub(_collapse_mark_run, text)

    return _REPEATED_KANA_RUN.sub(r"\1\1", text)
|
93 |
+
|
94 |
def ids_to_speech_tokens(speech_ids):
|
95 |
|
96 |
speech_tokens_str = []
|
|
|
119 |
if len(target_text) > 300:
|
120 |
gr.Warning("Text is too long. Please keep it under 300 characters.")
|
121 |
target_text = target_text[:300]
|
122 |
+
target_text = normalize(target_text)
|
123 |
with torch.no_grad():
|
124 |
if sample_audio_path:
|
125 |
progress(0, 'Loading and trimming audio...')
|