File size: 4,186 Bytes
c35a6d8
 
130885f
 
8c99301
c35a6d8
8c99301
c35a6d8
 
8c99301
 
 
 
 
 
 
 
c35a6d8
 
 
8c99301
130885f
c35a6d8
8c99301
130885f
 
c35a6d8
8c99301
 
 
 
 
 
804826d
8c99301
 
 
 
804826d
8c99301
 
 
 
804826d
8c99301
 
 
 
 
804826d
8c99301
c35a6d8
130885f
8c99301
 
 
130885f
8c99301
130885f
8c99301
c35a6d8
8c99301
130885f
 
 
 
 
c35a6d8
8c99301
130885f
 
c35a6d8
8c99301
130885f
 
c35a6d8
8c99301
 
 
 
 
 
 
 
 
c35a6d8
 
8c99301
 
 
 
c35a6d8
 
 
 
 
8c99301
 
c35a6d8
8c99301
 
 
 
c35a6d8
8c99301
 
c35a6d8
 
 
 
8c99301
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import gradio as gr
from TTS.api import TTS
import torch
import os
from pydub import AudioSegment

# Custom stylesheet handed to gr.Blocks(css=...) below.
#   #warning  - element id applied via gr.update(elem_id=...) by change_color
#               and show_error; gives the component a light-red background.
#   #alert    - element id applied by change_color and clear_color; gives the
#               component a white background.
#   .feedback - class applied to the status box by show_error; renders its
#               textarea text large, bold, centred and dark red.
css = """
#warning {background-color: #FFCCCB !important}
.feedback label textarea {
    height: auto !important;
    font-size: 22px !important;
    font-weight: 800 !important;
    text-align: center !important;
    color: #801313 !important;
    padding: 0px !important
}
#alert {background-color: #fff !important}
"""

# Pick the compute device: CUDA when PyTorch reports one, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Instantiate the `your_tts` multilingual model once at import time and move
# it to the selected device; every request reuses this single instance.
tts = TTS(
    model_name="tts_models/multilingual/multi-dataset/your_tts",
    progress_bar=False,
    gpu=(device == "cuda"),
)
tts.to(device)

# Convert mp3 to wav
def convert_mp3_to_wav(mp3_path: str) -> str:
    """Decode an MP3 file and write it next to the original as a WAV file.

    Args:
        mp3_path: Path of the MP3 file to decode.

    Returns:
        Path of the written WAV file, i.e. ``mp3_path`` with its final
        extension replaced by ``.wav``.
    """
    # Swap only the trailing extension. A plain str.replace(".mp3", ".wav")
    # would also rewrite ".mp3" occurring inside a directory name, and would
    # leave e.g. "VOICE.MP3" unchanged so the export overwrote the source file.
    stem, _extension = os.path.splitext(mp3_path)
    wav_path = stem + ".wav"

    audio = AudioSegment.from_mp3(mp3_path)
    audio.export(wav_path, format="wav")
    return wav_path

# Voice cloning function
def text_to_speech(text: str, speaker_wav: str, speaker_wav_file: str):
    """Synthesize ``text`` using a reference recording as the speaker voice.

    Args:
        text: Text to speak. Surrounding whitespace is stripped and newlines
            become spaces. ``None`` is treated like an empty string.
        speaker_wav: Path of the microphone recording, or a falsy value when
            nothing was recorded.
        speaker_wav_file: Path of the uploaded audio file, or a falsy value.
            Takes precedence over ``speaker_wav`` when both are set.

    Returns:
        A ``(audio_path, status)`` tuple. On success ``audio_path`` is the
        generated WAV file and ``status`` is ``""``; on any failure
        ``audio_path`` is ``None`` and ``status`` holds the error message.
    """
    # A missing value must produce the normal "empty" message rather than an
    # AttributeError from calling .strip() on None.
    text = (text or "").strip().replace("\n", " ")
    speaker_audio = speaker_wav_file or speaker_wav

    if not text:
        return None, "⚠️ Error: Text input is empty."
    if not speaker_audio or not os.path.exists(speaker_audio):
        return None, "⚠️ Error: No valid speaker audio provided."

    if speaker_audio.endswith(".mp3"):
        try:
            speaker_audio = convert_mp3_to_wav(speaker_audio)
        except Exception as e:
            return None, f"⚠️ Error converting MP3 to WAV: {e}"

    output_path = "output.wav"

    # Drop the file left by a previous request; otherwise the existence check
    # below would succeed even if this run produced nothing.
    try:
        os.remove(output_path)
    except OSError:
        # Best effort: nothing to remove, or the file cannot be removed. The
        # synthesis call overwrites it in the normal case anyway.
        pass

    try:
        tts.tts_to_file(text=text, speaker_wav=speaker_audio, language="en", file_path=output_path)
        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
            return output_path, ""
        return None, "⚠️ Error: Audio was not generated."
    except Exception as e:
        return None, f"⚠️ Error during synthesis: {e}"

# Toggle mic/file input visibility
def toggle(choice: str):
    """Swap which speaker-audio input is shown and empty both of them.

    Args:
        choice: Selected radio value; ``"mic"`` selects the microphone input,
            anything else selects the file upload.

    Returns:
        Two ``gr.update`` payloads, microphone input first, file input
        second. Exactly one is made visible and both values are reset to
        ``None``.
    """
    use_mic = choice == "mic"
    mic_update = gr.update(visible=use_mic, value=None)
    file_update = gr.update(visible=not use_mic, value=None)
    return mic_update, file_update

# Change alert style based on input
def change_color(text_input: str):
    """Flag the text box when it is blank, or restore its normal look.

    Args:
        text_input: Current content of the text box. ``None`` and
            whitespace-only strings count as blank.

    Returns:
        A ``gr.update`` payload that sets the element id to ``"warning"`` and
        requests focus when the text is blank, or sets it to ``"alert"``
        without focus otherwise.
    """
    # text_to_speech strips the text before validating it, so apply the same
    # notion of "blank" here; this also avoids len(None) raising a TypeError.
    is_blank = not (text_input or "").strip()
    return gr.update(elem_id="warning" if is_blank else "alert", autofocus=is_blank)

# Reset fields
def clear_color(text_input: str, radio: str, error_box: str):
    """Build the updates that return the form to its initial appearance.

    The three arguments are the current component values supplied by the
    event binding; none of them is consulted.

    Returns:
        Three ``gr.update`` payloads, in order: the text box gets the
        ``"alert"`` element id, the radio is set to ``"mic"``, and the status
        box is hidden.
    """
    reset_textbox = gr.update(elem_id="alert")
    reset_radio = gr.update(value="mic")
    hide_status = gr.update(visible=False)
    return reset_textbox, reset_radio, hide_status

# Show error or success
def show_error(text: str):
    """Choose between showing the status box and showing the audio player.

    Args:
        text: Current content of the text box. ``None`` and whitespace-only
            strings count as blank.

    Returns:
        Two ``gr.update`` payloads. The first makes the status box visible
        only when the text is blank and tags it with the ``"warning"`` id and
        ``"feedback"`` class; the second makes the audio player visible only
        when the text is not blank.
    """
    # text_to_speech strips the text before rejecting it as empty. Comparing
    # against "" alone would hide the status box for whitespace-only input
    # even though synthesis reports an empty-text error.
    is_blank = not (text or "").strip()
    return (
        gr.update(visible=is_blank, elem_id="warning", elem_classes="feedback"),
        gr.update(visible=not is_blank),
    )

# Gradio UI: inputs and buttons in the left column, results in the right one.
with gr.Blocks(css=css) as demo:
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(label="Enter text to clone", value="", max_lines=4, lines=4)
            radio = gr.Radio(["mic", "file"], value="mic", label="Upload speaker audio")
            # Only one speaker-audio input is visible at a time; `toggle`
            # swaps them whenever the radio selection changes.
            audio_input_mic = gr.Audio(label="Use Microphone", sources="microphone", type="filepath", visible=True)
            audio_input_file = gr.Audio(label="Upload File (.wav/.mp3)", type="filepath", visible=False)

            with gr.Row():
                with gr.Column():
                    # The microphone recording is listed as well so that Clear
                    # resets every input, not just the uploaded file.
                    btn_clear = gr.ClearButton([text_input, radio, audio_input_mic, audio_input_file])
                with gr.Column():
                    btn = gr.Button("Generate Voice", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(label="Generated Voice", visible=True, autoplay=True, show_share_button=False)
            error_box = gr.Textbox(label="Status", value="Input box cannot be blank!!", visible=False, container=True)

    # Event bindings
    btn_clear.add(audio_output)
    synthesis = btn.click(
        text_to_speech,
        inputs=[text_input, audio_input_mic, audio_input_file],
        outputs=[audio_output, error_box],
    )
    btn.click(show_error, text_input, [error_box, audio_output])
    # show_error only un-hides the status box for blank text, yet
    # text_to_speech also reports missing speaker audio, conversion failures
    # and synthesis failures through that box. Once synthesis has finished,
    # reveal the box whenever it holds a message; an empty status (success)
    # leaves the visibility untouched.
    synthesis.then(
        lambda status: gr.update(visible=True) if status else gr.update(),
        error_box,
        error_box,
    )
    radio.change(toggle, radio, [audio_input_mic, audio_input_file])
    btn_clear.click(clear_color, [text_input, radio, error_box], [text_input, radio, error_box])
    btn.click(change_color, text_input, text_input)

# Launch the app
demo.launch()