higgs_audio-Enhanced

Running on Zero

App Files Files Community

ginipick committed on Jul 25

Commit

ab87e84

verified ·

1 Parent(s): 355d056

Update app.py

Browse files

Files changed (1) hide show

app.py +504 -137

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 """
 Gradio UI for Text-to-Speech using HiggsAudioServeEngine
 """
 import argparse
@@ -23,6 +24,7 @@ from higgs_audio.data_types import ChatMLSample, AudioContent, Message
 # Global engine instance
 engine = None
 # Default model configuration
 DEFAULT_MODEL_PATH = "bosonai/higgs-audio-v2-generation-3B-base"
@@ -43,12 +45,16 @@ PREDEFINED_EXAMPLES = {
     "voice-clone": {
         "system_prompt": "",
         "input_text": "Hey there! I'm your friendly voice twin in the making. Pick a voice preset below or upload your own audio - let's clone some vocals and bring your voice to life! ",
-        "description": "Voice clone to clone the reference audio. Leave the system prompt empty.",
     },
     "smart-voice": {
         "system_prompt": DEFAULT_SYSTEM_PROMPT,
         "input_text": "The sun rises in the east and sets in the west. This simple fact has been observed by humans for thousands of years.",
-        "description": "Smart voice to generate speech based on the context",
     },
     "multispeaker-voice-description": {
         "system_prompt": "You are an AI assistant designed to convert text into speech.\n"
@@ -62,7 +68,9 @@ PREDEFINED_EXAMPLES = {
         "[SPEAKER1] Oh, come on! It wasn't a big deal, and I knew you would overreact like this.\n"
         "[SPEAKER0] Overreact? You made a decision that affects both of us without even considering my opinion!\n"
         "[SPEAKER1] Because I didn't have time to sit around waiting for you to make up your mind! Someone had to act.",
-        "description": "Multispeaker with different voice descriptions in the system prompt",
     },
     "single-speaker-voice-description": {
         "system_prompt": "Generate audio following instruction.\n\n"
@@ -74,7 +82,9 @@ PREDEFINED_EXAMPLES = {
         "And let's be honest, if you've been even remotely connected to tech, AI, or machine learning lately, you know that deep learning is everywhere.\n"
         "\n"
         "So here's the big question: Do you want to understand how deep learning works?\n",
-        "description": "Single speaker with voice description in the system prompt",
     },
     "single-speaker-zh": {
         "system_prompt": "Generate audio following instruction.\n\n"
@@ -85,12 +95,16 @@ PREDEFINED_EXAMPLES = {
         "今天我们要聊的是一个你绝对不能忽视的话题: 多模态学习.\n"
         "那么, 问题来了, 你真的了解多模态吗? 你知道如何自己动手构建多模态大模型吗.\n"
         "或者说, 你能察觉到我其实是个机器人吗?",
-        "description": "Single speaker speaking Chinese",
     },
     "single-speaker-bgm": {
         "system_prompt": DEFAULT_SYSTEM_PROMPT,
         "input_text": "[music start] I will remember this, thought Ender, when I am defeated. To keep dignity, and give honor where it's due, so that defeat is not disgrace. And I hope I don't have to do it often. [music end]",
-        "description": "Single speaker with BGM using music tag. This is an experimental feature and you may need to try multiple times to get the best result.",
     },
 }
@@ -110,10 +124,14 @@ def get_current_device():
 def load_voice_presets():
     """Load the voice presets from the voice_examples directory."""
     try:
-        with open(
-            os.path.join(os.path.dirname(__file__), "voice_examples", "config.json"),
-            "r",
-        ) as f:
             voice_dict = json.load(f)
         voice_presets = {k: v["transcript"] for k, v in voice_dict.items()}
         voice_presets["EMPTY"] = "No reference voice"
@@ -156,10 +174,10 @@ def normalize_chinese_punctuation(text):
         "】": "]",  # right square bracket
         "《": "<",  # left angle quote
         "》": ">",  # right angle quote
-        "“": '"',  # left double quotation
-        "”": '"',  # right double quotation
-        "‘": "'",  # left single quotation
-        "’": "'",  # right single quotation
         "、": ",",  # enumeration comma
         "—": "-",  # em dash
         "…": "...",  # ellipsis
@@ -210,11 +228,14 @@ def normalize_text(transcript: str):
     return transcript
-@spaces.GPU
 def initialize_engine(model_path, audio_tokenizer_path) -> bool:
     """Initialize the HiggsAudioServeEngine."""
     global engine
     try:
         logger.info(f"Initializing engine with model: {model_path} and audio tokenizer: {audio_tokenizer_path}")
         engine = HiggsAudioServeEngine(
             model_name_or_path=model_path,
@@ -305,7 +326,8 @@ def text_to_speech(
     global engine
     if engine is None:
-        initialize_engine(DEFAULT_MODEL_PATH, DEFAULT_AUDIO_TOKENIZER_PATH)
     try:
         # Prepare ChatML sample
@@ -360,11 +382,23 @@ def text_to_speech(
         return f"❌ {error_msg}", None
 def create_ui():
-    my_theme = gr.Theme.load("theme.json")
-    # Add custom CSS to disable focus highlighting on textboxes
     custom_css = """
     .gradio-container input:focus,
     .gradio-container textarea:focus,
     .gradio-container select:focus,
@@ -380,113 +414,340 @@ def create_ui():
         background-color: var(--input-background-fill) !important;
     }
-    /* Override any hover effects as well */
-    .gradio-container input:hover,
-    .gradio-container textarea:hover,
-    .gradio-container select:hover,
-    .gradio-container .gr-input:hover,
-    .gradio-container .gr-textarea:hover,
-    .gradio-container .gr-textbox:hover {
-        border-color: var(--border-color-primary) !important;
-        background-color: var(--input-background-fill) !important;
     }
-    /* Style for checked checkbox */
-    .gradio-container input[type="checkbox"]:checked {
-        background-color: var(--primary-500) !important;
-        border-color: var(--primary-500) !important;
     }
     """
     default_template = "smart-voice"
-    """Create the Gradio UI."""
-    with gr.Blocks(theme=my_theme, css=custom_css) as demo:
-        gr.Markdown("# Higgs Audio Text-to-Speech Playground")
         # Main UI section
         with gr.Row():
             with gr.Column(scale=2):
-                # Template selection dropdown
                 template_dropdown = gr.Dropdown(
                     label="TTS Template",
                     choices=list(PREDEFINED_EXAMPLES.keys()),
                     value=default_template,
-                    info="Select a predefined example for system and input messages.",
                 )
-                # Template description display
                 template_description = gr.HTML(
-                    value=f'<p style="font-size: 0.85em; color: var(--body-text-color-subdued); margin: 0; padding: 0;"> {PREDEFINED_EXAMPLES[default_template]["description"]}</p>',
                     visible=True,
                 )
-                system_prompt = gr.TextArea(
-                    label="System Prompt",
-                    placeholder="Enter system prompt to guide the model...",
-                    value=PREDEFINED_EXAMPLES[default_template]["system_prompt"],
-                    lines=2,
-                )
-                input_text = gr.TextArea(
-                    label="Input Text",
-                    placeholder="Type the text you want to convert to speech...",
-                    value=PREDEFINED_EXAMPLES[default_template]["input_text"],
-                    lines=5,
-                )
-                voice_preset = gr.Dropdown(
-                    label="Voice Preset",
-                    choices=list(VOICE_PRESETS.keys()),
-                    value="EMPTY",
-                    interactive=False,  # Disabled by default since default template is not voice-clone
-                    visible=False,
-                )
-                with gr.Accordion(
-                    "Custom Reference (Optional)", open=False, visible=False
-                ) as custom_reference_accordion:
-                    reference_audio = gr.Audio(label="Reference Audio", type="filepath")
-                    reference_text = gr.TextArea(
-                        label="Reference Text (transcript of the reference audio)",
-                        placeholder="Enter the transcript of your reference audio...",
                         lines=3,
                     )
-                with gr.Accordion("Advanced Parameters", open=False):
-                    max_completion_tokens = gr.Slider(
-                        minimum=128,
-                        maximum=4096,
-                        value=1024,
-                        step=10,
-                        label="Max Completion Tokens",
-                    )
-                    temperature = gr.Slider(
-                        minimum=0.0,
-                        maximum=1.5,
-                        value=1.0,
-                        step=0.1,
-                        label="Temperature",
-                    )
-                    top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top P")
-                    top_k = gr.Slider(minimum=-1, maximum=100, value=50, step=1, label="Top K")
-                    ras_win_len = gr.Slider(
-                        minimum=0,
-                        maximum=10,
-                        value=7,
-                        step=1,
-                        label="RAS Window Length",
-                        info="Window length for repetition avoidance sampling",
                     )
-                    ras_win_max_num_repeat = gr.Slider(
-                        minimum=1,
-                        maximum=10,
-                        value=2,
-                        step=1,
-                        label="RAS Max Num Repeat",
-                        info="Maximum number of repetitions allowed in the window",
                     )
-                    # Add stop strings component
                     stop_strings = gr.Dataframe(
                         label="Stop Strings",
                         headers=["stops"],
@@ -494,32 +755,93 @@ def create_ui():
                         value=[[s] for s in DEFAULT_STOP_STRINGS],
                         interactive=True,
                         col_count=(1, "fixed"),
                     )
-                submit_btn = gr.Button("Generate Speech", variant="primary", scale=1)
             with gr.Column(scale=2):
-                output_text = gr.TextArea(label="Model Response", lines=2)
-                # Audio output
-                output_audio = gr.Audio(label="Generated Audio", interactive=False, autoplay=True)
-                stop_btn = gr.Button("Stop Playback", variant="primary")
-        # Example voice
         with gr.Row(visible=False) as voice_samples_section:
             voice_samples_table = gr.Dataframe(
                 headers=["Voice Preset", "Sample Text"],
                 datatype=["str", "str"],
                 value=[[preset, text] for preset, text in VOICE_PRESETS.items() if preset != "EMPTY"],
                 interactive=False,
             )
-            sample_audio = gr.Audio(label="Voice Sample")
         # Function to play voice sample when clicking on a row
         def play_voice_sample(evt: gr.SelectData):
             try:
-                # Get the preset name from the clicked row
                 preset_names = [preset for preset in VOICE_PRESETS.keys() if preset != "EMPTY"]
                 if evt.index[0] < len(preset_names):
                     preset = preset_names[evt.index[0]]
@@ -537,43 +859,67 @@ def create_ui():
                 gr.Error(f"Error playing voice sample: {e}")
                 return None
-        voice_samples_table.select(fn=play_voice_sample, outputs=[sample_audio])
         # Function to handle template selection
         def apply_template(template_name):
             if template_name in PREDEFINED_EXAMPLES:
                 template = PREDEFINED_EXAMPLES[template_name]
-                # Enable voice preset and custom reference only for voice-clone template
                 is_voice_clone = template_name == "voice-clone"
                 voice_preset_value = "belinda" if is_voice_clone else "EMPTY"
-                # Set ras_win_len to 0 for single-speaker-bgm, 7 for others
                 ras_win_len_value = 0 if template_name == "single-speaker-bgm" else 7
-                description_text = f'<p style="font-size: 0.85em; color: var(--body-text-color-subdued); margin: 0; padding: 0;"> {template["description"]}</p>'
                 return (
                     template["system_prompt"],  # system_prompt
                     template["input_text"],  # input_text
-                    description_text,  # template_description
                     gr.update(
-                        value=voice_preset_value, interactive=is_voice_clone, visible=is_voice_clone
-                    ),  # voice_preset (value and interactivity)
-                    gr.update(visible=is_voice_clone),  # custom reference accordion visibility
-                    gr.update(visible=is_voice_clone),  # voice samples section visibility
                     ras_win_len_value,  # ras_win_len
                 )
             else:
-                return (
-                    gr.update(),
-                    gr.update(),
-                    gr.update(),
-                    gr.update(),
-                    gr.update(),
-                    gr.update(),
-                    gr.update(),
-                )  # No change if template not found
         # Set up event handlers
-        # Connect template dropdown to handler
         template_dropdown.change(
             fn=apply_template,
             inputs=[template_dropdown],
@@ -585,12 +931,20 @@ def create_ui():
                 custom_reference_accordion,
                 voice_samples_section,
                 ras_win_len,
             ],
         )
-        # Connect submit button to the TTS function
         submit_btn.click(
-            fn=text_to_speech,
             inputs=[
                 input_text,
                 voice_preset,
@@ -605,7 +959,7 @@ def create_ui():
                 ras_win_len,
                 ras_win_max_num_repeat,
             ],
-            outputs=[output_text, output_audio],
             api_name="generate_speech",
         )
@@ -617,12 +971,20 @@ def create_ui():
             js="() => {const audio = document.querySelector('audio'); if(audio) audio.pause(); return null;}",
         )
     return demo
 def main():
     """Main function to parse arguments and launch the UI."""
-    global DEFAULT_MODEL_PATH, DEFAULT_AUDIO_TOKENIZER_PATH, VOICE_PRESETS
     parser = argparse.ArgumentParser(description="Gradio UI for Text-to-Speech using HiggsAudioServeEngine")
     parser.add_argument(
@@ -637,13 +999,18 @@ def main():
     args = parser.parse_args()
-    # Update default values if provided via command line
-    VOICE_PRESETS = load_voice_presets()
     # Create and launch the UI
     demo = create_ui()
-    demo.launch(server_name=args.host, server_port=args.port)
 if __name__ == "__main__":
-    main()

 """
 Gradio UI for Text-to-Speech using HiggsAudioServeEngine
+Enhanced with visual improvements and better user experience
 """
 import argparse
 # Global engine instance
 engine = None
+VOICE_PRESETS = {}
 # Default model configuration
 DEFAULT_MODEL_PATH = "bosonai/higgs-audio-v2-generation-3B-base"
     "voice-clone": {
         "system_prompt": "",
         "input_text": "Hey there! I'm your friendly voice twin in the making. Pick a voice preset below or upload your own audio - let's clone some vocals and bring your voice to life! ",
+        "description": "🎭 <b>Voice Clone</b> - Clone any voice with reference audio. Leave the system prompt empty for best results.",
+        "icon": "🎭",
+        "color": "#FF6B6B"
     },
     "smart-voice": {
         "system_prompt": DEFAULT_SYSTEM_PROMPT,
         "input_text": "The sun rises in the east and sets in the west. This simple fact has been observed by humans for thousands of years.",
+        "description": "🧠 <b>Smart Voice</b> - Generate natural speech based on context",
+        "icon": "🧠",
+        "color": "#4ECDC4"
     },
     "multispeaker-voice-description": {
         "system_prompt": "You are an AI assistant designed to convert text into speech.\n"
         "[SPEAKER1] Oh, come on! It wasn't a big deal, and I knew you would overreact like this.\n"
         "[SPEAKER0] Overreact? You made a decision that affects both of us without even considering my opinion!\n"
         "[SPEAKER1] Because I didn't have time to sit around waiting for you to make up your mind! Someone had to act.",
+        "description": "👥 <b>Multi-Speaker</b> - Different voices for dialogue and conversations",
+        "icon": "👥",
+        "color": "#95E1D3"
     },
     "single-speaker-voice-description": {
         "system_prompt": "Generate audio following instruction.\n\n"
         "And let's be honest, if you've been even remotely connected to tech, AI, or machine learning lately, you know that deep learning is everywhere.\n"
         "\n"
         "So here's the big question: Do you want to understand how deep learning works?\n",
+        "description": "🎙️ <b>Voice Description</b> - Generate speech with specific voice characteristics",
+        "icon": "🎙️",
+        "color": "#F38181"
     },
     "single-speaker-zh": {
         "system_prompt": "Generate audio following instruction.\n\n"
         "今天我们要聊的是一个你绝对不能忽视的话题: 多模态学习.\n"
         "那么, 问题来了, 你真的了解多模态吗? 你知道如何自己动手构建多模态大模型吗.\n"
         "或者说, 你能察觉到我其实是个机器人吗?",
+        "description": "🇨🇳 <b>Chinese Speech</b> - Generate natural Chinese speech",
+        "icon": "🇨🇳",
+        "color": "#AA96DA"
     },
     "single-speaker-bgm": {
         "system_prompt": DEFAULT_SYSTEM_PROMPT,
         "input_text": "[music start] I will remember this, thought Ender, when I am defeated. To keep dignity, and give honor where it's due, so that defeat is not disgrace. And I hope I don't have to do it often. [music end]",
+        "description": "🎵 <b>Speech with BGM</b> - Add background music to your speech (experimental)",
+        "icon": "🎵",
+        "color": "#FCBAD3"
     },
 }
 def load_voice_presets():
     """Load the voice presets from the voice_examples directory."""
     try:
+        config_path = os.path.join(os.path.dirname(__file__), "voice_examples", "config.json")
+        # Check if directory exists
+        if not os.path.exists(os.path.dirname(config_path)):
+            logger.warning("Voice examples directory not found")
+            return {"EMPTY": "No reference voice"}
+        with open(config_path, "r") as f:
             voice_dict = json.load(f)
         voice_presets = {k: v["transcript"] for k, v in voice_dict.items()}
         voice_presets["EMPTY"] = "No reference voice"
         "】": "]",  # right square bracket
         "《": "<",  # left angle quote
         "》": ">",  # right angle quote
+        "“": '"',  # left double quotation
+        "”": '"',  # right double quotation
+        "‘": "'",  # left single quotation
+        "’": "'",  # right single quotation
         "、": ",",  # enumeration comma
         "—": "-",  # em dash
         "…": "...",  # ellipsis
     return transcript
 def initialize_engine(model_path, audio_tokenizer_path) -> bool:
     """Initialize the HiggsAudioServeEngine."""
     global engine
     try:
+        if engine is not None:
+            logger.info("Engine already initialized")
+            return True
         logger.info(f"Initializing engine with model: {model_path} and audio tokenizer: {audio_tokenizer_path}")
         engine = HiggsAudioServeEngine(
             model_name_or_path=model_path,
     global engine
     if engine is None:
+        if not initialize_engine(DEFAULT_MODEL_PATH, DEFAULT_AUDIO_TOKENIZER_PATH):
+            return "❌ Failed to initialize engine", None
     try:
         # Prepare ChatML sample
         return f"❌ {error_msg}", None
+def initialize_globals():
+    """Load the voice presets into the module-level VOICE_PRESETS via load_voice_presets()."""
+    global VOICE_PRESETS
+    VOICE_PRESETS = load_voice_presets()
 def create_ui():
+    # Try to load theme
+    try:
+        my_theme = gr.Theme.load("theme.json")
+    except Exception as e:
+        logger.warning(f"Failed to load theme.json: {e}, using default theme")
+        my_theme = gr.themes.Default()
+    # Enhanced CSS with animations and visual improvements
     custom_css = """
+    /* Remove focus highlighting */
     .gradio-container input:focus,
     .gradio-container textarea:focus,
     .gradio-container select:focus,
         background-color: var(--input-background-fill) !important;
     }
+    /* Gradient background */
+    .gradio-container {
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+        min-height: 100vh;
+    }
+    /* Main container styling */
+    .container {
+        backdrop-filter: blur(10px);
+        background: rgba(255, 255, 255, 0.95);
+        border-radius: 20px;
+        box-shadow: 0 8px 32px 0 rgba(31, 38, 135, 0.37);
+    }
+    /* Header styling */
+    .header-container {
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+        padding: 2rem;
+        border-radius: 15px;
+        margin-bottom: 2rem;
+        box-shadow: 0 4px 15px rgba(0, 0, 0, 0.1);
+    }
+    .header-title {
+        color: white;
+        font-size: 2.5rem;
+        font-weight: bold;
+        text-align: center;
+        margin: 0;
+        text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.2);
+    }
+    .header-subtitle {
+        color: rgba(255, 255, 255, 0.9);
+        text-align: center;
+        margin-top: 0.5rem;
+        font-size: 1.1rem;
+    }
+    /* Template cards */
+    .template-card {
+        background: white;
+        border-radius: 12px;
+        padding: 1.5rem;
+        margin: 0.5rem;
+        border: 2px solid transparent;
+        transition: all 0.3s ease;
+        cursor: pointer;
+        box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1);
+    }
+    .template-card:hover {
+        transform: translateY(-3px);
+        box-shadow: 0 4px 20px rgba(0, 0, 0, 0.15);
+        border-color: var(--primary-500);
+    }
+    .template-card.selected {
+        border-color: var(--primary-500);
+        background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
+    }
+    .template-icon {
+        font-size: 2rem;
+        margin-bottom: 0.5rem;
+    }
+    /* Voice preset cards */
+    .voice-card {
+        background: white;
+        border-radius: 10px;
+        padding: 1rem;
+        margin: 0.5rem;
+        border: 2px solid #e0e0e0;
+        transition: all 0.3s ease;
+        cursor: pointer;
+        text-align: center;
+    }
+    .voice-card:hover {
+        border-color: var(--primary-500);
+        transform: scale(1.05);
+        box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
+    }
+    .voice-card.selected {
+        border-color: var(--primary-500);
+        background: #f0f8ff;
+    }
+    /* Generate button animation */
+    .generate-btn {
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+        color: white;
+        font-size: 1.2rem;
+        font-weight: bold;
+        padding: 0.8rem 2rem;
+        border-radius: 30px;
+        border: none;
+        cursor: pointer;
+        transition: all 0.3s ease;
+        box-shadow: 0 4px 15px rgba(102, 126, 234, 0.4);
+    }
+    .generate-btn:hover {
+        transform: translateY(-2px);
+        box-shadow: 0 6px 20px rgba(102, 126, 234, 0.6);
+    }
+    .generate-btn:active {
+        transform: translateY(0);
+    }
+    /* Audio player styling */
+    .audio-container {
+        background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
+        padding: 2rem;
+        border-radius: 15px;
+        box-shadow: 0 4px 15px rgba(0, 0, 0, 0.1);
+    }
+    /* Progress indicator */
+    .progress-bar {
+        height: 4px;
+        background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
+        border-radius: 2px;
+        animation: progress 2s ease-in-out infinite;
     }
+    @keyframes progress {
+        0% { transform: translateX(-100%); }
+        100% { transform: translateX(100%); }
+    }
+    /* Accordion styling */
+    .gr-accordion {
+        background: white;
+        border-radius: 10px;
+        border: 1px solid #e0e0e0;
+        margin-top: 1rem;
+    }
+    /* Info cards */
+    .info-card {
+        background: #f8f9fa;
+        border-left: 4px solid var(--primary-500);
+        padding: 1rem;
+        margin: 1rem 0;
+        border-radius: 5px;
+    }
+    /* Tooltips */
+    .tooltip {
+        position: relative;
+        display: inline-block;
+        border-bottom: 1px dotted black;
+    }
+    .tooltip .tooltiptext {
+        visibility: hidden;
+        width: 200px;
+        background-color: #555;
+        color: #fff;
+        text-align: center;
+        border-radius: 6px;
+        padding: 5px;
+        position: absolute;
+        z-index: 1;
+        bottom: 125%;
+        left: 50%;
+        margin-left: -100px;
+        opacity: 0;
+        transition: opacity 0.3s;
+    }
+    .tooltip:hover .tooltiptext {
+        visibility: visible;
+        opacity: 1;
+    }
+    /* Responsive design */
+    @media (max-width: 768px) {
+        .header-title {
+            font-size: 2rem;
+        }
+        .template-card {
+            margin: 0.25rem;
+            padding: 1rem;
+        }
     }
     """
     default_template = "smart-voice"
+    """Create the enhanced Gradio UI."""
+    with gr.Blocks(theme=my_theme, css=custom_css, title="Higgs Audio TTS") as demo:
+        # Header with gradient background
+        gr.HTML("""
+            <div class="header-container">
+                <h1 class="header-title">🎙️ Higgs Audio Text-to-Speech</h1>
+                <p class="header-subtitle">Transform your text into natural, expressive speech with AI</p>
+            </div>
+        """)
         # Main UI section
         with gr.Row():
             with gr.Column(scale=2):
+                # Template selection with visual cards
+                gr.Markdown("### 🎯 Choose Your Template")
                 template_dropdown = gr.Dropdown(
                     label="TTS Template",
                     choices=list(PREDEFINED_EXAMPLES.keys()),
                     value=default_template,
+                    info="Select a predefined template to get started quickly",
+                    elem_classes=["template-selector"]
                 )
+                # Template description with enhanced styling
                 template_description = gr.HTML(
+                    value=f'<div class="info-card">{PREDEFINED_EXAMPLES[default_template]["description"]}</div>',
                     visible=True,
                 )
+                # System prompt with better styling
+                with gr.Group():
+                    gr.Markdown("### 🔧 System Configuration")
+                    system_prompt = gr.TextArea(
+                        label="System Prompt",
+                        placeholder="Enter system prompt to guide the model...",
+                        value=PREDEFINED_EXAMPLES[default_template]["system_prompt"],
                         lines=3,
+                        elem_classes=["system-prompt"]
                     )
+                # Input text with character counter
+                with gr.Group():
+                    gr.Markdown("### ✍️ Your Text")
+                    input_text = gr.TextArea(
+                        label="Input Text",
+                        placeholder="Type the text you want to convert to speech...",
+                        value=PREDEFINED_EXAMPLES[default_template]["input_text"],
+                        lines=6,
+                        elem_classes=["input-text"]
                     )
+                    char_count = gr.Markdown(f"Character count: {len(PREDEFINED_EXAMPLES[default_template]['input_text'])}")
+                # Voice selection section
+                with gr.Group(visible=False) as voice_section:
+                    gr.Markdown("### 🎭 Voice Selection")
+                    voice_preset = gr.Dropdown(
+                        label="Voice Preset",
+                        choices=list(VOICE_PRESETS.keys()),
+                        value="EMPTY",
+                        interactive=False,
+                        visible=False,
+                        elem_classes=["voice-preset"]
                     )
+                    with gr.Accordion(
+                        "🎤 Custom Reference Audio", open=False, visible=False
+                    ) as custom_reference_accordion:
+                        reference_audio = gr.Audio(
+                            label="Upload Reference Audio",
+                            type="filepath",
+                            elem_classes=["reference-audio"]
+                        )
+                        reference_text = gr.TextArea(
+                            label="Reference Text (transcript of the reference audio)",
+                            placeholder="Enter the transcript of your reference audio for better voice cloning...",
+                            lines=3,
+                            elem_classes=["reference-text"]
+                        )
+                # Advanced parameters with better organization
+                with gr.Accordion("⚙️ Advanced Parameters", open=False):
+                    with gr.Row():
+                        with gr.Column():
+                            max_completion_tokens = gr.Slider(
+                                minimum=128,
+                                maximum=4096,
+                                value=1024,
+                                step=10,
+                                label="Max Completion Tokens",
+                                info="Maximum number of tokens to generate"
+                            )
+                            temperature = gr.Slider(
+                                minimum=0.0,
+                                maximum=1.5,
+                                value=1.0,
+                                step=0.1,
+                                label="Temperature",
+                                info="Controls randomness in generation"
+                            )
+                        with gr.Column():
+                            top_p = gr.Slider(
+                                minimum=0.1,
+                                maximum=1.0,
+                                value=0.95,
+                                step=0.05,
+                                label="Top P",
+                                info="Nucleus sampling parameter"
+                            )
+                            top_k = gr.Slider(
+                                minimum=-1,
+                                maximum=100,
+                                value=50,
+                                step=1,
+                                label="Top K",
+                                info="Top-k sampling parameter (-1 to disable)"
+                            )
+                    with gr.Row():
+                        with gr.Column():
+                            ras_win_len = gr.Slider(
+                                minimum=0,
+                                maximum=10,
+                                value=7,
+                                step=1,
+                                label="RAS Window Length",
+                                info="Window length for repetition avoidance sampling"
+                            )
+                        with gr.Column():
+                            ras_win_max_num_repeat = gr.Slider(
+                                minimum=1,
+                                maximum=10,
+                                value=2,
+                                step=1,
+                                label="RAS Max Num Repeat",
+                                info="Maximum repetitions allowed in the window"
+                            )
+                    # Stop strings with better UI
+                    gr.Markdown("#### Stop Strings")
                     stop_strings = gr.Dataframe(
                         label="Stop Strings",
                         headers=["stops"],
                         value=[[s] for s in DEFAULT_STOP_STRINGS],
                         interactive=True,
                         col_count=(1, "fixed"),
+                        elem_classes=["stop-strings"]
                     )
+                # Generate button with enhanced styling
+                with gr.Row():
+                    submit_btn = gr.Button(
+                        "🚀 Generate Speech",
+                        variant="primary",
+                        scale=1,
+                        elem_classes=["generate-btn"]
+                    )
+            # Output column with better organization
             with gr.Column(scale=2):
+                # Status and progress section
+                with gr.Group():
+                    gr.Markdown("### 📊 Generation Status")
+                    status_text = gr.Markdown("Ready to generate speech...", elem_classes=["status-text"])
+                # Model response section
+                with gr.Group():
+                    gr.Markdown("### 💬 Model Response")
+                    output_text = gr.TextArea(
+                        label="Generated Text Output",
+                        lines=3,
+                        interactive=False,
+                        elem_classes=["output-text"]
+                    )
+                # Audio output with enhanced player
+                with gr.Group():
+                    gr.Markdown("### 🎵 Generated Audio")
+                    output_audio = gr.Audio(
+                        label="Audio Player",
+                        interactive=False,
+                        autoplay=True,
+                        elem_classes=["audio-container"]
+                    )
+                    with gr.Row():
+                        stop_btn = gr.Button(
+                            "⏹️ Stop Playback",
+                            variant="secondary",
+                            elem_classes=["stop-btn"]
+                        )
+                        download_btn = gr.Button(
+                            "💾 Download Audio",
+                            variant="secondary",
+                            elem_classes=["download-btn"],
+                            visible=False
+                        )
+                # Quick tips section
+                gr.Markdown("""
+                    <div class="info-card">
+                        <h4>💡 Quick Tips:</h4>
+                        <ul>
+                            <li>For voice cloning, upload a clear 10-30 second audio sample</li>
+                            <li>Use [music start] and [music end] tags for background music</li>
+                            <li>Add [SPEAKER0] and [SPEAKER1] tags for multi-speaker dialogue</li>
+                            <li>Experiment with temperature (0.8-1.2) for varied speech styles</li>
+                        </ul>
+                    </div>
+                """)
+        # Voice samples section with visual cards
         with gr.Row(visible=False) as voice_samples_section:
+            gr.Markdown("### 🎧 Voice Samples Library")
             voice_samples_table = gr.Dataframe(
                 headers=["Voice Preset", "Sample Text"],
                 datatype=["str", "str"],
                 value=[[preset, text] for preset, text in VOICE_PRESETS.items() if preset != "EMPTY"],
                 interactive=False,
+                elem_classes=["voice-samples-table"]
+            )
+            sample_audio = gr.Audio(
+                label="🔊 Preview Voice Sample",
+                elem_classes=["sample-audio"]
             )
+        # Function to update character count
+        def update_char_count(text):
             # Build the label for the character counter next to the input box:
             # the literal prefix "Character count: " followed by len(text).
             # NOTE(review): len(text) raises TypeError when text is None --
             # confirm the textbox never delivers None to this callback.
+            return f"Character count: {len(text)}"
         # Function to play voice sample when clicking on a row
         def play_voice_sample(evt: gr.SelectData):
             # Resolve the clicked row of the voice samples table to a preset name.
             # evt.index[0] (presumably the selected row -- confirm against
             # gr.SelectData) indexes the VOICE_PRESETS keys with the "EMPTY"
             # placeholder removed, the same filter used to build the table rows.
             # The error path returns None.
             try:
                 preset_names = [preset for preset in VOICE_PRESETS.keys() if preset != "EMPTY"]
                 if evt.index[0] < len(preset_names):
                     preset = preset_names[evt.index[0]]
                 # NOTE(review): gr.Error is only instantiated here, never raised,
                 # so Gradio will presumably not display this message -- confirm
                 # whether `raise gr.Error(...)` (or gr.Warning) was intended.
                 gr.Error(f"Error playing voice sample: {e}")
                 return None
         # Function to handle template selection
         def apply_template(template_name):
             # Translate a template selection into new values for nine UI outputs.
             # For a name found in PREDEFINED_EXAMPLES the returned tuple holds, in
             # order: system prompt, input text, description HTML, voice preset
             # update, custom reference accordion visibility, voice samples section
             # visibility, RAS window length, voice section visibility, and the
             # character-count label. Any other name leaves all nine untouched.
             if template_name in PREDEFINED_EXAMPLES:
                 template = PREDEFINED_EXAMPLES[template_name]
                 # Only the "voice-clone" template shows the voice-related controls
                 # and preselects the "belinda" preset; every other template resets
                 # the preset to "EMPTY" and hides those controls.
                 is_voice_clone = template_name == "voice-clone"
                 voice_preset_value = "belinda" if is_voice_clone else "EMPTY"
                 # "single-speaker-bgm" is the one template that sets the RAS window
                 # length to 0; all others use 7.
                 ras_win_len_value = 0 if template_name == "single-speaker-bgm" else 7
                 # The description is interpolated into the HTML wrapper as-is
                 # (no escaping is applied here).
+                description_html = f'<div class="info-card">{template["description"]}</div>'
                 return (
                     template["system_prompt"],  # system_prompt
                     template["input_text"],  # input_text
+                    description_html,  # template_description
                     gr.update(
+                        value=voice_preset_value,
+                        interactive=is_voice_clone,
+                        visible=is_voice_clone
+                    ),  # voice_preset
+                    gr.update(visible=is_voice_clone),  # custom reference accordion
+                    gr.update(visible=is_voice_clone),  # voice samples section
                     ras_win_len_value,  # ras_win_len
+                    gr.update(visible=is_voice_clone),  # voice_section
+                    update_char_count(template["input_text"]),  # char_count
                 )
             # Unknown template name: one no-op gr.update() per value in the tuple
             # above. The multiplier must stay equal to that tuple's length (9).
+            return (gr.update(),) * 9
+        # Enhanced text_to_speech wrapper with status updates
+        def text_to_speech_with_status(
+            text, voice_preset, reference_audio, reference_text,
+            max_completion_tokens, temperature, top_p, top_k,
+            system_prompt, stop_strings, ras_win_len, ras_win_max_num_repeat
+        ):
             # Generator wrapper around text_to_speech that also drives the status
             # display. Every argument is forwarded to text_to_speech unchanged and
             # in the same order. Each yield is a 4-tuple lined up with the click
             # handler's outputs: status message, generated text, audio, and a
             # visibility update for the download button.
             #
             # The first yield blanks the text and audio outputs and hides the
             # download button before the TTS call starts.
+            # Update status
+            yield "🔄 Initializing model...", None, None, gr.update(visible=False)
+            # Call the actual TTS function
+            result_text, audio_result = text_to_speech(
+                text, voice_preset, reference_audio, reference_text,
+                max_completion_tokens, temperature, top_p, top_k,
+                system_prompt, stop_strings, ras_win_len, ras_win_max_num_repeat
+            )
             # Success is decided solely by the truthiness of audio_result; the
             # download button is revealed only in that case. result_text is passed
             # through on both branches.
             # NOTE(review): there is no try/except here, so if text_to_speech raises
             # the final status is never yielded -- confirm it handles its own errors.
+            if audio_result:
+                status = "✅ Speech generated successfully!"
+                download_visible = True
             else:
+                status = "❌ Failed to generate speech"
+                download_visible = False
+            yield status, result_text, audio_result, gr.update(visible=download_visible)
         # Set up event handlers
+        # Character count update
+        input_text.change(
+            fn=update_char_count,
+            inputs=[input_text],
+            outputs=[char_count]
+        )
+        # Template selection
         template_dropdown.change(
             fn=apply_template,
             inputs=[template_dropdown],
                 custom_reference_accordion,
                 voice_samples_section,
                 ras_win_len,
+                voice_section,
+                char_count,
             ],
         )
+        # Voice sample preview
+        voice_samples_table.select(
+            fn=play_voice_sample,
+            outputs=[sample_audio]
+        )
+        # Generate button with status updates
         submit_btn.click(
+            fn=text_to_speech_with_status,
             inputs=[
                 input_text,
                 voice_preset,
                 ras_win_len,
                 ras_win_max_num_repeat,
             ],
+            outputs=[status_text, output_text, output_audio, download_btn],
             api_name="generate_speech",
         )
             js="() => {const audio = document.querySelector('audio'); if(audio) audio.pause(); return null;}",
         )
+        # Download button functionality
+        download_btn.click(
+            fn=lambda x: x,
+            inputs=[output_audio],
+            outputs=[],
+            js="(audio) => {if(audio) {const a = document.createElement('a'); a.href = audio.url; a.download = 'generated_speech.wav'; a.click();}}",
+        )
     return demo
 def main():
     """Main function to parse arguments and launch the UI."""
     # Flow: parse the command-line arguments, run initialize_globals(), build the
     # app with create_ui(), then serve it via demo.launch().
     # NOTE(review): the two names below are declared global, presumably so they
     # can be rebound from the parsed arguments -- confirm an assignment exists.
+    global DEFAULT_MODEL_PATH, DEFAULT_AUDIO_TOKENIZER_PATH
     parser = argparse.ArgumentParser(description="Gradio UI for Text-to-Speech using HiggsAudioServeEngine")
     parser.add_argument(
     args = parser.parse_args()
+    # Initialize global variables
+    initialize_globals()
     # Create and launch the UI
     demo = create_ui()
     # Host and port come from the parsed arguments; share and show_error are
     # hard-coded to False and True respectively.
+    demo.launch(
+        server_name=args.host,
+        server_port=args.port,
+        share=False,
+        show_error=True
+    )
 if __name__ == "__main__":
     # Start the UI only when this file is executed as a script, not on import.
+    main()