# Spaces: Paused
import gradio as gr
import numpy as np
import torch
from transformers import AutoModelForTextToWaveform, AutoProcessor
# Load model and processor
# NOTE(review): confirm that this checkpoint ships a config/processor that the
# transformers Auto* classes below can load.
model_name = "hexgrad/Kokoro-82M"

# Choose the device first so the precision can follow it: half precision is
# only requested on GPU, since many CPU kernels do not support float16.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_dtype = torch.float16 if device == "cuda" else torch.float32

processor = AutoProcessor.from_pretrained(model_name)
model = AutoModelForTextToWaveform.from_pretrained(
    model_name,
    torch_dtype=model_dtype,
)
model = model.to(device)
def text_to_audio(text, speed=1.0):
    """Convert text to speech audio using the module-level Kokoro model.

    Args:
        text: Text to synthesize. Must contain at least one non-whitespace
            character.
        speed: Time-stretch rate applied to the generated audio. ``1.0``
            leaves the audio untouched; must be greater than zero.

    Returns:
        A ``(sample_rate, waveform)`` tuple, where ``waveform`` is a float32
        NumPy array holding the first item of the generated batch.

    Raises:
        gr.Error: If ``text`` is empty/blank or ``speed`` is not positive.
    """
    # Fail early with a user-visible message instead of sending unusable
    # input to the model or to librosa.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to convert to audio.")
    if speed <= 0:
        raise gr.Error("Speech speed must be greater than zero.")

    # Tokenize the text and move every tensor to the model's device
    inputs = processor(text=text, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Sampling parameters forwarded to model.generate()
    gen_kwargs = {
        "do_sample": True,
        "temperature": 0.7,
        "length_penalty": 1.0,
        "repetition_penalty": 2.0,
        "top_p": 0.9,
    }

    # Inference only: no gradients needed
    with torch.no_grad():
        generated = model.generate(**inputs, **gen_kwargs)

    # Keep the first item of the batch. The model may have been loaded in half
    # precision, so normalise the samples to float32 before they are handed
    # to librosa or returned to Gradio.
    waveform = generated.cpu().numpy()[0].astype(np.float32)

    # Sample rate in Hz reported alongside the waveform.
    # NOTE(review): hard-coded; confirm it matches the checkpoint's output rate.
    sample_rate = 24000

    if speed != 1.0:
        # librosa is only needed for time-stretching, so import it lazily.
        import librosa

        waveform = librosa.effects.time_stretch(waveform, rate=speed)

    return sample_rate, waveform
# Build the Gradio UI: inputs on the left, generated audio on the right
with gr.Blocks(title="Kokoro Text-to-Audio") as app:
    # Page heading and short description
    gr.Markdown("# 🎵 Kokoro Text-to-Audio Converter")
    gr.Markdown("Convert text to speech using hexgrad/Kokoro-82M model")

    with gr.Row():
        # Input column: text box, speed control and trigger button
        with gr.Column():
            text_input = gr.Textbox(
                lines=5,
                label="Enter your text",
                placeholder="Type something to convert to audio...",
            )
            speed_slider = gr.Slider(
                label="Speech Speed",
                minimum=0.5,
                maximum=1.5,
                step=0.1,
                value=1.0,
            )
            submit_btn = gr.Button("Generate Audio")

        # Output column: player for the synthesized audio
        with gr.Column():
            audio_output = gr.Audio(type="numpy", label="Generated Audio")

    # Run text_to_audio on click and route its result to the audio player
    submit_btn.click(
        fn=text_to_audio,
        inputs=[text_input, speed_slider],
        outputs=[audio_output],
    )

    # Static help text shown below the controls
    gr.Markdown("### Usage Tips")
    gr.Markdown("- For best results, keep your text reasonably short")
    gr.Markdown("- Adjust the speed slider to modify the pace of speech")
    gr.Markdown("- The model may take a moment to load on first use")

# Start the app only when this file is run as a script
if __name__ == "__main__":
    app.launch()