"""Kokoro-TTS demo app: Streamlit UI for generating speech with the Kokoro pipeline."""
import os
import streamlit as st
import time
from kokoro import KPipeline
import soundfile as sf
import io
# --- Page header ---
st.title("Text-to-Speech with Kokoro Pipeline")
st.markdown("Enter your text and configure options to generate audio segments.")

# --- Input controls ---
# Text the user wants synthesized into speech.
text = st.text_area(
    "Enter text",
    value="The sky above the port was the color of television, tuned to a dead channel.",
    height=150,
)

# Maps the human-readable label shown in the UI to the Kokoro voice id.
# Register additional voices here, e.g.:
# "British English (b_voice)": "b_voice",
# "Japanese (j_voice)": "j_voice",
voice_options = {
    "American English (af_heart)": "af_heart",
}
voice_choice = st.selectbox("Select Voice", options=list(voice_options))
voice = voice_options[voice_choice]

# Playback-rate multiplier; 1.0 is the voice's natural speed.
speed = st.slider("Speech Speed", min_value=0.5, max_value=2.0, value=1.0)
if st.button("Generate Audio"):
    if not text.strip():
        # Guard: nothing to synthesize.
        st.error("Please enter some text!")
    else:
        try:
            # Initialize the Kokoro pipeline.
            # lang_code must match the selected voice family
            # ('a' = American English, consistent with the af_* voice above).
            with st.spinner("Initializing TTS pipeline..."):
                pipeline = KPipeline(lang_code='a')

            # The pipeline call returns a *lazy* generator — synthesis happens
            # while iterating, so the consuming loop must stay inside the
            # spinner or the "Generating audio..." indicator would vanish
            # before any real work ran.
            with st.spinner("Generating audio..."):
                generator = pipeline(
                    text,
                    voice=voice,
                    speed=speed,
                    split_pattern=r'\n+'  # split input into one segment per newline run
                )
                # Render each generated segment as it is produced.
                for segment_index, (gs, ps, audio) in enumerate(generator):
                    st.markdown(f"**Segment {segment_index}**")
                    st.write("**Graphemes/Text:**", gs)
                    st.write("**Phonemes:**", ps)
                    # Serialize the audio array (Kokoro outputs 24 kHz samples —
                    # TODO confirm against the installed kokoro version) into an
                    # in-memory WAV so Streamlit can play it without a temp file.
                    audio_buffer = io.BytesIO()
                    sf.write(audio_buffer, audio, 24000, format='WAV')
                    audio_buffer.seek(0)
                    st.audio(audio_buffer, format="audio/wav")

            st.success("Audio generation complete!")
        except Exception as e:
            # Top-level UI boundary: show a friendly message plus the
            # full traceback for debugging.
            st.error("An error occurred during audio generation.")
            st.exception(e)