File size: 2,541 Bytes
f7446d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import os
import streamlit as st
import time
from kokoro import KPipeline
import soundfile as sf
import io

# --- Page header -----------------------------------------------------------
st.title("Text-to-Speech with Kokoro Pipeline")
st.markdown("Enter your text and configure options to generate audio segments.")

# --- Text to synthesize ----------------------------------------------------
# The box is pre-filled with a sample sentence so the app can be tried at once.
text = st.text_area(
    label="Enter text",
    height=150,
    value="The sky above the port was the color of television, tuned to a dead channel.",
)

# --- Voice picker ----------------------------------------------------------
# Maps the label shown in the dropdown to the voice identifier handed to the
# pipeline. Add further entries to offer more voices, for example:
#   "British English (b_voice)": "b_voice"
#   "Japanese (j_voice)": "j_voice"
voice_options = {"American English (af_heart)": "af_heart"}
voice_choice = st.selectbox("Select Voice", options=list(voice_options))
voice = voice_options[voice_choice]

# --- Speaking rate ---------------------------------------------------------
# Slider from 0.5 to 2.0, starting at 1.0; the value is forwarded unchanged
# as the pipeline's `speed` argument.
speed = st.slider("Speech Speed", value=1.0, min_value=0.5, max_value=2.0)

# Sample rate (Hz) written into every WAV produced below; this is the value the
# app has always passed to soundfile.
# NOTE(review): presumably Kokoro's native output rate -- confirm against the
# Kokoro documentation before changing it.
_SAMPLE_RATE_HZ = 24000


@st.cache_resource(show_spinner=False)
def _load_pipeline(lang_code):
    """Return the Kokoro pipeline for *lang_code*, building it at most once.

    Streamlit re-executes the whole script on every interaction, so without
    caching the pipeline would be constructed again on each button click.
    ``st.cache_resource`` keeps one instance per distinct ``lang_code`` and
    hands the same object back on later reruns.

    Args:
        lang_code: Language code forwarded to ``KPipeline(lang_code=...)``.

    Returns:
        The ``KPipeline`` instance for that language.
    """
    # NOTE(review): cached resources are shared between sessions; this assumes
    # a KPipeline can safely serve more than one user -- confirm.
    return KPipeline(lang_code=lang_code)


if st.button("Generate Audio"):
    if not text.strip():
        st.error("Please enter some text!")
    else:
        try:
            # The pipeline's language must match the chosen voice. Voice ids
            # start with their language letter ("af_heart" -> "a"), so the
            # code is taken from the voice instead of being hard-coded.
            with st.spinner("Initializing TTS pipeline..."):
                pipeline = _load_pipeline(voice[0])

            with st.spinner("Generating audio..."):
                # One segment is produced per chunk of text between newlines.
                generator = pipeline(
                    text,
                    voice=voice,
                    speed=speed,
                    split_pattern=r'\n+'
                )

                # Show each segment's text, phonemes and an audio player.
                for segment_index, (gs, ps, audio) in enumerate(generator):
                    st.markdown(f"**Segment {segment_index}**")
                    st.write("**Graphemes/Text:**", gs)
                    st.write("**Phonemes:**", ps)

                    # Encode the samples as an in-memory WAV so Streamlit can
                    # play them without touching the filesystem.
                    audio_buffer = io.BytesIO()
                    sf.write(audio_buffer, audio, _SAMPLE_RATE_HZ, format='WAV')
                    audio_buffer.seek(0)
                    st.audio(audio_buffer, format="audio/wav")

                st.success("Audio generation complete!")
        except Exception as e:
            # Top-level UI boundary: report the failure in the page instead of
            # letting the script run abort.
            st.error("An error occurred during audio generation.")
            st.exception(e)