import streamlit as st
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import soundfile as sf
import io
import numpy as np
import base64
# Page-level configuration: title, icon and a centered layout.
st.set_page_config(
    page_title="Nepali Text-to-Speech Converter",
    page_icon="🎧",
    layout="centered",
)

# Custom CSS hook. The injected markup is currently empty (a single newline),
# so this call has no visible styling effect.
st.markdown(
    """
""",
    unsafe_allow_html=True,
)
@st.cache_resource
def _load_model_resources():
    """Load the processor, TTS model and vocoder, raising on any failure.

    Only this function is cached. Because it lets exceptions propagate
    instead of returning a placeholder, a failed load is never stored in the
    resource cache and the next call retries from scratch.

    Returns:
        A ``(processor, model, vocoder, device)`` tuple, where ``device`` is
        the string ``"cuda"`` or ``"cpu"`` that the model and vocoder were
        moved to.
    """
    checkpoint = "aryamanstha/speecht5_nepali_oslr43_oslr143"
    processor = SpeechT5Processor.from_pretrained(checkpoint)
    model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    # Move to GPU if available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)
    vocoder = vocoder.to(device)
    return processor, model, vocoder, device


def load_model():
    """Load (or fetch from cache) the model, processor and vocoder.

    Errors are reported in the UI via ``st.error`` rather than raised.

    Returns:
        ``(processor, model, vocoder, device)`` on success, or
        ``(None, None, None, None)`` if loading failed. The failure tuple is
        deliberately produced outside the cached function so that it is not
        cached and a later call can try again.
    """
    try:
        return _load_model_resources()
    except Exception as e:
        st.error(f"Error loading model: {str(e)}")
        return None, None, None, None


# Keep ``load_model.clear()`` usable, as it was when the decorator was applied
# directly to ``load_model``.
load_model.clear = _load_model_resources.clear
def create_speaker_embedding(embedding_dim=512):
    """Create a default (all-zero) speaker embedding.

    Args:
        embedding_dim: Size of the embedding vector. Defaults to 512, the
            value this function previously hard-coded.

    Returns:
        A float tensor of zeros with shape ``(1, embedding_dim)``; the
        leading axis of size one is the batch dimension.
    """
    return torch.zeros(1, embedding_dim)
def generate_speech(text, processor, model, vocoder, speaker_embeddings, device):
    """Synthesize ``text`` and return the audio as an in-memory WAV file.

    Args:
        text: Text to convert to speech.
        processor: Callable that tokenizes ``text`` into tensors.
        model: Object exposing ``generate_speech(input_ids, embeddings, vocoder=...)``.
        vocoder: Vocoder handed to ``model.generate_speech``.
        speaker_embeddings: Speaker embedding tensor; moved to ``device``.
        device: Device the tokenized inputs and embeddings are moved to.

    Returns:
        An ``io.BytesIO`` positioned at offset 0 containing WAV data written
        at a 16000 Hz sample rate, or ``None`` if any step raised (in which
        case the error is shown with ``st.error``).
    """
    try:
        # Tokenize and place the inputs on the target device.
        encoded = processor(text=text, return_tensors="pt").to(device)

        # Run the model + vocoder to obtain the waveform tensor.
        waveform = model.generate_speech(
            encoded["input_ids"],
            speaker_embeddings.to(device),
            vocoder=vocoder,
        )

        # Serialize the waveform as WAV into memory and rewind for readers.
        wav_buffer = io.BytesIO()
        sf.write(wav_buffer, waveform.cpu().numpy(), samplerate=16000, format='WAV')
        wav_buffer.seek(0)
    except Exception as e:
        st.error(f"Error generating speech: {str(e)}")
        return None
    return wav_buffer
def get_audio_player_html(audio_bytes):
    """Create an HTML audio player with the audio data embedded inline.

    Args:
        audio_bytes: Either a binary file-like object (anything with a
            ``read()`` method, e.g. ``io.BytesIO``) or a raw
            ``bytes``/``bytearray``/``memoryview`` holding WAV data. A
            file-like object is read from its current position and is left
            at end-of-stream afterwards.

    Returns:
        An HTML snippet containing an ``<audio controls>`` element whose
        source is a ``data:audio/wav;base64,...`` URI built from the input.
    """
    if isinstance(audio_bytes, (bytes, bytearray, memoryview)):
        raw = bytes(audio_bytes)
    else:
        raw = audio_bytes.read()

    # Previously the encoded payload was computed but never placed in the
    # returned markup, so the player could not render any audio.
    audio_base64 = base64.b64encode(raw).decode("ascii")
    return f"""
<audio controls>
    <source src="data:audio/wav;base64,{audio_base64}" type="audio/wav">
    Your browser does not support the audio element.
</audio>
"""
def main():
    """Render the app: load the model, collect text, synthesize and play audio.

    Flow:
      1. Load the model once per session and keep the handles in
         ``st.session_state``; abort the run with an error if loading fails.
      2. Read the text area and, when the button is clicked, generate speech
         for non-blank input (blank input produces a warning instead).
      3. Show the most recently generated audio with a player and a
         download button.
    """
    st.title("🎤 Nepali Text-to-Speech Converter")

    # Add introduction
    st.markdown(
        "\n"
        "Convert Nepali text to speech using SpeechT5 model. "
        "Simply enter your text below and click 'Generate Speech'.\n"
    )

    # Initialize session state for tracking model loading
    if 'model_loaded' not in st.session_state:
        st.session_state.model_loaded = False

    # Load model
    if not st.session_state.model_loaded:
        with st.spinner("Loading model... This may take a few minutes..."):
            processor, model, vocoder, device = load_model()

        # Explicit identity checks: `None in (...)` would fall back to `==`
        # on the loaded objects, which is not what is meant here.
        if processor is None or model is None or vocoder is None:
            st.error("Failed to load model. Please refresh the page to try again.")
            return

        st.session_state.model_loaded = True
        st.session_state.processor = processor
        st.session_state.model = model
        st.session_state.vocoder = vocoder
        st.session_state.device = device
        st.success("Model loaded successfully! 🚀")

    # Create text input area
    text_input = st.text_area(
        "Enter Nepali Text:",
        height=100,
        placeholder="तपाईंको नेपाली पाठ यहाँ लेख्नुहोस्...",
    )

    # Create speaker embedding
    speaker_embeddings = create_speaker_embedding()

    # Add generate button (second column is only a spacer)
    col1, _ = st.columns([1, 2])
    with col1:
        generate_button = st.button("🔊 Generate Speech")

    # Generate speech when button is clicked
    if generate_button:
        if not (text_input or "").strip():
            # Blank or whitespace-only input: tell the user instead of
            # silently doing nothing or synthesizing from empty text.
            st.warning("Please enter some Nepali text first.")
        else:
            with st.spinner("Generating speech..."):
                audio_buffer = generate_speech(
                    text_input,
                    st.session_state.processor,
                    st.session_state.model,
                    st.session_state.vocoder,
                    speaker_embeddings,
                    st.session_state.device,
                )
            if audio_buffer:
                # Keep the raw bytes across reruns (see the display step below).
                st.session_state.audio_bytes = audio_buffer.getvalue()
            else:
                # Generation failed; do not keep showing stale audio.
                st.session_state.pop("audio_bytes", None)

    # Display the latest audio. Clicking the download button triggers a rerun
    # in which `generate_button` is False, so rendering from session state is
    # what keeps the player and the button on screen afterwards.
    audio_bytes = st.session_state.get("audio_bytes")
    if audio_bytes:
        st.markdown("### Generated Speech:")
        st.markdown(
            get_audio_player_html(io.BytesIO(audio_bytes)),
            unsafe_allow_html=True,
        )
        st.download_button(
            label="📥 Download Audio",
            data=audio_bytes,
            file_name="generated_speech.wav",
            mime="audio/wav",
        )

    # Add usage instructions
    with st.expander("ℹ️ Usage Instructions"):
        st.markdown(
            "\n"
            "1. Enter your Nepali text in the text area above\n"
            "2. Click the 'Generate Speech' button\n"
            "3. Wait for the audio to be generated\n"
            "4. Use the audio player to listen to the generated speech\n"
            "5. Download the audio file if desired\n"
            "**Note**: For best results, enter clear and grammatically correct Nepali text.\n"
        )

    # Add footer
    st.markdown("---")
    st.markdown(
        "Made with ❤️ using Streamlit and SpeechT5 | "
        "Model: [aryamanstha/speecht5_nepali_oslr43_oslr143](https://huggingface.co/aryamanstha/speecht5_nepali_oslr43_oslr143)"
    )
# Entry point: build the UI only when this file is executed as a script.
if __name__ == "__main__":
    main()