"""Streamlit app that converts Nepali text to speech with a SpeechT5 model."""

import base64
import io

import numpy as np
import soundfile as sf
import streamlit as st
import torch
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

# Hugging Face identifiers of the checkpoints used by the app.
TTS_MODEL_ID = "aryamanstha/speecht5_nepali_oslr43_oslr143"
VOCODER_MODEL_ID = "microsoft/speecht5_hifigan"

# Sample rate written into the generated WAV file.
SAMPLE_RATE = 16000

# Size of the (all-zero) default speaker embedding vector.
SPEAKER_EMBEDDING_DIM = 512

# Set page config (must be the first Streamlit command executed).
st.set_page_config(
    page_title="Nepali Text-to-Speech Converter",
    page_icon="🎧",
    layout="centered",
)

# Custom CSS (the stylesheet body is currently empty).
st.markdown(""" """, unsafe_allow_html=True)


@st.cache_resource
def _load_model_cached():
    """Load the processor, TTS model and vocoder once per server process.

    Exceptions are deliberately NOT caught here: Streamlit only caches
    successful return values, so a failed load is retried on the next run
    instead of a cached ``(None, None, None, None)`` being served forever.

    Returns:
        Tuple ``(processor, model, vocoder, device)`` where ``device`` is
        ``"cuda"`` when a GPU is available and ``"cpu"`` otherwise.
    """
    processor = SpeechT5Processor.from_pretrained(TTS_MODEL_ID)
    model = SpeechT5ForTextToSpeech.from_pretrained(TTS_MODEL_ID)
    vocoder = SpeechT5HifiGan.from_pretrained(VOCODER_MODEL_ID)

    # Move to GPU if available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)
    vocoder = vocoder.to(device)
    return processor, model, vocoder, device


def load_model():
    """Load and cache the model and processor.

    Returns:
        Tuple ``(processor, model, vocoder, device)`` on success, or
        ``(None, None, None, None)`` after reporting the error in the UI.
    """
    try:
        return _load_model_cached()
    except Exception as e:  # UI boundary: report the failure instead of crashing
        st.error(f"Error loading model: {str(e)}")
        return None, None, None, None


# Keep the ``load_model.clear()`` hook that ``st.cache_resource`` used to expose.
load_model.clear = _load_model_cached.clear


def create_speaker_embedding():
    """Create a default speaker embedding.

    Returns:
        A zero tensor of shape ``(1, SPEAKER_EMBEDDING_DIM)``.
    """
    speaker_embedding = torch.zeros(SPEAKER_EMBEDDING_DIM)
    return speaker_embedding.unsqueeze(0)


def generate_speech(text, processor, model, vocoder, speaker_embeddings, device):
    """Generate speech from text.

    Args:
        text: Text to synthesise.
        processor: Processor used to tokenise ``text``.
        model: Text-to-speech model exposing ``generate_speech``.
        vocoder: Vocoder passed to ``model.generate_speech``.
        speaker_embeddings: Speaker embedding tensor.
        device: Device the inputs are moved to before generation.

    Returns:
        An ``io.BytesIO`` positioned at the start of the WAV data, or
        ``None`` after reporting the error in the UI.
    """
    try:
        # Prepare input
        inputs = processor(text=text, return_tensors="pt").to(device)
        speaker_embeddings = speaker_embeddings.to(device)

        # Generate speech
        speech = model.generate_speech(
            inputs["input_ids"], speaker_embeddings, vocoder=vocoder
        )

        # Convert to numpy
        waveform = speech.cpu().numpy()

        # Save to BytesIO
        audio_buffer = io.BytesIO()
        sf.write(audio_buffer, waveform, samplerate=SAMPLE_RATE, format="WAV")
        audio_buffer.seek(0)
        return audio_buffer
    except Exception as e:  # UI boundary: report the failure instead of crashing
        st.error(f"Error generating speech: {str(e)}")
        return None


def get_audio_player_html(audio_bytes):
    """Create an HTML audio player with the audio data embedded inline.

    Args:
        audio_bytes: A binary file-like object (read from its current
            position) or a bytes-like object containing WAV data.

    Returns:
        HTML markup for an ``<audio>`` element whose source is a base64
        ``data:`` URI of the audio.
    """
    if hasattr(audio_bytes, "read"):
        raw = audio_bytes.read()
    else:
        raw = bytes(audio_bytes)
    audio_base64 = base64.b64encode(raw).decode("ascii")
    # Kept flush-left with no blank lines so Markdown treats it as one HTML block.
    return (
        "<audio controls>\n"
        f'<source src="data:audio/wav;base64,{audio_base64}" type="audio/wav">\n'
        "Your browser does not support the audio element.\n"
        "</audio>\n"
    )


def main():
    """Render the app: load the model, read text, synthesise and offer audio."""
    st.title("🎤 Nepali Text-to-Speech Converter")

    # Add introduction
    st.markdown(
        "Convert Nepali text to speech using SpeechT5 model.\n"
        "Simply enter your text below and click 'Generate Speech'."
    )

    # Initialize session state for tracking model loading
    if "model_loaded" not in st.session_state:
        st.session_state.model_loaded = False

    # Load model
    if not st.session_state.model_loaded:
        with st.spinner("Loading model... This may take a few minutes..."):
            processor, model, vocoder, device = load_model()

        if processor is None or model is None or vocoder is None:
            st.error("Failed to load model. Please refresh the page to try again.")
            return

        st.session_state.model_loaded = True
        st.session_state.processor = processor
        st.session_state.model = model
        st.session_state.vocoder = vocoder
        st.session_state.device = device
        st.success("Model loaded successfully! 🚀")

    # Create text input area
    text_input = st.text_area(
        "Enter Nepali Text:",
        height=100,
        placeholder="तपाईंको नेपाली पाठ यहाँ लेख्नुहोस्...",
    )

    # Create speaker embedding
    speaker_embeddings = create_speaker_embedding()

    # Add generate button
    col1, col2 = st.columns([1, 2])
    with col1:
        generate_button = st.button("🔊 Generate Speech")

    # Generate speech when button is clicked
    if generate_button:
        text = (text_input or "").strip()
        if not text:
            # Give feedback instead of silently ignoring the click.
            st.warning("Please enter some Nepali text first.")
        else:
            with st.spinner("Generating speech..."):
                audio_buffer = generate_speech(
                    text,
                    st.session_state.processor,
                    st.session_state.model,
                    st.session_state.vocoder,
                    speaker_embeddings,
                    st.session_state.device,
                )
            # Keep the result in session state: clicking the download button
            # reruns the script with generate_button False, which would
            # otherwise make the player and download button disappear.
            if audio_buffer is not None:
                st.session_state.audio_bytes = audio_buffer.getvalue()
            else:
                st.session_state.audio_bytes = None

    audio_bytes = st.session_state.get("audio_bytes")
    if audio_bytes:
        # Display audio player
        st.markdown("### Generated Speech:")
        st.markdown(
            get_audio_player_html(io.BytesIO(audio_bytes)),
            unsafe_allow_html=True,
        )

        # Add download button
        st.download_button(
            label="📥 Download Audio",
            data=audio_bytes,
            file_name="generated_speech.wav",
            mime="audio/wav",
        )

    # Add usage instructions
    with st.expander("ℹ️ Usage Instructions"):
        st.markdown(
            "1. Enter your Nepali text in the text area above\n"
            "2. Click the 'Generate Speech' button\n"
            "3. Wait for the audio to be generated\n"
            "4. Use the audio player to listen to the generated speech\n"
            "5. Download the audio file if desired\n"
            "\n"
            "**Note**: For best results, enter clear and grammatically "
            "correct Nepali text."
        )

    # Add footer
    st.markdown("---")
    st.markdown(
        "Made with ❤️ using Streamlit and SpeechT5 | "
        "Model: [aryamanstha/speecht5_nepali_oslr43_oslr143](https://huggingface.co/aryamanstha/speecht5_nepali_oslr43_oslr143)"
    )


if __name__ == "__main__":
    main()