# Source: Hugging Face Spaces - rockerritesh/nepali-text-speech
# Space status when captured: Sleeping
# File size: 5,906 Bytes
# Revision shown on the page: 98ed243

import streamlit as st
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import soundfile as sf
import io
import numpy as np
import base64

# Set page config
# Sets the browser-tab title, the tab icon and a centered page layout.
# NOTE(review): Streamlit expects set_page_config to be called before other
# st.* commands, so keep this statement first - confirm against the docs.
st.set_page_config(
    page_title="Nepali Text-to-Speech Converter",
    page_icon="🎧",
    layout="centered"
)

# Custom CSS
# Injects a raw <style> element through st.markdown; unsafe_allow_html=True is
# passed so the markup is not escaped.
# NOTE(review): the second rule targets `.stTextInput ... input`, but the only
# text widget created in this file is st.text_area - the selector may not match
# that widget at all; verify in the browser's inspector.
st.markdown("""
    <style>
    .main {
        padding: 2rem;
    }
    .stTextInput > div > div > input {
        min-height: 100px;
    }
    </style>
    """, unsafe_allow_html=True)

@st.cache_resource
def _load_model_resources():
    """Load the processor, TTS model and vocoder, cached by Streamlit.

    Exceptions are deliberately not caught here. If this function returned a
    "failed" placeholder, ``st.cache_resource`` would store that placeholder
    like any other return value and every later call would get the cached
    failure without retrying. Letting the exception propagate means nothing is
    stored, so the next call attempts the load again.

    Returns:
        Tuple ``(processor, model, vocoder, device)`` where ``device`` is the
        string ``"cuda"`` or ``"cpu"``.
    """
    model_id = "aryamanstha/speecht5_nepali_oslr43_oslr143"
    processor = SpeechT5Processor.from_pretrained(model_id)
    model = SpeechT5ForTextToSpeech.from_pretrained(model_id)
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    # Move to GPU if available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)
    vocoder = vocoder.to(device)

    return processor, model, vocoder, device


def load_model():
    """Load and cache the model and processor.

    Thin, uncached wrapper around the cached loader. It keeps the original
    contract (never raises, reports problems in the UI) while making sure a
    failed attempt is not remembered, so a page refresh really retries.

    Returns:
        ``(processor, model, vocoder, device)`` on success, or
        ``(None, None, None, None)`` after showing an error message when
        loading fails.
    """
    try:
        return _load_model_resources()
    except Exception as e:  # UI boundary: surface any load failure to the user
        st.error(f"Error loading model: {str(e)}")
        return None, None, None, None

def create_speaker_embedding():
    """Return the default speaker embedding: one row of 512 zeros."""
    # Allocate the batched shape directly instead of adding the batch axis later.
    batched_shape = (1, 512)
    return torch.zeros(batched_shape)

def generate_speech(text, processor, model, vocoder, speaker_embeddings, device, sample_rate=16000):
    """Generate speech from text and return it as an in-memory WAV file.

    Args:
        text: Text to synthesize.
        processor: Callable tokenizer/processor; called as
            ``processor(text=..., return_tensors="pt")``.
        model: Model exposing ``generate_speech(input_ids, speaker_embeddings,
            vocoder=...)``.
        vocoder: Vocoder handed through to ``model.generate_speech``.
        speaker_embeddings: Tensor of speaker embeddings; moved to ``device``.
        device: Device the inputs are moved to before generation.
        sample_rate: Sample rate written into the WAV header. Defaults to
            16000, the value this function previously hard-coded.

    Returns:
        An ``io.BytesIO`` positioned at the start of the WAV data, or ``None``
        when the text is blank or generation fails (a message is shown in the
        Streamlit UI in both cases).
    """
    # A whitespace-only string is truthy, so a caller's plain `if text:` check
    # lets it through; reject it here instead of sending it to the model.
    if text is None or (isinstance(text, str) and not text.strip()):
        st.warning("Please enter some Nepali text to convert.")
        return None

    try:
        # Prepare input
        inputs = processor(text=text, return_tensors="pt").to(device)
        speaker_embeddings = speaker_embeddings.to(device)

        # Generate speech
        speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

        # Convert to numpy (soundfile needs host memory, hence .cpu())
        speech = speech.cpu().numpy()

        # Save to BytesIO and rewind so the caller can read from the start
        audio_buffer = io.BytesIO()
        sf.write(audio_buffer, speech, samplerate=sample_rate, format='WAV')
        audio_buffer.seek(0)

        return audio_buffer
    except Exception as e:  # UI boundary: report instead of crashing the page
        st.error(f"Error generating speech: {str(e)}")
        return None

def get_audio_player_html(audio_bytes):
    """Create an HTML audio player with the audio data embedded as base64.

    Args:
        audio_bytes: The WAV data, either as a bytes-like object (``bytes``,
            ``bytearray``, ``memoryview``) or as a binary file-like object.
            For objects with ``getvalue()`` (such as ``io.BytesIO``) the whole
            buffer is used regardless of the current read position and the
            position is left untouched; other file-like objects are read from
            their current position via ``read()``.

    Returns:
        An HTML snippet containing an autoplaying ``<audio>`` element whose
        source is a ``data:audio/wav;base64,...`` URI.
    """
    if isinstance(audio_bytes, (bytes, bytearray, memoryview)):
        raw = bytes(audio_bytes)
    elif hasattr(audio_bytes, "getvalue"):
        # Position-independent: a partially or fully consumed buffer must not
        # silently yield a truncated or empty player.
        raw = audio_bytes.getvalue()
    else:
        raw = audio_bytes.read()

    audio_base64 = base64.b64encode(raw).decode("ascii")
    return f"""
        <audio controls autoplay>
            <source src="data:audio/wav;base64,{audio_base64}" type="audio/wav">
            Your browser does not support the audio element.
        </audio>
    """

def main():
    """Render the text-to-speech page for one Streamlit script run.

    Loads the model on the first run of a session (keeping the loaded objects
    in ``st.session_state``), shows the text area and the generate button, and
    on a click synthesizes the text, then shows an audio player and a download
    button. Clicking the button with empty or whitespace-only text shows a
    warning instead of silently doing nothing.
    """
    st.title("🎤 Nepali Text-to-Speech Converter")

    # Add introduction
    st.markdown("""
    Convert Nepali text to speech using SpeechT5 model. Simply enter your text below and click 'Generate Speech'.
    """)

    # Initialize session state for tracking model loading
    if 'model_loaded' not in st.session_state:
        st.session_state.model_loaded = False

    # Load model
    if not st.session_state.model_loaded:
        with st.spinner("Loading model... This may take a few minutes..."):
            processor, model, vocoder, device = load_model()
            # Identity checks: `None in (...)` would compare with ==, which is
            # not what is meant for arbitrary model objects.
            if all(part is not None for part in (processor, model, vocoder)):
                st.session_state.model_loaded = True
                st.session_state.processor = processor
                st.session_state.model = model
                st.session_state.vocoder = vocoder
                st.session_state.device = device
                st.success("Model loaded successfully! 🚀")
            else:
                st.error("Failed to load model. Please refresh the page to try again.")
                return

    # Create text input area
    text_input = st.text_area(
        "Enter Nepali Text:",
        height=100,
        placeholder="तपाईंको नेपाली पाठ यहाँ लेख्नुहोस्..."
    )

    # Create speaker embedding
    speaker_embeddings = create_speaker_embedding()

    # Add generate button (the second, wider column is intentionally left empty)
    button_col, _ = st.columns([1, 2])
    with button_col:
        generate_button = st.button("🔊 Generate Speech")

    # Generate speech when button is clicked
    if generate_button:
        if not (text_input or "").strip():
            # Give feedback instead of ignoring the click.
            st.warning("Please enter some Nepali text first.")
        else:
            with st.spinner("Generating speech..."):
                audio_buffer = generate_speech(
                    text_input,
                    st.session_state.processor,
                    st.session_state.model,
                    st.session_state.vocoder,
                    speaker_embeddings,
                    st.session_state.device
                )

                if audio_buffer:
                    # Display audio player
                    st.markdown("### Generated Speech:")
                    st.markdown(get_audio_player_html(audio_buffer), unsafe_allow_html=True)

                    # Add download button; rewind first because building the
                    # player may have consumed the buffer.
                    audio_buffer.seek(0)
                    st.download_button(
                        label="📥 Download Audio",
                        data=audio_buffer,
                        file_name="generated_speech.wav",
                        mime="audio/wav"
                    )

    # Add usage instructions
    with st.expander("ℹ️ Usage Instructions"):
        st.markdown("""
        1. Enter your Nepali text in the text area above
        2. Click the 'Generate Speech' button
        3. Wait for the audio to be generated
        4. Use the audio player to listen to the generated speech
        5. Download the audio file if desired
        
        **Note**: For best results, enter clear and grammatically correct Nepali text.
        """)

    # Add footer
    st.markdown("---")
    st.markdown(
        "Made with ❤️ using Streamlit and SpeechT5 | "
        "Model: [aryamanstha/speecht5_nepali_oslr43_oslr143](https://huggingface.co/aryamanstha/speecht5_nepali_oslr43_oslr143)"
    )

# Build the page only when this file is run as the main script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()