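# Captured from a Hugging Face Spaces page (Space status at capture time: Sleeping).
# The three header lines of that page were not part of the program.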
import streamlit as st | |
import torch | |
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan | |
import soundfile as sf | |
import io | |
import numpy as np | |
import base64 | |
# Browser-tab title, icon and page layout for the app.
_PAGE_SETTINGS = {
    "page_title": "Nepali Text-to-Speech Converter",
    "page_icon": "🎧",
    "layout": "centered",
}
st.set_page_config(**_PAGE_SETTINGS)

# Custom CSS injected as raw HTML: pads the main container and sets a minimum
# height on text inputs.
# NOTE(review): the selector targets `.stTextInput`, while the input widget in
# main() is created with st.text_area -- confirm the rule actually applies.
_CUSTOM_CSS = """
<style>
.main {
padding: 2rem;
}
.stTextInput > div > div > input {
min-height: 100px;
}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
@st.cache_resource(show_spinner=False)
def _load_model_resources():
    """Load the processor, TTS model and vocoder, raising on any failure.

    Decorated with ``st.cache_resource`` (Streamlit >= 1.18) so the heavy
    objects are built once and reused across reruns and sessions instead of
    being reloaded for every new session. Errors are deliberately NOT caught
    here: an exception propagates without being stored in the cache, so a
    later call retries the load.

    Returns:
        tuple: ``(processor, model, vocoder, device)`` where ``device`` is the
        string ``"cuda"`` or ``"cpu"`` and the model and vocoder have already
        been moved to it.
    """
    checkpoint = "aryamanstha/speecht5_nepali_oslr43_oslr143"
    processor = SpeechT5Processor.from_pretrained(checkpoint)
    model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    # Move to GPU if available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    return processor, model.to(device), vocoder.to(device), device


def load_model():
    """Load and cache the model, processor and vocoder.

    Delegates to the cached loader; the spinner is left to the caller, so the
    cache decorator's own spinner is disabled.

    Returns:
        tuple: ``(processor, model, vocoder, device)`` on success. On any
        failure the error is shown with ``st.error`` and
        ``(None, None, None, None)`` is returned instead of raising.
    """
    try:
        return _load_model_resources()
    except Exception as e:
        # Top-level UI boundary: report the problem to the user and let the
        # caller decide how to proceed.
        st.error(f"Error loading model: {str(e)}")
        return None, None, None, None
def create_speaker_embedding():
    """Return the default speaker embedding: an all-zero tensor of shape (1, 512)."""
    embedding_size = 512
    # Allocate with the leading batch dimension directly.
    return torch.zeros((1, embedding_size))
def generate_speech(text, processor, model, vocoder, speaker_embeddings, device):
    """Synthesize ``text`` and return the audio as an in-memory WAV file.

    Args:
        text: Text to convert to speech.
        processor: Callable tokenizer/processor producing ``input_ids``.
        model: Object exposing ``generate_speech(input_ids, embeddings, vocoder=...)``.
        vocoder: Vocoder handed through to ``model.generate_speech``.
        speaker_embeddings: Speaker embedding tensor; moved to ``device`` here.
        device: Device the inputs are moved to before generation.

    Returns:
        ``io.BytesIO`` positioned at the start of the WAV data (written at a
        16000 Hz sample rate), or ``None`` if any step fails. Failures are
        reported through ``st.error`` rather than raised.
    """
    try:
        # Tokenize and place everything on the target device.
        encoded = processor(text=text, return_tensors="pt").to(device)
        speaker_embeddings = speaker_embeddings.to(device)

        # Run the model, then bring the waveform back to host memory.
        waveform = model.generate_speech(
            encoded["input_ids"],
            speaker_embeddings,
            vocoder=vocoder,
        )
        samples = waveform.cpu().numpy()

        # Serialize to WAV in memory and rewind so callers can read from 0.
        wav_file = io.BytesIO()
        sf.write(wav_file, samples, samplerate=16000, format='WAV')
        wav_file.seek(0)
        return wav_file
    except Exception as e:
        st.error(f"Error generating speech: {str(e)}")
        return None
def get_audio_player_html(audio_bytes):
    """Build an autoplaying HTML ``<audio>`` element embedding the WAV data.

    Args:
        audio_bytes: Binary file-like object. It is read from its current
            position to the end, so the caller must ``seek(0)`` before
            reusing it.

    Returns:
        str: HTML snippet with the audio inlined as a base64 ``data:`` URI.
    """
    raw_audio = audio_bytes.read()
    encoded_audio = base64.b64encode(raw_audio).decode()
    player_html = f"""
<audio controls autoplay>
<source src="data:audio/wav;base64,{encoded_audio}" type="audio/wav">
Your browser does not support the audio element.
</audio>
"""
    return player_html
def main():
    """Render the text-to-speech page and handle one script run.

    Flow:
      1. Load the processor/model/vocoder once per session and keep them in
         ``st.session_state``; stop early if loading fails.
      2. Read the text from the text area.
      3. When the button is clicked, warn on empty or whitespace-only input;
         otherwise synthesize speech, show an audio player and offer the WAV
         for download.
      4. Render the usage instructions and footer.
    """
    st.title("🎤 Nepali Text-to-Speech Converter")
    # Add introduction
    st.markdown("""
Convert Nepali text to speech using SpeechT5 model. Simply enter your text below and click 'Generate Speech'.
""")

    # Initialize session state for tracking model loading
    if 'model_loaded' not in st.session_state:
        st.session_state.model_loaded = False

    # Load model
    if not st.session_state.model_loaded:
        with st.spinner("Loading model... This may take a few minutes..."):
            processor, model, vocoder, device = load_model()
        # Explicit identity checks: `None in (...)` would fall back to `==`
        # on the model objects.
        if processor is None or model is None or vocoder is None:
            st.error("Failed to load model. Please refresh the page to try again.")
            return
        st.session_state.model_loaded = True
        st.session_state.processor = processor
        st.session_state.model = model
        st.session_state.vocoder = vocoder
        st.session_state.device = device
        st.success("Model loaded successfully! 🚀")

    # Create text input area
    text_input = st.text_area(
        "Enter Nepali Text:",
        height=100,
        placeholder="तपाईंको नेपाली पाठ यहाँ लेख्नुहोस्..."
    )
    # Whitespace-only input is treated as empty so it never reaches the model.
    text_to_speak = (text_input or "").strip()

    # Add generate button (second column only constrains the button width)
    col1, _ = st.columns([1, 2])
    with col1:
        generate_button = st.button("🔊 Generate Speech")

    # Generate speech when button is clicked
    if generate_button and not text_to_speak:
        st.warning("Please enter some Nepali text before generating speech.")
    elif generate_button:
        # The embedding is only needed for an actual generation request.
        speaker_embeddings = create_speaker_embedding()
        with st.spinner("Generating speech..."):
            audio_buffer = generate_speech(
                text_to_speak,
                st.session_state.processor,
                st.session_state.model,
                st.session_state.vocoder,
                speaker_embeddings,
                st.session_state.device
            )
        # generate_speech returns None on failure (and reports it itself).
        if audio_buffer is not None:
            # Display audio player
            st.markdown("### Generated Speech:")
            st.markdown(get_audio_player_html(audio_buffer), unsafe_allow_html=True)
            # Building the player consumed the buffer; rewind for the download.
            audio_buffer.seek(0)
            st.download_button(
                label="📥 Download Audio",
                data=audio_buffer,
                file_name="generated_speech.wav",
                mime="audio/wav"
            )

    # Add usage instructions
    with st.expander("ℹ️ Usage Instructions"):
        st.markdown("""
1. Enter your Nepali text in the text area above
2. Click the 'Generate Speech' button
3. Wait for the audio to be generated
4. Use the audio player to listen to the generated speech
5. Download the audio file if desired
**Note**: For best results, enter clear and grammatically correct Nepali text.
""")

    # Add footer
    st.markdown("---")
    st.markdown(
        "Made with ❤️ using Streamlit and SpeechT5 | "
        "Model: [aryamanstha/speecht5_nepali_oslr43_oslr143](https://huggingface.co/aryamanstha/speecht5_nepali_oslr43_oslr143)"
    )


# Run the app when executed as a script.
if __name__ == "__main__":
    main()