Spaces:

rockerritesh
/

nepali-text-speech

Sleeping

App Files Files Community

nepali-text-speech / app.py

rockerritesh

Create app.py

98ed243 verified 4 months ago

raw

history blame contribute delete

5.91 kB

	import streamlit as st
	import torch
	from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
	import soundfile as sf
	import io
	import numpy as np
	import base64

	# Browser-tab title, icon and overall layout of the page
	_page_settings = {
	    "page_title": "Nepali Text-to-Speech Converter",
	    "page_icon": "🎧",
	    "layout": "centered",
	}
	st.set_page_config(**_page_settings)

	# Page-wide CSS overrides, injected as raw HTML
	_PAGE_CSS = """
	<style>
	.main {
	padding: 2rem;
	}
	.stTextInput > div > div > input {
	min-height: 100px;
	}
	</style>
	"""
	st.markdown(_PAGE_CSS, unsafe_allow_html=True)

	@st.cache_resource
	def _load_model_cached():
	    """Load the processor, TTS model and vocoder; raise on any failure.

	    Only a successful result is stored by ``st.cache_resource``. An exception
	    raised here propagates to the caller and nothing is cached, so the next
	    call attempts the load again.
	    """
	    tts_repo = "aryamanstha/speecht5_nepali_oslr43_oslr143"
	    processor = SpeechT5Processor.from_pretrained(tts_repo)
	    model = SpeechT5ForTextToSpeech.from_pretrained(tts_repo)
	    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

	    # Move to GPU if available
	    device = "cuda" if torch.cuda.is_available() else "cpu"
	    model = model.to(device)
	    vocoder = vocoder.to(device)

	    return processor, model, vocoder, device


	def load_model():
	    """Load and cache the model, processor and vocoder.

	    Returns:
	        A ``(processor, model, vocoder, device)`` tuple on success, where
	        ``device`` is ``"cuda"`` or ``"cpu"``. On failure an error is shown
	        with ``st.error`` and ``(None, None, None, None)`` is returned.

	    The error handling lives outside the cached function on purpose: if the
	    failure tuple were returned from inside it, that tuple would be cached
	    and every later call would get the failure back without retrying.
	    """
	    try:
	        return _load_model_cached()
	    except Exception as e:
	        st.error(f"Error loading model: {str(e)}")
	        return None, None, None, None


	# Keep the cache-clearing hook that the decorated function used to expose.
	load_model.clear = _load_model_cached.clear

	def create_speaker_embedding():
	    """Return the default speaker embedding: an all-zero tensor of shape (1, 512)."""
	    batch_size, embedding_dim = 1, 512
	    return torch.zeros(batch_size, embedding_dim)

	def generate_speech(text, processor, model, vocoder, speaker_embeddings, device):
	    """Synthesize ``text`` and return it as an in-memory WAV file.

	    Args:
	        text: The text to speak; passed to ``processor`` as ``text=``.
	        processor: Callable that tokenizes the text into PyTorch tensors.
	        model: Object whose ``generate_speech`` method produces the waveform.
	        vocoder: Passed through to ``model.generate_speech`` as ``vocoder=``.
	        speaker_embeddings: Tensor moved to ``device`` before generation.
	        device: Device the tokenized inputs and embeddings are moved to.

	    Returns:
	        An ``io.BytesIO`` rewound to position 0 holding WAV data written at a
	        16000 Hz sample rate, or ``None`` if any step raised (the error is
	        reported with ``st.error``).
	    """
	    try:
	        # Tokenize and place everything on the target device
	        encoded = processor(text=text, return_tensors="pt").to(device)
	        embedding_on_device = speaker_embeddings.to(device)

	        # Run the model, then bring the waveform back to the CPU as numpy
	        waveform = model.generate_speech(
	            encoded["input_ids"],
	            embedding_on_device,
	            vocoder=vocoder,
	        )
	        samples = waveform.cpu().numpy()

	        # Serialize to WAV in memory and rewind for the reader
	        wav_file = io.BytesIO()
	        sf.write(wav_file, samples, samplerate=16000, format='WAV')
	        wav_file.seek(0)
	    except Exception as e:
	        st.error(f"Error generating speech: {str(e)}")
	        return None
	    return wav_file

	def get_audio_player_html(audio_bytes):
	    """Build an autoplaying HTML ``<audio>`` element for the given WAV data.

	    ``audio_bytes`` is a readable binary file-like object. It is read from its
	    current position to the end, so the caller must seek back before reusing
	    it. The data is embedded in the markup as a base64 ``data:`` URI.
	    """
	    raw_audio = audio_bytes.read()
	    encoded_audio = str(base64.b64encode(raw_audio), "ascii")
	    return f"""
	<audio controls autoplay>
	<source src="data:audio/wav;base64,{encoded_audio}" type="audio/wav">
	Your browser does not support the audio element.
	</audio>
	"""

	def main():
	    """Render the Nepali text-to-speech page.

	    On the first run of a session the model is loaded through ``load_model``
	    and its parts are stored in ``st.session_state``; if loading fails an
	    error is shown and rendering stops. Afterwards the page shows a text
	    area and a "Generate Speech" button. Clicking the button with non-blank
	    text synthesizes audio and shows an audio player plus a download button;
	    clicking it with blank text shows a warning instead.
	    """
	    st.title("🎤 Nepali Text-to-Speech Converter")

	    # Add introduction
	    st.markdown("""
	Convert Nepali text to speech using SpeechT5 model. Simply enter your text below and click 'Generate Speech'.
	""")

	    # Initialize session state for tracking model loading
	    if 'model_loaded' not in st.session_state:
	        st.session_state.model_loaded = False

	    # Load model (once per session)
	    if not st.session_state.model_loaded:
	        with st.spinner("Loading model... This may take a few minutes..."):
	            processor, model, vocoder, device = load_model()

	        # Explicit identity checks: avoids invoking ``==`` on model objects
	        if processor is None or model is None or vocoder is None:
	            st.error("Failed to load model. Please refresh the page to try again.")
	            return

	        st.session_state.processor = processor
	        st.session_state.model = model
	        st.session_state.vocoder = vocoder
	        st.session_state.device = device
	        st.session_state.model_loaded = True
	        st.success("Model loaded successfully! 🚀")

	    # Create text input area
	    text_input = st.text_area(
	        "Enter Nepali Text:",
	        height=100,
	        placeholder="तपाईंको नेपाली पाठ यहाँ लेख्नुहोस्..."
	    )

	    # Create speaker embedding
	    speaker_embeddings = create_speaker_embedding()

	    # Add generate button in the narrower left column
	    button_col, _ = st.columns([1, 2])
	    with button_col:
	        generate_button = st.button("🔊 Generate Speech")

	    # Generate speech when button is clicked
	    if generate_button:
	        if not (text_input or "").strip():
	            # Previously a click with no text did nothing, and whitespace-only
	            # text was sent to the model; give explicit feedback instead.
	            st.warning("Please enter some Nepali text before generating speech.")
	        else:
	            with st.spinner("Generating speech..."):
	                audio_buffer = generate_speech(
	                    text_input,
	                    st.session_state.processor,
	                    st.session_state.model,
	                    st.session_state.vocoder,
	                    speaker_embeddings,
	                    st.session_state.device
	                )

	            # generate_speech returns None after reporting a failure
	            if audio_buffer is not None:
	                # Display audio player
	                st.markdown("### Generated Speech:")
	                st.markdown(get_audio_player_html(audio_buffer), unsafe_allow_html=True)

	                # The player helper consumed the buffer; rewind for the download
	                audio_buffer.seek(0)
	                st.download_button(
	                    label="📥 Download Audio",
	                    data=audio_buffer,
	                    file_name="generated_speech.wav",
	                    mime="audio/wav"
	                )

	    # Add usage instructions
	    with st.expander("ℹ️ Usage Instructions"):
	        st.markdown("""
	1. Enter your Nepali text in the text area above
	2. Click the 'Generate Speech' button
	3. Wait for the audio to be generated
	4. Use the audio player to listen to the generated speech
	5. Download the audio file if desired

	Note: For best results, enter clear and grammatically correct Nepali text.
	""")

	    # Add footer
	    st.markdown("---")
	    st.markdown(
	        # "\\|" is a literal backslash + pipe (same text as before) without
	        # relying on the invalid "\|" escape sequence.
	        "Made with ❤️ using Streamlit and SpeechT5 \\| "
	        "Model: [aryamanstha/speecht5_nepali_oslr43_oslr143](https://huggingface.co/aryamanstha/speecht5_nepali_oslr43_oslr143)"
	    )

	# Script entry point: build the page only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	main()