import gradio as gr
import tensorflow as tf
import numpy as np
import nltk
from nltk.corpus import cmudict
from scipy.io.wavfile import write

# Define the sample rate as a global constant
SAMPLE_RATE = 22050

# Download required NLTK data
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('cmudict', quiet=True)

# Load your model from the root directory.
# compile=False is often needed for inference-only models and can
# resolve some loading warnings.
model = tf.keras.models.load_model('audio_model.h5', compile=False)

# Preprocess input text into a (1, sequence_length, 13) feature array
def preprocess_text(text):
    d = cmudict.dict()  # CMU pronouncing dictionary (rebuilt per call; could be cached at module level)
    words = text.lower().split()
    phonemes = []
    for word in words:
        if word in d:
            phonemes.append(d[word][0])  # use the first listed pronunciation
        else:
            phonemes.append(['UNKNOWN'])
    flattened_phonemes = [p for sublist in phonemes for p in sublist]
    # Create dummy 13-feature vectors for each phoneme (implement your own feature extraction)
    num_features = 13
    sequence_length = len(flattened_phonemes)
    if sequence_length == 0:
        # Return a zero-length sequence so callers can detect empty input via shape[1]
        return np.zeros((1, 0, num_features))
    input_data = np.random.rand(sequence_length, num_features)
    # Add batch dimension
    input_data = np.expand_dims(input_data, axis=0)  # Shape (1, sequence_length, 13)
    return input_data
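
# The random feature vectors above are placeholders, so the same text produces a
# different sound on every call. A minimal deterministic alternative is sketched
# below; it is an assumption for illustration, not the feature scheme the model
# was trained on, and the helper name `phonemes_to_features` is hypothetical.
def phonemes_to_features(flattened_phonemes, num_features=13):
    features = np.zeros((len(flattened_phonemes), num_features), dtype=np.float32)
    for i, phoneme in enumerate(flattened_phonemes):
        # Deterministic slot: sum of character codes, folded into num_features
        features[i, sum(ord(c) for c in phoneme) % num_features] = 1.0
    return features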

# Convert model output to an audio file
def convert_to_audio(model_output, filename="output.wav"):
    if model_output.size == 0:  # Handle empty output
        return None
    # Normalize audio to be between -1 and 1; guard against a constant
    # signal, where min == max would make np.interp ill-defined
    lo, hi = model_output.min(), model_output.max()
    if lo == hi:
        normalized_output = np.zeros_like(model_output)
    else:
        normalized_output = np.interp(model_output, (lo, hi), (-1, 1))
    write(filename, SAMPLE_RATE, normalized_output.astype(np.float32))
    return filename
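
# Design note: writing to a fixed "output.wav" means concurrent users overwrite
# each other's output. The sketch below is an alternative, not used by the app
# as wired; it assumes the output component is switched to gr.Audio(type="numpy"),
# which accepts a (sample_rate, samples) tuple, so the audio stays in memory:
def convert_to_audio_numpy(model_output):
    if model_output.size == 0:
        return None
    lo, hi = model_output.min(), model_output.max()
    # Same normalization and constant-signal guard as convert_to_audio above
    samples = np.zeros_like(model_output) if lo == hi else np.interp(model_output, (lo, hi), (-1, 1))
    return (SAMPLE_RATE, samples.astype(np.float32))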

# Generate a sound effect from text
def generate_sfx(text, duration):
    input_data = preprocess_text(text)
    # Check for empty input after preprocessing
    if input_data.shape[1] == 0:
        return None  # Return None to clear the audio component
    prediction = model.predict(input_data)
    flat_prediction = prediction.flatten()
    if len(flat_prediction) == 0:
        return None
    # Stretch the output to the requested duration by tiling, then truncating.
    # Gradio sliders can deliver floats, so cast the target sample count to int.
    num_samples = int(duration * SAMPLE_RATE)
    num_repeats = num_samples // len(flat_prediction) + 1
    audio_data = np.tile(flat_prediction, num_repeats)[:num_samples]
    audio_file = convert_to_audio(audio_data, filename="output.wav")
    return audio_file
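
# Quick local smoke test (illustrative; the word "boom" is arbitrary):
#   generate_sfx("boom", 3)  # -> "output.wav" on success, None for empty input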

# Define the Gradio interface
interface = gr.Interface(
    fn=generate_sfx,
    inputs=[
        gr.Textbox(label="Enter a Word", placeholder="Write a word to convert into an SFX sound"),
        gr.Slider(minimum=1, maximum=20, value=3, step=1, label="Duration (seconds)"),
    ],
    outputs=gr.Audio(label="Generated SFX", type="filepath"),
    title="SFX Generator from Text",
    description="Enter a word or sentence, and the model will generate an SFX sound.",
)

# Run the interface
if __name__ == "__main__":
    tf.config.set_visible_devices([], 'GPU')  # Disable GPU for inference
    # share=True is not needed on Hugging Face Spaces, which host the app directly
    interface.launch()
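
# When running locally (outside Spaces), a public link can be created instead with:
#   interface.launch(share=True)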