Spaces:

szili2011
/

FNaF-Audio-Generation

Runtime error

App Files Files Community

szili2011 committed on Sep 24, 2024

Commit

09a58b6

verified ·

1 Parent(s): 3ffb926

Update app.py

Browse files

Files changed (1) hide show

app.py +73 -40

app.py CHANGED Viewed

@@ -1,57 +1,90 @@
-import os
-import numpy as np
 import gradio as gr
-from scipy.io.wavfile import write
 import tensorflow as tf
 import nltk
 from nltk.corpus import cmudict
-# Download CMU dictionary if not already downloaded
-nltk.download('cmudict', quiet=True)
-# Ensure TensorFlow uses CPU only
-os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # Disable GPU
-# Load CMU dictionary for pronunciation
-cmu_dict = cmudict.dict()
-# Load your pre-trained model (adjust the model loading according to your implementation)
-# For example, if your model is a Keras model, you would use:
-# model = tf.keras.models.load_model('path_to_your_model')
-# Replace this with your actual model loading code
-# model = ...
-def generate_audio(text, duration):
-    sample_rate = 22050  # Sample rate in Hz
-    t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
-    # Placeholder: Generate a simple sine wave audio signal
-    frequency = 440  # Frequency in Hz (A4 note)
-    audio_data = 0.5 * np.sin(2 * np.pi * frequency * t)  # Generate sine wave
-    return audio_data
-def generate_sfx(duration):
-    text = "Sample text for audio generation"  # Replace with actual input text if needed
-    audio_data = generate_audio(text, duration)
-    audio_data = (audio_data * 32767).astype(np.int16)  # Scale to 16-bit PCM
-    total_samples = duration * 22050  # Adjust based on sample rate
-    if len(audio_data) < total_samples:
-        raise ValueError(f"Generated audio is shorter than {duration} seconds.")
-    output_filename = "output.wav"
-    write(output_filename, 22050, audio_data[:total_samples])  # Write to WAV file
-    return output_filename
-duration_slider = gr.Slider(minimum=2, maximum=20, label="Duration (seconds)", value=10)
-app = gr.Interface(fn=generate_sfx,
-                   inputs=duration_slider,
-                   outputs="audio",
-                   title="Sound Effect Generator",
-                   description="Generate sound effects for a specified duration.")
 if __name__ == "__main__":
-    app.launch()

 import gradio as gr
 import tensorflow as tf
+import numpy as np
 import nltk
 from nltk.corpus import cmudict
+from scipy.io.wavfile import write
+# Download required NLTK data
+nltk.download('averaged_perceptron_tagger')
+nltk.download('cmudict')
+# Run on CPU only. TensorFlow rejects changes to the visible devices once its
+# runtime has been initialized, so this must happen before the model is loaded.
+tf.config.set_visible_devices([], 'GPU')
+# Load your model from the root directory
+model = tf.keras.models.load_model('audio_model.h5')
+# Preprocess input text
+# Parsed CMU pronouncing dictionary, filled in on first use so the corpus is
+# not re-read on every request.
+_CMU_DICT = None
+# Punctuation stripped from the edges of each word before lookup. Apostrophes
+# are left alone so contractions such as "don't" are looked up intact.
+_EDGE_PUNCTUATION = '.,!?;:"()[]{}'
+def _get_cmu_dict():
+    """Return the CMU pronouncing dictionary, loading it only once."""
+    global _CMU_DICT
+    if _CMU_DICT is None:
+        _CMU_DICT = cmudict.dict()
+    return _CMU_DICT
+def preprocess_text(text):
+    """
+    Turn input text into a model-ready feature tensor.
+    The text is lower-cased and split on whitespace; each word is looked up
+    in the CMU pronouncing dictionary and replaced by the phonemes of its
+    first listed pronunciation. Words that are not in the dictionary
+    contribute a single 'UNKNOWN' placeholder.
+    Args:
+        text: The user-supplied word or sentence. None or an empty string is
+            treated as a single unknown word.
+    Returns:
+        A NumPy array of shape (1, sequence_length, 13), where
+        sequence_length is the number of phonemes (at least 1).
+    NOTE: the feature values are random placeholders, so the result is not
+    deterministic. Real per-phoneme feature extraction still has to be
+    implemented.
+    """
+    pronunciations = _get_cmu_dict()
+    flattened_phonemes = []
+    for word in (text or "").lower().split():
+        entry = pronunciations.get(word.strip(_EDGE_PUNCTUATION))
+        # Use the first pronunciation; unknown words become one placeholder.
+        flattened_phonemes.extend(entry[0] if entry else ['UNKNOWN'])
+    if not flattened_phonemes:
+        # Empty input would otherwise produce a zero-length sequence.
+        flattened_phonemes = ['UNKNOWN']
+    # Create dummy 13-feature vectors for each phoneme (you need to implement your own feature extraction)
+    num_features = 13
+    sequence_length = len(flattened_phonemes)
+    input_data = np.random.rand(sequence_length, num_features)
+    # Add batch dimension
+    input_data = np.expand_dims(input_data, axis=0)  # Shape (1, sequence_length, 13)
+    return input_data
+# Convert model output to an audio file
+def convert_to_audio(model_output, filename="output.wav", sample_rate=22050):
+    """
+    Convert the model output into a .wav file.
+    The samples are rescaled linearly so that the smallest value maps to -1
+    and the largest to 1, then written as 32-bit float PCM.
+    Args:
+        model_output: Array-like of audio samples.
+        filename: Path of the .wav file to write.
+        sample_rate: Sample rate in Hz stored in the file.
+    Returns:
+        The filename that was written.
+    Raises:
+        ValueError: If model_output contains no samples.
+    """
+    samples = np.asarray(model_output, dtype=float)
+    if samples.size == 0:
+        raise ValueError("model_output is empty; there is no audio to write.")
+    lowest, highest = samples.min(), samples.max()
+    if highest > lowest:
+        # Normalize the audio output
+        normalized_output = np.interp(samples, (lowest, highest), (-1, 1))
+    else:
+        # A constant signal has no range to rescale (np.interp needs
+        # increasing breakpoints), so write silence instead.
+        normalized_output = np.zeros_like(samples)
+    # Write the audio data to a file
+    write(filename, sample_rate, normalized_output.astype(np.float32))  # Ensure the output is of type float32
+    return filename
+# Define function to generate sound effect
+def generate_sfx(text, duration=30):
+    """
+    Generate a .wav sound effect of the requested length from input text.
+    The text is preprocessed, run through the model, and the flattened
+    prediction is repeated (and trimmed) until it fills the duration.
+    Args:
+        text: The word or sentence to convert.
+        duration: Desired length in seconds; int or float.
+    Returns:
+        Path of the generated audio file ("output.wav").
+    Raises:
+        ValueError: If duration is not positive or the model returns an
+            empty prediction.
+    """
+    sample_rate = 22050
+    # duration may arrive as a float (e.g. from a UI slider), but sample
+    # counts must be integers.
+    total_samples = int(round(float(duration) * sample_rate))
+    if total_samples <= 0:
+        raise ValueError("duration must be greater than zero.")
+    input_data = preprocess_text(text)
+    # Generate prediction
+    prediction = model.predict(input_data)
+    waveform = np.asarray(prediction).flatten()
+    if waveform.size == 0:
+        raise ValueError("Model returned an empty prediction; cannot build audio.")
+    # np.resize repeats the waveform as often as needed and trims to length.
+    audio_data = np.resize(waveform, total_samples)
+    # Convert the prediction to an audio file
+    audio_file = convert_to_audio(audio_data, filename="output.wav", sample_rate=sample_rate)
+    return audio_file
+# Define the Gradio interface
+interface = gr.Interface(
+    fn=generate_sfx,
+    inputs=[
+        gr.Textbox(label="Enter a Word", placeholder="Write a Word To Convert it into SFX Sound"),
+        # `value` sets the slider's initial position (the old `default`
+        # keyword is not accepted by current Gradio). It must lie within
+        # [minimum, maximum], which the previous 30 did not.
+        gr.Slider(minimum=2, maximum=20, value=10, label="Duration (seconds)")
+    ],
+    outputs=gr.Audio(label="Generated SFX", type="filepath"),
+    title="SFX Generator from Text",
+    description="Enter a word or sentence, and the model will generate an SFX sound.",
+)
+# Run the interface
 if __name__ == "__main__":
+    try:
+        tf.config.set_visible_devices([], 'GPU')  # Disable GPU
+    except RuntimeError as err:
+        # TensorFlow refuses to change visible devices once its runtime is
+        # initialized, and the model is already loaded at import time.
+        # Serving on the default devices is better than not starting at all.
+        print(f"Could not disable GPU: {err}")
+    interface.launch()