szili2011 committed on
Commit 09a58b6 · verified · 1 Parent(s): 3ffb926

Update app.py

Files changed (1)
  1. app.py +73 -40
app.py CHANGED
@@ -1,57 +1,90 @@
-import os
-import numpy as np
 import gradio as gr
-from scipy.io.wavfile import write
 import tensorflow as tf
 import nltk
 from nltk.corpus import cmudict

-# Download CMU dictionary if not already downloaded
-nltk.download('cmudict', quiet=True)
-
-# Ensure TensorFlow uses CPU only
-os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # Disable GPU
-
-# Load CMU dictionary for pronunciation
-cmu_dict = cmudict.dict()

-# Load your pre-trained model (adjust the model loading according to your implementation)
-# For example, if your model is a Keras model, you would use:
-# model = tf.keras.models.load_model('path_to_your_model')

-# Replace this with your actual model loading code
-# model = ...

-def generate_audio(text, duration):
-    sample_rate = 22050  # Sample rate in Hz
-    t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)

-    # Placeholder: Generate a simple sine wave audio signal
-    frequency = 440  # Frequency in Hz (A4 note)
-    audio_data = 0.5 * np.sin(2 * np.pi * frequency * t)  # Generate sine wave

-    return audio_data

-def generate_sfx(duration):
-    text = "Sample text for audio generation"  # Replace with actual input text if needed
-    audio_data = generate_audio(text, duration)
-    audio_data = (audio_data * 32767).astype(np.int16)  # Scale to 16-bit PCM
-    total_samples = duration * 22050  # Adjust based on sample rate

-    if len(audio_data) < total_samples:
-        raise ValueError(f"Generated audio is shorter than {duration} seconds.")

-    output_filename = "output.wav"
-    write(output_filename, 22050, audio_data[:total_samples])  # Write to WAV file
-    return output_filename

-duration_slider = gr.Slider(minimum=2, maximum=20, label="Duration (seconds)", value=10)

-app = gr.Interface(fn=generate_sfx,
-                   inputs=duration_slider,
-                   outputs="audio",
-                   title="Sound Effect Generator",
-                   description="Generate sound effects for a specified duration.")

 if __name__ == "__main__":
-    app.launch()
 import gradio as gr
 import tensorflow as tf
+import numpy as np
 import nltk
 from nltk.corpus import cmudict
+from scipy.io.wavfile import write

+# Download required NLTK data
+nltk.download('averaged_perceptron_tagger')
+nltk.download('cmudict')

+# Keep TensorFlow on the CPU; this must run before the model below is loaded,
+# because visible devices cannot be modified once TensorFlow initializes them
+tf.config.set_visible_devices([], 'GPU')
+
+# Load your model from the root directory
+model = tf.keras.models.load_model('audio_model.h5')

+# Preprocess input text
+def preprocess_text(text):
+    """
+    Process the input text to prepare it for the model.
+    This includes tokenization and phoneme extraction.
+    """
+    d = cmudict.dict()
+    words = text.lower().split()
+    phonemes = []

+    for word in words:
+        if word in d:
+            phonemes.append(d[word][0])
+        else:
+            phonemes.append(['UNKNOWN'])

+    flattened_phonemes = [p for sublist in phonemes for p in sublist]
+
+    # Create dummy 13-feature vectors for each phoneme (you need to implement your own feature extraction)
+    num_features = 13
+    sequence_length = len(flattened_phonemes)
+    input_data = np.random.rand(sequence_length, num_features)
+
+    # Add batch dimension
+    input_data = np.expand_dims(input_data, axis=0)  # Shape (1, sequence_length, 13)
+
+    return input_data

+# Convert model output to an audio file
+def convert_to_audio(model_output, filename="output.wav", sample_rate=22050):
+    """
+    Convert the model output into a .wav file.
+    """
+    # Normalize the audio output to the [-1, 1] range
+    normalized_output = np.interp(model_output, (model_output.min(), model_output.max()), (-1, 1))

+    # Write the audio data to a file; scipy expects float32 samples in [-1, 1]
+    write(filename, sample_rate, normalized_output.astype(np.float32))

+    return filename

+# Define function to generate sound effect
+def generate_sfx(text, duration=10):
+    """
+    Takes input text, preprocesses it, runs it through the model,
+    and generates a downloadable audio file of the specified duration.
+    """
+    input_data = preprocess_text(text)
+
+    # Generate prediction
+    prediction = model.predict(input_data)

+    # Stretch the prediction to the requested duration by tiling, then trim;
+    # cast the duration to int because the slider may deliver it as a float
+    total_samples = int(duration) * 22050
+    audio_data = np.tile(prediction.flatten(), total_samples // len(prediction.flatten()) + 1)[:total_samples]
+
+    # Convert the prediction to an audio file
+    audio_file = convert_to_audio(audio_data, filename="output.wav")
+
+    return audio_file

+# Define the Gradio interface
+interface = gr.Interface(
+    fn=generate_sfx,
+    inputs=[
+        gr.Textbox(label="Enter a Word", placeholder="Write a word to convert into an SFX sound"),
+        gr.Slider(minimum=2, maximum=20, value=10, label="Duration (seconds)")
+    ],
+    outputs=gr.Audio(label="Generated SFX", type="filepath"),
+    title="SFX Generator from Text",
+    description="Enter a word or sentence, and the model will generate an SFX sound.",
+)

+# Run the interface
 if __name__ == "__main__":
+    interface.launch()
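
Note: preprocess_text in the new version feeds the model random placeholder vectors (np.random.rand), so the output is noise-like regardless of the input text; the inline comment says feature extraction still needs to be implemented. A minimal deterministic sketch of what could replace it is below; the 13-bit binary index encoding, the phonemes_to_features helper, and the PHONEME_INDEX table are illustrative assumptions, not part of this commit.

# Hypothetical replacement for the np.random.rand placeholder in
# preprocess_text: encode each phoneme's vocabulary index as 13 binary digits
import numpy as np
from nltk.corpus import cmudict

# Stable phoneme vocabulary built from cmudict itself; index 0 is reserved
# for the 'UNKNOWN' marker that preprocess_text emits for unknown words
PHONEME_INDEX = {ph: i + 1 for i, ph in enumerate(
    sorted({ph for prons in cmudict.dict().values() for pron in prons for ph in pron})
)}

def phonemes_to_features(phonemes, num_features=13):
    """Map each phoneme to a num_features-long binary vector of its index."""
    rows = []
    for ph in phonemes:
        idx = PHONEME_INDEX.get(ph, 0)
        rows.append([(idx >> bit) & 1 for bit in range(num_features)])
    return np.asarray(rows, dtype=np.float32)

In preprocess_text this would replace the np.random.rand call with input_data = phonemes_to_features(flattened_phonemes) before the batch dimension is added. The commit also assumes an audio_model.h5 at the repository root that accepts a (batch, timesteps, 13) array; a throwaway stand-in for smoke-testing the Gradio plumbing (a hypothetical model, not the one this Space actually uses) could be:

# Hypothetical stand-in for audio_model.h5: one tanh-bounded waveform sample
# per timestep from (batch, timesteps, 13) features; for smoke tests only
import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.layers.LSTM(32, return_sequences=True, input_shape=(None, 13)),
    tf.keras.layers.Dense(1, activation="tanh"),
])
model.save("audio_model.h5")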