import gradio as gr
import tensorflow as tf
import numpy as np
import nltk
from nltk.corpus import cmudict
from scipy.io.wavfile import write

# Audio sample rate (Hz), used for both synthesis and the WAV header
SAMPLE_RATE = 22050

# Download required NLTK data
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('cmudict', quiet=True)

# Load the trained Keras model from the repository root
model = tf.keras.models.load_model('audio_model.h5', compile=False)

# Preprocess input text: map each word to its CMUdict phonemes, then build a
# model input of shape (1, sequence_length, num_features).
def preprocess_text(text):
    d = cmudict.dict()  # reloaded on every call; could be cached at module level
    words = text.lower().split()
    phonemes = []

    for word in words:
        if word in d:
            phonemes.append(d[word][0])  # first listed pronunciation
        else:
            phonemes.append(['UNKNOWN'])

    flattened_phonemes = [p for sublist in phonemes for p in sublist]

    num_features = 13
    sequence_length = len(flattened_phonemes)
    if sequence_length == 0:
        return np.zeros((1, 1, num_features))

    # NOTE: the phonemes only determine the sequence length here; the feature
    # values themselves are random placeholders, so the audio varies per call.
    input_data = np.random.rand(sequence_length, num_features)
    input_data = np.expand_dims(input_data, axis=0)  # add batch dimension

    return input_data
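# A deterministic alternative to the random placeholder features above, shown
# for reference only (hypothetical helper, not wired into the app; it assumes
# the model simply expects some fixed-size vector per phoneme). Seeding from a
# CRC32 of each phoneme symbol makes identical text yield identical input on
# every run, unlike np.random.rand:
import zlib  # would normally sit with the imports at the top

def phonemes_to_features(phoneme_seq, num_features=13):
    rows = []
    for p in phoneme_seq:
        seed = zlib.crc32(p.encode("utf-8"))  # stable across runs and processes
        rows.append(np.random.default_rng(seed).random(num_features))
    return np.expand_dims(np.asarray(rows, dtype=np.float32), axis=0)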

# Convert model output to a WAV file, rescaling samples into [-1, 1]
def convert_to_audio(model_output, filename="output.wav"):
    if model_output.size == 0:
        return None
    lo, hi = model_output.min(), model_output.max()
    if lo == hi:
        # np.interp needs an increasing range; a constant signal becomes silence.
        normalized_output = np.zeros_like(model_output)
    else:
        normalized_output = np.interp(model_output, (lo, hi), (-1, 1))
    write(filename, SAMPLE_RATE, normalized_output.astype(np.float32))
    return filename

# Generate a sound effect: run one forward pass, then tile the model output
# to fill the requested duration.
def generate_sfx(text, duration):
    input_data = preprocess_text(text)

    if input_data.shape[1] == 0:
        return None

    prediction = model.predict(input_data)
    flat_prediction = prediction.flatten()

    if len(flat_prediction) == 0:
        return None

    # Gradio may deliver the slider value as a float, so cast before slicing.
    num_samples = int(duration) * SAMPLE_RATE
    num_repeats = num_samples // len(flat_prediction) + 1
    audio_data = np.tile(flat_prediction, num_repeats)[:num_samples]

    return convert_to_audio(audio_data, filename="output.wav")

# Define the Gradio interface
interface = gr.Interface(
    fn=generate_sfx,
    inputs=[
        gr.Textbox(label="Enter a Word or Sentence", placeholder="Type text to convert into an SFX sound"),
        gr.Slider(minimum=1, maximum=20, value=3, step=1, label="Duration (seconds)")
    ],
    outputs=gr.Audio(label="Generated SFX", type="filepath"),
    title="SFX Generator from Text",
    description="Enter a word or sentence, and the model will generate an SFX sound.",
)

# Run the interface
if __name__ == "__main__":
    tf.config.set_visible_devices([], 'GPU')  # force CPU inference
    # share=True creates a public shareable link; it is required in sandboxed
    # environments (e.g. Colab or some hosted notebooks) where localhost is
    # not directly reachable.
    interface.launch(share=True)
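# Quick smoke test without the UI (hypothetical usage; run in a Python shell
# after the model has loaded):
#
#     path = generate_sfx("explosion", 3)
#     print(path)  # -> "output.wav"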