File size: 4,328 Bytes
3dba9d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import gradio as gr
import numpy as np
import os
import time
from scipy.io import wavfile

# Explicitly import Bark components
from bark import generate_audio, SAMPLE_RATE
from bark.generation import preload_models

class VoiceCloningApp:
    """Backend for the Gradio demo: saves reference audio and runs Bark TTS.

    Constructing the app creates a ``working_files`` directory next to this
    source file and preloads the Bark models.

    NOTE: the reference recording saved by ``process_reference_audio`` is not
    consumed by ``generate_speech``, which calls ``generate_audio`` with
    ``history_prompt=None``.
    """

    def __init__(self):
        """Create the working directory and preload the Bark models.

        Raises:
            RuntimeError: If ``preload_models()`` fails. The original
                exception is chained as ``__cause__``.
        """
        # Create working directory
        self.base_dir = os.path.dirname(os.path.abspath(__file__))
        self.working_dir = os.path.join(self.base_dir, "working_files")
        os.makedirs(self.working_dir, exist_ok=True)

        # Explicit model loading with error handling
        try:
            print("Attempting to load Bark models...")
            preload_models()
            print("Bark models loaded successfully.")
        except Exception as e:
            print(f"Error loading Bark models: {e}")
            import traceback
            traceback.print_exc()
            # Chain the cause so the underlying failure stays in the traceback.
            raise RuntimeError(f"Could not load Bark models. Error: {e}") from e

    def _new_wav_path(self, prefix):
        """Return ``<working_dir>/<prefix>_<nanosecond timestamp>.wav``.

        A nanosecond timestamp is used so that two requests arriving within
        the same second do not overwrite each other's file.
        """
        return os.path.join(self.working_dir, f"{prefix}_{time.time_ns()}.wav")

    def process_reference_audio(self, audio_data):
        """Peak-normalize a recording and save it as a WAV file.

        Args:
            audio_data: ``None`` or a ``(sample_rate, samples)`` pair, where
                ``samples`` is array-like audio data.

        Returns:
            A status message for display: a prompt when no audio was given,
            a success message once the file is written, or an
            ``"Error processing audio: ..."`` message on failure.
        """
        if audio_data is None:
            return "Please provide an audio input"

        try:
            # Unpack audio data
            sample_rate, audio_array = audio_data

            # Convert to float *before* taking absolute values: np.abs on an
            # int16 sample of -32768 overflows and would understate the peak.
            # float32 also yields a 32-bit float WAV instead of a 64-bit one.
            samples = np.asarray(audio_array, dtype=np.float32)

            # An empty array makes np.max raise ValueError, which is reported
            # through the except branch below.
            peak = float(np.max(np.abs(samples)))

            # Skip scaling for silent (peak == 0) or non-finite input so the
            # saved file is never filled with NaN from a 0/0 division.
            if np.isfinite(peak) and peak > 0.0:
                samples = samples / peak

            # Save reference audio
            filepath = self._new_wav_path("reference")
            wavfile.write(filepath, sample_rate, samples)

            return "✅ Audio captured successfully!"

        except Exception as e:
            return f"Error processing audio: {str(e)}"

    def generate_speech(self, text):
        """Generate speech for ``text`` with Bark and save it as a WAV file.

        Args:
            text: The text to speak. Empty or whitespace-only text is
                rejected without calling Bark.

        Returns:
            A ``(filepath, error)`` pair: ``(path, None)`` on success, or
            ``(None, message)`` when the text is blank or generation fails.
        """
        if not text or not text.strip():
            return None, "Please enter some text to speak"

        try:
            # Generate audio with explicit error handling
            print(f"Generating speech for text: {text}")

            # No history prompt is supplied, so Bark is not conditioned on
            # any previously saved reference recording.
            audio_array = generate_audio(
                text,
                history_prompt=None,
            )

            # Save generated audio
            filepath = self._new_wav_path("generated_speech")
            wavfile.write(filepath, SAMPLE_RATE, audio_array)

            return filepath, None

        except Exception as e:
            print(f"Speech generation error: {e}")
            import traceback
            traceback.print_exc()
            return None, f"Error generating speech: {str(e)}"

def create_interface():
    """Build the Gradio Blocks UI for the voice cloning app.

    Instantiates ``VoiceCloningApp`` (whose constructor creates the working
    directory and loads the Bark models, so no separate directory setup is
    needed here) and lays out two columns: one to capture and process a
    reference recording, one to enter text and generate speech.

    Returns:
        The assembled ``gr.Blocks`` interface; the caller launches it.
    """
    app = VoiceCloningApp()

    with gr.Blocks() as interface:
        gr.Markdown("# 🎙️ Voice Cloning App")

        with gr.Row():
            with gr.Column():
                gr.Markdown("## 1. Capture Reference Voice")
                reference_audio = gr.Audio(sources=["microphone", "upload"], type="numpy")
                process_btn = gr.Button("Process Reference Voice")
                process_output = gr.Textbox(label="Processing Result")

            with gr.Column():
                gr.Markdown("## 2. Generate Speech")
                text_input = gr.Textbox(label="Enter Text to Speak")
                generate_btn = gr.Button("Generate Speech")
                audio_output = gr.Audio(label="Generated Speech")
                error_output = gr.Textbox(label="Errors", visible=True)

        # Bind functions
        process_btn.click(
            fn=app.process_reference_audio,
            inputs=reference_audio,
            outputs=process_output,
        )

        # generate_speech returns (filepath, error), matching the two outputs.
        generate_btn.click(
            fn=app.generate_speech,
            inputs=text_input,
            outputs=[audio_output, error_output],
        )

    return interface

if __name__ == "__main__":
    # Serve the UI on every network interface at port 7860, without a public
    # share link, with debug mode and in-browser error display enabled.
    launch_options = {
        "share": False,
        "debug": True,
        "show_error": True,
        "server_name": '0.0.0.0',
        "server_port": 7860,
    }
    interface = create_interface()
    interface.launch(**launch_options)