import os
import subprocess

import requests
import gradio as gr
import moviepy.editor as mp
import torch
import assemblyai as aai
import scipy.io.wavfile as wavfile

# Import specific model components
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# Define paths for the downloaded XTTS v2 model and configuration files
model_path = "/xtts_v2"
config_path = os.path.join(model_path, "config.json")

# Initialize and load the XTTS model
config = XttsConfig()
config.load_json(config_path)
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir=model_path, eval=True)
if torch.cuda.is_available():
    model.cuda()  # Move the model to the GPU when one is present
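
# The model files can be fetched ahead of time with Coqui's model manager;
# this is one known way to do it (the exact on-disk layout it produces is an
# assumption, so verify that config.json ends up under model_path):
#   from TTS.utils.manage import ModelManager
#   ModelManager().download_model("tts_models/multilingual/multi-dataset/xtts_v2")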

def synthesize_text(text, speaker_wav, language):
    """Synthesize `text` in the voice of `speaker_wav` with XTTS.

    Returns the dict produced by `model.synthesize`; its "wav" entry holds
    the generated waveform.
    """
    try:
        outputs = model.synthesize(
            text,
            config,
            speaker_wav=speaker_wav,  # reference clip used for voice cloning
            gpt_cond_len=3,           # seconds of the clip used for conditioning
            language=language
        )
        return outputs
    except Exception as e:
        print(f"Error during synthesis: {e}")
        raise
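
# Example call (hypothetical file names, assuming a short reference clip of
# the target speaker exists on disk):
#   outputs = synthesize_text("Hello world", speaker_wav="speaker_ref.wav", language="en")
#   wavfile.write("hello.wav", 24000, outputs["wav"])  # XTTS v2 generates 24 kHz audio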

# Translation class
class Translation:
    def __init__(self, video_path, original_language, target_language):
        self.video_path = video_path
        self.original_language = original_language
        self.target_language = target_language

    # ISO 639-1 codes for the languages exposed in the UI
    LANGUAGE_CODES = {'English': 'en', 'German': 'de', 'Italian': 'it', 'Spanish': 'es'}

    def org_language_parameters(self, original_language):
        self.lan_code = self.LANGUAGE_CODES.get(original_language, '')

    def target_language_parameters(self, target_language):
        self.tran_code = self.LANGUAGE_CODES.get(target_language, '')

    def extract_audio(self):
        # The extracted track doubles as the speaker reference for voice cloning
        video = mp.VideoFileClip(self.video_path)
        audio_path = "output_audio.wav"
        video.audio.write_audiofile(audio_path)
        video.close()
        return audio_path

    def transcribe_audio(self, audio_path):
        # Transcribe the extracted audio with AssemblyAI in the source language
        aai.settings.api_key = os.getenv("ASSEMBLYAI_API_KEY")
        transcriber = aai.Transcriber(config=aai.TranscriptionConfig(language_code=self.lan_code))
        transcript = transcriber.transcribe(audio_path)
        if transcript.status == aai.TranscriptStatus.error:
            raise RuntimeError(f"Transcription failed: {transcript.error}")
        return transcript.text

    def translate_text(self, transcript_text):
        # Translate via the Microsoft Translator v3 REST API
        base_url = "https://api.cognitive.microsofttranslator.com/translate"
        headers = {
            "Ocp-Apim-Subscription-Key": os.getenv("MICROSOFT_TRANSLATOR_API_KEY"),
            "Content-Type": "application/json",
            "Ocp-Apim-Subscription-Region": "southeastasia"  # must match your Translator resource's region
        }
        params = {"api-version": "3.0", "from": self.lan_code, "to": self.tran_code}
        body = [{"text": transcript_text}]
        response = requests.post(base_url, headers=headers, params=params, json=body)
        response.raise_for_status()  # surface HTTP errors instead of a KeyError below
        translation = response.json()[0]["translations"][0]["text"]
        return translation
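
    # For reference, the API returns one result per input text, e.g.
    # [{"translations": [{"text": "Hallo Welt", "to": "de"}]}], which is why
    # the single translation above is read via [0]["translations"][0]["text"].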

    def generate_audio(self, translated_text):
        try:
            outputs = synthesize_text(
                translated_text,
                speaker_wav='output_audio.wav',  # clone the original speaker's voice
                language=self.tran_code
            )
            # `outputs["wav"]` is a float waveform array; XTTS v2 outputs 24 kHz
            wavfile.write("output_synth.wav", 24000, outputs["wav"])
            return "output_synth.wav"
        except Exception as e:
            print(f"Error generating audio: {e}")
            raise

    def translate_video(self):
        # Full pipeline: extract audio -> transcribe -> translate -> re-synthesize -> lip-sync
        audio_path = self.extract_audio()
        self.org_language_parameters(self.original_language)
        self.target_language_parameters(self.target_language)
        transcript_text = self.transcribe_audio(audio_path)
        translated_text = self.translate_text(transcript_text)
        translated_audio_path = self.generate_audio(translated_text)

        # Run Wav2Lip inference; passing arguments as a list avoids
        # shell-quoting problems with Gradio's temporary file paths
        subprocess.run([
            "python", "inference.py",
            "--checkpoint_path", "checkpoints/wav2lip_gan.pth",
            "--face", self.video_path,
            "--audio", translated_audio_path,
            "--outfile", "output_video.mp4",
        ], check=True)
        return 'output_video.mp4'


# Gradio Interface
def app(video_path, original_language, target_language):
    translator = Translation(video_path, original_language, target_language)
    video_file = translator.translate_video()
    return video_file

interface = gr.Interface(
    fn=app,
    inputs=[
        gr.Video(label="Video Path"),
        gr.Dropdown(["English", "German", "Italian", "Spanish"], label="Original Language"),
        gr.Dropdown(["English", "German", "Italian", "Spanish"], label="Target Language"),
    ],
    outputs=gr.Video(label="Translated Video")
)

interface.launch(share=True)  # share=True also exposes the app on a temporary public Gradio link
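
# To run this script end to end you also need (an assumption based on the
# imports above, not pinned versions):
#   pip install TTS assemblyai moviepy gradio requests scipy
#   export ASSEMBLYAI_API_KEY=<your AssemblyAI key>
#   export MICROSOFT_TRANSLATOR_API_KEY=<your Azure Translator key>
# plus the XTTS v2 model files under /xtts_v2 and Wav2Lip's inference.py with
# its checkpoint at checkpoints/wav2lip_gan.pth in the working directory.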