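"""Gradio app: translate the speech in a video and lip-sync the result.

Pipeline: extract audio -> transcribe (AssemblyAI) -> translate (Microsoft
Translator) -> re-synthesise in the target language with voice cloning
(Coqui XTTS v2) -> lip-sync with Wav2Lip.

Requires the ASSEMBLYAI_API_KEY and MICROSOFT_TRANSLATOR_API_KEY environment
variables, a local ./xtts_v2 model directory, and the Wav2Lip inference.py
script in the working directory.
"""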
import os
import requests
import gradio as gr
import moviepy.editor as mp
from TTS.api import TTS
import torch
import assemblyai as aai

# Download necessary models if not already present
model_files = {
    "wav2lip.pth": "https://github.com/justinjohn0306/Wav2Lip/releases/download/models/wav2lip.pth",
    "wav2lip_gan.pth": "https://github.com/justinjohn0306/Wav2Lip/releases/download/models/wav2lip_gan.pth",
    "resnet50.pth": "https://github.com/justinjohn0306/Wav2Lip/releases/download/models/resnet50.pth",
    "mobilenet.pth": "https://github.com/justinjohn0306/Wav2Lip/releases/download/models/mobilenet.pth",
    "s3fd.pth": "https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth"
}


tts_model_path = "./xtts_v2"

# Initialize the XTTS v2 model from the local checkpoint; use the GPU only when one is available
tts = TTS(tts_model_path, gpu=torch.cuda.is_available())

for filename, url in model_files.items():
    # Wav2Lip checkpoints go in checkpoints/; the face-detection weights
    # (resnet50, mobilenet, s3fd) go in face_detection/
    target_dir = "checkpoints" if filename.startswith("wav2lip") else "face_detection"
    os.makedirs(target_dir, exist_ok=True)
    file_path = os.path.join(target_dir, filename)
    if not os.path.exists(file_path):
        print(f"Downloading {filename}...")
        r = requests.get(url)
        r.raise_for_status()
        with open(file_path, "wb") as f:
            f.write(r.content)



# Translation pipeline: extract audio, transcribe, translate, re-synthesise, lip-sync
class Translation:
    def __init__(self, video_path, original_language, target_language):
        self.video_path = video_path
        self.original_language = original_language
        self.target_language = target_language

    # Map UI language names to ISO codes (shared by source and target lookups)
    LANGUAGE_CODES = {'English': 'en', 'German': 'de', 'Italian': 'it', 'Spanish': 'es'}

    def org_language_parameters(self, original_language):
        self.lan_code = self.LANGUAGE_CODES.get(original_language, '')

    def target_language_parameters(self, target_language):
        self.tran_code = self.LANGUAGE_CODES.get(target_language, '')

    def extract_audio(self):
        # Pull the audio track from the video; it also serves as the
        # speaker-reference clip for XTTS voice cloning later on
        video = mp.VideoFileClip(self.video_path)
        audio_path = "output_audio.wav"
        video.audio.write_audiofile(audio_path)
        video.close()
        return audio_path

    def transcribe_audio(self, audio_path):
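        # Transcribe with AssemblyAI in the source language; reads ASSEMBLYAI_API_KEY from the environment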
        aai.settings.api_key = os.getenv("ASSEMBLYAI_API_KEY")
        config = aai.TranscriptionConfig(language_code=self.lan_code)
        transcriber = aai.Transcriber(config=config)
        transcript = transcriber.transcribe(audio_path)
        return transcript.text

    def translate_text(self, transcript_text):
        # Microsoft Translator REST call; needs MICROSOFT_TRANSLATOR_API_KEY
        # and assumes the resource is deployed in the southeastasia region
        base_url = "https://api.cognitive.microsofttranslator.com/translate"
        headers = {
            "Ocp-Apim-Subscription-Key": os.getenv("MICROSOFT_TRANSLATOR_API_KEY"),
            "Content-Type": "application/json",
            "Ocp-Apim-Subscription-Region": "southeastasia"
        }
        params = {"api-version": "3.0", "from": self.lan_code, "to": self.tran_code}
        body = [{"text": transcript_text}]
        response = requests.post(base_url, headers=headers, params=params, json=body)
        response.raise_for_status()
        return response.json()[0]["translations"][0]["text"]

    def generate_audio(self, translated_text):
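        # XTTS re-voices the translated text, cloning the original speaker from the extracted audio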
        tts.tts_to_file(text=translated_text, speaker_wav='output_audio.wav', file_path="output_synth.wav", language=self.tran_code)
        return "output_synth.wav"

    def translate_video(self):
        self.org_language_parameters(self.original_language)
        self.target_language_parameters(self.target_language)
        audio_path = self.extract_audio()
        transcript_text = self.transcribe_audio(audio_path)
        translated_text = self.translate_text(transcript_text)
        translated_audio_path = self.generate_audio(translated_text)

        # Lip-sync the translated audio onto the original face with Wav2Lip
        # (assumes the Wav2Lip inference.py is in the working directory);
        # paths are quoted since Gradio uploads land in temp directories
        os.system(f"python inference.py --checkpoint_path 'checkpoints/wav2lip_gan.pth' --face '{self.video_path}' --audio '{translated_audio_path}' --outfile 'output_video.mp4'")
        return 'output_video.mp4'


# Gradio Interface
def app(video_path, original_language, target_language):
    # gr.Video passes the uploaded file's local path as a string
    translator = Translation(video_path, original_language, target_language)
    return translator.translate_video()

interface = gr.Interface(
    fn=app,
    inputs=[
        gr.Video(label="Video Path"),
        gr.Dropdown(["English", "German", "Italian", "Spanish"], label="Original Language"),
        gr.Dropdown(["English", "German", "Italian", "Spanish"], label="Targeted Language"),
    ],
    outputs=gr.Video(label="Translated Video")
)

if __name__ == "__main__":
    interface.launch()