import os
import requests
import gradio as gr
import moviepy.editor as mp
from TTS.api import TTS
import torch
import assemblyai as aai

# Accept the Coqui XTTS license up front so model loading does not block on a prompt.
os.environ["COQUI_TOS_AGREED"] = "1"
# Pretrained Wav2Lip checkpoints and face-detection weights, downloaded below
# on first run if not already present.
model_files = {
    "wav2lip.pth": "https://github.com/justinjohn0306/Wav2Lip/releases/download/models/wav2lip.pth",
    "wav2lip_gan.pth": "https://github.com/justinjohn0306/Wav2Lip/releases/download/models/wav2lip_gan.pth",
    "resnet50.pth": "https://github.com/justinjohn0306/Wav2Lip/releases/download/models/resnet50.pth",
    "mobilenet.pth": "https://github.com/justinjohn0306/Wav2Lip/releases/download/models/mobilenet.pth",
    "s3fd.pth": "https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth"
}



# Use the GPU when available; fall back to CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"

# XTTS v2: multilingual TTS that can clone a voice from a short reference clip.
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)



for filename, url in model_files.items():
    # The s3fd face detector belongs under face_detection/; the Wav2Lip
    # checkpoints under checkpoints/.
    target_dir = "face_detection" if "s3fd" in filename else "checkpoints"
    os.makedirs(target_dir, exist_ok=True)
    file_path = os.path.join(target_dir, filename)
    if not os.path.exists(file_path):
        print(f"Downloading {filename}...")
        r = requests.get(url)
        r.raise_for_status()
        with open(file_path, 'wb') as f:
            f.write(r.content)



# Translation pipeline: extract audio -> transcribe -> translate -> clone voice -> lip-sync.
class Translation:
    # ISO 639-1 codes shared by AssemblyAI, Microsoft Translator, and XTTS.
    LANGUAGE_CODES = {'English': 'en', 'German': 'de', 'Italian': 'it', 'Spanish': 'es'}

    def __init__(self, video_path, original_language, target_language):
        self.video_path = video_path
        self.original_language = original_language
        self.target_language = target_language

    def org_language_parameters(self, original_language):
        self.lan_code = self.LANGUAGE_CODES.get(original_language, '')

    def target_language_parameters(self, target_language):
        self.tran_code = self.LANGUAGE_CODES.get(target_language, '')

    def extract_audio(self):
        # Pull the audio track from the source video; it also serves as the
        # speaker reference for voice cloning in generate_audio().
        video = mp.VideoFileClip(self.video_path)
        audio_path = "output_audio.wav"
        video.audio.write_audiofile(audio_path)
        video.close()
        return audio_path

    def transcribe_audio(self, audio_path):
        # Speech-to-text with AssemblyAI, pinned to the source language.
        aai.settings.api_key = os.getenv("ASSEMBLYAI_API_KEY")
        config = aai.TranscriptionConfig(language_code=self.lan_code)
        transcriber = aai.Transcriber(config=config)
        transcript = transcriber.transcribe(audio_path)
        return transcript.text

    def translate_text(self, transcript_text):
        # Microsoft Translator Text API v3.0; the subscription key and region
        # must match the Azure resource (the region is hard-coded here).
        base_url = "https://api.cognitive.microsofttranslator.com/translate"
        headers = {
            "Ocp-Apim-Subscription-Key": os.getenv("MICROSOFT_TRANSLATOR_API_KEY"),
            "Content-Type": "application/json",
            "Ocp-Apim-Subscription-Region": "southeastasia"
        }
        params = {"api-version": "3.0", "from": self.lan_code, "to": self.tran_code}
        body = [{"text": transcript_text}]
        response = requests.post(base_url, headers=headers, params=params, json=body)
        response.raise_for_status()
        return response.json()[0]["translations"][0]["text"]

    def generate_audio(self, translated_text):
        # Synthesize the translated text in the original speaker's voice, using
        # the extracted audio as the XTTS speaker reference.
        tts.tts_to_file(text=translated_text, speaker_wav='output_audio.wav',
                        file_path="output_synth.wav", language=self.tran_code)
        return "output_synth.wav"

    def translate_video(self):
        audio_path = self.extract_audio()
        self.org_language_parameters(self.original_language)
        self.target_language_parameters(self.target_language)
        transcript_text = self.transcribe_audio(audio_path)
        translated_text = self.translate_text(transcript_text)
        translated_audio_path = self.generate_audio(translated_text)

        # Run Wav2Lip inference to sync the speaker's lips to the synthesized
        # audio; paths are quoted in case they contain spaces.
        os.system(f"python inference.py --checkpoint_path 'checkpoints/wav2lip_gan.pth' --face '{self.video_path}' --audio '{translated_audio_path}' --outfile 'output_video.mp4'")
        return 'output_video.mp4'
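
# Example usage of the pipeline outside Gradio (assumes the API keys above are
# set and an input video exists on disk; the file name is illustrative):
#   t = Translation("input.mp4", "English", "German")
#   t.translate_video()  # -> "output_video.mp4"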


# Gradio Interface
def app(video_path, original_language, target_language):
    translator = Translation(video_path, original_language, target_language)
    video_file = translator.translate_video()
    return video_file

interface = gr.Interface(
    fn=app,
    inputs=[
        gr.Video(label="Video Path"),
        gr.Dropdown(["English", "German", "Italian", "Spanish"], label="Original Language"),
        gr.Dropdown(["English", "German", "Italian", "Spanish"], label="Targeted Language"),
    ],
    outputs=gr.Video(label="Translated Video")
)

interface.launch()
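
# When running in a hosted notebook, Gradio can also serve a temporary public
# URL via interface.launch(share=True).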