import os
import requests
import gradio as gr
import moviepy.editor as mp
from TTS.api import TTS
import torch
import assemblyai as aai

# Download the required model weights if they are not already present
model_files = {
    "wav2lip.pth": "https://github.com/justinjohn0306/Wav2Lip/releases/download/models/wav2lip.pth",
    "wav2lip_gan.pth": "https://github.com/justinjohn0306/Wav2Lip/releases/download/models/wav2lip_gan.pth",
    "resnet50.pth": "https://github.com/justinjohn0306/Wav2Lip/releases/download/models/resnet50.pth",
    "mobilenet.pth": "https://github.com/justinjohn0306/Wav2Lip/releases/download/models/mobilenet.pth",
    "s3fd.pth": "https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth"
}

for filename, url in model_files.items():
    # The s3fd face detector belongs under face_detection/; the Wav2Lip
    # checkpoints under checkpoints/. (The original check, `"pth" in filename`,
    # matched every file and sent them all to checkpoints/.)
    target_dir = "face_detection" if filename == "s3fd.pth" else "checkpoints"
    os.makedirs(target_dir, exist_ok=True)
    file_path = os.path.join(target_dir, filename)
    if not os.path.exists(file_path):
        print(f"Downloading {filename}...")
        r = requests.get(url)
        with open(file_path, 'wb') as f:
            f.write(r.content)

# Initialize the XTTS v2 voice-cloning model (no progress bar)
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True, progress_bar=False)

# Translation pipeline: extract audio -> transcribe -> translate -> clone voice -> lip-sync
class translation:
    def __init__(self, video_path, original_language, target_language):
        self.video_path = video_path
        self.original_language = original_language
        self.target_language = target_language

    def org_language_parameters(self, original_language):
        language_codes = {'English': 'en', 'German': 'de', 'Italian': 'it', 'Spanish': 'es'}
        self.lan_code = language_codes.get(original_language, '')

    def target_language_parameters(self, target_language):
        language_codes = {'English': 'en', 'German': 'de', 'Italian': 'it', 'Spanish': 'es'}
        self.tran_code = language_codes.get(target_language, '')

    def extract_audio(self):
        # Strip the audio track from the input video
        video = mp.VideoFileClip(self.video_path)
        audio_path = "output_audio.wav"
        video.audio.write_audiofile(audio_path)
        return audio_path

    def transcribe_audio(self, audio_path):
        # Speech-to-text via AssemblyAI, in the source language
        aai.settings.api_key = os.getenv("ASSEMBLYAI_API_KEY")
        config = aai.TranscriptionConfig(language_code=self.lan_code)
        transcriber = aai.Transcriber(config=config)
        transcript = transcriber.transcribe(audio_path)
        return transcript.text

    def translate_text(self, transcript_text):
        # Text translation via the Microsoft Translator REST API
        base_url = "https://api.cognitive.microsofttranslator.com/translate"
        headers = {
            "Ocp-Apim-Subscription-Key": os.getenv("MICROSOFT_TRANSLATOR_API_KEY"),
            "Content-Type": "application/json",
            "Ocp-Apim-Subscription-Region": "southeastasia"
        }
        params = {"api-version": "3.0", "from": self.lan_code, "to": self.tran_code}
        body = [{"text": transcript_text}]
        response = requests.post(base_url, headers=headers, params=params, json=body)
        return response.json()[0]["translations"][0]["text"]

    def generate_audio(self, translated_text):
        # Synthesize the translated text in the original speaker's voice,
        # using the extracted audio as the XTTS speaker reference
        tts.tts_to_file(text=translated_text, speaker_wav="output_audio.wav",
                        file_path="output_synth.wav", language=self.tran_code)
        return "output_synth.wav"

    def translate_video(self):
        audio_path = self.extract_audio()
        self.org_language_parameters(self.original_language)
        self.target_language_parameters(self.target_language)
        transcript_text = self.transcribe_audio(audio_path)
        translated_text = self.translate_text(transcript_text)
        translated_audio_path = self.generate_audio(translated_text)
        # Run Wav2Lip inference to lip-sync the video to the translated audio
        # (paths are quoted so filenames with spaces survive the shell)
        os.system(
            f"python inference.py --checkpoint_path 'checkpoints/wav2lip_gan.pth' "
            f"--face '{self.video_path}' --audio '{translated_audio_path}' "
            f"--outfile 'output_video.mp4'"
        )
        return "output_video.mp4"

# Gradio interface
def app(video_path, original_language, target_language):
    translator = translation(video_path, original_language, target_language)
    video_file = translator.translate_video()
    return video_file
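# A minimal usage sketch of driving the pipeline without the Gradio UI,
# left commented out so the module still just launches the app.
# "sample.mp4" is a hypothetical local file, and ASSEMBLYAI_API_KEY /
# MICROSOFT_TRANSLATOR_API_KEY are assumed to be set in the environment:
#
#   translator = translation("sample.mp4", "English", "German")
#   print(translator.translate_video())  # writes and returns 'output_video.mp4'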
interface = gr.Interface(
    fn=app,
    inputs=[
        gr.Video(label="Video Path"),
        gr.Dropdown(["English", "German", "Italian", "Spanish"], label="Original Language"),
        gr.Dropdown(["English", "German", "Italian", "Spanish"], label="Target Language"),
    ],
    outputs=gr.Video(label="Translated Video")
)

interface.launch()
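# Note: launch() blocks until the server is stopped. When a public URL is
# needed (e.g. when running in a notebook), Gradio also supports
# interface.launch(share=True).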