import os
import subprocess

import assemblyai as aai
import gradio as gr
import moviepy.editor as mp
import requests
import torch
import torchaudio

# Import the XTTS model components
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# Paths for the model and configuration files
MODEL_DIR = "/xtts_v2"
CONFIG_PATH = os.path.join(MODEL_DIR, "config.json")

# Initialize and load the XTTS model
config = XttsConfig()
config.load_json(CONFIG_PATH)
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir=MODEL_DIR, eval=True)
if torch.cuda.is_available():
    model.cuda()  # Move the model to the GPU if one is available


def synthesize_text(text, speaker_wav, language):
    try:
        outputs = model.synthesize(
            text,
            config,
            speaker_wav=speaker_wav,
            gpt_cond_len=3,
            language=language,
        )
        return outputs
    except Exception as e:
        print(f"Error during synthesis: {e}")
        raise


# Mapping from UI language names to ISO 639-1 codes
LANGUAGE_CODES = {"English": "en", "German": "de", "Italian": "it", "Spanish": "es"}


# Translation pipeline: extract audio, transcribe, translate, re-synthesize, lip-sync
class Translation:
    def __init__(self, video_path, original_language, target_language):
        self.video_path = video_path
        self.original_language = original_language
        self.target_language = target_language

    def org_language_parameters(self, original_language):
        self.lan_code = LANGUAGE_CODES.get(original_language, "")

    def target_language_parameters(self, target_language):
        self.tran_code = LANGUAGE_CODES.get(target_language, "")

    def extract_audio(self):
        video = mp.VideoFileClip(self.video_path)
        audio_path = "output_audio.wav"
        video.audio.write_audiofile(audio_path)
        return audio_path

    def transcribe_audio(self, audio_path):
        aai.settings.api_key = os.getenv("ASSEMBLYAI_API_KEY")
        transcription_config = aai.TranscriptionConfig(language_code=self.lan_code)
        transcriber = aai.Transcriber(config=transcription_config)
        transcript = transcriber.transcribe(audio_path)
        return transcript.text

    def translate_text(self, transcript_text):
        base_url = "https://api.cognitive.microsofttranslator.com/translate"
        headers = {
            "Ocp-Apim-Subscription-Key": os.getenv("MICROSOFT_TRANSLATOR_API_KEY"),
            "Content-Type": "application/json",
            "Ocp-Apim-Subscription-Region": "southeastasia",
        }
        params = {"api-version": "3.0", "from": self.lan_code, "to": self.tran_code}
        body = [{"text": transcript_text}]
        response = requests.post(base_url, headers=headers, params=params, json=body)
        response.raise_for_status()  # Fail loudly on an API error instead of parsing garbage
        return response.json()[0]["translations"][0]["text"]

    def generate_audio(self, translated_text):
        try:
            # Clone the original speaker's voice from the extracted audio
            outputs = synthesize_text(
                translated_text,
                speaker_wav="output_audio.wav",
                language=self.tran_code,
            )
            # model.synthesize returns a dict; the waveform is under "wav".
            # XTTS v2 generates audio at a 24 kHz sample rate.
            torchaudio.save(
                "output_synth.wav", torch.tensor(outputs["wav"]).unsqueeze(0), 24000
            )
            return "output_synth.wav"
        except Exception as e:
            print(f"Error generating audio: {e}")
            raise

    def translate_video(self):
        audio_path = self.extract_audio()
        self.org_language_parameters(self.original_language)
        self.target_language_parameters(self.target_language)
        transcript_text = self.transcribe_audio(audio_path)
        translated_text = self.translate_text(transcript_text)
        translated_audio_path = self.generate_audio(translated_text)
        # Run Wav2Lip inference; a list of arguments avoids shell-quoting
        # problems with paths that contain spaces
        subprocess.run(
            [
                "python", "inference.py",
                "--checkpoint_path", "checkpoints/wav2lip_gan.pth",
                "--face", self.video_path,
                "--audio", translated_audio_path,
                "--outfile", "output_video.mp4",
            ],
            check=True,
        )
        return "output_video.mp4"


# Gradio interface
def app(video_path, original_language, target_language):
    translator = Translation(video_path, original_language, target_language)
    video_file = translator.translate_video()
    return video_file

interface = gr.Interface(
    fn=app,
    inputs=[
        gr.Video(label="Video Path"),
        gr.Dropdown(["English", "German", "Italian", "Spanish"], label="Original Language"),
        gr.Dropdown(["English", "German", "Italian", "Spanish"], label="Target Language"),
    ],
    outputs=gr.Video(label="Translated Video"),
)

interface.launch(share=True)  # Optional: share=True creates a public link
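# Assumed prerequisites, inferred from the paths used above (not verified by this
# script):
#   - the XTTS v2 checkpoint and config.json under /xtts_v2
#   - Wav2Lip's inference.py and checkpoints/wav2lip_gan.pth in the working directory
#   - ASSEMBLYAI_API_KEY and MICROSOFT_TRANSLATOR_API_KEY set in the environment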