import os
import requests
import gradio as gr
import moviepy.editor as mp
from TTS.api import TTS
import torch
import assemblyai as aai
os.environ["COQUI_TOS_AGREED"] = "1"

# Download the required model weights if not already present.
model_files = {
    "wav2lip.pth": "https://github.com/justinjohn0306/Wav2Lip/releases/download/models/wav2lip.pth",
    "wav2lip_gan.pth": "https://github.com/justinjohn0306/Wav2Lip/releases/download/models/wav2lip_gan.pth",
    "resnet50.pth": "https://github.com/justinjohn0306/Wav2Lip/releases/download/models/resnet50.pth",
    "mobilenet.pth": "https://github.com/justinjohn0306/Wav2Lip/releases/download/models/mobilenet.pth",
    "s3fd.pth": "https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth",
}
device = "cpu"
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
for filename, url in model_files.items():
    # Wav2Lip checkpoints go to checkpoints/; face-detector weights go to
    # face_detection/. (The original test `"pth" in filename` was always true,
    # so every file landed in checkpoints/.)
    target_dir = "checkpoints" if "wav2lip" in filename else "face_detection"
    os.makedirs(target_dir, exist_ok=True)
    file_path = os.path.join(target_dir, filename)
    if not os.path.exists(file_path):
        print(f"Downloading {filename}...")
        r = requests.get(url)
        r.raise_for_status()
        with open(file_path, "wb") as f:
            f.write(r.content)
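
# For weight files this large, a streamed download keeps memory flat. A minimal
# sketch, not wired into the loop above (the 1 MiB chunk size is an arbitrary
# choice):
def download_streamed(url, file_path, chunk_size=1 << 20):
    with requests.get(url, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(file_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)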


# End-to-end pipeline: extract audio, transcribe, translate, synthesize speech
# in the target language, then lip-sync the video to the new audio.
class Translation:
    LANGUAGE_CODES = {"English": "en", "German": "de", "Italian": "it", "Spanish": "es"}

    def __init__(self, video_path, original_language, target_language):
        self.video_path = video_path
        self.original_language = original_language
        self.target_language = target_language

    def org_language_parameters(self, original_language):
        self.lan_code = self.LANGUAGE_CODES.get(original_language, "")

    def target_language_parameters(self, target_language):
        self.tran_code = self.LANGUAGE_CODES.get(target_language, "")

    def extract_audio(self):
        # Strip the audio track from the input video; it is used both for
        # transcription and as the voice-cloning reference.
        video = mp.VideoFileClip(self.video_path)
        audio = video.audio
        audio_path = "output_audio.wav"
        audio.write_audiofile(audio_path)
        return audio_path

    def transcribe_audio(self, audio_path):
        # AssemblyAI transcription; expects ASSEMBLYAI_API_KEY in the environment.
        aai.settings.api_key = os.getenv("ASSEMBLYAI_API_KEY")
        config = aai.TranscriptionConfig(language_code=self.lan_code)
        transcriber = aai.Transcriber(config=config)
        transcript = transcriber.transcribe(audio_path)
        return transcript.text

    def translate_text(self, transcript_text):
        # Microsoft Translator v3.0; expects MICROSOFT_TRANSLATOR_API_KEY in the
        # environment, with the resource deployed in the southeastasia region.
        base_url = "https://api.cognitive.microsofttranslator.com/translate"
        headers = {
            "Ocp-Apim-Subscription-Key": os.getenv("MICROSOFT_TRANSLATOR_API_KEY"),
            "Content-Type": "application/json",
            "Ocp-Apim-Subscription-Region": "southeastasia",
        }
        params = {"api-version": "3.0", "from": self.lan_code, "to": self.tran_code}
        body = [{"text": transcript_text}]
        response = requests.post(base_url, headers=headers, params=params, json=body)
        response.raise_for_status()
        return response.json()[0]["translations"][0]["text"]
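
    # Response shape for the v3.0 /translate endpoint (a sketch of the documented
    # format; extra fields such as "detectedLanguage" are omitted):
    #   [{"translations": [{"text": "Hallo Welt", "to": "de"}]}]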

    def generate_audio(self, translated_text):
        # XTTS v2 clones the original speaker's voice from the extracted audio
        # while speaking the translated text.
        tts.tts_to_file(
            text=translated_text,
            speaker_wav="output_audio.wav",
            file_path="output_synth.wav",
            language=self.tran_code,
        )
        return "output_synth.wav"

    def translate_video(self):
        audio_path = self.extract_audio()
        self.org_language_parameters(self.original_language)
        self.target_language_parameters(self.target_language)
        transcript_text = self.transcribe_audio(audio_path)
        translated_text = self.translate_text(transcript_text)
        translated_audio_path = self.generate_audio(translated_text)
        # Run Wav2Lip inference to sync the lips to the synthesized audio.
        # (Paths are quoted so inputs containing spaces survive the shell.)
        os.system(
            f"python inference.py --checkpoint_path 'checkpoints/wav2lip_gan.pth' "
            f"--face '{self.video_path}' --audio '{translated_audio_path}' "
            f"--outfile 'output_video.mp4'"
        )
        return "output_video.mp4"
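
    # A shell-free alternative to the os.system call above, as a sketch
    # (subprocess is in the standard library; check=True raises if
    # inference.py exits non-zero):
    #
    #   import subprocess
    #   subprocess.run(
    #       ["python", "inference.py",
    #        "--checkpoint_path", "checkpoints/wav2lip_gan.pth",
    #        "--face", self.video_path,
    #        "--audio", translated_audio_path,
    #        "--outfile", "output_video.mp4"],
    #       check=True,
    #   )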


# Gradio Interface
def app(video_path, original_language, target_language):
    translator = Translation(video_path, original_language, target_language)
    return translator.translate_video()

interface = gr.Interface(
    fn=app,
    inputs=[
        gr.Video(label="Input Video"),
        gr.Dropdown(["English", "German", "Italian", "Spanish"], label="Original Language"),
        gr.Dropdown(["English", "German", "Italian", "Spanish"], label="Target Language"),
    ],
    outputs=gr.Video(label="Translated Video"),
)

interface.launch()
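
# On Hugging Face Spaces the plain launch() above is enough; when running
# locally, interface.launch(share=True) also produces a temporary public URL.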