include asyncio for tts
Files changed:
- app.py (+13, -22)
- pipeline.py (+48, -74)
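The core of this commit is running the per-sentence TTS requests concurrently with asyncio instead of sequentially. Below is a minimal standalone sketch of the pattern the new pipeline.py code uses (a semaphore to cap in-flight requests, asyncio.as_completed plus tqdm for progress); the function names, endpoint, and payload here are placeholders, not the real Khaya TTS call.

import asyncio

import aiohttp
from tqdm import tqdm


async def fetch_one(session, url, payload, semaphore):
    # the semaphore caps how many requests are in flight at once
    async with semaphore:
        async with session.post(url, json=payload) as response:
            response.raise_for_status()
            return await response.read()


async def fetch_all(url, payloads, max_concurrency=3):
    semaphore = asyncio.Semaphore(max_concurrency)
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_one(session, url, p, semaphore) for p in payloads]
        results = []
        # as_completed yields tasks as they finish (completion order, not
        # submission order), so tqdm can show live progress
        for task in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
            results.append(await task)
        return results


# example usage (placeholder URL and payloads, not the real TTS endpoint):
# audio_chunks = asyncio.run(fetch_all("https://example.com/tts", [{"text": "hi"}]))

The semaphore limit of three matches the asyncio.Semaphore(3) used in tts_main in the pipeline.py diff further down.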
app.py
CHANGED
@@ -1,12 +1,11 @@
 import gradio as gr
-from tqdm import tqdm
+from tqdm.asyncio import tqdm_asyncio
 
 from pipeline import (
     extract_audio_from_video,
     transcribe_and_preprocess_audio,
     translation_main,
     tts_main,
-    combine_audio_streams,
     create_combined_output,
 )
 from pipeline import translation_hdr, translation_url, LANG
@@ -18,44 +17,36 @@ async def process_video_translation(
     if input_video is None:
         gr.Info("Please upload a video file", duration=2)
         return
+
     total_stages = 6
+
     output_video = f"{input_video.split('.')[0]}_translated.mp4"
-    with
+    with tqdm_asyncio(total=total_stages, desc="Processing video translation") as pbar:
+
+        # stage 1: extract audio from video
         progress(0.1, desc="Extracting audio from video")
-        pbar.update(1)
         output_audio_path = extract_audio_from_video(input_video)
+        pbar.update(1)
 
         # transcribe audio
-
+        progress(0.2, desc="Transcribing audio")
         sentences = transcribe_and_preprocess_audio(output_audio_path)
         pbar.update(1)
 
         # translate to twi
-
+        progress(0.4, desc="Translating to Twi")
         khaya_translations = await translation_main(
             sentences, translation_url, translation_hdr, LANG
         )
-        # create output files
-        print("Creating output files")
-        list_of_output_chunks = [
-            f"translated_{i}.wav" for i in range(len(khaya_translations))
-        ]
         pbar.update(1)
 
         # convert to speech
-
-        await tts_main(khaya_translations, speaker
-
-
-        # combine audio streams
-        print("Combining audio streams")
-        pbar.set_description("Combining audio streams")
-        output_audio = combine_audio_streams(
-            list_of_output_chunks, "combined_audio.wav"
-        )
+        progress(0.7, desc="Converting to speech")
+        output_audio = await tts_main(khaya_translations, speaker)
+        # print(tts_output_files)
         pbar.update(1)
 
-
+        progress(1.0, desc="Combining audio and video")
        create_combined_output(input_video, output_audio, output_video)
         pbar.update(1)
 
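The progress(...) calls in this handler rely on Gradio injecting a Progress object. The wiring below is a hypothetical sketch only (the handler's exact signature, the input components, and the Interface layout are not part of this diff); what it illustrates is that Gradio accepts async event handlers and a gr.Progress() default argument.

import gradio as gr


async def process_video_translation(input_video, speaker, progress=gr.Progress()):
    # placeholder body; the real handler runs the staged pipeline shown in app.py above
    progress(0.0, desc="Starting")
    ...
    return input_video  # the real handler returns the translated video path


demo = gr.Interface(
    fn=process_video_translation,
    inputs=[gr.Video(), gr.Radio(["male", "female"], value="male", label="Speaker")],
    outputs=gr.Video(),
)

if __name__ == "__main__":
    demo.launch()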
pipeline.py
CHANGED
@@ -11,6 +11,8 @@ from dotenv import load_dotenv
 import requests
 import ffmpeg
 import torch
+import aiofiles
+import tempfile
 
 
 # load khaya token from environment
@@ -20,6 +22,7 @@ load_dotenv()
 KHAYA_TOKEN = os.getenv("KHAYA_TOKEN")
 
 translation_url = "https://translation-api.ghananlp.org/v1/translate"
+tts_url = "https://tts-backend-nlpghana-staging.azurewebsites.net/v0/tts"
 
 translation_hdr = {
     # Request headers
@@ -28,6 +31,13 @@ translation_hdr = {
     "Ocp-Apim-Subscription-Key": KHAYA_TOKEN,
 }
 
+tts_header = {
+    # Request headers
+    "Content-Type": "application/json",
+    "Cache-Control": "no-cache",
+    "Ocp-Apim-Subscription-Key": f"{KHAYA_TOKEN}",
+}
+
 LANG = "tw"
 
 # Check if GPU is available
@@ -79,51 +89,53 @@ async def translation_main(sentences, url, headers, lang):
     return khaya_translations
 
 
-async def convert_text_to_speech(
+async def convert_text_to_speech(
+    session, tts_url, tts_header, text, speaker, semaphore, output_dir
+):
     speaker_dict = {"male": "twi_speaker_5", "female": "twi_speaker_7"}
     speaker_id = speaker_dict[speaker]
+    data = {"text": text, "language": LANG, "speaker_id": speaker_id}
+
     try:
-
-
-
-
-
-
-
-
-
-
-
-            while True:
-                chunk = await response.content.read(1024)
-                if not chunk:
-                    break
-                file.write(chunk)
+        async with semaphore:
+            async with session.post(tts_url, headers=tts_header, json=data) as response:
+                response.raise_for_status()
+                output_path = os.path.join(output_dir, f"{text[:4]}_tts.wav")
+                async with aiofiles.open(output_path, "wb") as file:
+                    while True:
+                        chunk = await response.content.read(16384)
+                        if not chunk:
+                            break
+                        await file.write(chunk)
+                return output_path
     except aiohttp.ClientError as e:
         print(f"Request error: {e}")
     except Exception as e:
         print(f"Unexpected error: {e}")
 
 
-async def tts_main(khaya_translations, speaker
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+async def tts_main(khaya_translations, speaker):
+    with tempfile.TemporaryDirectory() as temp_dir:
+        async with aiohttp.ClientSession() as session:
+            semaphore = asyncio.Semaphore(3)
+            tasks = [
+                convert_text_to_speech(
+                    session, tts_url, tts_header, sent, speaker, semaphore, temp_dir
+                )
+                for sent in khaya_translations
+            ]
+            output_files = []
+            for task in tqdm(
+                asyncio.as_completed(tasks),
+                total=len(tasks),
+                desc="Converting to Speech",
+            ):
+                result = await task
+                if result:
+                    output_files.append(result)
+
+            output_audio = combine_audio_streams(output_files, "combined_audio.wav")
+            return output_audio
 
 
 def extract_audio_from_video(input_video):
@@ -190,41 +202,3 @@ def create_combined_output(input_video, output_audio, output_video):
     except ffmpeg.Error as e:
         print(e.stderr.decode())
         raise e
-
-
-async def process_video_translation(input_video, output_video):
-    print("Processing video translation")
-
-    print("Extracting audio from video")
-    output_audio_path = extract_audio_from_video(input_video)
-
-    # transcribe audio
-    print("Transcribing audio")
-    sentences = transcribe_and_preprocess_audio(output_audio_path)
-
-    # translate to twi
-    print("Translating to Twi")
-    khaya_translations = await translation_main(
-        sentences, translation_url, translation_hdr, LANG
-    )
-
-    # create output files
-    print("Creating output files")
-    list_of_output_chunks = [
-        f"translated_{i}.wav" for i in range(len(khaya_translations))
-    ]
-
-    # convert to speech
-    print("Converting to speech")
-    await tts_main(khaya_translations, list_of_output_chunks)
-
-    # combine audio streams
-    print("Combining audio streams")
-    output_audio = combine_audio_streams(list_of_output_chunks, "combined_audio.wav")
-
-    print("Combining audio and video")
-    create_combined_output(input_video, output_audio, output_video)
-
-    print("Video translation completed")
-
-    return output_video
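Outside the Gradio app, the new tts_main coroutine can be exercised directly with asyncio.run. The sentences below are made-up samples (real input comes from translation_main), and a valid KHAYA_TOKEN must be set in the environment for the TTS requests to succeed.

import asyncio

from pipeline import tts_main

# made-up sample sentences; in the app these come from translation_main
sample_translations = ["Maakye", "Wo ho te sɛn?"]

# "male" / "female" select twi_speaker_5 / twi_speaker_7 inside convert_text_to_speech
combined = asyncio.run(tts_main(sample_translations, "male"))
print(combined)  # result of combine_audio_streams(..., "combined_audio.wav")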