include asyncio for tts
Files changed:
- app.py (+13, -22)
- pipeline.py (+48, -74)
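The core of this commit is running the per-sentence TTS requests concurrently with asyncio instead of sequentially. Below is a minimal standalone sketch of the pattern the new pipeline.py code uses (a semaphore to cap in-flight requests, asyncio.as_completed plus tqdm for progress); the function names, endpoint, and payload here are placeholders, not the real Khaya TTS call.

import asyncio

import aiohttp
from tqdm import tqdm


async def fetch_one(session, url, payload, semaphore):
    # the semaphore caps how many requests are in flight at once
    async with semaphore:
        async with session.post(url, json=payload) as response:
            response.raise_for_status()
            return await response.read()


async def fetch_all(url, payloads, max_concurrency=3):
    semaphore = asyncio.Semaphore(max_concurrency)
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_one(session, url, p, semaphore) for p in payloads]
        results = []
        # as_completed yields tasks as they finish (completion order, not
        # submission order), so tqdm can show live progress
        for task in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
            results.append(await task)
        return results


# example usage (placeholder URL and payloads, not the real TTS endpoint):
# audio_chunks = asyncio.run(fetch_all("https://example.com/tts", [{"text": "hi"}]))

The semaphore limit of three matches the asyncio.Semaphore(3) used in tts_main in the pipeline.py diff further down.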
app.py
CHANGED
@@ -1,12 +1,11 @@
 import gradio as gr
-from tqdm import tqdm
+from tqdm.asyncio import tqdm_asyncio
 
 from pipeline import (
     extract_audio_from_video,
     transcribe_and_preprocess_audio,
     translation_main,
     tts_main,
-    combine_audio_streams,
     create_combined_output,
 )
 from pipeline import translation_hdr, translation_url, LANG
@@ -18,44 +17,36 @@ async def process_video_translation(
     if input_video is None:
         gr.Info("Please upload a video file", duration=2)
         return
+
     total_stages = 6
+
     output_video = f"{input_video.split('.')[0]}_translated.mp4"
-    with
+    with tqdm_asyncio(total=total_stages, desc="Processing video translation") as pbar:
+
+        # stage 1: extract audio from video
         progress(0.1, desc="Extracting audio from video")
-        pbar.update(1)
         output_audio_path = extract_audio_from_video(input_video)
+        pbar.update(1)
 
         # transcribe audio
-
+        progress(0.2, desc="Transcribing audio")
         sentences = transcribe_and_preprocess_audio(output_audio_path)
         pbar.update(1)
 
         # translate to twi
-
+        progress(0.4, desc="Translating to Twi")
         khaya_translations = await translation_main(
             sentences, translation_url, translation_hdr, LANG
         )
-        # create output files
-        print("Creating output files")
-        list_of_output_chunks = [
-            f"translated_{i}.wav" for i in range(len(khaya_translations))
-        ]
         pbar.update(1)
 
         # convert to speech
-
-        await tts_main(khaya_translations, speaker
-
-
-        # combine audio streams
-        print("Combining audio streams")
-        pbar.set_description("Combining audio streams")
-        output_audio = combine_audio_streams(
-            list_of_output_chunks, "combined_audio.wav"
-        )
+        progress(0.7, desc="Converting to speech")
+        output_audio = await tts_main(khaya_translations, speaker)
+        # print(tts_output_files)
         pbar.update(1)
 
-
+        progress(1.0, desc="Combining audio and video")
        create_combined_output(input_video, output_audio, output_video)
         pbar.update(1)
 
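The progress(...) calls in this handler rely on Gradio injecting a Progress object. The wiring below is a hypothetical sketch only (the handler's exact signature, the input components, and the Interface layout are not part of this diff); what it illustrates is that Gradio accepts async event handlers and a gr.Progress() default argument.

import gradio as gr


async def process_video_translation(input_video, speaker, progress=gr.Progress()):
    # placeholder body; the real handler runs the staged pipeline shown in app.py above
    progress(0.0, desc="Starting")
    ...
    return input_video  # the real handler returns the translated video path


demo = gr.Interface(
    fn=process_video_translation,
    inputs=[gr.Video(), gr.Radio(["male", "female"], value="male", label="Speaker")],
    outputs=gr.Video(),
)

if __name__ == "__main__":
    demo.launch()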
pipeline.py
CHANGED
@@ -11,6 +11,8 @@ from dotenv import load_dotenv
 import requests
 import ffmpeg
 import torch
+import aiofiles
+import tempfile
 
 
 # load khaya token from environment
@@ -20,6 +22,7 @@ load_dotenv()
 KHAYA_TOKEN = os.getenv("KHAYA_TOKEN")
 
 translation_url = "https://translation-api.ghananlp.org/v1/translate"
+tts_url = "https://tts-backend-nlpghana-staging.azurewebsites.net/v0/tts"
 
 translation_hdr = {
     # Request headers
@@ -28,6 +31,13 @@ translation_hdr = {
     "Ocp-Apim-Subscription-Key": KHAYA_TOKEN,
 }
 
+tts_header = {
+    # Request headers
+    "Content-Type": "application/json",
+    "Cache-Control": "no-cache",
+    "Ocp-Apim-Subscription-Key": f"{KHAYA_TOKEN}",
+}
+
 LANG = "tw"
 
 # Check if GPU is available
@@ -79,51 +89,53 @@ async def translation_main(sentences, url, headers, lang):
     return khaya_translations
 
 
-async def convert_text_to_speech(
+async def convert_text_to_speech(
+    session, tts_url, tts_header, text, speaker, semaphore, output_dir
+):
     speaker_dict = {"male": "twi_speaker_5", "female": "twi_speaker_7"}
     speaker_id = speaker_dict[speaker]
+    data = {"text": text, "language": LANG, "speaker_id": speaker_id}
+
     try:
-
-
-
-
-
-
-
-
-
-
-
-            while True:
-                chunk = await response.content.read(1024)
-                if not chunk:
-                    break
-                file.write(chunk)
+        async with semaphore:
+            async with session.post(tts_url, headers=tts_header, json=data) as response:
+                response.raise_for_status()
+                output_path = os.path.join(output_dir, f"{text[:4]}_tts.wav")
+                async with aiofiles.open(output_path, "wb") as file:
+                    while True:
+                        chunk = await response.content.read(16384)
+                        if not chunk:
+                            break
+                        await file.write(chunk)
+                return output_path
     except aiohttp.ClientError as e:
         print(f"Request error: {e}")
     except Exception as e:
         print(f"Unexpected error: {e}")
 
 
-async def tts_main(khaya_translations, speaker
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+async def tts_main(khaya_translations, speaker):
+    with tempfile.TemporaryDirectory() as temp_dir:
+        async with aiohttp.ClientSession() as session:
+            semaphore = asyncio.Semaphore(3)
+            tasks = [
+                convert_text_to_speech(
+                    session, tts_url, tts_header, sent, speaker, semaphore, temp_dir
+                )
+                for sent in khaya_translations
+            ]
+            output_files = []
+            for task in tqdm(
+                asyncio.as_completed(tasks),
+                total=len(tasks),
+                desc="Converting to Speech",
+            ):
+                result = await task
+                if result:
+                    output_files.append(result)
+
+            output_audio = combine_audio_streams(output_files, "combined_audio.wav")
+            return output_audio
 
 
 def extract_audio_from_video(input_video):
@@ -190,41 +202,3 @@ def create_combined_output(input_video, output_audio, output_video):
     except ffmpeg.Error as e:
         print(e.stderr.decode())
         raise e
-
-
-async def process_video_translation(input_video, output_video):
-    print("Processing video translation")
-
-    print("Extracting audio from video")
-    output_audio_path = extract_audio_from_video(input_video)
-
-    # transcribe audio
-    print("Transcribing audio")
-    sentences = transcribe_and_preprocess_audio(output_audio_path)
-
-    # translate to twi
-    print("Translating to Twi")
-    khaya_translations = await translation_main(
-        sentences, translation_url, translation_hdr, LANG
-    )
-
-    # create output files
-    print("Creating output files")
-    list_of_output_chunks = [
-        f"translated_{i}.wav" for i in range(len(khaya_translations))
-    ]
-
-    # convert to speech
-    print("Converting to speech")
-    await tts_main(khaya_translations, list_of_output_chunks)
-
-    # combine audio streams
-    print("Combining audio streams")
-    output_audio = combine_audio_streams(list_of_output_chunks, "combined_audio.wav")
-
-    print("Combining audio and video")
-    create_combined_output(input_video, output_audio, output_video)
-
-    print("Video translation completed")
-
-    return output_video
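Outside the Gradio app, the new tts_main coroutine can be exercised directly with asyncio.run. The sentences below are made-up samples (real input comes from translation_main), and a valid KHAYA_TOKEN must be set in the environment for the TTS requests to succeed.

import asyncio

from pipeline import tts_main

# made-up sample sentences; in the app these come from translation_main
sample_translations = ["Maakye", "Wo ho te sɛn?"]

# "male" / "female" select twi_speaker_5 / twi_speaker_7 inside convert_text_to_speech
combined = asyncio.run(tts_main(sample_translations, "male"))
print(combined)  # result of combine_audio_streams(..., "combined_audio.wav")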