Lagyamfi committed
Commit cf4f031 · Parent: fb7e4f3

include asyncio for tts

Files changed (2):
  1. app.py +13 -22
  2. pipeline.py +48 -74
app.py CHANGED
@@ -1,12 +1,11 @@
 import gradio as gr
-from tqdm import tqdm
+from tqdm.asyncio import tqdm_asyncio
 
 from pipeline import (
     extract_audio_from_video,
     transcribe_and_preprocess_audio,
     translation_main,
     tts_main,
-    combine_audio_streams,
     create_combined_output,
 )
 from pipeline import translation_hdr, translation_url, LANG
@@ -18,44 +17,36 @@ async def process_video_translation(
     if input_video is None:
         gr.Info("Please upload a video file", duration=2)
         return
+
     total_stages = 6
+
     output_video = f"{input_video.split('.')[0]}_translated.mp4"
-    with tqdm(total=total_stages, desc="Processing video translation") as pbar:
+    with tqdm_asyncio(total=total_stages, desc="Processing video translation") as pbar:
+
+        # stage 1: extract audio from video
         progress(0.1, desc="Extracting audio from video")
-        pbar.update(1)
         output_audio_path = extract_audio_from_video(input_video)
+        pbar.update(1)
 
         # transcribe audio
-        pbar.set_description("Transcribing audio")
+        progress(0.2, desc="Transcribing audio")
         sentences = transcribe_and_preprocess_audio(output_audio_path)
         pbar.update(1)
 
         # translate to twi
-        pbar.set_description("Translating to Twi")
+        progress(0.4, desc="Translating to Twi")
         khaya_translations = await translation_main(
             sentences, translation_url, translation_hdr, LANG
         )
-        # create output files
-        print("Creating output files")
-        list_of_output_chunks = [
-            f"translated_{i}.wav" for i in range(len(khaya_translations))
-        ]
         pbar.update(1)
 
         # convert to speech
-        pbar.set_description("Converting to speech")
-        await tts_main(khaya_translations, speaker, list_of_output_chunks)
-        pbar.update(1)
-
-        # combine audio streams
-        print("Combining audio streams")
-        pbar.set_description("Combining audio streams")
-        output_audio = combine_audio_streams(
-            list_of_output_chunks, "combined_audio.wav"
-        )
+        progress(0.7, desc="Converting to speech")
+        output_audio = await tts_main(khaya_translations, speaker)
+        # print(tts_output_files)
         pbar.update(1)
 
-        pbar.set_description("Combining audio and video")
+        progress(1.0, desc="Combining audio and video")
         create_combined_output(input_video, output_audio, output_video)
         pbar.update(1)
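For context on the tqdm swap above: used as a context manager, tqdm_asyncio behaves exactly like plain tqdm (the bar only advances on explicit pbar.update(1) calls); what it adds are awaitable-aware classmethods such as tqdm_asyncio.gather and tqdm_asyncio.as_completed. A minimal, self-contained sketch, with hypothetical stage coroutines standing in for the real pipeline calls:

import asyncio
from tqdm.asyncio import tqdm_asyncio

async def stage(name: str, seconds: float) -> str:
    await asyncio.sleep(seconds)  # stand-in for real pipeline work
    return name

async def main():
    # As a context manager it is plain tqdm: advance manually with update(1)
    with tqdm_asyncio(total=2, desc="Processing video translation") as pbar:
        await stage("extract", 0.1)
        pbar.update(1)
        await stage("transcribe", 0.1)
        pbar.update(1)

    # The classmethods wrap awaitables directly, e.g. tqdm_asyncio.gather
    results = await tqdm_asyncio.gather(
        *(stage(f"tts_{i}", 0.1) for i in range(5)), desc="Converting to Speech"
    )
    print(results)

asyncio.run(main())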
 
pipeline.py CHANGED
@@ -11,6 +11,8 @@ from dotenv import load_dotenv
 import requests
 import ffmpeg
 import torch
+import aiofiles
+import tempfile
 
 
 # load khaya token from environment
@@ -20,6 +22,7 @@ load_dotenv()
 KHAYA_TOKEN = os.getenv("KHAYA_TOKEN")
 
 translation_url = "https://translation-api.ghananlp.org/v1/translate"
+tts_url = "https://tts-backend-nlpghana-staging.azurewebsites.net/v0/tts"
 
 translation_hdr = {
     # Request headers
@@ -28,6 +31,13 @@ translation_hdr = {
     "Ocp-Apim-Subscription-Key": KHAYA_TOKEN,
 }
 
+tts_header = {
+    # Request headers
+    "Content-Type": "application/json",
+    "Cache-Control": "no-cache",
+    "Ocp-Apim-Subscription-Key": f"{KHAYA_TOKEN}",
+}
+
 LANG = "tw"
 
 # Check if GPU is available
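Before these constants are wired into the async client below, a quick synchronous sanity check can confirm the endpoint, headers, and payload shape. This is a sketch, not repo code: the sample text and output filename are invented, and it assumes the staging endpoint returns raw audio bytes on success.

import os
import requests

KHAYA_TOKEN = os.getenv("KHAYA_TOKEN")
tts_url = "https://tts-backend-nlpghana-staging.azurewebsites.net/v0/tts"
tts_header = {
    "Content-Type": "application/json",
    "Cache-Control": "no-cache",
    "Ocp-Apim-Subscription-Key": f"{KHAYA_TOKEN}",
}
# payload keys and speaker ids taken from the diff; the text is made up
data = {"text": "Akwaaba", "language": "tw", "speaker_id": "twi_speaker_7"}

resp = requests.post(tts_url, headers=tts_header, json=data, timeout=30)
resp.raise_for_status()
with open("sanity_check_tts.wav", "wb") as f:
    f.write(resp.content)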
@@ -79,51 +89,53 @@ async def translation_main(sentences, url, headers, lang):
     return khaya_translations
 
 
-async def convert_text_to_speech(session, text, speaker, output_file):
+async def convert_text_to_speech(
+    session, tts_url, tts_header, text, speaker, semaphore, output_dir
+):
     speaker_dict = {"male": "twi_speaker_5", "female": "twi_speaker_7"}
     speaker_id = speaker_dict[speaker]
+    data = {"text": text, "language": LANG, "speaker_id": speaker_id}
+
     try:
-        tts_url = "https://tts-backend-nlpghana-staging.azurewebsites.net/v0/tts"  # Replace with your TTS API URL
-        data = {"text": text, "language": LANG, "speaker_id": speaker_id}
-        hdr = {
-            # Request headers
-            "Content-Type": "application/json",
-            "Cache-Control": "no-cache",
-            "Ocp-Apim-Subscription-Key": f"{KHAYA_TOKEN}",
-        }
-        async with session.post(tts_url, headers=hdr, json=data) as response:
-            response.raise_for_status()
-            with open(output_file, "wb") as file:
-                while True:
-                    chunk = await response.content.read(1024)
-                    if not chunk:
-                        break
-                    file.write(chunk)
+        async with semaphore:
+            async with session.post(tts_url, headers=tts_header, json=data) as response:
+                response.raise_for_status()
+                output_path = os.path.join(output_dir, f"{text[:4]}_tts.wav")
+                async with aiofiles.open(output_path, "wb") as file:
+                    while True:
+                        chunk = await response.content.read(16384)
+                        if not chunk:
+                            break
+                        await file.write(chunk)
+                return output_path
     except aiohttp.ClientError as e:
         print(f"Request error: {e}")
     except Exception as e:
         print(f"Unexpected error: {e}")
 
 
-async def tts_main(khaya_translations, speaker, list_of_output_chunks):
-    async with aiohttp.ClientSession() as session:
-        tasks = []
-        for i, sent in enumerate(khaya_translations):
-            output_file = list_of_output_chunks[i]
-            tasks.append(convert_text_to_speech(session, sent, speaker, output_file))
-
-        for f in tqdm(
-            asyncio.as_completed(tasks), total=len(tasks), desc="Converting to Speech"
-        ):
-            await f
-
-
-output_path = "/Users/lawrenceadu-gyamfi/Documents/PERSONAL/GHANANLP/PROJECTS/SAINT/Examples/test_pipeline"
-input_video = "test_input_video.mov"
-input_audio = "input_audio.aac"
-output_audio = "output_audio.wav"
-output_video = "test_output_video.mp4"
-filename_with_path = f"{output_path}/{input_video}"
+async def tts_main(khaya_translations, speaker):
+    with tempfile.TemporaryDirectory() as temp_dir:
+        async with aiohttp.ClientSession() as session:
+            semaphore = asyncio.Semaphore(3)
+            tasks = [
+                convert_text_to_speech(
+                    session, tts_url, tts_header, sent, speaker, semaphore, temp_dir
+                )
+                for sent in khaya_translations
+            ]
+            output_files = []
+            for task in tqdm(
+                asyncio.as_completed(tasks),
+                total=len(tasks),
+                desc="Converting to Speech",
+            ):
+                result = await task
+                if result:
+                    output_files.append(result)
+
+            output_audio = combine_audio_streams(output_files, "combined_audio.wav")
+            return output_audio
 
 
 def extract_audio_from_video(input_video):
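The heart of the commit is the bounded-concurrency pattern in the hunk above: a shared asyncio.Semaphore(3) caps in-flight TTS requests, responses are streamed to disk with aiofiles, and asyncio.as_completed collects files as each request finishes. Note that as_completed yields in completion order rather than submission order, and that convert_text_to_speech returns None on error, which is why tts_main filters falsy results. A distilled, runnable sketch of the same pattern, with hypothetical URLs and output names:

import asyncio
import aiofiles
import aiohttp

async def fetch(session, sem, url, path):
    async with sem:  # at most N coroutines pass this point concurrently
        async with session.get(url) as resp:
            resp.raise_for_status()
            async with aiofiles.open(path, "wb") as f:
                # stream the body in chunks instead of buffering it all
                async for chunk in resp.content.iter_chunked(16384):
                    await f.write(chunk)
    return path

async def main(urls):
    sem = asyncio.Semaphore(3)  # same cap as the new tts_main
    async with aiohttp.ClientSession() as session:
        tasks = [fetch(session, sem, u, f"out_{i}.bin") for i, u in enumerate(urls)]
        for fut in asyncio.as_completed(tasks):
            print("finished:", await fut)  # completion order, not submission order

asyncio.run(main(["https://example.com/a", "https://example.com/b"]))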
@@ -190,41 +202,3 @@ def create_combined_output(input_video, output_audio, output_video):
     except ffmpeg.Error as e:
         print(e.stderr.decode())
         raise e
-
-
-async def process_video_translation(input_video, output_video):
-    print("Processing video translation")
-
-    print("Extracting audio from video")
-    output_audio_path = extract_audio_from_video(input_video)
-
-    # transcribe audio
-    print("Transcribing audio")
-    sentences = transcribe_and_preprocess_audio(output_audio_path)
-
-    # translate to twi
-    print("Translating to Twi")
-    khaya_translations = await translation_main(
-        sentences, translation_url, translation_hdr, LANG
-    )
-
-    # create output files
-    print("Creating output files")
-    list_of_output_chunks = [
-        f"translated_{i}.wav" for i in range(len(khaya_translations))
-    ]
-
-    # convert to speech
-    print("Converting to speech")
-    await tts_main(khaya_translations, list_of_output_chunks)
-
-    # combine audio streams
-    print("Combining audio streams")
-    output_audio = combine_audio_streams(list_of_output_chunks, "combined_audio.wav")
-
-    print("Combining audio and video")
-    create_combined_output(input_video, output_audio, output_video)
-
-    print("Video translation completed")
-
-    return output_video
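With the refactor in place, the new async entry point can be smoke-tested on its own. A hypothetical example: the sample sentences are made up, KHAYA_TOKEN must be configured, and the return value is whatever combine_audio_streams produces, presumably the path of the combined file.

import asyncio
from pipeline import tts_main

sample_translations = ["Akwaaba", "Me din de Ama"]  # invented sample sentences
combined = asyncio.run(tts_main(sample_translations, speaker="female"))
print("combined audio:", combined)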
 