asmarx committed on
Commit
a26dd44
·
verified ·
1 Parent(s): 280855a

Initial commit

Browse files
Files changed (3) hide show
  1. README.md +30 -7
  2. app.py +444 -0
  3. requirements.txt +12 -0
README.md CHANGED
@@ -1,13 +1,36 @@
1
  ---
2
- title: TranslateYoutubeVideo
3
- emoji: 📉
4
- colorFrom: red
5
- colorTo: red
6
  sdk: gradio
7
- sdk_version: 5.38.1
8
  app_file: app.py
9
  pinned: false
10
- license: gpl-3.0
11
  ---
 
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: YouTube Translator and Speaker
3
+ emoji: 🌍
4
+ colorFrom: yellow
5
+ colorTo: blue
6
  sdk: gradio
7
+ sdk_version: 5.28.0
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
+ # YouTube Translator and Speaker
12
 
13
+ This HuggingFace Space application allows you to get the translated transcript and speech for a given YouTube video.
14
+
15
+ ## How to Use
16
+
17
+ 1. Enter the YouTube Video ID in the provided text box.
18
+ (The video ID is the unique string of characters in the YouTube video URL after `v=`, e.g., `dQw4w9WgXcQ`)
19
+ 2. Select the target language from the dropdown menu.
20
+ 3. The translated text will appear in the 'Translated Text' box, and the translated speech will play automatically.
21
+
22
+ ## Supported Languages
23
+
24
+ - Arabic (ar)
25
+ - French (fr)
26
+ - Hausa (ha)
27
+ - Afghan Persian / Dari (fa)
28
+ - Pashto (ps)
29
+
30
+ ## Notes
31
+
32
+ - Translation for Arabic and French uses Helsinki-NLP models.
33
+ - Translation for Hausa, Afghan Persian, and Pashto uses the Facebook NLLB-200 model.
34
+ - Speech generation for Arabic and French uses gTTS.
35
+ - Speech generation for Hausa, Afghan Persian, and Pashto uses the ElevenLabs API. An ElevenLabs API key is required as a Space secret named `ELEVENLABS_API_KEY` for speech to work in these languages.
36
+ - Proxy settings for YouTube transcript retrieval can be configured using Space secrets named `WEBSHARE_PROXY_UN` and `WEBSHARE_PROXY_PW`.
app.py ADDED
@@ -0,0 +1,444 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ import torch
4
+ import os
5
+ from transformers import MarianMTModel, MarianTokenizer, AutoTokenizer, AutoModelForSeq2SeqLM
6
+ from youtube_transcript_api import YouTubeTranscriptApi
7
+ from youtube_transcript_api.proxies import WebshareProxyConfig
8
+ from gtts import gTTS
9
+
10
+ # ---- FastAPI Proxy Setup ----
11
+ from fastapi import FastAPI, Request
12
+ from fastapi.responses import StreamingResponse
13
+ import httpx
14
+ import uvicorn
15
+
16
+ fastapi_app = FastAPI()
17
+
18
@fastapi_app.get("/proxy")
async def proxy(url: str):
    """Fetch a remote URL and stream its body back to the caller.

    Args:
        url: Absolute http(s) URL to fetch.

    Returns:
        A StreamingResponse relaying the upstream body. A 200 upstream
        response is returned with the upstream Content-Type and a permissive
        CORS header; any other upstream status is relayed with that status
        code. A URL that is not http(s) yields a 400 response.

    Raises:
        Whatever httpx raises while connecting to the upstream server.
    """
    # NOTE(security): this endpoint fetches any URL the caller supplies, so it
    # can be abused as an open proxy / for SSRF. Only the scheme is checked
    # here; restrict the allowed hosts before exposing it publicly.
    if not url.lower().startswith(("http://", "https://")):
        return StreamingResponse(
            iter([b"Only http(s) URLs are supported."]), status_code=400
        )

    # The client must outlive this function: the body is consumed lazily by
    # StreamingResponse after we return, so an `async with` block here would
    # close the connection before any byte is relayed.
    client = httpx.AsyncClient(timeout=30.0)
    try:
        # httpx streams through send(..., stream=True); `client.get` does not
        # accept a `stream` argument.
        upstream = await client.send(client.build_request("GET", url), stream=True)
    except Exception:
        await client.aclose()
        raise

    async def relay():
        # Release the upstream response and the client once the body has been
        # fully relayed (or the consumer stops early).
        try:
            async for chunk in upstream.aiter_bytes():
                yield chunk
        finally:
            await upstream.aclose()
            await client.aclose()

    if upstream.status_code != 200:
        return StreamingResponse(content=relay(), status_code=upstream.status_code)

    headers = {
        "Content-Type": upstream.headers.get("content-type", "application/octet-stream"),
        "Access-Control-Allow-Origin": "*",
    }
    return StreamingResponse(relay(), headers=headers)
29
+
30
+
31
+ # ---- Your Existing Gradio App Below ----
32
+
33
# Build the module-level YouTubeTranscriptApi client.
# Webshare proxy credentials are read from the environment; the proxy is used
# only when both the username and the password are present.
proxy_username = os.getenv('WEBSHARE_PROXY_UN')
proxy_password = os.getenv('WEBSHARE_PROXY_PW')

ytt_api = None
try:
    if not (proxy_username and proxy_password):
        ytt_api = YouTubeTranscriptApi()
        print("Successfully connected to the Youtube API without proxy.")
    else:
        ytt_api = YouTubeTranscriptApi(
            proxy_config=WebshareProxyConfig(
                proxy_username=proxy_username,
                proxy_password=proxy_password,
                filter_ip_locations=["us"],
            )
        )
        print("Successfully connected to the Youtube API with proxy.")
except Exception as e:
    print(f"A proxy error occurred in connecting to the Youtube API: {e}")
    # Fall back to a direct (proxy-less) client when construction failed.
    ytt_api = YouTubeTranscriptApi()
54
+
55
+
56
def getEnglishTranscript(video_id):
    """Return the English transcript of a YouTube video as a single string.

    The first listed transcript whose language code is ``en`` or a regional
    variant of it (``en-US``, ``en-GB``, ...) is fetched and its snippet texts
    are joined with single spaces.

    Args:
        video_id: The YouTube video ID.

    Returns:
        The transcript text, or "" when the API client is not initialised, no
        English transcript exists, or retrieval fails (the error is printed).
    """
    if not ytt_api:
        print("YouTubeTranscriptApi not initialized.")
        return ""

    try:
        english_original = None
        for transcript in ytt_api.list(video_id):
            code = transcript.language_code
            # Accept regional English variants, not only the bare "en" code.
            if code == 'en' or code.startswith('en-'):
                english_original = transcript.fetch()
                break

        if not english_original:
            print(f"No English transcript found for video ID: {video_id}")
            return ""

        return " ".join(snippet.text for snippet in english_original).strip()
    except Exception as e:
        print(f"Error retrieving English transcript for video ID {video_id}: {e}")
        return ""
79
+
80
+
81
def getArabicTranscript(video_id):
    """Return an Arabic transcript for a YouTube video, translating if necessary.

    A transcript that is already in Arabic (code ``ar`` or ``ar-*``) is used
    directly. Otherwise the first translatable transcript that offers Arabic
    as a translation language is translated through the transcript API.

    Args:
        video_id: The YouTube video ID.

    Returns:
        The Arabic text with snippets joined by single spaces, or "" when the
        API client is not initialised, no Arabic transcript or translation is
        available, or retrieval fails (the error is printed).
    """
    if not ytt_api:
        print("YouTubeTranscriptApi not initialized.")
        return ""

    try:
        # Materialise the listing so it can be scanned twice.
        transcripts = list(ytt_api.list(video_id))
        arabic_translation = None

        # 1) Prefer a transcript that is already Arabic.
        for transcript in transcripts:
            code = transcript.language_code
            if code == 'ar' or code.startswith('ar-'):
                arabic_translation = transcript.fetch()
                print("Arabic transcript found; no translation needed.")
                break

        # 2) Otherwise translate the first transcript that supports Arabic.
        if arabic_translation is None:
            for transcript in transcripts:
                if not transcript.is_translatable:
                    continue
                # Match on the language code as well as the display name, so a
                # differently spelled display name does not hide the language.
                arabic_language_code = next(
                    (
                        lang.language_code
                        for lang in transcript.translation_languages
                        if lang.language == 'Arabic' or lang.language_code == 'ar'
                    ),
                    None,
                )
                if arabic_language_code:
                    print(f"\nTranslating to Arabic ({arabic_language_code})...")
                    arabic_translation = transcript.translate(arabic_language_code).fetch()
                    print("Arabic Translation Found and Stored.")
                    break  # Stop at the first Arabic translation

        if not arabic_translation:
            print(f"No translatable transcript found for Arabic for video ID: {video_id}")
            return ""

        return " ".join(snippet.text for snippet in arabic_translation).strip()
    except Exception as e:
        print(f"Error retrieving or translating Arabic transcript for video ID {video_id}: {e}")
        return ""
112
+
113
+
114
def getFrenchTranscript(video_id):
    """Return a French transcript for a YouTube video, translating if necessary.

    A transcript that is already in French (code ``fr`` or ``fr-*``) is used
    directly. Otherwise the first translatable transcript that offers French
    as a translation language is translated through the transcript API.

    Args:
        video_id: The YouTube video ID.

    Returns:
        The French text with snippets joined by single spaces, or "" when the
        API client is not initialised, no French transcript or translation is
        available, or retrieval fails (the error is printed).
    """
    if not ytt_api:
        print("YouTubeTranscriptApi not initialized.")
        return ""

    try:
        # Materialise the listing so it can be scanned twice.
        transcripts = list(ytt_api.list(video_id))
        french_translation = None

        # 1) Prefer a transcript that is already French.
        for transcript in transcripts:
            code = transcript.language_code
            if code == 'fr' or code.startswith('fr-'):
                french_translation = transcript.fetch()
                print("French transcript found; no translation needed.")
                break

        # 2) Otherwise translate the first transcript that supports French.
        if french_translation is None:
            for transcript in transcripts:
                if not transcript.is_translatable:
                    continue
                # Match on the language code as well as the display name, so a
                # differently spelled display name does not hide the language.
                french_language_code = next(
                    (
                        lang.language_code
                        for lang in transcript.translation_languages
                        if lang.language == 'French' or lang.language_code == 'fr'
                    ),
                    None,
                )
                if french_language_code:
                    print(f"\nTranslating to French ({french_language_code})...")
                    french_translation = transcript.translate(french_language_code).fetch()
                    print("French Translation Found and Stored.")
                    break  # Stop at the first French translation

        if not french_translation:
            print(f"No translatable transcript found for French for video ID: {video_id}")
            return ""

        return " ".join(snippet.text for snippet in french_translation).strip()
    except Exception as e:
        print(f"Error retrieving or translating French transcript for video ID {video_id}: {e}")
        return ""
145
+
146
# Module-level translation state: nothing is loaded at import time.
model = None
tokenizer = None
device = None
# Target-language code in the NLLB format (e.g. "hau_Latn"); empty until set.
formatted_language_code = ""
148
+
149
def setModelAndTokenizer(language_code):
    """Load the translation model and tokenizer for a target language.

    Updates the module globals ``model``, ``tokenizer``, ``device`` and, for
    NLLB languages, ``formatted_language_code``. If the tokenizer currently
    loaded already belongs to the required checkpoint, nothing is reloaded.

    Args:
        language_code: One of 'ar', 'fr', 'ha', 'fa', 'ps'.

    Returns:
        A human-readable status string. It contains "not supported" for an
        unknown language code and starts with "Error" when loading fails;
        callers test for those substrings.
    """
    global model, tokenizer, device, formatted_language_code

    nllb_checkpoint = "facebook/nllb-200-distilled-600M"
    # language code -> (checkpoint, readable name, NLLB target code or None)
    supported = {
        'ar': ("Helsinki-NLP/opus-mt-tc-big-en-ar", "English to Arabic", None),
        'fr': ("Helsinki-NLP/opus-mt-tc-big-en-fr", "English to French", None),
        'ha': (nllb_checkpoint, "English to Hausa", "hau_Latn"),
        # NOTE(review): "pes_Arab" looks like the code for Iranian Persian
        # while the readable name says Dari -- confirm the intended NLLB code.
        'fa': (nllb_checkpoint, "English to Dari/Afghan Persian", "pes_Arab"),
        'ps': (nllb_checkpoint, "English to Pashto", "pbt_Arab"),
    }

    entry = supported.get(language_code) if isinstance(language_code, str) else None
    if entry is None:
        return f"Language code '{language_code}' not supported for translation model."

    _MODEL_NAME, _readable_name, nllb_target = entry
    if nllb_target is not None:
        formatted_language_code = nllb_target

    if (
        model is not None
        and tokenizer is not None
        and hasattr(tokenizer, 'name_or_path')
        and tokenizer.name_or_path == _MODEL_NAME
    ):
        print(f"Model and tokenizer for {_readable_name} already loaded.")
        return f"Model and tokenizer for {_readable_name} already loaded."

    print(f"Loading model and tokenizer for {_readable_name}...")
    is_marian = "Helsinki-NLP" in _MODEL_NAME
    family = "Helsinki-NLP" if is_marian else "Facebook NLLB"

    # Load into locals first and publish only on success, so a failed load
    # never leaves a new tokenizer paired with the previous model.
    try:
        if is_marian:
            new_tokenizer = MarianTokenizer.from_pretrained(_MODEL_NAME)
            new_model = MarianMTModel.from_pretrained(_MODEL_NAME)
            new_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            new_model.to(new_device)
        else:
            new_tokenizer = AutoTokenizer.from_pretrained(_MODEL_NAME)
            # device_map="auto" already places the weights; the model is not
            # moved again, its reported device is reused for the inputs.
            new_model = AutoModelForSeq2SeqLM.from_pretrained(_MODEL_NAME, device_map="auto")
            new_device = new_model.device
    except Exception as e:
        print(f"Error loading {family} model or tokenizer: {e}")
        return "Error loading translation model."

    tokenizer, model, device = new_tokenizer, new_model, new_device
    print(f"Successfully loaded {family} model: {_MODEL_NAME}")
    return f"Model and tokenizer set for {_readable_name}."
208
+
209
+
210
def chunk_text_by_tokens(text, tokenizer, max_tokens):
    """Split text on whitespace into chunks of at most ``max_tokens`` tokens.

    Words are added greedily to the current chunk; when adding a word pushes
    the chunk's token count (measured without special tokens) above
    ``max_tokens``, the chunk is closed and the word starts a new one. A
    single word that is too long on its own still becomes its own chunk.

    Args:
        text: The text to split.
        tokenizer: Callable returning an object with ``input_ids`` when called
            as ``tokenizer(string, add_special_tokens=False)``.
        max_tokens: Upper bound on tokens per chunk.

    Returns:
        A list of chunk strings, each with its words joined by single spaces.
    """
    def token_count(word_list):
        # Count tokens of the words only, excluding special tokens.
        encoded = tokenizer(" ".join(word_list), add_special_tokens=False)
        return len(encoded.input_ids)

    finished = []
    pending = []
    for word in text.split():
        candidate = [*pending, word]
        if token_count(candidate) <= max_tokens:
            pending = candidate
            continue
        # The word does not fit: close the current chunk and start a new one.
        if pending:
            finished.append(" ".join(pending))
        pending = [word]

    if pending:
        finished.append(" ".join(pending))
    return finished
228
+
229
+
230
def translate_me(text, language_code):
    """Translate English text into the target language.

    The text is split into token-bounded chunks, each chunk is translated
    separately and the results are joined with newlines.

    Args:
        text: English source text.
        language_code: One of 'ar', 'fr' (Helsinki-NLP models) or
            'ha', 'fa', 'ps' (NLLB model).

    Returns:
        The translated text, or one of these messages on failure:
        "No text to translate.", "Translation failed: <status>",
        "Translation not implemented for language code: <code>" or
        "Error during translation.".
    """
    global model, tokenizer, device, formatted_language_code

    # Input chunks stay below the generation cap so that a chunk's translation
    # is not cut off when the target needs more tokens than the source.
    MAX_INPUT_TOKENS = 400
    MAX_OUTPUT_TOKENS = 512

    # Checked first so that empty input never triggers a model load.
    if text is None or text.strip() == "":
        return "No text to translate."

    # Always select the model for this language: a model loaded earlier for a
    # different language must not be reused. setModelAndTokenizer returns
    # early when the right checkpoint is already loaded.
    status = setModelAndTokenizer(language_code)
    if "Error" in status or "not supported" in status:
        print(status)
        return f"Translation failed: {status}"

    try:
        if language_code in ('ar', 'fr'):
            generate_kwargs = {"max_length": MAX_OUTPUT_TOKENS}
        elif language_code in ('ha', 'fa', 'ps'):
            tokenizer.src_lang = "eng_Latn"  # English
            # NLLB selects the output language through the first generated token.
            bos_token_id = tokenizer.convert_tokens_to_ids([formatted_language_code])[0]
            generate_kwargs = {
                "forced_bos_token_id": bos_token_id,
                "max_length": MAX_OUTPUT_TOKENS,
            }
        else:
            return f"Translation not implemented for language code: {language_code}"

        translations = []
        for chunk in chunk_text_by_tokens(text, tokenizer, MAX_INPUT_TOKENS):
            inputs = tokenizer(chunk, return_tensors="pt").to(device)
            translated_tokens = model.generate(**inputs, **generate_kwargs)
            translations.append(
                tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
            )
        return "\n".join(translations)

    except Exception as e:
        print(f"Error during translation: {e}")
        return "Error during translation."
271
+
272
+
273
def say_it_api(text, _out_lang):
    """Synthesise speech for ``text`` with gTTS and write it to an MP3 file.

    Args:
        text: The text to speak.
        _out_lang: Language code passed to gTTS as ``lang``.

    Returns:
        The path "/tmp/gtts_audio.mp3" on success, or None when the text is
        empty/blank or speech generation fails (the error is printed).
    """
    if text is None or not text.strip():
        print("No text provided for gTTS speech generation.")
        return None

    output_path = "/tmp/gtts_audio.mp3"
    try:
        gTTS(text=text, lang=_out_lang).save(output_path)
    except Exception as e:
        print(f"Error during gTTS speech generation: {e}")
        return None
    return output_path
289
+
290
def speak_with_elevenlabs_api(text, language_code):
    """Synthesise speech for ``text`` through the ElevenLabs text-to-speech API.

    The API key is read from the ``ELEVENLABS_API_KEY`` environment variable.

    Args:
        text: The text to speak.
        language_code: Target language code. It is not sent to the API; the
            request relies on the multilingual model.

    Returns:
        The path "/tmp/elevenlabs_audio.mp3" on success, or None when the API
        key is missing, the text is empty/blank, the API answers with a
        non-200 status, or the request fails (the reason is printed).
    """
    ELEVENLABS_API_KEY = os.environ.get('ELEVENLABS_API_KEY')
    # NOTE(review): the original comment named this voice "Rachel" -- confirm
    # the ID against the ElevenLabs voice list.
    VOICE_ID = "EXAVITQu4vr4xnSDxMaL"
    # Without a timeout a stalled connection would block this request forever.
    REQUEST_TIMEOUT_SECONDS = 60

    if not ELEVENLABS_API_KEY:
        print("ElevenLabs API key not found in environment variables.")
        return None

    if text is None or text.strip() == "":
        print("No text provided for ElevenLabs speech generation.")
        return None

    url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}"
    headers = {
        "xi-api-key": ELEVENLABS_API_KEY,
        "Content-Type": "application/json",
    }
    data = {
        "text": text,
        "model_id": "eleven_multilingual_v2",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.5,
        },
    }
    try:
        response = requests.post(
            url, headers=headers, json=data, timeout=REQUEST_TIMEOUT_SECONDS
        )
        if response.status_code != 200:
            print(f"Error from ElevenLabs API: Status Code {response.status_code}, Response: {response.text}")
            return None

        filename = "/tmp/elevenlabs_audio.mp3"
        with open(filename, 'wb') as f:
            f.write(response.content)
        return filename
    except Exception as e:
        print(f"Error calling ElevenLabs API: {e}")
        return None
332
+
333
+
334
def speechRouter_api(text,language_code):
    """Dispatch a text-to-speech request to the backend for the language.

    'ar' and 'fr' go to say_it_api; 'ha', 'fa' and 'ps' go to
    speak_with_elevenlabs_api.

    Args:
        text: The text to speak.
        language_code: Target language code.

    Returns:
        The audio file path from the chosen backend, or None when the text is
        empty/blank or the language code is not supported.
    """
    if text is None or not text.strip():
        return None  # Nothing to speak

    if language_code in ('ar', 'fr'):
        return say_it_api(text, language_code)
    if language_code in ('ha', 'fa', 'ps'):
        return speak_with_elevenlabs_api(text, language_code)

    print(f"Language code '{language_code}' not supported for speech generation.")
    return None
350
+
351
+
352
def translate_and_speak_api_wrapper(video_id, out_lang):
    """Translate a YouTube video's English transcript and generate speech for it.

    For 'ar' and 'fr' a transcript obtained from YouTube is preferred; the
    local translation model is used when none is available. 'ha', 'fa' and
    'ps' are always translated with the local model.

    Args:
        video_id: The YouTube video ID to translate and speak.
        out_lang: The language code to translate to.

    Returns:
        A tuple containing:
        - translated_text (str): The translated text, or a failure message.
        - audio_file_path (str or None): The path to the generated audio
          file, or None if translation or speech generation failed.
    """
    # Status messages translate_me returns in place of a translation; they
    # must be reported as failures, never sent to text-to-speech.
    failure_markers = (
        "Translation failed",
        "Error during translation.",
        "No text to translate.",
        "Translation not implemented",
    )

    # Make sure the model for the target language is loaded before
    # translate_me runs.
    model_status = setModelAndTokenizer(out_lang)
    if "Error" in model_status or "not supported" in model_status:
        return f"Translation failed: {model_status}", None

    english_text = getEnglishTranscript(video_id)
    if english_text == "":
        return "No English transcript available to translate.", None

    if out_lang in ("ar", "fr"):
        if out_lang == "ar":
            language_name, fetch_transcript = "Arabic", getArabicTranscript
        else:
            language_name, fetch_transcript = "French", getFrenchTranscript
        translated_text = fetch_transcript(video_id)
        if translated_text.strip() == "":
            # No transcript from YouTube: translate the English text locally.
            print(f"No direct {language_name} transcript found, translating from English.")
            translated_text = translate_me(english_text, out_lang)
    elif out_lang in ("ha", "fa", "ps"):
        translated_text = translate_me(english_text, out_lang)
    else:
        return f"Language code '{out_lang}' not supported for translation.", None

    if (
        translated_text is None
        or translated_text.strip() == ""
        or translated_text.startswith(failure_markers)
    ):
        return f"Translation to {out_lang} failed.", None

    audio_file_path = speechRouter_api(translated_text, out_lang)
    return translated_text, audio_file_path
399
+
400
+ # This function will serve as the API endpoint for Gradio.
401
def translate_and_speak_api(video_id: str, language_code: str):
    """Translate a video's transcript and return the text plus an audio path.

    Args:
        video_id: The YouTube video ID.
        language_code: Target language code.

    Returns:
        A ``(translated_text, audio_path)`` tuple. ``audio_path`` is an empty
        string instead of None when no audio was produced.
    """
    print(f"Received request for video ID: {video_id}, language: {language_code}")
    result = translate_and_speak_api_wrapper(video_id, language_code)
    translated_text, audio_file_path = result

    # None is replaced by "" for the audio output; the original author notes
    # this might avoid a TypeError when autoplay is True.
    if audio_file_path is None:
        audio_file_path = ""
    return translated_text, audio_file_path
412
+
413
+
414
# Input widgets: the video ID as free text and the target language as a
# dropdown of the supported language codes ('ar' preselected).
video_id_input = gr.Textbox(label="YouTube Video ID")
language_dropdown = gr.Dropdown(
    choices=['ar', 'fr', 'ha', 'fa', 'ps'],
    value='ar',
    label="Target Language",
)

# Output widgets: the translated text and an audio player that autoplays.
translated_text_output = gr.Textbox(label="Translated Text")
audio_output = gr.Audio(label="Translated Speech", autoplay=True)

# Gradio interface wiring the widgets to translate_and_speak_api; inputs and
# outputs are listed in the order of the function's arguments and return values.
demo = gr.Interface(
    title="YouTube Translator and Speaker",
    description="Enter a YouTube video ID and select a language to get the translated transcript and speech.",
    fn=translate_and_speak_api,
    inputs=[video_id_input, language_dropdown],
    outputs=[translated_text_output, audio_output],
)
434
+
435
# ---- Launch Both Gradio and Proxy Server ----
import multiprocessing


def run_fastapi():
    """Run the FastAPI proxy app with uvicorn on 0.0.0.0:5001."""
    uvicorn.run("app:fastapi_app", host="0.0.0.0", port=5001, log_level="info")


if __name__ == "__main__":
    # Start the proxy server in a daemon child process, then launch the
    # Gradio UI from the main process.
    multiprocessing.Process(target=run_fastapi, daemon=True).start()
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ youtube-transcript-api
3
+ transformers
4
+ sacremoses
5
+ gTTS
6
+ requests
7
+ torch
8
+ sentencepiece
9
+ accelerate
10
+ fastapi
11
+ uvicorn
12
+ httpx