Update app.py
app.py
CHANGED
@@ -34,6 +34,12 @@ combined_models = []
 combined_models.extend(whisper_models)
 combined_models.extend(custom_models)
 
+usage = requests.get('https://api-free.deepl.com/v2/usage', headers=headers)
+usage = json.loads(usage.text)
+deepL_character_usage = str(usage['character_count'])
+print("deepL_character_usage")
+
+
 
 LANGUAGES = {
     "en": "English",
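Note: the hunk above fetches the DeepL free-tier usage counter once at module import, but its added print("deepL_character_usage") logs the literal string rather than the value. A minimal sketch of the same startup check with the print corrected (the auth header is a placeholder; app.py builds the real `headers` elsewhere):

import json
import requests

# Placeholder auth header; app.py builds the real `headers` from its own key.
headers = {'Authorization': 'DeepL-Auth-Key <YOUR_DEEPL_API_KEY>'}

# Same endpoint as the diff: report how many characters of the monthly quota are used.
usage = json.loads(requests.get('https://api-free.deepl.com/v2/usage', headers=headers).text)
deepL_character_usage = str(usage['character_count'])
print('deepL_character_usage:', deepL_character_usage)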
@@ -217,7 +223,7 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model):
     1. Download youtube video with a given url
     2. Watch it in the first video component
     3. Run automatic speech recognition on the video using fast Whisper models
-    4. Translate the recognized transcriptions to 26 languages supported by deepL
+    4. Translate the recognized transcriptions to 26 languages supported by deepL (If free API usage for the month is not yet fully consumed)
     5. Download generated subtitles in .vtt and .srt formats
     6. Watch the the original video with generated subtitles
 
@@ -229,13 +235,19 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model):
         raise ValueError("Error no video input")
     print(video_file_path)
     try:
+
+
+
         _,file_ending = os.path.splitext(f'{video_file_path}')
         print(f'file enging is {file_ending}')
         print("starting conversion to wav")
         os.system(f'ffmpeg -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{video_file_path.replace(file_ending, ".wav")}"')
         print("conversion to wav ready")
+
+    except Exception as e:
+        raise RuntimeError("Error Running inference with local model", e)
 
-
+    try:
 
         print("starting whisper c++")
         srt_path = str(video_file_path.replace(file_ending, ".wav")) + ".srt"
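whisper.cpp expects 16 kHz mono 16-bit PCM audio, which is what the ffmpeg flags in this hunk produce (-ar 16000 -ac 1 -c:a pcm_s16le). As a sketch, the same conversion via subprocess.run avoids the shell quoting that os.system needs, and swapping the extension with os.path.splitext avoids str.replace matching the extension elsewhere in the path (illustrative, not the committed code):

import os
import subprocess

def convert_to_wav(video_file_path):
    # Replace only the trailing extension, unlike str.replace on the whole path.
    base, _ = os.path.splitext(video_file_path)
    wav_path = base + '.wav'
    # 16 kHz, mono, 16-bit PCM: the input format whisper.cpp expects.
    subprocess.run(
        ['ffmpeg', '-y', '-i', video_file_path,
         '-ar', '16000', '-ac', '1', '-c:a', 'pcm_s16le', wav_path],
        check=True,
    )
    return wav_path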
@@ -249,7 +261,7 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model):
         os.system(f'./whisper.cpp/main "{video_file_path.replace(file_ending, ".wav")}" -t 4 -l {source_languages.get(selected_source_lang)} -m ./whisper.cpp/models/ggml-{whisper_model}.bin -osrt')
         print("starting whisper done with whisper")
     except Exception as e:
-        raise RuntimeError("Error
+        raise RuntimeError("Error running Whisper cpp model")
 
     try:
 
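For reference, the whisper.cpp flags in the unchanged os.system line are: -t (threads), -l (spoken language), -m (ggml model file), and -osrt (write an .srt next to the input wav, which is why srt_path is the wav path plus ".srt"). A list-based sketch of the same invocation:

import subprocess

def run_whisper_cpp(wav_path, lang_code, whisper_model):
    # -osrt makes whisper.cpp write "<wav_path>.srt" alongside the audio.
    subprocess.run(
        ['./whisper.cpp/main', wav_path,
         '-t', '4',
         '-l', lang_code,
         '-m', f'./whisper.cpp/models/ggml-{whisper_model}.bin',
         '-osrt'],
        check=True,
    )
    return wav_path + '.srt'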
@@ -283,12 +295,27 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model):
         }
 
         df = pd.concat([df, pd.DataFrame(srt_to_df)])
+    except Exception as e:
+        print("Error creating srt df")
+
 
-
-
+    try:
+        usage = requests.get('https://api-free.deepl.com/v2/usage', headers=headers)
+        usage = json.loads(usage.text)
+        char_count = str(usage['character_count'])
+
+        print('Usage is at: ' + str(usage['character_count']) + ' characters')
+
+        if usage['character_count'] >= 490000:
+            print("USAGE CLOSE TO LIMIT")
 
     except Exception as e:
-
+        print('Error with DeepL API requesting usage count')
+
+
+    return df
+
+
 
 
 def translate_transcriptions(df, selected_translation_lang_2):
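This hunk wraps the srt-to-DataFrame block in its own try/except and makes speech_to_text end by checking the DeepL quota and returning the DataFrame. In isolation, the pd.concat row-append pattern it relies on looks like this (column names are hypothetical, since the full srt_to_df dict lies outside the hunk):

import pandas as pd

df = pd.DataFrame(columns=['start', 'end', 'text'])
# One parsed subtitle entry; the real dict is built from the whisper.cpp .srt output.
srt_to_df = {'start': ['00:00:00,000'], 'end': ['00:00:02,500'], 'text': ['Hello world']}
df = pd.concat([df, pd.DataFrame(srt_to_df)])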
@@ -316,20 +343,24 @@ def translate_transcriptions(df, selected_translation_lang_2):
 
         usage = requests.get('https://api-free.deepl.com/v2/usage', headers=headers)
         usage = json.loads(usage.text)
+        deepL_character_usage = str(usage['character_count'])
         try:
-            print('Usage is at: ' +
+            print('Usage is at: ' + deepL_character_usage + 'characters')
         except Exception as e:
             print(e)
 
-        if
-            print("
-
-        response = requests.post('https://api-free.deepl.com/v2/translate', headers=headers, data=data)
+        if int(deepL_character_usage) <= 490000:
+            print("STILL CHARACTERS LEFT")
+            response = requests.post('https://api-free.deepl.com/v2/translate', headers=headers, data=data)
 
-
-
-
-
+            # Print the response from the server
+            translated_sentences = json.loads(response.text)
+            translated_sentences = translated_sentences['translations'][0]['text'].split('\n')
+            df['translation'] = translated_sentences
+
+        else:
+            df['translation'] = df['text']
+
     except Exception as e:
         print("EXCEPTION WITH DEEPL API")
         print(e)
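The gate added here only posts to the translate endpoint while the counter is at or below 490 000 of the free tier's 500 000 monthly characters; past that, the original transcription is copied into the translation column. Condensed into one helper (a sketch; data carries the text and target_lang fields the DeepL API expects):

import json
import requests

def translate_or_passthrough(df, headers, data, threshold=490000):
    # Check remaining free-tier quota before spending characters on translation.
    usage = json.loads(requests.get('https://api-free.deepl.com/v2/usage', headers=headers).text)
    if int(usage['character_count']) <= threshold:
        response = requests.post('https://api-free.deepl.com/v2/translate', headers=headers, data=data)
        translated = json.loads(response.text)['translations'][0]['text'].split('\n')
        df['translation'] = translated
    else:
        df['translation'] = df['text']
    return df

One caveat the diff inherits: splitting the joined translation on '\n' assumes DeepL returns exactly one line per input row; otherwise assigning the list to df['translation'] fails on a length mismatch.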
@@ -391,7 +422,7 @@ def translate_transcriptions(df, selected_translation_lang_2):
 
     print("SRT DONE")
     subtitle_files = ['subtitles.vtt','subtitles.srt']
-
+
     return df, subtitle_files
 
 # def burn_srt_to_video(srt_file, video_in):
@@ -467,6 +498,10 @@ demo = gr.Blocks(css='''
 .output-markdown {max-width: 65ch !important;}
 ''')
 demo.encrypt = False
+
+
+
+
 with demo:
     transcription_var = gr.Variable()
 
@@ -484,8 +519,9 @@ with demo:
 
         with gr.Column():
             gr.Markdown('''
-
-            (But please **consider using short videos** so others won't get queued)
+            ### 1. Copy any non-private Youtube video URL to box below or click one of the examples.
+            (But please **consider using short videos** so others won't get queued) <br>
+            Then press button "1. Download Youtube video"-button:
             ''')
             examples = gr.Examples(examples=
                 [ "https://www.youtube.com/watch?v=nlMuHtV82q8&ab_channel=NothingforSale24",
@@ -509,13 +545,13 @@ with demo:
         with gr.Column():
             gr.Markdown('''
             ##### Here you can start the transcription and translation process.
-
-
+            Be aware that processing will last some time. With base model it is around 3x speed
+            **Please select source language** for better transcriptions. Using 'Let the model analyze' makes mistakes sometimes and may lead to bad transcriptions
             ''')
             selected_source_lang.render()
             selected_whisper_model.render()
             transcribe_btn = gr.Button("Step 2. Transcribe audio")
-            transcribe_btn.click(speech_to_text, [video_in, selected_source_lang, selected_whisper_model], transcription_df)
+            transcribe_btn.click(speech_to_text, [video_in, selected_source_lang, selected_whisper_model], [transcription_df])
 
 
     with gr.Row():
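The functional change in this hunk wraps the click target in a list ([transcription_df]); Gradio accepts a single component or a list for outputs, and the list form generalizes to several outputs. A minimal wiring sketch of the same pattern (hypothetical components):

import gradio as gr

with gr.Blocks() as demo:
    url_in = gr.Textbox(label="Video URL")
    table_out = gr.Dataframe()
    btn = gr.Button("Step 2. Transcribe audio")
    # The function's return value fills each component listed in `outputs`, in order.
    btn.click(fn=lambda v: [[v, 'transcribed']], inputs=[url_in], outputs=[table_out])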
@@ -530,9 +566,15 @@ with demo:
     with gr.Row():
         with gr.Column():
             gr.Markdown('''
-
+            ### PLEASE READ BELOW
             Here you will can translate transcriptions to 26 languages.
-            If spoken language is not in the list, translation might not work. In this case original transcriptions are used
+            If spoken language is not in the list, translation might not work. In this case original transcriptions are used.
+            ''')
+            gr.Markdown(f'''
+            DeepL API character usage:
+            {deepL_character_usage if deepL_character_usage is not None else ''}/500 000 characters
+            If usage is over 490 000 characters original transcriptions will be used for subtitles.
+            API usage resets on 5th of every month.
             ''')
             selected_translation_lang_2.render()
             translate_transcriptions_button = gr.Button("Step 3. Translate transcription")
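Note that the second gr.Markdown added above uses an f-string, so the {deepL_character_usage ...} expression is interpolated once while the UI is being built: the displayed counter reflects usage at app startup, not at translation time. The pattern in isolation (a sketch):

import gradio as gr

deepL_character_usage = "12345"  # in the diff, fetched once at module import

with gr.Blocks() as demo:
    # Evaluated at build time; the text does not refresh after later API calls.
    gr.Markdown(f"DeepL API character usage: {deepL_character_usage if deepL_character_usage is not None else ''}/500 000 characters")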
|