|  | import os | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | os.system('git clone https://github.com/ggerganov/whisper.cpp.git') | 
					
						
						|  | os.system('make -C ./whisper.cpp') | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | os.system('bash ./whisper.cpp/models/download-ggml-model.sh small') | 
					
						
						|  | os.system('bash ./whisper.cpp/models/download-ggml-model.sh base') | 
					
						
						|  | os.system('bash ./whisper.cpp/models/download-ggml-model.sh medium') | 
					
						
						|  | os.system('bash ./whisper.cpp/models/download-ggml-model.sh base.en') | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | import gradio as gr | 
					
						
						|  | from pathlib import Path | 
					
						
						|  | import pysrt | 
					
						
						|  | import pandas as pd | 
					
						
						|  | import re | 
					
						
						|  | import time | 
					
						
						|  | import os | 
					
						
						|  | import json | 
					
						
						|  |  | 
					
						
						|  | from pytube import YouTube | 
					
						
						|  | from transformers import MarianMTModel, MarianTokenizer | 
					
						
						|  |  | 
					
						
						|  | import psutil | 
					
						
						|  | num_cores = psutil.cpu_count() | 
					
						
						|  | os.environ["OMP_NUM_THREADS"] = f"{num_cores}" | 
					
						
						|  | headers = {'Authorization': os.environ['DeepL_API_KEY']} | 
					
						
						|  |  | 
					
						
						|  | whisper_models = ["base", "small", "medium", "base.en"] | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | source_languages = { | 
					
						
						|  | "Arabic": "ar", | 
					
						
						|  | "Asturian ":"st", | 
					
						
						|  | "Belarusian":"be", | 
					
						
						|  | "Bulgarian":"bg", | 
					
						
						|  | "Czech":"cs", | 
					
						
						|  | "Danish":"da", | 
					
						
						|  | "German":"de", | 
					
						
						|  | "Greeek":"el", | 
					
						
						|  | "English":"en", | 
					
						
						|  | "Estonian":"et", | 
					
						
						|  | "Finnish":"fi", | 
					
						
						|  | "Swedish": "sv", | 
					
						
						|  | "Spanish":"es", | 
					
						
						|  | "Let the model analyze": "Let the model analyze" | 
					
						
						|  | } | 
					
						
						|  |  | 
					
						
						|  | DeepL_language_codes_for_translation = { | 
					
						
						|  | "Bulgarian": "BG", | 
					
						
						|  | "Czech": "CS", | 
					
						
						|  | "Danish": "DA", | 
					
						
						|  | "German": "DE", | 
					
						
						|  | "Greek": "EL", | 
					
						
						|  | "English": "EN", | 
					
						
						|  | "Spanish": "ES", | 
					
						
						|  | "Estonian": "ET", | 
					
						
						|  | "Finnish": "FI", | 
					
						
						|  | "French": "FR", | 
					
						
						|  | "Hungarian": "HU", | 
					
						
						|  | "Indonesian": "ID", | 
					
						
						|  | "Italian": "IT", | 
					
						
						|  | "Japanese": "JA", | 
					
						
						|  | "Lithuanian": "LT", | 
					
						
						|  | "Latvian": "LV", | 
					
						
						|  | "Dutch": "NL", | 
					
						
						|  | "Polish": "PL", | 
					
						
						|  | "Portuguese": "PT", | 
					
						
						|  | "Romanian": "RO", | 
					
						
						|  | "Russian": "RU", | 
					
						
						|  | "Slovak": "SK", | 
					
						
						|  | "Slovenian": "SL", | 
					
						
						|  | "Swedish": "SV", | 
					
						
						|  | "Turkish": "TR", | 
					
						
						|  | "Ukrainian": "UK", | 
					
						
						|  | "Chinese": "ZH" | 
					
						
						|  | } | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | transcribe_options = dict(beam_size=3, best_of=3, without_timestamps=False) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | source_language_list = [key[0] for key in source_languages.items()] | 
					
						
						|  | translation_models_list = [key[0] for key in DeepL_language_codes_for_translation.items()] | 
					
						
						|  |  | 
					
						
						|  | videos_out_path = Path("./videos_out") | 
					
						
						|  | videos_out_path.mkdir(parents=True, exist_ok=True) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def get_youtube(video_url): | 
					
						
						|  | yt = YouTube(video_url) | 
					
						
						|  | abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download() | 
					
						
						|  | print("LADATATTU POLKUUN") | 
					
						
						|  | print(abs_video_path) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | return abs_video_path | 
					
						
						|  |  | 
					
						
						|  | def speech_to_text(video_file_path, selected_source_lang, whisper_model): | 
					
						
						|  | """ | 
					
						
						|  | # Youtube with translated subtitles using OpenAI Whisper and Opus-MT models. | 
					
						
						|  | # Currently supports only English audio | 
					
						
						|  | This space allows you to: | 
					
						
						|  | 1. Download youtube video with a given url | 
					
						
						|  | 2. Watch it in the first video component | 
					
						
						|  | 3. Run automatic speech recognition on the video using Whisper | 
					
						
						|  | 4. Translate the recognized transcriptions to Finnish, Swedish, Danish | 
					
						
						|  | 5. Burn the translations to the original video and watch the video in the 2nd video component | 
					
						
						|  |  | 
					
						
						|  | Speech Recognition is based on OpenAI Whisper https://github.com/openai/whisper | 
					
						
						|  | """ | 
					
						
						|  |  | 
					
						
						|  | if(video_file_path == None): | 
					
						
						|  | raise ValueError("Error no video input") | 
					
						
						|  | print(video_file_path) | 
					
						
						|  | try: | 
					
						
						|  | _,file_ending = os.path.splitext(f'{video_file_path}') | 
					
						
						|  | print(f'file enging is {file_ending}') | 
					
						
						|  | print("starting conversion to wav") | 
					
						
						|  | os.system(f'ffmpeg -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{video_file_path.replace(file_ending, ".wav")}"') | 
					
						
						|  | print("conversion to wav ready") | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | print("starting whisper c++") | 
					
						
						|  | srt_path = str(video_file_path.replace(file_ending, ".wav")) + ".srt" | 
					
						
						|  | os.system(f'rm -f {srt_path}') | 
					
						
						|  | if selected_source_lang == "Let the model analyze": | 
					
						
						|  | os.system(f'./whisper.cpp/main "{video_file_path.replace(file_ending, ".wav")}" -t 4 -m ./whisper.cpp/models/ggml-{whisper_model}.bin -osrt') | 
					
						
						|  | else: | 
					
						
						|  | os.system(f'./whisper.cpp/main "{video_file_path.replace(file_ending, ".wav")}" -t 4 -l {source_languages.get(selected_source_lang)} -m ./whisper.cpp/models/ggml-{whisper_model}.bin -osrt') | 
					
						
						|  | print("starting whisper done with whisper") | 
					
						
						|  | except Exception as e: | 
					
						
						|  | raise RuntimeError("Error converting video to audio") | 
					
						
						|  |  | 
					
						
						|  | try: | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | df = pd.DataFrame(columns = ['start','end','text']) | 
					
						
						|  | srt_path = str(video_file_path.replace(file_ending, ".wav")) + ".srt" | 
					
						
						|  | subs = pysrt.open(srt_path) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | objects = [] | 
					
						
						|  | for sub in subs: | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | start_hours = str(str(sub.start.hours) + "00")[0:2] if len(str(sub.start.hours)) == 2 else str("0" + str(sub.start.hours) + "00")[0:2] | 
					
						
						|  | end_hours = str(str(sub.end.hours) + "00")[0:2] if len(str(sub.end.hours)) == 2 else str("0" + str(sub.end.hours) + "00")[0:2] | 
					
						
						|  |  | 
					
						
						|  | start_minutes = str(str(sub.start.minutes) + "00")[0:2] if len(str(sub.start.minutes)) == 2 else str("0" + str(sub.start.minutes) + "00")[0:2] | 
					
						
						|  | end_minutes = str(str(sub.end.minutes) + "00")[0:2] if len(str(sub.end.minutes)) == 2 else str("0" + str(sub.end.minutes) + "00")[0:2] | 
					
						
						|  |  | 
					
						
						|  | start_seconds = str(str(sub.start.seconds) + "00")[0:2] if len(str(sub.start.seconds)) == 2 else str("0" + str(sub.start.seconds) + "00")[0:2] | 
					
						
						|  | end_seconds = str(str(sub.end.seconds) + "00")[0:2] if len(str(sub.end.seconds)) == 2 else str("0" + str(sub.end.seconds) + "00")[0:2] | 
					
						
						|  |  | 
					
						
						|  | start_millis = str(str(sub.start.milliseconds) + "000")[0:3] | 
					
						
						|  | end_millis = str(str(sub.end.milliseconds) + "000")[0:3] | 
					
						
						|  | objects.append([sub.text, f'{start_hours}:{start_minutes}:{start_seconds}.{start_millis}', f'{end_hours}:{end_minutes}:{end_seconds}.{end_millis}']) | 
					
						
						|  |  | 
					
						
						|  | for object in objects: | 
					
						
						|  | srt_to_df = { | 
					
						
						|  | 'start': [object[1]], | 
					
						
						|  | 'end': [object[2]], | 
					
						
						|  | 'text': [object[0]] | 
					
						
						|  | } | 
					
						
						|  |  | 
					
						
						|  | df = pd.concat([df, pd.DataFrame(srt_to_df)]) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | return df | 
					
						
						|  |  | 
					
						
						|  | except Exception as e: | 
					
						
						|  | raise RuntimeError("Error Running inference with local model", e) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def translate_transcriptions(df, selected_translation_lang_2): | 
					
						
						|  | if selected_translation_lang_2 is None: | 
					
						
						|  | selected_translation_lang_2 = 'English' | 
					
						
						|  | df.reset_index(inplace=True) | 
					
						
						|  |  | 
					
						
						|  | print("start_translation") | 
					
						
						|  | translations = [] | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | text_combined = "" | 
					
						
						|  | for i, sentence in enumerate(df['text']): | 
					
						
						|  | if i == 0: | 
					
						
						|  | text_combined = sentence | 
					
						
						|  | else: | 
					
						
						|  | text_combined = text_combined + '\n' + sentence | 
					
						
						|  |  | 
					
						
						|  | data = {'text': text_combined, | 
					
						
						|  | 'tag_spitting': 'xml', | 
					
						
						|  | 'target_lang': DeepL_language_codes_for_translation.get(selected_translation_lang_2) | 
					
						
						|  | } | 
					
						
						|  | response = requests.post('https://api-free.deepl.com/v2/translate', headers=headers, data=data) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | translated_sentences = json.loads(response.text) | 
					
						
						|  | translated_sentences = translated_sentences['translations'][0]['text'].split('\n') | 
					
						
						|  | df['translation'] = translated_sentences | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | print("translations done") | 
					
						
						|  |  | 
					
						
						|  | return (df) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def create_srt_and_burn(df, video_in): | 
					
						
						|  |  | 
					
						
						|  | print("Starting creation of video wit srt") | 
					
						
						|  | print("video in path is:") | 
					
						
						|  | print(video_in) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | with open('testi.srt','w', encoding="utf-8") as file: | 
					
						
						|  | for i in range(len(df)): | 
					
						
						|  | file.write(str(i+1)) | 
					
						
						|  | file.write('\n') | 
					
						
						|  | start = df.iloc[i]['start'] | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | file.write(f"{start}") | 
					
						
						|  |  | 
					
						
						|  | stop = df.iloc[i]['end'] | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | file.write(' --> ') | 
					
						
						|  | file.write(f"{stop}") | 
					
						
						|  | file.write('\n') | 
					
						
						|  | file.writelines(df.iloc[i]['translation']) | 
					
						
						|  | if int(i) != len(df)-1: | 
					
						
						|  | file.write('\n\n') | 
					
						
						|  |  | 
					
						
						|  | print("SRT DONE") | 
					
						
						|  | try: | 
					
						
						|  | file1 = open('./testi.srt', 'r', encoding="utf-8") | 
					
						
						|  | Lines = file1.readlines() | 
					
						
						|  |  | 
					
						
						|  | count = 0 | 
					
						
						|  |  | 
					
						
						|  | for line in Lines: | 
					
						
						|  | count += 1 | 
					
						
						|  | print("{}".format(line)) | 
					
						
						|  |  | 
					
						
						|  | print(type(video_in)) | 
					
						
						|  | print(video_in) | 
					
						
						|  |  | 
					
						
						|  | video_out = video_in.replace('.mp4', '_out.mp4') | 
					
						
						|  | print("video_out_path") | 
					
						
						|  | print(video_out) | 
					
						
						|  | command = 'ffmpeg -i "{}" -y -vf subtitles=./testi.srt "{}"'.format(video_in, video_out) | 
					
						
						|  | print(command) | 
					
						
						|  | os.system(command) | 
					
						
						|  | return video_out | 
					
						
						|  | except Exception as e: | 
					
						
						|  | print(e) | 
					
						
						|  | return video_out | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | video_in = gr.Video(label="Video file", mirror_webcam=False) | 
					
						
						|  | youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True) | 
					
						
						|  | video_out = gr.Video(label="Video Out", mirror_webcam=False) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | df_init = pd.DataFrame(columns=['start','end','text']) | 
					
						
						|  | df_init_2 = pd.DataFrame(columns=['start','end','text','translation']) | 
					
						
						|  |  | 
					
						
						|  | selected_source_lang = gr.Dropdown(choices=source_language_list, type="value", value="Let the model analyze", label="Spoken language in video", interactive=True) | 
					
						
						|  | selected_translation_lang_2 = gr.Dropdown(choices=translation_models_list, type="value", value="English", label="In which language you want the transcriptions?", interactive=True) | 
					
						
						|  | selected_whisper_model = gr.Dropdown(choices=whisper_models, type="value", value="base", label="Selected Whisper model", interactive=True) | 
					
						
						|  |  | 
					
						
						|  | transcription_df = gr.DataFrame(value=df_init,label="Transcription dataframe", row_count=(0, "dynamic"), max_rows = 10, wrap=True, overflow_row_behaviour='paginate') | 
					
						
						|  | transcription_and_translation_df = gr.DataFrame(value=df_init_2,label="Transcription and translation dataframe", max_rows = 10, wrap=True, overflow_row_behaviour='paginate') | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
# Page container with a small custom stylesheet. In the CSS, "#\31 3" is the
# escaped form of a selector for the element whose id is "13".
demo = gr.Blocks(css='''
    #cut_btn, #reset_btn { align-self:stretch; }
    #\\31 3 { max-width: 540px; }
    .output-markdown {max-width: 65ch !important;}
''')
demo.encrypt = False
# Layout: the statements below run top to bottom, and components appear on
# the page in the order they are created or rendered.
with demo:
    # NOTE(review): not referenced by any code visible in this file.
    transcription_var = gr.Variable()

    # Introduction (left) and example URLs (right).
    with gr.Row():
        with gr.Column():
            gr.Markdown('''
            ### This space allows you to:
            ##### 1. Download youtube video with a given URL
            ##### 2. Watch it in the first video component
            ##### 3. Run automatic speech recognition on the video using Whisper (Please remember to select translation language)
            ##### 4. Translate the recognized transcriptions to Finnish, Swedish, Danish
            ##### 5. Burn the translations to the original video and watch the video in the 2nd video component
            ''')

        with gr.Column():
            gr.Markdown('''
            ### 1. Insert Youtube URL below (Some examples below which I suggest to use for first tests)
            ##### 1. https://www.youtube.com/watch?v=nlMuHtV82q8&ab_channel=NothingforSale24
            ##### 2. https://www.youtube.com/watch?v=JzPfMbG1vrE&ab_channel=ExplainerVideosByLauren
            ##### 3. https://www.youtube.com/watch?v=S68vvV0kod8&ab_channel=Pearl-CohnTelevision
            ''')

    # Step 1: download the video; get_youtube's return value is sent to the
    # input video component.
    with gr.Row():
        with gr.Column():
            youtube_url_in.render()
            download_youtube_btn = gr.Button("Step 1. Download Youtube video")
            download_youtube_btn.click(get_youtube, [youtube_url_in], [
                video_in])
            # Debug output: prints the component object while the layout is
            # being built, not a video path.
            print(video_in)

    # Step 2: video player next to the transcription controls;
    # speech_to_text's result is sent to the transcription table.
    with gr.Row():
        with gr.Column():
            video_in.render()
        with gr.Column():
            gr.Markdown('''
            ##### Here you can start the transcription and translation process.
            ##### Be aware that processing will last for a while (35 second video took around 20 seconds in my testing and might fail for longer videos)
            ''')
            selected_source_lang.render()
            selected_whisper_model.render()
            transcribe_btn = gr.Button("Step 2. Transcribe audio")
            transcribe_btn.click(speech_to_text, [video_in, selected_source_lang, selected_whisper_model], transcription_df)

    with gr.Row():
        gr.Markdown('''
        ##### Here you will get transcription  output
        ##### ''')

    with gr.Row():
        with gr.Column():
            transcription_df.render()

    # Step 3: translate the transcription table into the selected language;
    # the result is sent to the second table.
    with gr.Row():
        with gr.Column():
            gr.Markdown('''
            ##### Here you will get translated transcriptions.
            ##### Please remember to select Spoken Language and wanted translation language
            ##### ''')
            selected_translation_lang_2.render()
            translate_transcriptions_button = gr.Button("Step 3. Translate transcription")
            translate_transcriptions_button.click(translate_transcriptions, [transcription_df, selected_translation_lang_2], transcription_and_translation_df)
            transcription_and_translation_df.render()

    # Step 4: burn the translated subtitles into the video;
    # create_srt_and_burn's return value is sent to the output video.
    with gr.Row():
        with gr.Column():
            gr.Markdown('''
            ##### Now press the Step 4. Button to create output video with translated transcriptions
            ##### ''')
            translate_and_make_srt_btn = gr.Button("Step 4. Create and burn srt to video")
            # Debug output, same as above: prints the component object.
            print(video_in)
            translate_and_make_srt_btn.click(create_srt_and_burn, [transcription_and_translation_df,video_in], [
                video_out])
            video_out.render()

# Runs unconditionally when the module is executed or imported.
demo.launch()