|
import os |
|
|
|
import gradio as gr |
|
import torch |
|
import whisper |
|
from moviepy.editor import ( |
|
AudioFileClip, |
|
ColorClip, |
|
VideoFileClip, |
|
concatenate_videoclips, |
|
) |
|
|
|
|
|
def generate_srt_file(transcription_result: dict, srt_file_path: str, lag=0) -> None:
    """
    Write and save an SRT file from the transcription result.

    Args:
        transcription_result: The transcription result from Whisper model. It must
            hold a "segments" list whose items provide "start" and "end" (in
            seconds) and "text".
        srt_file_path: The path to save the SRT file.
        lag: Number of seconds added to the start and end time of every segment.
    """

    def _to_srt_timestamp(seconds: float) -> str:
        # Round to whole milliseconds *before* splitting into fields: taking
        # int((t % 1) * 1000) truncates float error (1.001 s became ",000").
        # Negative values are clamped because SRT has no negative timestamps.
        total_ms = max(0, round(seconds * 1000))
        hours, remainder = divmod(total_ms, 3_600_000)
        minutes, remainder = divmod(remainder, 60_000)
        secs, millis = divmod(remainder, 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

    # Explicit UTF-8 so non-ASCII transcripts do not depend on the platform's
    # default encoding.
    with open(srt_file_path, "w", encoding="utf-8") as file:
        for i, segment in enumerate(transcription_result["segments"], start=1):
            start_srt = _to_srt_timestamp(segment["start"] + lag)
            end_srt = _to_srt_timestamp(segment["end"] + lag)
            # Surrounding whitespace is dropped; a trailing newline in the text
            # would otherwise insert a blank line and end the cue early.
            text = segment["text"].strip()

            file.write(f"{i}\n{start_srt} --> {end_srt}\n{text}\n\n")
|
|
|
|
|
def get_srt_filename(video_path: str, audio_path: str = None) -> str:
    """
    Derive the SRT filename from whichever media path is available.

    Args:
        video_path: The path to the video file; used whenever it is not None.
        audio_path: The path to the audio file; used only when video_path is None.

    Returns:
        The media file's base name with its last extension replaced by ".srt"
        (no directory component).
    """
    # The video path takes precedence; the audio path is only a fallback.
    media_path = audio_path if video_path is None else video_path
    stem, _extension = os.path.splitext(os.path.basename(media_path))
    return stem + ".srt"
|
|
|
|
|
def generate_video(
    audio_path: str,
    video_path: str,
    input: str,
    language: str,
    lag: int,
    progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> tuple[str, str]:
    """
    Generate a subtitled video from the input audio or video file.

    The audio is transcribed with the module-level ``MODEL``, an SRT file is
    written next to the working directory, and a video path is returned
    together with the SRT path.

    Args:
        audio_path: The path to the audio file.
        video_path: The path to the video file.
        input: The type of input file ("Video" or "Audio").
        language: The language code for transcription.
        lag: The lag time in seconds to delay the transcription.
        progress: The progress bar to show the progress of the task.

    Returns:
        The path to the generated video file and the SRT file. For a video
        input with no lag, the original video path is returned unchanged.

    Raises:
        gr.Error: If the required file is missing, the video has no audio
            track, or the transcription contains no segments.
    """
    if audio_path is None and video_path is None:
        raise gr.Error("Please upload an audio or video file.")
    if input == "Video" and video_path is None:
        raise gr.Error("Please upload a video file.")
    if input == "Audio" and audio_path is None:
        raise gr.Error("Please upload an audio file.")
    progress(0.0, "Checking input...")

    if input == "Video":
        progress(0.0, "Extracting audio from video...")
        audio_path = f"./{os.path.splitext(os.path.basename(video_path))[0]}.wav"
        video = VideoFileClip(video_path)
        try:
            # A silent video exposes no audio clip; report it instead of
            # failing with AttributeError on None.
            if video.audio is None:
                raise gr.Error("The uploaded video has no audio track.")
            video.audio.write_audiofile(audio_path)
        finally:
            # Release the reader even when extraction fails.
            video.close()
        progress(0.1, "Audio extracted!")

    progress(0.1, "Transcribing audio...")
    result = MODEL.transcribe(audio_path, language=language)
    progress(0.30, "Audio transcribed!")

    # Check before writing anything so a failed run does not leave an empty
    # SRT file behind.
    if not result["segments"]:
        raise gr.Error("No speech detected in the audio.")

    progress(0.30, "Generating SRT file...")
    srt_file_path = get_srt_filename(video_path, audio_path)
    generate_srt_file(result, srt_file_path, lag=lag)
    progress(0.40, "SRT file generated!")

    # Without lag the uploaded video can be reused as-is.
    if input == "Video" and lag == 0:
        return video_path, srt_file_path

    output_video_path = "./transcribed_video.mp4"
    if input == "Video":
        video = VideoFileClip(video_path)
        try:
            # Append `lag` seconds of black frames after the video so the
            # shifted subtitles still fit within the clip's duration.
            black_screen = ColorClip(
                size=video.size, color=(0, 0, 0), duration=lag
            ).set_fps(1)
            final_video = concatenate_videoclips([video, black_screen])
            final_video.write_videofile(
                output_video_path, codec="libx264", audio_codec="aac"
            )
        finally:
            video.close()
    else:
        audio_clip = AudioFileClip(audio_path)
        try:
            # Audio-only input: render a black 1280x720 clip carrying the audio,
            # extended by the lag.
            duration = audio_clip.duration + lag
            video_clip = ColorClip(
                size=(1280, 720), color=(0, 0, 0), duration=duration
            ).set_fps(1)
            video_clip = video_clip.set_audio(audio_clip)
            video_clip.write_videofile(
                output_video_path, codec="libx264", audio_codec="aac"
            )
        finally:
            audio_clip.close()
    return output_video_path, srt_file_path
|
|
|
|
|
def download_srt(audio_input: str, video_input: str) -> str:
    """
    Return the path of the SRT file that matches the input audio or video file.

    Args:
        audio_input: The path to the audio file.
        video_input: The path to the video file.

    Returns:
        The path to the existing SRT file.

    Raises:
        gr.Error: If neither input is provided, or no SRT file with the
            derived name exists yet.
    """
    # With no inputs there is no name to derive; get_srt_filename would fail
    # with a raw TypeError on os.path.basename(None).
    if audio_input is None and video_input is None:
        raise gr.Error("Please upload an audio or video file.")

    srt_file_path = get_srt_filename(video_input, audio_input)
    if not os.path.exists(srt_file_path):
        raise gr.Error("No SRT file found. Please generate subtitles first.")
    return srt_file_path
|
|
|
|
|
if __name__ == "__main__":
    # Load the Whisper model once at startup; generate_video reads the
    # module-level MODEL name.
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    MODEL = whisper.load_model("base", device=DEVICE)

    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        # Page header. The emoji in this block were mis-encoded (UTF-8 bytes
        # shown as Greek code-page characters) and have been restored.
        gr.Markdown(
            """
            <div style="text-align: center;">
            <h1 style="color: #4A90E2; font-size: 3em;">Audio Transcription & Subtitled Video Generator 🎥✨</h1>
            <p style="font-size: 1.2em; color: #333; max-width: 1000px; margin: auto; text-align: left;">
            Transform your audio or video files into subtitled content effortlessly! <br>
            1. Upload your audio or video file, select the language, and receive a video with synchronized subtitles. <br>
            2. You can view the subtitled video directly here or download the subtitles as an SRT file for your use.
            </p>
            </div>
            """
        )

        with gr.Row():
            # Left column: media inputs.
            with gr.Column():
                audio_input = gr.Audio(
                    sources=["upload", "microphone"],
                    type="filepath",
                    label="🎵 Upload Audio File",
                )
                video_input = gr.Video(
                    label="📹 Or Upload Video File", sources=["upload", "webcam"]
                )
            # Middle column: options and action buttons.
            with gr.Column():
                file_type = gr.Dropdown(
                    ["Video", "Audio"],
                    label="File Type",
                    value="Video",
                    interactive=True,
                )
                language = gr.Dropdown(
                    ["en", "es", "fr", "de", "it", "nl", "ru", "no", "zh"],
                    label="Select Language",
                    value="en",
                    interactive=True,
                )
                lag_slider = gr.Slider(
                    minimum=0,
                    maximum=10,
                    step=1,
                    value=0,
                    label="⏱ Lag (seconds): delay the transcription by this amount of time.",
                )
                transcribe_button = gr.Button(
                    "🎬 Generate Subtitled Video", variant="primary"
                )
                download_button = gr.Button("💾 Download SRT File", variant="secondary")

            # Right column: results.
            with gr.Column():
                video_output = gr.Video(
                    label="Play Video with Subtitles", show_download_button=False
                )
                srt_file_output = gr.File(label="Download Subtitle (SRT)")

        # generate_video returns a (video path, SRT path) pair that is passed as
        # the single value of video_output.
        # NOTE(review): this relies on gr.Video accepting a (video, subtitles)
        # tuple -- confirm against the installed Gradio version.
        transcribe_button.click(
            fn=generate_video,
            inputs=[audio_input, video_input, file_type, language, lag_slider],
            outputs=video_output,
        )

        download_button.click(
            fn=download_srt,
            inputs=[audio_input, video_input],
            outputs=srt_file_output,
        )

    demo.launch()