import os
import subprocess

import gradio as gr
import torch

from run_on_video.run import MomentDETRPredictor
from lbhd.infer import lbhd_predict
from utils.export_utils import trim_video

DESCRIPTION = """
_This Space demonstrates the model from [QVHighlights: Detecting Moments and Highlights in Videos via Natural Language Queries](https://arxiv.org/abs/2107.09609), NeurIPS 2021, by [Jie Lei](http://www.cs.unc.edu/~jielei/), [Tamara L. Berg](http://tamaraberg.com/), [Mohit Bansal](http://www.cs.unc.edu/~mbansal/)_
"""

device = 'cuda' if torch.cuda.is_available() else 'cpu'
ckpt_path = 'run_on_video/moment_detr_ckpt/model_best.ckpt'
clip_model_name_or_path = 'ViT-B/32'

moment_detr_predictor = MomentDETRPredictor(
    ckpt_path=ckpt_path,
    clip_model_name_or_path=clip_model_name_or_path,
    device=device
)


def display_prediction(result):
    """Format a (start, end, score) prediction window as a Markdown line."""
    return f'### Start time: {result[0]:.2f}, End time: {result[1]:.2f}, Score: {result[2]:.2f}'


with gr.Blocks(theme=gr.themes.Default()) as demo:
    # Cross-callback state: trimmed clips and raw predictions for both models.
    output_videos = gr.State(None)
    output_lbhd_videos = gr.State(None)
    moment_prediction = gr.State(None)
    our_prediction = gr.State(None)

    gr.HTML("""
        <h1 style="text-align: center;">🎞️ Highlight Detection with MomentDETR</h1>
""") gr.Markdown(DESCRIPTION) with gr.Column(): with gr.Row(): with gr.Blocks(): with gr.Column(): gr.HTML("""

Input Video

""") input_video = gr.Video(label="Please input mp4", height=400) with gr.Blocks(): with gr.Column(): gr.HTML("""

MomentDETR Result

""") playable_video = gr.Video(height=400) display_score = gr.Markdown("### Start time, End time, Score") with gr.Blocks(): with gr.Column(): gr.HTML("""

Ours Result

""") our_result_video = gr.Video(height=400) display_clip_score = gr.Markdown("### Start time, End time, Score") with gr.Row(): with gr.Column(): retrieval_text = gr.Textbox( label="Query text", placeholder="What should be highlighted?", visible=True ) submit = gr.Button("Submit") with gr.Column(): radio_button = gr.Radio( choices=[i+1 for i in range(10)], label="Top 10", value=1 ) def update_video_player(radio_value, output_videos, output_lbhd_videos, moment_prediction, our_prediction): if output_videos is None or moment_prediction is None: return [None, None, None, None] return { playable_video: output_videos[radio_value-1], our_result_video: output_lbhd_videos[min(radio_value-1, len(output_lbhd_videos)-1)], display_score: display_prediction(moment_prediction[radio_value-1]), display_clip_score: display_prediction(our_prediction[min(radio_value-1, len(output_lbhd_videos)-1)]) } def submit_video(input_video, retrieval_text): ext = os.path.splitext(input_video)[-1].lower() if ext == ".mov": output_file = os.path.join(input_video.replace(".mov", ".mp4")) subprocess.call(['ffmpeg', '-i', input_video, "-vf", "scale=320:-2", output_file]) print(f'== video path: {input_video}') print(f'== retrieval_text: {retrieval_text}') if input_video is None: return [None, None, None, None, None, None, None, None, None, 1] if retrieval_text is None: retrieval_text = '' predictions, video_frames = moment_detr_predictor.localize_moment( video_path=input_video, query_list=[retrieval_text] ) predictions = predictions[0]['pred_relevant_windows'] print(f'== Moment prediction: {predictions}') output_files = [ trim_video( video_path= output_file if ext == ".mov" else input_video, start=predictions[i][0], end=predictions[i][1], output_file=f'{i}.mp4' ) for i in range(10)] lbhd_predictions = lbhd_predict(input_video) print(f'== lbhd_predictions: {lbhd_predictions}') output_files_lbhd = [ trim_video( video_path= output_file if ext == ".mov" else input_video, start=lbhd_predictions[i][0], end=lbhd_predictions[i][1], output_file=f'{i}_lbhd.mp4' ) for i in range(min(10, len(lbhd_predictions)))] return [ output_file if ext == ".mov" else input_video, output_files, output_files_lbhd, predictions, lbhd_predictions, output_files[0], output_files_lbhd[0], display_prediction(predictions[0]), display_prediction(lbhd_predictions[0]), 1 ] radio_button.change( fn=update_video_player, inputs=[radio_button, output_videos, output_lbhd_videos, moment_prediction, our_prediction], outputs=[playable_video, our_result_video, display_score, display_clip_score] ) submit.click( fn=submit_video, inputs=[input_video, retrieval_text], outputs=[input_video, output_videos, output_lbhd_videos, moment_prediction, our_prediction, playable_video, our_result_video, display_score, display_clip_score, radio_button] ) demo.launch()