AnsenH committed
Commit 84805b3 · Parent: 24615d9
Files changed (3):
  1. .gitignore +2 -1
  2. app.py +43 -17
  3. run_on_video/run.py +1 -0
.gitignore CHANGED
```diff
@@ -1,4 +1,5 @@
 *.mp4
 *.MP4
 *.mov
-*.MOV
+*.MOV
+testing_data
```
app.py CHANGED
```diff
@@ -2,14 +2,16 @@ import gradio as gr
 from run_on_video.run import MomentDETRPredictor
 from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
 import torch
+from lbhd.infer import lbhd_predict
 
 DESCRIPTION = """
 _This Space demonstrates model [QVHighlights: Detecting Moments and Highlights in Videos via Natural Language Queries](https://arxiv.org/abs/2107.09609), NeurIPS 2021, by [Jie Lei](http://www.cs.unc.edu/~jielei/), [Tamara L. Berg](http://tamaraberg.com/), [Mohit Bansal](http://www.cs.unc.edu/~mbansal/)_
 """
 
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
 ckpt_path = "run_on_video/moment_detr_ckpt/model_best.ckpt"
 clip_model_name_or_path = "ViT-B/32"
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 moment_detr_predictor = MomentDETRPredictor(
     ckpt_path=ckpt_path,
@@ -22,11 +24,14 @@ def trim_video(video_path, start, end, output_file='result.mp4'):
     return output_file
 
 def display_prediction(result):
-    return f'### Moment Start time: {result[0]}, End time: {result[1]}, Score: {result[2]}'
+    return f'### Start time: {result[0]:.2f}, End time: {result[1]:.2f}, Score: {result[2]:.2f}'
 
 with gr.Blocks(theme=gr.themes.Default()) as demo:
     output_videos = gr.State(None)
+    output_lbhd_videos = gr.State(None)
     moment_prediction = gr.State(None)
+    our_prediction = gr.State(None)
+
     gr.HTML("""<h2 align="center"> 🎞️ Highlight Detection with MomentDETR </h2>""")
     gr.Markdown(DESCRIPTION)
     with gr.Column():
@@ -37,8 +42,14 @@ with gr.Blocks(theme=gr.themes.Default()) as demo:
             input_video = gr.Video(label="Please input mp4", height=400)
         with gr.Blocks():
             with gr.Column():
-                gr.HTML("""<h3 align="center"> Highlight Videos </h3>""")
+                gr.HTML("""<h3 align="center"> MomentDETR Result </h3>""")
                 playable_video = gr.Video(height=400)
+                display_score = gr.Markdown("### Start time, End time, Score")
+        with gr.Blocks():
+            with gr.Column():
+                gr.HTML("""<h3 align="center"> Ours Result </h3>""")
+                our_result_video = gr.Video(height=400)
+                display_clip_score = gr.Markdown("### Start time, End time, Score")
         with gr.Row():
             with gr.Column():
                 retrieval_text = gr.Textbox(
@@ -50,24 +61,27 @@ with gr.Blocks(theme=gr.themes.Default()) as demo:
             with gr.Column():
                 radio_button = gr.Radio(
                     choices=[i+1 for i in range(10)],
-                    label="Moments",
+                    label="Top 10",
                     value=1
                 )
-                display_score = gr.Markdown("### Moment Score: ")
+                # display_score = gr.Markdown("### Moment Score: ")
+
 
-    def update_video_player(radio_value, output_videos, moment_prediction):
+    def update_video_player(radio_value, output_videos, output_lbhd_videos, moment_prediction, our_prediction):
         if output_videos is None or moment_prediction is None:
-            return [None, None]
+            return [None, None, None, None]
         return {
-            playable_video: output_videos[radio_value-1],
-            display_score: display_prediction(moment_prediction[radio_value-1])
+            playable_video: output_videos[radio_value-1],
+            our_result_video: output_lbhd_videos[min(radio_value-1, len(output_lbhd_videos)-1)],
+            display_score: display_prediction(moment_prediction[radio_value-1]),
+            display_clip_score: display_prediction(our_prediction[min(radio_value-1, len(output_lbhd_videos)-1)])
         }
 
     def submit_video(input_video, retrieval_text):
         print(f'== video path: {input_video}')
         print(f'== retrieval_text: {retrieval_text}')
         if input_video is None:
-            return [None, None, None, None, 1]
+            return [None, None, None, None, None, None, None, None, 1]
         if retrieval_text is None:
             retrieval_text = ''
         predictions, video_frames = moment_detr_predictor.localize_moment(
@@ -75,32 +89,44 @@ with gr.Blocks(theme=gr.themes.Default()) as demo:
             query_list=[retrieval_text]
         )
         predictions = predictions[0]['pred_relevant_windows']
-        pred_windows = [[pred[0], pred[1]] for pred in predictions]
         output_files = [ trim_video(
             video_path=input_video,
-            start=pred_windows[i][0],
-            end=pred_windows[i][1],
+            start=predictions[i][0],
+            end=predictions[i][1],
             output_file=f'{i}.mp4'
         ) for i in range(10)]
-
+
+        lbhd_predictions = lbhd_predict(input_video)
+        print(f'== lbhd_predictions: {lbhd_predictions}')
+        output_files_lbhd = [ trim_video(
+            video_path=input_video,
+            start=lbhd_predictions[i][0],
+            end=lbhd_predictions[i][1],
+            output_file=f'{i}_lbhd.mp4'
+        ) for i in range(10)]
+
         return {
             output_videos: output_files,
+            output_lbhd_videos: output_files_lbhd,
             moment_prediction: predictions,
+            our_prediction: lbhd_predictions,
             playable_video: output_files[0],
+            our_result_video: output_files_lbhd[0],
             display_score: display_prediction(predictions[0]),
+            display_clip_score: display_prediction(lbhd_predictions[0]),
             radio_button: 1
         }
 
     radio_button.change(
         fn=update_video_player,
-        inputs=[radio_button, output_videos, moment_prediction],
-        outputs=[playable_video, display_score]
+        inputs=[radio_button, output_videos, output_lbhd_videos, moment_prediction, our_prediction],
+        outputs=[playable_video, our_result_video, display_score, display_clip_score]
     )
 
     submit.click(
         fn=submit_video,
         inputs=[input_video, retrieval_text],
-        outputs=[output_videos, moment_prediction, playable_video, display_score, radio_button]
+        outputs=[output_videos, output_lbhd_videos, moment_prediction, our_prediction, playable_video, our_result_video, display_score, display_clip_score, radio_button]
     )
 
 demo.launch()
```
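A note on the data shapes this change assumes: Moment-DETR's `pred_relevant_windows` is a list of `[start_sec, end_sec, score]` triples, and the new code indexes `lbhd_predictions` the same way, so `lbhd_predict` is presumably expected to return a comparable list. The sketch below shows how `trim_video` and `display_prediction` consume such triples; the `trim_video` body here is hypothetical (the diff shows only its signature and `return output_file`), but `ffmpeg_extract_subclip` is the moviepy helper the file already imports.

```python
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip

def trim_video(video_path, start, end, output_file='result.mp4'):
    # Hypothetical body: cut the [start, end] window (in seconds) into its own file.
    ffmpeg_extract_subclip(video_path, start, end, targetname=output_file)
    return output_file

def display_prediction(result):
    # Same formatting as the updated app.py: result is a [start, end, score] triple.
    return f'### Start time: {result[0]:.2f}, End time: {result[1]:.2f}, Score: {result[2]:.2f}'

# Illustrative values, not real model output:
predictions = [[0.0, 12.5, 0.93], [31.0, 45.2, 0.87]]
print(display_prediction(predictions[0]))  # ### Start time: 0.00, End time: 12.50, Score: 0.93
trim_video('input.mp4', start=predictions[0][0], end=predictions[0][1], output_file='0.mp4')
```

Note that `submit_video` builds ten clips from each list, while `update_video_player` clamps the lbhd index with `min(...)`, which suggests `lbhd_predict` may return fewer than ten windows.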
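The handlers also rely on Gradio's dict-style returns: instead of a positional list matching `outputs`, the function returns a mapping from output components to their new values, and `gr.State` slots carry the clip lists and predictions between events. A minimal self-contained sketch of that pattern (component names here are illustrative, not from the app):

```python
import gradio as gr

with gr.Blocks() as demo:
    stored = gr.State(None)  # invisible slot that persists a value between events
    choice = gr.Radio(choices=[1, 2, 3], value=1, label="Pick one")
    status = gr.Markdown("### nothing picked yet")

    def on_change(value, previous):
        # Dict-style return: update exactly the components used as keys.
        return {
            stored: value,
            status: f"### picked {value} (previous: {previous})",
        }

    choice.change(fn=on_change, inputs=[choice, stored], outputs=[stored, status])

demo.launch()
```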
run_on_video/run.py CHANGED
```diff
@@ -25,6 +25,7 @@ class MomentDETRPredictor:
         )
         print("Loading trained Moment-DETR model...")
         self.model = build_inference_model(ckpt_path).to(self.device)
+        self.model.eval()
 
     @torch.no_grad()
     def localize_moment(self, video_path, query_list):
```
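This one-line change is the standard PyTorch inference fix: `@torch.no_grad()` only disables gradient tracking, while `.eval()` is what switches `Dropout` and `BatchNorm` layers to inference behavior, making repeated forward passes deterministic. A minimal sketch of the difference:

```python
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(8, 8), nn.Dropout(p=0.5))
x = torch.ones(1, 8)

with torch.no_grad():        # no autograd, but dropout is still active
    a, b = model(x), model(x)
print(torch.equal(a, b))     # usually False: dropout masks differ per call

model.eval()                 # dropout becomes a no-op in eval mode
with torch.no_grad():
    a, b = model(x), model(x)
print(torch.equal(a, b))     # True: deterministic inference
```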