ParsaKhaz committed on
Commit
12d83f6
·
verified ·
1 Parent(s): edf540a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +105 -75
app.py CHANGED
@@ -20,25 +20,28 @@ print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
20
  print("Loading Moondream model...")
21
  model, tokenizer = load_moondream()
22
 
 
23
  # Uncomment for Hugging Face Spaces
24
  @spaces.GPU(duration=120)
25
- def process_video_file(video_file, detect_keyword, box_style, ffmpeg_preset, rows, cols, test_mode):
 
 
26
  """Process a video file through the Gradio interface."""
27
  try:
28
  if not video_file:
29
  raise gr.Error("Please upload a video file")
30
-
31
  # Ensure input/output directories exist using absolute paths
32
- inputs_dir = os.path.join(WORKSPACE_ROOT, 'inputs')
33
- outputs_dir = os.path.join(WORKSPACE_ROOT, 'outputs')
34
  os.makedirs(inputs_dir, exist_ok=True)
35
  os.makedirs(outputs_dir, exist_ok=True)
36
-
37
  # Copy uploaded video to inputs directory
38
  video_filename = f"input_{os.path.basename(video_file)}"
39
  input_video_path = os.path.join(inputs_dir, video_filename)
40
  shutil.copy2(video_file, input_video_path)
41
-
42
  try:
43
  # Process the video
44
  output_path = process_video(
@@ -48,31 +51,37 @@ def process_video_file(video_file, detect_keyword, box_style, ffmpeg_preset, row
48
  ffmpeg_preset=ffmpeg_preset,
49
  rows=rows,
50
  cols=cols,
51
- box_style=box_style
52
  )
53
-
54
  # Verify output exists and is readable
55
  if not output_path or not os.path.exists(output_path):
56
  print(f"Warning: Output path {output_path} does not exist")
57
  # Try to find the output based on expected naming convention
58
- expected_output = os.path.join(outputs_dir, f'{box_style}_{detect_keyword}_{video_filename}')
 
 
59
  if os.path.exists(expected_output):
60
  output_path = expected_output
61
  else:
62
  # Try searching in outputs directory for any matching file
63
- matching_files = [f for f in os.listdir(outputs_dir) if f.startswith(f'{box_style}_{detect_keyword}_')]
 
 
 
 
64
  if matching_files:
65
  output_path = os.path.join(outputs_dir, matching_files[0])
66
  else:
67
  raise gr.Error("Failed to locate output video")
68
-
69
  # Convert output path to absolute path if it isn't already
70
  if not os.path.isabs(output_path):
71
  output_path = os.path.join(WORKSPACE_ROOT, output_path)
72
-
73
  print(f"Returning output path: {output_path}")
74
  return output_path
75
-
76
  finally:
77
  # Clean up input file
78
  try:
@@ -80,92 +89,113 @@ def process_video_file(video_file, detect_keyword, box_style, ffmpeg_preset, row
80
  os.remove(input_video_path)
81
  except:
82
  pass
83
-
84
  except Exception as e:
85
  print(f"Error in process_video_file: {str(e)}")
86
  raise gr.Error(f"Error processing video: {str(e)}")
87
 
 
88
  # Create the Gradio interface
89
  with gr.Blocks(title="Promptable Video Redaction") as app:
90
  gr.Markdown("# Promptable Video Redaction with Moondream")
91
- gr.Markdown("""
92
- This app uses [Moondream 2B](https://github.com/vikhyat/moondream), a powerful yet lightweight vision-language model,
93
- to detect and visualize objects in videos. Moondream can recognize a wide variety of objects, people, text, and more
94
- with high accuracy while being much smaller than traditional models. This enables Moondream to redact content from
95
- video quickly with its [object detection](https://docs.moondream.ai/cloud/detect) capabilities.
96
-
97
- Upload a video and specify what you want to detect. The app will process each frame using Moondream and visualize
98
- the detections using your chosen style. Join the [Moondream Discord server](https://discord.com/invite/tRUdpjDQfH) if you have questions on how this works.
99
- """)
100
-
101
  with gr.Row():
102
  with gr.Column():
103
  # Input components
104
  video_input = gr.Video(label="Upload Video")
105
  detect_input = gr.Textbox(
106
- label="What to Detect",
107
- placeholder="e.g. face, logo, text, person, car, dog, etc.",
108
  value="face",
109
- info="Moondream can detect almost anything you can describe in natural language"
110
- )
111
- box_style_input = gr.Radio(
112
- choices=['censor', 'bounding-box', 'hitmarker'],
113
- value='censor',
114
- label="Visualization Style",
115
- info="Choose how to display detections"
116
- )
117
- preset_input = gr.Dropdown(
118
- choices=['ultrafast', 'superfast', 'veryfast', 'faster', 'fast', 'medium', 'slow', 'slower', 'veryslow'],
119
- value='medium',
120
- label="Processing Speed (faster = lower quality)"
121
  )
122
- with gr.Row():
123
- rows_input = gr.Slider(minimum=1, maximum=4, value=1, step=1, label="Grid Rows")
124
- cols_input = gr.Slider(minimum=1, maximum=4, value=1, step=1, label="Grid Columns")
125
-
126
- test_mode_input = gr.Checkbox(
127
- label="Test Mode (Process first 3 seconds only)",
128
- value=True,
129
- info="Enable to quickly test settings on a short clip before processing the full video (recommended)"
130
- )
131
-
132
  process_btn = gr.Button("Process Video", variant="primary")
133
- gr.Markdown("""
134
- Note: Processing in test mode will only process the first 3 seconds of the video and is recommended for testing settings.
135
- """)
136
-
137
- gr.Markdown("""
138
- We can get a rough estimate of how long the video will take to process by multiplying the videos framerate * seconds * the number of rows and columns and assuming 0.12 seconds processing time per detection.
139
- For example, a 3 second video at 30fps with 2x2 grid, the estimated time is 3 * 30 * 2 * 2 * 0.12 = 43.2 seconds (tested on a 4090 GPU).
140
- """)
141
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  with gr.Column():
143
  # Output components
144
  video_output = gr.Video(label="Processed Video")
145
-
146
  # About section under the video output
147
- gr.Markdown("""
148
- ### About Moondream
149
- Moondream is a tiny yet powerful vision-language model that can analyze images and answer questions about them.
150
- It's designed to be lightweight and efficient while maintaining high accuracy. Some key features:
151
- - Only 2B parameters
152
- - Fast inference with minimal resource requirements
153
- - Supports CPU and GPU execution
154
- - Open source and free to use
155
-
156
- Links:
157
  - [GitHub Repository](https://github.com/vikhyat/moondream)
158
- - [Hugging Face Space](https://huggingface.co/vikhyatk/moondream2)
159
  - [Python Package](https://pypi.org/project/moondream/)
160
- - [Promptable Redaction Recipe](https://docs.moondream.ai/recipes)
161
- """)
162
-
 
163
  # Event handlers
164
  process_btn.click(
165
  fn=process_video_file,
166
- inputs=[video_input, detect_input, box_style_input, preset_input, rows_input, cols_input, test_mode_input],
167
- outputs=video_output
 
 
 
 
 
 
 
 
168
  )
169
 
170
  if __name__ == "__main__":
171
- app.launch(share=True)
 
20
  print("Loading Moondream model...")
21
  model, tokenizer = load_moondream()
22
 
23
+
24
  # Uncomment for Hugging Face Spaces
25
  @spaces.GPU(duration=120)
26
+ def process_video_file(
27
+ video_file, detect_keyword, box_style, ffmpeg_preset, rows, cols, test_mode
28
+ ):
29
  """Process a video file through the Gradio interface."""
30
  try:
31
  if not video_file:
32
  raise gr.Error("Please upload a video file")
33
+
34
  # Ensure input/output directories exist using absolute paths
35
+ inputs_dir = os.path.join(WORKSPACE_ROOT, "inputs")
36
+ outputs_dir = os.path.join(WORKSPACE_ROOT, "outputs")
37
  os.makedirs(inputs_dir, exist_ok=True)
38
  os.makedirs(outputs_dir, exist_ok=True)
39
+
40
  # Copy uploaded video to inputs directory
41
  video_filename = f"input_{os.path.basename(video_file)}"
42
  input_video_path = os.path.join(inputs_dir, video_filename)
43
  shutil.copy2(video_file, input_video_path)
44
+
45
  try:
46
  # Process the video
47
  output_path = process_video(
 
51
  ffmpeg_preset=ffmpeg_preset,
52
  rows=rows,
53
  cols=cols,
54
+ box_style=box_style,
55
  )
56
+
57
  # Verify output exists and is readable
58
  if not output_path or not os.path.exists(output_path):
59
  print(f"Warning: Output path {output_path} does not exist")
60
  # Try to find the output based on expected naming convention
61
+ expected_output = os.path.join(
62
+ outputs_dir, f"{box_style}_{detect_keyword}_{video_filename}"
63
+ )
64
  if os.path.exists(expected_output):
65
  output_path = expected_output
66
  else:
67
  # Try searching in outputs directory for any matching file
68
+ matching_files = [
69
+ f
70
+ for f in os.listdir(outputs_dir)
71
+ if f.startswith(f"{box_style}_{detect_keyword}_")
72
+ ]
73
  if matching_files:
74
  output_path = os.path.join(outputs_dir, matching_files[0])
75
  else:
76
  raise gr.Error("Failed to locate output video")
77
+
78
  # Convert output path to absolute path if it isn't already
79
  if not os.path.isabs(output_path):
80
  output_path = os.path.join(WORKSPACE_ROOT, output_path)
81
+
82
  print(f"Returning output path: {output_path}")
83
  return output_path
84
+
85
  finally:
86
  # Clean up input file
87
  try:
 
89
  os.remove(input_video_path)
90
  except:
91
  pass
92
+
93
  except Exception as e:
94
  print(f"Error in process_video_file: {str(e)}")
95
  raise gr.Error(f"Error processing video: {str(e)}")
96
 
97
+
98
  # Create the Gradio interface
99
  with gr.Blocks(title="Promptable Video Redaction") as app:
100
  gr.Markdown("# Promptable Video Redaction with Moondream")
101
+ gr.Markdown(
102
+ """
103
+ [Moondream 2B](https://github.com/vikhyat/moondream) is a lightweight vision model that detects and visualizes objects in videos. It can identify objects, people, text and more.
104
+
105
+ Upload a video and specify what to detect. The app will process each frame and apply your chosen visualization style. For help, join the [Moondream Discord](https://discord.com/invite/tRUdpjDQfH).
106
+ """
107
+ )
108
+
 
 
109
  with gr.Row():
110
  with gr.Column():
111
  # Input components
112
  video_input = gr.Video(label="Upload Video")
113
  detect_input = gr.Textbox(
114
+ label="What to Detect",
115
+ placeholder="e.g. face, logo, text, person, car, dog, etc.",
116
  value="face",
117
+ info="Moondream can detect anything that you can describe in natural language",
 
 
 
 
 
 
 
 
 
 
 
118
  )
 
 
 
 
 
 
 
 
 
 
119
  process_btn = gr.Button("Process Video", variant="primary")
120
+
121
+ with gr.Accordion("Advanced Settings", open=False):
122
+ box_style_input = gr.Radio(
123
+ choices=["censor", "bounding-box", "hitmarker"],
124
+ value="censor",
125
+ label="Visualization Style",
126
+ info="Choose how to display detections",
127
+ )
128
+ preset_input = gr.Dropdown(
129
+ choices=[
130
+ "ultrafast",
131
+ "superfast",
132
+ "veryfast",
133
+ "faster",
134
+ "fast",
135
+ "medium",
136
+ "slow",
137
+ "slower",
138
+ "veryslow",
139
+ ],
140
+ value="medium",
141
+ label="Processing Speed (faster = lower quality)",
142
+ )
143
+ with gr.Row():
144
+ rows_input = gr.Slider(
145
+ minimum=1, maximum=4, value=1, step=1, label="Grid Rows"
146
+ )
147
+ cols_input = gr.Slider(
148
+ minimum=1, maximum=4, value=1, step=1, label="Grid Columns"
149
+ )
150
+
151
+ test_mode_input = gr.Checkbox(
152
+ label="Test Mode (Process first 3 seconds only)",
153
+ value=True,
154
+ info="Enable to quickly test settings on a short clip before processing the full video (recommended)",
155
+ )
156
+
157
+ gr.Markdown(
158
+ """
159
+ Note: Processing in test mode will only process the first 3 seconds of the video and is recommended for testing settings.
160
+ """
161
+ )
162
+
163
+ gr.Markdown(
164
+ """
165
+ We can get a rough estimate of how long the video will take to process by multiplying the videos framerate * seconds * the number of rows and columns and assuming 0.12 seconds processing time per detection.
166
+ For example, a 3 second video at 30fps with 2x2 grid, the estimated time is 3 * 30 * 2 * 2 * 0.12 = 43.2 seconds (tested on a 4090 GPU).
167
+ """
168
+ )
169
+
170
  with gr.Column():
171
  # Output components
172
  video_output = gr.Video(label="Processed Video")
173
+
174
  # About section under the video output
175
+ gr.Markdown(
176
+ """
177
+ ### Links:
 
 
 
 
 
 
 
178
  - [GitHub Repository](https://github.com/vikhyat/moondream)
179
+ - [Hugging Face](https://huggingface.co/vikhyatk/moondream2)
180
  - [Python Package](https://pypi.org/project/moondream/)
181
+ - [Moondream Recipes](https://docs.moondream.ai/recipes)
182
+ """
183
+ )
184
+
185
  # Event handlers
186
  process_btn.click(
187
  fn=process_video_file,
188
+ inputs=[
189
+ video_input,
190
+ detect_input,
191
+ box_style_input,
192
+ preset_input,
193
+ rows_input,
194
+ cols_input,
195
+ test_mode_input,
196
+ ],
197
+ outputs=video_output,
198
  )
199
 
200
  if __name__ == "__main__":
201
+ app.launch(share=True)