#!/usr/bin/env python3
import gradio as gr
import os
from main import load_moondream, process_video
import tempfile
import shutil
import torch
import spaces

# Get absolute path to workspace root
WORKSPACE_ROOT = os.path.dirname(os.path.abspath(__file__))

# Check CUDA availability
print(f"Is CUDA available: {torch.cuda.is_available()}")  # Expect True on a GPU machine
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")  # GPU name

# Initialize model globally for reuse
print("Loading Moondream model...")
model, tokenizer = load_moondream()


# Request a GPU from Hugging Face Spaces (ZeroGPU); comment out when running outside Spaces
@spaces.GPU(duration=120)
def process_video_file(video_file, detect_keyword, box_style, ffmpeg_preset, rows, cols, test_mode):
    """Process a video file through the Gradio interface."""
    try:
        if not video_file:
            raise gr.Error("Please upload a video file")

        # Ensure input/output directories exist using absolute paths
        inputs_dir = os.path.join(WORKSPACE_ROOT, 'inputs')
        outputs_dir = os.path.join(WORKSPACE_ROOT, 'outputs')
        os.makedirs(inputs_dir, exist_ok=True)
        os.makedirs(outputs_dir, exist_ok=True)

        # Copy uploaded video to inputs directory
        video_filename = f"input_{os.path.basename(video_file)}"
        input_video_path = os.path.join(inputs_dir, video_filename)
        shutil.copy2(video_file, input_video_path)

        try:
            # Process the video
            output_path = process_video(
                input_video_path,
                detect_keyword,
                test_mode=test_mode,
                ffmpeg_preset=ffmpeg_preset,
                rows=rows,
                cols=cols,
                box_style=box_style
            )

            # Verify output exists and is readable
            if not output_path or not os.path.exists(output_path):
                print(f"Warning: Output path {output_path} does not exist")
                # Try to find the output based on the expected naming convention
                expected_output = os.path.join(outputs_dir, f'{box_style}_{detect_keyword}_{video_filename}')
                if os.path.exists(expected_output):
                    output_path = expected_output
                else:
                    # Fall back to any matching file in the outputs directory
                    matching_files = [f for f in os.listdir(outputs_dir)
                                      if f.startswith(f'{box_style}_{detect_keyword}_')]
                    if matching_files:
                        output_path = os.path.join(outputs_dir, matching_files[0])
                    else:
                        raise gr.Error("Failed to locate output video")

            # Convert output path to absolute path if it isn't already
            if not os.path.isabs(output_path):
                output_path = os.path.join(WORKSPACE_ROOT, output_path)

            print(f"Returning output path: {output_path}")
            return output_path

        finally:
            # Clean up input file (best effort)
            try:
                if os.path.exists(input_video_path):
                    os.remove(input_video_path)
            except OSError:
                pass

    except Exception as e:
        print(f"Error in process_video_file: {str(e)}")
        raise gr.Error(f"Error processing video: {str(e)}")


# Create the Gradio interface
with gr.Blocks(title="Promptable Video Redaction") as app:
    gr.Markdown("# Promptable Video Redaction with Moondream")

    gr.Markdown("""
    This app uses [Moondream 2B](https://github.com/vikhyat/moondream), a powerful yet lightweight
    vision-language model, to detect and visualize objects in videos. Moondream can recognize a wide
    variety of objects, people, text, and more with high accuracy while being much smaller than
    traditional models.

    This enables Moondream to redact content from video quickly with its
    [object detection](https://docs.moondream.ai/cloud/detect) capabilities.

    Upload a video and specify what you want to detect. The app will process each frame using Moondream
    and visualize the detections using your chosen style.

    Join the [Moondream Discord server](https://discord.com/invite/tRUdpjDQfH) if you have questions
    about how this works.
    """)

    with gr.Row():
        with gr.Column():
            # Input components
            video_input = gr.Video(label="Upload Video")
            detect_input = gr.Textbox(
                label="What to Detect",
                placeholder="e.g. face, logo, text, person, car, dog, etc.",
                value="face",
                info="Moondream can detect almost anything you can describe in natural language"
            )
            box_style_input = gr.Radio(
                choices=['censor', 'bounding-box', 'hitmarker'],
                value='censor',
                label="Visualization Style",
                info="Choose how to display detections"
            )
            preset_input = gr.Dropdown(
                choices=['ultrafast', 'superfast', 'veryfast', 'faster', 'fast',
                         'medium', 'slow', 'slower', 'veryslow'],
                value='medium',
                label="Processing Speed (faster = lower quality)"
            )
            with gr.Row():
                rows_input = gr.Slider(minimum=1, maximum=4, value=1, step=1, label="Grid Rows")
                cols_input = gr.Slider(minimum=1, maximum=4, value=1, step=1, label="Grid Columns")
            test_mode_input = gr.Checkbox(
                label="Test Mode (Process first 3 seconds only)",
                value=True,
                info="Enable to quickly test settings on a short clip before processing the full video (recommended)"
            )
            process_btn = gr.Button("Process Video", variant="primary")

            gr.Markdown("""
            Note: Test mode processes only the first 3 seconds of the video and is recommended for
            checking your settings before a full run.
            """)

            gr.Markdown("""
            You can get a rough estimate of processing time by multiplying the video's frame rate by
            its duration in seconds and by the number of grid cells (rows x columns), assuming about
            0.12 seconds per detection. For example, a 3-second video at 30 fps with a 2x2 grid comes
            to 3 * 30 * 2 * 2 * 0.12 = 43.2 seconds (measured on a 4090 GPU).
            """)

        with gr.Column():
            # Output components
            video_output = gr.Video(label="Processed Video")

            # About section under the video output
            gr.Markdown("""
            ### About Moondream

            Moondream is a tiny yet powerful vision-language model that can analyze images and answer
            questions about them. It's designed to be lightweight and efficient while maintaining high
            accuracy. Some key features:

            - Only 2B parameters (compared to 80B+ in other models)
            - Fast inference with minimal resource requirements
            - Supports CPU and GPU execution
            - Open source and free to use

            Links:
            - [GitHub Repository](https://github.com/vikhyat/moondream)
            - [Hugging Face Space](https://huggingface.co/vikhyatk/moondream2)
            - [Python Package](https://pypi.org/project/moondream/)
            - [Promptable Redaction Recipe](https://docs.moondream.ai/recipes/)
            """)

    # Event handlers
    process_btn.click(
        fn=process_video_file,
        inputs=[video_input, detect_input, box_style_input, preset_input,
                rows_input, cols_input, test_mode_input],
        outputs=video_output
    )

if __name__ == "__main__":
    app.launch(share=True)
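
# --- Illustrative sketch only; not used by the app ---
# A minimal helper mirroring the rough time estimate quoted in the interface text above:
# detections ~= frame rate * duration (s) * grid rows * grid columns, at roughly
# 0.12 s per detection (the figure cited in the UI, measured on a 4090 GPU).
# The function name and the default constant are assumptions for illustration;
# actual runtime depends on main.process_video and your hardware.
def estimate_processing_seconds(duration_s, fps, rows, cols, seconds_per_detection=0.12):
    """Rough processing-time estimate for a video split into a rows x cols grid."""
    return duration_s * fps * rows * cols * seconds_per_detection

# Example: estimate_processing_seconds(3, 30, 2, 2) -> 3 * 30 * 2 * 2 * 0.12 = 43.2 seconds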