ParsaKhaz's picture
Upload folder using huggingface_hub
45ac42f verified
raw
history blame
7.2 kB
#!/usr/bin/env python3
import gradio as gr
import os
from main import load_moondream, process_video
import tempfile
import shutil
# Get absolute path to workspace root
WORKSPACE_ROOT = os.path.dirname(os.path.abspath(__file__))
# Initialize model globally for reuse
print("Loading Moondream model...")
model, tokenizer = load_moondream()
def process_video_file(video_file, detect_keyword, box_style, ffmpeg_preset, rows, cols, test_mode):
"""Process a video file through the Gradio interface."""
try:
if not video_file:
raise gr.Error("Please upload a video file")
# Ensure input/output directories exist using absolute paths
inputs_dir = os.path.join(WORKSPACE_ROOT, 'inputs')
outputs_dir = os.path.join(WORKSPACE_ROOT, 'outputs')
os.makedirs(inputs_dir, exist_ok=True)
os.makedirs(outputs_dir, exist_ok=True)
# Copy uploaded video to inputs directory
video_filename = f"input_{os.path.basename(video_file)}"
input_video_path = os.path.join(inputs_dir, video_filename)
shutil.copy2(video_file, input_video_path)
try:
# Process the video
output_path = process_video(
input_video_path,
detect_keyword,
test_mode=test_mode,
ffmpeg_preset=ffmpeg_preset,
rows=rows,
cols=cols,
box_style=box_style
)
# Verify output exists and is readable
if not output_path or not os.path.exists(output_path):
print(f"Warning: Output path {output_path} does not exist")
# Try to find the output based on expected naming convention
expected_output = os.path.join(outputs_dir, f'{box_style}_{detect_keyword}_{video_filename}')
if os.path.exists(expected_output):
output_path = expected_output
else:
# Try searching in outputs directory for any matching file
matching_files = [f for f in os.listdir(outputs_dir) if f.startswith(f'{box_style}_{detect_keyword}_')]
if matching_files:
output_path = os.path.join(outputs_dir, matching_files[0])
else:
raise gr.Error("Failed to locate output video")
# Convert output path to absolute path if it isn't already
if not os.path.isabs(output_path):
output_path = os.path.join(WORKSPACE_ROOT, output_path)
print(f"Returning output path: {output_path}")
return output_path
finally:
# Clean up input file
try:
if os.path.exists(input_video_path):
os.remove(input_video_path)
except:
pass
except Exception as e:
print(f"Error in process_video_file: {str(e)}")
raise gr.Error(f"Error processing video: {str(e)}")
# Create the Gradio interface
with gr.Blocks(title="Video Object Detection with Moondream") as app:
gr.Markdown("# Video Object Detection with Moondream")
gr.Markdown("""
This app uses [Moondream](https://github.com/vikhyat/moondream), a powerful yet lightweight vision-language model,
to detect and visualize objects in videos. Moondream can recognize a wide variety of objects, people, text, and more
with high accuracy while being much smaller than traditional models.
Upload a video and specify what you want to detect. The app will process each frame using Moondream and visualize
the detections using your chosen style.
""")
with gr.Row():
with gr.Column():
# Input components
video_input = gr.Video(label="Upload Video")
detect_input = gr.Textbox(
label="What to Detect",
placeholder="e.g. face, logo, text, person, car, dog, etc.",
value="face",
info="Moondream can detect almost anything you can describe in natural language"
)
box_style_input = gr.Radio(
choices=['censor', 'yolo', 'hitmarker'],
value='censor',
label="Visualization Style",
info="Choose how to display detections"
)
preset_input = gr.Dropdown(
choices=['ultrafast', 'superfast', 'veryfast', 'faster', 'fast', 'medium', 'slow', 'slower', 'veryslow'],
value='medium',
label="Processing Speed (faster = lower quality)"
)
with gr.Row():
rows_input = gr.Slider(minimum=1, maximum=4, value=1, step=1, label="Grid Rows")
cols_input = gr.Slider(minimum=1, maximum=4, value=1, step=1, label="Grid Columns")
test_mode_input = gr.Checkbox(
label="Test Mode (Process first 3 seconds only)",
value=True,
info="Enable to quickly test settings on a short clip before processing the full video (recommended)"
)
process_btn = gr.Button("Process Video", variant="primary")
gr.Markdown("""
Note: Processing in test mode will only process the first 3 seconds of the video and is recommended for testing settings.
""")
gr.Markdown("""
We can get a rough estimate of how long the video will take to process by multiplying the videos framerate * seconds * the number of rows and columns and assuming 0.12 seconds processing time per detection.
For example, a 3 second video at 30fps with 2x2 grid, the estimated time is 3 * 30 * 2 * 2 * 0.12 = 43.2 seconds (tested on a 4090 GPU).
""")
with gr.Column():
# Output components
video_output = gr.Video(label="Processed Video")
# About section under the video output
gr.Markdown("""
### About Moondream
Moondream is a tiny yet powerful vision-language model that can analyze images and answer questions about them.
It's designed to be lightweight and efficient while maintaining high accuracy. Some key features:
- Only 2B parameters (compared to 80B+ in other models)
- Fast inference with minimal resource requirements
- Supports CPU and GPU execution
- Open source and free to use
Links:
- [GitHub Repository](https://github.com/vikhyat/moondream)
- [Hugging Face Space](https://huggingface.co/vikhyatk/moondream2)
- [Python Package](https://pypi.org/project/moondream/)
""")
# Event handlers
process_btn.click(
fn=process_video_file,
inputs=[video_input, detect_input, box_style_input, preset_input, rows_input, cols_input, test_mode_input],
outputs=video_output
)
if __name__ == "__main__":
app.launch(share=True)