from typing import Optional

import gradio as gr
import numpy as np
import spaces
import supervision as sv
import torch
from PIL import Image
from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator
from sam2.build_sam import build_sam2

MARKDOWN = """
# Segment Anything Model 2 🔥

Segment Anything Model 2 (SAM 2) is a foundation model designed to address
promptable visual segmentation in both images and videos. It extends to video
by treating an image as a single-frame video. Its design, a simple transformer
architecture with streaming memory, enables real-time video processing.
"""

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
MASK_ANNOTATOR = sv.MaskAnnotator(color_lookup=sv.ColorLookup.INDEX)
CHECKPOINT = "checkpoints/sam2_hiera_large.pt"
CONFIG = "sam2_hiera_l.yaml"

sam2_model = build_sam2(CONFIG, CHECKPOINT, device=DEVICE, apply_postprocessing=False)


@spaces.GPU
def process(image_input) -> Optional[Image.Image]:
    # Guard against an empty input so the handler returns None instead of raising.
    if image_input is None:
        return None
    # Automatic mode: generate masks for the whole image without prompts.
    mask_generator = SAM2AutomaticMaskGenerator(sam2_model)
    image = np.array(image_input.convert("RGB"))
    sam_result = mask_generator.generate(image)
    # Convert the SAM output into supervision Detections and overlay the masks.
    detections = sv.Detections.from_sam(sam_result=sam_result)
    return MASK_ANNOTATOR.annotate(scene=image_input, detections=detections)


with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    with gr.Row():
        with gr.Column():
            image_input_component = gr.Image(type='pil', label='Upload image')
            submit_button_component = gr.Button(value='Submit', variant='primary')
        with gr.Column():
            image_output_component = gr.Image(type='pil', label='Image Output')

    submit_button_component.click(
        fn=process,
        inputs=[image_input_component],
        outputs=[image_output_component]
    )

demo.launch(debug=False, show_error=True, max_threads=1)