Ashoka74 committed on
Commit 6af1a30 · verified · 1 Parent(s): abc4fd3

Update gradio_demo.py

Files changed (1): gradio_demo.py (+192 -32)
gradio_demo.py CHANGED

@@ -1,4 +1,3 @@
-import spaces
 import os
 import math
 import gradio as gr
@@ -16,20 +15,32 @@ from diffusers import AutoencoderKL, UNet2DConditionModel, DDIMScheduler, EulerA
 from diffusers.models.attention_processor import AttnProcessor2_0
 from transformers import CLIPTextModel, CLIPTokenizer
 from briarmbg import BriaRMBG
+import dds_cloudapi_sdk
+from dds_cloudapi_sdk import Config, Client, TextPrompt
+from dds_cloudapi_sdk.tasks.dinox import DinoxTask
 from enum import Enum
 from torch.hub import download_url_to_file
-from torch.hub import download_url_to_file
+import tempfile
 
+from sam2.build_sam import build_sam2
+
+from sam2.sam2_image_predictor import SAM2ImagePredictor
 import cv2
 
 from typing import Optional
 
 from Depth.depth_anything_v2.dpt import DepthAnythingV2
 
+import httpx
+
+client = httpx.Client(timeout=httpx.Timeout(10.0))  # Set timeout to 10 seconds
+
+
 
 
-# from FLORENCE
+# from FLORENCE
+import spaces
 import supervision as sv
 import torch
 from PIL import Image
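[Note] The new module-level httpx client uses a bare `httpx.Timeout(10.0)`, which applies the same 10 s to connect, read, write, and pool acquisition. httpx also accepts per-phase limits, useful when uploads are slow but dead hosts should fail fast; a minimal sketch (the limits and URL below are illustrative, not from this commit):

import httpx

timeout = httpx.Timeout(10.0, connect=5.0)    # 5 s to connect, 10 s for everything else
client = httpx.Client(timeout=timeout)
response = client.get("https://example.com")  # placeholder URL, usage only
response.raise_for_status()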
@@ -74,7 +85,7 @@ model.eval()
 # Change UNet
 
 with torch.no_grad():
-    new_conv_in = torch.nn.Conv2d(12, unet.conv_in.out_channels, unet.conv_in.kernel_size, unet.conv_in.stride, unet.conv_in.padding)
+    new_conv_in = torch.nn.Conv2d(8, unet.conv_in.out_channels, unet.conv_in.kernel_size, unet.conv_in.stride, unet.conv_in.padding)
     new_conv_in.weight.zero_()
     new_conv_in.weight[:, :4, :, :].copy_(unet.conv_in.weight)
     new_conv_in.bias = unet.conv_in.bias
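[Note] The 12-to-8 change tracks the checkpoint swap further down: IC-Light's `fc` (foreground-only) UNet expects 8 input channels (4 noisy latent + 4 foreground latent), while `fbc` (foreground + background) expects 12. A self-contained sketch of the same surgery on a stand-in layer:

import torch

conv_in = torch.nn.Conv2d(4, 320, kernel_size=3, padding=1)  # stand-in for unet.conv_in

with torch.no_grad():
    new_conv_in = torch.nn.Conv2d(8, conv_in.out_channels,
                                  conv_in.kernel_size, conv_in.stride, conv_in.padding)
    new_conv_in.weight.zero_()                              # extra channels start inert
    new_conv_in.weight[:, :4, :, :].copy_(conv_in.weight)   # reuse pretrained weights
    new_conv_in.bias = conv_in.bias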
@@ -95,15 +106,15 @@ def enable_efficient_attention():
             print(f"Xformers error: {e}")
             print("Falling back to sliced attention")
             # Use sliced attention for RTX 2070
-            # unet.set_attention_slice_size(4)
-            # vae.set_attention_slice_size(4)
+            unet.set_attention_slice_size(4)
+            vae.set_attention_slice_size(4)
             unet.set_attn_processor(AttnProcessor2_0())
             vae.set_attn_processor(AttnProcessor2_0())
     else:
         # Fallback for when xformers is not available
         print("Using sliced attention")
-        # unet.set_attention_slice_size(4)
-        # vae.set_attention_slice_size(4)
+        unet.set_attention_slice_size(4)
+        vae.set_attention_slice_size(4)
         unet.set_attn_processor(AttnProcessor2_0())
         vae.set_attn_processor(AttnProcessor2_0())
 
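[Note] `set_attention_slice_size` is not a method diffusers ships, so the lines un-commented above will raise AttributeError as soon as either branch runs. A hedged sketch of the memory-saving hooks diffusers actually exposes, assuming a recent release; the slice size mirrors the 4 used in this commit:

unet.set_attention_slice(4)   # chunked attention on UNet2DConditionModel
vae.enable_slicing()          # decode latents one image at a time
# Or at pipeline level, once the pipelines below are constructed:
# i2i_pipe.enable_attention_slicing("auto")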
@@ -129,12 +140,12 @@ unet.forward = hooked_unet_forward
 
 # Load
 
-#model_path = './models/iclight_sd15_fc.safetensors'
-model_path = './models/iclight_sd15_fbc.safetensors'
+model_path = './models/iclight_sd15_fc.safetensors'
+#model_path = './models/iclight_sd15_fbc.safetensors'
 
 
-if not os.path.exists(model_path):
-    download_url_to_file(url='https://huggingface.co/lllyasviel/ic-light/resolve/main/iclight_sd15_fbc.safetensors', dst=model_path)
+# if not os.path.exists(model_path):
+#     download_url_to_file(url='https://huggingface.co/lllyasviel/ic-light/resolve/main/iclight_sd15_fc.safetensors', dst=model_path)
 
 sd_offset = sf.load_file(model_path)
 sd_origin = unet.state_dict()
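[Note] For readers following along: the `fc`/`fbc` files hold weight offsets, not full checkpoints. The lines immediately after this hunk (unchanged by this commit, so not shown) fold them onto the base SD 1.5 UNet roughly like this sketch:

# Key names must already match, which is why conv_in was widened to
# 8 channels before this point.
sd_merged = {k: sd_origin[k] + sd_offset[k] for k in sd_origin.keys()}
unet.load_state_dict(sd_merged, strict=True)
del sd_offset, sd_origin, sd_merged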
@@ -223,7 +234,7 @@ i2i_pipe = StableDiffusionImg2ImgPipeline(
     image_encoder=None
 )
 
-@spaces.GPU(duration=60)
+
 @torch.inference_mode()
 def encode_prompt_inner(txt: str):
     max_length = tokenizer.model_max_length
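[Note] `encode_prompt_inner` exists to work around CLIP's 77-token context window. A self-contained sketch of the chunking idea it implements (the tokenizer checkpoint here is illustrative, not taken from this file):

from transformers import CLIPTokenizer

# Split an arbitrarily long prompt into windows the text encoder accepts.
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
ids = tokenizer("a long product photo prompt " * 20,
                truncation=False, add_special_tokens=False).input_ids
chunk = tokenizer.model_max_length - 2   # reserve room for BOS/EOS per window
chunks = [ids[i:i + chunk] for i in range(0, len(ids), chunk)]
print(len(ids), [len(c) for c in chunks])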
@@ -244,7 +255,7 @@ def encode_prompt_inner(txt: str):
 
     return conds
 
-@spaces.GPU(duration=60)
+
 @torch.inference_mode()
 def encode_prompt_pair(positive_prompt, negative_prompt):
     c = encode_prompt_inner(positive_prompt)
@@ -265,7 +276,7 @@ def encode_prompt_pair(positive_prompt, negative_prompt):
 
     return c, uc
 
-@spaces.GPU(duration=60)
+
 @torch.inference_mode()
 def pytorch2numpy(imgs, quant=True):
     results = []
@@ -282,7 +293,7 @@ def pytorch2numpy(imgs, quant=True):
         results.append(y)
     return results
 
-@spaces.GPU(duration=60)
+
 @torch.inference_mode()
 def numpy2pytorch(imgs):
     h = torch.from_numpy(np.stack(imgs, axis=0)).float() / 127.0 - 1.0  # so that 127 must be strictly 0.0
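[Note] On the inline comment: dividing by 127.0 instead of 127.5 makes the uint8 midpoint land exactly on 0.0, at the cost of a slightly asymmetric range. A quick numeric check:

import numpy as np
import torch

x = np.array([[0, 127, 255]], dtype=np.uint8)
h = torch.from_numpy(x).float() / 127.0 - 1.0
print(h)  # tensor([[-1.0000,  0.0000,  1.0079]]); 255 maps to 128/127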
@@ -310,7 +321,7 @@ def resize_without_crop(image, target_width, target_height):
     resized_image = pil_image.resize((target_width, target_height), Image.LANCZOS)
     return np.array(resized_image)
 
-@spaces.GPU(duration=60)
+
 @torch.inference_mode()
 def run_rmbg(img, sigma=0.0):
     # Convert RGBA to RGB if needed
@@ -454,7 +465,6 @@ def process(input_fg, prompt, image_width, image_height, num_samples, seed, step
 
     return pixels
 
-@spaces.GPU(duration=60)
 @torch.inference_mode()
 def process_bg(input_fg, input_bg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, bg_source):
     clear_memory()
@@ -548,7 +558,7 @@ def process_bg(input_fg, input_bg, prompt, image_width, image_height, num_sample
     clear_memory()
     return pixels, [fg, bg]
 
-@spaces.GPU(duration=60)
+
 @torch.inference_mode()
 def process_relight(input_fg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise, bg_source):
     input_fg, matting = run_rmbg(input_fg)
@@ -556,7 +566,7 @@ def process_relight(input_fg, prompt, image_width, image_height, num_samples, se
     return input_fg, results
 
 
-@spaces.GPU(duration=60)
+
 @torch.inference_mode()
 def process_relight_bg(input_fg, input_bg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, bg_source):
     bg_source = BGSource(bg_source)
@@ -760,17 +770,154 @@ def compress_image(image):
     compressed_img = np.array(Image.open("compressed_image.jpg"))
     return compressed_img
 
+@spaces.GPU(duration=60)
+@torch.inference_mode()
+def process_image(input_image, input_text):
+    """Main processing function for the Gradio interface"""
+
+    # Initialize configs
+    API_TOKEN = "9c8c865e10ec1821bea79d9fa9dc8720"
+    SAM2_CHECKPOINT = "./checkpoints/sam2_hiera_large.pt"
+    SAM2_MODEL_CONFIG = os.path.join(os.path.dirname(os.path.abspath(__file__)), "configs/sam2_hiera_l.yaml")
+    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+    OUTPUT_DIR = Path("outputs/grounded_sam2_dinox_demo")
+    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+
+    # Initialize DDS client
+    config = Config(API_TOKEN)
+    client = Client(config)
+
+    # Process classes from text prompt
+    classes = [x.strip().lower() for x in input_text.split('.') if x]
+    class_name_to_id = {name: id for id, name in enumerate(classes)}
+    class_id_to_name = {id: name for name, id in class_name_to_id.items()}
+
+    # Save input image to temp file and get URL
+    with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmpfile:
+        cv2.imwrite(tmpfile.name, input_image)
+        image_url = client.upload_file(tmpfile.name)
+        os.remove(tmpfile.name)
+
+    # Run DINO-X detection
+    task = DinoxTask(
+        image_url=image_url,
+        prompts=[TextPrompt(text=input_text)]
+    )
+    client.run_task(task)
+    result = task.result
+    objects = result.objects
+
+    # Process detection results
+    input_boxes = []
+    confidences = []
+    class_names = []
+    class_ids = []
+
+    for obj in objects:
+        input_boxes.append(obj.bbox)
+        confidences.append(obj.score)
+        cls_name = obj.category.lower().strip()
+        class_names.append(cls_name)
+        class_ids.append(class_name_to_id[cls_name])
+
+    input_boxes = np.array(input_boxes)
+    class_ids = np.array(class_ids)
+
+    # Initialize SAM2
+    torch.autocast(device_type=DEVICE, dtype=torch.bfloat16).__enter__()
+    if torch.cuda.get_device_properties(0).major >= 8:
+        torch.backends.cuda.matmul.allow_tf32 = True
+        torch.backends.cudnn.allow_tf32 = True
+
+    sam2_model = build_sam2(SAM2_MODEL_CONFIG, SAM2_CHECKPOINT, device=DEVICE)
+    sam2_predictor = SAM2ImagePredictor(sam2_model)
+    sam2_predictor.set_image(input_image)
+
+    # sam2_predictor = run_sam_inference(SAM_IMAGE_MODEL, input_image, detections)
+
+    # Get masks from SAM2
+    masks, scores, logits = sam2_predictor.predict(
+        point_coords=None,
+        point_labels=None,
+        box=input_boxes,
+        multimask_output=False,
+    )
+    if masks.ndim == 4:
+        masks = masks.squeeze(1)
+
+    # Create visualization
+    labels = [f"{class_name} {confidence:.2f}"
+              for class_name, confidence in zip(class_names, confidences)]
+
+    detections = sv.Detections(
+        xyxy=input_boxes,
+        mask=masks.astype(bool),
+        class_id=class_ids
+    )
+
+    box_annotator = sv.BoxAnnotator()
+    label_annotator = sv.LabelAnnotator()
+    mask_annotator = sv.MaskAnnotator()
+
+    annotated_frame = input_image.copy()
+    annotated_frame = box_annotator.annotate(scene=annotated_frame, detections=detections)
+    annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels)
+    annotated_frame = mask_annotator.annotate(scene=annotated_frame, detections=detections)
+
+    # Create transparent mask for first detected object
+    if len(detections) > 0:
+        # Get first mask
+        first_mask = detections.mask[0]
+
+        # Get original RGB image
+        img = input_image.copy()
+        H, W, C = img.shape
+
+        # Create RGBA image
+        alpha = np.zeros((H, W, 1), dtype=np.uint8)
+        alpha[first_mask] = 255
+        rgba = np.dstack((img, alpha)).astype(np.uint8)
+
+        # Crop to mask bounds to minimize image size
+        y_indices, x_indices = np.where(first_mask)
+        y_min, y_max = y_indices.min(), y_indices.max()
+        x_min, x_max = x_indices.min(), x_indices.max()
+
+        # Crop the RGBA image
+        cropped_rgba = rgba[y_min:y_max+1, x_min:x_max+1]
+
+        # Set extracted foreground for mask mover
+        mask_mover.set_extracted_fg(cropped_rgba)
+
+        return annotated_frame, cropped_rgba, gr.update(visible=True), gr.update(visible=True)
+
+    return annotated_frame, None, gr.update(visible=False), gr.update(visible=False)
+
 
 block = gr.Blocks().queue()
 with block:
-    with gr.Tab("Text", visible=False):
+    with gr.Tab("Text"):
         with gr.Row():
             gr.Markdown("## Product Placement from Text")
         with gr.Row():
             with gr.Column():
                 with gr.Row():
                     input_fg = gr.Image(type="numpy", label="Image", height=480)
-                    output_bg = gr.Image(type="numpy", label="Preprocessed Foreground", height=480)
+                with gr.Row():
+                    with gr.Group():
+                        find_objects_button = gr.Button(value="(Option 1) Segment Object from text")
+                        text_prompt = gr.Textbox(
+                            label="Text Prompt",
+                            placeholder="Enter object classes separated by periods (e.g. 'car . person .')",
+                            value="couch . table ."
+                        )
+                        extract_button = gr.Button(value="(Option 2) Remove Background")
+                with gr.Row():
+                    extracted_objects = gr.Image(type="numpy", label="Extracted Foreground", height=480)
+                    extracted_fg = gr.Image(type="numpy", label="Extracted Foreground", height=480)
+
+                # output_bg = gr.Image(type="numpy", label="Preprocessed Foreground", height=480)
                 with gr.Group():
                     prompt = gr.Textbox(label="Prompt")
                     bg_source = gr.Radio(choices=[e.value for e in BGSource],
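[Note] This hunk publishes a live-looking DDS Cloud API token in source; it should be rotated and moved out of the file. A minimal sketch of the usual fix (DDS_API_TOKEN is an illustrative environment variable name, not something this commit defines):

import os

API_TOKEN = os.environ.get("DDS_API_TOKEN")  # hypothetical env var
if not API_TOKEN:
    raise RuntimeError("Set DDS_API_TOKEN before launching the demo")

Also worth flagging: `process_image` calls `mask_mover.set_extracted_fg(...)`, but `mask_mover` is only created later inside the Background tab, so the Text-tab button works only because Python resolves the global at call time, after the UI has been built.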
@@ -811,14 +958,27 @@ with block:
             # run_on_click=True, examples_per_page=1024
             # )
         ips = [input_fg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise, bg_source]
-        relight_button.click(fn=process_relight, inputs=ips, outputs=[output_bg, result_gallery])
+        relight_button.click(fn=process_relight, inputs=ips, outputs=[extracted_fg, result_gallery])
         example_quick_prompts.click(lambda x, y: ', '.join(y.split(', ')[:2] + [x[0]]), inputs=[example_quick_prompts, prompt], outputs=prompt, show_progress=False, queue=False)
         example_quick_subjects.click(lambda x: x[0], inputs=example_quick_subjects, outputs=prompt, show_progress=False, queue=False)
-
-    with gr.Tab("Background", visible=True):
-        mask_mover = MaskMover()
+        find_objects_button.click(
+            fn=process_image,
+            inputs=[input_fg, text_prompt],
+            outputs=[extracted_objects, extracted_fg]
+        )
 
+    with gr.Tab("Background", visible=False):
+        # empty cache
+
+        mask_mover = MaskMover()
 
+        # with torch.no_grad():
+        #     # Update the input channels to 12
+        #     new_conv_in = torch.nn.Conv2d(12, unet.conv_in.out_channels, unet.conv_in.kernel_size, unet.conv_in.stride, unet.conv_in.padding)  # Changed from 8 to 12
+        #     new_conv_in.weight.zero_()
+        #     new_conv_in.weight[:, :4, :, :].copy_(unet.conv_in.weight)
+        #     new_conv_in.bias = unet.conv_in.bias
+        #     unet.conv_in = new_conv_in
         with gr.Row():
             gr.Markdown("## IC-Light (Relighting with Foreground and Background Condition)")
             gr.Markdown("💾 Generated images are automatically saved to 'outputs' folder")
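[Note] The `.click(fn=..., inputs=..., outputs=...)` wiring used throughout this block maps one Python callable onto Gradio components. A minimal, self-contained sketch of the same pattern:

import gradio as gr

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Prompt")
    out = gr.Textbox(label="Echo")
    btn = gr.Button("Run")
    # fn receives the current value of each input component and returns
    # one value per output component, in order.
    btn.click(fn=lambda s: s.upper(), inputs=inp, outputs=out)

# demo.launch()  # uncomment to serve locally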
@@ -937,11 +1097,11 @@ with block:
             outputs=[extracted_fg, x_slider, y_slider]
         )
 
-        # find_objects_button.click(
-        #     fn=find_objects,
-        #     inputs=[input_image],
-        #     outputs=[extracted_fg]
-        # )
+        find_objects_button.click(
+            fn=process_image,
+            inputs=[input_image, text_prompt],
+            outputs=[extracted_objects, extracted_fg, x_slider, y_slider]
+        )
 
         get_depth_button.click(
             fn=get_depth,
@@ -1101,5 +1261,5 @@ with block:
     )
 
 
-
 block.launch(server_name='0.0.0.0', share=False)
+