Yaowei222 committed · Commit 12edc27 · 0 parents

Initial commit

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +91 -0
  2. .gitignore +177 -0
  3. README.md +14 -0
  4. app.py +414 -0
  5. app/APP.md +103 -0
  6. app/BEN2.py +1394 -0
  7. app/aspect_ratio_template.py +88 -0
  8. app/business_logic.py +556 -0
  9. app/config.py +72 -0
  10. app/constants.py +35 -0
  11. app/event_handlers.py +155 -0
  12. app/examples.py +210 -0
  13. app/metainfo.py +131 -0
  14. app/stylesheets.py +1679 -0
  15. app/ui_components.py +354 -0
  16. app/utils.py +429 -0
  17. assets/gradio/pos_aware/001/hypher_params.txt +3 -0
  18. assets/gradio/pos_aware/001/img_gen.png +3 -0
  19. assets/gradio/pos_aware/001/img_ref.png +3 -0
  20. assets/gradio/pos_aware/001/img_target.png +3 -0
  21. assets/gradio/pos_aware/001/mask_target.png +3 -0
  22. assets/gradio/pos_aware/002/hypher_params.txt +3 -0
  23. assets/gradio/pos_aware/002/img_gen.png +3 -0
  24. assets/gradio/pos_aware/002/img_ref.png +3 -0
  25. assets/gradio/pos_aware/002/img_target.png +3 -0
  26. assets/gradio/pos_aware/002/mask_target.png +3 -0
  27. assets/gradio/pos_aware/003/hypher_params.txt +3 -0
  28. assets/gradio/pos_aware/003/img_gen.png +3 -0
  29. assets/gradio/pos_aware/003/img_ref.png +3 -0
  30. assets/gradio/pos_aware/003/img_target.png +3 -0
  31. assets/gradio/pos_aware/003/mask_target.png +3 -0
  32. assets/gradio/pos_aware/004/hypher_params.txt +3 -0
  33. assets/gradio/pos_aware/004/img_gen.png +3 -0
  34. assets/gradio/pos_aware/004/img_ref.png +3 -0
  35. assets/gradio/pos_aware/004/img_target.png +3 -0
  36. assets/gradio/pos_aware/004/mask_target.png +3 -0
  37. assets/gradio/pos_aware/005/hypher_params.txt +3 -0
  38. assets/gradio/pos_aware/005/img_gen.png +3 -0
  39. assets/gradio/pos_aware/005/img_ref.png +3 -0
  40. assets/gradio/pos_aware/005/img_target.png +3 -0
  41. assets/gradio/pos_aware/005/mask_target.png +3 -0
  42. assets/gradio/pos_free/001/hyper_params.json +1 -0
  43. assets/gradio/pos_free/001/img_gen.png +3 -0
  44. assets/gradio/pos_free/001/img_ref.png +3 -0
  45. assets/gradio/pos_free/001/img_target.png +3 -0
  46. assets/gradio/pos_free/001/mask_target.png +3 -0
  47. assets/gradio/pos_free/002/hyper_params.json +1 -0
  48. assets/gradio/pos_free/002/img_gen.png +3 -0
  49. assets/gradio/pos_free/002/img_ref.png +3 -0
  50. assets/gradio/pos_free/002/img_target.png +3 -0
.gitattributes ADDED
@@ -0,0 +1,91 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/gradio/pos_aware/005 filter=lfs diff=lfs merge=lfs -text
37
+ assets/gradio/pos_free filter=lfs diff=lfs merge=lfs -text
38
+ assets/gradio/pos_free/001/img_gen.png filter=lfs diff=lfs merge=lfs -text
39
+ assets/gradio/pos_free/001/img_ref.png filter=lfs diff=lfs merge=lfs -text
40
+ assets/gradio/pos_free/001/mask_target.png filter=lfs diff=lfs merge=lfs -text
41
+ assets/gradio/pos_free/003/hypher_params.txt filter=lfs diff=lfs merge=lfs -text
42
+ assets/gradio/pos_aware/002/hypher_params.txt filter=lfs diff=lfs merge=lfs -text
43
+ assets/gradio/pos_aware/003/hypher_params.txt filter=lfs diff=lfs merge=lfs -text
44
+ assets/gradio/pos_aware/004/mask_target.png filter=lfs diff=lfs merge=lfs -text
45
+ assets/gradio/pos_free/003/img_ref.png filter=lfs diff=lfs merge=lfs -text
46
+ assets/gradio/pos_free/004 filter=lfs diff=lfs merge=lfs -text
47
+ assets/gradio/pos_free/004/img_ref.png filter=lfs diff=lfs merge=lfs -text
48
+ assets/gradio/pos_aware/001/img_gen.png filter=lfs diff=lfs merge=lfs -text
49
+ assets/gradio/pos_aware/001/img_ref.png filter=lfs diff=lfs merge=lfs -text
50
+ assets/gradio/pos_aware/001/img_target.png filter=lfs diff=lfs merge=lfs -text
51
+ assets/gradio/pos_aware/002/img_ref.png filter=lfs diff=lfs merge=lfs -text
52
+ assets/gradio/pos_aware/003/img_gen.png filter=lfs diff=lfs merge=lfs -text
53
+ assets/gradio/pos_aware/003/img_ref.png filter=lfs diff=lfs merge=lfs -text
54
+ assets/gradio/pos_aware/004/img_gen.png filter=lfs diff=lfs merge=lfs -text
55
+ assets/gradio/pos_aware/004/img_ref.png filter=lfs diff=lfs merge=lfs -text
56
+ assets/gradio/pos_aware/004 filter=lfs diff=lfs merge=lfs -text
57
+ assets/gradio/pos_aware/004/hypher_params.txt filter=lfs diff=lfs merge=lfs -text
58
+ assets/gradio/pos_aware/005/img_gen.png filter=lfs diff=lfs merge=lfs -text
59
+ assets/gradio/pos_free/001/img_target.png filter=lfs diff=lfs merge=lfs -text
60
+ assets/gradio/pos_free/002 filter=lfs diff=lfs merge=lfs -text
61
+ assets/gradio/pos_free/002/img_ref.png filter=lfs diff=lfs merge=lfs -text
62
+ assets/gradio/pos_free/003/img_target.png filter=lfs diff=lfs merge=lfs -text
63
+ assets/gradio/pos_free/003/mask_target.png filter=lfs diff=lfs merge=lfs -text
64
+ assets/gradio/pos_aware filter=lfs diff=lfs merge=lfs -text
65
+ assets/gradio/pos_aware/001 filter=lfs diff=lfs merge=lfs -text
66
+ assets/gradio/pos_aware/005/img_ref.png filter=lfs diff=lfs merge=lfs -text
67
+ assets/gradio/pos_free/002/hypher_params.txt filter=lfs diff=lfs merge=lfs -text
68
+ assets/gradio/pos_free/002/img_gen.png filter=lfs diff=lfs merge=lfs -text
69
+ assets/gradio/pos_free/004/img_gen.png filter=lfs diff=lfs merge=lfs -text
70
+ assets/gradio/pos_aware/002/mask_target.png filter=lfs diff=lfs merge=lfs -text
71
+ assets/gradio/pos_aware/003 filter=lfs diff=lfs merge=lfs -text
72
+ assets/gradio/pos_aware/003/img_target.png filter=lfs diff=lfs merge=lfs -text
73
+ assets/gradio/pos_aware/003/mask_target.png filter=lfs diff=lfs merge=lfs -text
74
+ assets/gradio/pos_aware/005/img_target.png filter=lfs diff=lfs merge=lfs -text
75
+ assets/gradio/pos_free/001 filter=lfs diff=lfs merge=lfs -text
76
+ assets/gradio/pos_free/001/hypher_params.txt filter=lfs diff=lfs merge=lfs -text
77
+ assets/gradio/pos_free/002/mask_target.png filter=lfs diff=lfs merge=lfs -text
78
+ assets/gradio/pos_aware/005/hypher_params.txt filter=lfs diff=lfs merge=lfs -text
79
+ assets/gradio/pos_aware/005/mask_target.png filter=lfs diff=lfs merge=lfs -text
80
+ assets/gradio/pos_free/002/img_target.png filter=lfs diff=lfs merge=lfs -text
81
+ assets/gradio/pos_free/003 filter=lfs diff=lfs merge=lfs -text
82
+ assets/gradio/pos_free/003/img_gen.png filter=lfs diff=lfs merge=lfs -text
83
+ assets/gradio/pos_free/004/img_target.png filter=lfs diff=lfs merge=lfs -text
84
+ assets/gradio/pos_aware/001/hypher_params.txt filter=lfs diff=lfs merge=lfs -text
85
+ assets/gradio/pos_aware/001/mask_target.png filter=lfs diff=lfs merge=lfs -text
86
+ assets/gradio/pos_aware/002 filter=lfs diff=lfs merge=lfs -text
87
+ assets/gradio/pos_aware/004/img_target.png filter=lfs diff=lfs merge=lfs -text
88
+ assets/gradio/pos_free/004/hypher_params.txt filter=lfs diff=lfs merge=lfs -text
89
+ assets/gradio/pos_free/004/mask_target.png filter=lfs diff=lfs merge=lfs -text
90
+ assets/gradio/pos_aware/002/img_gen.png filter=lfs diff=lfs merge=lfs -text
91
+ assets/gradio/pos_aware/002/img_target.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,177 @@
1
+ # Initially taken from GitHub's Python gitignore file
2
+
3
+ # Byte-compiled / optimized / DLL files
4
+ __pycache__/
5
+ *.py[cod]
6
+ *$py.class
7
+
8
+ # C extensions
9
+ *.so
10
+
11
+ # tests and logs
12
+ tests/fixtures/cached_*_text.txt
13
+ logs/
14
+ lightning_logs/
15
+ lang_code_data/
16
+
17
+ # Distribution / packaging
18
+ .Python
19
+ build/
20
+ develop-eggs/
21
+ dist/
22
+ downloads/
23
+ eggs/
24
+ .eggs/
25
+ lib/
26
+ lib64/
27
+ parts/
28
+ sdist/
29
+ var/
30
+ wheels/
31
+ *.egg-info/
32
+ .installed.cfg
33
+ *.egg
34
+ MANIFEST
35
+
36
+ # PyInstaller
37
+ # Usually these files are written by a Python script from a template
38
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
39
+ *.manifest
40
+ *.spec
41
+
42
+ # Installer logs
43
+ pip-log.txt
44
+ pip-delete-this-directory.txt
45
+
46
+ # Unit test / coverage reports
47
+ htmlcov/
48
+ .tox/
49
+ .nox/
50
+ .coverage
51
+ .coverage.*
52
+ .cache
53
+ nosetests.xml
54
+ coverage.xml
55
+ *.cover
56
+ .hypothesis/
57
+ .pytest_cache/
58
+
59
+ # Translations
60
+ *.mo
61
+ *.pot
62
+
63
+ # Django stuff:
64
+ *.log
65
+ local_settings.py
66
+ db.sqlite3
67
+
68
+ # Flask stuff:
69
+ instance/
70
+ .webassets-cache
71
+
72
+ # Scrapy stuff:
73
+ .scrapy
74
+
75
+ # Sphinx documentation
76
+ docs/_build/
77
+
78
+ # PyBuilder
79
+ target/
80
+
81
+ # Jupyter Notebook
82
+ .ipynb_checkpoints
83
+
84
+ # IPython
85
+ profile_default/
86
+ ipython_config.py
87
+
88
+ # pyenv
89
+ .python-version
90
+
91
+ # celery beat schedule file
92
+ celerybeat-schedule
93
+
94
+ # SageMath parsed files
95
+ *.sage.py
96
+
97
+ # Environments
98
+ .env
99
+ .venv
100
+ env/
101
+ venv/
102
+ ENV/
103
+ env.bak/
104
+ venv.bak/
105
+
106
+ # Spyder project settings
107
+ .spyderproject
108
+ .spyproject
109
+
110
+ # Rope project settings
111
+ .ropeproject
112
+
113
+ # mkdocs documentation
114
+ /site
115
+
116
+ # mypy
117
+ .mypy_cache/
118
+ .dmypy.json
119
+ dmypy.json
120
+
121
+ # Pyre type checker
122
+ .pyre/
123
+
124
+ # vscode
125
+ .vs
126
+ .vscode
127
+
128
+ # Pycharm
129
+ .idea
130
+
131
+ # TF code
132
+ tensorflow_code
133
+
134
+ # Models
135
+ proc_data
136
+ /models
137
+
138
+ # examples
139
+ runs
140
+ /runs_old
141
+ /wandb
142
+ /examples/runs
143
+ /examples/**/*.args
144
+ /examples/rag/sweep
145
+
146
+ # data
147
+ data
148
+ /data
149
+ serialization_dir
150
+
151
+ # emacs
152
+ *.*~
153
+ debug.env
154
+
155
+ # vim
156
+ .*.swp
157
+
158
+ # ctags
159
+ tags
160
+
161
+ # pre-commit
162
+ .pre-commit*
163
+
164
+ # .lock
165
+ *.lock
166
+
167
+ # DS_Store (MacOS)
168
+ .DS_Store
169
+
170
+ # RL pipelines may produce mp4 outputs
171
+ *.mp4
172
+
173
+ # dependencies
174
+ /transformers
175
+
176
+ # ruff
177
+ .ruff_cache
README.md ADDED
@@ -0,0 +1,14 @@
1
+ ---
2
+ title: IC Custom
3
+ emoji: 🎨
4
+ colorFrom: purple
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 5.43.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: other
11
+ short_description: IC-Custom is designed for diverse image customization.
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,414 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ IC-Custom Gradio Application
5
+
6
+ This module defines the UI and glue logic to run the IC-Custom pipeline
7
+ via Gradio. The code aims to keep UI text user-friendly while keeping the
8
+ implementation readable and maintainable.
9
+ """
10
+ import os
11
+ import sys
12
+ import numpy as np
13
+ import torch
14
+ import gradio as gr
15
+ import spaces
16
+
17
+ from PIL import Image
18
+ import time
19
+
20
+ # Add current directory to path for imports
21
+ sys.path.append(os.getcwd() + '/app')
22
+
23
+ # Import modular components
24
+ from config import parse_args, load_config, setup_environment
25
+ from ui_components import (
26
+ create_theme, create_css, create_header_section, create_customization_section,
27
+ create_image_input_section, create_prompt_section, create_advanced_options_section,
28
+ create_mask_operation_section, create_output_section, create_examples_section,
29
+ create_citation_section
30
+ )
31
+ from event_handlers import setup_event_handlers
32
+ from business_logic import (
33
+ init_image_target_1, init_image_target_2, init_image_reference,
34
+ undo_seg_points, segmentation, get_point, get_brush,
35
+ dilate_mask, erode_mask, bounding_box,
36
+ change_input_mask_mode, change_custmization_mode, change_seg_ref_mode,
37
+ vlm_auto_generate, vlm_auto_polish, save_results, set_mobile_predictor,
38
+ set_ben2_model, set_vlm_processor, set_vlm_model,
39
+ )
40
+
41
+ # Import other dependencies
42
+ from utils import (
43
+ get_sam_predictor, get_vlm, get_ben2_model,
44
+ prepare_input_images, get_mask_type_ids
45
+ )
46
+ from examples import GRADIO_EXAMPLES, MASK_TGT, IMG_GEN
47
+ from ic_custom.pipelines.ic_custom_pipeline import ICCustomPipeline
48
+
49
+ # Global variables for pipeline and assets cache directory
50
+ PIPELINE = None
51
+ ASSETS_CACHE_DIR = None
52
+
53
+ # Force Hugging Face to re-download models and clear cache
54
+ os.environ["HF_HUB_FORCE_DOWNLOAD"] = "1"
55
+ os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
56
+ os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache" # Use temp directory for Spaces
57
+ os.environ["HF_HOME"] = "/tmp/hf_home" # Use temp directory for Spaces
58
+
59
+
60
+ os.environ["GRADIO_TEMP_DIR"] = os.path.abspath(os.path.join(os.path.dirname(__file__), "gradio_cache"))
61
+
62
+
63
+ def set_pipeline(pipeline):
64
+ """Inject pipeline into this module without changing function signatures."""
65
+ global PIPELINE
66
+ PIPELINE = pipeline
67
+
68
+ def set_assets_cache_dir(assets_cache_dir):
69
+ """Inject assets cache dir into this module without changing function signatures."""
70
+ global ASSETS_CACHE_DIR
71
+ ASSETS_CACHE_DIR = assets_cache_dir
72
+
73
+
74
+ def initialize_models(args, cfg, device, weight_dtype):
75
+ """Initialize all required models."""
76
+ # Load IC-Custom pipeline
77
+ pipeline = ICCustomPipeline(
78
+ clip_path=cfg.checkpoint_config.clip_path if os.path.exists(cfg.checkpoint_config.clip_path) else "clip-vit-large-patch14",
79
+ t5_path=cfg.checkpoint_config.t5_path if os.path.exists(cfg.checkpoint_config.t5_path) else "t5-v1_1-xxl",
80
+ siglip_path=cfg.checkpoint_config.siglip_path if os.path.exists(cfg.checkpoint_config.siglip_path) else "siglip-so400m-patch14-384",
81
+ ae_path=cfg.checkpoint_config.ae_path if os.path.exists(cfg.checkpoint_config.ae_path) else "flux-fill-dev-ae",
82
+ dit_path=cfg.checkpoint_config.dit_path if os.path.exists(cfg.checkpoint_config.dit_path) else "flux-fill-dev-dit",
83
+ redux_path=cfg.checkpoint_config.redux_path if os.path.exists(cfg.checkpoint_config.redux_path) else "flux1-redux-dev",
84
+ lora_path=cfg.checkpoint_config.lora_path if os.path.exists(cfg.checkpoint_config.lora_path) else "dit_lora_0x1561",
85
+ img_txt_in_path=cfg.checkpoint_config.img_txt_in_path if os.path.exists(cfg.checkpoint_config.img_txt_in_path) else "dit_txt_img_in_0x1561",
86
+ boundary_embeddings_path=cfg.checkpoint_config.boundary_embeddings_path if os.path.exists(cfg.checkpoint_config.boundary_embeddings_path) else "dit_boundary_embeddings_0x1561",
87
+ task_register_embeddings_path=cfg.checkpoint_config.task_register_embeddings_path if os.path.exists(cfg.checkpoint_config.task_register_embeddings_path) else "dit_task_register_embeddings_0x1561",
88
+ network_alpha=cfg.model_config.network_alpha,
89
+ double_blocks_idx=cfg.model_config.double_blocks,
90
+ single_blocks_idx=cfg.model_config.single_blocks,
91
+ device=device,
92
+ weight_dtype=weight_dtype,
93
+ offload=True,
94
+ )
95
+ pipeline.set_pipeline_offload(True)
96
+ # pipeline.set_show_progress(True)
97
+
98
+ # Load SAM predictor
99
+ mobile_predictor = get_sam_predictor(cfg.checkpoint_config.sam_path, device)
100
+
101
+ # Load VLM if enabled
102
+ vlm_processor, vlm_model = None, None
103
+ if args.enable_vlm_for_prompt:
104
+ vlm_processor, vlm_model = get_vlm(
105
+ cfg.checkpoint_config.vlm_path,
106
+ device=device,
107
+ torch_dtype=weight_dtype,
108
+ )
109
+
110
+ # Load BEN2 model if enabled
111
+ ben2_model = None
112
+ if args.enable_ben2_for_mask_ref:
113
+ ben2_model = get_ben2_model(cfg.checkpoint_config.ben2_path, device)
114
+
115
+ return pipeline, mobile_predictor, vlm_processor, vlm_model, ben2_model
116
+
117
+ @spaces.GPU(duration=140)
118
+ def run_model(
119
+ image_target_state, mask_target_state, image_reference_ori_state,
120
+ image_reference_rmbg_state, prompt, seed, guidance, true_gs, num_steps,
121
+ num_images_per_prompt, use_background_preservation, background_blend_threshold,
122
+ aspect_ratio, custmization_mode, seg_ref_mode, input_mask_mode,
123
+ progress=gr.Progress()
124
+ ):
125
+ """Run IC-Custom pipeline with current UI state and return images."""
126
+ start_ts = time.time()
127
+ progress(0, desc="Starting generation...")
128
+ # Select reference image and check inputs
129
+ if seg_ref_mode == "Masked Ref":
130
+ image_reference_state = image_reference_rmbg_state
131
+ else:
132
+ image_reference_state = image_reference_ori_state
133
+
134
+ if image_reference_state is None:
135
+ gr.Warning('Please upload the reference image')
136
+ return None, seed, gr.update(placeholder="Last Input: " + prompt, value="")
137
+
138
+ if image_target_state is None and custmization_mode != "Position-free":
139
+ gr.Warning('Please upload the target image and mask it')
140
+ return None, seed, gr.update(placeholder="Last Input: " + prompt, value="")
141
+
142
+ if custmization_mode == "Position-aware" and mask_target_state is None:
143
+ gr.Warning('Please select/draw the target mask')
144
+ return None, seed, gr.update(placeholder=prompt, value="")
145
+
146
+
147
+ mask_type_ids = get_mask_type_ids(custmization_mode, input_mask_mode)
148
+
149
+ from constants import ASPECT_RATIO_TEMPLATE
150
+ output_w, output_h = ASPECT_RATIO_TEMPLATE[aspect_ratio]
151
+ image_reference, image_target, mask_target = prepare_input_images(
152
+ image_reference_state, custmization_mode, image_target_state, mask_target_state,
153
+ width=output_w, height=output_h,
154
+ force_resize_long_edge="long edge" in aspect_ratio,
155
+ return_type="pil"
156
+ )
157
+
158
+ gr.Info(f"Output WH resolution: {image_target.size[0]}px x {image_target.size[1]}px")
159
+ # Run the model
160
+ if seed == -1:
161
+ seed = torch.randint(0, 2147483647, (1,)).item()
162
+
163
+ width, height = image_target.size[0] + image_reference.size[0], image_target.size[1]
164
+
165
+
166
+ with torch.no_grad():
167
+ output_img = PIPELINE(
168
+ prompt=prompt, width=width, height=height, guidance=guidance,
169
+ num_steps=num_steps, seed=seed, img_ref=image_reference,
170
+ img_target=image_target, mask_target=mask_target, img_ip=image_reference,
171
+ cond_w_regions=[image_reference.size[0]], mask_type_ids=mask_type_ids,
172
+ use_background_preservation=use_background_preservation,
173
+ background_blend_threshold=background_blend_threshold, true_gs=true_gs,
174
+ neg_prompt="worst quality, normal quality, low quality, low res, blurry,",
175
+ num_images_per_prompt=num_images_per_prompt,
176
+ gradio_progress=progress,
177
+ )
178
+
179
+
180
+ elapsed = time.time() - start_ts
181
+ progress(1.0, desc=f"Completed in {elapsed:.2f}s!")
182
+ gr.Info(f"Finished in {elapsed:.2f}s")
183
+
184
+ return output_img, -1, gr.update(placeholder=f"Last Input ({elapsed:.2f}s): " + prompt, value="")
185
+
186
+
187
+ def example_pipeline(
188
+ image_reference, image_target_1, image_target_2, custmization_mode,
189
+ input_mask_mode, seg_ref_mode, prompt, seed, true_gs, eg_idx,
190
+ num_steps, guidance
191
+ ):
192
+ """Handle example loading in the UI."""
193
+
194
+ if seg_ref_mode == "Full Ref":
195
+ image_reference_ori_state = np.array(image_reference.convert("RGB"))
196
+ image_reference_rmbg_state = None
197
+ image_reference_state = image_reference_ori_state
198
+ else:
199
+ image_reference_rmbg_state = np.array(image_reference.convert("RGB"))
200
+ image_reference_ori_state = None
201
+ image_reference_state = image_reference_rmbg_state
202
+
203
+ if custmization_mode == "Position-aware":
204
+ if input_mask_mode == "Precise mask":
205
+ image_target_state = np.array(image_target_1.convert("RGB"))
206
+ else:
207
+ image_target_state = np.array(image_target_2['composite'].convert("RGB"))
208
+ mask_target_state = np.array(Image.open(MASK_TGT[int(eg_idx)]))
209
+ else: # Position-free mode
210
+ # For Position-free, use the target image from IMG_TGT1 and corresponding mask
211
+ image_target_state = np.array(image_target_1.convert("RGB"))
212
+ mask_target_state = np.array(Image.open(MASK_TGT[int(eg_idx)]))
213
+
214
+ mask_target_binary = mask_target_state / 255
215
+ masked_img = image_target_state * mask_target_binary
216
+ masked_img_pil = Image.fromarray(masked_img.astype("uint8"))
217
+ output_mask_pil = Image.fromarray(mask_target_state.astype("uint8"))
218
+
219
+ if custmization_mode == "Position-aware":
220
+ mask_gallery = [masked_img_pil, output_mask_pil]
221
+ else:
222
+ mask_gallery = gr.skip()
223
+
224
+ result_gallery = [Image.open(IMG_GEN[int(eg_idx)]).convert("RGB")]
225
+
226
+ if custmization_mode == "Position-free":
227
+ return (image_reference_ori_state, image_reference_rmbg_state, image_target_state,
228
+ mask_target_state, mask_gallery, result_gallery,
229
+ gr.update(visible=False), gr.update(visible=False))
230
+
231
+ if input_mask_mode == "Precise mask":
232
+ return (image_reference_ori_state, image_reference_rmbg_state, image_target_state,
233
+ mask_target_state, mask_gallery, result_gallery,
234
+ gr.update(visible=True), gr.update(visible=False))
235
+ else:
236
+ # Ensure ImageEditor has a proper background so brush + undo work
237
+ try:
238
+ bg_img = image_target_2.get('background') or image_target_2.get('composite')
239
+ except Exception:
240
+ bg_img = image_target_2
241
+
242
+ return (
243
+ image_reference_ori_state, image_reference_rmbg_state, image_target_state,
244
+ mask_target_state, mask_gallery, result_gallery,
245
+ gr.update(visible=False),
246
+ gr.update(visible=True, value={"background": bg_img, "layers": [], "composite": bg_img}),
247
+ )
248
+
249
+
250
+ def create_application():
251
+ """Create the main Gradio application."""
252
+ # Create theme and CSS
253
+ theme = create_theme()
254
+ css = create_css()
255
+
256
+ with gr.Blocks(theme=theme, css=css) as demo:
257
+
258
+ with gr.Column(elem_id="global_glass_container"):
259
+
260
+ # Create UI sections
261
+ create_header_section()
262
+
263
+ # Hidden components
264
+ eg_idx = gr.Textbox(label="eg_idx", visible=False, value="-1")
265
+
266
+ # State variables
267
+ image_target_state = gr.State(value=None)
268
+ mask_target_state = gr.State(value=None)
269
+ image_reference_ori_state = gr.State(value=None)
270
+ image_reference_rmbg_state = gr.State(value=None)
271
+ selected_points = gr.State(value=[])
272
+
273
+
274
+ # Main UI content with optimized left-right layout
275
+ with gr.Column(elem_id="glass_card"):
276
+ # Top section - Mode selection (full width)
277
+ custmization_mode, md_custmization_mode = create_customization_section()
278
+
279
+ # Main layout: Left for inputs, Right for outputs
280
+ with gr.Row(equal_height=False):
281
+ # LEFT COLUMN - ALL INPUTS
282
+ with gr.Column(scale=3, min_width=400):
283
+ # Image input section
284
+ (image_reference, input_mask_mode, image_target_1, image_target_2,
285
+ undo_target_seg_button, md_image_reference, md_input_mask_mode,
286
+ md_target_image) = create_image_input_section()
287
+
288
+ # Text prompt section
289
+ prompt, vlm_generate_btn, vlm_polish_btn, md_prompt = create_prompt_section()
290
+
291
+ # Advanced options (collapsible)
292
+ (aspect_ratio, seg_ref_mode, move_to_center, use_background_preservation,
293
+ background_blend_threshold, seed, num_images_per_prompt, guidance,
294
+ num_steps, true_gs) = create_advanced_options_section()
295
+
296
+ # RIGHT COLUMN - ALL OUTPUTS
297
+ with gr.Column(scale=2, min_width=350):
298
+ # Mask preview and operations
299
+ (mask_gallery, dilate_button, erode_button, bounding_box_button,
300
+ md_mask_operation) = create_mask_operation_section()
301
+
302
+ # Generation controls and results
303
+ result_gallery, submit_button, clear_btn, md_submit = create_output_section()
304
+
305
+ with gr.Row(elem_id="glass_card"):
306
+ # Examples section
307
+ examples = create_examples_section(
308
+ GRADIO_EXAMPLES,
309
+ inputs=[
310
+ image_reference,
311
+ image_target_1,
312
+ image_target_2,
313
+ custmization_mode,
314
+ input_mask_mode,
315
+ seg_ref_mode,
316
+ prompt,
317
+ seed,
318
+ true_gs,
319
+ eg_idx,
320
+ num_steps,
321
+ guidance
322
+ ],
323
+ outputs=[
324
+ image_reference_ori_state,
325
+ image_reference_rmbg_state,
326
+ image_target_state,
327
+ mask_target_state,
328
+ mask_gallery,
329
+ result_gallery,
330
+ image_target_1,
331
+ image_target_2,
332
+ ],
333
+ fn=example_pipeline,
334
+ )
335
+
336
+ with gr.Row(elem_id="glass_card"):
337
+ # Citation section
338
+ create_citation_section()
339
+
340
+ # Setup event handlers
341
+ setup_event_handlers(
342
+ ## UI components
343
+ input_mask_mode, image_target_1, image_target_2, undo_target_seg_button,
344
+ custmization_mode, dilate_button, erode_button, bounding_box_button,
345
+ mask_gallery, md_input_mask_mode, md_target_image, md_mask_operation,
346
+ md_prompt, md_submit, result_gallery, image_target_state, mask_target_state,
347
+ seg_ref_mode, image_reference_ori_state, move_to_center,
348
+ image_reference, image_reference_rmbg_state,
349
+ ## Functions
350
+ change_input_mask_mode, change_custmization_mode,
351
+ change_seg_ref_mode,
352
+ init_image_target_1, init_image_target_2, init_image_reference,
353
+ get_point, undo_seg_points,
354
+ get_brush,
355
+ # VLM buttons
356
+ vlm_generate_btn, vlm_polish_btn,
357
+ # VLM functions
358
+ vlm_auto_generate,
359
+ vlm_auto_polish,
360
+ dilate_mask, erode_mask, bounding_box,
361
+ run_model,
362
+ ## Other components
363
+ selected_points, prompt,
364
+ use_background_preservation, background_blend_threshold, seed,
365
+ num_images_per_prompt, guidance, true_gs, num_steps, aspect_ratio,
366
+ submit_button,
367
+ eg_idx,
368
+ )
369
+
370
+ # Setup clear button
371
+ clear_btn.add(
372
+ [image_reference, image_target_1, image_target_2, mask_gallery, result_gallery,
373
+ selected_points, image_target_state, mask_target_state, prompt,
374
+ image_reference_ori_state, image_reference_rmbg_state]
375
+ )
376
+
377
+ return demo
378
+
379
+
380
+ def main():
381
+ """Main entry point for the application."""
382
+ # Parse arguments and load config
383
+ args = parse_args()
384
+ cfg = load_config(args.config)
385
+ setup_environment(args)
386
+
387
+ # Initialize device and models
388
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
389
+ weight_dtype = torch.bfloat16
390
+
391
+ pipeline, mobile_predictor, vlm_processor, vlm_model, ben2_model = initialize_models(
392
+ args, cfg, device, weight_dtype
393
+ )
394
+
395
+ set_pipeline(pipeline)
396
+ set_assets_cache_dir(args.assets_cache_dir)
397
+
398
+ # Inject mobile predictor into business logic module so get_point can access it without lambdas
399
+ set_mobile_predictor(mobile_predictor)
400
+ set_ben2_model(ben2_model)
401
+ set_vlm_processor(vlm_processor)
402
+ set_vlm_model(vlm_model)
403
+
404
+ # Create and launch the application
405
+ demo = create_application()
406
+
407
+ # Launch the demo
408
+ demo.launch(server_port=7860, server_name="0.0.0.0",
409
+ allowed_paths=[os.path.abspath(os.path.join(os.path.dirname(__file__), "gradio_cache")),
410
+ os.path.abspath(os.path.join(os.path.dirname(__file__), "results"))])
411
+
412
+
413
+ if __name__ == "__main__":
414
+ main()
app/APP.md ADDED
@@ -0,0 +1,103 @@
1
+ # IC-Custom Application
2
+
3
+ A sophisticated image customization tool powered by advanced AI models.
4
+
5
+ > 📺 **App Guide:**
6
+ > For a fast overview of how to use the app, watch this video:
7
+ > [IC-Custom App Usage Guide (YouTube)](https://www.youtube.com/watch?v=uaiZA3H5RV)
8
+
9
+ ---
10
+
11
+ ## 🚀 Quick Start
12
+
13
+ ```bash
14
+ python src/app/app.py \
15
+ --config configs/app/app.yaml \
16
+ --hf_token $HF_TOKEN \
17
+ --hf_cache_dir $HF_CACHE_DIR \
18
+ --assets_cache_dir results/app \
19
+ --enable_ben2_for_mask_ref False \
20
+ --enable_vlm_for_prompt False \
21
+ --save_results True
22
+ ```
23
+
24
+ ---
25
+
26
+ ## ⚙️ Configuration & CLI Arguments
27
+
28
+ | Argument | Type | Required | Default | Description |
29
+ |----------|------|----------|---------|-------------|
30
+ | `--config` | str | ✅ | - | Path to app YAML config file |
31
+ | `--hf_token` | str | ❌ | - | Hugging Face access token |
32
+ | `--hf_cache_dir` | str | ❌ | `~/.cache/huggingface/hub` | HF assets cache directory |
33
+ | `--assets_cache_dir` | str | ❌ | `results/app` | Output images & metadata directory |
34
+ | `--save_results` | bool | ❌ | `False` | Save generated results |
35
+ | `--enable_ben2_for_mask_ref` | bool | ❌ | `False` | Enable BEN2 background removal |
36
+ | `--enable_vlm_for_prompt` | bool | ❌ | `False` | Enable VLM prompt generation |
37
+
38
+ ### Environment Variables
39
+
40
+ - `HF_TOKEN` ← `--hf_token`
41
+ - `HF_HUB_CACHE` ← `--hf_cache_dir`
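As a point of reference, a minimal Python sketch of this flag-to-environment-variable mapping (illustrative only; the actual logic lives in `setup_environment` in `app/config.py`, whose diff is not shown in this 50-file view, and the helper name below is hypothetical):

```python
import os

def export_hf_env(hf_token=None, hf_cache_dir=None):
    # Hypothetical helper: mirror --hf_token / --hf_cache_dir into the env vars listed above.
    if hf_token:
        os.environ["HF_TOKEN"] = hf_token
    if hf_cache_dir:
        os.environ["HF_HUB_CACHE"] = hf_cache_dir
```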
42
+
43
+ ---
44
+
45
+ ## 📥 Model Downloads
46
+
47
+ > **Model checkpoints are required before running the app.**
48
+ > All required models will be automatically downloaded when you run the app, or you can manually download them and specify paths in `configs/app/app.yaml`.
49
+
50
+ ### Required Models
51
+
52
+ The following models are **automatically downloaded** when running the app:
53
+
54
+ | Model | Purpose | Source |
55
+ |-------|---------|--------|
56
+ | **IC-Custom** | Our customization model | [TencentARC/IC-Custom](https://huggingface.co/TencentARC/IC-Custom) |
57
+ | **CLIP** | Vision-language understanding | [openai/clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) |
58
+ | **T5** | Text processing | [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) |
59
+ | **SigLIP** | Image understanding | [google/siglip-so400m-patch14-384](https://huggingface.co/google/siglip-so400m-patch14-384) |
60
+ | **Autoencoder** | Image encoding/decoding | [black-forest-labs/FLUX.1-Fill-dev](https://huggingface.co/black-forest-labs/FLUX.1-Fill-dev/blob/main/ae.safetensors) |
61
+ | **DIT** | Diffusion model | [black-forest-labs/FLUX.1-Fill-dev](https://huggingface.co/black-forest-labs/FLUX.1-Fill-dev/blob/main/flux1-fill-dev.safetensors) |
62
+ | **Redux** | Image processing | [black-forest-labs/FLUX.1-Redux-dev](https://huggingface.co/black-forest-labs/FLUX.1-Redux-dev) |
63
+ | **SAM-vit-h** | Image segmentation | [HCMUE-Research/SAM-vit-h](https://huggingface.co/HCMUE-Research/SAM-vit-h/blob/main/sam_vit_h_4b8939.pth) |
64
+
65
+ ### Optional Models (Selective Download)
66
+
67
+ **BEN2 and Qwen2.5-VL models are disabled by default** and only downloaded when explicitly enabled:
68
+
69
+ | Model | Flag | Source | Purpose |
70
+ |-------|------|--------|---------|
71
+ | **BEN2** | `--enable_ben2_for_mask_ref True` | [PramaLLC/BEN2](https://huggingface.co/PramaLLC/BEN2/blob/main/BEN2_Base.pth) | Background removal |
72
+ | **Qwen2.5-VL** | `--enable_vlm_for_prompt True` | [Qwen/Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) | Prompt generation |
73
+
74
+ ### Manual Configuration
75
+
76
+ **Alternative**: Manually download all models and specify paths in `configs/app/app.yaml`:
77
+
78
+ ```yaml
79
+ checkpoint_config:
80
+ # Required models
81
+ dit_path: "/path/to/flux1-fill-dev.safetensors"
82
+ ae_path: "/path/to/ae.safetensors"
83
+ t5_path: "/path/to/t5-v1_1-xxl"
84
+ clip_path: "/path/to/clip-vit-large-patch14"
85
+ siglip_path: "/path/to/siglip-so400m-patch14-384"
86
+ redux_path: "/path/to/flux1-redux-dev.safetensors"
87
+ # IC-Custom models
88
+ lora_path: "/path/to/dit_lora_0x1561.safetensors"
89
+ img_txt_in_path: "/path/to/dit_txt_img_in_0x1561.safetensors"
90
+ boundary_embeddings_path: "/path/to/dit_boundary_embeddings_0x1561.safetensors"
91
+ task_register_embeddings_path: "/path/to/dit_task_register_embeddings_0x1561.safetensors"
92
+ # APP interactive models
93
+ sam_path: "/path/to/sam_vit_h_4b8939.pth"
94
+ # Optional models
95
+ ben2_path: "/path/to/BEN2_Base.pth"
96
+ vlm_path: "/path/to/Qwen2.5-VL-7B-Instruct"
97
+ ```
98
+
99
+ ### APP Overview
100
+
101
+ <p align="center">
102
+ <img src="../../assets/gradio_ui.png" alt="IC-Custom APP" width="80%">
103
+ </p>
app/BEN2.py ADDED
@@ -0,0 +1,1394 @@
1
+ # Copyright (c) 2025 Prama LLC
2
+ # SPDX-License-Identifier: MIT
3
+
4
+ import math
5
+ import os
6
+ import random
7
+ import subprocess
8
+ import tempfile
9
+ import time
10
+
11
+ import cv2
12
+ import numpy as np
13
+ import torch
14
+ import torch.nn as nn
15
+ import torch.nn.functional as F
16
+ import torch.utils.checkpoint as checkpoint
17
+ from einops import rearrange
18
+ from PIL import Image, ImageOps
19
+ from timm.models.layers import DropPath, to_2tuple, trunc_normal_
20
+ from torchvision import transforms
21
+
22
+
23
+ def set_random_seed(seed):
24
+ random.seed(seed)
25
+ np.random.seed(seed)
26
+ torch.manual_seed(seed)
27
+ torch.cuda.manual_seed(seed)
28
+ torch.cuda.manual_seed_all(seed)
29
+ torch.backends.cudnn.deterministic = True
30
+ torch.backends.cudnn.benchmark = False
31
+
32
+
33
+ # set_random_seed(9)
34
+
35
+ torch.set_float32_matmul_precision('highest')
36
+
37
+
38
+ class Mlp(nn.Module):
39
+ """ Multilayer perceptron."""
40
+
41
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
42
+ super().__init__()
43
+ out_features = out_features or in_features
44
+ hidden_features = hidden_features or in_features
45
+ self.fc1 = nn.Linear(in_features, hidden_features)
46
+ self.act = act_layer()
47
+ self.fc2 = nn.Linear(hidden_features, out_features)
48
+ self.drop = nn.Dropout(drop)
49
+
50
+ def forward(self, x):
51
+ x = self.fc1(x)
52
+ x = self.act(x)
53
+ x = self.drop(x)
54
+ x = self.fc2(x)
55
+ x = self.drop(x)
56
+ return x
57
+
58
+
59
+ def window_partition(x, window_size):
60
+ """
61
+ Args:
62
+ x: (B, H, W, C)
63
+ window_size (int): window size
64
+ Returns:
65
+ windows: (num_windows*B, window_size, window_size, C)
66
+ """
67
+ B, H, W, C = x.shape
68
+ x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
69
+ windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
70
+ return windows
71
+
72
+
73
+ def window_reverse(windows, window_size, H, W):
74
+ """
75
+ Args:
76
+ windows: (num_windows*B, window_size, window_size, C)
77
+ window_size (int): Window size
78
+ H (int): Height of image
79
+ W (int): Width of image
80
+ Returns:
81
+ x: (B, H, W, C)
82
+ """
83
+ B = int(windows.shape[0] / (H * W / window_size / window_size))
84
+ x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
85
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
86
+ return x
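A quick round-trip sanity check for the two window helpers above (illustrative only, not part of the commit, and assuming the definitions above are in scope): partitioning and then reversing reproduces the input exactly whenever H and W are multiples of the window size, which the Swin blocks below guarantee via padding.

```python
import torch

B, H, W, C, ws = 2, 14, 14, 96, 7
x = torch.randn(B, H, W, C)
windows = window_partition(x, ws)           # (B * H//ws * W//ws, ws, ws, C) == (8, 7, 7, 96)
x_back = window_reverse(windows, ws, H, W)  # exact inverse of window_partition
assert windows.shape == (8, 7, 7, 96)
assert torch.equal(x, x_back)
```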
87
+
88
+
89
+ class WindowAttention(nn.Module):
90
+ """ Window based multi-head self attention (W-MSA) module with relative position bias.
91
+ It supports both of shifted and non-shifted window.
92
+ Args:
93
+ dim (int): Number of input channels.
94
+ window_size (tuple[int]): The height and width of the window.
95
+ num_heads (int): Number of attention heads.
96
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
97
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
98
+ attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
99
+ proj_drop (float, optional): Dropout ratio of output. Default: 0.0
100
+ """
101
+
102
+ def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):
103
+
104
+ super().__init__()
105
+ self.dim = dim
106
+ self.window_size = window_size # Wh, Ww
107
+ self.num_heads = num_heads
108
+ head_dim = dim // num_heads
109
+ self.scale = qk_scale or head_dim ** -0.5
110
+
111
+ # define a parameter table of relative position bias
112
+ self.relative_position_bias_table = nn.Parameter(
113
+ torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH
114
+
115
+ # get pair-wise relative position index for each token inside the window
116
+ coords_h = torch.arange(self.window_size[0])
117
+ coords_w = torch.arange(self.window_size[1])
118
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
119
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
120
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
121
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
122
+ relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0
123
+ relative_coords[:, :, 1] += self.window_size[1] - 1
124
+ relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
125
+ relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
126
+ self.register_buffer("relative_position_index", relative_position_index)
127
+
128
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
129
+ self.attn_drop = nn.Dropout(attn_drop)
130
+ self.proj = nn.Linear(dim, dim)
131
+ self.proj_drop = nn.Dropout(proj_drop)
132
+
133
+ trunc_normal_(self.relative_position_bias_table, std=.02)
134
+ self.softmax = nn.Softmax(dim=-1)
135
+
136
+ def forward(self, x, mask=None):
137
+ """ Forward function.
138
+ Args:
139
+ x: input features with shape of (num_windows*B, N, C)
140
+ mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
141
+ """
142
+ B_, N, C = x.shape
143
+ qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
144
+ q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
145
+
146
+ q = q * self.scale
147
+ attn = (q @ k.transpose(-2, -1))
148
+
149
+ relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
150
+ self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH
151
+ relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
152
+ attn = attn + relative_position_bias.unsqueeze(0)
153
+
154
+ if mask is not None:
155
+ nW = mask.shape[0]
156
+ attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
157
+ attn = attn.view(-1, self.num_heads, N, N)
158
+ attn = self.softmax(attn)
159
+ else:
160
+ attn = self.softmax(attn)
161
+
162
+ attn = self.attn_drop(attn)
163
+
164
+ x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
165
+ x = self.proj(x)
166
+ x = self.proj_drop(x)
167
+ return x
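For orientation, a minimal shape check for `WindowAttention` (illustrative only, not part of the commit, and assuming the class above is in scope): with no mask, the module maps `(num_windows*B, N, C)` to the same shape, adding the relative position bias internally.

```python
import torch

attn = WindowAttention(dim=96, window_size=(7, 7), num_heads=3)
x = torch.randn(8, 7 * 7, 96)   # (num_windows*B, N, C)
y = attn(x)                     # mask defaults to None
assert y.shape == x.shape       # (8, 49, 96)
```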
168
+
169
+
170
+ class SwinTransformerBlock(nn.Module):
171
+ """ Swin Transformer Block.
172
+ Args:
173
+ dim (int): Number of input channels.
174
+ num_heads (int): Number of attention heads.
175
+ window_size (int): Window size.
176
+ shift_size (int): Shift size for SW-MSA.
177
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
178
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
179
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
180
+ drop (float, optional): Dropout rate. Default: 0.0
181
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
182
+ drop_path (float, optional): Stochastic depth rate. Default: 0.0
183
+ act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
184
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
185
+ """
186
+
187
+ def __init__(self, dim, num_heads, window_size=7, shift_size=0,
188
+ mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
189
+ act_layer=nn.GELU, norm_layer=nn.LayerNorm):
190
+ super().__init__()
191
+ self.dim = dim
192
+ self.num_heads = num_heads
193
+ self.window_size = window_size
194
+ self.shift_size = shift_size
195
+ self.mlp_ratio = mlp_ratio
196
+ assert 0 <= self.shift_size < self.window_size, "shift_size must be in 0-window_size"
197
+
198
+ self.norm1 = norm_layer(dim)
199
+ self.attn = WindowAttention(
200
+ dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
201
+ qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
202
+
203
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
204
+ self.norm2 = norm_layer(dim)
205
+ mlp_hidden_dim = int(dim * mlp_ratio)
206
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
207
+
208
+ self.H = None
209
+ self.W = None
210
+
211
+ def forward(self, x, mask_matrix):
212
+ """ Forward function.
213
+ Args:
214
+ x: Input feature, tensor size (B, H*W, C).
215
+ H, W: Spatial resolution of the input feature.
216
+ mask_matrix: Attention mask for cyclic shift.
217
+ """
218
+ B, L, C = x.shape
219
+ H, W = self.H, self.W
220
+ assert L == H * W, "input feature has wrong size"
221
+
222
+ shortcut = x
223
+ x = self.norm1(x)
224
+ x = x.view(B, H, W, C)
225
+
226
+ # pad feature maps to multiples of window size
227
+ pad_l = pad_t = 0
228
+ pad_r = (self.window_size - W % self.window_size) % self.window_size
229
+ pad_b = (self.window_size - H % self.window_size) % self.window_size
230
+ x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
231
+ _, Hp, Wp, _ = x.shape
232
+
233
+ # cyclic shift
234
+ if self.shift_size > 0:
235
+ shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
236
+ attn_mask = mask_matrix
237
+ else:
238
+ shifted_x = x
239
+ attn_mask = None
240
+
241
+ # partition windows
242
+ x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C
243
+ x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C
244
+
245
+ # W-MSA/SW-MSA
246
+ attn_windows = self.attn(x_windows, mask=attn_mask) # nW*B, window_size*window_size, C
247
+
248
+ # merge windows
249
+ attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
250
+ shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C
251
+
252
+ # reverse cyclic shift
253
+ if self.shift_size > 0:
254
+ x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
255
+ else:
256
+ x = shifted_x
257
+
258
+ if pad_r > 0 or pad_b > 0:
259
+ x = x[:, :H, :W, :].contiguous()
260
+
261
+ x = x.view(B, H * W, C)
262
+
263
+ # FFN
264
+ x = shortcut + self.drop_path(x)
265
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
266
+
267
+ return x
268
+
269
+
270
+ class PatchMerging(nn.Module):
271
+ """ Patch Merging Layer
272
+ Args:
273
+ dim (int): Number of input channels.
274
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
275
+ """
276
+
277
+ def __init__(self, dim, norm_layer=nn.LayerNorm):
278
+ super().__init__()
279
+ self.dim = dim
280
+ self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
281
+ self.norm = norm_layer(4 * dim)
282
+
283
+ def forward(self, x, H, W):
284
+ """ Forward function.
285
+ Args:
286
+ x: Input feature, tensor size (B, H*W, C).
287
+ H, W: Spatial resolution of the input feature.
288
+ """
289
+ B, L, C = x.shape
290
+ assert L == H * W, "input feature has wrong size"
291
+
292
+ x = x.view(B, H, W, C)
293
+
294
+ # padding
295
+ pad_input = (H % 2 == 1) or (W % 2 == 1)
296
+ if pad_input:
297
+ x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))
298
+
299
+ x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
300
+ x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
301
+ x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
302
+ x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
303
+ x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
304
+ x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C
305
+
306
+ x = self.norm(x)
307
+ x = self.reduction(x)
308
+
309
+ return x
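A brief shape check for `PatchMerging` (illustrative only, not part of the commit, and assuming the class above is in scope): the layer halves the spatial resolution and doubles the channel count.

```python
import torch

merge = PatchMerging(dim=96)
x = torch.randn(2, 56 * 56, 96)        # (B, H*W, C)
y = merge(x, 56, 56)
assert y.shape == (2, 28 * 28, 192)    # (B, (H/2)*(W/2), 2*C)
```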
310
+
311
+
312
+ class BasicLayer(nn.Module):
313
+ """ A basic Swin Transformer layer for one stage.
314
+ Args:
315
+ dim (int): Number of feature channels
316
+ depth (int): Depths of this stage.
317
+ num_heads (int): Number of attention head.
318
+ window_size (int): Local window size. Default: 7.
319
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
320
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
321
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
322
+ drop (float, optional): Dropout rate. Default: 0.0
323
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
324
+ drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
325
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
326
+ downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
327
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
328
+ """
329
+
330
+ def __init__(self,
331
+ dim,
332
+ depth,
333
+ num_heads,
334
+ window_size=7,
335
+ mlp_ratio=4.,
336
+ qkv_bias=True,
337
+ qk_scale=None,
338
+ drop=0.,
339
+ attn_drop=0.,
340
+ drop_path=0.,
341
+ norm_layer=nn.LayerNorm,
342
+ downsample=None,
343
+ use_checkpoint=False):
344
+ super().__init__()
345
+ self.window_size = window_size
346
+ self.shift_size = window_size // 2
347
+ self.depth = depth
348
+ self.use_checkpoint = use_checkpoint
349
+
350
+ # build blocks
351
+ self.blocks = nn.ModuleList([
352
+ SwinTransformerBlock(
353
+ dim=dim,
354
+ num_heads=num_heads,
355
+ window_size=window_size,
356
+ shift_size=0 if (i % 2 == 0) else window_size // 2,
357
+ mlp_ratio=mlp_ratio,
358
+ qkv_bias=qkv_bias,
359
+ qk_scale=qk_scale,
360
+ drop=drop,
361
+ attn_drop=attn_drop,
362
+ drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
363
+ norm_layer=norm_layer)
364
+ for i in range(depth)])
365
+
366
+ # patch merging layer
367
+ if downsample is not None:
368
+ self.downsample = downsample(dim=dim, norm_layer=norm_layer)
369
+ else:
370
+ self.downsample = None
371
+
372
+ def forward(self, x, H, W):
373
+ """ Forward function.
374
+ Args:
375
+ x: Input feature, tensor size (B, H*W, C).
376
+ H, W: Spatial resolution of the input feature.
377
+ """
378
+
379
+ # calculate attention mask for SW-MSA
380
+ Hp = int(np.ceil(H / self.window_size)) * self.window_size
381
+ Wp = int(np.ceil(W / self.window_size)) * self.window_size
382
+ img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1
383
+ h_slices = (slice(0, -self.window_size),
384
+ slice(-self.window_size, -self.shift_size),
385
+ slice(-self.shift_size, None))
386
+ w_slices = (slice(0, -self.window_size),
387
+ slice(-self.window_size, -self.shift_size),
388
+ slice(-self.shift_size, None))
389
+ cnt = 0
390
+ for h in h_slices:
391
+ for w in w_slices:
392
+ img_mask[:, h, w, :] = cnt
393
+ cnt += 1
394
+
395
+ mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1
396
+ mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
397
+ attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
398
+ attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
399
+
400
+ for blk in self.blocks:
401
+ blk.H, blk.W = H, W
402
+ if self.use_checkpoint:
403
+ x = checkpoint.checkpoint(blk, x, attn_mask)
404
+ else:
405
+ x = blk(x, attn_mask)
406
+ if self.downsample is not None:
407
+ x_down = self.downsample(x, H, W)
408
+ Wh, Ww = (H + 1) // 2, (W + 1) // 2
409
+ return x, H, W, x_down, Wh, Ww
410
+ else:
411
+ return x, H, W, x, H, W
412
+
413
+
414
+ class PatchEmbed(nn.Module):
415
+ """ Image to Patch Embedding
416
+ Args:
417
+ patch_size (int): Patch token size. Default: 4.
418
+ in_chans (int): Number of input image channels. Default: 3.
419
+ embed_dim (int): Number of linear projection output channels. Default: 96.
420
+ norm_layer (nn.Module, optional): Normalization layer. Default: None
421
+ """
422
+
423
+ def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
424
+ super().__init__()
425
+ patch_size = to_2tuple(patch_size)
426
+ self.patch_size = patch_size
427
+
428
+ self.in_chans = in_chans
429
+ self.embed_dim = embed_dim
430
+
431
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
432
+ if norm_layer is not None:
433
+ self.norm = norm_layer(embed_dim)
434
+ else:
435
+ self.norm = None
436
+
437
+ def forward(self, x):
438
+ """Forward function."""
439
+ # padding
440
+ _, _, H, W = x.size()
441
+ if W % self.patch_size[1] != 0:
442
+ x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1]))
443
+ if H % self.patch_size[0] != 0:
444
+ x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))
445
+
446
+ x = self.proj(x) # B C Wh Ww
447
+ if self.norm is not None:
448
+ Wh, Ww = x.size(2), x.size(3)
449
+ x = x.flatten(2).transpose(1, 2)
450
+ x = self.norm(x)
451
+ x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)
452
+
453
+ return x
454
+
455
+
456
+ class SwinTransformer(nn.Module):
457
+ """ Swin Transformer backbone.
458
+ A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
459
+ https://arxiv.org/pdf/2103.14030
460
+ Args:
461
+ pretrain_img_size (int): Input image size for training the pretrained model,
462
+ used in absolute position embedding. Default 224.
463
+ patch_size (int | tuple(int)): Patch size. Default: 4.
464
+ in_chans (int): Number of input image channels. Default: 3.
465
+ embed_dim (int): Number of linear projection output channels. Default: 96.
466
+ depths (tuple[int]): Depths of each Swin Transformer stage.
467
+ num_heads (tuple[int]): Number of attention head of each stage.
468
+ window_size (int): Window size. Default: 7.
469
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
470
+ qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
471
+ qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
472
+ drop_rate (float): Dropout rate.
473
+ attn_drop_rate (float): Attention dropout rate. Default: 0.
474
+ drop_path_rate (float): Stochastic depth rate. Default: 0.2.
475
+ norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
476
+ ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
477
+ patch_norm (bool): If True, add normalization after patch embedding. Default: True.
478
+ out_indices (Sequence[int]): Output from which stages.
479
+ frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
480
+ -1 means not freezing any parameters.
481
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
482
+ """
483
+
484
+ def __init__(self,
485
+ pretrain_img_size=224,
486
+ patch_size=4,
487
+ in_chans=3,
488
+ embed_dim=96,
489
+ depths=[2, 2, 6, 2],
490
+ num_heads=[3, 6, 12, 24],
491
+ window_size=7,
492
+ mlp_ratio=4.,
493
+ qkv_bias=True,
494
+ qk_scale=None,
495
+ drop_rate=0.,
496
+ attn_drop_rate=0.,
497
+ drop_path_rate=0.2,
498
+ norm_layer=nn.LayerNorm,
499
+ ape=False,
500
+ patch_norm=True,
501
+ out_indices=(0, 1, 2, 3),
502
+ frozen_stages=-1,
503
+ use_checkpoint=False):
504
+ super().__init__()
505
+
506
+ self.pretrain_img_size = pretrain_img_size
507
+ self.num_layers = len(depths)
508
+ self.embed_dim = embed_dim
509
+ self.ape = ape
510
+ self.patch_norm = patch_norm
511
+ self.out_indices = out_indices
512
+ self.frozen_stages = frozen_stages
513
+
514
+ # split image into non-overlapping patches
515
+ self.patch_embed = PatchEmbed(
516
+ patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
517
+ norm_layer=norm_layer if self.patch_norm else None)
518
+
519
+ # absolute position embedding
520
+ if self.ape:
521
+ pretrain_img_size = to_2tuple(pretrain_img_size)
522
+ patch_size = to_2tuple(patch_size)
523
+ patches_resolution = [pretrain_img_size[0] // patch_size[0], pretrain_img_size[1] // patch_size[1]]
524
+
525
+ self.absolute_pos_embed = nn.Parameter(
526
+ torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1]))
527
+ trunc_normal_(self.absolute_pos_embed, std=.02)
528
+
529
+ self.pos_drop = nn.Dropout(p=drop_rate)
530
+
531
+ # stochastic depth
532
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule
533
+
534
+ # build layers
535
+ self.layers = nn.ModuleList()
536
+ for i_layer in range(self.num_layers):
537
+ layer = BasicLayer(
538
+ dim=int(embed_dim * 2 ** i_layer),
539
+ depth=depths[i_layer],
540
+ num_heads=num_heads[i_layer],
541
+ window_size=window_size,
542
+ mlp_ratio=mlp_ratio,
543
+ qkv_bias=qkv_bias,
544
+ qk_scale=qk_scale,
545
+ drop=drop_rate,
546
+ attn_drop=attn_drop_rate,
547
+ drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
548
+ norm_layer=norm_layer,
549
+ downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
550
+ use_checkpoint=use_checkpoint)
551
+ self.layers.append(layer)
552
+
553
+ num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]
554
+ self.num_features = num_features
555
+
556
+ # add a norm layer for each output
557
+ for i_layer in out_indices:
558
+ layer = norm_layer(num_features[i_layer])
559
+ layer_name = f'norm{i_layer}'
560
+ self.add_module(layer_name, layer)
561
+
562
+ self._freeze_stages()
563
+
564
+ def _freeze_stages(self):
565
+ if self.frozen_stages >= 0:
566
+ self.patch_embed.eval()
567
+ for param in self.patch_embed.parameters():
568
+ param.requires_grad = False
569
+
570
+ if self.frozen_stages >= 1 and self.ape:
571
+ self.absolute_pos_embed.requires_grad = False
572
+
573
+ if self.frozen_stages >= 2:
574
+ self.pos_drop.eval()
575
+ for i in range(0, self.frozen_stages - 1):
576
+ m = self.layers[i]
577
+ m.eval()
578
+ for param in m.parameters():
579
+ param.requires_grad = False
580
+
581
+ def forward(self, x):
582
+
583
+ x = self.patch_embed(x)
584
+
585
+ Wh, Ww = x.size(2), x.size(3)
586
+ if self.ape:
587
+ # interpolate the position embedding to the corresponding size
588
+ absolute_pos_embed = F.interpolate(self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
589
+ x = (x + absolute_pos_embed) # B Wh*Ww C
590
+
591
+ outs = [x.contiguous()]
592
+ x = x.flatten(2).transpose(1, 2)
593
+ x = self.pos_drop(x)
594
+
595
+ for i in range(self.num_layers):
596
+ layer = self.layers[i]
597
+ x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
598
+
599
+ if i in self.out_indices:
600
+ norm_layer = getattr(self, f'norm{i}')
601
+ x_out = norm_layer(x_out)
602
+
603
+ out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
604
+ outs.append(out)
605
+
606
+ return tuple(outs)
607
+
608
+
609
+ def get_activation_fn(activation):
610
+ """Return an activation function given a string"""
611
+ if activation == "gelu":
612
+ return F.gelu
613
+
614
+ raise RuntimeError(f"activation should be gelu, not {activation}.")
615
+
616
+
617
+ def make_cbr(in_dim, out_dim):
618
+ return nn.Sequential(nn.Conv2d(in_dim, out_dim, kernel_size=3, padding=1), nn.InstanceNorm2d(out_dim), nn.GELU())
619
+
620
+
621
+ def make_cbg(in_dim, out_dim):
622
+ return nn.Sequential(nn.Conv2d(in_dim, out_dim, kernel_size=3, padding=1), nn.InstanceNorm2d(out_dim), nn.GELU())
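+ # Note: make_cbr and make_cbg are currently identical (3x3 Conv -> InstanceNorm -> GELU).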
623
+
624
+
625
+ def rescale_to(x, scale_factor: float = 2, interpolation='nearest'):
626
+ return F.interpolate(x, scale_factor=scale_factor, mode=interpolation)
627
+
628
+
629
+ def resize_as(x, y, interpolation='bilinear'):
630
+ return F.interpolate(x, size=y.shape[-2:], mode=interpolation)
631
+
632
+
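+ # Split each image into a 2x2 grid of crops stacked along the batch dimension; patches2image reverses it.
+ # Used to build the four local views that accompany the half-resolution global view.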
633
+ def image2patches(x):
634
+ """b c (hg h) (wg w) -> (hg wg b) c h w"""
635
+ x = rearrange(x, 'b c (hg h) (wg w) -> (hg wg b) c h w', hg=2, wg=2)
636
+ return x
637
+
638
+
639
+ def patches2image(x):
640
+ """(hg wg b) c h w -> b c (hg h) (wg w)"""
641
+ x = rearrange(x, '(hg wg b) c h w -> b c (hg h) (wg w)', hg=2, wg=2)
642
+ return x
643
+
644
+
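+ # DETR-style fixed sine/cosine 2D positional encoding; __call__ returns a (b, 2*num_pos_feats, h, w) tensor.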
645
+ class PositionEmbeddingSine:
646
+ def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
647
+ super().__init__()
648
+ self.num_pos_feats = num_pos_feats
649
+ self.temperature = temperature
650
+ self.normalize = normalize
651
+ if scale is not None and normalize is False:
652
+ raise ValueError("normalize should be True if scale is passed")
653
+ if scale is None:
654
+ scale = 2 * math.pi
655
+ self.scale = scale
656
+ self.dim_t = torch.arange(0, self.num_pos_feats, dtype=torch.float32)
657
+
658
+ def __call__(self, b, h, w):
659
+ device = self.dim_t.device
660
+ mask = torch.zeros([b, h, w], dtype=torch.bool, device=device)
661
+ assert mask is not None
662
+ not_mask = ~mask
663
+ y_embed = not_mask.cumsum(dim=1, dtype=torch.float32)
664
+ x_embed = not_mask.cumsum(dim=2, dtype=torch.float32)
665
+ if self.normalize:
666
+ eps = 1e-6
667
+ y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale
668
+ x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale
669
+
670
+ dim_t = self.temperature ** (2 * (self.dim_t.to(device) // 2) / self.num_pos_feats)
671
+ pos_x = x_embed[:, :, :, None] / dim_t
672
+ pos_y = y_embed[:, :, :, None] / dim_t
673
+
674
+ pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
675
+ pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
676
+
677
+ return torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
678
+
679
+
714
+
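+ # MCLM: cross-attends the half-resolution global view with multi-scale pooled versions of the four local crops,
+ # then refreshes each local crop by attending to its quarter of the updated global view; output is (5, c, h, w).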
715
+ class MCLM(nn.Module):
716
+ def __init__(self, d_model, num_heads, pool_ratios=[1, 4, 8]):
717
+ super(MCLM, self).__init__()
718
+ self.attention = nn.ModuleList([
719
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
720
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
721
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
722
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
723
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1)
724
+ ])
725
+
726
+ self.linear1 = nn.Linear(d_model, d_model * 2)
727
+ self.linear2 = nn.Linear(d_model * 2, d_model)
728
+ self.linear3 = nn.Linear(d_model, d_model * 2)
729
+ self.linear4 = nn.Linear(d_model * 2, d_model)
730
+ self.norm1 = nn.LayerNorm(d_model)
731
+ self.norm2 = nn.LayerNorm(d_model)
732
+ self.dropout = nn.Dropout(0.1)
733
+ self.dropout1 = nn.Dropout(0.1)
734
+ self.dropout2 = nn.Dropout(0.1)
735
+ self.activation = get_activation_fn('gelu')
736
+ self.pool_ratios = pool_ratios
737
+ self.p_poses = []
738
+ self.g_pos = None
739
+ self.positional_encoding = PositionEmbeddingSine(num_pos_feats=d_model // 2, normalize=True)
740
+
741
+ def forward(self, l, g):
742
+ """
743
+ l: 4,c,h,w
744
+ g: 1,c,h,w
745
+ """
746
+ self.p_poses = []
747
+ self.g_pos = None
748
+ b, c, h, w = l.size()
749
+ # 4,c,h,w -> 1,c,2h,2w
750
+ concated_locs = rearrange(l, '(hg wg b) c h w -> b c (hg h) (wg w)', hg=2, wg=2)
751
+
752
+ pools = []
753
+ for pool_ratio in self.pool_ratios:
754
+ # b,c,h,w
755
+ tgt_hw = (round(h / pool_ratio), round(w / pool_ratio))
756
+ pool = F.adaptive_avg_pool2d(concated_locs, tgt_hw)
757
+ pools.append(rearrange(pool, 'b c h w -> (h w) b c'))
758
+ if self.g_pos is None:
759
+ pos_emb = self.positional_encoding(pool.shape[0], pool.shape[2], pool.shape[3])
760
+ pos_emb = rearrange(pos_emb, 'b c h w -> (h w) b c')
761
+ self.p_poses.append(pos_emb)
762
+ pools = torch.cat(pools, 0)
763
+ if self.g_pos is None:
764
+ self.p_poses = torch.cat(self.p_poses, dim=0)
765
+ pos_emb = self.positional_encoding(g.shape[0], g.shape[2], g.shape[3])
766
+ self.g_pos = rearrange(pos_emb, 'b c h w -> (h w) b c')
767
+
768
+ device = pools.device
769
+ self.p_poses = self.p_poses.to(device)
770
+ self.g_pos = self.g_pos.to(device)
771
+
772
+ # attention between glb (q) and multi-scale pooled local features (k, v)
773
+ g_hw_b_c = rearrange(g, 'b c h w -> (h w) b c')
774
+
775
+ g_hw_b_c = g_hw_b_c + self.dropout1(self.attention[0](g_hw_b_c + self.g_pos, pools + self.p_poses, pools)[0])
776
+ g_hw_b_c = self.norm1(g_hw_b_c)
777
+ g_hw_b_c = g_hw_b_c + self.dropout2(self.linear2(self.dropout(self.activation(self.linear1(g_hw_b_c)).clone())))
778
+ g_hw_b_c = self.norm2(g_hw_b_c)
779
+
780
+ # attention between original locs (q) and refreshed glb (k, v)
781
+ l_hw_b_c = rearrange(l, "b c h w -> (h w) b c")
782
+ _g_hw_b_c = rearrange(g_hw_b_c, '(h w) b c -> h w b c', h=h, w=w)
783
+ _g_hw_b_c = rearrange(_g_hw_b_c, "(ng h) (nw w) b c -> (h w) (ng nw b) c", ng=2, nw=2)
784
+ outputs_re = []
785
+ for i, (_l, _g) in enumerate(zip(l_hw_b_c.chunk(4, dim=1), _g_hw_b_c.chunk(4, dim=1))):
786
+ outputs_re.append(self.attention[i + 1](_l, _g, _g)[0]) # (h w) 1 c
787
+ outputs_re = torch.cat(outputs_re, 1) # (h w) 4 c
788
+
789
+ l_hw_b_c = l_hw_b_c + self.dropout1(outputs_re)
790
+ l_hw_b_c = self.norm1(l_hw_b_c)
791
+ l_hw_b_c = l_hw_b_c + self.dropout2(self.linear4(self.dropout(self.activation(self.linear3(l_hw_b_c)).clone())))
792
+ l_hw_b_c = self.norm2(l_hw_b_c)
793
+
794
+ l = torch.cat((l_hw_b_c, g_hw_b_c), 1) # hw,b(5),c
795
+ return rearrange(l, "(h w) b c -> b c h w", h=h, w=w) # (5, c, h, w)
796
+
797
+
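+ # MCRM: gates each local crop with a saliency map predicted from the global view, refines it by attending to
+ # multi-scale pooled features of the corresponding global region, and merges the refined locals back into the global view.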
798
+ class MCRM(nn.Module):
799
+ def __init__(self, d_model, num_heads, pool_ratios=[4, 8, 16], h=None):
800
+ super(MCRM, self).__init__()
801
+ self.attention = nn.ModuleList([
802
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
803
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
804
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
805
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1)
806
+ ])
807
+ self.linear3 = nn.Linear(d_model, d_model * 2)
808
+ self.linear4 = nn.Linear(d_model * 2, d_model)
809
+ self.norm1 = nn.LayerNorm(d_model)
810
+ self.norm2 = nn.LayerNorm(d_model)
811
+ self.dropout = nn.Dropout(0.1)
812
+ self.dropout1 = nn.Dropout(0.1)
813
+ self.dropout2 = nn.Dropout(0.1)
814
+ self.sigmoid = nn.Sigmoid()
815
+ self.activation = get_activation_fn('gelu')
816
+ self.sal_conv = nn.Conv2d(d_model, 1, 1)
817
+ self.pool_ratios = pool_ratios
818
+
819
+ def forward(self, x):
820
+ device = x.device
821
+ b, c, h, w = x.size()
822
+ loc, glb = x.split([4, 1], dim=0) # 4,c,h,w; 1,c,h,w
823
+
824
+ patched_glb = rearrange(glb, 'b c (hg h) (wg w) -> (hg wg b) c h w', hg=2, wg=2)
825
+
826
+ token_attention_map = self.sigmoid(self.sal_conv(glb))
827
+ token_attention_map = F.interpolate(token_attention_map, size=patches2image(loc).shape[-2:], mode='nearest')
828
+ loc = loc * rearrange(token_attention_map, 'b c (hg h) (wg w) -> (hg wg b) c h w', hg=2, wg=2)
829
+
830
+ pools = []
831
+ for pool_ratio in self.pool_ratios:
832
+ tgt_hw = (round(h / pool_ratio), round(w / pool_ratio))
833
+ pool = F.adaptive_avg_pool2d(patched_glb, tgt_hw)
834
+ pools.append(rearrange(pool, 'nl c h w -> nl c (h w)')) # nl(4),c,hw
835
+
836
+ pools = rearrange(torch.cat(pools, 2), "nl c nphw -> nl nphw 1 c")
837
+ loc_ = rearrange(loc, 'nl c h w -> nl (h w) 1 c')
838
+
839
+ outputs = []
840
+ for i, q in enumerate(loc_.unbind(dim=0)): # traverse all local patches
841
+ v = pools[i]
842
+ k = v
843
+ outputs.append(self.attention[i](q, k, v)[0])
844
+
845
+ outputs = torch.cat(outputs, 1)
846
+ src = loc.view(4, c, -1).permute(2, 0, 1) + self.dropout1(outputs)
847
+ src = self.norm1(src)
848
+ src = src + self.dropout2(self.linear4(self.dropout(self.activation(self.linear3(src)).clone())))
849
+ src = self.norm2(src)
850
+ src = src.permute(1, 2, 0).reshape(4, c, h, w) # refreshed loc
851
+ glb = glb + F.interpolate(patches2image(src), size=glb.shape[-2:], mode='nearest') # refreshed glb
852
+
853
+ return torch.cat((src, glb), 0), token_attention_map
854
+
855
+
856
+ class BEN_Base(nn.Module):
857
+ def __init__(self):
858
+ super().__init__()
859
+
860
+ self.backbone = SwinTransformer(embed_dim=128, depths=[2, 2, 18, 2], num_heads=[4, 8, 16, 32], window_size=12)
861
+ emb_dim = 128
862
+ self.sideout5 = nn.Sequential(nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
863
+ self.sideout4 = nn.Sequential(nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
864
+ self.sideout3 = nn.Sequential(nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
865
+ self.sideout2 = nn.Sequential(nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
866
+ self.sideout1 = nn.Sequential(nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
867
+
868
+ self.output5 = make_cbr(1024, emb_dim)
869
+ self.output4 = make_cbr(512, emb_dim)
870
+ self.output3 = make_cbr(256, emb_dim)
871
+ self.output2 = make_cbr(128, emb_dim)
872
+ self.output1 = make_cbr(128, emb_dim)
873
+
874
+ self.multifieldcrossatt = MCLM(emb_dim, 1, [1, 4, 8])
875
+ self.conv1 = make_cbr(emb_dim, emb_dim)
876
+ self.conv2 = make_cbr(emb_dim, emb_dim)
877
+ self.conv3 = make_cbr(emb_dim, emb_dim)
878
+ self.conv4 = make_cbr(emb_dim, emb_dim)
879
+ self.dec_blk1 = MCRM(emb_dim, 1, [2, 4, 8])
880
+ self.dec_blk2 = MCRM(emb_dim, 1, [2, 4, 8])
881
+ self.dec_blk3 = MCRM(emb_dim, 1, [2, 4, 8])
882
+ self.dec_blk4 = MCRM(emb_dim, 1, [2, 4, 8])
883
+
884
+ self.insmask_head = nn.Sequential(
885
+ nn.Conv2d(emb_dim, 384, kernel_size=3, padding=1),
886
+ nn.InstanceNorm2d(384),
887
+ nn.GELU(),
888
+ nn.Conv2d(384, 384, kernel_size=3, padding=1),
889
+ nn.InstanceNorm2d(384),
890
+ nn.GELU(),
891
+ nn.Conv2d(384, emb_dim, kernel_size=3, padding=1)
892
+ )
893
+
894
+ self.shallow = nn.Sequential(nn.Conv2d(3, emb_dim, kernel_size=3, padding=1))
895
+ self.upsample1 = make_cbg(emb_dim, emb_dim)
896
+ self.upsample2 = make_cbg(emb_dim, emb_dim)
897
+ self.output = nn.Sequential(nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
898
+
899
+ for m in self.modules():
900
+ if isinstance(m, nn.GELU) or isinstance(m, nn.Dropout):
901
+ m.inplace = True
902
+
903
+ @torch.inference_mode()
904
+ @torch.autocast(device_type="cuda", dtype=torch.float16)
905
+ def forward(self, x):
906
+ real_batch = x.size(0)
907
+
908
+ shallow_batch = self.shallow(x)
909
+ glb_batch = rescale_to(x, scale_factor=0.5, interpolation='bilinear')
910
+
911
+ final_input = None
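+ # Build the backbone input: for each image, four 2x2 local crops followed by its half-resolution global view,
+ # so the backbone batch is grouped in blocks of 5.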
912
+ for i in range(real_batch):
913
+ start = i * 4
914
+ end = (i + 1) * 4
915
+ loc_batch = image2patches(x[i, :, :, :].unsqueeze(dim=0))
916
+ input_ = torch.cat((loc_batch, glb_batch[i, :, :, :].unsqueeze(dim=0)), dim=0)
917
+
918
+ if final_input is None:
919
+ final_input = input_
920
+ else:
921
+ final_input = torch.cat((final_input, input_), dim=0)
922
+
923
+ features = self.backbone(final_input)
924
+ outputs = []
925
+
926
+ for i in range(real_batch):
927
+ start = i * 5
928
+ end = (i + 1) * 5
929
+
930
+ f4 = features[4][start:end, :, :, :] # shape: [5, C, H, W]
931
+ f3 = features[3][start:end, :, :, :]
932
+ f2 = features[2][start:end, :, :, :]
933
+ f1 = features[1][start:end, :, :, :]
934
+ f0 = features[0][start:end, :, :, :]
935
+ e5 = self.output5(f4)
936
+ e4 = self.output4(f3)
937
+ e3 = self.output3(f2)
938
+ e2 = self.output2(f1)
939
+ e1 = self.output1(f0)
940
+ loc_e5, glb_e5 = e5.split([4, 1], dim=0)
941
+ e5 = self.multifieldcrossatt(loc_e5, glb_e5) # (5, emb_dim, h, w): 4 refreshed locals + 1 refreshed global
942
+
943
+ e4, tokenattmap4 = self.dec_blk4(e4 + resize_as(e5, e4))
944
+ e4 = self.conv4(e4)
945
+ e3, tokenattmap3 = self.dec_blk3(e3 + resize_as(e4, e3))
946
+ e3 = self.conv3(e3)
947
+ e2, tokenattmap2 = self.dec_blk2(e2 + resize_as(e3, e2))
948
+ e2 = self.conv2(e2)
949
+ e1, tokenattmap1 = self.dec_blk1(e1 + resize_as(e2, e1))
950
+ e1 = self.conv1(e1)
951
+
952
+ loc_e1, glb_e1 = e1.split([4, 1], dim=0)
953
+
954
+ output1_cat = patches2image(loc_e1) # (1,128,256,256)
955
+
956
+ # add glb feat in
957
+ output1_cat = output1_cat + resize_as(glb_e1, output1_cat)
958
+ # merge
959
+ final_output = self.insmask_head(output1_cat) # (1,128,256,256)
960
+ # shallow feature merge
961
+ shallow = shallow_batch[i, :, :, :].unsqueeze(dim=0)
962
+ final_output = final_output + resize_as(shallow, final_output)
963
+ final_output = self.upsample1(rescale_to(final_output))
964
+ final_output = rescale_to(final_output + resize_as(shallow, final_output))
965
+ final_output = self.upsample2(final_output)
966
+ final_output = self.output(final_output)
967
+ mask = final_output.sigmoid()
968
+ outputs.append(mask)
969
+
970
+ return torch.cat(outputs, dim=0)
971
+
972
+ def loadcheckpoints(self, model_path):
973
+ model_dict = torch.load(model_path, map_location="cpu", weights_only=True)
974
+ self.load_state_dict(model_dict['model_state_dict'], strict=True)
975
+ del model_path
976
+
977
+ def inference(self, image, refine_foreground=False, move_to_center=False):
978
+
979
+ # set_random_seed(9)
980
+ # image = ImageOps.exif_transpose(image)
981
+ if isinstance(image, Image.Image):
982
+ image, h, w, original_image = rgb_loader_refiner(image)
983
+ if torch.cuda.is_available():
984
+
985
+ img_tensor = img_transform(image).unsqueeze(0).to(next(self.parameters()).device)
986
+ else:
987
+ img_tensor = img_transform32(image).unsqueeze(0).to(next(self.parameters()).device)
988
+
989
+ with torch.no_grad():
990
+ res = self.forward(img_tensor)
991
+
992
+
993
+ # Show Results
994
+ if refine_foreground:
995
+
996
+
997
+ pred_pil = transforms.ToPILImage()(res.squeeze())
998
+ image_masked = refine_foreground_process(original_image, pred_pil)
999
+
1000
+ image_masked.putalpha(pred_pil.resize(original_image.size))
1001
+ return image_masked
1002
+
1003
+ else:
1004
+ alpha = postprocess_image(res, im_size=[w, h])
1005
+ pred_pil = transforms.ToPILImage()(alpha)
1006
+ mask = pred_pil.resize(original_image.size)
1007
+ original_image.putalpha(mask)
1008
+ # mask = Image.fromarray(alpha)
1009
+
1010
+ # Composite the cut-out onto a white background
1011
+ white_background = Image.new('RGB', original_image.size, (255, 255, 255))
1012
+ white_background.paste(original_image, mask=original_image.split()[3])
1013
+
1014
+
1015
+ if move_to_center:
1016
+ # Get the bounding box of non-transparent pixels
1017
+ # Get alpha channel and convert to numpy array for processing
1018
+ alpha_mask = np.array(mask)
1019
+
1020
+ # Find coordinates where the mask is at least half intensity (foreground)
1021
+ non_zero_coords = np.where(alpha_mask >= 127.5)
1022
+ if len(non_zero_coords[0]) > 0:
1023
+ # Get bounding box from non-zero coordinates
1024
+ min_y, max_y = non_zero_coords[0].min(), non_zero_coords[0].max()
1025
+ min_x, max_x = non_zero_coords[1].min(), non_zero_coords[1].max()
1026
+
1027
+ # Extract the object region
1028
+ obj_width = max_x - min_x
1029
+ obj_height = max_y - min_y
1030
+ bbox = (min_x, min_y, max_x, max_y)
1031
+
1032
+ # Calculate center position
1033
+ img_width, img_height = white_background.size
1034
+ center_x = (img_width - obj_width) // 2
1035
+ center_y = (img_height - obj_height) // 2
1036
+
1037
+ # Create new white background
1038
+ new_background = Image.new('RGB', white_background.size, (255, 255, 255))
1039
+
1040
+ # Paste the object at center position
1041
+ new_background.paste(white_background.crop(bbox), (center_x, center_y))
1042
+ original_image = new_background
1043
+ else:
1044
+ original_image = white_background
1045
+ else:
1046
+ original_image = white_background
1047
+
1048
+ return original_image
1049
+
1050
+
1051
+ else:
1052
+ foregrounds = []
1053
+ for batch in image:
1054
+ image, h, w, original_image = rgb_loader_refiner(batch)
1055
+ if torch.cuda.is_available():
1056
+
1057
+ img_tensor = img_transform(image).unsqueeze(0).to(next(self.parameters()).device)
1058
+ else:
1059
+ img_tensor = img_transform32(image).unsqueeze(0).to(next(self.parameters()).device)
1060
+
1061
+ with torch.no_grad():
1062
+ res = self.forward(img_tensor)
1063
+
1064
+ if refine_foreground:
1065
+
1066
+ pred_pil = transforms.ToPILImage()(res.squeeze())
1067
+ image_masked = refine_foreground_process(original_image, pred_pil)
1068
+
1069
+ image_masked.putalpha(pred_pil.resize(original_image.size))
1070
+
1071
+ foregrounds.append(image_masked)
1072
+ else:
1073
+ alpha = postprocess_image(res, im_size=[w, h])
1074
+ pred_pil = transforms.ToPILImage()(alpha)
1075
+ mask = pred_pil.resize(original_image.size)
1076
+ original_image.putalpha(mask)
1077
+ # mask = Image.fromarray(alpha)
1078
+ foregrounds.append(original_image)
1079
+
1080
+ return foregrounds
1081
+
1082
+ def segment_video(self, video_path, output_path="./", fps=0, refine_foreground=False, batch=1,
1083
+ print_frames_processed=True, webm=False, rgb_value=(0, 255, 0)):
1084
+
1085
+ """
1086
+ Segments the given video to extract the foreground (with alpha) from each frame
1087
+ and saves the result as either a WebM video (with alpha channel) or MP4 (with a
1088
+ color background).
1089
+
1090
+ Args:
1091
+ video_path (str):
1092
+ Path to the input video file.
1093
+
1094
+ output_path (str, optional):
1095
+ Directory (or full path) where the output video and/or files will be saved.
1096
+ Defaults to "./".
1097
+
1098
+ fps (int, optional):
1099
+ The frames per second (FPS) to use for the output video. If 0 (default), the
1100
+ original FPS of the input video is used. Otherwise, overrides it.
1101
+
1102
+ refine_foreground (bool, optional):
1103
+ Whether to run an additional “refine foreground” process on each frame.
1104
+ Defaults to False.
1105
+
1106
+ batch (int, optional):
1107
+ Number of frames to process at once (inference batch size). Large batch sizes
1108
+ may require more GPU memory. Defaults to 1.
1109
+
1110
+ print_frames_processed (bool, optional):
1111
+ If True (default), prints progress (how many frames have been processed) to
1112
+ the console.
1113
+
1114
+ webm (bool, optional):
1115
+ If True, exports a WebM video with alpha channel (VP9 / yuva420p).
1116
+ If False (default), exports an MP4 video composited over a solid color background.
1117
+
1118
+ rgb_value (tuple, optional):
1119
+ The RGB background color (e.g., green screen) used to composite frames when
1120
+ saving to MP4. Defaults to (0, 255, 0).
1121
+
1122
+ Returns:
1123
+ None. Writes the output video(s) to disk in the specified format.
1124
+ """
1125
+
1126
+ cap = cv2.VideoCapture(video_path)
1127
+ if not cap.isOpened():
1128
+ raise IOError(f"Cannot open video: {video_path}")
1129
+
1130
+ original_fps = cap.get(cv2.CAP_PROP_FPS)
1131
+ original_fps = 30 if original_fps == 0 else original_fps
1132
+ fps = original_fps if fps == 0 else fps
1133
+
1134
+ ret, first_frame = cap.read()
1135
+ if not ret:
1136
+ raise ValueError("No frames found in the video.")
1137
+ height, width = first_frame.shape[:2]
1138
+ cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
1139
+
1140
+ foregrounds = []
1141
+ frame_idx = 0
1142
+ processed_count = 0
1143
+ batch_frames = []
1144
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
1145
+
1146
+ while True:
1147
+ ret, frame = cap.read()
1148
+ if not ret:
1149
+ if batch_frames:
1150
+ batch_results = self.inference(batch_frames, refine_foreground)
1151
+ if isinstance(batch_results, Image.Image):
1152
+ foregrounds.append(batch_results)
1153
+ else:
1154
+ foregrounds.extend(batch_results)
1155
+ if print_frames_processed:
1156
+ print(f"Processed frames {frame_idx - len(batch_frames) + 1} to {frame_idx} of {total_frames}")
1157
+ break
1158
+
1159
+ # Process every frame instead of using intervals
1160
+ frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
1161
+ pil_frame = Image.fromarray(frame_rgb)
1162
+ batch_frames.append(pil_frame)
1163
+
1164
+ if len(batch_frames) == batch:
1165
+ batch_results = self.inference(batch_frames, refine_foreground)
1166
+ if isinstance(batch_results, Image.Image):
1167
+ foregrounds.append(batch_results)
1168
+ else:
1169
+ foregrounds.extend(batch_results)
1170
+ if print_frames_processed:
1171
+ print(f"Processed frames {frame_idx - batch + 1} to {frame_idx} of {total_frames}")
1172
+ batch_frames = []
1173
+ processed_count += batch
1174
+
1175
+ frame_idx += 1
1176
+
1177
+ if webm:
1178
+ alpha_webm_path = os.path.join(output_path, "foreground.webm")
1179
+ pil_images_to_webm_alpha(foregrounds, alpha_webm_path, fps=original_fps)
+ cap.release()
1180
+
1181
+ else:
1182
+ cap.release()
1183
+ fg_output = os.path.join(output_path, 'foreground.mp4')
1184
+
1185
+ pil_images_to_mp4(foregrounds, fg_output, fps=original_fps, rgb_value=rgb_value)
1186
+ cv2.destroyAllWindows()
1187
+
1188
+ try:
1189
+ fg_audio_output = os.path.join(output_path, 'foreground_output_with_audio.mp4')
1190
+ add_audio_to_video(fg_output, video_path, fg_audio_output)
1191
+ except Exception as e:
1192
+ print("No audio found in the original video")
1193
+ print(e)
1194
+
1195
+
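+ # Note: rgb_loader_refiner is redefined near the end of this module; that later definition is the one in effect at runtime.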
1196
+ def rgb_loader_refiner(original_image):
1197
+ h, w = original_image.size
1198
+
1199
+ image = original_image
1200
+ # Convert to RGB if necessary
1201
+ if image.mode != 'RGB':
1202
+ image = image.convert('RGB')
1203
+
1204
+ # Resize the image
1205
+ image = image.resize((1024, 1024), resample=Image.LANCZOS)
1206
+
1207
+ return image.convert('RGB'), h, w, original_image
1208
+
1209
+
1210
+ # Image transforms: float16 when running on GPU (matches the autocast forward), float32 on CPU
1211
+ img_transform = transforms.Compose([
1212
+ transforms.ToTensor(),
1213
+ transforms.ConvertImageDtype(torch.float16),
1214
+ transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
1215
+ ])
1216
+
1217
+ img_transform32 = transforms.Compose([
1218
+ transforms.ToTensor(),
1219
+ transforms.ConvertImageDtype(torch.float32),
1220
+ transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
1221
+ ])
1222
+
1223
+
1224
+ def pil_images_to_mp4(images, output_path, fps=24, rgb_value=(0, 255, 0)):
1225
+ """
1226
+ Converts an array of PIL images to an MP4 video.
1227
+
1228
+ Args:
1229
+ images: List of PIL images
1230
+ output_path: Path to save the MP4 file
1231
+ fps: Frames per second (default: 24)
1232
+ rgb_value: Background RGB color tuple (default: green (0, 255, 0))
1233
+ """
1234
+ if not images:
1235
+ raise ValueError("No images provided to convert to MP4.")
1236
+
1237
+ width, height = images[0].size
1238
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
1239
+ video_writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
1240
+
1241
+ for image in images:
1242
+ # If image has alpha channel, composite onto the specified background color
1243
+ if image.mode == 'RGBA':
1244
+ # Create background image with specified RGB color
1245
+ background = Image.new('RGB', image.size, rgb_value)
1246
+ background = background.convert('RGBA')
1247
+ # Composite the image onto the background
1248
+ image = Image.alpha_composite(background, image)
1249
+ image = image.convert('RGB')
1250
+ else:
1251
+ # Ensure RGB format for non-alpha images
1252
+ image = image.convert('RGB')
1253
+
1254
+ # Convert to OpenCV format and write
1255
+ open_cv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
1256
+ video_writer.write(open_cv_image)
1257
+
1258
+ video_writer.release()
1259
+
1260
+
1261
+ def pil_images_to_webm_alpha(images, output_path, fps=30):
1262
+ """
1263
+ Converts a list of PIL RGBA images to a VP9 .webm video with alpha channel.
1264
+
1265
+ NOTE: Not all players will display alpha in WebM.
1266
+ Browsers like Chrome/Firefox typically do support VP9 alpha.
1267
+ """
1268
+ if not images:
1269
+ raise ValueError("No images provided for WebM with alpha.")
1270
+
1271
+ # Ensure output directory exists
1272
+ os.makedirs(os.path.dirname(output_path), exist_ok=True)
1273
+
1274
+ with tempfile.TemporaryDirectory() as tmpdir:
1275
+ # Save frames as PNG (with alpha)
1276
+ for idx, img in enumerate(images):
1277
+ if img.mode != "RGBA":
1278
+ img = img.convert("RGBA")
1279
+ out_path = os.path.join(tmpdir, f"{idx:06d}.png")
1280
+ img.save(out_path, "PNG")
1281
+
1282
+ # Construct ffmpeg command
1283
+ # -c:v libvpx-vp9 => VP9 encoder
1284
+ # -pix_fmt yuva420p => alpha-enabled pixel format
1285
+ # -auto-alt-ref 0 => helps preserve alpha frames (libvpx quirk)
1286
+ ffmpeg_cmd = [
1287
+ "ffmpeg", "-y",
1288
+ "-framerate", str(fps),
1289
+ "-i", os.path.join(tmpdir, "%06d.png"),
1290
+ "-c:v", "libvpx-vp9",
1291
+ "-pix_fmt", "yuva420p",
1292
+ "-auto-alt-ref", "0",
1293
+ output_path
1294
+ ]
1295
+
1296
+ subprocess.run(ffmpeg_cmd, check=True)
1297
+
1298
+ print(f"WebM with alpha saved to {output_path}")
1299
+
1300
+
1301
+ def add_audio_to_video(video_without_audio_path, original_video_path, output_path):
1302
+ """
1303
+ Check if the original video has an audio stream. If yes, add it. If not, skip.
1304
+ """
1305
+ # 1) Probe original video for audio streams
1306
+ probe_command = [
1307
+ 'ffprobe', '-v', 'error',
1308
+ '-select_streams', 'a:0',
1309
+ '-show_entries', 'stream=index',
1310
+ '-of', 'csv=p=0',
1311
+ original_video_path
1312
+ ]
1313
+ result = subprocess.run(probe_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
1314
+
1315
+ # result.stdout is empty if no audio stream found
1316
+ if not result.stdout.strip():
1317
+ print("No audio track found in original video, skipping audio addition.")
1318
+ return
1319
+
1320
+ print("Audio track detected; proceeding to mux audio.")
1321
+ # 2) If audio found, run ffmpeg to add it
1322
+ command = [
1323
+ 'ffmpeg', '-y',
1324
+ '-i', video_without_audio_path,
1325
+ '-i', original_video_path,
1326
+ '-c', 'copy',
1327
+ '-map', '0:v:0',
1328
+ '-map', '1:a:0', # we know there's an audio track now
1329
+ output_path
1330
+ ]
1331
+ subprocess.run(command, check=True)
1332
+ print(f"Audio added successfully => {output_path}")
1333
+
1334
+
1335
+ ### Thanks to the source: https://huggingface.co/ZhengPeng7/BiRefNet/blob/main/handler.py
1336
+ def refine_foreground_process(image, mask, r=90):
1337
+ if mask.size != image.size:
1338
+ mask = mask.resize(image.size)
1339
+ image = np.array(image) / 255.0
1340
+ mask = np.array(mask) / 255.0
1341
+ estimated_foreground = FB_blur_fusion_foreground_estimator_2(image, mask, r=r)
1342
+ image_masked = Image.fromarray((estimated_foreground * 255.0).astype(np.uint8))
1343
+ return image_masked
1344
+
1345
+
1346
+ def FB_blur_fusion_foreground_estimator_2(image, alpha, r=90):
1347
+ # Thanks to the source: https://github.com/Photoroom/fast-foreground-estimation
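+ # Two-pass blur fusion: a coarse pass with a large blur radius, then a refinement pass with r=6.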
1348
+ alpha = alpha[:, :, None]
1349
+ F, blur_B = FB_blur_fusion_foreground_estimator(image, image, image, alpha, r)
1350
+ return FB_blur_fusion_foreground_estimator(image, F, blur_B, alpha, r=6)[0]
1351
+
1352
+
1353
+ def FB_blur_fusion_foreground_estimator(image, F, B, alpha, r=90):
1354
+ if isinstance(image, Image.Image):
1355
+ image = np.array(image) / 255.0
1356
+ blurred_alpha = cv2.blur(alpha, (r, r))[:, :, None]
1357
+
1358
+ blurred_FA = cv2.blur(F * alpha, (r, r))
1359
+ blurred_F = blurred_FA / (blurred_alpha + 1e-5)
1360
+
1361
+ blurred_B1A = cv2.blur(B * (1 - alpha), (r, r))
1362
+ blurred_B = blurred_B1A / ((1 - blurred_alpha) + 1e-5)
1363
+ F = blurred_F + alpha * \
1364
+ (image - alpha * blurred_F - (1 - alpha) * blurred_B)
1365
+ F = np.clip(F, 0, 1)
1366
+ return F, blurred_B
1367
+
1368
+
1369
+ def postprocess_image(result: torch.Tensor, im_size: list) -> np.ndarray:
1370
+ result = torch.squeeze(F.interpolate(result, size=im_size, mode='bilinear'), 0)
1371
+ ma = torch.max(result)
1372
+ mi = torch.min(result)
1373
+ result = (result - mi) / (ma - mi)
1374
+ im_array = (result * 255).permute(1, 2, 0).cpu().data.numpy().astype(np.uint8)
1375
+ im_array = np.squeeze(im_array)
1376
+ return im_array
1377
+
1378
+
1379
+ def rgb_loader_refiner(original_image):
1380
+ # Apply EXIF orientation first so the reported size matches the oriented image
1381
+ original_image = ImageOps.exif_transpose(original_image)
1382
+
1383
+ h, w = original_image.size # note: PIL .size is (width, height)
1384
+
1385
+ if original_image.mode != 'RGB':
1386
+ original_image = original_image.convert('RGB')
1387
+
1388
+ image = original_image
1389
+ # Convert to RGB if necessary
1390
+
1391
+ # Resize the image
1392
+ image = image.resize((1024, 1024), resample=Image.LANCZOS)
1393
+
1394
+ return image, h, w, original_image
app/aspect_ratio_template.py ADDED
@@ -0,0 +1,88 @@
1
+ # From https://github.com/TencentARC/PhotoMaker/pull/120 written by https://github.com/DiscoNova
2
+ # Note: Since output width & height need to be divisible by 8, the w & h -values do
3
+ # not exactly match the stated aspect ratios... but they are "close enough":)
4
+
5
+ ASPECT_RATIO_TEMPLATE = [
6
+ {
7
+ "name": "Custom (long edge to 1024px)",
8
+ "w": "",
9
+ "h": "",
10
+ },
11
+ {
12
+ "name": "Custom",
13
+ "w": "",
14
+ "h": "",
15
+ },
16
+ {
17
+ "name": "Instagram (1:1)",
18
+ "w": 1024,
19
+ "h": 1024,
20
+ },
21
+ {
22
+ "name": "35mm film / Landscape (3:2)",
23
+ "w": 1024,
24
+ "h": 680,
25
+ },
26
+ {
27
+ "name": "35mm film / Portrait (2:3)",
28
+ "w": 680,
29
+ "h": 1024,
30
+ },
31
+ {
32
+ "name": "CRT Monitor / Landscape (4:3)",
33
+ "w": 1024,
34
+ "h": 768,
35
+ },
36
+ {
37
+ "name": "CRT Monitor / Portrait (3:4)",
38
+ "w": 768,
39
+ "h": 1024,
40
+ },
41
+ {
42
+ "name": "Widescreen TV / Landscape (16:9)",
43
+ "w": 1024,
44
+ "h": 576,
45
+ },
46
+ {
47
+ "name": "Widescreen TV / Portrait (9:16)",
48
+ "w": 576,
49
+ "h": 1024,
50
+ },
51
+ {
52
+ "name": "Widescreen Monitor / Landscape (16:10)",
53
+ "w": 1024,
54
+ "h": 640,
55
+ },
56
+ {
57
+ "name": "Widescreen Monitor / Portrait (10:16)",
58
+ "w": 640,
59
+ "h": 1024,
60
+ },
61
+ {
62
+ "name": "Cinemascope (2.39:1)",
63
+ "w": 1024,
64
+ "h": 424,
65
+ },
66
+ {
67
+ "name": "Widescreen Movie (1.85:1)",
68
+ "w": 1024,
69
+ "h": 552,
70
+ },
71
+ {
72
+ "name": "Academy Movie (1.37:1)",
73
+ "w": 1024,
74
+ "h": 744,
75
+ },
76
+ {
77
+ "name": "Sheet-print (A-series) / Landscape (297:210)",
78
+ "w": 1024,
79
+ "h": 720,
80
+ },
81
+ {
82
+ "name": "Sheet-print (A-series) / Portrait (210:297)",
83
+ "w": 720,
84
+ "h": 1024,
85
+ },
86
+ ]
87
+
88
+ ASPECT_RATIO_TEMPLATE = {k["name"]: (k["w"], k["h"]) for k in ASPECT_RATIO_TEMPLATE}
app/business_logic.py ADDED
@@ -0,0 +1,556 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Business logic functions for IC-Custom application.
5
+ """
6
+ import numpy as np
7
+ import torch
8
+ import cv2
9
+ import gradio as gr
10
+ from PIL import Image
11
+ from datetime import datetime
12
+ import json
13
+ import os
14
+ from scipy.ndimage import binary_dilation, binary_erosion
15
+
16
+ from constants import (
17
+ DEFAULT_BACKGROUND_BLEND_THRESHOLD, DEFAULT_SEED, DEFAULT_NUM_IMAGES,
18
+ DEFAULT_GUIDANCE, DEFAULT_TRUE_GS, DEFAULT_NUM_STEPS, DEFAULT_ASPECT_RATIO,
19
+ DEFAULT_DILATION_KERNEL_SIZE, DEFAULT_MARKER_SIZE, DEFAULT_MARKER_THICKNESS,
20
+ DEFAULT_MASK_ALPHA, DEFAULT_COLOR_ALPHA, TIMESTAMP_FORMAT, SEGMENTATION_COLORS, SEGMENTATION_MARKERS
21
+ )
22
+
23
+ from utils import run_vlm, construct_vlm_gen_prompt, construct_vlm_polish_prompt
24
+
25
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
26
+
27
+ # Global holder for SAM mobile predictor injected from the app layer
28
+ MOBILE_PREDICTOR = None
29
+ BEN2_MODEL = None # ben2 model injected from the app layer
30
+
31
+ def set_mobile_predictor(predictor):
32
+ """Inject SAM mobile predictor into this module without changing function signatures."""
33
+ global MOBILE_PREDICTOR
34
+ MOBILE_PREDICTOR = predictor
35
+
36
+ def set_ben2_model(ben2_model):
37
+ """Inject ben2 model into this module without changing function signatures."""
38
+ global BEN2_MODEL
39
+ BEN2_MODEL = ben2_model
40
+
41
+ def set_vlm_processor(vlm_processor):
42
+ """Inject vlm processor into this module without changing function signatures."""
43
+ global VLM_PROCESSOR
44
+ VLM_PROCESSOR = vlm_processor
45
+
46
+ def set_vlm_model(vlm_model):
47
+ """Inject vlm model into this module without changing function signatures."""
48
+ global VLM_MODEL
49
+ VLM_MODEL = vlm_model
50
+
51
+
52
+ def init_image_target_1(target_image):
53
+ """Initialize UI state when a target image is uploaded."""
54
+
55
+ # Handle both PIL Image (image_target_1) and ImageEditor dict (image_target_2)
56
+ try:
57
+ if isinstance(target_image, dict) and 'composite' in target_image:
58
+ # ImageEditor format (user-drawn mask)
59
+ image_target_state = np.array(target_image['composite'].convert("RGB"))
60
+ else:
61
+ # PIL Image format (precise mask)
62
+ image_target_state = np.array(target_image.convert("RGB"))
63
+ except Exception as e:
64
+ # If there's an error processing the image, skip initialization
65
+ return (
66
+ gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(),
67
+ gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(),
68
+ gr.skip(), gr.skip(), gr.update(value="-1")
69
+ )
70
+
71
+ selected_points = []
72
+ mask_target_state = None
73
+ prompt = None
74
+ mask_gallery = []
75
+ result_gallery = []
76
+ use_background_preservation = False
77
+ background_blend_threshold = DEFAULT_BACKGROUND_BLEND_THRESHOLD
78
+ seed = DEFAULT_SEED
79
+ num_images_per_prompt = DEFAULT_NUM_IMAGES
80
+ guidance = DEFAULT_GUIDANCE
81
+ true_gs = DEFAULT_TRUE_GS
82
+ num_steps = DEFAULT_NUM_STEPS
83
+ aspect_ratio_val = gr.update(value=DEFAULT_ASPECT_RATIO)
84
+
85
+ return (image_target_state, selected_points, mask_target_state, prompt,
86
+ mask_gallery, result_gallery, use_background_preservation,
87
+ background_blend_threshold, seed, num_images_per_prompt, guidance,
88
+ true_gs, num_steps, aspect_ratio_val)
89
+
90
+
91
+ def init_image_target_2(target_image):
92
+ """Initialize UI state when a target image is uploaded."""
93
+ # Handle both PIL Image (image_target_1) and ImageEditor dict (image_target_2)
94
+ try:
95
+ if isinstance(target_image, dict) and 'composite' in target_image:
96
+ # ImageEditor format (user-drawn mask)
97
+ image_target_state = np.array(target_image['composite'].convert("RGB"))
98
+ else:
99
+ # PIL Image format (precise mask)
100
+ image_target_state = np.array(target_image.convert("RGB"))
101
+ except Exception as e:
102
+ # If there's an error processing the image, skip initialization
103
+ return (
104
+ gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(),
105
+ gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(),
106
+ gr.skip(), gr.skip(), gr.update(value="-1")
107
+ )
108
+
109
+ selected_points = gr.skip()
110
+ mask_target_state = gr.skip()
111
+ prompt = gr.skip()
112
+ mask_gallery = gr.skip()
113
+ result_gallery = gr.skip()
114
+ use_background_preservation = gr.skip()
115
+ background_blend_threshold = gr.skip()
116
+ seed = gr.skip()
117
+ num_images_per_prompt = gr.skip()
118
+ guidance = gr.skip()
119
+ true_gs = gr.skip()
120
+ num_steps = gr.skip()
121
+ aspect_ratio_val = gr.skip()
122
+
123
+ return (image_target_state, selected_points, mask_target_state, prompt,
124
+ mask_gallery, result_gallery, use_background_preservation,
125
+ background_blend_threshold, seed, num_images_per_prompt, guidance,
126
+ true_gs, num_steps, aspect_ratio_val)
127
+
128
+
129
+ def init_image_reference(image_reference):
130
+ """Initialize all UI states when a reference image is uploaded."""
131
+ image_reference_state = np.array(image_reference.convert("RGB"))
132
+ image_reference_ori_state = image_reference_state
133
+ image_reference_rmbg_state = None
134
+ image_target_state = None
135
+ mask_target_state = None
136
+ prompt = None
137
+ mask_gallery = []
138
+ result_gallery = []
139
+ image_target_1_val = None
140
+ image_target_2_val = None
141
+ selected_points = []
142
+ input_mask_mode_val = gr.update(value="Precise mask")
143
+ seg_ref_mode_val = gr.update(value="Full Ref")
144
+ move_to_center = False
145
+ use_background_preservation = False
146
+ background_blend_threshold = DEFAULT_BACKGROUND_BLEND_THRESHOLD
147
+ seed = DEFAULT_SEED
148
+ num_images_per_prompt = DEFAULT_NUM_IMAGES
149
+ guidance = DEFAULT_GUIDANCE
150
+ true_gs = DEFAULT_TRUE_GS
151
+ num_steps = DEFAULT_NUM_STEPS
152
+ aspect_ratio_val = gr.update(value=DEFAULT_ASPECT_RATIO)
153
+
154
+ return (
155
+ image_reference_ori_state, image_reference_rmbg_state, image_target_state,
156
+ mask_target_state, prompt, mask_gallery, result_gallery, image_target_1_val,
157
+ image_target_2_val, selected_points, input_mask_mode_val, seg_ref_mode_val,
158
+ move_to_center, use_background_preservation, background_blend_threshold,
159
+ seed, num_images_per_prompt, guidance, true_gs, num_steps, aspect_ratio_val,
160
+ )
161
+
162
+
163
+ def undo_seg_points(orig_img, sel_pix):
164
+ """Remove the latest segmentation point and recompute the preview mask."""
165
+ if len(sel_pix) != 0:
166
+ temp = orig_img.copy()
167
+ sel_pix.pop()
168
+ # Update the segmentation mask preview
169
+ if len(sel_pix) != 0:
170
+ temp, output_mask = segmentation(temp, sel_pix, MOBILE_PREDICTOR, SEGMENTATION_COLORS, SEGMENTATION_MARKERS)
171
+ output_mask_pil = Image.fromarray(output_mask.astype("uint8"))
172
+ masked_img_pil = Image.fromarray(np.where(output_mask > 0, orig_img, 0).astype("uint8"))
173
+ mask_gallery = [masked_img_pil, output_mask_pil]
174
+ else:
175
+ output_mask = None
176
+ mask_gallery = []
177
+ return temp.astype(np.uint8), output_mask, mask_gallery
178
+ else:
179
+ gr.Warning("Nothing to Undo")
180
+ return orig_img, None, []
181
+
182
+
183
+ def segmentation(img, sel_pix, mobile_predictor, colors, markers):
184
+ """Run SAM-based segmentation given selected points and return previews."""
185
+ points = []
186
+ labels = []
187
+ for p, l in sel_pix:
188
+ points.append(p)
189
+ labels.append(l)
190
+
191
+ mobile_predictor.set_image(img if isinstance(img, np.ndarray) else np.array(img))
192
+ with torch.no_grad():
193
+ masks, _, _ = mobile_predictor.predict(
194
+ point_coords=np.array(points),
195
+ point_labels=np.array(labels),
196
+ multimask_output=False
197
+ )
198
+
199
+ output_mask = np.ones((masks.shape[1], masks.shape[2], 3)) * 255
200
+ for i in range(3):
201
+ output_mask[masks[0] == True, i] = 0.0
202
+
203
+ mask_all = np.ones((masks.shape[1], masks.shape[2], 3))
204
+ color_mask = np.random.random((1, 3)).tolist()[0]
205
+ for i in range(3):
206
+ mask_all[masks[0] == True, i] = color_mask[i]
207
+
208
+ masked_img = img / 255 * DEFAULT_MASK_ALPHA + mask_all * DEFAULT_COLOR_ALPHA
209
+ masked_img = masked_img * 255
210
+
211
+ # Draw points
212
+ for point, label in sel_pix:
213
+ cv2.drawMarker(
214
+ masked_img, point, colors[label],
215
+ markerType=markers[label],
216
+ markerSize=DEFAULT_MARKER_SIZE,
217
+ thickness=DEFAULT_MARKER_THICKNESS
218
+ )
219
+
220
+ return masked_img, output_mask
221
+
222
+
223
+ def get_point(img, sel_pix, evt: gr.SelectData):
224
+ """Handle a user click on the target image to add a foreground point."""
225
+ if evt is None or not hasattr(evt, 'index'):
226
+ gr.Warning(f"Event object missing index attribute. Event type: {type(evt)}")
227
+ return img, None, []
228
+
229
+ sel_pix.append((evt.index, 1)) # append the foreground_point
230
+ # Update the segmentation mask preview
231
+ global MOBILE_PREDICTOR
232
+ masked_img_seg, output_mask = segmentation(img, sel_pix, MOBILE_PREDICTOR, SEGMENTATION_COLORS, SEGMENTATION_MARKERS)
233
+
234
+ # Dilate the mask slightly for robustness: the selected region is 0, so invert, dilate, then invert back
235
+ output_mask = 1 - output_mask
236
+ kernel = np.ones((DEFAULT_DILATION_KERNEL_SIZE, DEFAULT_DILATION_KERNEL_SIZE), np.uint8)
237
+ output_mask = cv2.dilate(output_mask, kernel, iterations=1)
238
+ output_mask = 1 - output_mask
239
+
240
+ output_mask_binary = output_mask / 255
241
+
242
+ masked_img_seg = masked_img_seg.astype("uint8")
243
+ output_mask = output_mask.astype("uint8")
244
+
245
+ masked_img = img * output_mask_binary
246
+ masked_img_pil = Image.fromarray(masked_img.astype("uint8"))
247
+ output_mask_pil = Image.fromarray(output_mask.astype("uint8"))
248
+ outputs_gallery = [masked_img_pil, output_mask_pil]
249
+
250
+ return masked_img_seg, output_mask, outputs_gallery
251
+
252
+
253
+ def get_brush(img):
254
+ """Extract a mask from ImageEditor brush layers or composite/background diff."""
255
+ if img is None or not isinstance(img, dict):
256
+ return gr.skip(), gr.skip()
257
+
258
+ layers = img.get("layers", [])
259
+ background = img.get('background', None)
260
+ composite = img.get('composite', None)
261
+
262
+ output_mask = None
263
+ if layers and layers[0] is not None and background is not None:
264
+ output_mask = 255 - np.array(layers[0].convert("RGB")).astype(np.uint8)
265
+ elif composite is not None and background is not None:
266
+ comp_rgb = np.array(composite.convert("RGB")).astype(np.int16)
267
+ bg_rgb = np.array(background.convert("RGB")).astype(np.int16)
268
+ diff = np.abs(comp_rgb - bg_rgb)
269
+ painted = (diff.sum(axis=2) > 0).astype(np.uint8)
270
+ output_mask = (1 - painted) * 255
271
+ output_mask = np.repeat(output_mask[:, :, None], 3, axis=2).astype(np.uint8)
272
+ else:
273
+ return gr.skip(), gr.skip()
274
+
275
+ if len(np.unique(output_mask)) == 1:
276
+ return gr.skip(), gr.skip()
277
+
278
+ img = np.array(background.convert("RGB")).astype(np.uint8)
279
+
280
+ output_mask_binary = output_mask / 255
281
+ masked_img = img * output_mask_binary
282
+ masked_img_pil = Image.fromarray(masked_img.astype("uint8"))
283
+ output_mask_pil = Image.fromarray(output_mask.astype("uint8"))
284
+ mask_gallery = [masked_img_pil, output_mask_pil]
285
+
286
+ return output_mask, mask_gallery
287
+
288
+
289
+ def random_mask_func(mask, dilation_type='square_dilation', dilation_size=20):
290
+ """Utility to dilate/erode/box/ellipse expand a binary mask."""
291
+ binary_mask = mask[:,:,0] < 128
292
+
293
+ if dilation_type == 'square_dilation':
294
+ structure = np.ones((dilation_size, dilation_size), dtype=bool)
295
+ dilated_mask = binary_dilation(binary_mask, structure=structure)
296
+ elif dilation_type == 'square_erosion':
297
+ structure = np.ones((dilation_size, dilation_size), dtype=bool)
298
+ dilated_mask = binary_erosion(binary_mask, structure=structure)
299
+ elif dilation_type == 'bounding_box':
300
+ # Find the bounding box of the selected region
301
+ rows, cols = np.where(binary_mask)
302
+ if len(rows) == 0 or len(cols) == 0:
303
+ return mask # return original mask if no valid points
304
+
305
+ min_row, max_row = np.min(rows), np.max(rows)
306
+ min_col, max_col = np.min(cols), np.max(cols)
307
+
308
+ # Create a bounding box
309
+ dilated_mask = np.zeros_like(binary_mask, dtype=bool)
310
+ dilated_mask[min_row:max_row + 1, min_col:max_col + 1] = True
311
+
312
+ elif dilation_type == 'bounding_ellipse':
313
+ # Find the most left top and left bottom point
314
+ rows, cols = np.where(binary_mask)
315
+ if len(rows) == 0 or len(cols) == 0:
316
+ return mask # return original mask if no valid points
317
+
318
+ min_row, max_row = np.min(rows), np.max(rows)
319
+ min_col, max_col = np.min(cols), np.max(cols)
320
+
321
+ # Calculate the center and axis length of the ellipse
322
+ center = ((min_col + max_col) // 2, (min_row + max_row) // 2)
323
+ a = (max_col - min_col) // 2 # half long axis
324
+ b = (max_row - min_row) // 2 # half short axis
325
+
326
+ # Create a bounding ellipse
327
+ y, x = np.ogrid[:mask.shape[0], :mask.shape[1]]
328
+ ellipse_mask = ((x - center[0])**2 / a**2 + (y - center[1])**2 / b**2) <= 1
329
+ dilated_mask = np.zeros_like(binary_mask, dtype=bool)
330
+ dilated_mask[ellipse_mask] = True
331
+ else:
332
+ raise ValueError("dilation_type must be 'square', 'ellipse', 'bounding_box', or 'bounding_ellipse'")
333
+
334
+ # Invert back to the mask convention (selected region = 0) and stack to 3 channels
335
+ dilated_mask = 1 - dilated_mask
336
+ dilated_mask = np.uint8(dilated_mask[:,:,np.newaxis]) * 255
337
+ dilated_mask = np.concatenate([dilated_mask, dilated_mask, dilated_mask], axis=2)
338
+ return dilated_mask
339
+
340
+
341
+ def dilate_mask(mask, image):
342
+ """Dilate the target mask for robustness and preview the result."""
343
+ if mask is None:
344
+ gr.Warning("Please input the target mask first")
345
+ return None, None
346
+
347
+ mask = random_mask_func(mask, dilation_type='square_dilation', dilation_size=DEFAULT_DILATION_KERNEL_SIZE)
348
+ masked_img = image * (mask > 0)
349
+ return mask, [masked_img, mask]
350
+
351
+
352
+ def erode_mask(mask, image):
353
+ """Erode the target mask and preview the result."""
354
+ if mask is None:
355
+ gr.Warning("Please input the target mask first")
356
+ return None, None
357
+
358
+ mask = random_mask_func(mask, dilation_type='square_erosion', dilation_size=DEFAULT_DILATION_KERNEL_SIZE)
359
+ masked_img = image * (mask > 0)
360
+ return mask, [masked_img, mask]
361
+
362
+
363
+ def bounding_box(mask, image):
364
+ """Create bounding box mask and preview the result."""
365
+ if mask is None:
366
+ gr.Warning("Please input the target mask first")
367
+ return None, None
368
+
369
+ mask = random_mask_func(mask, dilation_type='bounding_box', dilation_size=DEFAULT_DILATION_KERNEL_SIZE)
370
+ masked_img = image * (mask > 0)
371
+ return mask, [masked_img, mask]
372
+
373
+
374
+ def change_input_mask_mode(input_mask_mode, custmization_mode):
375
+ """Change visibility of input mask mode components."""
376
+
377
+ if custmization_mode == "Position-free":
378
+ return (
379
+ gr.update(visible=False),
380
+ gr.update(visible=False),
381
+ gr.update(visible=False),
382
+ )
383
+ elif input_mask_mode.lower() == "precise mask":
384
+ return (
385
+ gr.update(visible=True),
386
+ gr.update(visible=False),
387
+ gr.update(visible=True),
388
+ )
389
+ elif input_mask_mode.lower() == "user-drawn mask":
390
+ return (
391
+ gr.update(visible=False),
392
+ gr.update(visible=True),
393
+ gr.update(visible=False),
394
+ )
395
+ else:
396
+ gr.Warning("Invalid input mask mode")
397
+ return (
398
+ gr.skip(), gr.skip(), gr.skip()
399
+ )
400
+
401
+ def change_custmization_mode(custmization_mode, input_mask_mode):
402
+ """Change visibility and interactivity based on customization mode."""
403
+
404
+
405
+ if custmization_mode.lower() == "position-free":
406
+ return (gr.update(interactive=False, visible=False),
407
+ gr.update(interactive=False, visible=False),
408
+ gr.update(interactive=False, visible=False),
409
+ gr.update(interactive=False, visible=False),
410
+ gr.update(interactive=False, visible=False),
411
+ gr.update(interactive=False, visible=False),
412
+ gr.update(value="<s>Select a input mask mode</s>", visible=False),
413
+ gr.update(value="<s>Input target image & mask (Iterate clicking or brushing until the target is covered)</s>", visible=False),
414
+ gr.update(value="<s>View or modify the target mask</s>", visible=False),
415
+ gr.update(value="3. Input text prompt (necessary)"),
416
+ gr.update(value="4. Submit and view the output"),
417
+ gr.update(visible=False),
418
+ gr.update(visible=False),
419
+
420
+ )
421
+ else:
422
+ if input_mask_mode.lower() == "precise mask":
423
+ return (gr.update(interactive=True, visible=True),
424
+ gr.update(interactive=True, visible=False),
425
+ gr.update(interactive=True, visible=True),
426
+ gr.update(interactive=True, visible=True),
427
+ gr.update(interactive=True, visible=True),
428
+ gr.update(interactive=True, visible=True),
429
+ gr.update(value="3. Select a input mask mode", visible=True),
430
+ gr.update(value="4. Input target image & mask (Iterate clicking or brushing until the target is covered)", visible=True),
431
+ gr.update(value="6. View or modify the target mask", visible=True),
432
+ gr.update(value="5. Input text prompt (optional)", visible=True),
433
+ gr.update(value="7. Submit and view the output", visible=True),
434
+ gr.update(visible=True, value="Precise mask"),
435
+ gr.update(visible=True),
436
+ )
437
+ elif input_mask_mode.lower() == "user-drawn mask":
438
+ return (gr.update(interactive=True, visible=False),
439
+ gr.update(interactive=True, visible=True),
440
+ gr.update(interactive=False, visible=False),
441
+ gr.update(interactive=True, visible=True),
442
+ gr.update(interactive=True, visible=True),
443
+ gr.update(interactive=True, visible=True),
444
+ gr.update(value="3. Select a input mask mode", visible=True),
445
+ gr.update(value="4. Input target image & mask (Iterate clicking or brushing until the target is covered)", visible=True),
446
+ gr.update(value="6. View or modify the target mask", visible=True),
447
+ gr.update(value="5. Input text prompt (optional)", visible=True),
448
+ gr.update(value="7. Submit and view the output", visible=True),
449
+ gr.update(visible=True, value="User-drawn mask"),
450
+ gr.update(visible=True),
451
+ )
452
+
453
+
454
+ def change_seg_ref_mode(seg_ref_mode, image_reference_state, move_to_center):
455
+ """Change segmentation reference mode and handle background removal."""
456
+ if image_reference_state is None:
457
+ gr.Warning("Please upload the reference image first")
458
+ return None, None
459
+
460
+ global BEN2_MODEL
461
+
462
+ if seg_ref_mode == "Full Ref":
463
+ return image_reference_state, None
464
+ else:
465
+ if BEN2_MODEL is None:
466
+ gr.Warning("Please enable ben2 for mask reference first")
467
+ return gr.skip(), gr.skip()
468
+
469
+ image_reference_pil = Image.fromarray(image_reference_state)
470
+ image_reference_pil_rmbg = BEN2_MODEL.inference(image_reference_pil, move_to_center=move_to_center)
471
+ image_reference_rmbg = np.array(image_reference_pil_rmbg)
472
+ return image_reference_rmbg, image_reference_rmbg
473
+
474
+
475
+ def vlm_auto_generate(image_target_state, image_reference_state, mask_target_state,
476
+ custmization_mode):
477
+ """Auto-generate prompt using VLM."""
478
+
479
+ global VLM_PROCESSOR, VLM_MODEL
480
+
481
+ if custmization_mode == "Position-aware":
482
+ if image_target_state is None or mask_target_state is None:
483
+ gr.Warning("Please upload the target image and get mask first")
484
+ return None
485
+
486
+ if image_reference_state is None:
487
+ gr.Warning("Please upload the reference image first")
488
+ return None
489
+
490
+ if VLM_PROCESSOR is None or VLM_MODEL is None:
491
+ gr.Warning("Please enable vlm for prompt first")
492
+ return None  # no prompt variable exists in this function; nothing to fall back to when the VLM is disabled
493
+
494
+ messages = construct_vlm_gen_prompt(image_target_state, image_reference_state, mask_target_state, custmization_mode)
495
+ output_text = run_vlm(VLM_PROCESSOR, VLM_MODEL, messages, device=device)
496
+ return output_text
497
+
498
+
499
+ def vlm_auto_polish(prompt, custmization_mode):
500
+ """Auto-polish prompt using VLM."""
501
+
502
+ global VLM_PROCESSOR, VLM_MODEL
503
+
504
+ if prompt is None:
505
+ gr.Warning("Please input the text prompt first")
506
+ return None
507
+
508
+ if custmization_mode == "Position-aware":
509
+ gr.Warning("Polishing only works in position-free mode")
510
+ return prompt
511
+
512
+
513
+ if VLM_PROCESSOR is None or VLM_MODEL is None:
514
+ gr.Warning("Please enable vlm for prompt first")
515
+ return prompt
516
+
517
+ messages = construct_vlm_polish_prompt(prompt)
518
+ output_text = run_vlm(VLM_PROCESSOR, VLM_MODEL, messages, device=device)
519
+ return output_text
520
+
521
+
522
+ def save_results(output_img, image_reference, image_target, mask_target, prompt,
523
+ custmization_mode, input_mask_mode, seg_ref_mode, seed, guidance,
524
+ num_steps, num_images_per_prompt, use_background_preservation,
525
+ background_blend_threshold, true_gs, assets_cache_dir):
526
+ """Save generated results and metadata."""
527
+ save_name = datetime.now().strftime(TIMESTAMP_FORMAT)
528
+ results = []
529
+
530
+ for i in range(num_images_per_prompt):
531
+ save_dir = os.path.join(assets_cache_dir, save_name)
532
+ os.makedirs(save_dir, exist_ok=True)
533
+
534
+ output_img[i].save(os.path.join(save_dir, f"img_gen_{i}.png"))
535
+ image_reference.save(os.path.join(save_dir, f"img_ref_{i}.png"))
536
+ image_target.save(os.path.join(save_dir, f"img_target_{i}.png"))
537
+ mask_target.save(os.path.join(save_dir, f"mask_target_{i}.png"))
538
+
539
+ with open(os.path.join(save_dir, f"hyper_params_{i}.json"), "w") as f:
540
+ json.dump({
541
+ "prompt": prompt,
542
+ "custmization_mode": custmization_mode,
543
+ "input_mask_mode": input_mask_mode,
544
+ "seg_ref_mode": seg_ref_mode,
545
+ "seed": seed,
546
+ "guidance": guidance,
547
+ "num_steps": num_steps,
548
+ "num_images_per_prompt": num_images_per_prompt,
549
+ "use_background_preservation": use_background_preservation,
550
+ "background_blend_threshold": background_blend_threshold,
551
+ "true_gs": true_gs,
552
+ }, f)
553
+
554
+ results.append(output_img[i])
555
+
556
+ return results
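For orientation, here is a minimal sketch (not part of this commit) of how a folder written by `save_results` could be read back; the directory path and index are placeholders:

```python
# Hypothetical reader for a save_results folder (sketch only; paths are placeholders).
import json
import os

from PIL import Image


def load_result(save_dir: str, i: int = 0):
    """Return the i-th generated image and its hyper-parameter dict."""
    img = Image.open(os.path.join(save_dir, f"img_gen_{i}.png"))
    with open(os.path.join(save_dir, f"hyper_params_{i}.json")) as f:
        params = json.load(f)
    return img, params
```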
app/config.py ADDED
@@ -0,0 +1,72 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Configuration management for IC-Custom application.
5
+ """
6
+ import os
7
+ import argparse
8
+ from omegaconf import OmegaConf
9
+
10
+
11
+ def parse_args():
12
+ """Parse command line arguments."""
13
+ parser = argparse.ArgumentParser(description="IC-Custom App.")
14
+ parser.add_argument(
15
+ "--config",
16
+ type=str,
17
+ default="configs/app/app.yaml",
18
+ help="path to config",
19
+ )
20
+ parser.add_argument(
21
+ "--hf_token",
22
+ type=str,
23
+ required=False,
24
+ help="Hugging Face token",
25
+ )
26
+ parser.add_argument(
27
+ "--hf_cache_dir",
28
+ type=str,
29
+ required=False,
30
+ default=os.path.expanduser("~/.cache/huggingface/hub"),
31
+ help="Cache directory to save the models, default is ~/.cache/huggingface/hub",
32
+ )
33
+ parser.add_argument(
34
+ "--assets_cache_dir",
35
+ type=str,
36
+ required=False,
37
+ default="results/app",
38
+ help="Cache directory to save the results, default is results/app",
39
+ )
40
+ parser.add_argument(
41
+ "--save_results",
42
+ action="store_true",
43
+ help="Save results",
44
+ )
45
+ parser.add_argument(
46
+ "--enable_ben2_for_mask_ref",
47
+ action=argparse.BooleanOptionalAction,
48
+ default=True,
49
+ help="Enable ben2 for mask reference (default: True)",
50
+ )
51
+ parser.add_argument(
52
+ "--enable_vlm_for_prompt",
53
+ action=argparse.BooleanOptionalAction,
54
+ default=False,
55
+ help="Enable vlm for prompt (default: True)",
56
+ )
57
+
58
+ return parser.parse_args()
59
+
60
+
61
+ def load_config(config_path):
62
+ """Load configuration from file."""
63
+ return OmegaConf.load(config_path)
64
+
65
+
66
+ def setup_environment(args):
67
+ """Setup environment variables from command line arguments."""
68
+ if args.hf_token is not None:
69
+ os.environ["HF_TOKEN"] = args.hf_token
70
+
71
+ if args.hf_cache_dir is not None:
72
+ os.environ["HF_HUB_CACHE"] = args.hf_cache_dir
app/constants.py ADDED
@@ -0,0 +1,35 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Constants and default values for IC-Custom application.
5
+ """
6
+ from aspect_ratio_template import ASPECT_RATIO_TEMPLATE
7
+
8
+ # Aspect ratio constants
9
+ ASPECT_RATIO_LABELS = list(ASPECT_RATIO_TEMPLATE)
10
+ DEFAULT_ASPECT_RATIO = ASPECT_RATIO_LABELS[0]
11
+
12
+ # Colors and markers for segmentation
13
+ # OpenCV expects BGR colors; keep tuples as (R, G, B) for consistency across code.
14
+ SEGMENTATION_COLORS = [(255, 0, 0), (0, 255, 0)]
15
+ SEGMENTATION_MARKERS = [1, 5]
16
+ RGBA_COLORS = [(255, 0, 255, 255), (0, 255, 0, 255), (0, 0, 255, 255)]
17
+
18
+ # Magic-number constants
19
+ DEFAULT_BACKGROUND_BLEND_THRESHOLD = 0.5
20
+ DEFAULT_NUM_STEPS = 32
21
+ DEFAULT_GUIDANCE = 40
22
+ DEFAULT_TRUE_GS = 1
23
+ DEFAULT_NUM_IMAGES = 1
24
+ DEFAULT_SEED = -1 # -1 indicates random seed
25
+ DEFAULT_DILATION_KERNEL_SIZE = 7
26
+
27
+ # UI constants
28
+ DEFAULT_BRUSH_SIZE = 30
29
+ DEFAULT_MARKER_SIZE = 20
30
+ DEFAULT_MARKER_THICKNESS = 5
31
+ DEFAULT_MASK_ALPHA = 0.3
32
+ DEFAULT_COLOR_ALPHA = 0.7
33
+
34
+ # File naming
35
+ TIMESTAMP_FORMAT = "%Y%m%d_%H%M"
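A small sketch (not part of this commit) of how constants such as DEFAULT_DILATION_KERNEL_SIZE and TIMESTAMP_FORMAT are typically consumed; the helper name is hypothetical:

```python
# Hypothetical illustration of consuming the constants above (sketch only).
from datetime import datetime

import cv2
import numpy as np

from constants import DEFAULT_DILATION_KERNEL_SIZE, TIMESTAMP_FORMAT


def dilate_mask(mask: np.ndarray) -> np.ndarray:
    """Dilate a binary (uint8) mask with the default square kernel."""
    kernel = np.ones((DEFAULT_DILATION_KERNEL_SIZE, DEFAULT_DILATION_KERNEL_SIZE), np.uint8)
    return cv2.dilate(mask, kernel, iterations=1)


save_name = datetime.now().strftime(TIMESTAMP_FORMAT)  # e.g. "20250101_1230"
```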
app/event_handlers.py ADDED
@@ -0,0 +1,155 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Event handlers for IC-Custom application.
5
+ """
6
+ import gradio as gr
7
+
8
+
9
+ def setup_event_handlers(
10
+ # UI components
11
+ input_mask_mode, image_target_1, image_target_2, undo_target_seg_button,
12
+ custmization_mode, dilate_button, erode_button, bounding_box_button,
13
+ mask_gallery, md_input_mask_mode, md_target_image, md_mask_operation,
14
+ md_prompt, md_submit, result_gallery, image_target_state, mask_target_state,
15
+ seg_ref_mode, image_reference_ori_state, move_to_center,
16
+ image_reference, image_reference_rmbg_state,
17
+ # Functions
18
+ change_input_mask_mode, change_custmization_mode, change_seg_ref_mode,
19
+ init_image_target_1, init_image_target_2, init_image_reference,
20
+ get_point, undo_seg_points, get_brush,
21
+ # VLM buttons (UI components)
22
+ vlm_generate_btn, vlm_polish_btn,
23
+ # VLM functions
24
+ vlm_auto_generate, vlm_auto_polish,
25
+ dilate_mask, erode_mask, bounding_box,
26
+ run_model,
27
+ # Other components
28
+ selected_points, prompt,
29
+ use_background_preservation, background_blend_threshold, seed,
30
+ num_images_per_prompt, guidance, true_gs, num_steps, aspect_ratio,
31
+ submit_button,
32
+ # extra state
33
+ eg_idx,
34
+ ):
35
+ """Setup all event handlers for the application."""
36
+
37
+ # Change input mask mode: precise mask or user-drawn mask
38
+ input_mask_mode.change(
39
+ change_input_mask_mode,
40
+ [input_mask_mode, custmization_mode],
41
+ [image_target_1, image_target_2, undo_target_seg_button]
42
+ )
43
+
44
+ # Change customization mode: pos-aware or pos-free
45
+ custmization_mode.change(
46
+ change_custmization_mode,
47
+ [custmization_mode, input_mask_mode],
48
+ [image_target_1, image_target_2, undo_target_seg_button, dilate_button,
49
+ erode_button, bounding_box_button, md_input_mask_mode,
50
+ md_target_image, md_mask_operation, md_prompt, md_submit, input_mask_mode, mask_gallery]
51
+ )
52
+
53
+ # Remove background for reference image
54
+ seg_ref_mode.change(
55
+ change_seg_ref_mode,
56
+ [seg_ref_mode, image_reference_ori_state, move_to_center],
57
+ [image_reference, image_reference_rmbg_state]
58
+ )
59
+
60
+ # Initialize components only on user upload (not programmatic updates)
61
+ image_target_1.upload(
62
+ init_image_target_1,
63
+ [image_target_1],
64
+ [image_target_state, selected_points, prompt, mask_target_state, mask_gallery,
65
+ result_gallery, use_background_preservation, background_blend_threshold, seed,
66
+ num_images_per_prompt, guidance, true_gs, num_steps, aspect_ratio]
67
+ )
68
+
69
+ image_target_2.upload(
70
+ init_image_target_2,
71
+ [image_target_2],
72
+ [image_target_state, selected_points, prompt, mask_target_state, mask_gallery,
73
+ result_gallery, use_background_preservation, background_blend_threshold, seed,
74
+ num_images_per_prompt, guidance, true_gs, num_steps, aspect_ratio]
75
+ )
76
+
77
+ image_reference.upload(
78
+ init_image_reference,
79
+ [image_reference],
80
+ [image_reference_ori_state, image_reference_rmbg_state, image_target_state,
81
+ mask_target_state, prompt, mask_gallery, result_gallery, image_target_1,
82
+ image_target_2, selected_points, input_mask_mode, seg_ref_mode, move_to_center,
83
+ use_background_preservation, background_blend_threshold, seed,
84
+ num_images_per_prompt, guidance, true_gs, num_steps, aspect_ratio]
85
+ )
86
+
87
+ # SAM for image_target_1
88
+ image_target_1.select(
89
+ get_point,
90
+ [image_target_state, selected_points],
91
+ [image_target_1, mask_target_state, mask_gallery],
92
+ )
93
+
94
+ undo_target_seg_button.click(
95
+ undo_seg_points,
96
+ [image_target_state, selected_points],
97
+ [image_target_1, mask_target_state, mask_gallery]
98
+ )
99
+
100
+ # Brush for image_target_2
101
+ image_target_2.change(
102
+ get_brush,
103
+ [image_target_2],
104
+ [mask_target_state, mask_gallery],
105
+ )
106
+
107
+ # VLM auto generate
108
+ vlm_generate_btn.click(
109
+ vlm_auto_generate,
110
+ [image_target_state, image_reference_ori_state, mask_target_state, custmization_mode],
111
+ [prompt]
112
+ )
113
+
114
+ # VLM auto polish
115
+ vlm_polish_btn.click(
116
+ vlm_auto_polish,
117
+ [prompt, custmization_mode],
118
+ [prompt]
119
+ )
120
+
121
+ # Mask operations
122
+ dilate_button.click(
123
+ dilate_mask,
124
+ [mask_target_state, image_target_state],
125
+ [mask_target_state, mask_gallery]
126
+ )
127
+
128
+ erode_button.click(
129
+ erode_mask,
130
+ [mask_target_state, image_target_state],
131
+ [mask_target_state, mask_gallery]
132
+ )
133
+
134
+ bounding_box_button.click(
135
+ bounding_box,
136
+ [mask_target_state, image_target_state],
137
+ [mask_target_state, mask_gallery]
138
+ )
139
+
140
+ # Run function
141
+ ips = [
142
+ image_target_state, mask_target_state, image_reference_ori_state,
143
+ image_reference_rmbg_state, prompt, seed, guidance, true_gs, num_steps,
144
+ num_images_per_prompt, use_background_preservation, background_blend_threshold,
145
+ aspect_ratio, custmization_mode, seg_ref_mode, input_mask_mode,
146
+ ]
147
+
148
+ submit_button.click(
149
+ fn=run_model,
150
+ inputs=ips,
151
+ outputs=[result_gallery, seed, prompt],
152
+ show_progress=True,
153
+ )
154
+
155
+
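The function above only wires pre-built components to pre-built callbacks. A condensed sketch (not from this repo) of the same `component.event(fn, inputs, outputs)` pattern with placeholder names:

```python
# Minimal Gradio wiring pattern mirrored by setup_event_handlers (placeholder names).
import gradio as gr


def echo(text: str) -> str:
    return text


with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    output = gr.Textbox(label="Output")
    submit = gr.Button("Run")
    # Same shape as submit_button.click(fn=run_model, inputs=ips, outputs=[...]) above.
    submit.click(fn=echo, inputs=[prompt], outputs=[output])
```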
app/examples.py ADDED
@@ -0,0 +1,210 @@
1
+ from PIL import Image
2
+
3
+ make_dict = lambda x : { 'background': Image.open(x).convert("RGBA"), 'layers': [Image.new("RGBA", Image.open(x).size, (255, 255, 255, 0))], 'composite': Image.open(x).convert("RGBA") }
4
+
5
+ null_dict = {
6
+ 'background': None,
7
+ 'composite': None,
8
+ 'layers': []
9
+ }
10
+
11
+ IMG_REF = [
12
+ ## pos-aware: precise mask
13
+ "assets/gradio/pos_aware/001/img_ref.png",
14
+ "assets/gradio/pos_aware/002/img_ref.png",
15
+ ## pos-aware: User-drawn mask
16
+ "assets/gradio/pos_aware/003/img_ref.png",
17
+ "assets/gradio/pos_aware/004/img_ref.png",
18
+ "assets/gradio/pos_aware/005/img_ref.png",
19
+ ## pos-free
20
+ "assets/gradio/pos_free/001/img_ref.png",
21
+ "assets/gradio/pos_free/002/img_ref.png",
22
+ "assets/gradio/pos_free/003/img_ref.png",
23
+ "assets/gradio/pos_free/004/img_ref.png",
24
+ ]
25
+
26
+ IMG_TGT1 = [
27
+ ## pos-aware: precise mask
28
+ "assets/gradio/pos_aware/001/img_target.png",
29
+ "assets/gradio/pos_aware/002/img_target.png",
30
+ ## pos-aware: User-drawn mask
31
+ None,
32
+ None,
33
+ None,
34
+ ## pos-free
35
+ "assets/gradio/pos_free/001/img_target.png",
36
+ "assets/gradio/pos_free/002/img_target.png",
37
+ "assets/gradio/pos_free/003/img_target.png",
38
+ "assets/gradio/pos_free/004/img_target.png",
39
+ ]
40
+
41
+ IMG_TGT2 = [
42
+ ## pos-aware: precise mask
43
+ null_dict,
44
+ null_dict,
45
+ ## pos-aware: User-drawn mask
46
+ make_dict("assets/gradio/pos_aware/003/img_target.png"),
47
+ make_dict("assets/gradio/pos_aware/004/img_target.png"),
48
+ make_dict("assets/gradio/pos_aware/005/img_target.png"),
49
+ ## pos-free
50
+ null_dict,
51
+ null_dict,
52
+ null_dict,
53
+ null_dict,
54
+ ]
55
+
56
+ MASK_TGT = [
57
+ ## pos-aware: precise mask
58
+ "assets/gradio/pos_aware/001/mask_target.png",
59
+ "assets/gradio/pos_aware/002/mask_target.png",
60
+ ## pos-aware: User-drawn mask
61
+ "assets/gradio/pos_aware/003/mask_target.png",
62
+ "assets/gradio/pos_aware/004/mask_target.png",
63
+ "assets/gradio/pos_aware/005/mask_target.png",
64
+ ## pos-free
65
+ "assets/gradio/pos_free/001/mask_target.png",
66
+ "assets/gradio/pos_free/002/mask_target.png",
67
+ "assets/gradio/pos_free/003/mask_target.png",
68
+ "assets/gradio/pos_free/004/mask_target.png",
69
+ ]
70
+
71
+ CUSTOM_MODE = [
72
+ ## pos-aware
73
+ "Position-aware",
74
+ "Position-aware",
75
+ "Position-aware",
76
+ "Position-aware",
77
+ "Position-aware",
78
+ ## pos-free
79
+ "Position-free",
80
+ "Position-free",
81
+ "Position-free",
82
+ "Position-free",
83
+ ]
84
+
85
+ INPUT_MASK_MODE = [
86
+ ## pos-aware: precise mask
87
+ "Precise mask",
88
+ "Precise mask",
89
+ ## pos-aware: User-drawn mask
90
+ "User-drawn mask",
91
+ "User-drawn mask",
92
+ "User-drawn mask",
93
+ ## pos-free
94
+ "Precise mask",
95
+ "Precise mask",
96
+ "Precise mask",
97
+ "Precise mask",
98
+ ]
99
+
100
+ SEG_REF_MODE = [
101
+ ## pos-aware
102
+ "Full Ref",
103
+ "Full Ref",
104
+ "Full Ref",
105
+ "Full Ref",
106
+ "Full Ref",
107
+ ## pos-free
108
+ "Full Ref",
109
+ "Full Ref",
110
+ "Full Ref",
111
+ "Full Ref",
112
+ ]
113
+
114
+ PROMPTS = [
115
+ ## pos-aware: precise mask
116
+ "",
117
+ "",
118
+ ## pos-aware: User-drawn mask
119
+ "A delicate necklace with a mother-of-pearl clover pendant hangs gracefully around the neck of a woman dressed in a black pinstripe blazer.",
120
+ "",
121
+ "",
122
+ ## pos-free
123
+ "TThe charming, soft plush toy is joyfully wandering through a lush, dense jungle, surrounded by vibrant green foliage and towering trees.",
124
+ "A bright yellow alarm clock sits on a wooden desk next to a stack of books in a cozy, sunlit room.",
125
+ "A Lego figure dressed in a vibrant chicken costume, leaning against a wooden chair, surrounded by lush green grass and blooming flowers.",
126
+ "The crocheted gingerbread man is perched on a tree branch in a dense forest, with sunlight filtering through the leaves, casting dappled shadows around him."
127
+ ]
128
+
129
+ IMG_GEN = [
130
+ ## pos-aware: precise mask
131
+ "assets/gradio/pos_aware/001/img_gen.png",
132
+ "assets/gradio/pos_aware/002/img_gen.png",
133
+ ## pos-aware: User-drawn mask
134
+ "assets/gradio/pos_aware/003/img_gen.png",
135
+ "assets/gradio/pos_aware/004/img_gen.png",
136
+ "assets/gradio/pos_aware/005/img_gen.png",
137
+ ## pos-free
138
+ "assets/gradio/pos_free/001/img_gen.png",
139
+ "assets/gradio/pos_free/002/img_gen.png",
140
+ "assets/gradio/pos_free/003/img_gen.png",
141
+ "assets/gradio/pos_free/004/img_gen.png",
142
+ ]
143
+
144
+ SEED = [
145
+ ## pos-aware
146
+ 97175498,
147
+ 2126677963,
148
+ 346969695,
149
+ 1172525388,
150
+ 268683460,
151
+ ## pos-free
152
+ 2126677963,
153
+ 418898253,
154
+ 2126677963,
155
+ 2126677963
156
+ ]
157
+
158
+ TRUE_GS = [
159
+ # pos-aware
160
+ 1,
161
+ 1,
162
+ 1,
163
+ 1,
164
+ 1,
165
+ # pos-free
166
+ 3,
167
+ 3,
168
+ 3,
169
+ 3,
170
+ ]
171
+
172
+ NUM_STEPS = [
173
+ ## pos-aware
174
+ 32,
175
+ 32,
176
+ 32,
177
+ 32,
178
+ 32,
179
+ ## pos-free
180
+ 20,
181
+ 20,
182
+ 20,
183
+ 20,
184
+ ]
185
+
186
+ GUIDANCE = [
187
+ ## pos-aware
188
+ 40,
189
+ 48,
190
+ 40,
191
+ 48,
192
+ 48,
193
+ ## pos-free
194
+ 40,
195
+ 40,
196
+ 40,
197
+ 40,
198
+ ]
199
+
200
+ GRADIO_EXAMPLES = [
201
+ [IMG_REF[0], IMG_TGT1[0], IMG_TGT2[0], CUSTOM_MODE[0], INPUT_MASK_MODE[0], SEG_REF_MODE[0], PROMPTS[0], SEED[0], TRUE_GS[0], '0', NUM_STEPS[0], GUIDANCE[0]],
202
+ [IMG_REF[1], IMG_TGT1[1], IMG_TGT2[1], CUSTOM_MODE[1], INPUT_MASK_MODE[1], SEG_REF_MODE[1], PROMPTS[1], SEED[1], TRUE_GS[1], '1', NUM_STEPS[1], GUIDANCE[1]],
203
+ [IMG_REF[2], IMG_TGT1[2], IMG_TGT2[2], CUSTOM_MODE[2], INPUT_MASK_MODE[2], SEG_REF_MODE[2], PROMPTS[2], SEED[2], TRUE_GS[2], '2', NUM_STEPS[2], GUIDANCE[2]],
204
+ [IMG_REF[3], IMG_TGT1[3], IMG_TGT2[3], CUSTOM_MODE[3], INPUT_MASK_MODE[3], SEG_REF_MODE[3], PROMPTS[3], SEED[3], TRUE_GS[3], '3', NUM_STEPS[3], GUIDANCE[3]],
205
+ [IMG_REF[4], IMG_TGT1[4], IMG_TGT2[4], CUSTOM_MODE[4], INPUT_MASK_MODE[4], SEG_REF_MODE[4], PROMPTS[4], SEED[4], TRUE_GS[4], '4', NUM_STEPS[4], GUIDANCE[4]],
206
+ [IMG_REF[5], IMG_TGT1[5], IMG_TGT2[5], CUSTOM_MODE[5], INPUT_MASK_MODE[5], SEG_REF_MODE[5], PROMPTS[5], SEED[5], TRUE_GS[5], '5', NUM_STEPS[5], GUIDANCE[5]],
207
+ [IMG_REF[6], IMG_TGT1[6], IMG_TGT2[6], CUSTOM_MODE[6], INPUT_MASK_MODE[6], SEG_REF_MODE[6], PROMPTS[6], SEED[6], TRUE_GS[6], '6', NUM_STEPS[6], GUIDANCE[6]],
208
+ [IMG_REF[7], IMG_TGT1[7], IMG_TGT2[7], CUSTOM_MODE[7], INPUT_MASK_MODE[7], SEG_REF_MODE[7], PROMPTS[7], SEED[7], TRUE_GS[7], '7', NUM_STEPS[7], GUIDANCE[7]],
209
+ [IMG_REF[8], IMG_TGT1[8], IMG_TGT2[8], CUSTOM_MODE[8], INPUT_MASK_MODE[8], SEG_REF_MODE[8], PROMPTS[8], SEED[8], TRUE_GS[8], '8', NUM_STEPS[8], GUIDANCE[8]],
210
+ ]
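GRADIO_EXAMPLES is a list of rows with one value per example input. A hedged sketch (not part of this commit) of feeding it to gr.Examples; the component list here is truncated and assumed:

```python
# Hypothetical hookup of GRADIO_EXAMPLES to gr.Examples (sketch only; two columns shown).
import gradio as gr

from examples import GRADIO_EXAMPLES

with gr.Blocks() as demo:
    img_ref = gr.Image(label="Reference image", type="filepath")
    prompt = gr.Textbox(label="Prompt")
    # The real app passes one component per column; here only IMG_REF (col 0) and PROMPTS (col 6).
    gr.Examples(
        examples=[[row[0], row[6]] for row in GRADIO_EXAMPLES],
        inputs=[img_ref, prompt],
    )
```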
app/metainfo.py ADDED
@@ -0,0 +1,131 @@
1
+ #### metainfo ####
2
+ head = r"""
3
+ <div class="elegant-header">
4
+ <div class="header-content">
5
+ <!-- Main title -->
6
+ <h1 class="main-title">
7
+ <span class="title-icon">🎨</span>
8
+ <span class="title-text">IC-Custom</span>
9
+ </h1>
10
+
11
+ <!-- Subtitle -->
12
+ <p class="subtitle">Transform your images with AI-powered customization</p>
13
+
14
+ <!-- Action badges -->
15
+ <div class="header-badges">
16
+ <a href="https://liyaowei-stu.github.io/project/IC_Custom/" class="badge-link">
17
+ <span class="badge-icon">🔗</span>
18
+ <span class="badge-text">Project</span>
19
+ </a>
20
+ <a href="https://arxiv.org/abs/2507.01926" class="badge-link">
21
+ <span class="badge-icon">📄</span>
22
+ <span class="badge-text">Paper</span>
23
+ </a>
24
+ <a href="https://github.com/TencentARC/IC-Custom" class="badge-link">
25
+ <span class="badge-icon">💻</span>
26
+ <span class="badge-text">Code</span>
27
+ </a>
28
+ </div>
29
+ </div>
30
+ </div>
31
+ """
32
+
33
+
34
+ getting_started = r"""
35
+ <div class="getting-started-container">
36
+ <!-- Header -->
37
+ <div class="guide-header">
38
+ <h3 class="guide-title">🚀 Quick Start Guide</h3>
39
+ <p class="guide-subtitle">Follow these steps to customize your images with IC-Custom</p>
40
+ </div>
41
+
42
+ <!-- What is IC-Custom -->
43
+ <div class="info-card">
44
+ <div class="info-content">
45
+ <strong class="brand-name">IC-Custom</strong> offers two customization modes:
46
+ <span class="mode-badge position-aware">Position-aware</span>
47
+ (precise placement in masked areas) and
48
+ <span class="mode-badge position-free">Position-free</span>
49
+ (subject-driven generation).
50
+ </div>
51
+ </div>
52
+
53
+ <!-- Common Steps -->
54
+ <div class="step-card common-steps">
55
+ <div class="step-header">
56
+ <span class="step-number">1</span>
57
+ Initial Setup (Both Modes)
58
+ </div>
59
+ <ul class="step-list">
60
+ <li>Choose your <strong>customization mode</strong></li>
61
+ <li>Upload a <strong>reference image</strong> 📸</li>
62
+ </ul>
63
+ </div>
64
+
65
+ <!-- Position-aware Mode -->
66
+ <div class="step-card position-aware-steps">
67
+ <div class="step-header">
68
+ <span class="step-number">2A</span>
69
+ 🎯 Position-aware Mode Steps
70
+ </div>
71
+ <ul class="step-list">
72
+ <li>Select <strong>input mask mode</strong> (precise mask or user-drawn mask)</li>
73
+ <li>Upload <strong>target image</strong> and create mask (click for SAM or brush directly)</li>
74
+ <li>Add <strong>text prompt</strong> (optional) - use VLM buttons for auto-generation</li>
75
+ <li>Review and refine your <strong>mask</strong> using mask tools if needed</li>
76
+ <li>Click <span class="run-button position-aware">Run</span> ✨</li>
77
+ </ul>
78
+ </div>
79
+
80
+ <!-- Position-free Mode -->
81
+ <div class="step-card position-free-steps">
82
+ <div class="step-header">
83
+ <span class="step-number">2B</span>
84
+ 🎨 Position-free Mode Steps
85
+ </div>
86
+ <ul class="step-list">
87
+ <li>Write your <strong>text prompt</strong> (required) - describe the target scene</li>
88
+ <li>Use VLM buttons for prompt auto-generation or polishing (if enabled)</li>
89
+ <li>Click <span class="run-button position-free">Run</span> ✨</li>
90
+ </ul>
91
+ </div>
92
+
93
+ <!-- Quick Tips -->
94
+ <div class="tips-card">
95
+ <div class="tips-content">
96
+ <strong>💡 Quick Tips:</strong>
97
+ Use <kbd class="key-hint">Alt + "-"</kbd> or <kbd class="key-hint">⌘ + "-"</kbd> to zoom out for better operation •
98
+ Adjust settings in <kbd class="key-hint">Advanced Options</kbd> • Use mask operations (<kbd class="key-hint">dilate</kbd>/<kbd class="key-hint">erode</kbd>/<kbd class="key-hint">bbox</kbd>) for better results •
99
+ Try different <kbd class="key-hint">seeds</kbd> for varied outputs
100
+ </div>
101
+ </div>
102
+
103
+ <!-- Final Message -->
104
+ <div class="final-message">
105
+ <div class="final-text">
106
+ 🎉 Ready to start? Collapse this guide and begin customizing!
107
+ </div>
108
+ </div>
109
+ </div>
110
+ """
111
+
112
+
113
+ citation = r"""
114
+ If IC-Custom is helpful, please help to ⭐ the <a href='https://github.com/TencentARC/IC-Custom' target='_blank'>Github Repo</a>. Thanks!
115
+ [![GitHub Stars](https://img.shields.io/github/stars/TencentARC/IC-Custom?style=social)](https://github.com/TencentARC/IC-Custom)
116
+ ---
117
+ 📝 **Citation**
118
+ <br>
119
+ If our work is useful for your research, please consider citing:
120
+ ```bibtex
121
+ @article{li2025ic,
122
+ title={IC-Custom: Diverse Image Customization via In-Context Learning},
123
+ author={Li, Yaowei and Li, Xiaoyu and Zhang, Zhaoyang and Bian, Yuxuan and Liu, Gan and Li, Xinyuan and Xu, Jiale and Hu, Wenbo and Liu, Yating and Li, Lingen and others},
124
+ journal={arXiv preprint arXiv:2507.01926},
125
+ year={2025}
126
+ }
127
+ ```
128
+ 📧 **Contact**
129
+ <br>
130
+ If you have any questions, please feel free to reach out to me at <b>[email protected]</b>.
131
+ """
app/stylesheets.py ADDED
@@ -0,0 +1,1679 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Centralized CSS and JS for the IC-Custom app UI.
5
+
6
+ Expose helpers:
7
+ - get_css(): return a single CSS string for gradio Blocks(css=...)
8
+ - get_js(): return a JS snippet for gradio.
9
+ """
10
+
11
+
12
+ def get_css() -> str:
13
+ return r"""
14
+ /* Global Optimization Effects - No Layout Changes */
15
+
16
+ /* Apple-style segmented control for radio buttons */
17
+ #customization_mode_radio .wrap, #input_mask_mode_radio .wrap, #seg_ref_mode_radio .wrap {
18
+ display: flex;
19
+ flex-wrap: nowrap;
20
+ justify-content: center;
21
+ align-items: center;
22
+ gap: 0;
23
+ background: rgba(255, 255, 255, 0.8);
24
+ border: 1px solid var(--neutral-200);
25
+ border-radius: 10px;
26
+ padding: 3px;
27
+ backdrop-filter: blur(12px);
28
+ -webkit-backdrop-filter: blur(12px);
29
+ box-shadow: 0 2px 8px rgba(15, 23, 42, 0.08);
30
+ }
31
+
32
+ #customization_mode_radio .wrap label, #input_mask_mode_radio .wrap label, #seg_ref_mode_radio .wrap label {
33
+ display: flex;
34
+ flex: 1;
35
+ justify-content: center;
36
+ align-items: center;
37
+ margin: 0;
38
+ padding: 10px 16px;
39
+ box-sizing: border-box;
40
+ border-radius: 7px;
41
+ transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
42
+ background: transparent;
43
+ border: none;
44
+ font-weight: 500;
45
+ font-size: 0.9rem;
46
+ color: var(--text-secondary);
47
+ cursor: pointer;
48
+ position: relative;
49
+ white-space: nowrap;
50
+ min-width: 0;
51
+ }
52
+
53
+ /* Hide the actual radio input */
54
+ #customization_mode_radio .wrap label input[type="radio"],
55
+ #input_mask_mode_radio .wrap label input[type="radio"],
56
+ #seg_ref_mode_radio .wrap label input[type="radio"] {
57
+ display: none;
58
+ }
59
+
60
+ /* Hover states */
61
+ #customization_mode_radio .wrap label:hover,
62
+ #input_mask_mode_radio .wrap label:hover,
63
+ #seg_ref_mode_radio .wrap label:hover {
64
+ background: rgba(14, 165, 233, 0.1);
65
+ color: var(--primary-blue);
66
+ }
67
+
68
+ /* Selected state with smooth background */
69
+ #customization_mode_radio .wrap label:has(input[type="radio"]:checked),
70
+ #input_mask_mode_radio .wrap label:has(input[type="radio"]:checked),
71
+ #seg_ref_mode_radio .wrap label:has(input[type="radio"]:checked) {
72
+ background: var(--primary-blue);
73
+ color: white;
74
+ font-weight: 600;
75
+ box-shadow: 0 2px 6px rgba(14, 165, 233, 0.25);
76
+ transform: none;
77
+ }
78
+
79
+ /* Fallback for browsers that don't support :has() */
80
+ #customization_mode_radio .wrap label input[type="radio"]:checked + *,
81
+ #input_mask_mode_radio .wrap label input[type="radio"]:checked + *,
82
+ #seg_ref_mode_radio .wrap label input[type="radio"]:checked + * {
83
+ color: white;
84
+ }
85
+
86
+ #customization_mode_radio .wrap:has(input[type="radio"]:checked) label:has(input[type="radio"]:checked),
87
+ #input_mask_mode_radio .wrap:has(input[type="radio"]:checked) label:has(input[type="radio"]:checked),
88
+ #seg_ref_mode_radio .wrap:has(input[type="radio"]:checked) label:has(input[type="radio"]:checked) {
89
+ background: var(--primary-blue);
90
+ }
91
+
92
+ /* Active state */
93
+ #customization_mode_radio .wrap label:active,
94
+ #input_mask_mode_radio .wrap label:active,
95
+ #seg_ref_mode_radio .wrap label:active {
96
+ transform: scale(0.98);
97
+ }
98
+
99
+ /* Elegant header styling */
100
+ .elegant-header {
101
+ text-align: center;
102
+ margin: 0 0 2rem 0;
103
+ padding: 0;
104
+ }
105
+
106
+ .header-content {
107
+ display: inline-block;
108
+ padding: 1.8rem 2.5rem;
109
+ background: linear-gradient(135deg,
110
+ rgba(255, 255, 255, 0.1) 0%,
111
+ rgba(255, 255, 255, 0.05) 100%);
112
+ border: 1px solid rgba(255, 255, 255, 0.15);
113
+ border-radius: 20px;
114
+ backdrop-filter: blur(12px);
115
+ -webkit-backdrop-filter: blur(12px);
116
+ box-shadow:
117
+ 0 8px 32px rgba(15, 23, 42, 0.04),
118
+ inset 0 1px 0 rgba(255, 255, 255, 0.2);
119
+ transition: all 0.4s ease;
120
+ position: relative;
121
+ overflow: hidden;
122
+ }
123
+
124
+ .header-content::before {
125
+ content: '';
126
+ position: absolute;
127
+ top: 0;
128
+ left: -100%;
129
+ width: 100%;
130
+ height: 100%;
131
+ background: linear-gradient(90deg,
132
+ transparent,
133
+ rgba(255, 255, 255, 0.1),
134
+ transparent);
135
+ transition: left 0.6s ease;
136
+ }
137
+
138
+ .header-content:hover::before {
139
+ left: 100%;
140
+ }
141
+
142
+ .header-content:hover {
143
+ transform: translateY(-2px);
144
+ box-shadow:
145
+ 0 12px 40px rgba(15, 23, 42, 0.08),
146
+ inset 0 1px 0 rgba(255, 255, 255, 0.3);
147
+ border-color: rgba(14, 165, 233, 0.2);
148
+ }
149
+
150
+ /* Main title styling */
151
+ .main-title {
152
+ margin: 0 0 0.8rem 0;
153
+ font-size: 2.4rem;
154
+ font-weight: 800;
155
+ display: flex;
156
+ align-items: center;
157
+ justify-content: center;
158
+ gap: 0.5rem;
159
+ }
160
+
161
+ .title-icon {
162
+ font-size: 2.2rem;
163
+ filter: drop-shadow(0 2px 4px rgba(0, 0, 0, 0.1));
164
+ }
165
+
166
+ .title-text {
167
+ background: linear-gradient(135deg, #0ea5e9 0%, #06b6d4 50%, #10b981 100%);
168
+ -webkit-background-clip: text;
169
+ -webkit-text-fill-color: transparent;
170
+ background-clip: text;
171
+ text-shadow: none;
172
+ position: relative;
173
+ }
174
+
175
+ /* Subtitle styling */
176
+ .subtitle {
177
+ margin: 0 0 1.2rem 0;
178
+ font-size: 1rem;
179
+ color: #64748b;
180
+ font-weight: 500;
181
+ letter-spacing: 0.025em;
182
+ opacity: 0.9;
183
+ }
184
+
185
+ /* Header badges container */
186
+ .header-badges {
187
+ display: flex;
188
+ justify-content: center;
189
+ gap: 0.8rem;
190
+ flex-wrap: wrap;
191
+ }
192
+
193
+ /* Individual badge links */
194
+ .badge-link {
195
+ display: inline-flex;
196
+ align-items: center;
197
+ gap: 0.4rem;
198
+ padding: 0.5rem 1rem;
199
+ background: rgba(255, 255, 255, 0.15);
200
+ border: 1px solid rgba(255, 255, 255, 0.2);
201
+ border-radius: 12px;
202
+ color: #475569;
203
+ text-decoration: none;
204
+ font-weight: 500;
205
+ font-size: 0.9rem;
206
+ transition: all 0.3s ease;
207
+ backdrop-filter: blur(4px);
208
+ -webkit-backdrop-filter: blur(4px);
209
+ position: relative;
210
+ overflow: hidden;
211
+ }
212
+
213
+ .badge-link::before {
214
+ content: '';
215
+ position: absolute;
216
+ top: 0;
217
+ left: -100%;
218
+ width: 100%;
219
+ height: 100%;
220
+ background: var(--primary-gradient);
221
+ transition: left 0.3s ease;
222
+ z-index: -1;
223
+ }
224
+
225
+ .badge-link:hover::before {
226
+ left: 0;
227
+ }
228
+
229
+ .badge-link:hover {
230
+ transform: translateY(-2px);
231
+ color: white;
232
+ border-color: transparent;
233
+ box-shadow: 0 4px 12px rgba(14, 165, 233, 0.3);
234
+ }
235
+
236
+ .badge-icon {
237
+ font-size: 1rem;
238
+ opacity: 0.8;
239
+ }
240
+
241
+ .badge-text {
242
+ font-weight: 600;
243
+ }
244
+
245
+ /* Getting Started Guide Styling */
246
+ .getting-started-container {
247
+ padding: 1.5rem;
248
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.1), rgba(255, 255, 255, 0.05));
249
+ border-radius: 12px;
250
+ border: 1px solid rgba(255, 255, 255, 0.15);
251
+ backdrop-filter: blur(8px);
252
+ -webkit-backdrop-filter: blur(8px);
253
+ box-shadow: 0 4px 16px rgba(15, 23, 42, 0.06);
254
+ }
255
+
256
+ .guide-header {
257
+ text-align: center;
258
+ margin-bottom: 1.5rem;
259
+ }
260
+
261
+ .guide-title {
262
+ color: var(--text-primary);
263
+ margin: 0 0 0.5rem 0;
264
+ font-size: 1.2rem;
265
+ font-weight: 700;
266
+ }
267
+
268
+ .guide-subtitle {
269
+ color: var(--text-muted);
270
+ margin: 0;
271
+ font-size: 0.9rem;
272
+ opacity: 0.9;
273
+ }
274
+
275
+ /* Info card */
276
+ .info-card {
277
+ background: rgba(255, 255, 255, 0.4);
278
+ border-radius: 8px;
279
+ padding: 1rem;
280
+ margin-bottom: 1.2rem;
281
+ border-left: 3px solid var(--primary-blue);
282
+ backdrop-filter: blur(4px);
283
+ -webkit-backdrop-filter: blur(4px);
284
+ transition: all 0.3s ease;
285
+ }
286
+
287
+ .info-card:hover {
288
+ background: rgba(255, 255, 255, 0.5);
289
+ transform: translateX(2px);
290
+ }
291
+
292
+ .info-content {
293
+ color: var(--text-secondary);
294
+ font-size: 0.9rem;
295
+ line-height: 1.5;
296
+ }
297
+
298
+ .brand-name {
299
+ color: var(--primary-blue);
300
+ font-weight: 700;
301
+ }
302
+
303
+ /* Mode badges */
304
+ .mode-badge {
305
+ padding: 0.2rem 0.4rem;
306
+ border-radius: 4px;
307
+ font-size: 0.8rem;
308
+ font-weight: 600;
309
+ margin: 0 0.2rem;
310
+ transition: all 0.2s ease;
311
+ }
312
+
313
+ .mode-badge.position-aware {
314
+ background: var(--badge-blue-bg);
315
+ color: var(--badge-blue-text);
316
+ }
317
+
318
+ .mode-badge.position-free {
319
+ background: var(--badge-green-bg);
320
+ color: var(--badge-green-text);
321
+ }
322
+
323
+ /* Step cards */
324
+ .step-card {
325
+ background: rgba(255, 255, 255, 0.4);
326
+ border-radius: 8px;
327
+ padding: 1rem;
328
+ margin-bottom: 1.2rem;
329
+ backdrop-filter: blur(4px);
330
+ -webkit-backdrop-filter: blur(4px);
331
+ transition: all 0.3s ease;
332
+ position: relative;
333
+ overflow: hidden;
334
+ }
335
+
336
+ .step-card::before {
337
+ content: '';
338
+ position: absolute;
339
+ left: 0;
340
+ top: 0;
341
+ bottom: 0;
342
+ width: 3px;
343
+ transition: all 0.3s ease;
344
+ }
345
+
346
+ .step-card.common-steps::before {
347
+ background: var(--neutral-500);
348
+ }
349
+
350
+ .step-card.position-aware-steps::before {
351
+ background: var(--position-aware-blue);
352
+ }
353
+
354
+ .step-card.position-free-steps::before {
355
+ background: var(--position-free-purple);
356
+ }
357
+
358
+ .step-card:hover {
359
+ background: rgba(255, 255, 255, 0.5);
360
+ transform: translateX(2px);
361
+ }
362
+
363
+ .step-header {
364
+ font-weight: 600;
365
+ color: var(--text-primary);
366
+ margin-bottom: 0.75rem;
367
+ display: flex;
368
+ align-items: center;
369
+ font-size: 0.95rem;
370
+ }
371
+
372
+ .step-number {
373
+ color: white;
374
+ border-radius: 50%;
375
+ width: 24px;
376
+ height: 24px;
377
+ display: inline-flex;
378
+ align-items: center;
379
+ justify-content: center;
380
+ margin-right: 0.5rem;
381
+ font-size: 0.75rem;
382
+ font-weight: 700;
383
+ }
384
+
385
+ .common-steps .step-number {
386
+ background: var(--neutral-500);
387
+ }
388
+
389
+ .position-aware-steps .step-number {
390
+ background: var(--position-aware-blue);
391
+ }
392
+
393
+ .position-free-steps .step-number {
394
+ background: var(--position-free-purple);
395
+ }
396
+
397
+ .step-list {
398
+ margin: 0;
399
+ padding-left: 1.2rem;
400
+ font-size: 0.85rem;
401
+ color: var(--text-secondary);
402
+ line-height: 1.6;
403
+ }
404
+
405
+ .step-list li {
406
+ margin-bottom: 0.4rem;
407
+ position: relative;
408
+ }
409
+
410
+ .step-list li:last-child {
411
+ margin-bottom: 0;
412
+ }
413
+
414
+ /* Run buttons */
415
+ .run-button {
416
+ padding: 0.2rem 0.5rem;
417
+ border-radius: 4px;
418
+ font-weight: 600;
419
+ font-size: 0.8rem;
420
+ color: white;
421
+ transition: all 0.2s ease;
422
+ }
423
+
424
+ .run-button.position-aware {
425
+ background: var(--position-aware-blue);
426
+ }
427
+
428
+ .run-button.position-free {
429
+ background: var(--position-free-purple);
430
+ }
431
+
432
+ /* Tips card */
433
+ .tips-card {
434
+ background: rgba(241, 245, 249, 0.6);
435
+ border-radius: 8px;
436
+ padding: 0.8rem;
437
+ border-left: 3px solid var(--neutral-400);
438
+ margin-bottom: 1rem;
439
+ backdrop-filter: blur(4px);
440
+ -webkit-backdrop-filter: blur(4px);
441
+ transition: all 0.3s ease;
442
+ }
443
+
444
+ .tips-card:hover {
445
+ background: rgba(241, 245, 249, 0.8);
446
+ transform: translateX(2px);
447
+ }
448
+
449
+ .tips-content {
450
+ font-size: 0.8rem;
451
+ color: var(--text-tips);
452
+ line-height: 1.5;
453
+ }
454
+
455
+ /* Key hints */
456
+ .key-hint {
457
+ background: var(--kbd-bg);
458
+ color: var(--kbd-text);
459
+ padding: 0.1rem 0.3rem;
460
+ border-radius: 3px;
461
+ font-size: 0.75em;
462
+ border: 1px solid var(--kbd-border);
463
+ font-family: monospace;
464
+ font-weight: 500;
465
+ transition: all 0.2s ease;
466
+ }
467
+
468
+ .key-hint:hover {
469
+ background: var(--primary-blue);
470
+ color: white;
471
+ border-color: var(--primary-blue);
472
+ }
473
+
474
+ /* Final message */
475
+ .final-message {
476
+ padding: 0.8rem;
477
+ background: var(--bg-final);
478
+ border-radius: 8px;
479
+ text-align: center;
480
+ transition: all 0.3s ease;
481
+ }
482
+
483
+ .final-message:hover {
484
+ transform: translateY(-1px);
485
+ box-shadow: 0 4px 12px rgba(14, 165, 233, 0.1);
486
+ }
487
+
488
+ .final-text {
489
+ color: var(--text-final);
490
+ font-weight: 600;
491
+ font-size: 0.85rem;
492
+ }
493
+
494
+ /* Legacy header badge styling for backward compatibility */
495
+ .header-badge {
496
+ background: var(--primary-gradient);
497
+ color: white;
498
+ padding: 0.5rem 1rem;
499
+ border-radius: 6px;
500
+ font-weight: 500;
501
+ font-size: 0.9rem;
502
+ transition: all 0.3s ease;
503
+ box-shadow: 0 1px 3px rgba(14, 165, 233, 0.2);
504
+ display: inline-block;
505
+ }
506
+
507
+ .header-badge:hover {
508
+ transform: translateY(-2px);
509
+ box-shadow: 0 4px 8px rgba(14, 165, 233, 0.3);
510
+ text-decoration: none;
511
+ }
512
+
513
+ /* Accordion styling matching getting_started */
514
+ .gradio-accordion {
515
+ border: 1px solid rgba(14, 165, 233, 0.2);
516
+ border-radius: 8px;
517
+ overflow: visible !important; /* Allow dropdown to overflow */
518
+ box-shadow: 0 2px 8px rgba(14, 165, 233, 0.1);
519
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.98) 0%, rgba(240, 249, 255, 0.95) 100%);
520
+ }
521
+
522
+ /* Ensure accordion content area allows dropdown overflow */
523
+ .gradio-accordion .wrap {
524
+ overflow: visible !important;
525
+ }
526
+
527
+ .gradio-accordion > .label-wrap {
528
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.98) 0%, rgba(240, 249, 255, 0.95) 100%);
529
+ border-bottom: 1px solid rgba(14, 165, 233, 0.2);
530
+ padding: 1rem 1.5rem;
531
+ font-weight: 600;
532
+ color: var(--text-primary);
533
+ }
534
+
535
+ /* Minimal dropdown styling - let Gradio handle positioning naturally */
536
+ #aspect_ratio_dropdown {
537
+ border-radius: 8px;
538
+ }
539
+
540
+ /* COMPLETELY REMOVE all dropdown styling - let Gradio handle everything */
541
+ /* This was causing the dropdown to display as a text block instead of options */
542
+
543
+ /* DO NOT style .gradio-dropdown globally - causes functionality issues */
544
+
545
+ /* Slider styling matching theme */
546
+ .gradio-slider {
547
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.98) 0%, rgba(240, 249, 255, 0.95) 100%) !important;
548
+ border: 1px solid rgba(14, 165, 233, 0.2) !important;
549
+ border-radius: 8px !important;
550
+ box-shadow: 0 2px 8px rgba(14, 165, 233, 0.1) !important;
551
+ padding: 12px !important;
552
+ }
553
+
554
+ .gradio-slider:hover {
555
+ border-color: rgba(14, 165, 233, 0.3) !important;
556
+ box-shadow: 0 2px 8px rgba(14, 165, 233, 0.15) !important;
557
+ }
558
+
559
+ /* Slider input styling */
560
+ .gradio-slider input[type="range"] {
561
+ background: transparent !important;
562
+ }
563
+
564
+ .gradio-slider input[type="range"]::-webkit-slider-track {
565
+ background: rgba(14, 165, 233, 0.2) !important;
566
+ border-radius: 4px !important;
567
+ }
568
+
569
+ .gradio-slider input[type="range"]::-webkit-slider-thumb {
570
+ background: var(--primary-blue) !important;
571
+ border: 2px solid white !important;
572
+ box-shadow: 0 2px 8px rgba(14, 165, 233, 0.3) !important;
573
+ }
574
+
575
+ /* Checkbox styling matching theme */
576
+ .gradio-checkbox {
577
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.98) 0%, rgba(240, 249, 255, 0.95) 100%) !important;
578
+ border: 1px solid rgba(14, 165, 233, 0.2) !important;
579
+ border-radius: 8px !important;
580
+ box-shadow: 0 2px 8px rgba(14, 165, 233, 0.1) !important;
581
+ padding: 8px 12px !important;
582
+ }
583
+
584
+ /* Specific styling for identified components */
585
+ #aspect_ratio_dropdown,
586
+ #text_prompt,
587
+ #move_to_center_checkbox,
588
+ #use_bg_preservation_checkbox {
589
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.98) 0%, rgba(240, 249, 255, 0.95) 100%) !important;
590
+ border: 1px solid rgba(14, 165, 233, 0.2) !important;
591
+ border-radius: 8px !important;
592
+ box-shadow: 0 2px 8px rgba(14, 165, 233, 0.1) !important;
593
+ }
594
+
595
+ /* Removed specific aspect_ratio_dropdown styling to avoid conflicts */
596
+
597
+ #aspect_ratio_dropdown:hover,
598
+ #text_prompt:hover,
599
+ #move_to_center_checkbox:hover,
600
+ #use_bg_preservation_checkbox:hover {
601
+ border-color: rgba(14, 165, 233, 0.3) !important;
602
+ box-shadow: 0 2px 8px rgba(14, 165, 233, 0.15) !important;
603
+ }
604
+
605
+ /* Textbox specific styling */
606
+ #text_prompt textarea {
607
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.98) 0%, rgba(240, 249, 255, 0.95) 100%) !important;
608
+ border: 1px solid rgba(14, 165, 233, 0.2) !important;
609
+ border-radius: 8px !important;
610
+ box-shadow: 0 2px 8px rgba(14, 165, 233, 0.1) !important;
611
+ }
612
+
613
+ /* Color variables matching getting_started section exactly */
614
+ :root {
615
+ /* Primary colors from getting_started */
616
+ --primary-blue: #0ea5e9;
617
+ --primary-blue-secondary: #06b6d4;
618
+ --primary-green: #10b981;
619
+ --primary-gradient: linear-gradient(135deg, #0ea5e9 0%, #06b6d4 50%, #10b981 100%);
620
+
621
+ /* Mode-specific colors from getting_started */
622
+ --position-aware-blue: #3b82f6;
623
+ --position-free-purple: #8b5cf6;
624
+
625
+ /* Badge colors from getting_started */
626
+ --badge-blue-bg: #dbeafe;
627
+ --badge-blue-text: #1e40af;
628
+ --badge-green-bg: #dcfce7;
629
+ --badge-green-text: #166534;
630
+
631
+ /* Neutral colors from getting_started */
632
+ --neutral-50: #f8fafc;
633
+ --neutral-100: #f1f5f9;
634
+ --neutral-200: #e2e8f0;
635
+ --neutral-300: #cbd5e1;
636
+ --neutral-400: #94a3b8;
637
+ --neutral-500: #64748b;
638
+ --neutral-600: #475569;
639
+ --neutral-700: #334155;
640
+ --neutral-800: #1e293b;
641
+
642
+ /* Text colors from getting_started */
643
+ --text-primary: #1e293b;
644
+ --text-secondary: #4b5563;
645
+ --text-muted: #64748b;
646
+ --text-tips: #475569;
647
+ --text-final: #0c4a6e;
648
+
649
+ /* Background colors from getting_started */
650
+ --bg-primary: white;
651
+ --bg-secondary: #f8fafc;
652
+ --bg-tips: #f1f5f9;
653
+ --bg-final: linear-gradient(135deg, #f0f9ff 0%, #e0f2fe 100%);
654
+
655
+ /* Keyboard hint styles from getting_started */
656
+ --kbd-bg: #e2e8f0;
657
+ --kbd-text: #475569;
658
+ --kbd-border: #cbd5e1;
659
+ }
660
+
661
+ /* Global smooth transitions - exclude dropdowns */
662
+ *:not(.gradio-dropdown):not(.gradio-dropdown *) {
663
+ transition: all 0.2s ease;
664
+ }
665
+
666
+ /* Focus states using getting_started primary blue - exclude dropdowns */
667
+ button:focus,
668
+ input:not(.gradio-dropdown input):focus,
669
+ select:not(.gradio-dropdown select):focus,
670
+ textarea:not(.gradio-dropdown textarea):focus {
671
+ outline: none;
672
+ box-shadow: 0 0 0 2px rgba(14, 165, 233, 0.3);
673
+ }
674
+
675
+ /* Subtle hover effects for interactive elements - exclude dropdowns */
676
+ button:not(.gradio-dropdown button):hover {
677
+ transform: translateY(-1px);
678
+ }
679
+
680
+ /* Global text styling matching getting_started */
681
+ body {
682
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
683
+ line-height: 1.6;
684
+ color: var(--text-secondary);
685
+ background-color: var(--bg-secondary);
686
+ }
687
+
688
+ /* Enhanced form element styling - exclude dropdowns from global styling */
689
+ input:not(.gradio-dropdown input),
690
+ textarea:not(.gradio-dropdown textarea),
691
+ select:not(.gradio-dropdown select) {
692
+ border-radius: 8px;
693
+ border: 1px solid rgba(14, 165, 233, 0.2);
694
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.98) 0%, rgba(240, 249, 255, 0.95) 100%);
695
+ backdrop-filter: blur(8px);
696
+ -webkit-backdrop-filter: blur(8px);
697
+ color: var(--text-primary);
698
+ transition: all 0.3s ease;
699
+ padding: 12px 16px;
700
+ font-size: 0.95rem;
701
+ box-shadow: 0 2px 8px rgba(14, 165, 233, 0.1);
702
+ }
703
+
704
+ input:not(.gradio-dropdown input):focus,
705
+ textarea:not(.gradio-dropdown textarea):focus,
706
+ select:not(.gradio-dropdown select):focus {
707
+ border-color: var(--primary-blue);
708
+ box-shadow: 0 0 0 3px rgba(14, 165, 233, 0.1), 0 2px 8px rgba(14, 165, 233, 0.15);
709
+ background: linear-gradient(135deg, rgba(255, 255, 255, 1) 0%, rgba(240, 249, 255, 0.98) 100%);
710
+ outline: none;
711
+ transform: translateY(-1px);
712
+ }
713
+
714
+ input:not(.gradio-dropdown input):hover,
715
+ textarea:not(.gradio-dropdown textarea):hover,
716
+ select:not(.gradio-dropdown select):hover {
717
+ border-color: rgba(14, 165, 233, 0.3);
718
+ box-shadow: 0 2px 8px rgba(14, 165, 233, 0.12);
719
+ }
720
+
721
+ /* Textbox specific styling */
722
+ .gradio-textbox {
723
+ border-radius: 12px;
724
+ overflow: hidden;
725
+ }
726
+
727
+ .gradio-textbox textarea {
728
+ border-radius: 12px;
729
+ resize: vertical;
730
+ min-height: 44px;
731
+ }
732
+
733
+ /* Scrollbar styling matching getting_started */
734
+ ::-webkit-scrollbar {
735
+ width: 8px;
736
+ }
737
+
738
+ ::-webkit-scrollbar-track {
739
+ background: var(--neutral-100);
740
+ border-radius: 4px;
741
+ }
742
+
743
+ ::-webkit-scrollbar-thumb {
744
+ background: var(--neutral-400);
745
+ border-radius: 4px;
746
+ }
747
+
748
+ ::-webkit-scrollbar-thumb:hover {
749
+ background: var(--primary-blue);
750
+ }
751
+
752
+ /* Enhanced button styling with Apple-style refinement */
753
+ button {
754
+ border-radius: 8px;
755
+ font-weight: 500;
756
+ cursor: pointer;
757
+ transition: all 0.3s ease;
758
+ border: 1px solid var(--neutral-200);
759
+ position: relative;
760
+ overflow: hidden;
761
+ backdrop-filter: blur(8px);
762
+ -webkit-backdrop-filter: blur(8px);
763
+ }
764
+
765
+ /* Button hover glow effect */
766
+ button::after {
767
+ content: '';
768
+ position: absolute;
769
+ top: 50%;
770
+ left: 50%;
771
+ width: 0;
772
+ height: 0;
773
+ background: radial-gradient(circle, rgba(14, 165, 233, 0.1) 0%, transparent 70%);
774
+ transition: all 0.4s ease;
775
+ transform: translate(-50%, -50%);
776
+ pointer-events: none;
777
+ }
778
+
779
+ button:hover::after {
780
+ width: 200px;
781
+ height: 200px;
782
+ }
783
+
784
+ /* Primary button using unified primary blue */
785
+ button[variant="primary"] {
786
+ background: var(--primary-blue);
787
+ border-color: var(--primary-blue);
788
+ color: white;
789
+ box-shadow: 0 2px 6px rgba(14, 165, 233, 0.2);
790
+ }
791
+
792
+ button[variant="primary"]:hover {
793
+ background: #0284c7;
794
+ border-color: #0284c7;
795
+ box-shadow: 0 4px 12px rgba(14, 165, 233, 0.3);
796
+ transform: translateY(-1px);
797
+ }
798
+
799
+ /* Secondary buttons */
800
+ button[variant="secondary"], .secondary-button {
801
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.98) 0%, rgba(240, 249, 255, 0.95) 100%);
802
+ border: 1px solid rgba(14, 165, 233, 0.2);
803
+ color: var(--text-secondary);
804
+ box-shadow: 0 2px 8px rgba(14, 165, 233, 0.1);
805
+ }
806
+
807
+ button[variant="secondary"]:hover, .secondary-button:hover {
808
+ background: var(--primary-blue);
809
+ border-color: var(--primary-blue);
810
+ color: white;
811
+ transform: translateY(-1px);
812
+ box-shadow: 0 4px 12px rgba(14, 165, 233, 0.25);
813
+ }
814
+
815
+ /* VLM buttons with subtle, elegant styling */
816
+ #vlm_generate_btn, #vlm_polish_btn {
817
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.98) 0%, rgba(240, 249, 255, 0.95) 100%) !important;
818
+ border: 1px solid rgba(14, 165, 233, 0.2) !important;
819
+ color: var(--text-secondary) !important;
820
+ box-shadow: 0 2px 8px rgba(14, 165, 233, 0.1);
821
+ font-weight: 500;
822
+ border-radius: 8px;
823
+ position: relative;
824
+ overflow: hidden;
825
+ transition: all 0.3s ease;
826
+ backdrop-filter: blur(8px);
827
+ -webkit-backdrop-filter: blur(8px);
828
+ }
829
+
830
+ #vlm_generate_btn::before, #vlm_polish_btn::before {
831
+ content: '';
832
+ position: absolute;
833
+ top: 0;
834
+ left: -100%;
835
+ width: 100%;
836
+ height: 100%;
837
+ background: linear-gradient(90deg, transparent, rgba(14, 165, 233, 0.1), transparent);
838
+ transition: left 0.5s ease;
839
+ }
840
+
841
+ #vlm_generate_btn:hover::before, #vlm_polish_btn:hover::before {
842
+ left: 100%;
843
+ }
844
+
845
+ #vlm_generate_btn:hover, #vlm_polish_btn:hover {
846
+ background: var(--primary-blue) !important;
847
+ border-color: var(--primary-blue) !important;
848
+ color: white !important;
849
+ box-shadow: 0 4px 12px rgba(14, 165, 233, 0.25);
850
+ transform: translateY(-1px);
851
+ }
852
+
853
+ #vlm_generate_btn:active, #vlm_polish_btn:active {
854
+ transform: translateY(0px);
855
+ box-shadow: 0 2px 6px rgba(14, 165, 233, 0.2);
856
+ }
857
+
858
+ /* Enhanced image styling with fixed dimensions for consistency */
859
+ .gradio-image, .gradio-imageeditor {
860
+ height: 300px !important;
861
+ width: 100% !important;
862
+ padding: 0 !important;
863
+ margin: 0 !important;
864
+ }
865
+
866
+ .gradio-image img,
867
+ .gradio-imageeditor img {
868
+ height: 300px !important;
869
+ width: 100% !important;
870
+ object-fit: contain !important;
871
+ border-radius: 8px;
872
+ transition: all 0.3s ease;
873
+ box-shadow: 0 2px 8px rgba(15, 23, 42, 0.1);
874
+ }
875
+
876
+ .gradio-image img:hover,
877
+ .gradio-imageeditor img:hover {
878
+ transform: scale(1.02);
879
+ box-shadow: 0 4px 16px rgba(15, 23, 42, 0.15);
880
+ }
881
+
882
+ /* Gallery CSS - contained adaptive layout with theme colors */
883
+ #mask_gallery, #result_gallery, .custom-gallery {
884
+ overflow: visible !important; /* Allow progress indicator to show */
885
+ position: relative !important;
886
+ width: 100% !important;
887
+ height: auto !important;
888
+ max-height: 75vh !important;
889
+ min-height: 300px !important;
890
+ display: flex !important;
891
+ flex-direction: column !important;
892
+ padding: 6px !important;
893
+ margin: 0 !important;
894
+ border-radius: 12px !important;
895
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.98) 0%, rgba(240, 249, 255, 0.95) 100%) !important;
896
+ border: 1px solid rgba(14, 165, 233, 0.2) !important;
897
+ box-shadow: 0 2px 8px rgba(14, 165, 233, 0.1) !important;
898
+ }
899
+
900
+ /* Gallery containers with contained but flexible display */
901
+ #mask_gallery .gradio-gallery, #result_gallery .gradio-gallery {
902
+ width: 100% !important;
903
+ height: auto !important;
904
+ min-height: 280px !important;
905
+ max-height: 70vh !important;
906
+ padding: 8px !important;
907
+ overflow: auto !important;
908
+ display: flex !important;
909
+ flex-direction: column !important;
910
+ border-radius: 8px !important;
911
+ }
912
+
913
+ /* Only hide specific duplicate elements that cause the problem */
914
+ #mask_gallery > div > div:nth-child(n+2),
915
+ #result_gallery > div > div:nth-child(n+2) {
916
+ display: none !important;
917
+ }
918
+
919
+ /* Alternative: hide duplicate grid structures only */
920
+ #mask_gallery .gradio-gallery:nth-child(n+2),
921
+ #result_gallery .gradio-gallery:nth-child(n+2) {
922
+ display: none !important;
923
+ }
924
+
925
+ /* Ensure timing and status elements are NOT hidden by the above rules */
926
+ #result_gallery .status,
927
+ #result_gallery .timer,
928
+ #result_gallery [class*="time"],
929
+ #result_gallery [class*="status"],
930
+ #result_gallery [class*="duration"],
931
+ #result_gallery .gradio-status,
932
+ #result_gallery .gradio-timer,
933
+ #result_gallery .gradio-info,
934
+ #result_gallery [data-testid*="timer"],
935
+ #result_gallery [data-testid*="status"] {
936
+ display: block !important;
937
+ visibility: visible !important;
938
+ opacity: 1 !important;
939
+ position: relative !important;
940
+ z-index: 1000 !important;
941
+ }
942
+
943
+ /* Gallery images - contained adaptive display */
944
+ #mask_gallery img, #result_gallery img {
945
+ width: 100% !important;
946
+ height: auto !important;
947
+ max-width: 100% !important;
948
+ max-height: 60vh !important;
949
+ object-fit: contain !important;
950
+ border-radius: 8px;
951
+ box-shadow: 0 2px 8px rgba(15, 23, 42, 0.1);
952
+ display: block !important;
953
+ margin: 0 auto !important;
954
+ }
955
+
956
+ /* Main preview image styling - contained but responsive */
957
+ #mask_gallery .preview-image, #result_gallery .preview-image {
958
+ width: 100% !important;
959
+ height: auto !important;
960
+ max-width: 100% !important;
961
+ max-height: 55vh !important;
962
+ border-radius: 12px;
963
+ box-shadow: 0 4px 16px rgba(15, 23, 42, 0.15);
964
+ object-fit: contain !important;
965
+ display: block !important;
966
+ margin: 0 auto !important;
967
+ }
968
+
969
+ /* Gallery content wrappers - ensure no height constraints */
970
+ #mask_gallery .gradio-gallery > div,
971
+ #result_gallery .gradio-gallery > div {
972
+ width: 100% !important;
973
+ height: auto !important;
974
+ min-height: auto !important;
975
+ max-height: none !important;
976
+ overflow: visible !important;
977
+ }
978
+
979
+ /* Gallery image containers - remove any height limits */
980
+ #mask_gallery .image-container,
981
+ #result_gallery .image-container,
982
+ #mask_gallery [data-testid="image"],
983
+ #result_gallery [data-testid="image"] {
984
+ width: 100% !important;
985
+ height: auto !important;
986
+ max-height: none !important;
987
+ overflow: visible !important;
988
+ }
989
+
990
+ /* Controlled gallery wrapper elements */
991
+ #mask_gallery .image-wrapper,
992
+ #result_gallery .image-wrapper {
993
+ max-height: 60vh !important;
994
+ overflow: hidden !important;
995
+ }
996
+
997
+ /* Specific targeting for Gradio's internal gallery elements */
998
+ #mask_gallery .grid-wrap,
999
+ #result_gallery .grid-wrap,
1000
+ #mask_gallery .preview-wrap,
1001
+ #result_gallery .preview-wrap {
1002
+ height: auto !important;
1003
+ max-height: 65vh !important;
1004
+ overflow: auto !important;
1005
+ border-radius: 8px !important;
1006
+ }
1007
+
1008
+ /* Ensure gallery grids are properly sized within container */
1009
+ #mask_gallery .grid,
1010
+ #result_gallery .grid {
1011
+ height: auto !important;
1012
+ max-height: 60vh !important;
1013
+ display: grid !important;
1014
+ grid-template-columns: repeat(auto-fit, minmax(120px, 1fr)) !important;
1015
+ gap: 8px !important;
1016
+ align-items: start !important;
1017
+ overflow: auto !important;
1018
+ padding: 4px !important;
1019
+ }
1020
+
1021
+ /* Custom scrollbar for gallery */
1022
+ #mask_gallery .gradio-gallery::-webkit-scrollbar,
1023
+ #result_gallery .gradio-gallery::-webkit-scrollbar,
1024
+ #mask_gallery .grid::-webkit-scrollbar,
1025
+ #result_gallery .grid::-webkit-scrollbar {
1026
+ width: 6px;
1027
+ height: 6px;
1028
+ }
1029
+
1030
+ #mask_gallery .gradio-gallery::-webkit-scrollbar-track,
1031
+ #result_gallery .gradio-gallery::-webkit-scrollbar-track,
1032
+ #mask_gallery .grid::-webkit-scrollbar-track,
1033
+ #result_gallery .grid::-webkit-scrollbar-track {
1034
+ background: rgba(0, 0, 0, 0.1);
1035
+ border-radius: 3px;
1036
+ }
1037
+
1038
+ #mask_gallery .gradio-gallery::-webkit-scrollbar-thumb,
1039
+ #result_gallery .gradio-gallery::-webkit-scrollbar-thumb,
1040
+ #mask_gallery .grid::-webkit-scrollbar-thumb,
1041
+ #result_gallery .grid::-webkit-scrollbar-thumb {
1042
+ background: var(--neutral-400);
1043
+ border-radius: 3px;
1044
+ }
1045
+
1046
+ #mask_gallery .gradio-gallery::-webkit-scrollbar-thumb:hover,
1047
+ #result_gallery .gradio-gallery::-webkit-scrollbar-thumb:hover,
1048
+ #mask_gallery .grid::-webkit-scrollbar-thumb:hover,
1049
+ #result_gallery .grid::-webkit-scrollbar-thumb:hover {
1050
+ background: var(--primary-blue);
1051
+ }
1052
+
1053
+ /* Thumbnail navigation styling in preview mode */
1054
+ #mask_gallery .thumbnail, #result_gallery .thumbnail {
1055
+ opacity: 0.7;
1056
+ transition: opacity 0.3s ease;
1057
+ border-radius: 6px;
1058
+ }
1059
+
1060
+ #mask_gallery .thumbnail:hover, #result_gallery .thumbnail:hover,
1061
+ #mask_gallery .thumbnail.selected, #result_gallery .thumbnail.selected {
1062
+ opacity: 1;
1063
+ box-shadow: 0 2px 8px rgba(14, 165, 233, 0.3);
1064
+ }
1065
+
1066
+ /* Improved layout spacing and organization */
1067
+ #glass_card .gradio-row {
1068
+ gap: 16px !important;
1069
+ margin-bottom: 6px !important;
1070
+ }
1071
+
1072
+ #glass_card .gradio-column {
1073
+ gap: 10px !important;
1074
+ }
1075
+
1076
+ /* Better section spacing with theme colors */
1077
+ #glass_card .gradio-group {
1078
+ margin-bottom: 6px !important;
1079
+ padding: 10px !important;
1080
+ border-radius: 8px !important;
1081
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.95) 0%, rgba(240, 249, 255, 0.9) 100%) !important;
1082
+ border: 1px solid rgba(14, 165, 233, 0.15) !important;
1083
+ box-shadow: 0 1px 3px rgba(14, 165, 233, 0.05) !important;
1084
+ transition: all 0.3s ease !important;
1085
+ overflow: visible !important; /* Allow dropdown to overflow */
1086
+ }
1087
+
1088
+ #glass_card .gradio-group:hover {
1089
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.98) 0%, rgba(240, 249, 255, 0.95) 100%) !important;
1090
+ border-color: rgba(14, 165, 233, 0.25) !important;
1091
+ box-shadow: 0 2px 8px rgba(14, 165, 233, 0.12) !important;
1092
+ /* transform removed to prevent layout shift that hides dropdown */
1093
+ }
1094
+
1095
+ /* Enhanced button styling for improved UX */
1096
+ button[variant="secondary"] {
1097
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.98) 0%, rgba(240, 249, 255, 0.95) 100%) !important;
1098
+ border: 1px solid rgba(14, 165, 233, 0.2) !important;
1099
+ color: var(--text-secondary) !important;
1100
+ box-shadow: 0 2px 8px rgba(14, 165, 233, 0.1) !important;
1101
+ transition: all 0.3s ease !important;
1102
+ }
1103
+
1104
+ button[variant="secondary"]:hover {
1105
+ background: var(--primary-blue) !important;
1106
+ border-color: var(--primary-blue) !important;
1107
+ color: white !important;
1108
+ transform: translateY(-1px) !important;
1109
+ box-shadow: 0 4px 12px rgba(14, 165, 233, 0.25) !important;
1110
+ }
1111
+
1112
+ /* Markdown header improvements */
1113
+ .gradio-markdown h1, .gradio-markdown h2, .gradio-markdown h3 {
1114
+ color: var(--text-primary) !important;
1115
+ font-weight: 600 !important;
1116
+ margin-bottom: 8px !important;
1117
+ margin-top: 4px !important;
1118
+ }
1119
+
1120
+ /* Radio button container improvements */
1121
+ #customization_mode_radio, #input_mask_mode_radio, #seg_ref_mode_radio {
1122
+ margin-bottom: 0px !important;
1123
+ margin-top: 0px !important;
1124
+ }
1125
+
1126
+ /* Reduce space between markdown headers and subsequent components */
1127
+ .gradio-markdown + .gradio-group {
1128
+ margin-top: 1px !important;
1129
+ }
1130
+
1131
+ .gradio-markdown + .gradio-image,
1132
+ .gradio-markdown + .gradio-imageeditor,
1133
+ .gradio-markdown + .gradio-textbox,
1134
+ .gradio-markdown + .gradio-gallery {
1135
+ margin-top: 1px !important;
1136
+ }
1137
+
1138
+ /* Specific spacing adjustments for numbered sections */
1139
+ .gradio-markdown:has(h1), .gradio-markdown:has(h2), .gradio-markdown:has(h3) {
1140
+ margin-bottom: 2px !important;
1141
+ }
1142
+
1143
+ /* Remove padding from image and gallery containers */
1144
+ .gradio-image, .gradio-imageeditor, .gradio-gallery {
1145
+ padding: 0 !important;
1146
+ margin: 0 !important;
1147
+ }
1148
+
1149
+ /* Image container styling with theme colors */
1150
+ .gradio-image, .gradio-imageeditor {
1151
+ border-radius: 12px;
1152
+ overflow: hidden;
1153
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.98) 0%, rgba(240, 249, 255, 0.95) 100%) !important;
1154
+ border: 1px solid rgba(14, 165, 233, 0.2) !important;
1155
+ box-shadow: 0 2px 8px rgba(14, 165, 233, 0.1) !important;
1156
+ transition: all 0.3s ease;
1157
+ }
1158
+
1159
+ .gradio-image:hover, .gradio-imageeditor:hover {
1160
+ border-color: var(--primary-blue) !important;
1161
+ box-shadow: 0 4px 16px rgba(14, 165, 233, 0.15) !important;
1162
+ transform: translateY(-1px);
1163
+ }
1164
+
1165
+ /* Image upload area styling */
1166
+ .gradio-image .upload-container,
1167
+ .gradio-imageeditor .upload-container,
1168
+ .gradio-image > div,
1169
+ .gradio-imageeditor > div {
1170
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.98) 0%, rgba(240, 249, 255, 0.95) 100%) !important;
1171
+ border: 1px solid rgba(14, 165, 233, 0.2) !important;
1172
+ border-radius: 12px !important;
1173
+ }
1174
+
1175
+ /* Image upload placeholder styling */
1176
+ .gradio-image .upload-text,
1177
+ .gradio-imageeditor .upload-text,
1178
+ .gradio-image [data-testid="upload-text"],
1179
+ .gradio-imageeditor [data-testid="upload-text"] {
1180
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.98) 0%, rgba(240, 249, 255, 0.95) 100%) !important;
1181
+ color: var(--text-secondary) !important;
1182
+ }
1183
+
1184
+ /* Image preview area */
1185
+ .gradio-image .image-container,
1186
+ .gradio-imageeditor .image-container,
1187
+ .gradio-image .preview-container,
1188
+ .gradio-imageeditor .preview-container {
1189
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.98) 0%, rgba(240, 249, 255, 0.95) 100%) !important;
1190
+ border-radius: 12px !important;
1191
+ }
1192
+
1193
+ /* Specific targeting for image upload areas */
1194
+ .gradio-image .wrap,
1195
+ .gradio-imageeditor .wrap,
1196
+ .gradio-image .block,
1197
+ .gradio-imageeditor .block {
1198
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.98) 0%, rgba(240, 249, 255, 0.95) 100%) !important;
1199
+ border: 1px solid rgba(14, 165, 233, 0.2) !important;
1200
+ border-radius: 12px !important;
1201
+ }
1202
+
1203
+ /* Image drop zone styling */
1204
+ .gradio-image .drop-zone,
1205
+ .gradio-imageeditor .drop-zone,
1206
+ .gradio-image .upload-area,
1207
+ .gradio-imageeditor .upload-area {
1208
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.98) 0%, rgba(240, 249, 255, 0.95) 100%) !important;
1209
+ border: 2px dashed rgba(14, 165, 233, 0.3) !important;
1210
+ border-radius: 12px !important;
1211
+ }
1212
+
1213
+ /* Force override any white backgrounds in image components */
1214
+ .gradio-image *,
1215
+ .gradio-imageeditor * {
1216
+ background-color: transparent !important;
1217
+ }
1218
+
1219
+ .gradio-image .gradio-image,
1220
+ .gradio-imageeditor .gradio-imageeditor {
1221
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.98) 0%, rgba(240, 249, 255, 0.95) 100%) !important;
1222
+ }
1223
+
1224
+ /* Specific styling for Reference Image and Target Images */
1225
+ #reference_image,
1226
+ #target_image_1,
1227
+ #target_image_2 {
1228
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.98) 0%, rgba(240, 249, 255, 0.95) 100%) !important;
1229
+ border: 1px solid rgba(14, 165, 233, 0.2) !important;
1230
+ box-shadow: 0 2px 8px rgba(14, 165, 233, 0.1) !important;
1231
+ border-radius: 12px !important;
1232
+ }
1233
+
1234
+ #reference_image *,
1235
+ #target_image_1 *,
1236
+ #target_image_2 * {
1237
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.98) 0%, rgba(240, 249, 255, 0.95) 100%) !important;
1238
+ border-radius: 12px !important;
1239
+ }
1240
+
1241
+ /* Upload area for specific image components */
1242
+ #reference_image .upload-container,
1243
+ #target_image_1 .upload-container,
1244
+ #target_image_2 .upload-container,
1245
+ #reference_image .drop-zone,
1246
+ #target_image_1 .drop-zone,
1247
+ #target_image_2 .drop-zone {
1248
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.98) 0%, rgba(240, 249, 255, 0.95) 100%) !important;
1249
+ border: 2px dashed rgba(14, 165, 233, 0.3) !important;
1250
+ border-radius: 12px !important;
1251
+ }
1252
+
1253
+ /* Hover effects for specific image components */
1254
+ #reference_image:hover,
1255
+ #target_image_1:hover,
1256
+ #target_image_2:hover {
1257
+ border-color: var(--primary-blue) !important;
1258
+ box-shadow: 0 4px 16px rgba(14, 165, 233, 0.15) !important;
1259
+ transform: translateY(-1px);
1260
+ }
1261
+
1262
+ /* Group styling matching getting_started white cards */
1263
+ .group, .gradio-group {
1264
+ border-radius: 8px;
1265
+ background: var(--bg-primary);
1266
+ border: 1px solid var(--neutral-200);
1267
+ box-shadow: 0 1px 3px rgba(0, 0, 0, 0.05);
1268
+ }
1269
+
1270
+ /* Subtle page background with theme colors */
1271
+ body, .gradio-container {
1272
+ background: linear-gradient(135deg, #f8fafc 0%, #f0f9ff 50%, #f8fafc 100%);
1273
+ min-height: 100vh;
1274
+ }
1275
+
1276
+ /* Global glass container with subtle Apple-style gradient */
1277
+ #global_glass_container {
1278
+ position: relative;
1279
+ border-radius: 20px;
1280
+ padding: 16px;
1281
+ margin: 12px auto;
1282
+ max-width: 1400px;
1283
+ background: linear-gradient(145deg,
1284
+ rgba(248, 250, 252, 0.98),
1285
+ rgba(241, 245, 249, 0.95));
1286
+ box-shadow:
1287
+ 0 20px 40px rgba(15, 23, 42, 0.08),
1288
+ 0 8px 24px rgba(15, 23, 42, 0.06),
1289
+ inset 0 1px 0 rgba(255, 255, 255, 0.9);
1290
+ border: 1px solid rgba(226, 232, 240, 0.7);
1291
+ transition: all 0.3s ease;
1292
+ overflow: visible !important; /* Allow dropdown to overflow */
1293
+ }
1294
+
1295
+ /* Subtle gradient overlay for Apple effect */
1296
+ #global_glass_container::after {
1297
+ content: '';
1298
+ position: absolute;
1299
+ top: 0;
1300
+ left: 0;
1301
+ right: 0;
1302
+ height: 250px;
1303
+ background: linear-gradient(135deg,
1304
+ rgba(14, 165, 233, 0.08) 0%,
1305
+ rgba(6, 182, 212, 0.06) 25%,
1306
+ rgba(16, 185, 129, 0.08) 50%,
1307
+ rgba(139, 92, 246, 0.06) 75%,
1308
+ rgba(14, 165, 233, 0.08) 100%);
1309
+ background-size: 300% 300%;
1310
+ animation: subtleGradientShift 15s ease-in-out infinite;
1311
+ pointer-events: none;
1312
+ z-index: 0;
1313
+ }
1314
+
1315
+ @keyframes subtleGradientShift {
1316
+ 0%, 100% {
1317
+ background-position: 0% 50%;
1318
+ opacity: 0.8;
1319
+ }
1320
+ 50% {
1321
+ background-position: 100% 50%;
1322
+ opacity: 1;
1323
+ }
1324
+ }
1325
+
1326
+ /* Ensure content is above the gradient overlay */
1327
+ #global_glass_container > * {
1328
+ position: relative;
1329
+ z-index: 1;
1330
+ }
1331
+
1332
+ /* Hover effect for global container - transform disabled to avoid dropdown reposition */
1333
+ #global_glass_container:hover {
1334
+ /* transform: translateY(-2px); */
1335
+ box-shadow:
1336
+ 0 25px 50px rgba(15, 23, 42, 0.08),
1337
+ 0 12px 30px rgba(15, 23, 42, 0.06),
1338
+ inset 0 1px 0 rgba(255, 255, 255, 0.9);
1339
+ border-color: rgba(226, 232, 240, 0.8);
1340
+ }
1341
+
1342
+ /* Subtle border highlight for global container */
1343
+ #global_glass_container::before {
1344
+ content: "";
1345
+ position: absolute;
1346
+ inset: 0;
1347
+ border-radius: 20px;
1348
+ padding: 1px;
1349
+ background: linear-gradient(135deg,
1350
+ rgba(255, 255, 255, 0.8),
1351
+ rgba(226, 232, 240, 0.4),
1352
+ rgba(255, 255, 255, 0.6),
1353
+ rgba(226, 232, 240, 0.3)
1354
+ );
1355
+ -webkit-mask:
1356
+ linear-gradient(#fff 0 0) content-box,
1357
+ linear-gradient(#fff 0 0);
1358
+ -webkit-mask-composite: xor;
1359
+ mask-composite: exclude;
1360
+ pointer-events: none;
1361
+ z-index: 0;
1362
+ }
1363
+
1364
+ /* Inner glassmorphism container with theme colors */
1365
+ #glass_card {
1366
+ position: relative;
1367
+ border-radius: 16px;
1368
+ padding: 16px;
1369
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.6) 0%, rgba(240, 249, 255, 0.5) 100%);
1370
+ box-shadow:
1371
+ 0 8px 24px rgba(14, 165, 233, 0.08),
1372
+ inset 0 1px 0 rgba(255, 255, 255, 0.7);
1373
+ border: 1px solid rgba(14, 165, 233, 0.2);
1374
+ margin-bottom: 12px;
1375
+ transition: all 0.3s ease;
1376
+ overflow: visible !important; /* Allow dropdown to overflow */
1377
+ }
1378
+
1379
+ #glass_card:hover {
1380
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.7) 0%, rgba(240, 249, 255, 0.6) 100%);
1381
+ border-color: rgba(14, 165, 233, 0.3);
1382
+ box-shadow:
1383
+ 0 12px 32px rgba(14, 165, 233, 0.12),
1384
+ inset 0 1px 0 rgba(255, 255, 255, 0.8);
1385
+ }
1386
+
1387
+ /* Subtle inner border gradient for liquid glass feel */
1388
+ #glass_card::before {
1389
+ content: "";
1390
+ position: absolute;
1391
+ inset: 0;
1392
+ border-radius: 16px;
1393
+ padding: 1px;
1394
+ background: linear-gradient(135deg,
1395
+ rgba(255, 255, 255, 0.6),
1396
+ rgba(226, 232, 240, 0.2));
1397
+ -webkit-mask:
1398
+ linear-gradient(#fff 0 0) content-box,
1399
+ linear-gradient(#fff 0 0);
1400
+ -webkit-mask-composite: xor;
1401
+ mask-composite: exclude;
1402
+ pointer-events: none;
1403
+ }
1404
+
1405
+ /* Preserve the airy layout inside the cards */
1406
+ #global_glass_container .gradio-column { gap: 12px; }
1407
+ #glass_card .gradio-row { gap: 16px; }
1408
+ #glass_card .gradio-column { gap: 12px; }
1409
+ #glass_card .gradio-group { margin: 8px 0; }
1410
+
1411
+ /* Text selection matching getting_started colors */
1412
+ ::selection {
1413
+ background: var(--badge-blue-bg);
1414
+ color: var(--badge-blue-text);
1415
+ }
1416
+
1417
+ /* Placeholder styling */
1418
+ ::placeholder {
1419
+ color: var(--text-muted);
1420
+ opacity: 0.8;
1421
+ }
1422
+
1423
+ /* Improved error state styling */
1424
+ .error {
1425
+ border-color: #ef4444 !important;
1426
+ box-shadow: 0 0 0 2px rgba(239, 68, 68, 0.1) !important;
1427
+ }
1428
+
1429
+ /* Success state using getting_started green */
1430
+ .success-state {
1431
+ border-color: var(--primary-green) !important;
1432
+ box-shadow: 0 0 0 2px rgba(16, 185, 129, 0.1) !important;
1433
+ }
1434
+
1435
+
1436
+
1437
+
1438
+
1439
+
1440
+
1441
+
1442
+
1443
+
1444
+
1445
+ /* Label styling */
1446
+ .gradio-label {
1447
+ color: var(--text-primary);
1448
+ font-weight: 600;
1449
+ }
1450
+
1451
+ /* Markdown content styling */
1452
+ .markdown-body {
1453
+ color: var(--text-secondary);
1454
+ line-height: 1.6;
1455
+ }
1456
+
1457
+ .markdown-body h1, .markdown-body h2, .markdown-body h3 {
1458
+ color: var(--text-primary);
1459
+ }
1460
+
1461
+ /* Step indicators styling */
1462
+ .gradio-markdown h1, .gradio-markdown h2, .gradio-markdown h3,
1463
+ .gradio-markdown p {
1464
+ margin: 0.25rem 0;
1465
+ }
1466
+
1467
+ /* Enhanced step indicators with numbers */
1468
+ .gradio-markdown:contains("1."), .gradio-markdown:contains("2."),
1469
+ .gradio-markdown:contains("3."), .gradio-markdown:contains("4."),
1470
+ .gradio-markdown:contains("5."), .gradio-markdown:contains("6."),
1471
+ .gradio-markdown:contains("7.") {
1472
+ position: relative;
1473
+ padding-left: 2.5rem;
1474
+ color: var(--text-primary);
1475
+ font-weight: 600;
1476
+ }
1477
+
1478
+ /* Specific button styling */
1479
+ #undo_btnSEG, #dilate_btn, #erode_btn, #bounding_box_btn {
1480
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.98) 0%, rgba(240, 249, 255, 0.95) 100%);
1481
+ border: 1px solid rgba(14, 165, 233, 0.2);
1482
+ color: var(--text-secondary);
1483
+ font-weight: 500;
1484
+ padding: 8px 16px;
1485
+ border-radius: 6px;
1486
+ box-shadow: 0 2px 8px rgba(14, 165, 233, 0.1);
1487
+ transition: all 0.3s ease;
1488
+ }
1489
+
1490
+ #undo_btnSEG:hover, #dilate_btn:hover, #erode_btn:hover, #bounding_box_btn:hover {
1491
+ background: var(--primary-blue);
1492
+ border-color: var(--primary-blue);
1493
+ color: white;
1494
+ transform: translateY(-1px);
1495
+ box-shadow: 0 4px 12px rgba(14, 165, 233, 0.3);
1496
+ }
1497
+
1498
+ /* Submit button enhanced styling - unified with primary blue */
1499
+ button[variant="primary"], .gradio-button.primary {
1500
+ background: var(--primary-blue);
1501
+ border-color: var(--primary-blue);
1502
+ color: white;
1503
+ font-weight: 600;
1504
+ font-size: 1rem;
1505
+ padding: 12px 24px;
1506
+ box-shadow: 0 4px 16px rgba(14, 165, 233, 0.25);
1507
+ transition: all 0.3s ease;
1508
+ }
1509
+
1510
+ button[variant="primary"]:hover, .gradio-button.primary:hover {
1511
+ background: #0284c7;
1512
+ border-color: #0284c7;
1513
+ box-shadow: 0 6px 20px rgba(14, 165, 233, 0.4);
1514
+ transform: translateY(-2px);
1515
+ }
1516
+
1517
+ /* Improved button states */
1518
+ button:disabled {
1519
+ opacity: 0.5;
1520
+ cursor: not-allowed;
1521
+ transform: none !important;
1522
+ box-shadow: none !important;
1523
+ }
1524
+
1525
+ button:disabled::after {
1526
+ display: none;
1527
+ }
1528
+
1529
+ button.processing {
1530
+ background: var(--neutral-400) !important;
1531
+ border-color: var(--neutral-400) !important;
1532
+ cursor: wait;
1533
+ animation: processingPulse 2s ease-in-out infinite;
1534
+ }
1535
+
1536
+ @keyframes processingPulse {
1537
+ 0%, 100% { opacity: 0.8; }
1538
+ 50% { opacity: 1; }
1539
+ }
1540
+
1541
+ /* Responsive improvements */
1542
+ @media (max-width: 768px) {
1543
+ .header-content {
1544
+ padding: 1.2rem 1.8rem;
1545
+ margin: 0 1rem;
1546
+ }
1547
+
1548
+ .main-title {
1549
+ font-size: 2rem;
1550
+ }
1551
+
1552
+ .title-icon {
1553
+ font-size: 1.8rem;
1554
+ }
1555
+
1556
+ .subtitle {
1557
+ font-size: 0.9rem;
1558
+ }
1559
+
1560
+ .header-badges {
1561
+ gap: 0.6rem;
1562
+ }
1563
+
1564
+ .badge-link {
1565
+ padding: 0.4rem 0.8rem;
1566
+ font-size: 0.85rem;
1567
+ }
1568
+
1569
+ .header-badge {
1570
+ padding: 0.4rem 0.8rem;
1571
+ font-size: 0.85rem;
1572
+ }
1573
+
1574
+ /* Getting Started responsive */
1575
+ .getting-started-container {
1576
+ padding: 1rem;
1577
+ margin: 0 0.5rem;
1578
+ }
1579
+
1580
+ .guide-title {
1581
+ font-size: 1.1rem;
1582
+ }
1583
+
1584
+ .guide-subtitle {
1585
+ font-size: 0.85rem;
1586
+ }
1587
+
1588
+ .step-card {
1589
+ padding: 0.8rem;
1590
+ margin-bottom: 1rem;
1591
+ }
1592
+
1593
+ .step-header {
1594
+ font-size: 0.9rem;
1595
+ }
1596
+
1597
+ .step-number {
1598
+ width: 22px;
1599
+ height: 22px;
1600
+ font-size: 0.7rem;
1601
+ }
1602
+
1603
+ .step-list {
1604
+ font-size: 0.8rem;
1605
+ padding-left: 1rem;
1606
+ }
1607
+
1608
+ .tips-card {
1609
+ padding: 0.6rem;
1610
+ }
1611
+
1612
+ .tips-content {
1613
+ font-size: 0.75rem;
1614
+ }
1615
+
1616
+ .final-message {
1617
+ padding: 0.6rem;
1618
+ }
1619
+
1620
+ .final-text {
1621
+ font-size: 0.8rem;
1622
+ }
1623
+
1624
+ button {
1625
+ min-height: 44px;
1626
+ }
1627
+
1628
+ input, textarea, select {
1629
+ min-height: 44px;
1630
+ }
1631
+
1632
+ /* Mobile optimization for subtle effects */
1633
+ #global_glass_container {
1634
+ padding: 16px;
1635
+ margin: 8px;
1636
+ border-radius: 16px;
1637
+ }
1638
+
1639
+ #global_glass_container::after {
1640
+ height: 180px;
1641
+ animation-duration: 18s;
1642
+ }
1643
+
1644
+ #glass_card {
1645
+ padding: 20px;
1646
+ margin: 10px;
1647
+ border-radius: 12px;
1648
+ }
1649
+
1650
+ #glass_card .gradio-row { gap: 12px; }
1651
+ #glass_card .gradio-column { gap: 12px; }
1652
+
1653
+
1654
+ }
1655
+
1656
+ /* Ensure gallery works properly in all screen sizes */
1657
+ @media (min-width: 1200px) {
1658
+ #mask_gallery .gradio-gallery, #result_gallery .gradio-gallery {
1659
+ min-height: 300px !important;
1660
+ max-height: 80vh !important;
1661
+ }
1662
+
1663
+ .responsive-gallery .grid-container {
1664
+ grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)) !important;
1665
+ }
1666
+ }
1667
+
1668
+ /* Fix for gallery duplicate content issue - ensure clean display */
1669
+ #mask_gallery > div > div:nth-child(n+2),
1670
+ #result_gallery > div > div:nth-child(n+2) {
1671
+ display: none !important;
1672
+ }
1673
+
1674
+ #mask_gallery .gradio-gallery:nth-child(n+2),
1675
+ #result_gallery .gradio-gallery:nth-child(n+2) {
1676
+ display: none !important;
1677
+ }
1678
+
1679
+ """
app/ui_components.py ADDED
@@ -0,0 +1,354 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ UI components construction for IC-Custom application.
5
+ """
6
+ import gradio as gr
7
+ from constants import (
8
+ ASPECT_RATIO_LABELS,
9
+ DEFAULT_ASPECT_RATIO,
10
+ DEFAULT_BRUSH_SIZE
11
+ )
12
+
13
+
14
+ def create_theme():
15
+ """Create and configure the Gradio theme."""
16
+ theme = gr.themes.Ocean()
17
+ theme.set(
18
+ checkbox_label_background_fill_selected="*button_primary_background_fill",
19
+ checkbox_label_text_color_selected="*button_primary_text_color",
20
+ )
21
+ return theme
22
+
23
+
24
+ def create_css():
25
+ """Create custom CSS for the application."""
26
+ from stylesheets import get_css
27
+ return get_css()
28
+
29
+
30
+ def create_header_section():
31
+ """Create the header section with title and description."""
32
+ from metainfo import head, getting_started
33
+
34
+ with gr.Row():
35
+ gr.HTML(head)
36
+
37
+ with gr.Accordion(label="🚀 Getting Started:", open=True, elem_id="accordion"):
38
+ with gr.Row(equal_height=True):
39
+ gr.HTML(getting_started)
40
+
41
+
42
+ def create_customization_section():
43
+ """Create the customization mode selection section."""
44
+ with gr.Row():
45
+ # Add a note to remind users to click Clear before starting
46
+ md_custmization_mode = gr.Markdown(
47
+ "1. Select a Customization Mode\n\n*Tip: Please click the Clear button first to reset all states before starting a new task.*"
48
+ )
49
+ with gr.Row():
50
+ custmization_mode = gr.Radio(
51
+ ["Position-aware", "Position-free"],
52
+ value="Position-aware",
53
+ scale=1,
54
+ elem_id="customization_mode_radio",
55
+ show_label=False,
56
+ label="Customization Mode",
57
+ )
58
+ return custmization_mode, md_custmization_mode
59
+
60
+
61
+ def create_image_input_section():
62
+ """Create image input section optimized for left column layout."""
63
+ # Reference image section
64
+ md_image_reference = gr.Markdown("2. Input reference image")
65
+ with gr.Group():
66
+ image_reference = gr.Image(
67
+ label="Reference Image",
68
+ type="pil",
69
+ interactive=True,
70
+ height=320,
71
+ container=True,
72
+ elem_id="reference_image"
73
+ )
74
+
75
+ # Input mask mode selection
76
+ md_input_mask_mode = gr.Markdown("3. Select input mask mode")
77
+ with gr.Group():
78
+ input_mask_mode = gr.Radio(
79
+ ["Precise mask", "User-drawn mask"],
80
+ value="Precise mask",
81
+ elem_id="input_mask_mode_radio",
82
+ show_label=False,
83
+ label="Input Mask Mode",
84
+ )
85
+
86
+ # Target image section
87
+ md_target_image = gr.Markdown("4. Input target image & mask (Iterate clicking or brushing until the target is covered)")
88
+
89
+ # Precise mask mode
90
+ with gr.Group():
91
+ image_target_1 = gr.Image(
92
+ type="pil",
93
+ label="Target Image (precise mask)",
94
+ interactive=True,
95
+ visible=True,
96
+ height=500,
97
+ container=True,
98
+ elem_id="target_image_1"
99
+ )
100
+ with gr.Row():
101
+ undo_target_seg_button = gr.Button(
102
+ 'Undo seg',
103
+ elem_id="undo_btnSEG",
104
+ visible=True,
105
+ size="sm",
106
+ scale=1
107
+ )
108
+
109
+ # User-drawn mask mode
110
+ with gr.Group():
111
+ image_target_2 = gr.ImageEditor(
112
+ label="Target Image (user-drawn mask)",
113
+ type="pil",
114
+ brush=gr.Brush(colors=["#FFFFFF"], default_size=DEFAULT_BRUSH_SIZE, color_mode="fixed"),
115
+ layers=False,
116
+ interactive=True,
117
+ sources=["upload", "clipboard"],
118
+ placeholder="Please click here or the icon to upload the image.",
119
+ visible=False,
120
+ height=500,
121
+ container=True,
122
+ elem_id="target_image_2",
123
+ fixed_canvas=True,
124
+ )
125
+
126
+ return (image_reference, input_mask_mode, image_target_1, image_target_2,
127
+ undo_target_seg_button, md_image_reference, md_input_mask_mode, md_target_image)
128
+
129
+
130
+ def create_prompt_section():
131
+ """Create the text prompt input section with improved layout."""
132
+ md_prompt = gr.Markdown("5. Input text prompt (optional)")
133
+ with gr.Group():
134
+ prompt = gr.Textbox(
135
+ placeholder="Please input the description for the target scene.",
136
+ value="",
137
+ lines=2,
138
+ show_label=False,
139
+ label="Text Prompt",
140
+ container=True,
141
+ elem_id="text_prompt"
142
+ )
143
+
144
+ with gr.Row():
145
+ vlm_generate_btn = gr.Button(
146
+ "🤖 VLM Auto-generate",
147
+ scale=1,
148
+ elem_id="vlm_generate_btn",
149
+ variant="secondary"
150
+ )
151
+ vlm_polish_btn = gr.Button(
152
+ "✨ VLM Auto-polish",
153
+ scale=1,
154
+ elem_id="vlm_polish_btn",
155
+ variant="secondary"
156
+ )
157
+
158
+ return prompt, vlm_generate_btn, vlm_polish_btn, md_prompt
159
+
160
+
161
+ def create_advanced_options_section():
162
+ """Create the advanced options section."""
163
+ with gr.Accordion("Advanced Options", open=False, elem_id="accordion1"):
164
+ with gr.Group():
165
+ aspect_ratio = gr.Dropdown(
166
+ label="Output aspect ratio",
167
+ choices=ASPECT_RATIO_LABELS,
168
+ value=DEFAULT_ASPECT_RATIO,
169
+ interactive=True,
170
+ allow_custom_value=False,
171
+ filterable=False,
172
+ elem_id="aspect_ratio_dropdown"
173
+ )
174
+
175
+ with gr.Group():
176
+ seg_ref_mode = gr.Radio(
177
+ label="Segmentation mode",
178
+ choices=["Full Ref", "Masked Ref"],
179
+ value="Full Ref",
180
+ elem_id="seg_ref_mode_radio"
181
+ )
182
+ move_to_center = gr.Checkbox(label="Move object to center", value=False, elem_id="move_to_center_checkbox")
183
+
184
+ with gr.Group():
185
+ with gr.Row():
186
+ use_background_preservation = gr.Checkbox(label="Use background preservation", value=False, elem_id="use_bg_preservation_checkbox")
187
+ background_blend_threshold = gr.Slider(
188
+ label="Background blend threshold",
189
+ minimum=0,
190
+ maximum=1,
191
+ step=0.1,
192
+ value=0.5
193
+ )
194
+
195
+ with gr.Group():
196
+ with gr.Row():
197
+ seed = gr.Slider(
198
+ label="Seed (-1 for random): ",
199
+ minimum=-1,
200
+ maximum=2147483647,
201
+ step=1,
202
+ value=-1,
203
+ scale=4
204
+ )
205
+
206
+ num_images_per_prompt = gr.Slider(
207
+ label="Num samples",
208
+ minimum=1,
209
+ maximum=4,
210
+ step=1,
211
+ value=1,
212
+ scale=1
213
+ )
214
+
215
+ with gr.Group():
216
+ with gr.Row():
217
+ guidance = gr.Slider(
218
+ label="Guidance scale",
219
+ minimum=10,
220
+ maximum=65,
221
+ step=1,
222
+ value=40,
223
+ )
224
+ num_steps = gr.Slider(
225
+ label="Number of inference steps",
226
+ minimum=1,
227
+ maximum=60,
228
+ step=1,
229
+ value=32,
230
+ )
231
+ with gr.Row():
232
+ true_gs = gr.Slider(
233
+ label="True GS",
234
+ minimum=1,
235
+ maximum=10,
236
+ step=1,
237
+ value=3,
238
+ )
239
+
240
+ return (aspect_ratio, seg_ref_mode, move_to_center, use_background_preservation,
241
+ background_blend_threshold, seed, num_images_per_prompt, guidance, num_steps, true_gs)
242
+
243
+
244
+ def create_mask_operation_section():
245
+ """Create mask operation section optimized for right column (outputs)."""
246
+ md_mask_operation = gr.Markdown("6. View or modify the target mask")
247
+
248
+ with gr.Group():
249
+ # Mask gallery with responsive layout
250
+ mask_gallery = gr.Gallery(
251
+ label='Mask Preview',
252
+ show_label=False,
253
+ interactive=False,
254
+ columns=2,
255
+ rows=1,
256
+ height="auto",
257
+ object_fit="contain",
258
+ preview=True,
259
+ allow_preview=True,
260
+ selected_index=0,
261
+ elem_id="mask_gallery",
262
+ elem_classes=["custom-gallery", "responsive-gallery"],
263
+ container=True,
264
+ show_fullscreen_button=False
265
+ )
266
+
267
+ # Mask operation buttons - horizontal layout
268
+ with gr.Row():
269
+ dilate_button = gr.Button(
270
+ '🔍 Dilate',
271
+ elem_id="dilate_btn",
272
+ variant="secondary",
273
+ size="sm",
274
+ scale=1
275
+ )
276
+ erode_button = gr.Button(
277
+ '🔽 Erode',
278
+ elem_id="erode_btn",
279
+ variant="secondary",
280
+ size="sm",
281
+ scale=1
282
+ )
283
+ bounding_box_button = gr.Button(
284
+ '📦 Bounding box',
285
+ elem_id="bounding_box_btn",
286
+ variant="secondary",
287
+ size="sm",
288
+ scale=1
289
+ )
290
+
291
+ return mask_gallery, dilate_button, erode_button, bounding_box_button, md_mask_operation
292
+
293
+
294
+ def create_output_section():
295
+ """Create the output section optimized for right column."""
296
+ md_submit = gr.Markdown("7. Submit and view the output")
297
+
298
+ # Generation controls at top for better workflow
299
+ with gr.Group():
300
+ with gr.Row():
301
+ submit_button = gr.Button(
302
+ "💫 Generate",
303
+ variant="primary",
304
+ scale=3,
305
+ size="lg"
306
+ )
307
+ clear_btn = gr.ClearButton(
308
+ scale=1,
309
+ variant="secondary",
310
+ value="🗑️ Clear"
311
+ )
312
+
313
+ # Results gallery with responsive layout
314
+ with gr.Group():
315
+ result_gallery = gr.Gallery(
316
+ label='Generated Results',
317
+ show_label=False,
318
+ interactive=False,
319
+ columns=1,
320
+ rows=1,
321
+ height="auto",
322
+ object_fit="contain",
323
+ preview=True,
324
+ allow_preview=True,
325
+ selected_index=0,
326
+ elem_id="result_gallery",
327
+ elem_classes=["custom-gallery", "responsive-gallery"],
328
+ container=True,
329
+ show_fullscreen_button=False
330
+ )
331
+
332
+ return result_gallery, submit_button, clear_btn, md_submit
333
+
334
+
335
+ def create_examples_section(examples_list, inputs, outputs, fn):
336
+ """Create the examples section with required arguments."""
337
+ examples = gr.Examples(
338
+ examples=examples_list,
339
+ inputs=inputs,
340
+ outputs=outputs,
341
+ fn=fn,
342
+ cache_examples=False,
343
+ examples_per_page=10,
344
+ run_on_click=True,
345
+ )
346
+ return examples
347
+
348
+
349
+ def create_citation_section():
350
+ """Create the citation section."""
351
+ from metainfo import citation
352
+
353
+ with gr.Row():
354
+ gr.Markdown(citation)
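Each builder above returns the components it creates, so the caller can keep references for event wiring. A rough composition sketch follows (assumed arrangement; the actual layout and event hookup live in app.py and event_handlers.py).

    # Composition sketch (assumption): assembling the section builders in a Blocks context.
    import gradio as gr
    from ui_components import (
        create_theme, create_css, create_header_section,
        create_customization_section, create_image_input_section,
        create_prompt_section, create_output_section,
    )

    with gr.Blocks(theme=create_theme(), css=create_css()) as demo:
        create_header_section()
        custmization_mode, _ = create_customization_section()
        with gr.Row():
            with gr.Column():
                image_inputs = create_image_input_section()   # tuple of 8 components
                prompt, vlm_generate_btn, vlm_polish_btn, _ = create_prompt_section()
            with gr.Column():
                result_gallery, submit_button, clear_btn, _ = create_output_section()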
app/utils.py ADDED
@@ -0,0 +1,429 @@
1
+ import os
2
+ import sys
3
+ import base64
4
+ from io import BytesIO
5
+ from typing import Optional
6
+
7
+ from PIL import Image
8
+
9
+ import numpy as np
10
+ import torch
11
+
12
+ from segment_anything import SamPredictor, sam_model_registry
13
+ from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
14
+ from qwen_vl_utils import process_vision_info
15
+ from huggingface_hub import hf_hub_download
16
+
17
+ sys.path.append(os.getcwd())
18
+ import BEN2
19
+
20
+
21
+ ## Ordinary function
22
+ def resize(image: Image.Image,
23
+ target_width: int,
24
+ target_height: int,
25
+ interpolate: Image.Resampling = Image.Resampling.LANCZOS,
26
+ return_type: str = "pil") -> Image.Image | np.ndarray:
27
+ """
28
+ Resizes an image to the given width and height (no cropping is performed; the aspect ratio may change).
29
+
30
+ Args:
31
+ image (Image.Image): Input PIL image to be cropped and resized.
32
+ target_width (int): Target width of the output image.
33
+ target_height (int): Target height of the output image.
34
+ interpolate (Image.Resampling): The interpolation method.
35
+ return_type (str): The type of the output image.
36
+
37
+ Returns:
38
+ Image.Image | np.ndarray: The resized image, as a PIL image or NumPy array depending on return_type.
39
+ """
40
+ # Resize directly to the target dimensions
41
+ resized_image = image.resize((target_width, target_height), interpolate)
42
+ if return_type == "pil":
43
+ return resized_image
44
+ elif return_type == "np":
45
+ return np.array(resized_image)
46
+ else:
47
+ raise ValueError(f"Invalid return type: {return_type}")
48
+
49
+
50
+ def resize_long_edge(
51
+ image: Image.Image,
52
+ long_edge_size: int,
53
+ interpolate: Image.Resampling = Image.Resampling.LANCZOS,
54
+ return_type: str = "pil"
55
+ ) -> np.ndarray | Image.Image:
56
+ """
57
+ Resize the long edge of the image to the long_edge_size.
58
+
59
+ Args:
60
+ image (Image.Image): The image to resize.
61
+ long_edge_size (int): The size of the long edge.
62
+ interpolate (Image.Resampling): The interpolation method.
63
+
64
+ Returns:
65
+ np.ndarray | Image.Image: The resized image, matching return_type.
66
+ """
67
+ w, h = image.size
68
+ scale_ratio = long_edge_size / max(h, w)
69
+ output_w = int(w * scale_ratio)
70
+ output_h = int(h * scale_ratio)
71
+ image = resize(image, target_width=int(output_w), target_height=int(output_h), interpolate=interpolate, return_type=return_type)
72
+ return image
73
+
74
+
75
+ def ensure_divisible_by_value(
76
+ image: Image.Image | np.ndarray,
77
+ value: int = 8,
78
+ interpolate: Image.Resampling = Image.Resampling.NEAREST,
79
+ return_type: str = "np"
80
+ ) -> np.ndarray | Image.Image:
81
+ """
82
+ Ensure the image dimensions are divisible by value.
83
+
84
+ Args:
85
+ image (Image.Image | np.ndarray): The image whose dimensions should be made divisible by value.
86
+ value (int): The value to ensure divisible by.
87
+ interpolate (Image.Resampling): The interpolation method.
88
+ return_type (str): The type of the output image.
89
+
90
+ Returns:
91
+ np.ndarray | Image.Image: The resized image.
92
+ """
93
+
94
+ if isinstance(image, np.ndarray):
95
+ image = Image.fromarray(image)
96
+
97
+ w, h = image.size
98
+
99
+ w = (w // value) * value
100
+ h = (h // value) * value
101
+ image = resize(image, w, h, interpolate=interpolate, return_type=return_type)
102
+ return image
103
+
104
+
105
+ def resize_paired_image(
106
+ image_reference: np.ndarray,
107
+ image_target: np.ndarray,
108
+ mask_target: np.ndarray,
109
+ force_resize_long_edge: bool = False,
110
+ return_type: str = "np"
111
+ ) -> tuple[np.ndarray | Image.Image, np.ndarray | Image.Image, np.ndarray | Image.Image]:
112
+
113
+ if isinstance(image_reference, np.ndarray):
114
+ image_reference = Image.fromarray(image_reference)
115
+ if isinstance(image_target, np.ndarray):
116
+ image_target = Image.fromarray(image_target)
117
+ if isinstance(mask_target, np.ndarray):
118
+ mask_target = Image.fromarray(mask_target)
119
+
120
+ if force_resize_long_edge:
121
+ image_reference = resize_long_edge(image_reference, 1024, interpolate=Image.Resampling.LANCZOS, return_type=return_type)
122
+ image_target = resize_long_edge(image_target, 1024, interpolate=Image.Resampling.LANCZOS, return_type=return_type)
123
+ mask_target = resize_long_edge(mask_target, 1024, interpolate=Image.Resampling.NEAREST, return_type=return_type)
124
+
125
+ if isinstance(image_reference, Image.Image):
126
+ ref_width, ref_height = image_reference.size
127
+ target_width, target_height = image_target.size
128
+ else:
129
+ ref_height, ref_width = image_reference.shape[:2]
130
+ target_height, target_width = image_target.shape[:2]
131
+
132
+ # resize the ref image to the same height as the target image and ensure the ratio remains the same
133
+ if ref_height != target_height:
134
+ scale_ratio = target_height / ref_height
135
+ image_reference = resize(image_reference, int(ref_width * scale_ratio), target_height, interpolate=Image.Resampling.LANCZOS, return_type=return_type)
136
+
137
+ if return_type == "pil":
138
+ image_reference = Image.fromarray(image_reference) if isinstance(image_reference, np.ndarray) else image_reference
139
+ image_target = Image.fromarray(image_target) if isinstance(image_target, np.ndarray) else image_target
140
+ mask_target = Image.fromarray(mask_target) if isinstance(mask_target, np.ndarray) else mask_target
141
+ return image_reference, image_target, mask_target
142
+ else:
143
+ image_reference = np.array(image_reference) if isinstance(image_reference, Image.Image) else image_reference
144
+ image_target = np.array(image_target) if isinstance(image_target, Image.Image) else image_target
145
+ mask_target = np.array(mask_target) if isinstance(mask_target, Image.Image) else mask_target
146
+ return image_reference, image_target, mask_target
147
+
148
+
149
+ def prepare_input_images(
150
+ img_ref: np.ndarray,
151
+ custmization_mode: str,
152
+ img_target: Optional[np.ndarray] = None,
153
+ mask_target: Optional[np.ndarray] = None,
154
+ width: Optional[int] = None,
155
+ height: Optional[int] = None,
156
+ force_resize_long_edge: bool = False,
157
+ return_type: str = "np"
158
+ ) -> tuple[np.ndarray | Image.Image, np.ndarray | Image.Image, np.ndarray | Image.Image]:
159
+
160
+
161
+ if custmization_mode.lower() == "position-free":
162
+ img_target = np.ones_like(img_ref) * 255
163
+ mask_target = np.zeros_like(img_ref)
164
+
165
+ if isinstance(width, int) and isinstance(height, int):
166
+ img_ref = resize(Image.fromarray(img_ref), width, height, interpolate=Image.Resampling.LANCZOS, return_type=return_type)
167
+ img_target = resize(Image.fromarray(img_target), width, height, interpolate=Image.Resampling.LANCZOS, return_type=return_type)
168
+ mask_target = resize(Image.fromarray(mask_target), width, height, interpolate=Image.Resampling.NEAREST, return_type=return_type)
169
+ else:
170
+ img_ref, img_target, mask_target = resize_paired_image(img_ref, img_target, mask_target, force_resize_long_edge, return_type=return_type)
171
+
172
+ img_ref = ensure_divisible_by_value(img_ref, value=16, interpolate=Image.Resampling.LANCZOS, return_type=return_type)
173
+ img_target = ensure_divisible_by_value(img_target, value=16, interpolate=Image.Resampling.LANCZOS, return_type=return_type)
174
+ mask_target = ensure_divisible_by_value(mask_target, value=16, interpolate=Image.Resampling.NEAREST, return_type=return_type)
175
+
176
+ return img_ref, img_target, mask_target
177
+
178
+
179
+ def get_mask_type_ids(custmization_mode: str, input_mask_mode: str) -> torch.Tensor:
180
+ if custmization_mode.lower() == "position-free":
181
+ return torch.tensor([0])
182
+ elif custmization_mode.lower() == "position-aware":
183
+ if "precise" in input_mask_mode.lower():
184
+ return torch.tensor([1])
185
+ else:
186
+ return torch.tensor([2])
187
+ else:
188
+ raise ValueError(f"Invalid custmization mode: {custmization_mode}")
189
+
190
+
191
+ def scale_image(image_np, is_mask: bool = False):
192
+ """
193
+ Scale the image to the range of [-1, 1] if not a mask, otherwise scale to [0, 1].
194
+
195
+ Args:
196
+ image_np (np.ndarray): Input image.
197
+ is_mask (bool): Whether the image is a mask.
198
+ Returns:
199
+ np.ndarray: Scaled image.
200
+ """
201
+ if is_mask:
202
+ image_np = image_np / 255.0
203
+ else:
204
+ image_np = image_np / 255.0
205
+ image_np = image_np * 2 - 1
206
+ return image_np
207
+
208
+
209
+ def get_sam_predictor(sam_ckpt_path, device):
210
+ """
211
+ Get the SAM predictor.
212
+ Args:
213
+ sam_ckpt_path (str): The path to the SAM checkpoint.
214
+ device (str): The device to load the model on.
215
+ Returns:
216
+ SamPredictor: The SAM predictor.
217
+ """
218
+ if not os.path.exists(sam_ckpt_path):
219
+ sam_ckpt_path = hf_hub_download(repo_id="HCMUE-Research/SAM-vit-h", filename="sam_vit_h_4b8939.pth")
220
+
221
+ sam = sam_model_registry['vit_h'](checkpoint=sam_ckpt_path).to(device)
222
+ sam.eval()
223
+ predictor = SamPredictor(sam)
224
+
225
+ return predictor
226
+
227
+
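The object returned by get_sam_predictor is the standard SamPredictor from segment_anything; below is a short sketch of the usual click-to-mask call pattern it supports (the checkpoint path, device, and point coordinates are placeholder assumptions, not values from this repo):

    import numpy as np
    from utils import get_sam_predictor

    predictor = get_sam_predictor("sam_vit_h_4b8939.pth", device="cuda")
    predictor.set_image(np.zeros((512, 512, 3), dtype=np.uint8))  # RGB uint8 image
    masks, scores, logits = predictor.predict(
        point_coords=np.array([[256, 256]]),  # one foreground click, (x, y)
        point_labels=np.array([1]),           # 1 = foreground, 0 = background
        multimask_output=True,
    )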
228
+ def image_to_base64(img):
229
+ """
230
+ Convert an image to a base64 string.
231
+ Args:
232
+ img (PIL.Image.Image): The image to convert.
233
+ Returns:
234
+ str: The base64 string.
235
+ """
236
+ buffered = BytesIO()
237
+ img.save(buffered, format="PNG")
238
+ img_bytes = buffered.getvalue()
239
+ return base64.b64encode(img_bytes).decode('utf-8')
240
+
241
+
242
+ def get_vlm(vlm_ckpt_path, device, torch_dtype):
243
+ """
244
+ Get the VLM pipeline.
245
+ Args:
246
+ vlm_ckpt_path (str): The path to the VLM checkpoint.
247
+ device (str): The device to load the model on.
248
+ torch_dtype (torch.dtype): The data type of the model.
249
+ Returns:
250
+ tuple: The processor and model.
251
+ """
252
+ if not os.path.exists(vlm_ckpt_path):
253
+ vlm_ckpt_path = "Qwen/Qwen2.5-VL-7B-Instruct"
254
+
255
+ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
256
+ vlm_ckpt_path, torch_dtype=torch_dtype).to(device)
257
+ processor = AutoProcessor.from_pretrained(vlm_ckpt_path)
258
+
259
+
260
+ return processor, model
261
+
262
+
263
+ def construct_vlm_gen_prompt(image_target, image_reference, target_mask, custmization_mode):
264
+ """
265
+ Construct the VLM generation prompt.
266
+ Args:
267
+ image_target (np.ndarray): The target image.
268
+ image_reference (np.ndarray): The reference image.
269
+ target_mask (np.ndarray): The target mask.
270
+ custmization_mode (str): The customization mode.
271
+ Returns:
272
+ list: The messages.
273
+ """
274
+ if custmization_mode.lower() == "position-free":
275
+ image_reference_pil = Image.fromarray(image_reference.astype(np.uint8))
276
+ image_reference_base_64 = image_to_base64(image_reference_pil)
277
+ messages = [
278
+ {
279
+ "role": "system",
280
+ "content": "I will input a reference image. Please identify the main subject/object in this image and generate a new description by placing this subject in a completely different scene or context. For example, if the reference image shows a rabbit sitting in a garden surrounded by green leaves and roses, you could generate a description like 'The rabbit is sitting on a rocky cliff overlooking a serene ocean, with the sun setting behind it, casting a warm glow over the scene'. Please directly output the new description without explaining your thought process. The description should not exceed 256 tokens."
281
+ },
282
+ {
283
+ "role": "user",
284
+ "content": [
285
+ {
286
+ "type": "image",
287
+ "image": f"data:image;base64,{image_reference_base_64}"
288
+ },
289
+ ],
290
+ }
291
+ ]
292
+ return messages
293
+ else:
294
+ image_reference_pil = Image.fromarray(image_reference.astype(np.uint8))
295
+ image_reference_base_64 = image_to_base64(image_reference_pil)
296
+
297
+ target_mask_binary = target_mask > 127.5
298
+ masked_image_target = image_target * target_mask_binary
299
+ masked_image_target_pil = Image.fromarray(masked_image_target.astype(np.uint8))
300
+ masked_image_target_base_64 = image_to_base64(masked_image_target_pil)
301
+
302
+
303
+ messages = [
304
+ {
305
+ "role": "system",
306
+ "content": "I will input a reference image and a target image with its main subject area masked (in black). Please directly describe the scene where the main subject/object from the reference image is placed into the masked area of the target image. Focus on describing the final combined scene, making sure to clearly describe both the object from the reference image and the background/environment from the target image. For example, if the reference shows a white cat with orange stripes on a beach and the target shows a masked area in a garden with blooming roses and tulips, directly describe 'A white cat with orange stripes sits elegantly among the vibrant red roses and yellow tulips in the lush garden, surrounded by green foliage.' The description should not exceed 256 tokens."
307
+ },
308
+ {
309
+ "role": "user",
310
+ "content": [
311
+ {
312
+ "type": "image",
313
+ "image": f"data:image;base64,{image_reference_base_64}"
314
+ },
315
+ {
316
+ "type": "image",
317
+ "image": f"data:image;base64,{masked_image_target_base_64}"
318
+ }
319
+ ],
320
+ }
321
+ ]
322
+ return messages
323
+
324
+
325
+ def construct_vlm_polish_prompt(prompt):
326
+ """
327
+ Construct the VLM polish prompt.
328
+ Args:
329
+ prompt (str): The prompt to polish.
330
+ Returns:
331
+ list: The messages.
332
+ """
333
+ messages = [
334
+ {
335
+ "role": "system",
336
+ "content": "You are a helpful assistant that can polish the text prompt to make it more specific, detailed, and complete. Please directly output the polished prompt without explaining your thought process. The prompt should not exceed 256 tokens."
337
+ },
338
+ {
339
+ "role": "user",
340
+ "content": prompt
341
+ }
342
+ ]
343
+ return messages
344
+
345
+
346
+ def run_vlm(vlm_processor, vlm_model, messages, device):
347
+ """
348
+ Run the VLM.
349
+ Args:
350
+ vlm_processor (torch.nn.Module): The VLM processor.
351
+ vlm_model (torch.nn.Module): The VLM model.
352
+ messages (list): The messages.
353
+ device (str): The device to run the model on.
354
+ Returns:
355
+ str: The output text.
356
+ """
357
+ text = vlm_processor.apply_chat_template(
358
+ messages, tokenize=False, add_generation_prompt=True)
359
+
360
+ image_inputs, video_inputs = process_vision_info(messages)
361
+ inputs = vlm_processor(
362
+ text=[text],
363
+ images=image_inputs,
364
+ videos=video_inputs,
365
+ padding=True,
366
+ return_tensors="pt",
367
+ )
368
+ inputs = inputs.to(device)
369
+ # Inference
370
+ generated_ids = vlm_model.generate(**inputs, do_sample=True, num_beams=4, temperature=1.5, max_new_tokens=128)
371
+ generated_ids_trimmed = [
372
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
373
+ ]
374
+ output_text = vlm_processor.batch_decode(
375
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
376
+ )[0]
377
+ return output_text
378
+
379
+
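A small sketch of how the helpers above can be chained to polish a user prompt; the checkpoint name, device, and dtype are placeholder assumptions:

    import torch
    from utils import get_vlm, construct_vlm_polish_prompt, run_vlm

    processor, model = get_vlm("Qwen/Qwen2.5-VL-7B-Instruct", device="cuda", torch_dtype=torch.bfloat16)
    messages = construct_vlm_polish_prompt("a cat on a sofa")
    polished = run_vlm(processor, model, messages, device="cuda")
    print(polished)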
380
+ def get_ben2_model(ben2_model_path, device):
381
+ """
382
+ Get the BEN2 model.
383
+ Args:
384
+ ben2_model_path (str): The path to the BEN2 model.
385
+ device (str): The device to load the model on.
386
+ Returns:
387
+ BEN2: The BEN2 model.
388
+ """
389
+ if not os.path.exists(ben2_model_path):
390
+ ben2_model_path = hf_hub_download(repo_id="PramaLLC/BEN2", filename="BEN2_Base.pth")
391
+
392
+ ben2_model = BEN2.BEN_Base().to(device)
393
+ ben2_model.loadcheckpoints(model_path=ben2_model_path)
394
+ return ben2_model
395
+
396
+
397
+ def make_dict_img_mask(img_path, mask_path):
398
+ """
399
+ Make a dictionary of the image and mask for gr.ImageEditor.
400
+ Kept for interface compatibility; not currently used in the Gradio app.
401
+ Args:
402
+ img_path (str): The path to the image.
403
+ mask_path (str): The path to the mask.
404
+ Returns:
405
+ dict: The dictionary of the image and mask.
406
+ """
407
+ from PIL import ImageOps
408
+ background = Image.open(img_path).convert("RGBA")
409
+ layers = [
410
+ Image.merge("RGBA", (
411
+ Image.new("L", Image.open(mask_path).size, 255), # R channel
412
+ Image.new("L", Image.open(mask_path).size, 255), # G channel
413
+ Image.new("L", Image.open(mask_path).size, 255), # B channel
414
+ ImageOps.invert(Image.open(mask_path).convert("L")) # Inverted alpha channel
415
+ ))
416
+ ]
417
+ # Combine layers with background by replacing the alpha channel
418
+ background = np.array(background.convert("RGB"))
419
+ _, _, _, layer_alpha = layers[0].split()
420
+ layer_alpha = np.array(layer_alpha)[:,:,np.newaxis]
421
+ composite = background * (1 - (layer_alpha > 0)) + np.ones_like(background) * (layer_alpha > 0) * 255
422
+
423
+
424
+ composite = Image.fromarray(composite.astype("uint8")).convert("RGBA")
425
+ return {
426
+ 'background': background,
427
+ 'layers': layers,
428
+ 'composite': composite
429
+ }
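Taken together, the helpers above implement the image preprocessing used before inference: resize the long edge, match the reference height to the target, snap dimensions to a multiple of 16, then scale pixel values. A compact sketch of that flow (the input arrays are placeholders):

    import numpy as np
    from utils import prepare_input_images, scale_image, get_mask_type_ids

    img_ref = np.zeros((768, 512, 3), dtype=np.uint8)
    img_target = np.zeros((1024, 768, 3), dtype=np.uint8)
    mask_target = np.zeros((1024, 768, 3), dtype=np.uint8)

    ref, tgt, mask = prepare_input_images(
        img_ref, "Position-aware", img_target, mask_target,
        force_resize_long_edge=True, return_type="np",
    )
    tgt_scaled = scale_image(tgt)                   # image scaled to [-1, 1]
    mask_scaled = scale_image(mask, is_mask=True)   # mask scaled to [0, 1]
    mask_type = get_mask_type_ids("Position-aware", "Precise mask")  # tensor([1])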
assets/gradio/pos_aware/001/hypher_params.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7609eddfca9279636bdbbcaa25fc0c04b52816fa563caa220668a30526004e81
3
+ size 169
assets/gradio/pos_aware/001/img_gen.png ADDED

Git LFS Details

  • SHA256: c1bb90c2c7f1c0f3bdda840169e5800d52ac229deb6f2adcd6e2959e6db17b94
  • Pointer size: 131 Bytes
  • Size of remote file: 904 kB
assets/gradio/pos_aware/001/img_ref.png ADDED

Git LFS Details

  • SHA256: 4a0e009aaad8333c39d28148df6a4ce08efab636e6fc2a5302dc9ac7eb9c8260
  • Pointer size: 132 Bytes
  • Size of remote file: 1.27 MB
assets/gradio/pos_aware/001/img_target.png ADDED

Git LFS Details

  • SHA256: 32c3acbb178fe2eea27b1ab46e25470f8e972cfd832ee93a5dde44ab9311a773
  • Pointer size: 131 Bytes
  • Size of remote file: 666 kB
assets/gradio/pos_aware/001/mask_target.png ADDED

Git LFS Details

  • SHA256: 14f001bfb9ac31893fe3dbb76e8eb3640cec124a28f2c22e859fa532b7286e8b
  • Pointer size: 129 Bytes
  • Size of remote file: 7.26 kB
assets/gradio/pos_aware/002/hypher_params.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e344b14750ee1f3ab780df3ce9871684f40cca81820ea0dcbc4896dd4b30a42
3
+ size 170
assets/gradio/pos_aware/002/img_gen.png ADDED

Git LFS Details

  • SHA256: ef5d260e6b9d25252f260e6c870ec028582835e285a31140d1d0ca7ae2fd0ab2
  • Pointer size: 132 Bytes
  • Size of remote file: 1.09 MB
assets/gradio/pos_aware/002/img_ref.png ADDED

Git LFS Details

  • SHA256: 1a58a77c7c149353b857536b60dddd8c6853c51a60a13f290c670b5f1718d897
  • Pointer size: 131 Bytes
  • Size of remote file: 236 kB
assets/gradio/pos_aware/002/img_target.png ADDED

Git LFS Details

  • SHA256: 25dd856cf5c114f440d7b1ea64df8135240c9bd29c6da072ec5c72d579b605b5
  • Pointer size: 131 Bytes
  • Size of remote file: 949 kB
assets/gradio/pos_aware/002/mask_target.png ADDED

Git LFS Details

  • SHA256: 651a59bf5c9d841540c84b3a28b54d7798cf1b5e2be10fbaf7cbf22321d3f14e
  • Pointer size: 129 Bytes
  • Size of remote file: 5.71 kB
assets/gradio/pos_aware/003/hypher_params.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76e365ad0ac4b41937a6ff5d2817e73f3e662240b94834e844b59687c45c01f0
3
+ size 311
assets/gradio/pos_aware/003/img_gen.png ADDED

Git LFS Details

  • SHA256: 1caee153aaa03be649e33110092b2b3e63b6de8f5bd41115ccd315536f27ae5d
  • Pointer size: 132 Bytes
  • Size of remote file: 1.33 MB
assets/gradio/pos_aware/003/img_ref.png ADDED

Git LFS Details

  • SHA256: f295720c85f2d35de8e07f123a73a44a8338b6d8dce9617effbd639654afffa1
  • Pointer size: 131 Bytes
  • Size of remote file: 122 kB
assets/gradio/pos_aware/003/img_target.png ADDED

Git LFS Details

  • SHA256: 7de6458b1ed2cf0ce280f9662c39dcf4dd609c4f2e3991289c9b3de7ca04aa28
  • Pointer size: 131 Bytes
  • Size of remote file: 996 kB
assets/gradio/pos_aware/003/mask_target.png ADDED

Git LFS Details

  • SHA256: f8dd8b499cabec5615c745f87c4d604059d6e1c1933535a6e2c67039e720fc48
  • Pointer size: 129 Bytes
  • Size of remote file: 7.01 kB
assets/gradio/pos_aware/004/hypher_params.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c8a478290f495492d4361f0c7b429f1bf216d73eb89cb460acb210d7bb53901
3
+ size 174
assets/gradio/pos_aware/004/img_gen.png ADDED

Git LFS Details

  • SHA256: 9ebe14a1bef8f7a92fb3f16d2119bf9d249624fcd54f49e547bd9c843b813877
  • Pointer size: 132 Bytes
  • Size of remote file: 1.45 MB
assets/gradio/pos_aware/004/img_ref.png ADDED

Git LFS Details

  • SHA256: 804a4b381965b46d8ccfc3c2f5167eec97a96104b6324e58afbf01bb995d1b23
  • Pointer size: 132 Bytes
  • Size of remote file: 1.04 MB
assets/gradio/pos_aware/004/img_target.png ADDED

Git LFS Details

  • SHA256: d2469e2c5b4bc736ece560438656376180b2de9c6c4f7d446112608ac6f3d0f1
  • Pointer size: 132 Bytes
  • Size of remote file: 1.32 MB
assets/gradio/pos_aware/004/mask_target.png ADDED

Git LFS Details

  • SHA256: 1d3f31df4d08aea1a689ac3cab96100d1d9d84cd6c73549e7f87c5915bbe4a19
  • Pointer size: 129 Bytes
  • Size of remote file: 8.43 kB
assets/gradio/pos_aware/005/hypher_params.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0c8d875d0af12f57b70c596f70dfda334199990a819e96b67c8968538c7cdd6
3
+ size 173
assets/gradio/pos_aware/005/img_gen.png ADDED

Git LFS Details

  • SHA256: 776179662774adfc458f988a58b55e8cb2ee29a92e70cada52a4e4d8ead2e643
  • Pointer size: 131 Bytes
  • Size of remote file: 970 kB
assets/gradio/pos_aware/005/img_ref.png ADDED

Git LFS Details

  • SHA256: eb6cf9308603da24b1bd4b63eee9989024024cefdc3dff0fc23415762aaa1833
  • Pointer size: 131 Bytes
  • Size of remote file: 305 kB
assets/gradio/pos_aware/005/img_target.png ADDED

Git LFS Details

  • SHA256: f61b04b8a6174a25c68f34aca03f9fafb46e9b6b6ca69e9457ff6ffebc4493f3
  • Pointer size: 131 Bytes
  • Size of remote file: 937 kB
assets/gradio/pos_aware/005/mask_target.png ADDED

Git LFS Details

  • SHA256: 3ce455fb1a94b698a7ae656aaaafce1dcbb9e170cf40103a60a1936e58053ba0
  • Pointer size: 129 Bytes
  • Size of remote file: 4.44 kB
assets/gradio/pos_free/001/hyper_params.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"prompt": "TThe charming, soft plush toy is joyfully wandering through a lush, dense jungle, surrounded by vibrant green foliage and towering trees.", "custmization_mode": "Position-free", "input_mask_mode": "Precise mask", "seg_ref_mode": "Full Ref", "seed": 2126677963, "guidance": 40, "num_steps": 20, "num_images_per_prompt": 1, "use_background_preservation": false, "background_blend_threshold": 0.5, "true_gs": 3}
assets/gradio/pos_free/001/img_gen.png ADDED

Git LFS Details

  • SHA256: d131b70aab831c75a7f8d65bffeef7a7263e00b5e9767dbaa1b04b49549a3a93
  • Pointer size: 132 Bytes
  • Size of remote file: 1.48 MB
assets/gradio/pos_free/001/img_ref.png ADDED

Git LFS Details

  • SHA256: 31eb98b779029afcee3e0be48eeb6d5df3d2e8a76b60142fba5bc7632f1a083e
  • Pointer size: 131 Bytes
  • Size of remote file: 423 kB
assets/gradio/pos_free/001/img_target.png ADDED

Git LFS Details

  • SHA256: d9bd147ced77bca4a875af12714949cd17ddd3c11cc47218b4de30185bc0b4e9
  • Pointer size: 129 Bytes
  • Size of remote file: 5.33 kB
assets/gradio/pos_free/001/mask_target.png ADDED

Git LFS Details

  • SHA256: 79238174f0b3e2441720c46339b8cce2c8be2c19a1507831e726b51b8bbe3b82
  • Pointer size: 129 Bytes
  • Size of remote file: 3.13 kB
assets/gradio/pos_free/002/hyper_params.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"prompt": "A bright yellow alarm clock sits on a wooden desk next to a stack of books in a cozy, sunlit room.", "custmization_mode": "Position-free", "input_mask_mode": "Precise mask", "seg_ref_mode": "Full Ref", "seed": 2126677963, "guidance": 40, "num_steps": 20, "num_images_per_prompt": 1, "use_background_preservation": false, "background_blend_threshold": 0.5, "true_gs": 3}
assets/gradio/pos_free/002/img_gen.png ADDED

Git LFS Details

  • SHA256: 6acf6529520813f6b40dbe953267555e84a373c6d9532ee795adbc5538ffe14e
  • Pointer size: 131 Bytes
  • Size of remote file: 901 kB
assets/gradio/pos_free/002/img_ref.png ADDED

Git LFS Details

  • SHA256: 8e11cfbb8300d6191af71e5b8fb040f98fe7ce59b62002e5852c5b2c2044455f
  • Pointer size: 132 Bytes
  • Size of remote file: 1.12 MB
assets/gradio/pos_free/002/img_target.png ADDED

Git LFS Details

  • SHA256: d9bd147ced77bca4a875af12714949cd17ddd3c11cc47218b4de30185bc0b4e9
  • Pointer size: 129 Bytes
  • Size of remote file: 5.33 kB