shenyunhang committed
Commit feac658 · 0 Parent(s) (initial commit)
.gitattributes ADDED
@@ -0,0 +1,10 @@
+ examples/094_56726435.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/199_3946193540.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/SolvayConference1927.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/TheGreatWall.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/Totoro01.png filter=lfs diff=lfs merge=lfs -text
+ examples/Transformers.webp filter=lfs diff=lfs merge=lfs -text
+ examples/013_438973263.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/Pisa.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/Terminator3.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/MatrixRevolutionForZion.jpg filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: APE
+ emoji: 🌍
+ colorFrom: blue
+ colorTo: indigo
+ sdk: gradio
+ sdk_version: 4.7.1
+ app_file: app.py
+ pinned: false
+ license: apache-2.0
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
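The Space's entry point is `app.py`, which pulls APE checkpoints from the `shenyunhang/APE` model repo at startup. As a minimal sketch (not part of this commit), the same checkpoint can be fetched ahead of time with `huggingface_hub`; the repo id and filename below are copied verbatim from `load_APE_D()` in app.py:

```python
# Sketch: pre-download the APE-D checkpoint that app.py loads at startup.
# repo_id and filename are taken verbatim from load_APE_D() in app.py.
from huggingface_hub import hf_hub_download

ckpt_path = hf_hub_download(
    repo_id="shenyunhang/APE",
    filename=(
        "configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO_GQA_PhraseCut_Flickr30k/"
        "ape_deta/ape_deta_vitl_eva02_clip_vlf_lsj1024_cp_16x4_1080k_mdl_20230829_162438/"
        "model_final.pth"
    ),
)
print(ckpt_path)  # cached local path to the checkpoint
```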
app.py ADDED
@@ -0,0 +1,1032 @@
1
+ import gc
2
+ import multiprocessing as mp
3
+ import os
4
+ import shutil
5
+ import sys
6
+ import time
7
+ from os import path
8
+
9
+ import cv2
10
+ import torch
11
+ from huggingface_hub import hf_hub_download
12
+ from PIL import Image
13
+
14
+ import ape
15
+ import detectron2.data.transforms as T
16
+ import gradio as gr
17
+ from ape.model_zoo import get_config_file
18
+ from demo_lazy import get_parser, setup_cfg
19
+ from detectron2.config import CfgNode
20
+ from detectron2.data.detection_utils import read_image
21
+ from detectron2.evaluation.coco_evaluation import instances_to_coco_json
22
+ from detectron2.utils.logger import setup_logger
23
+ from predictor_lazy import VisualizationDemo
24
+
25
+ this_dir = path.dirname(path.abspath(__file__))
26
+
27
+ # os.system("git clone https://github.com/shenyunhang/APE.git")
28
+ # os.system("python3.10 -m pip install -e APE/")
29
+
30
+ example_list = [
31
+ [
32
+ this_dir + "/examples/Totoro01.png",
33
+ # "Sky, Water, Tree, The biggest Chinchilla, The older girl wearing skirt on branch, Grass",
34
+ "Girl with hat",
35
+ # 0.05,
36
+ 0.25,
37
+ ["object detection", "instance segmentation"],
38
+ ],
39
+ [
40
+ this_dir + "/examples/Totoro01.png",
41
+ "Sky, Water, Tree, Chinchilla, Grass, Girl",
42
+ 0.15,
43
+ ["semantic segmentation"],
44
+ ],
45
+ [
46
+ this_dir + "/examples/199_3946193540.jpg",
47
+ "chess piece of horse head",
48
+ 0.30,
49
+ ["object detection", "instance segmentation"],
50
+ ],
51
+ [
52
+ this_dir + "/examples/TheGreatWall.jpg",
53
+ "The Great Wall",
54
+ 0.1,
55
+ ["semantic segmentation"],
56
+ ],
57
+ [
58
+ this_dir + "/examples/Pisa.jpg",
59
+ "Pisa",
60
+ 0.01,
61
+ ["object detection", "instance segmentation"],
62
+ ],
63
+ [
64
+ this_dir + "/examples/SolvayConference1927.jpg",
65
+ # "Albert Einstein, Madame Curie",
66
+ "Madame Curie",
67
+ # 0.01,
68
+ 0.03,
69
+ ["object detection", "instance segmentation"],
70
+ ],
71
+ [
72
+ this_dir + "/examples/Transformers.webp",
73
+ "Optimus Prime",
74
+ 0.11,
75
+ ["object detection", "instance segmentation"],
76
+ ],
77
+ [
78
+ this_dir + "/examples/Terminator3.jpg",
79
+ "Humanoid Robot",
80
+ 0.10,
81
+ ["object detection", "instance segmentation"],
82
+ ],
83
+ [
84
+ this_dir + "/examples/MatrixRevolutionForZion.jpg",
85
+ """machine killer with gun in fighting,
86
+ donut with colored granules on the surface,
87
+ railings being crossed by horses,
88
+ a horse running or jumping,
89
+ equestrian rider's helmet,
90
+ outdoor dog led by rope,
91
+ a dog being touched,
92
+ clothed dog,
93
+ basketball in hand,
94
+ a basketball player with both feet off the ground,
95
+ player with basketball in the hand,
96
+ spoon on the plate,
97
+ coffee cup with coffee,
98
+ the nearest dessert to the coffee cup,
99
+ the bartender who is mixing wine,
100
+ a bartender in a suit,
101
+ wine glass with wine,
102
+ a person in aprons,
103
+ pot with food,
104
+ a knife being used to cut vegetables,
105
+ striped sofa in the room,
106
+ a sofa with pillows on it in the room,
107
+ lights on in the room,
108
+ an indoor lying pet,
109
+ a cat on the sofa,
110
+ one pet looking directly at the camera indoors,
111
+ a bed with patterns in the room,
112
+ the lamp on the table beside the bed,
113
+ pillow placed at the head of the bed,
114
+ a blackboard full of words in the classroom,
115
+ child sitting at desks in the classroom,
116
+ a person standing in front of bookshelves in the library,
117
+ the table someone is using in the library,
118
+ a person who touches books in the library,
119
+ a person standing in front of the cake counter,
120
+ a square plate full of cakes,
121
+ a cake decorated with cream,
122
+ hot dog with vegetables,
123
+ hot dog with sauce on the surface,
124
+ red sausage,
125
+ flowerpot with flowers potted inside,
126
+ monochrome flowerpot,
127
+ a flowerpot filled with black soil,
128
+ apple growing on trees,
129
+ red complete apple,
130
+ apple with a stalk,
131
+ a woman brushing her teeth,
132
+ toothbrush held by someone,
133
+ toilet brush with colored bristles,
134
+ a customer whose hair is being cut by barber,
135
+ a barber at work,
136
+ cloth covering the barber,
137
+ shopping cart pushed by people in the supermarket,
138
+ shopping cart with people in the supermarket,
139
+ shopping cart full of goods,
140
+ a child wearing a mask,
141
+ refrigerator with fruit,
142
+ a drink bottle in the refrigerator,
143
+ refrigerator with more than two doors,
144
+ a watch placed on a table or cloth,
145
+ a watch with three or more watch hands can be seen,
146
+ a watch with one or more small dials,
147
+ clothes hanger,
148
+ a piece of clothing hanging on the hanger,
149
+ a piece of clothing worn on plastic models,
150
+ leather bag with glossy surface,
151
+ backpack,
152
+ open package,
153
+ a fish held by people,
154
+ a person who is fishing with a fishing rod,
155
+ a fisherman standing on the shore with his body soaked in water, camera hold on someone's shoulder,
156
+ a person being interviewed,
157
+ a person with microphone hold in hand,
158
+ """,
159
+ 0.20,
160
+ ["object detection", "instance segmentation"],
161
+ ],
162
+ [
163
+ this_dir + "/examples/094_56726435.jpg",
164
+ # "donut with colored granules on the surface",
165
+ """donut with colored granules on the surface,
166
+ railings being crossed by horses,
167
+ a horse running or jumping,
168
+ equestrian rider's helmet,
169
+ outdoor dog led by rope,
170
+ a dog being touched,
171
+ clothed dog,
172
+ basketball in hand,
173
+ a basketball player with both feet off the ground,
174
+ player with basketball in the hand,
175
+ spoon on the plate,
176
+ coffee cup with coffee,
177
+ the nearest dessert to the coffee cup,
178
+ the bartender who is mixing wine,
179
+ a bartender in a suit,
180
+ wine glass with wine,
181
+ a person in aprons,
182
+ pot with food,
183
+ a knife being used to cut vegetables,
184
+ striped sofa in the room,
185
+ a sofa with pillows on it in the room,
186
+ lights on in the room,
187
+ an indoor lying pet,
188
+ a cat on the sofa,
189
+ one pet looking directly at the camera indoors,
190
+ a bed with patterns in the room,
191
+ the lamp on the table beside the bed,
192
+ pillow placed at the head of the bed,
193
+ a blackboard full of words in the classroom,
194
+ a blackboard or whiteboard with something pasted,
195
+ child sitting at desks in the classroom,
196
+ a person standing in front of bookshelves in the library,
197
+ the table someone is using in the library,
198
+ a person who touches books in the library,
199
+ a person standing in front of the cake counter,
200
+ a square plate full of cakes,
201
+ a cake decorated with cream,
202
+ hot dog with vegetables,
203
+ hot dog with sauce on the surface,
204
+ red sausage,
205
+ flowerpot with flowers potted inside,
206
+ monochrome flowerpot,
207
+ a flowerpot filled with black soil,
208
+ apple growing on trees,
209
+ red complete apple,
210
+ apple with a stalk,
211
+ a woman brushing her teeth,
212
+ toothbrush held by someone,
213
+ toilet brush with colored bristles,
214
+ a customer whose hair is being cut by barber,
215
+ a barber at work,
216
+ cloth covering the barber,
217
+ a plastic toy,
218
+ a plush toy,
219
+ a humanoid toy,
220
+ shopping cart pushed by people in the supermarket,
221
+ shopping cart with people in the supermarket,
222
+ shopping cart full of goods,
223
+ a child wearing a mask,
224
+ a mask on face with half a face exposed,
225
+ a mask on face with only eyes exposed,
226
+ refrigerator with fruit,
227
+ a drink bottle in the refrigerator,
228
+ refrigerator with more than two doors,
229
+ a watch placed on a table or cloth,
230
+ a watch with three or more watch hands can be seen,
231
+ a watch with one or more small dials,
232
+ clothes hanger,
233
+ a piece of clothing hanging on the hanger,
234
+ a piece of clothing worn on plastic models,
235
+ leather bag with glossy surface,
236
+ backpack,
237
+ open package,
238
+ a fish held by people,
239
+ a person who is fishing with a fishing rod,
240
+ a fisherman standing on the shore with his body soaked in water, camera hold on someone's shoulder,
241
+ a person being interviewed,
242
+ a person with microphone hold in hand,
243
+ """,
244
+ 0.50,
245
+ ["object detection", "instance segmentation"],
246
+ ],
247
+ [
248
+ this_dir + "/examples/013_438973263.jpg",
249
+ # "a male lion with a mane",
250
+ """a male lion with a mane,
251
+ railings being crossed by horses,
252
+ a horse running or jumping,
253
+ equestrian rider's helmet,
254
+ outdoor dog led by rope,
255
+ a dog being touched,
256
+ clothed dog,
257
+ basketball in hand,
258
+ a basketball player with both feet off the ground,
259
+ player with basketball in the hand,
260
+ spoon on the plate,
261
+ coffee cup with coffee,
262
+ the nearest dessert to the coffee cup,
263
+ the bartender who is mixing wine,
264
+ a bartender in a suit,
265
+ wine glass with wine,
266
+ a person in aprons,
267
+ pot with food,
268
+ a knife being used to cut vegetables,
269
+ striped sofa in the room,
270
+ a sofa with pillows on it in the room,
271
+ lights on in the room,
272
+ an indoor lying pet,
273
+ a cat on the sofa,
274
+ one pet looking directly at the camera indoors,
275
+ a bed with patterns in the room,
276
+ the lamp on the table beside the bed,
277
+ pillow placed at the head of the bed,
278
+ a blackboard full of words in the classroom,
279
+ a blackboard or whiteboard with something pasted,
280
+ child sitting at desks in the classroom,
281
+ a person standing in front of bookshelves in the library,
282
+ the table someone is using in the library,
283
+ a person who touches books in the library,
284
+ a person standing in front of the cake counter,
285
+ a square plate full of cakes,
286
+ a cake decorated with cream,
287
+ hot dog with vegetables,
288
+ hot dog with sauce on the surface,
289
+ red sausage,
290
+ flowerpot with flowers potted inside,
291
+ monochrome flowerpot,
292
+ a flowerpot filled with black soil,
293
+ apple growing on trees,
294
+ red complete apple,
295
+ apple with a stalk,
296
+ a woman brushing her teeth,
297
+ toothbrush held by someone,
298
+ toilet brush with colored bristles,
299
+ a customer whose hair is being cut by barber,
300
+ a barber at work,
301
+ cloth covering the barber,
302
+ a plastic toy,
303
+ a plush toy,
304
+ a humanoid toy,
305
+ shopping cart pushed by people in the supermarket,
306
+ shopping cart with people in the supermarket,
307
+ shopping cart full of goods,
308
+ a child wearing a mask,
309
+ a mask on face with half a face exposed,
310
+ a mask on face with only eyes exposed,
311
+ refrigerator with fruit,
312
+ a drink bottle in the refrigerator,
313
+ refrigerator with more than two doors,
314
+ a watch placed on a table or cloth,
315
+ a watch with three or more watch hands can be seen,
316
+ a watch with one or more small dials,
317
+ clothes hanger,
318
+ a piece of clothing hanging on the hanger,
319
+ a piece of clothing worn on plastic models,
320
+ leather bag with glossy surface,
321
+ backpack,
322
+ open package,
323
+ a fish held by people,
324
+ a person who is fishing with a fishing rod,
325
+ a fisherman standing on the shore with his body soaked in water, camera hold on someone's shoulder,
326
+ a person being interviewed,
327
+ a person with microphone hold in hand,
328
+ """,
329
+ # 0.25,
330
+ 0.50,
331
+ ["object detection", "instance segmentation"],
332
+ ],
333
+ ]
334
+
335
+ ckpt_repo_id = "shenyunhang/APE"
336
+
337
+
338
+ def setup_model(name):
339
+ gc.collect()
340
+ torch.cuda.empty_cache()
341
+
342
+ if save_memory:
343
+ pass
344
+ else:
345
+ return
346
+
347
+ for key, demo in all_demo.items():
348
+ if key == name:
349
+ demo.predictor.model.to(running_device)
350
+ else:
351
+ demo.predictor.model.to("cpu")
352
+
353
+ gc.collect()
354
+ torch.cuda.empty_cache()
355
+
356
+
357
+ def run_on_image_A(input_image_path, input_text, score_threshold, output_type):
358
+ logger.info("run_on_image")
359
+
360
+ setup_model("APE_A")
361
+ demo = all_demo["APE_A"]
362
+ cfg = all_cfg["APE_A"]
363
+ demo.predictor.model.model_vision.test_score_thresh = score_threshold
364
+
365
+ return run_on_image(
366
+ input_image_path,
367
+ input_text,
368
+ output_type,
369
+ demo,
370
+ cfg,
371
+ )
372
+
373
+
374
+ def run_on_image_C(input_image_path, input_text, score_threshold, output_type):
375
+ logger.info("run_on_image_C")
376
+
377
+ setup_model("APE_C")
378
+ demo = all_demo["APE_C"]
379
+ cfg = all_cfg["APE_C"]
380
+ demo.predictor.model.model_vision.test_score_thresh = score_threshold
381
+
382
+ return run_on_image(
383
+ input_image_path,
384
+ input_text,
385
+ output_type,
386
+ demo,
387
+ cfg,
388
+ )
389
+
390
+
391
+ def run_on_image_D(input_image_path, input_text, score_threshold, output_type):
392
+ logger.info("run_on_image_D")
393
+
394
+ setup_model("APE_D")
395
+ demo = all_demo["APE_D"]
396
+ cfg = all_cfg["APE_D"]
397
+ demo.predictor.model.model_vision.test_score_thresh = score_threshold
398
+
399
+ return run_on_image(
400
+ input_image_path,
401
+ input_text,
402
+ output_type,
403
+ demo,
404
+ cfg,
405
+ )
406
+
407
+
408
+ def run_on_image_comparison(input_image_path, input_text, score_threshold, output_type):
409
+ logger.info("run_on_image_comparison")
410
+
411
+ r = []
412
+ for key in all_demo.keys():
413
+ logger.info("run_on_image_comparison {}".format(key))
414
+ setup_model(key)
415
+ demo = all_demo[key]
416
+ cfg = all_cfg[key]
417
+ demo.predictor.model.model_vision.test_score_thresh = score_threshold
418
+
419
+ img, _ = run_on_image(
420
+ input_image_path,
421
+ input_text,
422
+ output_type,
423
+ demo,
424
+ cfg,
425
+ )
426
+ r.append(img)
427
+
428
+ return r
429
+
430
+
431
+ def run_on_image(
432
+ input_image_path,
433
+ input_text,
434
+ output_type,
435
+ demo,
436
+ cfg,
437
+ ):
438
+ with_box = False
439
+ with_mask = False
440
+ with_sseg = False
441
+ if "object detection" in output_type:
442
+ with_box = True
443
+ if "instance segmentation" in output_type:
444
+ with_mask = True
445
+ if "semantic segmentation" in output_type:
446
+ with_sseg = True
447
+
448
+ if isinstance(input_image_path, dict):
449
+ input_mask_path = input_image_path["mask"]
450
+ input_image_path = input_image_path["image"]
451
+ print("input_image_path", input_image_path)
452
+ print("input_mask_path", input_mask_path)
453
+ else:
454
+ input_mask_path = None
455
+
456
+ print("input_text", input_text)
457
+
458
+ if isinstance(cfg, CfgNode):
459
+ input_format = cfg.INPUT.FORMAT
460
+ else:
461
+ if "model_vision" in cfg.model:
462
+ input_format = cfg.model.model_vision.input_format
463
+ else:
464
+ input_format = cfg.model.input_format
465
+
466
+ input_image = read_image(input_image_path, format="BGR")
467
+ # img = cv2.imread(input_image_path)
468
+ # cv2.imwrite("tmp.jpg", img)
469
+ # # input_image = read_image("tmp.jpg", format=input_format)
470
+ # input_image = read_image("tmp.jpg", format="BGR")
471
+
472
+ if input_mask_path is not None:
473
+ input_mask = read_image(input_mask_path, "L").squeeze(2)
474
+ print("input_mask", input_mask)
475
+ print("input_mask", input_mask.shape)
476
+ else:
477
+ input_mask = None
478
+
479
+ if not with_box and not with_mask and not with_sseg:
480
+ return input_image[:, :, ::-1]
481
+
482
+ if input_image.shape[0] > 1024 or input_image.shape[1] > 1024:
483
+ transform = aug.get_transform(input_image)
484
+ input_image = transform.apply_image(input_image)
485
+ else:
486
+ transform = None
487
+
488
+ start_time = time.time()
489
+ predictions, visualized_output, _, metadata = demo.run_on_image(
490
+ input_image,
491
+ text_prompt=input_text,
492
+ mask_prompt=input_mask,
493
+ with_box=with_box,
494
+ with_mask=with_mask,
495
+ with_sseg=with_sseg,
496
+ )
497
+
498
+ logger.info(
499
+ "{} in {:.2f}s".format(
500
+ "detected {} instances".format(len(predictions["instances"]))
501
+ if "instances" in predictions
502
+ else "finished",
503
+ time.time() - start_time,
504
+ )
505
+ )
506
+
507
+ output_image = visualized_output.get_image()
508
+ print("output_image", output_image.shape)
509
+ # if input_format == "RGB":
510
+ # output_image = output_image[:, :, ::-1]
511
+ if transform:
512
+ output_image = transform.inverse().apply_image(output_image)
513
+ print("output_image", output_image.shape)
514
+
515
+ output_image = Image.fromarray(output_image)
516
+
517
+ gc.collect()
518
+ torch.cuda.empty_cache()
519
+
520
+ json_results = instances_to_coco_json(predictions["instances"].to(demo.cpu_device), 0)
521
+ for json_result in json_results:
522
+ json_result["category_name"] = metadata.thing_classes[json_result["category_id"]]
523
+ del json_result["image_id"]
524
+
525
+ return output_image, json_results
526
+
527
+
528
+ def load_APE_A():
529
+ # init_checkpoint= "output2/APE/configs/LVISCOCOCOCOSTUFF_O365_OID_VG/ape_deta/ape_deta_vitl_eva02_lsj_cp_720k_20230504_002019/model_final.pth"
530
+ init_checkpoint = "configs/LVISCOCOCOCOSTUFF_O365_OID_VG/ape_deta/ape_deta_vitl_eva02_lsj_cp_720k_20230504_002019/model_final.pth"
531
+ init_checkpoint = hf_hub_download(repo_id=ckpt_repo_id, filename=init_checkpoint)
532
+
533
+ args = get_parser().parse_args()
534
+ args.config_file = get_config_file(
535
+ "LVISCOCOCOCOSTUFF_O365_OID_VG/ape_deta/ape_deta_vitl_eva02_lsj1024_cp_720k.py"
536
+ )
537
+ args.confidence_threshold = 0.01
538
+ args.opts = [
539
+ "train.init_checkpoint='{}'".format(init_checkpoint),
540
+ "model.model_language.cache_dir=''",
541
+ "model.model_vision.select_box_nums_for_evaluation=500",
542
+ "model.model_vision.backbone.net.xattn=False",
543
+ "model.model_vision.transformer.encoder.pytorch_attn=True",
544
+ "model.model_vision.transformer.decoder.pytorch_attn=True",
545
+ ]
546
+ if running_device == "cpu":
547
+ args.opts += [
548
+ "model.model_language.dtype='float32'",
549
+ ]
550
+ logger.info("Arguments: " + str(args))
551
+ cfg = setup_cfg(args)
552
+
553
+ cfg.model.model_vision.criterion[0].use_fed_loss = False
554
+ cfg.model.model_vision.criterion[2].use_fed_loss = False
555
+ cfg.train.device = running_device
556
+
557
+ ape.modeling.text.eva01_clip.eva_clip._MODEL_CONFIGS[cfg.model.model_language.clip_model][
558
+ "vision_cfg"
559
+ ]["layers"] = 1
560
+ ape.modeling.text.eva01_clip.eva_clip._MODEL_CONFIGS[cfg.model.model_language.clip_model][
561
+ "vision_cfg"
562
+ ]["fusedLN"] = False
563
+
564
+ demo = VisualizationDemo(cfg, args=args)
565
+ if save_memory:
566
+ demo.predictor.model.to("cpu")
567
+ # demo.predictor.model.half()
568
+ else:
569
+ demo.predictor.model.to(running_device)
570
+
571
+ all_demo["APE_A"] = demo
572
+ all_cfg["APE_A"] = cfg
573
+
574
+
575
+ def load_APE_B():
576
+ # init_checkpoint= "output2/APE/configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_REFCOCO/ape_deta/ape_deta_vitl_eva02_vlf_lsj_cp_1080k_20230702_225418/model_final.pth"
577
+ init_checkpoint = "configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_REFCOCO/ape_deta/ape_deta_vitl_eva02_vlf_lsj_cp_1080k_20230702_225418/model_final.pth"
578
+ init_checkpoint = hf_hub_download(repo_id=ckpt_repo_id, filename=init_checkpoint)
579
+
580
+ args = get_parser().parse_args()
581
+ args.config_file = get_config_file(
582
+ "LVISCOCOCOCOSTUFF_O365_OID_VGR_REFCOCO/ape_deta/ape_deta_vitl_eva02_vlf_lsj1024_cp_1080k.py"
583
+ )
584
+ args.confidence_threshold = 0.01
585
+ args.opts = [
586
+ "train.init_checkpoint='{}'".format(init_checkpoint),
587
+ "model.model_language.cache_dir=''",
588
+ "model.model_vision.select_box_nums_for_evaluation=500",
589
+ "model.model_vision.text_feature_bank_reset=True",
590
+ "model.model_vision.backbone.net.xattn=False",
591
+ "model.model_vision.transformer.encoder.pytorch_attn=True",
592
+ "model.model_vision.transformer.decoder.pytorch_attn=True",
593
+ ]
594
+ if running_device == "cpu":
595
+ args.opts += [
596
+ "model.model_language.dtype='float32'",
597
+ ]
598
+ logger.info("Arguments: " + str(args))
599
+ cfg = setup_cfg(args)
600
+
601
+ cfg.model.model_vision.criterion[0].use_fed_loss = False
602
+ cfg.model.model_vision.criterion[2].use_fed_loss = False
603
+ cfg.train.device = running_device
604
+
605
+ ape.modeling.text.eva01_clip.eva_clip._MODEL_CONFIGS[cfg.model.model_language.clip_model][
606
+ "vision_cfg"
607
+ ]["layers"] = 1
608
+ ape.modeling.text.eva01_clip.eva_clip._MODEL_CONFIGS[cfg.model.model_language.clip_model][
609
+ "vision_cfg"
610
+ ]["fusedLN"] = False
611
+
612
+ demo = VisualizationDemo(cfg, args=args)
613
+ if save_memory:
614
+ demo.predictor.model.to("cpu")
615
+ # demo.predictor.model.half()
616
+ else:
617
+ demo.predictor.model.to(running_device)
618
+
619
+ all_demo["APE_B"] = demo
620
+ all_cfg["APE_B"] = cfg
621
+
622
+
623
+ def load_APE_C():
624
+ # init_checkpoint= "output2/APE/configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO/ape_deta/ape_deta_vitl_eva02_vlf_lsj_cp_1080k_20230702_210950/model_final.pth"
625
+ init_checkpoint = "configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO/ape_deta/ape_deta_vitl_eva02_vlf_lsj_cp_1080k_20230702_210950/model_final.pth"
626
+ init_checkpoint = hf_hub_download(repo_id=ckpt_repo_id, filename=init_checkpoint)
627
+
628
+ args = get_parser().parse_args()
629
+ args.config_file = get_config_file(
630
+ "LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO/ape_deta/ape_deta_vitl_eva02_vlf_lsj1024_cp_1080k.py"
631
+ )
632
+ args.confidence_threshold = 0.01
633
+ args.opts = [
634
+ "train.init_checkpoint='{}'".format(init_checkpoint),
635
+ "model.model_language.cache_dir=''",
636
+ "model.model_vision.select_box_nums_for_evaluation=500",
637
+ "model.model_vision.text_feature_bank_reset=True",
638
+ "model.model_vision.backbone.net.xattn=False",
639
+ "model.model_vision.transformer.encoder.pytorch_attn=True",
640
+ "model.model_vision.transformer.decoder.pytorch_attn=True",
641
+ ]
642
+ if running_device == "cpu":
643
+ args.opts += [
644
+ "model.model_language.dtype='float32'",
645
+ ]
646
+ logger.info("Arguments: " + str(args))
647
+ cfg = setup_cfg(args)
648
+
649
+ cfg.model.model_vision.criterion[0].use_fed_loss = False
650
+ cfg.model.model_vision.criterion[2].use_fed_loss = False
651
+ cfg.train.device = running_device
652
+
653
+ ape.modeling.text.eva01_clip.eva_clip._MODEL_CONFIGS[cfg.model.model_language.clip_model][
654
+ "vision_cfg"
655
+ ]["layers"] = 1
656
+ ape.modeling.text.eva01_clip.eva_clip._MODEL_CONFIGS[cfg.model.model_language.clip_model][
657
+ "vision_cfg"
658
+ ]["fusedLN"] = False
659
+
660
+ demo = VisualizationDemo(cfg, args=args)
661
+ if save_memory:
662
+ demo.predictor.model.to("cpu")
663
+ # demo.predictor.model.half()
664
+ else:
665
+ demo.predictor.model.to(running_device)
666
+
667
+ all_demo["APE_C"] = demo
668
+ all_cfg["APE_C"] = cfg
669
+
670
+
671
+ def load_APE_D():
672
+ # init_checkpoint= "output2/APE/configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO_GQA_PhraseCut_Flickr30k/ape_deta/ape_deta_vitl_eva02_clip_vlf_lsj1024_cp_16x4_1080k_mdl_20230829_162438/model_final.pth"
673
+ init_checkpoint = "configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO_GQA_PhraseCut_Flickr30k/ape_deta/ape_deta_vitl_eva02_clip_vlf_lsj1024_cp_16x4_1080k_mdl_20230829_162438/model_final.pth"
674
+ init_checkpoint = hf_hub_download(repo_id=ckpt_repo_id, filename=init_checkpoint)
675
+
676
+ args = get_parser().parse_args()
677
+ args.config_file = get_config_file(
678
+ "LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO_GQA_PhraseCut_Flickr30k/ape_deta/ape_deta_vitl_eva02_clip_vlf_lsj1024_cp_16x4_1080k.py"
679
+ )
680
+ args.confidence_threshold = 0.01
681
+ args.opts = [
682
+ "train.init_checkpoint='{}'".format(init_checkpoint),
683
+ "model.model_language.cache_dir=''",
684
+ "model.model_vision.select_box_nums_for_evaluation=500",
685
+ "model.model_vision.text_feature_bank_reset=True",
686
+ "model.model_vision.backbone.net.xattn=False",
687
+ "model.model_vision.transformer.encoder.pytorch_attn=True",
688
+ "model.model_vision.transformer.decoder.pytorch_attn=True",
689
+ ]
690
+ if running_device == "cpu":
691
+ args.opts += [
692
+ "model.model_language.dtype='float32'",
693
+ ]
694
+ logger.info("Arguments: " + str(args))
695
+ cfg = setup_cfg(args)
696
+
697
+ cfg.model.model_vision.criterion[0].use_fed_loss = False
698
+ cfg.model.model_vision.criterion[2].use_fed_loss = False
699
+ cfg.train.device = running_device
700
+
701
+ ape.modeling.text.eva02_clip.factory._MODEL_CONFIGS[cfg.model.model_language.clip_model][
702
+ "vision_cfg"
703
+ ]["layers"] = 1
704
+
705
+ demo = VisualizationDemo(cfg, args=args)
706
+ if save_memory:
707
+ demo.predictor.model.to("cpu")
708
+ # demo.predictor.model.half()
709
+ else:
710
+ demo.predictor.model.to(running_device)
711
+
712
+ all_demo["APE_D"] = demo
713
+ all_cfg["APE_D"] = cfg
714
+
715
+
716
+ def APE_A_tab():
717
+ with gr.Tab("APE A"):
718
+ with gr.Row(equal_height=False):
719
+ with gr.Column(scale=1):
720
+ input_image = gr.Image(
721
+ sources=["upload"],
722
+ type="filepath",
723
+ # tool="sketch",
724
+ # brush_radius=50,
725
+ )
726
+ input_text = gr.Textbox(
727
+ label="Object Prompt (optional; if not provided, only COCO objects are detected.)",
728
+ info="Format: word1,word2,word3,...",
729
+ )
730
+
731
+ score_threshold = gr.Slider(
732
+ label="Score Threshold", minimum=0.01, maximum=1.0, value=0.3, step=0.01
733
+ )
734
+
735
+ output_type = gr.CheckboxGroup(
736
+ ["object detection", "instance segmentation"],
737
+ value=["object detection", "instance segmentation"],
738
+ label="Output Type",
739
+ info="Which kind of output is displayed?",
740
+ )
741
+
742
+ run_button = gr.Button("Run")
743
+
744
+ with gr.Column(scale=2):
745
+ gallery = gr.Image(
746
+ type="pil",
747
+ )
748
+
749
+ example_data = gr.Dataset(
750
+ components=[input_image, input_text, score_threshold],
751
+ samples=example_list,
752
+ samples_per_page=5,
753
+ )
754
+ example_data.click(fn=set_example, inputs=example_data, outputs=example_data.components)
755
+
756
+ # add_tail_info()
757
+ output_json = gr.JSON(label="json results")
758
+
759
+ run_button.click(
760
+ fn=run_on_image_A,
761
+ inputs=[input_image, input_text, score_threshold, output_type],
762
+ outputs=[gallery, output_json],
763
+ )
764
+
765
+
766
+ def APE_C_tab():
767
+ with gr.Tab("APE C"):
768
+ with gr.Row(equal_height=False):
769
+ with gr.Column(scale=1):
770
+ input_image = gr.Image(
771
+ sources=["upload"],
772
+ type="filepath",
773
+ # tool="sketch",
774
+ # brush_radius=50,
775
+ )
776
+ input_text = gr.Textbox(
777
+ label="Object Prompt (optional; if not provided, only COCO objects are detected.)",
778
+ info="Format: word1,word2,sentence1,sentence2,...",
779
+ )
780
+
781
+ score_threshold = gr.Slider(
782
+ label="Score Threshold", minimum=0.01, maximum=1.0, value=0.3, step=0.01
783
+ )
784
+
785
+ output_type = gr.CheckboxGroup(
786
+ ["object detection", "instance segmentation", "semantic segmentation"],
787
+ value=["object detection", "instance segmentation"],
788
+ label="Output Type",
789
+ info="Which kind of output is displayed?",
790
+ )
791
+
792
+ run_button = gr.Button("Run")
793
+
794
+ with gr.Column(scale=2):
795
+ gallery = gr.Image(
796
+ type="pil",
797
+ )
798
+
799
+ example_data = gr.Dataset(
800
+ components=[input_image, input_text, score_threshold],
801
+ samples=example_list,
802
+ samples_per_page=5,
803
+ )
804
+ example_data.click(fn=set_example, inputs=example_data, outputs=example_data.components)
805
+
806
+ # add_tail_info()
807
+ output_json = gr.JSON(label="json results")
808
+
809
+ run_button.click(
810
+ fn=run_on_image_C,
811
+ inputs=[input_image, input_text, score_threshold, output_type],
812
+ outputs=[gallery, output_json],
813
+ )
814
+
815
+
816
+ def APE_D_tab():
817
+ with gr.Tab("APE D"):
818
+ with gr.Row(equal_height=False):
819
+ with gr.Column(scale=1):
820
+ input_image = gr.Image(
821
+ sources=["upload"],
822
+ type="filepath",
823
+ # tool="sketch",
824
+ # brush_radius=50,
825
+ )
826
+ input_text = gr.Textbox(
827
+ label="Object Prompt (optional; if not provided, only COCO objects are detected.)",
828
+ info="Format: word1,word2,sentence1,sentence2,...",
829
+ )
830
+
831
+ score_threshold = gr.Slider(
832
+ label="Score Threshold", minimum=0.01, maximum=1.0, value=0.1, step=0.01
833
+ )
834
+
835
+ output_type = gr.CheckboxGroup(
836
+ ["object detection", "instance segmentation", "semantic segmentation"],
837
+ value=["object detection", "instance segmentation"],
838
+ label="Output Type",
839
+ info="Which kind of output is displayed?",
840
+ )
841
+
842
+ run_button = gr.Button("Run")
843
+
844
+ with gr.Column(scale=2):
845
+ gallery = gr.Image(
846
+ type="pil",
847
+ )
848
+
849
+ gr.Examples(
850
+ examples=example_list,
851
+ inputs=[input_image, input_text, score_threshold, output_type],
852
+ )
853
+
854
+ # add_tail_info()
855
+ output_json = gr.JSON(label="json results")
856
+
857
+ run_button.click(
858
+ fn=run_on_image_D,
859
+ inputs=[input_image, input_text, score_threshold, output_type],
860
+ outputs=[gallery, output_json],
861
+ )
862
+
863
+
864
+ def comparison_tab():
865
+ with gr.Tab("APE all"):
866
+ with gr.Row(equal_height=False):
867
+ with gr.Column(scale=1):
868
+ input_image = gr.Image(
869
+ sources=["upload"],
870
+ type="filepath",
871
+ # tool="sketch",
872
+ # brush_radius=50,
873
+ )
874
+ input_text = gr.Textbox(
875
+ label="Object Prompt (optional; if not provided, only COCO objects are detected.)",
876
+ info="Format: word1,word2,sentence1,sentence2,...",
877
+ )
878
+
879
+ score_threshold = gr.Slider(
880
+ label="Score Threshold", minimum=0.01, maximum=1.0, value=0.1, step=0.01
881
+ )
882
+
883
+ output_type = gr.CheckboxGroup(
884
+ ["object detection", "instance segmentation", "semantic segmentation"],
885
+ value=["object detection", "instance segmentation"],
886
+ label="Output Type",
887
+ info="Which kind of output is displayed?",
888
+ )
889
+
890
+ run_button = gr.Button("Run")
891
+
892
+ gallery_all = []
893
+ with gr.Column(scale=2):
894
+ for key in all_demo.keys():
895
+ gallery = gr.Image(
896
+ label=key,
897
+ type="pil",
898
+ )
899
+ gallery_all.append(gallery)
900
+
901
+ gr.Examples(
902
+ examples=example_list,
903
+ inputs=[input_image, input_text, score_threshold, output_type],
904
+ )
905
+
906
+ # add_tail_info()
907
+
908
+ run_button.click(
909
+ fn=run_on_image_comparison,
910
+ inputs=[input_image, input_text, score_threshold, output_type],
911
+ outputs=gallery_all,
912
+ )
913
+
914
+
915
+ def is_port_in_use(port: int) -> bool:
916
+ import socket
917
+
918
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
919
+ return s.connect_ex(("localhost", port)) == 0
920
+
921
+
922
+ def add_head_info(max_available_memory):
923
+ gr.Markdown(
924
+ "# APE: Aligning and Prompting Everything All at Once for Universal Visual Perception"
925
+ )
926
+ if max_available_memory:
927
+ gr.Markdown(
928
+ "Note that multiple models are deployed on a single GPU, so it may take several minutes to run them and visualize the results."
929
+ )
930
+ else:
931
+ gr.Markdown(
932
+ "Note that multiple models are deployed on the CPU, so it may take a while to run them and visualize the results."
933
+ )
934
+ gr.Markdown(
935
+ "Note that results computed on CPU differ slightly from those computed on GPU, and some libraries are disabled on CPU."
936
+ )
937
+ gr.Markdown(
938
+ "If the demo runs out of memory, try to ***decrease*** the number of object prompts and ***increase*** the score threshold."
939
+ )
940
+
941
+ gr.Markdown("---")
942
+
943
+
944
+ def add_tail_info():
945
+ gr.Markdown("---")
946
+ gr.Markdown("### We also support location prompts")
947
+ gr.Markdown(
948
+ """
949
+ | Location prompt | Result | Location prompt | Result |
950
+ | ---- | ---- | ---- | ---- |
951
+ | ![Location prompt](/file=examples/prompt/20230627-131346_11.176.20.67_mask.PNG) | ![Result](/file=examples/prompt/20230627-131346_11.176.20.67_pred.png) | ![Location prompt](/file=examples/prompt/20230627-131530_11.176.20.67_mask.PNG) | ![Result](/file=examples/prompt/20230627-131530_11.176.20.67_pred.png) |
952
+ | ![Location prompt](/file=examples/prompt/20230627-131520_11.176.20.67_mask.PNG) | ![Result](/file=examples/prompt/20230627-131520_11.176.20.67_pred.png) | ![Location prompt](/file=examples/prompt/20230627-114219_11.176.20.67_mask.PNG) | ![Result](/file=examples/prompt/20230627-114219_11.176.20.67_pred.png) |
953
+ """
954
+ )
955
+ gr.Markdown("---")
956
+
957
+
958
+ if __name__ == "__main__":
959
+ server_port = None  # fallback when both candidate ports are busy
+ available_port = [80, 8080]
960
+ for port in available_port:
961
+ if is_port_in_use(port):
962
+ continue
963
+ else:
964
+ server_port = port
965
+ break
966
+ print("server_port", server_port)
967
+
968
+ available_memory = [
969
+ torch.cuda.mem_get_info(i)[0] / 1024**3 for i in range(torch.cuda.device_count())
970
+ ]
971
+
972
+ global running_device
973
+ if len(available_memory) > 0:
974
+ max_available_memory = max(available_memory)
975
+ device_id = available_memory.index(max_available_memory)
976
+
977
+ running_device = "cuda:" + str(device_id)
978
+ else:
979
+ max_available_memory = 0
980
+ running_device = "cpu"
981
+
982
+ global save_memory
983
+ save_memory = False
984
+ if max_available_memory > 0 and max_available_memory < 40:
985
+ save_memory = True
986
+
987
+ print("available_memory", available_memory)
988
+ print("max_available_memory", max_available_memory)
989
+ print("running_device", running_device)
990
+ print("save_memory", save_memory)
991
+
992
+ # ==========================================================================================
993
+
994
+ mp.set_start_method("spawn", force=True)
995
+ setup_logger(name="fvcore")
996
+ setup_logger(name="ape")
997
+ global logger
998
+ logger = setup_logger()
999
+
1000
+ global aug
1001
+ aug = T.ResizeShortestEdge([1024, 1024], 1024)
1002
+
1003
+ global all_demo
1004
+ all_demo = {}
1005
+ all_cfg = {}
1006
+
1007
+ # load_APE_A()
1008
+ # load_APE_B()
1009
+ # load_APE_C()
1010
+ save_memory = False
1011
+ load_APE_D()
1012
+
1013
+ title = "APE: Aligning and Prompting Everything All at Once for Universal Visual Perception"
1014
+ block = gr.Blocks(title=title).queue()
1015
+ with block:
1016
+ add_head_info(max_available_memory)
1017
+
1018
+ # APE_A_tab()
1019
+ # APE_C_tab()
1020
+ APE_D_tab()
1021
+
1022
+ comparison_tab()
1023
+
1024
+ # add_tail_info()
1025
+
1026
+ block.launch(
1027
+ share=False,
1028
+ # server_name="0.0.0.0",
1029
+ # server_port=server_port,
1030
+ show_api=False,
1031
+ show_error=True,
1032
+ )
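For reference, the "json results" panel above is populated from `instances_to_coco_json` plus the `category_name` field that `run_on_image()` attaches. A small hedged sketch of post-processing that output (field names are assumed from the code above, not defined by this commit):

```python
# Sketch: post-process the json_results returned by run_on_image() above.
# Each entry keeps the COCO-style fields from instances_to_coco_json
# ("category_id", "bbox", "score", ...) plus the "category_name" added in app.py.
def filter_results(json_results, category_name, min_score=0.3):
    """Keep detections of one prompted category above a score threshold."""
    return [
        r
        for r in json_results
        if r["category_name"] == category_name and r["score"] >= min_score
    ]

# e.g. with json_results from run_on_image_D on the Transformers example:
# confident = filter_results(json_results, "Optimus Prime", min_score=0.3)
```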
demo_lazy.py ADDED
@@ -0,0 +1,263 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import argparse
3
+ import glob
4
+ import json
5
+ import multiprocessing as mp
6
+ import os
7
+ import tempfile
8
+ import time
9
+ import warnings
10
+ from collections import abc
11
+
12
+ import cv2
13
+ import numpy as np
14
+ import tqdm
15
+
16
+ from detectron2.config import LazyConfig, get_cfg
17
+ from detectron2.data.detection_utils import read_image
18
+ from detectron2.evaluation.coco_evaluation import instances_to_coco_json
19
+
20
+ # from detectron2.projects.deeplab import add_deeplab_config
21
+ # from detectron2.projects.panoptic_deeplab import add_panoptic_deeplab_config
22
+ from detectron2.utils.logger import setup_logger
23
+ from predictor_lazy import VisualizationDemo
24
+
25
+ # constants
26
+ WINDOW_NAME = "APE"
27
+
28
+
29
+ def setup_cfg(args):
30
+ # load config from file and command-line arguments
31
+ cfg = LazyConfig.load(args.config_file)
32
+ cfg = LazyConfig.apply_overrides(cfg, args.opts)
33
+
34
+ if "output_dir" in cfg.model:
35
+ cfg.model.output_dir = cfg.train.output_dir
36
+ if "model_vision" in cfg.model and "output_dir" in cfg.model.model_vision:
37
+ cfg.model.model_vision.output_dir = cfg.train.output_dir
38
+ if "train" in cfg.dataloader:
39
+ if isinstance(cfg.dataloader.train, abc.MutableSequence):
40
+ for i in range(len(cfg.dataloader.train)):
41
+ if "output_dir" in cfg.dataloader.train[i].mapper:
42
+ cfg.dataloader.train[i].mapper.output_dir = cfg.train.output_dir
43
+ else:
44
+ if "output_dir" in cfg.dataloader.train.mapper:
45
+ cfg.dataloader.train.mapper.output_dir = cfg.train.output_dir
46
+
47
+ if "model_vision" in cfg.model:
48
+ cfg.model.model_vision.test_score_thresh = args.confidence_threshold
49
+ else:
50
+ cfg.model.test_score_thresh = args.confidence_threshold
51
+
52
+ # default_setup(cfg, args)
53
+
54
+ setup_logger(name="ape")
55
+ setup_logger(name="timm")
56
+
57
+ return cfg
58
+
59
+
60
+ def get_parser():
61
+ parser = argparse.ArgumentParser(description="Detectron2 demo for builtin configs")
62
+ parser.add_argument(
63
+ "--config-file",
64
+ default="configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml",
65
+ metavar="FILE",
66
+ help="path to config file",
67
+ )
68
+ parser.add_argument("--webcam", action="store_true", help="Take inputs from webcam.")
69
+ parser.add_argument("--video-input", help="Path to video file.")
70
+ parser.add_argument(
71
+ "--input",
72
+ nargs="+",
73
+ help="A list of space separated input images; "
74
+ "or a single glob pattern such as 'directory/*.jpg'",
75
+ )
76
+ parser.add_argument(
77
+ "--output",
78
+ help="A file or directory to save output visualizations. "
79
+ "If not given, will show output in an OpenCV window.",
80
+ )
81
+
82
+ parser.add_argument(
83
+ "--confidence-threshold",
84
+ type=float,
85
+ default=0.5,
86
+ help="Minimum score for instance predictions to be shown",
87
+ )
88
+ parser.add_argument(
89
+ "--opts",
90
+ help="Modify config options using the command-line 'KEY VALUE' pairs",
91
+ default=[],
92
+ nargs=argparse.REMAINDER,
93
+ )
94
+
95
+ parser.add_argument("--text-prompt", default=None)
96
+
97
+ parser.add_argument("--with-box", action="store_true", help="show box of instance")
98
+ parser.add_argument("--with-mask", action="store_true", help="show mask of instance")
99
+ parser.add_argument("--with-sseg", action="store_true", help="show mask of class")
100
+
101
+ return parser
102
+
103
+
104
+ def test_opencv_video_format(codec, file_ext):
105
+ with tempfile.TemporaryDirectory(prefix="video_format_test") as dir:
106
+ filename = os.path.join(dir, "test_file" + file_ext)
107
+ writer = cv2.VideoWriter(
108
+ filename=filename,
109
+ fourcc=cv2.VideoWriter_fourcc(*codec),
110
+ fps=float(30),
111
+ frameSize=(10, 10),
112
+ isColor=True,
113
+ )
114
+ [writer.write(np.zeros((10, 10, 3), np.uint8)) for _ in range(30)]
115
+ writer.release()
116
+ if os.path.isfile(filename):
117
+ return True
118
+ return False
119
+
120
+
121
+ if __name__ == "__main__":
122
+ mp.set_start_method("spawn", force=True)
123
+ args = get_parser().parse_args()
124
+ setup_logger(name="fvcore")
125
+ setup_logger(name="ape")
126
+ logger = setup_logger()
127
+ logger.info("Arguments: " + str(args))
128
+
129
+ cfg = setup_cfg(args)
130
+
131
+ if args.video_input:
132
+ demo = VisualizationDemo(cfg, parallel=True, args=args)
133
+ else:
134
+ demo = VisualizationDemo(cfg, args=args)
135
+
136
+ if args.input:
137
+ if len(args.input) == 1:
138
+ args.input = glob.glob(os.path.expanduser(args.input[0]), recursive=True)
139
+ assert args.input, "The input path(s) was not found"
140
+ for path in tqdm.tqdm(args.input, disable=not args.output):
141
+ # use PIL, to be consistent with evaluation
142
+ try:
143
+ img = read_image(path, format="BGR")
144
+ except Exception as e:
145
+ print("*" * 60)
146
+ print("fail to open image: ", e)
147
+ print("*" * 60)
148
+ continue
149
+ start_time = time.time()
150
+ predictions, visualized_output, visualized_outputs, metadata = demo.run_on_image(
151
+ img,
152
+ text_prompt=args.text_prompt,
153
+ with_box=args.with_box,
154
+ with_mask=args.with_mask,
155
+ with_sseg=args.with_sseg,
156
+ )
157
+ logger.info(
158
+ "{}: {} in {:.2f}s".format(
159
+ path,
160
+ "detected {} instances".format(len(predictions["instances"]))
161
+ if "instances" in predictions
162
+ else "finished",
163
+ time.time() - start_time,
164
+ )
165
+ )
166
+
167
+ if args.output:
168
+ if os.path.isdir(args.output):
169
+ assert os.path.isdir(args.output), args.output
170
+ out_filename = os.path.join(args.output, os.path.basename(path))
171
+ else:
172
+ assert len(args.input) == 1, "Please specify a directory with args.output"
173
+ out_filename = args.output
174
+ out_filename = out_filename.replace(".webp", ".png")
175
+ out_filename = out_filename.replace(".crdownload", ".png")
176
+ out_filename = out_filename.replace(".jfif", ".png")
177
+ visualized_output.save(out_filename)
178
+
179
+ for i in range(len(visualized_outputs)):
180
+ out_filename = (
181
+ os.path.join(args.output, os.path.basename(path)) + "." + str(i) + ".png"
182
+ )
183
+ visualized_outputs[i].save(out_filename)
184
+
185
+ # import pickle
186
+ # with open(out_filename + ".pkl", "wb") as outp:
187
+ # pickle.dump(predictions, outp, pickle.HIGHEST_PROTOCOL)
188
+
189
+ if "instances" in predictions:
190
+ results = instances_to_coco_json(
191
+ predictions["instances"].to(demo.cpu_device), path
192
+ )
193
+ for result in results:
194
+ result["category_name"] = metadata.thing_classes[result["category_id"]]
195
+ result["image_name"] = result["image_id"]
196
+
197
+ with open(out_filename + ".json", "w") as outp:
198
+ json.dump(results, outp)
199
+ else:
200
+ cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
201
+ cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1])
202
+ if cv2.waitKey(0) == 27:
203
+ break # esc to quit
204
+ elif args.webcam:
205
+ assert args.input is None, "Cannot have both --input and --webcam!"
206
+ assert args.output is None, "output not yet supported with --webcam!"
207
+ cam = cv2.VideoCapture(0)
208
+ for vis in tqdm.tqdm(demo.run_on_video(cam)):
209
+ cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
210
+ cv2.imshow(WINDOW_NAME, vis)
211
+ if cv2.waitKey(1) == 27:
212
+ break # esc to quit
213
+ cam.release()
214
+ cv2.destroyAllWindows()
215
+ elif args.video_input:
216
+ video = cv2.VideoCapture(args.video_input)
217
+ width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
218
+ height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
219
+ frames_per_second = video.get(cv2.CAP_PROP_FPS)
220
+ num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
221
+ basename = os.path.basename(args.video_input)
222
+ codec, file_ext = (
223
+ ("x264", ".mkv") if test_opencv_video_format("x264", ".mkv") else ("mp4v", ".mp4")
224
+ )
225
+ codec, file_ext = "mp4v", ".mp4"
226
+ if codec == "mp4v":
227
+ warnings.warn("x264 codec not available, switching to mp4v")
228
+ if args.output:
229
+ if os.path.isdir(args.output):
230
+ output_fname = os.path.join(args.output, basename)
231
+ output_fname = os.path.splitext(output_fname)[0] + file_ext
232
+ else:
233
+ output_fname = args.output
234
+ assert not os.path.isfile(output_fname), output_fname
235
+ output_file = cv2.VideoWriter(
236
+ filename=output_fname,
237
+ # some installation of opencv may not support x264 (due to its license),
238
+ # you can try other format (e.g. MPEG)
239
+ fourcc=cv2.VideoWriter_fourcc(*codec),
240
+ fps=float(frames_per_second),
241
+ frameSize=(width, height),
242
+ isColor=True,
243
+ )
244
+ # i = 0
245
+ assert os.path.isfile(args.video_input)
246
+ for vis_frame, predictions in tqdm.tqdm(demo.run_on_video(video), total=num_frames):
247
+ if args.output:
248
+ output_file.write(vis_frame)
249
+
250
+ # import pickle
251
+ # with open(output_fname + "." + str(i) + ".pkl", "wb") as outp:
252
+ # pickle.dump(predictions, outp, pickle.HIGHEST_PROTOCOL)
253
+ # i += 1
254
+ else:
255
+ cv2.namedWindow(basename, cv2.WINDOW_NORMAL)
256
+ cv2.imshow(basename, vis_frame)
257
+ if cv2.waitKey(1) == 27:
258
+ break # esc to quit
259
+ video.release()
260
+ if args.output:
261
+ output_file.release()
262
+ else:
263
+ cv2.destroyAllWindows()
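As a hedged sketch of how these pieces fit together (the config and checkpoint paths below are hypothetical placeholders, not files from this commit), the same pipeline can also be driven programmatically for a single image:

```python
# Sketch: run the APE demo on one image without the CLI loop above.
# "path/to/..." values are hypothetical placeholders; flag names come from
# get_parser(), and the return signature from VisualizationDemo.run_on_image.
from demo_lazy import get_parser, setup_cfg
from predictor_lazy import VisualizationDemo
from detectron2.data.detection_utils import read_image

args = get_parser().parse_args(
    [
        "--config-file", "path/to/ape_deta_vitl_config.py",          # hypothetical
        "--confidence-threshold", "0.3",
        "--with-box", "--with-mask",
        "--opts", "train.init_checkpoint=path/to/model_final.pth",   # hypothetical
    ]
)
cfg = setup_cfg(args)
demo = VisualizationDemo(cfg, args=args)

img = read_image("examples/Pisa.jpg", format="BGR")
predictions, vis_output, _, metadata = demo.run_on_image(
    img, text_prompt="Pisa", with_box=True, with_mask=True, with_sseg=False
)
vis_output.save("Pisa_pred.png")  # detectron2 VisImage
```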
examples/013_438973263.jpg ADDED

Git LFS Details

  • SHA256: 61515686efdd612171d93d06242bc0da844a6d192feb2c4092cc3c8f79942e22
  • Pointer size: 130 Bytes
  • Size of remote file: 60.5 kB
examples/094_56726435.jpg ADDED

Git LFS Details

  • SHA256: 94fc8fafb23d53809673c639cda271df63234924797dbde4dda0a85ce85f7543
  • Pointer size: 130 Bytes
  • Size of remote file: 60.1 kB
examples/199_3946193540.jpg ADDED

Git LFS Details

  • SHA256: a22d8ed0d1a3bc50ba7c6724a35bfe0c52d7916ff6a43a52017b4ac8aaea93f0
  • Pointer size: 130 Bytes
  • Size of remote file: 32.5 kB
examples/MatrixRevolutionForZion.jpg ADDED

Git LFS Details

  • SHA256: 66ae46c66721ca81bbc667eff5163e558aea496108cdbed018ab78a0b38251d0
  • Pointer size: 131 Bytes
  • Size of remote file: 110 kB
examples/Pisa.jpg ADDED

Git LFS Details

  • SHA256: 90b77aaaa12c0657ada23e5cff3c94b689e017945dab5fe25872645bf8cbcf28
  • Pointer size: 130 Bytes
  • Size of remote file: 42.5 kB
examples/SolvayConference1927.jpg ADDED

Git LFS Details

  • SHA256: 043516fa47c14d20817ae26cfb1b0b7d82aa487ef7b6afdd573cd01a286a4618
  • Pointer size: 131 Bytes
  • Size of remote file: 238 kB
examples/Terminator3.jpg ADDED

Git LFS Details

  • SHA256: f1a236355c4f0377d27292eb91879d21cc4e1b411878cd43e3b0393734677341
  • Pointer size: 132 Bytes
  • Size of remote file: 2.94 MB
examples/TheGreatWall.jpg ADDED

Git LFS Details

  • SHA256: 0a5956e303ef846f40fb3cc4f8984d1f25e924b1a9e7a2daeca31b4797ff0a66
  • Pointer size: 130 Bytes
  • Size of remote file: 13.3 kB
examples/Totoro01.png ADDED

Git LFS Details

  • SHA256: bdb52d3bcea59e5232c1329e7a861a59ca77b690bbe85dcf6fb0ad63fa84a624
  • Pointer size: 131 Bytes
  • Size of remote file: 273 kB
examples/Transformers.webp ADDED

Git LFS Details

  • SHA256: b5fdfe662c60c0decdf8c96bdf20fb4ef002656de4faa5aa883ff08787ccff22
  • Pointer size: 131 Bytes
  • Size of remote file: 148 kB
pre-requirements.txt ADDED
@@ -0,0 +1,4 @@
+ --index-url https://download.pytorch.org/whl/cu118
+ torch==2.0.1
+ torchvision==0.15.2
+ torchaudio==2.0.2
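A quick hedged sanity check (not part of this commit) that the pinned CUDA 11.8 wheels are the ones actually installed before launching app.py:

```python
# Sketch: verify the pinned PyTorch stack from pre-requirements.txt.
import torch
import torchvision

assert torch.__version__.startswith("2.0.1"), torch.__version__
assert torchvision.__version__.startswith("0.15.2"), torchvision.__version__
print("CUDA available:", torch.cuda.is_available())
```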
predictor_lazy.py ADDED
@@ -0,0 +1,429 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import atexit
3
+ import bisect
4
+ import gc
5
+ import json
6
+ import multiprocessing as mp
7
+ import time
8
+ from collections import deque
9
+
10
+ import cv2
11
+ import numpy as np
12
+ import torch
13
+
14
+ from ape.engine.defaults import DefaultPredictor
15
+ from detectron2.data import MetadataCatalog
16
+ from detectron2.utils.video_visualizer import VideoVisualizer
17
+ from detectron2.utils.visualizer import ColorMode, Visualizer
18
+
19
+
20
+ def filter_instances(instances, metadata):
21
+ # return instances
22
+
23
+ keep = []
24
+ keep_classes = []
25
+
26
+ sorted_idxs = np.argsort(-instances.scores)
27
+ instances = instances[sorted_idxs]
28
+
29
+ for i in range(len(instances)):
30
+ instance = instances[i]
31
+ pred_class = instance.pred_classes
32
+ if pred_class >= len(metadata.thing_classes):
33
+ continue
34
+
35
+ keep.append(i)
36
+ keep_classes.append(pred_class)
37
+ return instances[keep]
38
+
39
+
40
+ def cuda_grabcut(img, masks, iter=5, gamma=50, iou_threshold=0.75):
41
+ gc.collect()
42
+ torch.cuda.empty_cache()
43
+
44
+ try:
45
+ import grabcut
46
+ except Exception as e:
47
+ print("*" * 60)
48
+ print("fail to import grabCut: ", e)
49
+ print("*" * 60)
50
+ return masks
51
+ GC = grabcut.GrabCut(iter)
52
+
53
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2BGRA)
54
+
55
+ tic_0 = time.time()
56
+ for i in range(len(masks)):
57
+ mask = masks[i]
58
+ if mask.sum() > 10 * 10:
59
+ pass
60
+ else:
61
+ continue
62
+
63
+ # ----------------------------------------------------------------
64
+ fourmap = np.empty_like(mask, dtype=np.uint8)
65
+ fourmap[:, :] = 64
66
+ fourmap[mask == 0] = 64
67
+ fourmap[mask == 1] = 128
68
+
69
+ # Compute segmentation
70
+ tic = time.time()
71
+ seg = GC.estimateSegmentationFromFourmap(img, fourmap, gamma)
72
+ toc = time.time()
73
+ print("Time elapsed in GrabCut segmentation: " + str(toc - tic))
74
+ # ----------------------------------------------------------------
75
+
76
+ seg = torch.tensor(seg, dtype=torch.bool)
77
+ iou = (mask & seg).sum() / (mask | seg).sum()
78
+ if iou > iou_threshold:
79
+ masks[i] = seg
80
+
81
+ if toc - tic_0 > 10:
82
+ break
83
+
84
+ return masks
85
+
86
+
87
+ def opencv_grabcut(img, masks, iter=5):
88
+
89
+ for i in range(len(masks)):
90
+ mask = masks[i]
91
+
92
+ # ----------------------------------------------------------------
93
+ fourmap = np.empty_like(mask, dtype=np.uint8)
94
+ fourmap[:, :] = cv2.GC_PR_BGD
95
+ # fourmap[mask == 0] = cv2.GC_BGD
96
+ fourmap[mask == 0] = cv2.GC_PR_BGD
97
+ fourmap[mask == 1] = cv2.GC_PR_FGD
98
+ # fourmap[mask == 1] = cv2.GC_FGD
99
+
100
+ # Create GrabCut algo
101
+ bgd_model = np.zeros((1, 65), np.float64)
102
+ fgd_model = np.zeros((1, 65), np.float64)
103
+ seg = np.zeros_like(fourmap, dtype=np.uint8)
104
+
105
+ # Compute segmentation
106
+ tic = time.time()
107
+ seg, bgd_model, fgd_model = cv2.grabCut(
108
+ img, fourmap, None, bgd_model, fgd_model, iter, cv2.GC_INIT_WITH_MASK
109
+ )
110
+ toc = time.time()
111
+ print("Time elapsed in GrabCut segmentation: " + str(toc - tic))
112
+
113
+ seg = np.where((seg == 2) | (seg == 0), 0, 1).astype("bool")
114
+
115
+ # ----------------------------------------------------------------
116
+
117
+ seg = torch.tensor(seg, dtype=torch.bool)
118
+ iou = (mask & seg).sum() / (mask | seg).sum()
119
+ if iou > 0.75:
120
+ masks[i] = seg
121
+
122
+ if i > 10:
123
+ break
124
+
125
+ return masks
126
+
127
+
128
+ class VisualizationDemo(object):
129
+ def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False, args=None):
130
+ """
131
+ Args:
132
+ cfg (CfgNode):
133
+ instance_mode (ColorMode):
134
+ parallel (bool): whether to run the model in different processes from visualization.
135
+ Useful since the visualization logic can be slow.
136
+ """
137
+ self.metadata = MetadataCatalog.get(
138
+ "__unused_" + "_".join([d for d in cfg.dataloader.train.dataset.names])
139
+ )
140
+ self.metadata.thing_classes = [
141
+ c
142
+ for d in cfg.dataloader.train.dataset.names
143
+ for c in MetadataCatalog.get(d).get("thing_classes", default=[])
144
+ + MetadataCatalog.get(d).get("stuff_classes", default=["thing"])[1:]
145
+ ]
146
+ self.metadata.stuff_classes = [
147
+ c
148
+ for d in cfg.dataloader.train.dataset.names
149
+ for c in MetadataCatalog.get(d).get("thing_classes", default=[])
150
+ + MetadataCatalog.get(d).get("stuff_classes", default=["thing"])[1:]
151
+ ]
152
+
153
+ # self.metadata = MetadataCatalog.get(
154
+ # "__unused_ape_" + "_".join([d for d in cfg.dataloader.train.dataset.names])
155
+ # )
156
+ # self.metadata.thing_classes = [
157
+ # c
158
+ # for d in ["coco_2017_train_panoptic_separated"]
159
+ # for c in MetadataCatalog.get(d).get("thing_classes", default=[])
160
+ # + MetadataCatalog.get(d).get("stuff_classes", default=["thing"])[1:]
161
+ # ]
162
+ # self.metadata.stuff_classes = [
163
+ # c
164
+ # for d in ["coco_2017_train_panoptic_separated"]
165
+ # for c in MetadataCatalog.get(d).get("thing_classes", default=[])
166
+ # + MetadataCatalog.get(d).get("stuff_classes", default=["thing"])[1:]
167
+ # ]
168
+
169
+ self.cpu_device = torch.device("cpu")
170
+ self.instance_mode = instance_mode
171
+
172
+ self.parallel = parallel
173
+ if parallel:
174
+ num_gpu = torch.cuda.device_count()
175
+ self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu)
176
+ else:
177
+ self.predictor = DefaultPredictor(cfg)
178
+
179
+ print(args)
180
+
181
+ def run_on_image(
182
+ self,
183
+ image,
184
+ text_prompt=None,
185
+ mask_prompt=None,
186
+ with_box=True,
187
+ with_mask=True,
188
+ with_sseg=True,
189
+ ):
190
+ """
191
+ Args:
192
+ image (np.ndarray): an image of shape (H, W, C) (in BGR order).
193
+ This is the format used by OpenCV.
194
+
195
+ Returns:
196
+ predictions (dict): the output of the model.
197
+ vis_output (VisImage): the visualized image output.
198
+ """
199
+ if text_prompt:
200
+ text_list = [x.strip() for x in text_prompt.split(",")]
201
+ text_list = [x for x in text_list if len(x) > 0]
202
+ metadata = MetadataCatalog.get("__unused_ape_" + text_prompt)
203
+ metadata.thing_classes = text_list
204
+ metadata.stuff_classes = text_list
205
+ else:
206
+ metadata = self.metadata
207
+
208
+ vis_output = None
209
+ predictions = self.predictor(image, text_prompt, mask_prompt)
210
+
211
+ if "instances" in predictions:
212
+ predictions["instances"] = filter_instances(
213
+ predictions["instances"].to(self.cpu_device), metadata
214
+ )
215
+
216
+ # Convert image from OpenCV BGR format to Matplotlib RGB format.
217
+ image = image[:, :, ::-1]
218
+ visualizer = Visualizer(image, metadata, instance_mode=self.instance_mode)
219
+ vis_outputs = []
220
+ if "panoptic_seg" in predictions and with_mask and with_sseg:
221
+ panoptic_seg, segments_info = predictions["panoptic_seg"]
222
+ vis_output = visualizer.draw_panoptic_seg_predictions(
223
+ panoptic_seg.to(self.cpu_device), segments_info
224
+ )
225
+ else:
226
+ if "sem_seg" in predictions and with_sseg:
227
+ # vis_output = visualizer.draw_sem_seg(
228
+ # predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
229
+ # )
230
+
231
+ sem_seg = predictions["sem_seg"].to(self.cpu_device)
232
+ # sem_seg = opencv_grabcut(image, sem_seg, iter=10)
233
+ # sem_seg = cuda_grabcut(image, sem_seg > 0.5, iter=5, gamma=10, iou_threshold=0.1)
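+ # appending a constant 0.1 channel acts as a score threshold: pixels where no class scores above ~0.1 fall into this extra background slot after argmax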
234
+ sem_seg = torch.cat((sem_seg, torch.ones_like(sem_seg[0:1, ...]) * 0.1), dim=0)
235
+ sem_seg = sem_seg.argmax(dim=0)
236
+ vis_output = visualizer.draw_sem_seg(sem_seg)
237
+ if "instances" in predictions and (with_box or with_mask):
238
+ instances = predictions["instances"].to(self.cpu_device)
239
+
240
+ if not with_box:
241
+ instances.remove("pred_boxes")
242
+ if not with_mask:
243
+ instances.remove("pred_masks")
244
+
245
+ if with_mask and False:
246
+ # instances.pred_masks = opencv_grabcut(image, instances.pred_masks, iter=10)
247
+ instances.pred_masks = cuda_grabcut(
248
+ image, instances.pred_masks, iter=5, gamma=10, iou_threshold=0.75
249
+ )
250
+
251
+ vis_output = visualizer.draw_instance_predictions(predictions=instances)
252
+
253
+ # for i in range(len(instances)):
254
+ # visualizer = Visualizer(image, metadata, instance_mode=self.instance_mode)
255
+ # vis_outputs.append(visualizer.draw_instance_predictions(predictions=instances[i]))
256
+
257
+ elif "proposals" in predictions:
258
+ visualizer = Visualizer(image, None, instance_mode=self.instance_mode)
259
+ instances = predictions["proposals"].to(self.cpu_device)
260
+ instances.pred_boxes = instances.proposal_boxes
261
+ instances.scores = instances.objectness_logits
262
+ vis_output = visualizer.draw_instance_predictions(predictions=instances)
263
+
264
+ return predictions, vis_output, vis_outputs, metadata
265
+
266
+ def _frame_from_video(self, video):
267
+ while video.isOpened():
268
+ success, frame = video.read()
269
+ if success:
270
+ yield frame
271
+ else:
272
+ break
273
+
274
+ def run_on_video(self, video):
275
+ """
276
+ Visualizes predictions on frames of the input video.
277
+
278
+ Args:
279
+ video (cv2.VideoCapture): a :class:`VideoCapture` object, whose source can be
280
+ either a webcam or a video file.
281
+
282
+ Yields:
283
+ ndarray: BGR visualizations of each video frame.
284
+ """
285
+ video_visualizer = VideoVisualizer(self.metadata, self.instance_mode)
286
+
287
+ def process_predictions(frame, predictions):
288
+ frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
289
+ if "panoptic_seg" in predictions and False:
290
+ panoptic_seg, segments_info = predictions["panoptic_seg"]
291
+ vis_frame = video_visualizer.draw_panoptic_seg_predictions(
292
+ frame, panoptic_seg.to(self.cpu_device), segments_info
293
+ )
294
+ elif "instances" in predictions and False:
295
+ predictions = predictions["instances"].to(self.cpu_device)
296
+ vis_frame = video_visualizer.draw_instance_predictions(frame, predictions)
297
+ elif "sem_seg" in predictions and False:
298
+ vis_frame = video_visualizer.draw_sem_seg(
299
+ frame, predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
300
+ )
301
+
302
+ if "sem_seg" in predictions:
303
+ vis_frame = video_visualizer.draw_sem_seg(
304
+ frame, predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
305
+ )
306
+ frame = vis_frame.get_image()
307
+
308
+ if "instances" in predictions:
309
+ predictions = predictions["instances"].to(self.cpu_device)
310
+ predictions = filter_instances(predictions, self.metadata)
311
+ vis_frame = video_visualizer.draw_instance_predictions(frame, predictions)
312
+
313
+ # Converts Matplotlib RGB format to OpenCV BGR format
314
+ vis_frame = cv2.cvtColor(vis_frame.get_image(), cv2.COLOR_RGB2BGR)
315
+ return vis_frame, predictions
316
+
317
+ frame_gen = self._frame_from_video(video)
318
+ if self.parallel:
319
+ buffer_size = self.predictor.default_buffer_size
320
+
321
+ frame_data = deque()
322
+
323
+ for cnt, frame in enumerate(frame_gen):
324
+ frame_data.append(frame)
325
+ self.predictor.put(frame)
326
+
327
+ if cnt >= buffer_size:
328
+ frame = frame_data.popleft()
329
+ predictions = self.predictor.get()
330
+ yield process_predictions(frame, predictions)
331
+
332
+ while len(frame_data):
333
+ frame = frame_data.popleft()
334
+ predictions = self.predictor.get()
335
+ yield process_predictions(frame, predictions)
336
+ else:
337
+ for frame in frame_gen:
338
+ yield process_predictions(frame, self.predictor(frame))
339
+
340
+
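As a side note, the generator returned by run_on_video yields (vis_frame, predictions) tuples with frames already in BGR. A minimal consumption sketch (illustrative, not part of the diff; `demo` is assumed to be an already-constructed VisualizationDemo, and the file names are placeholders):

import cv2

video = cv2.VideoCapture("input.mp4")
writer = None
for vis_frame, _predictions in demo.run_on_video(video):
    if writer is None:
        h, w = vis_frame.shape[:2]
        fps = video.get(cv2.CAP_PROP_FPS) or 30.0  # fall back if FPS is unknown
        writer = cv2.VideoWriter("output.mp4", cv2.VideoWriter_fourcc(*"mp4v"), fps, (w, h))
    writer.write(vis_frame)  # frames are already BGR
video.release()
if writer is not None:
    writer.release()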
341
+ class AsyncPredictor:
342
+ """
343
+ A predictor that runs the model asynchronously, possibly on more than one GPU.
345
+ Because rendering the visualization takes a considerable amount of time,
346
+ this helps improve throughput a little bit when rendering videos.
346
+ """
347
+
348
+ class _StopToken:
349
+ pass
350
+
351
+ class _PredictWorker(mp.Process):
352
+ def __init__(self, cfg, task_queue, result_queue):
353
+ self.cfg = cfg
354
+ self.task_queue = task_queue
355
+ self.result_queue = result_queue
356
+ super().__init__()
357
+
358
+ def run(self):
359
+ predictor = DefaultPredictor(self.cfg)
360
+
361
+ while True:
362
+ task = self.task_queue.get()
363
+ if isinstance(task, AsyncPredictor._StopToken):
364
+ break
365
+ idx, data = task
366
+ result = predictor(data)
367
+ self.result_queue.put((idx, result))
368
+
369
+ def __init__(self, cfg, num_gpus: int = 1):
370
+ """
371
+ Args:
372
+ cfg (CfgNode):
373
+ num_gpus (int): if 0, will run on CPU
374
+ """
375
+ num_workers = max(num_gpus, 1)
376
+ self.task_queue = mp.Queue(maxsize=num_workers * 3)
377
+ self.result_queue = mp.Queue(maxsize=num_workers * 3)
378
+ self.procs = []
379
+ for gpuid in range(max(num_gpus, 1)):
380
+ cfg = cfg.clone()
381
+ cfg.defrost()
382
+ cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu"
383
+ self.procs.append(
384
+ AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue)
385
+ )
386
+
387
+ self.put_idx = 0
388
+ self.get_idx = 0
389
+ self.result_rank = []
390
+ self.result_data = []
391
+
392
+ for p in self.procs:
393
+ p.start()
394
+ atexit.register(self.shutdown)
395
+
396
+ def put(self, image):
397
+ self.put_idx += 1
398
+ self.task_queue.put((self.put_idx, image))
399
+
400
+ def get(self):
401
+ self.get_idx += 1 # the index needed for this request
402
+ if len(self.result_rank) and self.result_rank[0] == self.get_idx:
403
+ res = self.result_data[0]
404
+ del self.result_data[0], self.result_rank[0]
405
+ return res
406
+
407
+ while True:
408
+ # make sure the results are returned in the correct order
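+ # out-of-order results are buffered in (result_rank, result_data) until their index is requested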
409
+ idx, res = self.result_queue.get()
410
+ if idx == self.get_idx:
411
+ return res
412
+ insert = bisect.bisect(self.result_rank, idx)
413
+ self.result_rank.insert(insert, idx)
414
+ self.result_data.insert(insert, res)
415
+
416
+ def __len__(self):
417
+ return self.put_idx - self.get_idx
418
+
419
+ def __call__(self, image):
420
+ self.put(image)
421
+ return self.get()
422
+
423
+ def shutdown(self):
424
+ for _ in self.procs:
425
+ self.task_queue.put(AsyncPredictor._StopToken())
426
+
427
+ @property
428
+ def default_buffer_size(self):
429
+ return len(self.procs) * 5
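As a side note, here is a minimal end-to-end sketch of the classes above (illustrative, not part of the diff). It assumes `cfg` is an already-built config whose dataloader.train.dataset.names is populated and whose weights the predictor can load; the image path and text prompt are placeholders.

import cv2

demo = VisualizationDemo(cfg, parallel=False)  # single-process predictor
img = cv2.imread("demo.jpg")                   # BGR, as run_on_image expects
predictions, vis_output, vis_outputs, metadata = demo.run_on_image(
    img,
    text_prompt="person,car,traffic light",
    with_box=True,
    with_mask=True,
    with_sseg=False,
)
if vis_output is not None:
    vis_output.save("demo_vis.jpg")            # detectron2's VisImage.save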
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ transformers
2
+ cython
3
+ opencv-python
4
+ scipy
5
+ einops
6
+ lvis
7
+ fairscale
8
+ git+https://github.com/facebookresearch/detectron2@017abbf
9
+ git+https://github.com/IDEA-Research/detrex@776058e
10
+ git+https://github.com/openai/CLIP.git@d50d76d
11
+ git+https://github.com/shenyunhang/ape