import gc
import multiprocessing as mp
import os
import shutil
import sys
import time
from os import path

import cv2
import torch
from huggingface_hub import hf_hub_download
from PIL import Image

import ape
import detectron2.data.transforms as T
import gradio as gr
from ape.model_zoo import get_config_file
from demo_lazy import get_parser, setup_cfg
from detectron2.config import CfgNode
from detectron2.data.detection_utils import read_image
from detectron2.evaluation.coco_evaluation import instances_to_coco_json
from detectron2.utils.logger import setup_logger
from predictor_lazy import VisualizationDemo

this_dir = path.dirname(path.abspath(__file__))

# os.system("git clone https://github.com/shenyunhang/APE.git")
# os.system("python3.10 -m pip install -e APE/")
example_list = [
    [
        this_dir + "/examples/Totoro01.png",
        # "Sky, Water, Tree, The biggest Chinchilla, The older girl wearing skirt on branch, Grass",
        "Girl with hat",
        # 0.05,
        0.25,
        ["object detection", "instance segmentation"],
    ],
    [
        this_dir + "/examples/Totoro01.png",
        "Sky, Water, Tree, Chinchilla, Grass, Girl",
        0.15,
        ["semantic segmentation"],
    ],
    [
        this_dir + "/examples/199_3946193540.jpg",
        "chess piece of horse head",
        0.30,
        ["object detection", "instance segmentation"],
    ],
    [
        this_dir + "/examples/TheGreatWall.jpg",
        "The Great Wall",
        0.1,
        ["semantic segmentation"],
    ],
    [
        this_dir + "/examples/Pisa.jpg",
        "Pisa",
        0.01,
        ["object detection", "instance segmentation"],
    ],
    [
        this_dir + "/examples/SolvayConference1927.jpg",
        # "Albert Einstein, Madame Curie",
        "Madame Curie",
        # 0.01,
        0.03,
        ["object detection", "instance segmentation"],
    ],
    [
        this_dir + "/examples/Transformers.webp",
        "Optimus Prime",
        0.11,
        ["object detection", "instance segmentation"],
    ],
    [
        this_dir + "/examples/Terminator3.jpg",
        "Humanoid Robot",
        0.10,
        ["object detection", "instance segmentation"],
    ],
    [
        this_dir + "/examples/MatrixRevolutionForZion.jpg",
        """machine killer with gun in fighting,
donut with colored granules on the surface,
railings being crossed by horses,
a horse running or jumping,
equestrian rider's helmet,
outdoor dog led by rope,
a dog being touched,
clothed dog,
basketball in hand,
a basketball player with both feet off the ground,
player with basketball in the hand,
spoon on the plate,
coffee cup with coffee,
the nearest dessert to the coffee cup,
the bartender who is mixing wine,
a bartender in a suit,
wine glass with wine,
a person in aprons,
pot with food,
a knife being used to cut vegetables,
striped sofa in the room,
a sofa with pillows on it in the room,
lights on in the room,
an indoor lying pet,
a cat on the sofa,
one pet looking directly at the camera indoors,
a bed with patterns in the room,
the lamp on the table beside the bed,
pillow placed at the head of the bed,
a blackboard full of words in the classroom,
child sitting at desks in the classroom,
a person standing in front of bookshelves in the library,
the table someone is using in the library,
a person who touches books in the library,
a person standing in front of the cake counter,
a square plate full of cakes,
a cake decorated with cream,
hot dog with vegetables,
hot dog with sauce on the surface,
red sausage,
flowerpot with flowers potted inside,
monochrome flowerpot,
a flowerpot filled with black soil,
apple growing on trees,
red complete apple,
apple with a stalk,
a woman brushing her teeth,
toothbrush held by someone,
toilet brush with colored bristles,
a customer whose hair is being cut by barber,
a barber at work,
cloth covering the barber,
shopping cart pushed by people in the supermarket,
shopping cart with people in the supermarket,
shopping cart full of goods,
a child wearing a mask,
refrigerator with fruit,
a drink bottle in the refrigerator,
refrigerator with more than two doors,
a watch placed on a table or cloth,
a watch with three or more watch hands can be seen,
a watch with one or more small dials,
clothes hanger,
a piece of clothing hanging on the hanger,
a piece of clothing worn on plastic models,
leather bag with glossy surface,
backpack,
open package,
a fish held by people,
a person who is fishing with a fishing rod,
a fisherman standing on the shore with his body soaked in water,
camera hold on someone's shoulder,
a person being interviewed,
a person with microphone hold in hand,
""",
        0.20,
        ["object detection", "instance segmentation"],
    ],
    [
        this_dir + "/examples/094_56726435.jpg",
        # "donut with colored granules on the surface",
        """donut with colored granules on the surface,
railings being crossed by horses,
a horse running or jumping,
equestrian rider's helmet,
outdoor dog led by rope,
a dog being touched,
clothed dog,
basketball in hand,
a basketball player with both feet off the ground,
player with basketball in the hand,
spoon on the plate,
coffee cup with coffee,
the nearest dessert to the coffee cup,
the bartender who is mixing wine,
a bartender in a suit,
wine glass with wine,
a person in aprons,
pot with food,
a knife being used to cut vegetables,
striped sofa in the room,
a sofa with pillows on it in the room,
lights on in the room,
an indoor lying pet,
a cat on the sofa,
one pet looking directly at the camera indoors,
a bed with patterns in the room,
the lamp on the table beside the bed,
pillow placed at the head of the bed,
a blackboard full of words in the classroom,
a blackboard or whiteboard with something pasted,
child sitting at desks in the classroom,
a person standing in front of bookshelves in the library,
the table someone is using in the library,
a person who touches books in the library,
a person standing in front of the cake counter,
a square plate full of cakes,
a cake decorated with cream,
hot dog with vegetables,
hot dog with sauce on the surface,
red sausage,
flowerpot with flowers potted inside,
monochrome flowerpot,
a flowerpot filled with black soil,
apple growing on trees,
red complete apple,
apple with a stalk,
a woman brushing her teeth,
toothbrush held by someone,
toilet brush with colored bristles,
a customer whose hair is being cut by barber,
a barber at work,
cloth covering the barber,
a plastic toy,
a plush toy,
a humanoid toy,
shopping cart pushed by people in the supermarket,
shopping cart with people in the supermarket,
shopping cart full of goods,
a child wearing a mask,
a mask on face with half a face exposed,
a mask on face with only eyes exposed,
refrigerator with fruit,
a drink bottle in the refrigerator,
refrigerator with more than two doors,
a watch placed on a table or cloth,
a watch with three or more watch hands can be seen,
a watch with one or more small dials,
clothes hanger,
a piece of clothing hanging on the hanger,
a piece of clothing worn on plastic models,
leather bag with glossy surface,
backpack,
open package,
a fish held by people,
a person who is fishing with a fishing rod,
a fisherman standing on the shore with his body soaked in water,
camera hold on someone's shoulder,
a person being interviewed,
a person with microphone hold in hand,
""",
        0.50,
        ["object detection", "instance segmentation"],
    ],
    [
        this_dir + "/examples/013_438973263.jpg",
        # "a male lion with a mane",
        """a male lion with a mane,
railings being crossed by horses,
a horse running or jumping,
equestrian rider's helmet,
outdoor dog led by rope,
a dog being touched,
clothed dog,
basketball in hand,
a basketball player with both feet off the ground,
player with basketball in the hand,
spoon on the plate,
coffee cup with coffee,
the nearest dessert to the coffee cup,
the bartender who is mixing wine,
a bartender in a suit,
wine glass with wine,
a person in aprons,
pot with food,
a knife being used to cut vegetables,
striped sofa in the room,
a sofa with pillows on it in the room,
lights on in the room,
an indoor lying pet,
a cat on the sofa,
one pet looking directly at the camera indoors,
a bed with patterns in the room,
the lamp on the table beside the bed,
pillow placed at the head of the bed,
a blackboard full of words in the classroom,
a blackboard or whiteboard with something pasted,
child sitting at desks in the classroom,
a person standing in front of bookshelves in the library,
the table someone is using in the library,
a person who touches books in the library,
a person standing in front of the cake counter,
a square plate full of cakes,
a cake decorated with cream,
hot dog with vegetables,
hot dog with sauce on the surface,
red sausage,
flowerpot with flowers potted inside,
monochrome flowerpot,
a flowerpot filled with black soil,
apple growing on trees,
red complete apple,
apple with a stalk,
a woman brushing her teeth,
toothbrush held by someone,
toilet brush with colored bristles,
a customer whose hair is being cut by barber,
a barber at work,
cloth covering the barber,
a plastic toy,
a plush toy,
a humanoid toy,
shopping cart pushed by people in the supermarket,
shopping cart with people in the supermarket,
shopping cart full of goods,
a child wearing a mask,
a mask on face with half a face exposed,
a mask on face with only eyes exposed,
refrigerator with fruit,
a drink bottle in the refrigerator,
refrigerator with more than two doors,
a watch placed on a table or cloth,
a watch with three or more watch hands can be seen,
a watch with one or more small dials,
clothes hanger,
a piece of clothing hanging on the hanger,
a piece of clothing worn on plastic models,
leather bag with glossy surface,
backpack,
open package,
a fish held by people,
a person who is fishing with a fishing rod,
a fisherman standing on the shore with his body soaked in water,
camera hold on someone's shoulder,
a person being interviewed,
a person with microphone hold in hand,
""",
        # 0.25,
        0.50,
        ["object detection", "instance segmentation"],
    ],
]

ckpt_repo_id = "shenyunhang/APE"
def setup_model(name):
    # Move the requested model onto the running device and park all other
    # loaded models on the CPU. Only needed when save_memory is enabled
    # (several models sharing one GPU).
    gc.collect()
    torch.cuda.empty_cache()
    if not save_memory:
        return
    for key, demo in all_demo.items():
        if key == name:
            demo.predictor.model.to(running_device)
        else:
            demo.predictor.model.to("cpu")
    gc.collect()
    torch.cuda.empty_cache()
def run_on_image_A(input_image_path, input_text, score_threshold, output_type):
    logger.info("run_on_image_A")
    setup_model("APE_A")
    demo = all_demo["APE_A"]
    cfg = all_cfg["APE_A"]
    demo.predictor.model.model_vision.test_score_thresh = score_threshold
    return run_on_image(
        input_image_path,
        input_text,
        output_type,
        demo,
        cfg,
    )


def run_on_image_C(input_image_path, input_text, score_threshold, output_type):
    logger.info("run_on_image_C")
    setup_model("APE_C")
    demo = all_demo["APE_C"]
    cfg = all_cfg["APE_C"]
    demo.predictor.model.model_vision.test_score_thresh = score_threshold
    return run_on_image(
        input_image_path,
        input_text,
        output_type,
        demo,
        cfg,
    )


def run_on_image_D(input_image_path, input_text, score_threshold, output_type):
    logger.info("run_on_image_D")
    setup_model("APE_D")
    demo = all_demo["APE_D"]
    cfg = all_cfg["APE_D"]
    demo.predictor.model.model_vision.test_score_thresh = score_threshold
    return run_on_image(
        input_image_path,
        input_text,
        output_type,
        demo,
        cfg,
    )
def run_on_image_comparison(input_image_path, input_text, score_threshold, output_type):
    logger.info("run_on_image_comparison")
    r = []
    for key in all_demo.keys():
        logger.info("run_on_image_comparison {}".format(key))
        setup_model(key)
        demo = all_demo[key]
        cfg = all_cfg[key]
        demo.predictor.model.model_vision.test_score_thresh = score_threshold
        img, _ = run_on_image(
            input_image_path,
            input_text,
            output_type,
            demo,
            cfg,
        )
        r.append(img)
    return r
def run_on_image(
    input_image_path,
    input_text,
    output_type,
    demo,
    cfg,
):
    with_box = False
    with_mask = False
    with_sseg = False
    if "object detection" in output_type:
        with_box = True
    if "instance segmentation" in output_type:
        with_mask = True
    if "semantic segmentation" in output_type:
        with_sseg = True

    # A dict input comes from a sketch-enabled gr.Image and carries a mask.
    if isinstance(input_image_path, dict):
        input_mask_path = input_image_path["mask"]
        input_image_path = input_image_path["image"]
        print("input_image_path", input_image_path)
        print("input_mask_path", input_mask_path)
    else:
        input_mask_path = None

    print("input_text", input_text)

    if isinstance(cfg, CfgNode):
        input_format = cfg.INPUT.FORMAT
    elif "model_vision" in cfg.model:
        input_format = cfg.model.model_vision.input_format
    else:
        input_format = cfg.model.input_format

    input_image = read_image(input_image_path, format="BGR")
    # img = cv2.imread(input_image_path)
    # cv2.imwrite("tmp.jpg", img)
    # # input_image = read_image("tmp.jpg", format=input_format)
    # input_image = read_image("tmp.jpg", format="BGR")

    if input_mask_path is not None:
        input_mask = read_image(input_mask_path, "L").squeeze(2)
        print("input_mask", input_mask)
        print("input_mask", input_mask.shape)
    else:
        input_mask = None

    if not with_box and not with_mask and not with_sseg:
        # No output type selected: return the input unchanged plus empty
        # results, so callers can still unpack (image, json).
        return Image.fromarray(input_image[:, :, ::-1]), []

    # Downscale large inputs so the longest side is at most 1024 px.
    if input_image.shape[0] > 1024 or input_image.shape[1] > 1024:
        transform = aug.get_transform(input_image)
        input_image = transform.apply_image(input_image)
    else:
        transform = None

    start_time = time.time()
    predictions, visualized_output, _, metadata = demo.run_on_image(
        input_image,
        text_prompt=input_text,
        mask_prompt=input_mask,
        with_box=with_box,
        with_mask=with_mask,
        with_sseg=with_sseg,
    )
    logger.info(
        "{} in {:.2f}s".format(
            "detected {} instances".format(len(predictions["instances"]))
            if "instances" in predictions
            else "finished",
            time.time() - start_time,
        )
    )

    output_image = visualized_output.get_image()
    print("output_image", output_image.shape)
    # if input_format == "RGB":
    #     output_image = output_image[:, :, ::-1]
    if transform:
        # Undo the resize so the visualization matches the original resolution.
        output_image = transform.inverse().apply_image(output_image)
    print("output_image", output_image.shape)
    output_image = Image.fromarray(output_image)

    gc.collect()
    torch.cuda.empty_cache()

    json_results = instances_to_coco_json(predictions["instances"].to(demo.cpu_device), 0)
    for json_result in json_results:
        json_result["category_name"] = metadata.thing_classes[json_result["category_id"]]
        del json_result["image_id"]
    return output_image, json_results
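# A hedged usage sketch (not part of the app flow): assuming the globals set
# up in __main__ (logger, aug, all_demo) exist and load_APE_D() has been
# called, run_on_image_D can be invoked directly with one of the examples:
#
#   img, dets = run_on_image_D(
#       this_dir + "/examples/Totoro01.png",
#       "Girl with hat",
#       0.25,
#       ["object detection", "instance segmentation"],
#   )
#   # img is a PIL.Image; dets is a list of COCO-style dicts, each with an
#   # added "category_name" field and the "image_id" field removed.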
def load_APE_A():
    # init_checkpoint = "output2/APE/configs/LVISCOCOCOCOSTUFF_O365_OID_VG/ape_deta/ape_deta_vitl_eva02_lsj_cp_720k_20230504_002019/model_final.pth"
    init_checkpoint = "configs/LVISCOCOCOCOSTUFF_O365_OID_VG/ape_deta/ape_deta_vitl_eva02_lsj_cp_720k_20230504_002019/model_final.pth"
    init_checkpoint = hf_hub_download(repo_id=ckpt_repo_id, filename=init_checkpoint)

    args = get_parser().parse_args()
    args.config_file = get_config_file(
        "LVISCOCOCOCOSTUFF_O365_OID_VG/ape_deta/ape_deta_vitl_eva02_lsj1024_cp_720k.py"
    )
    args.confidence_threshold = 0.01
    args.opts = [
        "train.init_checkpoint='{}'".format(init_checkpoint),
        "model.model_language.cache_dir=''",
        "model.model_vision.select_box_nums_for_evaluation=500",
        "model.model_vision.backbone.net.xattn=False",
        "model.model_vision.transformer.encoder.pytorch_attn=True",
        "model.model_vision.transformer.decoder.pytorch_attn=True",
    ]
    if running_device == "cpu":
        args.opts += [
            "model.model_language.dtype='float32'",
        ]
    logger.info("Arguments: " + str(args))
    cfg = setup_cfg(args)

    cfg.model.model_vision.criterion[0].use_fed_loss = False
    cfg.model.model_vision.criterion[2].use_fed_loss = False
    cfg.train.device = running_device

    ape.modeling.text.eva01_clip.eva_clip._MODEL_CONFIGS[cfg.model.model_language.clip_model][
        "vision_cfg"
    ]["layers"] = 1
    ape.modeling.text.eva01_clip.eva_clip._MODEL_CONFIGS[cfg.model.model_language.clip_model][
        "vision_cfg"
    ]["fusedLN"] = False

    demo = VisualizationDemo(cfg, args=args)
    if save_memory:
        demo.predictor.model.to("cpu")
        # demo.predictor.model.half()
    else:
        demo.predictor.model.to(running_device)
    all_demo["APE_A"] = demo
    all_cfg["APE_A"] = cfg
def load_APE_B():
    # init_checkpoint = "output2/APE/configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_REFCOCO/ape_deta/ape_deta_vitl_eva02_vlf_lsj_cp_1080k_20230702_225418/model_final.pth"
    init_checkpoint = "configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_REFCOCO/ape_deta/ape_deta_vitl_eva02_vlf_lsj_cp_1080k_20230702_225418/model_final.pth"
    init_checkpoint = hf_hub_download(repo_id=ckpt_repo_id, filename=init_checkpoint)

    args = get_parser().parse_args()
    args.config_file = get_config_file(
        "LVISCOCOCOCOSTUFF_O365_OID_VGR_REFCOCO/ape_deta/ape_deta_vitl_eva02_vlf_lsj1024_cp_1080k.py"
    )
    args.confidence_threshold = 0.01
    args.opts = [
        "train.init_checkpoint='{}'".format(init_checkpoint),
        "model.model_language.cache_dir=''",
        "model.model_vision.select_box_nums_for_evaluation=500",
        "model.model_vision.text_feature_bank_reset=True",
        "model.model_vision.backbone.net.xattn=False",
        "model.model_vision.transformer.encoder.pytorch_attn=True",
        "model.model_vision.transformer.decoder.pytorch_attn=True",
    ]
    if running_device == "cpu":
        args.opts += [
            "model.model_language.dtype='float32'",
        ]
    logger.info("Arguments: " + str(args))
    cfg = setup_cfg(args)

    cfg.model.model_vision.criterion[0].use_fed_loss = False
    cfg.model.model_vision.criterion[2].use_fed_loss = False
    cfg.train.device = running_device

    ape.modeling.text.eva01_clip.eva_clip._MODEL_CONFIGS[cfg.model.model_language.clip_model][
        "vision_cfg"
    ]["layers"] = 1
    ape.modeling.text.eva01_clip.eva_clip._MODEL_CONFIGS[cfg.model.model_language.clip_model][
        "vision_cfg"
    ]["fusedLN"] = False

    demo = VisualizationDemo(cfg, args=args)
    if save_memory:
        demo.predictor.model.to("cpu")
        # demo.predictor.model.half()
    else:
        demo.predictor.model.to(running_device)
    all_demo["APE_B"] = demo
    all_cfg["APE_B"] = cfg
def load_APE_C():
    # init_checkpoint = "output2/APE/configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO/ape_deta/ape_deta_vitl_eva02_vlf_lsj_cp_1080k_20230702_210950/model_final.pth"
    init_checkpoint = "configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO/ape_deta/ape_deta_vitl_eva02_vlf_lsj_cp_1080k_20230702_210950/model_final.pth"
    init_checkpoint = hf_hub_download(repo_id=ckpt_repo_id, filename=init_checkpoint)

    args = get_parser().parse_args()
    args.config_file = get_config_file(
        "LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO/ape_deta/ape_deta_vitl_eva02_vlf_lsj1024_cp_1080k.py"
    )
    args.confidence_threshold = 0.01
    args.opts = [
        "train.init_checkpoint='{}'".format(init_checkpoint),
        "model.model_language.cache_dir=''",
        "model.model_vision.select_box_nums_for_evaluation=500",
        "model.model_vision.text_feature_bank_reset=True",
        "model.model_vision.backbone.net.xattn=False",
        "model.model_vision.transformer.encoder.pytorch_attn=True",
        "model.model_vision.transformer.decoder.pytorch_attn=True",
    ]
    if running_device == "cpu":
        args.opts += [
            "model.model_language.dtype='float32'",
        ]
    logger.info("Arguments: " + str(args))
    cfg = setup_cfg(args)

    cfg.model.model_vision.criterion[0].use_fed_loss = False
    cfg.model.model_vision.criterion[2].use_fed_loss = False
    cfg.train.device = running_device

    ape.modeling.text.eva01_clip.eva_clip._MODEL_CONFIGS[cfg.model.model_language.clip_model][
        "vision_cfg"
    ]["layers"] = 1
    ape.modeling.text.eva01_clip.eva_clip._MODEL_CONFIGS[cfg.model.model_language.clip_model][
        "vision_cfg"
    ]["fusedLN"] = False

    demo = VisualizationDemo(cfg, args=args)
    if save_memory:
        demo.predictor.model.to("cpu")
        # demo.predictor.model.half()
    else:
        demo.predictor.model.to(running_device)
    all_demo["APE_C"] = demo
    all_cfg["APE_C"] = cfg
def load_APE_D():
    # init_checkpoint = "output2/APE/configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO_GQA_PhraseCut_Flickr30k/ape_deta/ape_deta_vitl_eva02_clip_vlf_lsj1024_cp_16x4_1080k_mdl_20230829_162438/model_final.pth"
    init_checkpoint = "configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO_GQA_PhraseCut_Flickr30k/ape_deta/ape_deta_vitl_eva02_clip_vlf_lsj1024_cp_16x4_1080k_mdl_20230829_162438/model_final.pth"
    init_checkpoint = hf_hub_download(repo_id=ckpt_repo_id, filename=init_checkpoint)

    args = get_parser().parse_args()
    args.config_file = get_config_file(
        "LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO_GQA_PhraseCut_Flickr30k/ape_deta/ape_deta_vitl_eva02_clip_vlf_lsj1024_cp_16x4_1080k.py"
    )
    args.confidence_threshold = 0.01
    args.opts = [
        "train.init_checkpoint='{}'".format(init_checkpoint),
        "model.model_language.cache_dir=''",
        "model.model_vision.select_box_nums_for_evaluation=500",
        "model.model_vision.text_feature_bank_reset=True",
        "model.model_vision.backbone.net.xattn=False",
        "model.model_vision.transformer.encoder.pytorch_attn=True",
        "model.model_vision.transformer.decoder.pytorch_attn=True",
    ]
    if running_device == "cpu":
        args.opts += [
            "model.model_language.dtype='float32'",
        ]
    logger.info("Arguments: " + str(args))
    cfg = setup_cfg(args)

    cfg.model.model_vision.criterion[0].use_fed_loss = False
    cfg.model.model_vision.criterion[2].use_fed_loss = False
    cfg.train.device = running_device

    ape.modeling.text.eva02_clip.factory._MODEL_CONFIGS[cfg.model.model_language.clip_model][
        "vision_cfg"
    ]["layers"] = 1

    demo = VisualizationDemo(cfg, args=args)
    if save_memory:
        demo.predictor.model.to("cpu")
        # demo.predictor.model.half()
    else:
        demo.predictor.model.to(running_device)
    all_demo["APE_D"] = demo
    all_cfg["APE_D"] = cfg
def APE_A_tab():
    with gr.Tab("APE A"):
        with gr.Row(equal_height=False):
            with gr.Column(scale=1):
                input_image = gr.Image(
                    sources=["upload"],
                    type="filepath",
                    # tool="sketch",
                    # brush_radius=50,
                )
                input_text = gr.Textbox(
                    label="Object Prompt (optional; if not provided, only COCO objects are detected.)",
                    info="Format: word1,word2,word3,...",
                )
                score_threshold = gr.Slider(
                    label="Score Threshold", minimum=0.01, maximum=1.0, value=0.3, step=0.01
                )
                output_type = gr.CheckboxGroup(
                    ["object detection", "instance segmentation"],
                    value=["object detection", "instance segmentation"],
                    label="Output Type",
                    info="Which kind of output is displayed?",
                )
                run_button = gr.Button("Run")
            with gr.Column(scale=2):
                gallery = gr.Image(
                    type="pil",
                )
        example_data = gr.Dataset(
            components=[input_image, input_text, score_threshold],
            samples=example_list,
            samples_per_page=5,
        )
        example_data.click(fn=set_example, inputs=example_data, outputs=example_data.components)
        # add_tail_info()
        output_json = gr.JSON(label="json results")

        run_button.click(
            fn=run_on_image_A,
            inputs=[input_image, input_text, score_threshold, output_type],
            outputs=[gallery, output_json],
        )
def APE_C_tab():
    with gr.Tab("APE C"):
        with gr.Row(equal_height=False):
            with gr.Column(scale=1):
                input_image = gr.Image(
                    sources=["upload"],
                    type="filepath",
                    # tool="sketch",
                    # brush_radius=50,
                )
                input_text = gr.Textbox(
                    label="Object Prompt (optional; if not provided, only COCO objects are detected.)",
                    info="Format: word1,word2,sentence1,sentence2,...",
                )
                score_threshold = gr.Slider(
                    label="Score Threshold", minimum=0.01, maximum=1.0, value=0.3, step=0.01
                )
                output_type = gr.CheckboxGroup(
                    ["object detection", "instance segmentation", "semantic segmentation"],
                    value=["object detection", "instance segmentation"],
                    label="Output Type",
                    info="Which kind of output is displayed?",
                )
                run_button = gr.Button("Run")
            with gr.Column(scale=2):
                gallery = gr.Image(
                    type="pil",
                )
        example_data = gr.Dataset(
            components=[input_image, input_text, score_threshold],
            samples=example_list,
            samples_per_page=5,
        )
        example_data.click(fn=set_example, inputs=example_data, outputs=example_data.components)
        # add_tail_info()
        output_json = gr.JSON(label="json results")

        run_button.click(
            fn=run_on_image_C,
            inputs=[input_image, input_text, score_threshold, output_type],
            outputs=[gallery, output_json],
        )
def APE_D_tab():
    with gr.Tab("APE D"):
        with gr.Row(equal_height=False):
            with gr.Column(scale=1):
                input_image = gr.Image(
                    sources=["upload"],
                    type="filepath",
                    # tool="sketch",
                    # brush_radius=50,
                )
                input_text = gr.Textbox(
                    label="Object Prompt (optional; if not provided, only COCO objects are detected.)",
                    info="Format: word1,word2,sentence1,sentence2,...",
                )
                score_threshold = gr.Slider(
                    label="Score Threshold", minimum=0.01, maximum=1.0, value=0.1, step=0.01
                )
                output_type = gr.CheckboxGroup(
                    ["object detection", "instance segmentation", "semantic segmentation"],
                    value=["object detection", "instance segmentation"],
                    label="Output Type",
                    info="Which kind of output is displayed?",
                )
                run_button = gr.Button("Run")
            with gr.Column(scale=2):
                gallery = gr.Image(
                    type="pil",
                )
        gr.Examples(
            examples=example_list,
            inputs=[input_image, input_text, score_threshold, output_type],
        )
        # add_tail_info()
        output_json = gr.JSON(label="json results")

        run_button.click(
            fn=run_on_image_D,
            inputs=[input_image, input_text, score_threshold, output_type],
            outputs=[gallery, output_json],
        )
def comparison_tab():
    with gr.Tab("APE all"):
        with gr.Row(equal_height=False):
            with gr.Column(scale=1):
                input_image = gr.Image(
                    sources=["upload"],
                    type="filepath",
                    # tool="sketch",
                    # brush_radius=50,
                )
                input_text = gr.Textbox(
                    label="Object Prompt (optional; if not provided, only COCO objects are detected.)",
                    info="Format: word1,word2,sentence1,sentence2,...",
                )
                score_threshold = gr.Slider(
                    label="Score Threshold", minimum=0.01, maximum=1.0, value=0.1, step=0.01
                )
                output_type = gr.CheckboxGroup(
                    ["object detection", "instance segmentation", "semantic segmentation"],
                    value=["object detection", "instance segmentation"],
                    label="Output Type",
                    info="Which kind of output is displayed?",
                )
                run_button = gr.Button("Run")
            # One output image per loaded model, shown side by side.
            gallery_all = []
            with gr.Column(scale=2):
                for key in all_demo.keys():
                    gallery = gr.Image(
                        label=key,
                        type="pil",
                    )
                    gallery_all.append(gallery)
        gr.Examples(
            examples=example_list,
            inputs=[input_image, input_text, score_threshold, output_type],
        )
        # add_tail_info()

        run_button.click(
            fn=run_on_image_comparison,
            inputs=[input_image, input_text, score_threshold, output_type],
            outputs=gallery_all,
        )
def is_port_in_use(port: int) -> bool:
    import socket

    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        return s.connect_ex(("localhost", port)) == 0
def add_head_info(max_available_memory):
    gr.Markdown(
        "# APE: Aligning and Prompting Everything All at Once for Universal Visual Perception"
    )
    if max_available_memory:
        gr.Markdown(
            "Note that multiple models are deployed on a single GPU, so it may take several minutes to run the models and visualize the results."
        )
    else:
        gr.Markdown(
            "Note that multiple models are deployed on CPU, so it may take a while to run the models and visualize the results."
        )
        gr.Markdown(
            "Results computed on CPU differ slightly from results computed on GPU, and some libraries are disabled on CPU."
        )
    gr.Markdown(
        "If the demo runs out of memory, try to ***decrease*** the number of object prompts and ***increase*** the score threshold."
    )
    gr.Markdown("---")
def add_tail_info():
    gr.Markdown("---")
    gr.Markdown("### We also support location prompts")
    gr.Markdown(
        """
| Location prompt | result | Location prompt | result |
| ---- | ---- | ---- | ---- |
|  |  |  |  |
|  |  |  |  |
"""
    )
    gr.Markdown("---")
| if __name__ == "__main__": | |
| available_port = [80, 8080] | |
| for port in available_port: | |
| if is_port_in_use(port): | |
| continue | |
| else: | |
| server_port = port | |
| break | |
| print("server_port", server_port) | |
| available_memory = [ | |
| torch.cuda.mem_get_info(i)[0] / 1024**3 for i in range(torch.cuda.device_count()) | |
| ] | |
| global running_device | |
| if len(available_memory) > 0: | |
| max_available_memory = max(available_memory) | |
| device_id = available_memory.index(max_available_memory) | |
| running_device = "cuda:" + str(device_id) | |
| else: | |
| max_available_memory = 0 | |
| running_device = "cpu" | |
| global save_memory | |
| save_memory = False | |
| if max_available_memory > 0 and max_available_memory < 40: | |
| save_memory = True | |
| print("available_memory", available_memory) | |
| print("max_available_memory", max_available_memory) | |
| print("running_device", running_device) | |
| print("save_memory", save_memory) | |
| # ========================================================================================== | |
| mp.set_start_method("spawn", force=True) | |
| setup_logger(name="fvcore") | |
| setup_logger(name="ape") | |
| global logger | |
| logger = setup_logger() | |
| global aug | |
| aug = T.ResizeShortestEdge([1024, 1024], 1024) | |
| global all_demo | |
| all_demo = {} | |
| all_cfg = {} | |
| # load_APE_A() | |
| # load_APE_B() | |
| # load_APE_C() | |
| save_memory = False | |
| load_APE_D() | |
| title = "APE: Aligning and Prompting Everything All at Once for Universal Visual Perception" | |
| block = gr.Blocks(title=title).queue() | |
| with block: | |
| add_head_info(max_available_memory) | |
| # APE_A_tab() | |
| # APE_C_tab() | |
| APE_D_tab() | |
| comparison_tab() | |
| # add_tail_info() | |
| block.launch( | |
| share=False, | |
| # server_name="0.0.0.0", | |
| # server_port=server_port, | |
| show_api=False, | |
| show_error=True, | |
| ) | |