Spaces:
Paused
Paused
alessandro trinca tornidor
committed on
Commit
·
f182d7a
1
Parent(s):
a5b4be9
[refactor] add and use create_placeholder_variables() function
Browse files- README.md +1 -0
- main.py +10 -17
- resources/placeholders/error_happened.png +3 -0
- resources/placeholders/no_seg_out.png +3 -0
- utils/utils.py +14 -0
README.md
CHANGED
|
@@ -321,3 +321,4 @@ If you find this project useful in your research, please consider citing:
|
|
| 321 |
|
| 322 |
## Acknowledgement
|
| 323 |
- This work is built upon the [LLaVA](https://github.com/haotian-liu/LLaVA) and [SAM](https://github.com/facebookresearch/segment-anything).
|
|
|
|
|
|
| 321 |
|
| 322 |
## Acknowledgement
|
| 323 |
- This work is built upon the [LLaVA](https://github.com/haotian-liu/LLaVA) and [SAM](https://github.com/facebookresearch/segment-anything).
|
| 324 |
+
- placeholders images (error, 'no output segmentation') from Muhammad Khaleeq (https://www.vecteezy.com/members/iyikon)
|
main.py
CHANGED
|
@@ -20,9 +20,7 @@ from model.LISA import LISAForCausalLM
|
|
| 20 |
from model.llava import conversation as conversation_lib
|
| 21 |
from model.llava.mm_utils import tokenizer_image_token
|
| 22 |
from model.segment_anything.utils.transforms import ResizeLongestSide
|
| 23 |
-
from utils import constants, session_logger
|
| 24 |
-
from utils.utils import (DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN,
|
| 25 |
-
DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX)
|
| 26 |
|
| 27 |
|
| 28 |
session_logger.change_logging(logging.DEBUG)
|
|
@@ -34,6 +32,7 @@ FASTAPI_STATIC = os.getenv("FASTAPI_STATIC")
|
|
| 34 |
os.makedirs(FASTAPI_STATIC, exist_ok=True)
|
| 35 |
app.mount("/static", StaticFiles(directory=FASTAPI_STATIC), name="static")
|
| 36 |
templates = Jinja2Templates(directory="templates")
|
|
|
|
| 37 |
|
| 38 |
|
| 39 |
@app.get("/health")
|
|
@@ -230,6 +229,7 @@ def get_inference_model_by_args(args_to_parse):
|
|
| 230 |
logging.info(f"args_to_parse:{args_to_parse}, creating model...")
|
| 231 |
model, clip_image_processor, tokenizer, transform = get_model(args_to_parse)
|
| 232 |
logging.info("created model, preparing inference function")
|
|
|
|
| 233 |
|
| 234 |
@session_logger.set_uuid_logging
|
| 235 |
def inference(input_str, input_image):
|
|
@@ -242,22 +242,19 @@ def get_inference_model_by_args(args_to_parse):
|
|
| 242 |
## input valid check
|
| 243 |
if not re.match(r"^[A-Za-z ,.!?\'\"]+$", input_str) or len(input_str) < 1:
|
| 244 |
output_str = "[Error] Invalid input: ", input_str
|
| 245 |
-
|
| 246 |
-
## error happened
|
| 247 |
-
output_image = cv2.imread("./resources/error_happened.png")[:, :, ::-1]
|
| 248 |
-
return output_image, output_str
|
| 249 |
|
| 250 |
# Model Inference
|
| 251 |
conv = conversation_lib.conv_templates[args_to_parse.conv_type].copy()
|
| 252 |
conv.messages = []
|
| 253 |
|
| 254 |
prompt = input_str
|
| 255 |
-
prompt = DEFAULT_IMAGE_TOKEN + "\n" + prompt
|
| 256 |
if args_to_parse.use_mm_start_end:
|
| 257 |
replace_token = (
|
| 258 |
-
DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
|
| 259 |
)
|
| 260 |
-
prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)
|
| 261 |
|
| 262 |
conv.append_message(conv.roles[0], prompt)
|
| 263 |
conv.append_message(conv.roles[1], "")
|
|
@@ -300,7 +297,7 @@ def get_inference_model_by_args(args_to_parse):
|
|
| 300 |
max_new_tokens=512,
|
| 301 |
tokenizer=tokenizer,
|
| 302 |
)
|
| 303 |
-
output_ids = output_ids[0][output_ids[0] != IMAGE_TOKEN_INDEX]
|
| 304 |
|
| 305 |
text_output = tokenizer.decode(output_ids, skip_special_tokens=False)
|
| 306 |
text_output = text_output.replace("\n", "").replace(" ", " ")
|
|
@@ -321,12 +318,8 @@ def get_inference_model_by_args(args_to_parse):
|
|
| 321 |
+ pred_mask[:, :, None].astype(np.uint8) * np.array([255, 0, 0]) * 0.5
|
| 322 |
)[pred_mask]
|
| 323 |
|
| 324 |
-
output_str = f"ASSISTANT: {text_output}"
|
| 325 |
-
if save_img is not None:
|
| 326 |
-
output_image = save_img # input_image
|
| 327 |
-
else:
|
| 328 |
-
## no seg output
|
| 329 |
-
output_image = cv2.imread("./resources/no_seg_out.png")[:, :, ::-1]
|
| 330 |
logging.info(f"output_image type: {type(output_image)}.")
|
| 331 |
return output_image, output_str
|
| 332 |
|
|
|
|
| 20 |
from model.llava import conversation as conversation_lib
|
| 21 |
from model.llava.mm_utils import tokenizer_image_token
|
| 22 |
from model.segment_anything.utils.transforms import ResizeLongestSide
|
| 23 |
+
from utils import constants, session_logger, utils
|
|
|
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
session_logger.change_logging(logging.DEBUG)
|
|
|
|
| 32 |
os.makedirs(FASTAPI_STATIC, exist_ok=True)
|
| 33 |
app.mount("/static", StaticFiles(directory=FASTAPI_STATIC), name="static")
|
| 34 |
templates = Jinja2Templates(directory="templates")
|
| 35 |
+
placeholders = utils.create_placeholder_variables()
|
| 36 |
|
| 37 |
|
| 38 |
@app.get("/health")
|
|
|
|
| 229 |
logging.info(f"args_to_parse:{args_to_parse}, creating model...")
|
| 230 |
model, clip_image_processor, tokenizer, transform = get_model(args_to_parse)
|
| 231 |
logging.info("created model, preparing inference function")
|
| 232 |
+
no_seg_out, error_happened = placeholders["no_seg_out"], placeholders["error_happened"]
|
| 233 |
|
| 234 |
@session_logger.set_uuid_logging
|
| 235 |
def inference(input_str, input_image):
|
|
|
|
| 242 |
## input valid check
|
| 243 |
if not re.match(r"^[A-Za-z ,.!?\'\"]+$", input_str) or len(input_str) < 1:
|
| 244 |
output_str = "[Error] Invalid input: ", input_str
|
| 245 |
+
return error_happened, output_str
|
|
|
|
|
|
|
|
|
|
| 246 |
|
| 247 |
# Model Inference
|
| 248 |
conv = conversation_lib.conv_templates[args_to_parse.conv_type].copy()
|
| 249 |
conv.messages = []
|
| 250 |
|
| 251 |
prompt = input_str
|
| 252 |
+
prompt = utils.DEFAULT_IMAGE_TOKEN + "\n" + prompt
|
| 253 |
if args_to_parse.use_mm_start_end:
|
| 254 |
replace_token = (
|
| 255 |
+
utils.DEFAULT_IM_START_TOKEN + utils.DEFAULT_IMAGE_TOKEN + utils.DEFAULT_IM_END_TOKEN
|
| 256 |
)
|
| 257 |
+
prompt = prompt.replace(utils.DEFAULT_IMAGE_TOKEN, replace_token)
|
| 258 |
|
| 259 |
conv.append_message(conv.roles[0], prompt)
|
| 260 |
conv.append_message(conv.roles[1], "")
|
|
|
|
| 297 |
max_new_tokens=512,
|
| 298 |
tokenizer=tokenizer,
|
| 299 |
)
|
| 300 |
+
output_ids = output_ids[0][output_ids[0] != utils.IMAGE_TOKEN_INDEX]
|
| 301 |
|
| 302 |
text_output = tokenizer.decode(output_ids, skip_special_tokens=False)
|
| 303 |
text_output = text_output.replace("\n", "").replace(" ", " ")
|
|
|
|
| 318 |
+ pred_mask[:, :, None].astype(np.uint8) * np.array([255, 0, 0]) * 0.5
|
| 319 |
)[pred_mask]
|
| 320 |
|
| 321 |
+
output_str = f"ASSISTANT: {text_output}"
|
| 322 |
+
output_image = no_seg_out if save_img is None else save_img
|
|
|
|
|
|
|
|
|
|
|
|
|
| 323 |
logging.info(f"output_image type: {type(output_image)}.")
|
| 324 |
return output_image, output_str
|
| 325 |
|
resources/placeholders/error_happened.png
ADDED
|
Git LFS Details
|
resources/placeholders/no_seg_out.png
ADDED
|
Git LFS Details
|
utils/utils.py
CHANGED
|
@@ -1,9 +1,11 @@
|
|
| 1 |
from enum import Enum
|
|
|
|
| 2 |
|
| 3 |
import numpy as np
|
| 4 |
import torch
|
| 5 |
import torch.distributed as dist
|
| 6 |
|
|
|
|
| 7 |
IGNORE_INDEX = -100
|
| 8 |
IMAGE_TOKEN_INDEX = -200
|
| 9 |
DEFAULT_IMAGE_TOKEN = "<image>"
|
|
@@ -40,6 +42,7 @@ ANSWER_LIST = [
|
|
| 40 |
"Sure, the segmentation result is [SEG].",
|
| 41 |
"[SEG].",
|
| 42 |
]
|
|
|
|
| 43 |
|
| 44 |
|
| 45 |
class Summary(Enum):
|
|
@@ -161,3 +164,14 @@ def dict_to_cuda(input_dict):
|
|
| 161 |
):
|
| 162 |
input_dict[k] = [ele.cuda(non_blocking=True) for ele in v]
|
| 163 |
return input_dict
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from enum import Enum
|
| 2 |
+
from pathlib import Path
|
| 3 |
|
| 4 |
import numpy as np
|
| 5 |
import torch
|
| 6 |
import torch.distributed as dist
|
| 7 |
|
| 8 |
+
|
| 9 |
IGNORE_INDEX = -100
|
| 10 |
IMAGE_TOKEN_INDEX = -200
|
| 11 |
DEFAULT_IMAGE_TOKEN = "<image>"
|
|
|
|
| 42 |
"Sure, the segmentation result is [SEG].",
|
| 43 |
"[SEG].",
|
| 44 |
]
|
| 45 |
+
ROOT = Path(__file__).parent.parent
|
| 46 |
|
| 47 |
|
| 48 |
class Summary(Enum):
|
|
|
|
| 164 |
):
|
| 165 |
input_dict[k] = [ele.cuda(non_blocking=True) for ele in v]
|
| 166 |
return input_dict
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def create_placeholder_variables():
    """Load the placeholder images shown when inference errors out or yields no segmentation.

    Returns:
        dict: ``{"no_seg_out": ndarray, "error_happened": ndarray}`` with the
        images as RGB arrays (``cv2.imread`` loads BGR; the ``[:, :, ::-1]``
        slice reverses the channel order).

    Raises:
        FileNotFoundError: if a placeholder image is missing or unreadable.
    """
    import cv2  # local import keeps cv2 off the module's import-time critical path

    # NOTE(review): the images are committed under resources/placeholders/ (plural);
    # the previous path used "placeholder" (singular) and could never resolve.
    placeholders_dir = ROOT / "resources" / "placeholders"
    images = {}
    for name in ("no_seg_out", "error_happened"):
        path = placeholders_dir / f"{name}.png"
        img = cv2.imread(str(path))
        if img is None:
            # cv2.imread returns None silently on failure; fail loudly instead of
            # letting the slice below raise a confusing TypeError.
            raise FileNotFoundError(f"placeholder image not found or unreadable: {path}")
        images[name] = img[:, :, ::-1]  # BGR -> RGB
    return images
|