chasetank committed on
Commit
082f846
1 Parent(s): 8224e2b

fix: model test

Files changed (1)
  1. app.py +215 -112
app.py CHANGED
@@ -21,7 +21,7 @@ from controlnet_aux import OpenposeDetector, MLSDdetector, HEDdetector
21
  from langchain.agents.initialize import initialize_agent
22
  from langchain.agents.tools import Tool
23
  from langchain.chains.conversation.memory import ConversationBufferMemory
24
- from langchain.llms import OpenAIChat
25
 
26
  VISUAL_CHATGPT_PREFIX = """Visual ChatGPT is designed to be able to assist with a wide range of text and visual related tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. Visual ChatGPT is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.
27
 
@@ -123,8 +123,10 @@ class MaskFormer:
123
  def __init__(self, device):
124
  print(f"Initializing MaskFormer to {device}")
125
  self.device = device
126
- self.processor = CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
127
- self.model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined").to(device)
 
 
128
 
129
  def inference(self, image_path, text):
130
  threshold = 0.5
@@ -132,7 +134,8 @@ class MaskFormer:
132
  padding = 20
133
  original_image = Image.open(image_path)
134
  image = original_image.resize((512, 512))
135
- inputs = self.processor(text=text, images=image, padding="max_length", return_tensors="pt").to(self.device)
 
136
  with torch.no_grad():
137
  outputs = self.model(**inputs)
138
  mask = torch.sigmoid(outputs[0]).squeeze().cpu().numpy() > threshold
@@ -142,7 +145,8 @@ class MaskFormer:
142
  true_indices = np.argwhere(mask)
143
  mask_array = np.zeros_like(mask, dtype=bool)
144
  for idx in true_indices:
145
- padded_slice = tuple(slice(max(0, i - padding), i + padding + 1) for i in idx)
 
146
  mask_array[padded_slice] = True
147
  visual_mask = (mask_array * 255).astype(np.uint8)
148
  image_mask = Image.fromarray(visual_mask)
@@ -165,7 +169,8 @@ class ImageEditing:
165
  "The input to this tool should be a comma separated string of two, "
166
  "representing the image_path and the object need to be removed. ")
167
  def inference_remove(self, inputs):
168
- image_path, to_be_removed_txt = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
 
169
  return self.inference_replace(f"{image_path},{to_be_removed_txt},background")
170
 
171
  @prompts(name="Replace Something From The Photo",
@@ -180,7 +185,8 @@ class ImageEditing:
180
  mask_image = self.mask_former.inference(image_path, to_be_replaced_txt)
181
  updated_image = self.inpaint(prompt=replace_with_txt, image=original_image.resize((512, 512)),
182
  mask_image=mask_image.resize((512, 512))).images[0]
183
- updated_image_path = get_new_image_name(image_path, func_name="replace-something")
 
184
  updated_image = updated_image.resize(original_size)
185
  updated_image.save(updated_image_path)
186
  print(
@@ -197,7 +203,8 @@ class InstructPix2Pix:
197
  self.pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained("timbrooks/instruct-pix2pix",
198
  safety_checker=None,
199
  torch_dtype=self.torch_dtype).to(device)
200
- self.pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(self.pipe.scheduler.config)
 
201
 
202
  @prompts(name="Instruct Image Using Text",
203
  description="useful when you want to the style of the image to be like the text. "
@@ -207,10 +214,13 @@ class InstructPix2Pix:
207
  def inference(self, inputs):
208
  """Change style of image."""
209
  print("===>Starting InstructPix2Pix Inference")
210
- image_path, text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
 
211
  original_image = Image.open(image_path)
212
- image = self.pipe(text, image=original_image, num_inference_steps=40, image_guidance_scale=1.2).images[0]
213
- updated_image_path = get_new_image_name(image_path, func_name="pix2pix")
 
 
214
  image.save(updated_image_path)
215
  print(f"\nProcessed InstructPix2Pix, Input Image: {image_path}, Instruct Text: {text}, "
216
  f"Output Image: {updated_image_path}")
@@ -248,7 +258,8 @@ class ImageCaptioning:
248
  print(f"Initializing ImageCaptioning to {device}")
249
  self.device = device
250
  self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
251
- self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 
252
  self.model = BlipForConditionalGeneration.from_pretrained(
253
  "Salesforce/blip-image-captioning-base", torch_dtype=self.torch_dtype).to(self.device)
254
 
@@ -256,10 +267,12 @@ class ImageCaptioning:
256
  description="useful when you want to know what is inside the photo. receives image_path as input. "
257
  "The input to this tool should be a string, representing the image_path. ")
258
  def inference(self, image_path):
259
- inputs = self.processor(Image.open(image_path), return_tensors="pt").to(self.device, self.torch_dtype)
 
260
  out = self.model.generate(**inputs)
261
  captions = self.processor.decode(out[0], skip_special_tokens=True)
262
- print(f"\nProcessed ImageCaptioning, Input Image: {image_path}, Output Text: {captions}")
 
263
  return captions
264
 
265
 
@@ -283,7 +296,8 @@ class Image2Canny:
283
  canny = Image.fromarray(canny)
284
  updated_image_path = get_new_image_name(inputs, func_name="edge")
285
  canny.save(updated_image_path)
286
- print(f"\nProcessed Image2Canny, Input Image: {inputs}, Output Text: {updated_image_path}")
 
287
  return updated_image_path
288
 
289
 
@@ -296,12 +310,13 @@ class CannyText2Image:
296
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
297
  "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
298
  torch_dtype=self.torch_dtype)
299
- self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
 
300
  self.pipe.to(device)
301
  self.seed = -1
302
  self.a_prompt = 'best quality, extremely detailed'
303
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
304
- 'fewer digits, cropped, worst quality, low quality'
305
 
306
  @prompts(name="Generate Image Condition On Canny Image",
307
  description="useful when you want to generate a new real image from both the user description and a canny image."
@@ -310,14 +325,16 @@ class CannyText2Image:
310
  "The input to this tool should be a comma separated string of two, "
311
  "representing the image_path and the user description. ")
312
  def inference(self, inputs):
313
- image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
 
314
  image = Image.open(image_path)
315
  self.seed = random.randint(0, 65535)
316
  seed_everything(self.seed)
317
  prompt = f'{instruct_text}, {self.a_prompt}'
318
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
319
  guidance_scale=9.0).images[0]
320
- updated_image_path = get_new_image_name(image_path, func_name="canny2image")
 
321
  image.save(updated_image_path)
322
  print(f"\nProcessed CannyText2Image, Input Canny: {image_path}, Input Text: {instruct_text}, "
323
  f"Output Text: {updated_image_path}")
@@ -339,7 +356,8 @@ class Image2Line:
339
  mlsd = self.detector(image)
340
  updated_image_path = get_new_image_name(inputs, func_name="line-of")
341
  mlsd.save(updated_image_path)
342
- print(f"\nProcessed Image2Line, Input Image: {inputs}, Output Line: {updated_image_path}")
 
343
  return updated_image_path
344
 
345
 
@@ -353,12 +371,13 @@ class LineText2Image:
353
  "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
354
  torch_dtype=self.torch_dtype
355
  )
356
- self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
 
357
  self.pipe.to(device)
358
  self.seed = -1
359
  self.a_prompt = 'best quality, extremely detailed'
360
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
361
- 'fewer digits, cropped, worst quality, low quality'
362
 
363
  @prompts(name="Generate Image Condition On Line Image",
364
  description="useful when you want to generate a new real image from both the user description "
@@ -368,14 +387,16 @@ class LineText2Image:
368
  "The input to this tool should be a comma separated string of two, "
369
  "representing the image_path and the user description. ")
370
  def inference(self, inputs):
371
- image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
 
372
  image = Image.open(image_path)
373
  self.seed = random.randint(0, 65535)
374
  seed_everything(self.seed)
375
  prompt = f'{instruct_text}, {self.a_prompt}'
376
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
377
  guidance_scale=9.0).images[0]
378
- updated_image_path = get_new_image_name(image_path, func_name="line2image")
 
379
  image.save(updated_image_path)
380
  print(f"\nProcessed LineText2Image, Input Line: {image_path}, Input Text: {instruct_text}, "
381
  f"Output Text: {updated_image_path}")
@@ -395,9 +416,11 @@ class Image2Hed:
395
  def inference(self, inputs):
396
  image = Image.open(inputs)
397
  hed = self.detector(image)
398
- updated_image_path = get_new_image_name(inputs, func_name="hed-boundary")
 
399
  hed.save(updated_image_path)
400
- print(f"\nProcessed Image2Hed, Input Image: {inputs}, Output Hed: {updated_image_path}")
 
401
  return updated_image_path
402
 
403
 
@@ -411,12 +434,13 @@ class HedText2Image:
411
  "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
412
  torch_dtype=self.torch_dtype
413
  )
414
- self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
 
415
  self.pipe.to(device)
416
  self.seed = -1
417
  self.a_prompt = 'best quality, extremely detailed'
418
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
419
- 'fewer digits, cropped, worst quality, low quality'
420
 
421
  @prompts(name="Generate Image Condition On Soft Hed Boundary Image",
422
  description="useful when you want to generate a new real image from both the user description "
@@ -426,14 +450,16 @@ class HedText2Image:
426
  "The input to this tool should be a comma separated string of two, "
427
  "representing the image_path and the user description")
428
  def inference(self, inputs):
429
- image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
 
430
  image = Image.open(image_path)
431
  self.seed = random.randint(0, 65535)
432
  seed_everything(self.seed)
433
  prompt = f'{instruct_text}, {self.a_prompt}'
434
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
435
  guidance_scale=9.0).images[0]
436
- updated_image_path = get_new_image_name(image_path, func_name="hed2image")
 
437
  image.save(updated_image_path)
438
  print(f"\nProcessed HedText2Image, Input Hed: {image_path}, Input Text: {instruct_text}, "
439
  f"Output Image: {updated_image_path}")
@@ -455,7 +481,8 @@ class Image2Scribble:
455
  scribble = self.detector(image, scribble=True)
456
  updated_image_path = get_new_image_name(inputs, func_name="scribble")
457
  scribble.save(updated_image_path)
458
- print(f"\nProcessed Image2Scribble, Input Image: {inputs}, Output Scribble: {updated_image_path}")
 
459
  return updated_image_path
460
 
461
 
@@ -469,12 +496,13 @@ class ScribbleText2Image:
469
  "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
470
  torch_dtype=self.torch_dtype
471
  )
472
- self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
 
473
  self.pipe.to(device)
474
  self.seed = -1
475
  self.a_prompt = 'best quality, extremely detailed'
476
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
477
- 'fewer digits, cropped, worst quality, low quality'
478
 
479
  @prompts(name="Generate Image Condition On Sketch Image",
480
  description="useful when you want to generate a new real image from both the user description and "
@@ -482,14 +510,16 @@ class ScribbleText2Image:
482
  "The input to this tool should be a comma separated string of two, "
483
  "representing the image_path and the user description")
484
  def inference(self, inputs):
485
- image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
 
486
  image = Image.open(image_path)
487
  self.seed = random.randint(0, 65535)
488
  seed_everything(self.seed)
489
  prompt = f'{instruct_text}, {self.a_prompt}'
490
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
491
  guidance_scale=9.0).images[0]
492
- updated_image_path = get_new_image_name(image_path, func_name="scribble2image")
 
493
  image.save(updated_image_path)
494
  print(f"\nProcessed ScribbleText2Image, Input Scribble: {image_path}, Input Text: {instruct_text}, "
495
  f"Output Image: {updated_image_path}")
@@ -499,7 +529,8 @@ class ScribbleText2Image:
499
  class Image2Pose:
500
  def __init__(self, device):
501
  print("Initializing Image2Pose")
502
- self.detector = OpenposeDetector.from_pretrained('lllyasviel/ControlNet')
 
503
 
504
  @prompts(name="Pose Detection On Image",
505
  description="useful when you want to detect the human pose of the image. "
@@ -510,7 +541,8 @@ class Image2Pose:
510
  pose = self.detector(image)
511
  updated_image_path = get_new_image_name(inputs, func_name="human-pose")
512
  pose.save(updated_image_path)
513
- print(f"\nProcessed Image2Pose, Input Image: {inputs}, Output Pose: {updated_image_path}")
 
514
  return updated_image_path
515
 
516
 
@@ -523,14 +555,15 @@ class PoseText2Image:
523
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
524
  "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
525
  torch_dtype=self.torch_dtype)
526
- self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
 
527
  self.pipe.to(device)
528
  self.num_inference_steps = 20
529
  self.seed = -1
530
  self.unconditional_guidance_scale = 9.0
531
  self.a_prompt = 'best quality, extremely detailed'
532
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
533
- ' fewer digits, cropped, worst quality, low quality'
534
 
535
  @prompts(name="Generate Image Condition On Pose Image",
536
  description="useful when you want to generate a new real image from both the user description "
@@ -540,14 +573,16 @@ class PoseText2Image:
540
  "The input to this tool should be a comma separated string of two, "
541
  "representing the image_path and the user description")
542
  def inference(self, inputs):
543
- image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
 
544
  image = Image.open(image_path)
545
  self.seed = random.randint(0, 65535)
546
  seed_everything(self.seed)
547
  prompt = f'{instruct_text}, {self.a_prompt}'
548
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
549
  guidance_scale=9.0).images[0]
550
- updated_image_path = get_new_image_name(image_path, func_name="pose2image")
 
551
  image.save(updated_image_path)
552
  print(f"\nProcessed PoseText2Image, Input Pose: {image_path}, Input Text: {instruct_text}, "
553
  f"Output Image: {updated_image_path}")
@@ -557,45 +592,83 @@ class PoseText2Image:
557
  class Image2Seg:
558
  def __init__(self, device):
559
  print("Initializing Image2Seg")
560
- self.image_processor = AutoImageProcessor.from_pretrained("openmmlab/upernet-convnext-small")
561
- self.image_segmentor = UperNetForSemanticSegmentation.from_pretrained("openmmlab/upernet-convnext-small")
 
 
562
  self.ade_palette = [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50],
563
- [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255],
564
- [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7],
565
- [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82],
566
- [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3],
567
- [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255],
568
- [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220],
569
- [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224],
570
- [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255],
571
- [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7],
572
- [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153],
573
- [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255],
574
- [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0],
575
- [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255],
576
- [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255],
577
- [11, 200, 200], [255, 82, 0], [0, 255, 245], [0, 61, 255],
578
- [0, 255, 112], [0, 255, 133], [255, 0, 0], [255, 163, 0],
579
- [255, 102, 0], [194, 255, 0], [0, 143, 255], [51, 255, 0],
580
- [0, 82, 255], [0, 255, 41], [0, 255, 173], [10, 0, 255],
581
- [173, 255, 0], [0, 255, 153], [255, 92, 0], [255, 0, 255],
582
- [255, 0, 245], [255, 0, 102], [255, 173, 0], [255, 0, 20],
583
- [255, 184, 184], [0, 31, 255], [0, 255, 61], [0, 71, 255],
584
- [255, 0, 204], [0, 255, 194], [0, 255, 82], [0, 10, 255],
585
- [0, 112, 255], [51, 0, 255], [0, 194, 255], [0, 122, 255],
586
- [0, 255, 163], [255, 153, 0], [0, 255, 10], [255, 112, 0],
587
- [143, 255, 0], [82, 0, 255], [163, 255, 0], [255, 235, 0],
588
- [8, 184, 170], [133, 0, 255], [0, 255, 92], [184, 0, 255],
589
- [255, 0, 31], [0, 184, 255], [0, 214, 255], [255, 0, 112],
590
- [92, 255, 0], [0, 224, 255], [112, 224, 255], [70, 184, 160],
591
- [163, 0, 255], [153, 0, 255], [71, 255, 0], [255, 0, 163],
592
- [255, 204, 0], [255, 0, 143], [0, 255, 235], [133, 255, 0],
593
- [255, 0, 235], [245, 0, 255], [255, 0, 122], [255, 245, 0],
594
- [10, 190, 212], [214, 255, 0], [0, 204, 255], [20, 0, 255],
595
- [255, 255, 0], [0, 153, 255], [0, 41, 255], [0, 255, 204],
596
- [41, 0, 255], [41, 255, 0], [173, 0, 255], [0, 245, 255],
597
- [71, 0, 255], [122, 0, 255], [0, 255, 184], [0, 92, 255],
598
- [184, 255, 0], [0, 133, 255], [255, 214, 0], [25, 194, 194],
 
599
  [102, 255, 0], [92, 0, 255]]
600
 
601
  @prompts(name="Segmentation On Image",
@@ -605,19 +678,24 @@ class Image2Seg:
605
  "The input to this tool should be a string, representing the image_path")
606
  def inference(self, inputs):
607
  image = Image.open(inputs)
608
- pixel_values = self.image_processor(image, return_tensors="pt").pixel_values
 
609
  with torch.no_grad():
610
  outputs = self.image_segmentor(pixel_values)
611
- seg = self.image_processor.post_process_semantic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]
612
- color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8) # height, width, 3
 
 
613
  palette = np.array(self.ade_palette)
614
  for label, color in enumerate(palette):
615
  color_seg[seg == label, :] = color
616
  color_seg = color_seg.astype(np.uint8)
617
  segmentation = Image.fromarray(color_seg)
618
- updated_image_path = get_new_image_name(inputs, func_name="segmentation")
 
619
  segmentation.save(updated_image_path)
620
- print(f"\nProcessed Image2Pose, Input Image: {inputs}, Output Pose: {updated_image_path}")
 
621
  return updated_image_path
622
 
623
 
@@ -630,12 +708,13 @@ class SegText2Image:
630
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
631
  "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
632
  torch_dtype=self.torch_dtype)
633
- self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
 
634
  self.pipe.to(device)
635
  self.seed = -1
636
  self.a_prompt = 'best quality, extremely detailed'
637
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
638
- ' fewer digits, cropped, worst quality, low quality'
639
 
640
  @prompts(name="Generate Image Condition On Segmentations",
641
  description="useful when you want to generate a new real image from both the user description and segmentations. "
@@ -644,14 +723,16 @@ class SegText2Image:
644
  "The input to this tool should be a comma separated string of two, "
645
  "representing the image_path and the user description")
646
  def inference(self, inputs):
647
- image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
 
648
  image = Image.open(image_path)
649
  self.seed = random.randint(0, 65535)
650
  seed_everything(self.seed)
651
  prompt = f'{instruct_text}, {self.a_prompt}'
652
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
653
  guidance_scale=9.0).images[0]
654
- updated_image_path = get_new_image_name(image_path, func_name="segment2image")
 
655
  image.save(updated_image_path)
656
  print(f"\nProcessed SegText2Image, Input Seg: {image_path}, Input Text: {instruct_text}, "
657
  f"Output Image: {updated_image_path}")
@@ -676,7 +757,8 @@ class Image2Depth:
676
  depth = Image.fromarray(depth)
677
  updated_image_path = get_new_image_name(inputs, func_name="depth")
678
  depth.save(updated_image_path)
679
- print(f"\nProcessed Image2Depth, Input Image: {inputs}, Output Depth: {updated_image_path}")
 
680
  return updated_image_path
681
 
682
 
@@ -689,12 +771,13 @@ class DepthText2Image:
689
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
690
  "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
691
  torch_dtype=self.torch_dtype)
692
- self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
 
693
  self.pipe.to(device)
694
  self.seed = -1
695
  self.a_prompt = 'best quality, extremely detailed'
696
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
697
- ' fewer digits, cropped, worst quality, low quality'
698
 
699
  @prompts(name="Generate Image Condition On Depth",
700
  description="useful when you want to generate a new real image from both the user description and depth image. "
@@ -703,14 +786,16 @@ class DepthText2Image:
703
  "The input to this tool should be a comma separated string of two, "
704
  "representing the image_path and the user description")
705
  def inference(self, inputs):
706
- image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
 
707
  image = Image.open(image_path)
708
  self.seed = random.randint(0, 65535)
709
  seed_everything(self.seed)
710
  prompt = f'{instruct_text}, {self.a_prompt}'
711
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
712
  guidance_scale=9.0).images[0]
713
- updated_image_path = get_new_image_name(image_path, func_name="depth2image")
 
714
  image.save(updated_image_path)
715
  print(f"\nProcessed DepthText2Image, Input Depth: {image_path}, Input Text: {instruct_text}, "
716
  f"Output Image: {updated_image_path}")
@@ -720,7 +805,8 @@ class DepthText2Image:
720
  class Image2Normal:
721
  def __init__(self, device):
722
  print("Initializing Image2Normal")
723
- self.depth_estimator = pipeline("depth-estimation", model="Intel/dpt-hybrid-midas")
 
724
  self.bg_threhold = 0.4
725
 
726
  @prompts(name="Predict Normal Map On Image",
@@ -747,7 +833,8 @@ class Image2Normal:
747
  image = image.resize(original_size)
748
  updated_image_path = get_new_image_name(inputs, func_name="normal-map")
749
  image.save(updated_image_path)
750
- print(f"\nProcessed Image2Normal, Input Image: {inputs}, Output Depth: {updated_image_path}")
 
751
  return updated_image_path
752
 
753
 
@@ -760,12 +847,13 @@ class NormalText2Image:
760
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
761
  "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
762
  torch_dtype=self.torch_dtype)
763
- self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
 
764
  self.pipe.to(device)
765
  self.seed = -1
766
  self.a_prompt = 'best quality, extremely detailed'
767
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
768
- ' fewer digits, cropped, worst quality, low quality'
769
 
770
  @prompts(name="Generate Image Condition On Normal Map",
771
  description="useful when you want to generate a new real image from both the user description and normal map. "
@@ -774,14 +862,16 @@ class NormalText2Image:
774
  "The input to this tool should be a comma separated string of two, "
775
  "representing the image_path and the user description")
776
  def inference(self, inputs):
777
- image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
 
778
  image = Image.open(image_path)
779
  self.seed = random.randint(0, 65535)
780
  seed_everything(self.seed)
781
  prompt = f'{instruct_text}, {self.a_prompt}'
782
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
783
  guidance_scale=9.0).images[0]
784
- updated_image_path = get_new_image_name(image_path, func_name="normal2image")
 
785
  image.save(updated_image_path)
786
  print(f"\nProcessed NormalText2Image, Input Normal: {image_path}, Input Text: {instruct_text}, "
787
  f"Output Image: {updated_image_path}")
@@ -793,7 +883,8 @@ class VisualQuestionAnswering:
793
  print(f"Initializing VisualQuestionAnswering to {device}")
794
  self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
795
  self.device = device
796
- self.processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
 
797
  self.model = BlipForQuestionAnswering.from_pretrained(
798
  "Salesforce/blip-vqa-base", torch_dtype=self.torch_dtype).to(self.device)
799
 
@@ -802,9 +893,11 @@ class VisualQuestionAnswering:
802
  "like: what is the background color of the last image, how many cats in this figure, what is in this figure. "
803
  "The input to this tool should be a comma separated string of two, representing the image_path and the question")
804
  def inference(self, inputs):
805
- image_path, question = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
 
806
  raw_image = Image.open(image_path).convert('RGB')
807
- inputs = self.processor(raw_image, question, return_tensors="pt").to(self.device, self.torch_dtype)
 
808
  out = self.model.generate(**inputs)
809
  answer = self.processor.decode(out[0], skip_special_tokens=True)
810
  print(f"\nProcessed VisualQuestionAnswering, Input Image: {image_path}, Input Question: {question}, "
@@ -817,10 +910,12 @@ class ConversationBot:
817
  # load_dict = {'VisualQuestionAnswering':'cuda:0', 'ImageCaptioning':'cuda:1',...}
818
  print(f"Initializing VisualChatGPT, load_dict={load_dict}")
819
  if 'ImageCaptioning' not in load_dict:
820
- raise ValueError("You have to load ImageCaptioning as a basic function for VisualChatGPT")
 
821
 
822
  self.llm = OpenAIChat(temperature=0)
823
- self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output')
 
824
 
825
  self.models = {}
826
  for class_name, device in load_dict.items():
@@ -831,7 +926,8 @@ class ConversationBot:
831
  for e in dir(instance):
832
  if e.startswith('inference'):
833
  func = getattr(instance, e)
834
- self.tools.append(Tool(name=func.name, description=func.description, func=func))
 
835
 
836
  self.agent = initialize_agent(
837
  self.tools,
@@ -844,10 +940,12 @@ class ConversationBot:
844
  'suffix': VISUAL_CHATGPT_SUFFIX}, )
845
 
846
  def run_text(self, text, state):
847
- self.agent.memory.buffer = cut_dialogue_history(self.agent.memory.buffer, keep_last_n_words=500)
 
848
  res = self.agent({"input": text})
849
  res['output'] = res['output'].replace("\\", "/")
850
- response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
 
851
  state = state + [(text, response)]
852
  print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n"
853
  f"Current Memory: {self.agent.memory.buffer}")
@@ -865,12 +963,15 @@ class ConversationBot:
865
  img = img.resize((width_new, height_new))
866
  img = img.convert('RGB')
867
  img.save(image_filename, "PNG")
868
- print(f"Resize image form {width}x{height} to {width_new}x{height_new}")
 
869
  description = self.models['ImageCaptioning'].inference(image_filename)
870
  Human_prompt = f'\nHuman: provide a figure named {image_filename}. The description is: {description}. This information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
871
  AI_prompt = "Received. "
872
- self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt
873
- state = state + [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)]
 
 
874
  print(f"\nProcessed run_image, Input image: {image_filename}\nCurrent state: {state}\n"
875
  f"Current Memory: {self.agent.memory.buffer}")
876
  return state, state, f'{txt} {image_filename} '
@@ -878,9 +979,11 @@ class ConversationBot:
878
 
879
  if __name__ == '__main__':
880
  parser = argparse.ArgumentParser()
881
- parser.add_argument('--load', type=str, default="ImageCaptioning_cuda:0,Text2Image_cuda:0")
 
882
  args = parser.parse_args()
883
- load_dict = {e.split('_')[0].strip(): e.split('_')[1].strip() for e in args.load.split(',')}
 
884
  bot = ConversationBot(load_dict=load_dict)
885
  with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo:
886
  chatbot = gr.Chatbot(elem_id="chatbot", label="Visual ChatGPT")
 
21
  from langchain.agents.initialize import initialize_agent
22
  from langchain.agents.tools import Tool
23
  from langchain.chains.conversation.memory import ConversationBufferMemory
24
+ from langchain.llms.openai import OpenAI
25
 
26
  VISUAL_CHATGPT_PREFIX = """Visual ChatGPT is designed to be able to assist with a wide range of text and visual related tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. Visual ChatGPT is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.
27
 
 
123
  def __init__(self, device):
124
  print(f"Initializing MaskFormer to {device}")
125
  self.device = device
126
+ self.processor = CLIPSegProcessor.from_pretrained(
127
+ "CIDAS/clipseg-rd64-refined")
128
+ self.model = CLIPSegForImageSegmentation.from_pretrained(
129
+ "CIDAS/clipseg-rd64-refined").to(device)
130
 
131
  def inference(self, image_path, text):
132
  threshold = 0.5
 
134
  padding = 20
135
  original_image = Image.open(image_path)
136
  image = original_image.resize((512, 512))
137
+ inputs = self.processor(
138
+ text=text, images=image, padding="max_length", return_tensors="pt").to(self.device)
139
  with torch.no_grad():
140
  outputs = self.model(**inputs)
141
  mask = torch.sigmoid(outputs[0]).squeeze().cpu().numpy() > threshold
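
  The model output above is turned into a boolean mask by a sigmoid followed by a fixed 0.5 threshold. A standalone illustration of that step, with made-up tensor values (not from the app):

  import torch
  logits = torch.tensor([[-2.0, 0.1], [3.0, -0.5]])
  # sigmoid maps logits to probabilities; > 0.5 keeps the likely-foreground pixels
  mask = (torch.sigmoid(logits) > 0.5).numpy()
  # -> [[False, True], [True, False]]
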
 
145
  true_indices = np.argwhere(mask)
146
  mask_array = np.zeros_like(mask, dtype=bool)
147
  for idx in true_indices:
148
+ padded_slice = tuple(
149
+ slice(max(0, i - padding), i + padding + 1) for i in idx)
150
  mask_array[padded_slice] = True
151
  visual_mask = (mask_array * 255).astype(np.uint8)
152
  image_mask = Image.fromarray(visual_mask)
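
  The loop above dilates the raw mask by padding pixels around every positive location before it is converted to an image. A self-contained sketch of the same slicing trick on a toy mask (values chosen only for illustration):

  import numpy as np
  mask = np.zeros((8, 8), dtype=bool)
  mask[4, 4] = True
  padding = 2
  padded = np.zeros_like(mask)
  for idx in np.argwhere(mask):
      # clamp at 0 on the low side, extend padding pixels on each side
      sl = tuple(slice(max(0, i - padding), i + padding + 1) for i in idx)
      padded[sl] = True  # rows 2..6, cols 2..6 become True (a 5x5 block)
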
 
169
  "The input to this tool should be a comma separated string of two, "
170
  "representing the image_path and the object need to be removed. ")
171
  def inference_remove(self, inputs):
172
+ image_path, to_be_removed_txt = inputs.split(
173
+ ",")[0], ','.join(inputs.split(',')[1:])
174
  return self.inference_replace(f"{image_path},{to_be_removed_txt},background")
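
  Most tools in this file parse their single string argument the same way: the part before the first comma is the image path, and everything after it (commas included) is the free-text part. A small sketch of that convention, using a hypothetical helper name that is not part of app.py:

  def split_tool_input(inputs):
      # keep everything after the first comma intact, commas and all
      image_path = inputs.split(",")[0]
      text = ','.join(inputs.split(',')[1:])
      return image_path, text

  # split_tool_input("image/cat.png,a red hat, cartoon style")
  # -> ('image/cat.png', 'a red hat, cartoon style')
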
175
 
176
  @prompts(name="Replace Something From The Photo",
 
185
  mask_image = self.mask_former.inference(image_path, to_be_replaced_txt)
186
  updated_image = self.inpaint(prompt=replace_with_txt, image=original_image.resize((512, 512)),
187
  mask_image=mask_image.resize((512, 512))).images[0]
188
+ updated_image_path = get_new_image_name(
189
+ image_path, func_name="replace-something")
190
  updated_image = updated_image.resize(original_size)
191
  updated_image.save(updated_image_path)
192
  print(
 
203
  self.pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained("timbrooks/instruct-pix2pix",
204
  safety_checker=None,
205
  torch_dtype=self.torch_dtype).to(device)
206
+ self.pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(
207
+ self.pipe.scheduler.config)
208
 
209
  @prompts(name="Instruct Image Using Text",
210
  description="useful when you want to the style of the image to be like the text. "
 
214
  def inference(self, inputs):
215
  """Change style of image."""
216
  print("===>Starting InstructPix2Pix Inference")
217
+ image_path, text = inputs.split(
218
+ ",")[0], ','.join(inputs.split(',')[1:])
219
  original_image = Image.open(image_path)
220
+ image = self.pipe(text, image=original_image,
221
+ num_inference_steps=40, image_guidance_scale=1.2).images[0]
222
+ updated_image_path = get_new_image_name(
223
+ image_path, func_name="pix2pix")
224
  image.save(updated_image_path)
225
  print(f"\nProcessed InstructPix2Pix, Input Image: {image_path}, Instruct Text: {text}, "
226
  f"Output Image: {updated_image_path}")
 
258
  print(f"Initializing ImageCaptioning to {device}")
259
  self.device = device
260
  self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
261
+ self.processor = BlipProcessor.from_pretrained(
262
+ "Salesforce/blip-image-captioning-base")
263
  self.model = BlipForConditionalGeneration.from_pretrained(
264
  "Salesforce/blip-image-captioning-base", torch_dtype=self.torch_dtype).to(self.device)
265
 
 
267
  description="useful when you want to know what is inside the photo. receives image_path as input. "
268
  "The input to this tool should be a string, representing the image_path. ")
269
  def inference(self, image_path):
270
+ inputs = self.processor(Image.open(image_path), return_tensors="pt").to(
271
+ self.device, self.torch_dtype)
272
  out = self.model.generate(**inputs)
273
  captions = self.processor.decode(out[0], skip_special_tokens=True)
274
+ print(
275
+ f"\nProcessed ImageCaptioning, Input Image: {image_path}, Output Text: {captions}")
276
  return captions
277
 
278
 
 
296
  canny = Image.fromarray(canny)
297
  updated_image_path = get_new_image_name(inputs, func_name="edge")
298
  canny.save(updated_image_path)
299
+ print(
300
+ f"\nProcessed Image2Canny, Input Image: {inputs}, Output Text: {updated_image_path}")
301
  return updated_image_path
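
  get_new_image_name is defined elsewhere in app.py and is not touched by this commit. It is assumed to build a new .png path next to the original that records the operation name plus a short random id, roughly like this sketch:

  import os
  import uuid

  def get_new_image_name_sketch(org_img_name, func_name="update"):
      # assumed behaviour only; the real helper lives outside this diff
      head, tail = os.path.split(org_img_name)
      short_id = str(uuid.uuid4())[:4]
      base = tail.split('.')[0]
      return os.path.join(head, f"{short_id}_{func_name}_{base}.png")

  # get_new_image_name_sketch("image/cat.png", func_name="edge")
  # -> e.g. "image/1a2b_edge_cat.png"
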
302
 
303
 
 
310
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
311
  "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
312
  torch_dtype=self.torch_dtype)
313
+ self.pipe.scheduler = UniPCMultistepScheduler.from_config(
314
+ self.pipe.scheduler.config)
315
  self.pipe.to(device)
316
  self.seed = -1
317
  self.a_prompt = 'best quality, extremely detailed'
318
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
319
+ 'fewer digits, cropped, worst quality, low quality'
320
 
321
  @prompts(name="Generate Image Condition On Canny Image",
322
  description="useful when you want to generate a new real image from both the user description and a canny image."
 
325
  "The input to this tool should be a comma separated string of two, "
326
  "representing the image_path and the user description. ")
327
  def inference(self, inputs):
328
+ image_path, instruct_text = inputs.split(
329
+ ",")[0], ','.join(inputs.split(',')[1:])
330
  image = Image.open(image_path)
331
  self.seed = random.randint(0, 65535)
332
  seed_everything(self.seed)
333
  prompt = f'{instruct_text}, {self.a_prompt}'
334
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
335
  guidance_scale=9.0).images[0]
336
+ updated_image_path = get_new_image_name(
337
+ image_path, func_name="canny2image")
338
  image.save(updated_image_path)
339
  print(f"\nProcessed CannyText2Image, Input Canny: {image_path}, Input Text: {instruct_text}, "
340
  f"Output Text: {updated_image_path}")
 
356
  mlsd = self.detector(image)
357
  updated_image_path = get_new_image_name(inputs, func_name="line-of")
358
  mlsd.save(updated_image_path)
359
+ print(
360
+ f"\nProcessed Image2Line, Input Image: {inputs}, Output Line: {updated_image_path}")
361
  return updated_image_path
362
 
363
 
 
371
  "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
372
  torch_dtype=self.torch_dtype
373
  )
374
+ self.pipe.scheduler = UniPCMultistepScheduler.from_config(
375
+ self.pipe.scheduler.config)
376
  self.pipe.to(device)
377
  self.seed = -1
378
  self.a_prompt = 'best quality, extremely detailed'
379
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
380
+ 'fewer digits, cropped, worst quality, low quality'
381
 
382
  @prompts(name="Generate Image Condition On Line Image",
383
  description="useful when you want to generate a new real image from both the user description "
 
387
  "The input to this tool should be a comma separated string of two, "
388
  "representing the image_path and the user description. ")
389
  def inference(self, inputs):
390
+ image_path, instruct_text = inputs.split(
391
+ ",")[0], ','.join(inputs.split(',')[1:])
392
  image = Image.open(image_path)
393
  self.seed = random.randint(0, 65535)
394
  seed_everything(self.seed)
395
  prompt = f'{instruct_text}, {self.a_prompt}'
396
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
397
  guidance_scale=9.0).images[0]
398
+ updated_image_path = get_new_image_name(
399
+ image_path, func_name="line2image")
400
  image.save(updated_image_path)
401
  print(f"\nProcessed LineText2Image, Input Line: {image_path}, Input Text: {instruct_text}, "
402
  f"Output Text: {updated_image_path}")
 
416
  def inference(self, inputs):
417
  image = Image.open(inputs)
418
  hed = self.detector(image)
419
+ updated_image_path = get_new_image_name(
420
+ inputs, func_name="hed-boundary")
421
  hed.save(updated_image_path)
422
+ print(
423
+ f"\nProcessed Image2Hed, Input Image: {inputs}, Output Hed: {updated_image_path}")
424
  return updated_image_path
425
 
426
 
 
434
  "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
435
  torch_dtype=self.torch_dtype
436
  )
437
+ self.pipe.scheduler = UniPCMultistepScheduler.from_config(
438
+ self.pipe.scheduler.config)
439
  self.pipe.to(device)
440
  self.seed = -1
441
  self.a_prompt = 'best quality, extremely detailed'
442
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
443
+ 'fewer digits, cropped, worst quality, low quality'
444
 
445
  @prompts(name="Generate Image Condition On Soft Hed Boundary Image",
446
  description="useful when you want to generate a new real image from both the user description "
 
450
  "The input to this tool should be a comma separated string of two, "
451
  "representing the image_path and the user description")
452
  def inference(self, inputs):
453
+ image_path, instruct_text = inputs.split(
454
+ ",")[0], ','.join(inputs.split(',')[1:])
455
  image = Image.open(image_path)
456
  self.seed = random.randint(0, 65535)
457
  seed_everything(self.seed)
458
  prompt = f'{instruct_text}, {self.a_prompt}'
459
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
460
  guidance_scale=9.0).images[0]
461
+ updated_image_path = get_new_image_name(
462
+ image_path, func_name="hed2image")
463
  image.save(updated_image_path)
464
  print(f"\nProcessed HedText2Image, Input Hed: {image_path}, Input Text: {instruct_text}, "
465
  f"Output Image: {updated_image_path}")
 
481
  scribble = self.detector(image, scribble=True)
482
  updated_image_path = get_new_image_name(inputs, func_name="scribble")
483
  scribble.save(updated_image_path)
484
+ print(
485
+ f"\nProcessed Image2Scribble, Input Image: {inputs}, Output Scribble: {updated_image_path}")
486
  return updated_image_path
487
 
488
 
 
496
  "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
497
  torch_dtype=self.torch_dtype
498
  )
499
+ self.pipe.scheduler = UniPCMultistepScheduler.from_config(
500
+ self.pipe.scheduler.config)
501
  self.pipe.to(device)
502
  self.seed = -1
503
  self.a_prompt = 'best quality, extremely detailed'
504
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
505
+ 'fewer digits, cropped, worst quality, low quality'
506
 
507
  @prompts(name="Generate Image Condition On Sketch Image",
508
  description="useful when you want to generate a new real image from both the user description and "
 
510
  "The input to this tool should be a comma separated string of two, "
511
  "representing the image_path and the user description")
512
  def inference(self, inputs):
513
+ image_path, instruct_text = inputs.split(
514
+ ",")[0], ','.join(inputs.split(',')[1:])
515
  image = Image.open(image_path)
516
  self.seed = random.randint(0, 65535)
517
  seed_everything(self.seed)
518
  prompt = f'{instruct_text}, {self.a_prompt}'
519
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
520
  guidance_scale=9.0).images[0]
521
+ updated_image_path = get_new_image_name(
522
+ image_path, func_name="scribble2image")
523
  image.save(updated_image_path)
524
  print(f"\nProcessed ScribbleText2Image, Input Scribble: {image_path}, Input Text: {instruct_text}, "
525
  f"Output Image: {updated_image_path}")
 
529
  class Image2Pose:
530
  def __init__(self, device):
531
  print("Initializing Image2Pose")
532
+ self.detector = OpenposeDetector.from_pretrained(
533
+ 'lllyasviel/ControlNet')
534
 
535
  @prompts(name="Pose Detection On Image",
536
  description="useful when you want to detect the human pose of the image. "
 
541
  pose = self.detector(image)
542
  updated_image_path = get_new_image_name(inputs, func_name="human-pose")
543
  pose.save(updated_image_path)
544
+ print(
545
+ f"\nProcessed Image2Pose, Input Image: {inputs}, Output Pose: {updated_image_path}")
546
  return updated_image_path
547
 
548
 
 
555
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
556
  "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
557
  torch_dtype=self.torch_dtype)
558
+ self.pipe.scheduler = UniPCMultistepScheduler.from_config(
559
+ self.pipe.scheduler.config)
560
  self.pipe.to(device)
561
  self.num_inference_steps = 20
562
  self.seed = -1
563
  self.unconditional_guidance_scale = 9.0
564
  self.a_prompt = 'best quality, extremely detailed'
565
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
566
+ ' fewer digits, cropped, worst quality, low quality'
567
 
568
  @prompts(name="Generate Image Condition On Pose Image",
569
  description="useful when you want to generate a new real image from both the user description "
 
573
  "The input to this tool should be a comma separated string of two, "
574
  "representing the image_path and the user description")
575
  def inference(self, inputs):
576
+ image_path, instruct_text = inputs.split(
577
+ ",")[0], ','.join(inputs.split(',')[1:])
578
  image = Image.open(image_path)
579
  self.seed = random.randint(0, 65535)
580
  seed_everything(self.seed)
581
  prompt = f'{instruct_text}, {self.a_prompt}'
582
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
583
  guidance_scale=9.0).images[0]
584
+ updated_image_path = get_new_image_name(
585
+ image_path, func_name="pose2image")
586
  image.save(updated_image_path)
587
  print(f"\nProcessed PoseText2Image, Input Pose: {image_path}, Input Text: {instruct_text}, "
588
  f"Output Image: {updated_image_path}")
 
592
  class Image2Seg:
593
  def __init__(self, device):
594
  print("Initializing Image2Seg")
595
+ self.image_processor = AutoImageProcessor.from_pretrained(
596
+ "openmmlab/upernet-convnext-small")
597
+ self.image_segmentor = UperNetForSemanticSegmentation.from_pretrained(
598
+ "openmmlab/upernet-convnext-small")
599
  self.ade_palette = [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50],
600
+ [4, 200, 3], [120, 120, 80], [
601
+ 140, 140, 140], [204, 5, 255],
602
+ [230, 230, 230], [4, 250, 7], [
603
+ 224, 5, 255], [235, 255, 7],
604
+ [150, 5, 61], [120, 120, 70], [
605
+ 8, 255, 51], [255, 6, 82],
606
+ [143, 255, 140], [204, 255, 4], [
607
+ 255, 51, 7], [204, 70, 3],
608
+ [0, 102, 200], [61, 230, 250], [
609
+ 255, 6, 51], [11, 102, 255],
610
+ [255, 7, 71], [255, 9, 224], [
611
+ 9, 7, 230], [220, 220, 220],
612
+ [255, 9, 92], [112, 9, 255], [
613
+ 8, 255, 214], [7, 255, 224],
614
+ [255, 184, 6], [10, 255, 71], [
615
+ 255, 41, 10], [7, 255, 255],
616
+ [224, 255, 8], [102, 8, 255], [
617
+ 255, 61, 6], [255, 194, 7],
618
+ [255, 122, 8], [0, 255, 20], [
619
+ 255, 8, 41], [255, 5, 153],
620
+ [6, 51, 255], [235, 12, 255], [
621
+ 160, 150, 20], [0, 163, 255],
622
+ [140, 140, 140], [250, 10, 15], [
623
+ 20, 255, 0], [31, 255, 0],
624
+ [255, 31, 0], [255, 224, 0], [
625
+ 153, 255, 0], [0, 0, 255],
626
+ [255, 71, 0], [0, 235, 255], [
627
+ 0, 173, 255], [31, 0, 255],
628
+ [11, 200, 200], [255, 82, 0], [
629
+ 0, 255, 245], [0, 61, 255],
630
+ [0, 255, 112], [0, 255, 133], [
631
+ 255, 0, 0], [255, 163, 0],
632
+ [255, 102, 0], [194, 255, 0], [
633
+ 0, 143, 255], [51, 255, 0],
634
+ [0, 82, 255], [0, 255, 41], [
635
+ 0, 255, 173], [10, 0, 255],
636
+ [173, 255, 0], [0, 255, 153], [
637
+ 255, 92, 0], [255, 0, 255],
638
+ [255, 0, 245], [255, 0, 102], [
639
+ 255, 173, 0], [255, 0, 20],
640
+ [255, 184, 184], [0, 31, 255], [
641
+ 0, 255, 61], [0, 71, 255],
642
+ [255, 0, 204], [0, 255, 194], [
643
+ 0, 255, 82], [0, 10, 255],
644
+ [0, 112, 255], [51, 0, 255], [
645
+ 0, 194, 255], [0, 122, 255],
646
+ [0, 255, 163], [255, 153, 0], [
647
+ 0, 255, 10], [255, 112, 0],
648
+ [143, 255, 0], [82, 0, 255], [
649
+ 163, 255, 0], [255, 235, 0],
650
+ [8, 184, 170], [133, 0, 255], [
651
+ 0, 255, 92], [184, 0, 255],
652
+ [255, 0, 31], [0, 184, 255], [
653
+ 0, 214, 255], [255, 0, 112],
654
+ [92, 255, 0], [0, 224, 255], [
655
+ 112, 224, 255], [70, 184, 160],
656
+ [163, 0, 255], [153, 0, 255], [
657
+ 71, 255, 0], [255, 0, 163],
658
+ [255, 204, 0], [255, 0, 143], [
659
+ 0, 255, 235], [133, 255, 0],
660
+ [255, 0, 235], [245, 0, 255], [
661
+ 255, 0, 122], [255, 245, 0],
662
+ [10, 190, 212], [214, 255, 0], [
663
+ 0, 204, 255], [20, 0, 255],
664
+ [255, 255, 0], [0, 153, 255], [
665
+ 0, 41, 255], [0, 255, 204],
666
+ [41, 0, 255], [41, 255, 0], [
667
+ 173, 0, 255], [0, 245, 255],
668
+ [71, 0, 255], [122, 0, 255], [
669
+ 0, 255, 184], [0, 92, 255],
670
+ [184, 255, 0], [0, 133, 255], [
671
+ 255, 214, 0], [25, 194, 194],
672
  [102, 255, 0], [92, 0, 255]]
673
 
674
  @prompts(name="Segmentation On Image",
 
678
  "The input to this tool should be a string, representing the image_path")
679
  def inference(self, inputs):
680
  image = Image.open(inputs)
681
+ pixel_values = self.image_processor(
682
+ image, return_tensors="pt").pixel_values
683
  with torch.no_grad():
684
  outputs = self.image_segmentor(pixel_values)
685
+ seg = self.image_processor.post_process_semantic_segmentation(
686
+ outputs, target_sizes=[image.size[::-1]])[0]
687
+ # height, width, 3
688
+ color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8)
689
  palette = np.array(self.ade_palette)
690
  for label, color in enumerate(palette):
691
  color_seg[seg == label, :] = color
692
  color_seg = color_seg.astype(np.uint8)
693
  segmentation = Image.fromarray(color_seg)
694
+ updated_image_path = get_new_image_name(
695
+ inputs, func_name="segmentation")
696
  segmentation.save(updated_image_path)
697
+ print(
698
+ f"\nProcessed Image2Pose, Input Image: {inputs}, Output Pose: {updated_image_path}")
699
  return updated_image_path
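
  The colouring loop above maps each class id in the segmentation map to its RGB entry in ade_palette. A tiny standalone example of that lookup, with a toy 2x2 label map and a three-colour palette:

  import numpy as np
  seg = np.array([[0, 1], [1, 2]])                       # 2x2 map of class ids
  palette = np.array([[120, 120, 120], [180, 120, 120], [6, 230, 230]])
  color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8)
  for label, color in enumerate(palette):
      color_seg[seg == label, :] = color                 # paint every pixel of this class
  # color_seg[0, 0] == [120, 120, 120]; color_seg[1, 1] == [6, 230, 230]
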
700
 
701
 
 
708
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
709
  "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
710
  torch_dtype=self.torch_dtype)
711
+ self.pipe.scheduler = UniPCMultistepScheduler.from_config(
712
+ self.pipe.scheduler.config)
713
  self.pipe.to(device)
714
  self.seed = -1
715
  self.a_prompt = 'best quality, extremely detailed'
716
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
717
+ ' fewer digits, cropped, worst quality, low quality'
718
 
719
  @prompts(name="Generate Image Condition On Segmentations",
720
  description="useful when you want to generate a new real image from both the user description and segmentations. "
 
723
  "The input to this tool should be a comma separated string of two, "
724
  "representing the image_path and the user description")
725
  def inference(self, inputs):
726
+ image_path, instruct_text = inputs.split(
727
+ ",")[0], ','.join(inputs.split(',')[1:])
728
  image = Image.open(image_path)
729
  self.seed = random.randint(0, 65535)
730
  seed_everything(self.seed)
731
  prompt = f'{instruct_text}, {self.a_prompt}'
732
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
733
  guidance_scale=9.0).images[0]
734
+ updated_image_path = get_new_image_name(
735
+ image_path, func_name="segment2image")
736
  image.save(updated_image_path)
737
  print(f"\nProcessed SegText2Image, Input Seg: {image_path}, Input Text: {instruct_text}, "
738
  f"Output Image: {updated_image_path}")
 
757
  depth = Image.fromarray(depth)
758
  updated_image_path = get_new_image_name(inputs, func_name="depth")
759
  depth.save(updated_image_path)
760
+ print(
761
+ f"\nProcessed Image2Depth, Input Image: {inputs}, Output Depth: {updated_image_path}")
762
  return updated_image_path
763
 
764
 
 
771
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
772
  "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
773
  torch_dtype=self.torch_dtype)
774
+ self.pipe.scheduler = UniPCMultistepScheduler.from_config(
775
+ self.pipe.scheduler.config)
776
  self.pipe.to(device)
777
  self.seed = -1
778
  self.a_prompt = 'best quality, extremely detailed'
779
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
780
+ ' fewer digits, cropped, worst quality, low quality'
781
 
782
  @prompts(name="Generate Image Condition On Depth",
783
  description="useful when you want to generate a new real image from both the user description and depth image. "
 
786
  "The input to this tool should be a comma separated string of two, "
787
  "representing the image_path and the user description")
788
  def inference(self, inputs):
789
+ image_path, instruct_text = inputs.split(
790
+ ",")[0], ','.join(inputs.split(',')[1:])
791
  image = Image.open(image_path)
792
  self.seed = random.randint(0, 65535)
793
  seed_everything(self.seed)
794
  prompt = f'{instruct_text}, {self.a_prompt}'
795
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
796
  guidance_scale=9.0).images[0]
797
+ updated_image_path = get_new_image_name(
798
+ image_path, func_name="depth2image")
799
  image.save(updated_image_path)
800
  print(f"\nProcessed DepthText2Image, Input Depth: {image_path}, Input Text: {instruct_text}, "
801
  f"Output Image: {updated_image_path}")
 
805
  class Image2Normal:
806
  def __init__(self, device):
807
  print("Initializing Image2Normal")
808
+ self.depth_estimator = pipeline(
809
+ "depth-estimation", model="Intel/dpt-hybrid-midas")
810
  self.bg_threhold = 0.4
811
 
812
  @prompts(name="Predict Normal Map On Image",
 
833
  image = image.resize(original_size)
834
  updated_image_path = get_new_image_name(inputs, func_name="normal-map")
835
  image.save(updated_image_path)
836
+ print(
837
+ f"\nProcessed Image2Normal, Input Image: {inputs}, Output Depth: {updated_image_path}")
838
  return updated_image_path
839
 
840
 
 
847
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
848
  "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
849
  torch_dtype=self.torch_dtype)
850
+ self.pipe.scheduler = UniPCMultistepScheduler.from_config(
851
+ self.pipe.scheduler.config)
852
  self.pipe.to(device)
853
  self.seed = -1
854
  self.a_prompt = 'best quality, extremely detailed'
855
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
856
+ ' fewer digits, cropped, worst quality, low quality'
857
 
858
  @prompts(name="Generate Image Condition On Normal Map",
859
  description="useful when you want to generate a new real image from both the user description and normal map. "
 
862
  "The input to this tool should be a comma separated string of two, "
863
  "representing the image_path and the user description")
864
  def inference(self, inputs):
865
+ image_path, instruct_text = inputs.split(
866
+ ",")[0], ','.join(inputs.split(',')[1:])
867
  image = Image.open(image_path)
868
  self.seed = random.randint(0, 65535)
869
  seed_everything(self.seed)
870
  prompt = f'{instruct_text}, {self.a_prompt}'
871
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
872
  guidance_scale=9.0).images[0]
873
+ updated_image_path = get_new_image_name(
874
+ image_path, func_name="normal2image")
875
  image.save(updated_image_path)
876
  print(f"\nProcessed NormalText2Image, Input Normal: {image_path}, Input Text: {instruct_text}, "
877
  f"Output Image: {updated_image_path}")
 
883
  print(f"Initializing VisualQuestionAnswering to {device}")
884
  self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
885
  self.device = device
886
+ self.processor = BlipProcessor.from_pretrained(
887
+ "Salesforce/blip-vqa-base")
888
  self.model = BlipForQuestionAnswering.from_pretrained(
889
  "Salesforce/blip-vqa-base", torch_dtype=self.torch_dtype).to(self.device)
890
 
 
893
  "like: what is the background color of the last image, how many cats in this figure, what is in this figure. "
894
  "The input to this tool should be a comma separated string of two, representing the image_path and the question")
895
  def inference(self, inputs):
896
+ image_path, question = inputs.split(
897
+ ",")[0], ','.join(inputs.split(',')[1:])
898
  raw_image = Image.open(image_path).convert('RGB')
899
+ inputs = self.processor(raw_image, question, return_tensors="pt").to(
900
+ self.device, self.torch_dtype)
901
  out = self.model.generate(**inputs)
902
  answer = self.processor.decode(out[0], skip_special_tokens=True)
903
  print(f"\nProcessed VisualQuestionAnswering, Input Image: {image_path}, Input Question: {question}, "
 
910
  # load_dict = {'VisualQuestionAnswering':'cuda:0', 'ImageCaptioning':'cuda:1',...}
911
  print(f"Initializing VisualChatGPT, load_dict={load_dict}")
912
  if 'ImageCaptioning' not in load_dict:
913
+ raise ValueError(
914
+ "You have to load ImageCaptioning as a basic function for VisualChatGPT")
915
 
916
  self.llm = OpenAIChat(temperature=0)
917
+ self.memory = ConversationBufferMemory(
918
+ memory_key="chat_history", output_key='output')
919
 
920
  self.models = {}
921
  for class_name, device in load_dict.items():
 
926
  for e in dir(instance):
927
  if e.startswith('inference'):
928
  func = getattr(instance, e)
929
+ self.tools.append(
930
+ Tool(name=func.name, description=func.description, func=func))
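
  The Tool registration above reads func.name and func.description, which the @prompts decorator used throughout this file is assumed to attach to each inference method. The decorator itself is defined earlier in app.py and is not part of this diff; its assumed shape:

  def prompts(name, description):
      # assumed sketch: tag the wrapped method with the metadata Tool() reads
      def decorator(func):
          func.name = name
          func.description = description
          return func
      return decorator
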
931
 
932
  self.agent = initialize_agent(
933
  self.tools,
 
940
  'suffix': VISUAL_CHATGPT_SUFFIX}, )
941
 
942
  def run_text(self, text, state):
943
+ self.agent.memory.buffer = cut_dialogue_history(
944
+ self.agent.memory.buffer, keep_last_n_words=500)
945
  res = self.agent({"input": text})
946
  res['output'] = res['output'].replace("\\", "/")
947
+ response = re.sub(
948
+ '(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
949
  state = state + [(text, response)]
950
  print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n"
951
  f"Current Memory: {self.agent.memory.buffer}")
 
963
  img = img.resize((width_new, height_new))
964
  img = img.convert('RGB')
965
  img.save(image_filename, "PNG")
966
+ print(
967
+ f"Resize image form {width}x{height} to {width_new}x{height_new}")
968
  description = self.models['ImageCaptioning'].inference(image_filename)
969
  Human_prompt = f'\nHuman: provide a figure named {image_filename}. The description is: {description}. This information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
970
  AI_prompt = "Received. "
971
+ self.agent.memory.buffer = self.agent.memory.buffer + \
972
+ Human_prompt + 'AI: ' + AI_prompt
973
+ state = state + \
974
+ [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)]
975
  print(f"\nProcessed run_image, Input image: {image_filename}\nCurrent state: {state}\n"
976
  f"Current Memory: {self.agent.memory.buffer}")
977
  return state, state, f'{txt} {image_filename} '
 
979
 
980
  if __name__ == '__main__':
981
  parser = argparse.ArgumentParser()
982
+ parser.add_argument('--load', type=str,
983
+ default="ImageCaptioning_cuda:0,Text2Image_cuda:0")
984
  args = parser.parse_args()
985
+ load_dict = {e.split('_')[0].strip(): e.split(
986
+ '_')[1].strip() for e in args.load.split(',')}
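
  For reference, the default --load value parses into a plain dict mapping each tool class to its device; a worked example of the comprehension above, using only the default string:

  load = "ImageCaptioning_cuda:0,Text2Image_cuda:0"   # default value of --load
  parsed = {e.split('_')[0].strip(): e.split('_')[1].strip() for e in load.split(',')}
  # parsed == {'ImageCaptioning': 'cuda:0', 'Text2Image': 'cuda:0'}
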
987
  bot = ConversationBot(load_dict=load_dict)
988
  with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo:
989
  chatbot = gr.Chatbot(elem_id="chatbot", label="Visual ChatGPT")