chasetank committed on
Commit
082f846
1 Parent(s): 8224e2b

fix: model test

Files changed (1)
  1. app.py +215 -112
app.py CHANGED
@@ -21,7 +21,7 @@ from controlnet_aux import OpenposeDetector, MLSDdetector, HEDdetector
21
  from langchain.agents.initialize import initialize_agent
22
  from langchain.agents.tools import Tool
23
  from langchain.chains.conversation.memory import ConversationBufferMemory
24
- from langchain.llms import OpenAIChat
25
 
26
  VISUAL_CHATGPT_PREFIX = """Visual ChatGPT is designed to be able to assist with a wide range of text and visual related tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. Visual ChatGPT is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.
27
 
@@ -123,8 +123,10 @@ class MaskFormer:
123
  def __init__(self, device):
124
  print(f"Initializing MaskFormer to {device}")
125
  self.device = device
126
- self.processor = CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
127
- self.model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined").to(device)
 
 
128
 
129
  def inference(self, image_path, text):
130
  threshold = 0.5
@@ -132,7 +134,8 @@ class MaskFormer:
132
  padding = 20
133
  original_image = Image.open(image_path)
134
  image = original_image.resize((512, 512))
135
- inputs = self.processor(text=text, images=image, padding="max_length", return_tensors="pt").to(self.device)
 
136
  with torch.no_grad():
137
  outputs = self.model(**inputs)
138
  mask = torch.sigmoid(outputs[0]).squeeze().cpu().numpy() > threshold
@@ -142,7 +145,8 @@ class MaskFormer:
142
  true_indices = np.argwhere(mask)
143
  mask_array = np.zeros_like(mask, dtype=bool)
144
  for idx in true_indices:
145
- padded_slice = tuple(slice(max(0, i - padding), i + padding + 1) for i in idx)
 
146
  mask_array[padded_slice] = True
147
  visual_mask = (mask_array * 255).astype(np.uint8)
148
  image_mask = Image.fromarray(visual_mask)
@@ -165,7 +169,8 @@ class ImageEditing:
165
  "The input to this tool should be a comma separated string of two, "
166
  "representing the image_path and the object need to be removed. ")
167
  def inference_remove(self, inputs):
168
- image_path, to_be_removed_txt = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
 
169
  return self.inference_replace(f"{image_path},{to_be_removed_txt},background")
170
 
171
  @prompts(name="Replace Something From The Photo",
@@ -180,7 +185,8 @@ class ImageEditing:
180
  mask_image = self.mask_former.inference(image_path, to_be_replaced_txt)
181
  updated_image = self.inpaint(prompt=replace_with_txt, image=original_image.resize((512, 512)),
182
  mask_image=mask_image.resize((512, 512))).images[0]
183
- updated_image_path = get_new_image_name(image_path, func_name="replace-something")
 
184
  updated_image = updated_image.resize(original_size)
185
  updated_image.save(updated_image_path)
186
  print(
@@ -197,7 +203,8 @@ class InstructPix2Pix:
197
  self.pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained("timbrooks/instruct-pix2pix",
198
  safety_checker=None,
199
  torch_dtype=self.torch_dtype).to(device)
200
- self.pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(self.pipe.scheduler.config)
 
201
 
202
  @prompts(name="Instruct Image Using Text",
203
  description="useful when you want to the style of the image to be like the text. "
@@ -207,10 +214,13 @@ class InstructPix2Pix:
207
  def inference(self, inputs):
208
  """Change style of image."""
209
  print("===>Starting InstructPix2Pix Inference")
210
- image_path, text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
 
211
  original_image = Image.open(image_path)
212
- image = self.pipe(text, image=original_image, num_inference_steps=40, image_guidance_scale=1.2).images[0]
213
- updated_image_path = get_new_image_name(image_path, func_name="pix2pix")
 
 
214
  image.save(updated_image_path)
215
  print(f"\nProcessed InstructPix2Pix, Input Image: {image_path}, Instruct Text: {text}, "
216
  f"Output Image: {updated_image_path}")
@@ -248,7 +258,8 @@ class ImageCaptioning:
248
  print(f"Initializing ImageCaptioning to {device}")
249
  self.device = device
250
  self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
251
- self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 
252
  self.model = BlipForConditionalGeneration.from_pretrained(
253
  "Salesforce/blip-image-captioning-base", torch_dtype=self.torch_dtype).to(self.device)
254
 
@@ -256,10 +267,12 @@ class ImageCaptioning:
256
  description="useful when you want to know what is inside the photo. receives image_path as input. "
257
  "The input to this tool should be a string, representing the image_path. ")
258
  def inference(self, image_path):
259
- inputs = self.processor(Image.open(image_path), return_tensors="pt").to(self.device, self.torch_dtype)
 
260
  out = self.model.generate(**inputs)
261
  captions = self.processor.decode(out[0], skip_special_tokens=True)
262
- print(f"\nProcessed ImageCaptioning, Input Image: {image_path}, Output Text: {captions}")
 
263
  return captions
264
 
265
 
@@ -283,7 +296,8 @@ class Image2Canny:
283
  canny = Image.fromarray(canny)
284
  updated_image_path = get_new_image_name(inputs, func_name="edge")
285
  canny.save(updated_image_path)
286
- print(f"\nProcessed Image2Canny, Input Image: {inputs}, Output Text: {updated_image_path}")
 
287
  return updated_image_path
288
 
289
 
@@ -296,12 +310,13 @@ class CannyText2Image:
296
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
297
  "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
298
  torch_dtype=self.torch_dtype)
299
- self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
 
300
  self.pipe.to(device)
301
  self.seed = -1
302
  self.a_prompt = 'best quality, extremely detailed'
303
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
304
- 'fewer digits, cropped, worst quality, low quality'
305
 
306
  @prompts(name="Generate Image Condition On Canny Image",
307
  description="useful when you want to generate a new real image from both the user description and a canny image."
@@ -310,14 +325,16 @@ class CannyText2Image:
310
  "The input to this tool should be a comma separated string of two, "
311
  "representing the image_path and the user description. ")
312
  def inference(self, inputs):
313
- image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
 
314
  image = Image.open(image_path)
315
  self.seed = random.randint(0, 65535)
316
  seed_everything(self.seed)
317
  prompt = f'{instruct_text}, {self.a_prompt}'
318
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
319
  guidance_scale=9.0).images[0]
320
- updated_image_path = get_new_image_name(image_path, func_name="canny2image")
 
321
  image.save(updated_image_path)
322
  print(f"\nProcessed CannyText2Image, Input Canny: {image_path}, Input Text: {instruct_text}, "
323
  f"Output Text: {updated_image_path}")
@@ -339,7 +356,8 @@ class Image2Line:
339
  mlsd = self.detector(image)
340
  updated_image_path = get_new_image_name(inputs, func_name="line-of")
341
  mlsd.save(updated_image_path)
342
- print(f"\nProcessed Image2Line, Input Image: {inputs}, Output Line: {updated_image_path}")
 
343
  return updated_image_path
344
 
345
 
@@ -353,12 +371,13 @@ class LineText2Image:
353
  "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
354
  torch_dtype=self.torch_dtype
355
  )
356
- self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
 
357
  self.pipe.to(device)
358
  self.seed = -1
359
  self.a_prompt = 'best quality, extremely detailed'
360
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
361
- 'fewer digits, cropped, worst quality, low quality'
362
 
363
  @prompts(name="Generate Image Condition On Line Image",
364
  description="useful when you want to generate a new real image from both the user description "
@@ -368,14 +387,16 @@ class LineText2Image:
368
  "The input to this tool should be a comma separated string of two, "
369
  "representing the image_path and the user description. ")
370
  def inference(self, inputs):
371
- image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
 
372
  image = Image.open(image_path)
373
  self.seed = random.randint(0, 65535)
374
  seed_everything(self.seed)
375
  prompt = f'{instruct_text}, {self.a_prompt}'
376
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
377
  guidance_scale=9.0).images[0]
378
- updated_image_path = get_new_image_name(image_path, func_name="line2image")
 
379
  image.save(updated_image_path)
380
  print(f"\nProcessed LineText2Image, Input Line: {image_path}, Input Text: {instruct_text}, "
381
  f"Output Text: {updated_image_path}")
@@ -395,9 +416,11 @@ class Image2Hed:
395
  def inference(self, inputs):
396
  image = Image.open(inputs)
397
  hed = self.detector(image)
398
- updated_image_path = get_new_image_name(inputs, func_name="hed-boundary")
 
399
  hed.save(updated_image_path)
400
- print(f"\nProcessed Image2Hed, Input Image: {inputs}, Output Hed: {updated_image_path}")
 
401
  return updated_image_path
402
 
403
 
@@ -411,12 +434,13 @@ class HedText2Image:
411
  "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
412
  torch_dtype=self.torch_dtype
413
  )
414
- self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
 
415
  self.pipe.to(device)
416
  self.seed = -1
417
  self.a_prompt = 'best quality, extremely detailed'
418
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
419
- 'fewer digits, cropped, worst quality, low quality'
420
 
421
  @prompts(name="Generate Image Condition On Soft Hed Boundary Image",
422
  description="useful when you want to generate a new real image from both the user description "
@@ -426,14 +450,16 @@ class HedText2Image:
426
  "The input to this tool should be a comma separated string of two, "
427
  "representing the image_path and the user description")
428
  def inference(self, inputs):
429
- image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
 
430
  image = Image.open(image_path)
431
  self.seed = random.randint(0, 65535)
432
  seed_everything(self.seed)
433
  prompt = f'{instruct_text}, {self.a_prompt}'
434
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
435
  guidance_scale=9.0).images[0]
436
- updated_image_path = get_new_image_name(image_path, func_name="hed2image")
 
437
  image.save(updated_image_path)
438
  print(f"\nProcessed HedText2Image, Input Hed: {image_path}, Input Text: {instruct_text}, "
439
  f"Output Image: {updated_image_path}")
@@ -455,7 +481,8 @@ class Image2Scribble:
455
  scribble = self.detector(image, scribble=True)
456
  updated_image_path = get_new_image_name(inputs, func_name="scribble")
457
  scribble.save(updated_image_path)
458
- print(f"\nProcessed Image2Scribble, Input Image: {inputs}, Output Scribble: {updated_image_path}")
 
459
  return updated_image_path
460
 
461
 
@@ -469,12 +496,13 @@ class ScribbleText2Image:
469
  "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
470
  torch_dtype=self.torch_dtype
471
  )
472
- self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
 
473
  self.pipe.to(device)
474
  self.seed = -1
475
  self.a_prompt = 'best quality, extremely detailed'
476
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
477
- 'fewer digits, cropped, worst quality, low quality'
478
 
479
  @prompts(name="Generate Image Condition On Sketch Image",
480
  description="useful when you want to generate a new real image from both the user description and "
@@ -482,14 +510,16 @@ class ScribbleText2Image:
482
  "The input to this tool should be a comma separated string of two, "
483
  "representing the image_path and the user description")
484
  def inference(self, inputs):
485
- image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
 
486
  image = Image.open(image_path)
487
  self.seed = random.randint(0, 65535)
488
  seed_everything(self.seed)
489
  prompt = f'{instruct_text}, {self.a_prompt}'
490
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
491
  guidance_scale=9.0).images[0]
492
- updated_image_path = get_new_image_name(image_path, func_name="scribble2image")
 
493
  image.save(updated_image_path)
494
  print(f"\nProcessed ScribbleText2Image, Input Scribble: {image_path}, Input Text: {instruct_text}, "
495
  f"Output Image: {updated_image_path}")
@@ -499,7 +529,8 @@ class ScribbleText2Image:
499
  class Image2Pose:
500
  def __init__(self, device):
501
  print("Initializing Image2Pose")
502
- self.detector = OpenposeDetector.from_pretrained('lllyasviel/ControlNet')
 
503
 
504
  @prompts(name="Pose Detection On Image",
505
  description="useful when you want to detect the human pose of the image. "
@@ -510,7 +541,8 @@ class Image2Pose:
510
  pose = self.detector(image)
511
  updated_image_path = get_new_image_name(inputs, func_name="human-pose")
512
  pose.save(updated_image_path)
513
- print(f"\nProcessed Image2Pose, Input Image: {inputs}, Output Pose: {updated_image_path}")
 
514
  return updated_image_path
515
 
516
 
@@ -523,14 +555,15 @@ class PoseText2Image:
523
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
524
  "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
525
  torch_dtype=self.torch_dtype)
526
- self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
 
527
  self.pipe.to(device)
528
  self.num_inference_steps = 20
529
  self.seed = -1
530
  self.unconditional_guidance_scale = 9.0
531
  self.a_prompt = 'best quality, extremely detailed'
532
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
533
- ' fewer digits, cropped, worst quality, low quality'
534
 
535
  @prompts(name="Generate Image Condition On Pose Image",
536
  description="useful when you want to generate a new real image from both the user description "
@@ -540,14 +573,16 @@ class PoseText2Image:
540
  "The input to this tool should be a comma separated string of two, "
541
  "representing the image_path and the user description")
542
  def inference(self, inputs):
543
- image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
 
544
  image = Image.open(image_path)
545
  self.seed = random.randint(0, 65535)
546
  seed_everything(self.seed)
547
  prompt = f'{instruct_text}, {self.a_prompt}'
548
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
549
  guidance_scale=9.0).images[0]
550
- updated_image_path = get_new_image_name(image_path, func_name="pose2image")
 
551
  image.save(updated_image_path)
552
  print(f"\nProcessed PoseText2Image, Input Pose: {image_path}, Input Text: {instruct_text}, "
553
  f"Output Image: {updated_image_path}")
@@ -557,45 +592,83 @@ class PoseText2Image:
557
  class Image2Seg:
558
  def __init__(self, device):
559
  print("Initializing Image2Seg")
560
- self.image_processor = AutoImageProcessor.from_pretrained("openmmlab/upernet-convnext-small")
561
- self.image_segmentor = UperNetForSemanticSegmentation.from_pretrained("openmmlab/upernet-convnext-small")
 
 
562
  self.ade_palette = [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50],
563
- [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255],
564
- [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7],
565
- [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82],
566
- [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3],
567
- [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255],
568
- [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220],
569
- [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224],
570
- [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255],
571
- [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7],
572
- [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153],
573
- [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255],
574
- [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0],
575
- [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255],
576
- [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255],
577
- [11, 200, 200], [255, 82, 0], [0, 255, 245], [0, 61, 255],
578
- [0, 255, 112], [0, 255, 133], [255, 0, 0], [255, 163, 0],
579
- [255, 102, 0], [194, 255, 0], [0, 143, 255], [51, 255, 0],
580
- [0, 82, 255], [0, 255, 41], [0, 255, 173], [10, 0, 255],
581
- [173, 255, 0], [0, 255, 153], [255, 92, 0], [255, 0, 255],
582
- [255, 0, 245], [255, 0, 102], [255, 173, 0], [255, 0, 20],
583
- [255, 184, 184], [0, 31, 255], [0, 255, 61], [0, 71, 255],
584
- [255, 0, 204], [0, 255, 194], [0, 255, 82], [0, 10, 255],
585
- [0, 112, 255], [51, 0, 255], [0, 194, 255], [0, 122, 255],
586
- [0, 255, 163], [255, 153, 0], [0, 255, 10], [255, 112, 0],
587
- [143, 255, 0], [82, 0, 255], [163, 255, 0], [255, 235, 0],
588
- [8, 184, 170], [133, 0, 255], [0, 255, 92], [184, 0, 255],
589
- [255, 0, 31], [0, 184, 255], [0, 214, 255], [255, 0, 112],
590
- [92, 255, 0], [0, 224, 255], [112, 224, 255], [70, 184, 160],
591
- [163, 0, 255], [153, 0, 255], [71, 255, 0], [255, 0, 163],
592
- [255, 204, 0], [255, 0, 143], [0, 255, 235], [133, 255, 0],
593
- [255, 0, 235], [245, 0, 255], [255, 0, 122], [255, 245, 0],
594
- [10, 190, 212], [214, 255, 0], [0, 204, 255], [20, 0, 255],
595
- [255, 255, 0], [0, 153, 255], [0, 41, 255], [0, 255, 204],
596
- [41, 0, 255], [41, 255, 0], [173, 0, 255], [0, 245, 255],
597
- [71, 0, 255], [122, 0, 255], [0, 255, 184], [0, 92, 255],
598
- [184, 255, 0], [0, 133, 255], [255, 214, 0], [25, 194, 194],
 
599
  [102, 255, 0], [92, 0, 255]]
600
 
601
  @prompts(name="Segmentation On Image",
@@ -605,19 +678,24 @@ class Image2Seg:
605
  "The input to this tool should be a string, representing the image_path")
606
  def inference(self, inputs):
607
  image = Image.open(inputs)
608
- pixel_values = self.image_processor(image, return_tensors="pt").pixel_values
 
609
  with torch.no_grad():
610
  outputs = self.image_segmentor(pixel_values)
611
- seg = self.image_processor.post_process_semantic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]
612
- color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8) # height, width, 3
 
 
613
  palette = np.array(self.ade_palette)
614
  for label, color in enumerate(palette):
615
  color_seg[seg == label, :] = color
616
  color_seg = color_seg.astype(np.uint8)
617
  segmentation = Image.fromarray(color_seg)
618
- updated_image_path = get_new_image_name(inputs, func_name="segmentation")
 
619
  segmentation.save(updated_image_path)
620
- print(f"\nProcessed Image2Pose, Input Image: {inputs}, Output Pose: {updated_image_path}")
 
621
  return updated_image_path
622
 
623
 
@@ -630,12 +708,13 @@ class SegText2Image:
630
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
631
  "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
632
  torch_dtype=self.torch_dtype)
633
- self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
 
634
  self.pipe.to(device)
635
  self.seed = -1
636
  self.a_prompt = 'best quality, extremely detailed'
637
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
638
- ' fewer digits, cropped, worst quality, low quality'
639
 
640
  @prompts(name="Generate Image Condition On Segmentations",
641
  description="useful when you want to generate a new real image from both the user description and segmentations. "
@@ -644,14 +723,16 @@ class SegText2Image:
644
  "The input to this tool should be a comma separated string of two, "
645
  "representing the image_path and the user description")
646
  def inference(self, inputs):
647
- image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
 
648
  image = Image.open(image_path)
649
  self.seed = random.randint(0, 65535)
650
  seed_everything(self.seed)
651
  prompt = f'{instruct_text}, {self.a_prompt}'
652
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
653
  guidance_scale=9.0).images[0]
654
- updated_image_path = get_new_image_name(image_path, func_name="segment2image")
 
655
  image.save(updated_image_path)
656
  print(f"\nProcessed SegText2Image, Input Seg: {image_path}, Input Text: {instruct_text}, "
657
  f"Output Image: {updated_image_path}")
@@ -676,7 +757,8 @@ class Image2Depth:
676
  depth = Image.fromarray(depth)
677
  updated_image_path = get_new_image_name(inputs, func_name="depth")
678
  depth.save(updated_image_path)
679
- print(f"\nProcessed Image2Depth, Input Image: {inputs}, Output Depth: {updated_image_path}")
 
680
  return updated_image_path
681
 
682
 
@@ -689,12 +771,13 @@ class DepthText2Image:
689
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
690
  "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
691
  torch_dtype=self.torch_dtype)
692
- self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
 
693
  self.pipe.to(device)
694
  self.seed = -1
695
  self.a_prompt = 'best quality, extremely detailed'
696
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
697
- ' fewer digits, cropped, worst quality, low quality'
698
 
699
  @prompts(name="Generate Image Condition On Depth",
700
  description="useful when you want to generate a new real image from both the user description and depth image. "
@@ -703,14 +786,16 @@ class DepthText2Image:
703
  "The input to this tool should be a comma separated string of two, "
704
  "representing the image_path and the user description")
705
  def inference(self, inputs):
706
- image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
 
707
  image = Image.open(image_path)
708
  self.seed = random.randint(0, 65535)
709
  seed_everything(self.seed)
710
  prompt = f'{instruct_text}, {self.a_prompt}'
711
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
712
  guidance_scale=9.0).images[0]
713
- updated_image_path = get_new_image_name(image_path, func_name="depth2image")
 
714
  image.save(updated_image_path)
715
  print(f"\nProcessed DepthText2Image, Input Depth: {image_path}, Input Text: {instruct_text}, "
716
  f"Output Image: {updated_image_path}")
@@ -720,7 +805,8 @@ class DepthText2Image:
720
  class Image2Normal:
721
  def __init__(self, device):
722
  print("Initializing Image2Normal")
723
- self.depth_estimator = pipeline("depth-estimation", model="Intel/dpt-hybrid-midas")
 
724
  self.bg_threhold = 0.4
725
 
726
  @prompts(name="Predict Normal Map On Image",
@@ -747,7 +833,8 @@ class Image2Normal:
747
  image = image.resize(original_size)
748
  updated_image_path = get_new_image_name(inputs, func_name="normal-map")
749
  image.save(updated_image_path)
750
- print(f"\nProcessed Image2Normal, Input Image: {inputs}, Output Depth: {updated_image_path}")
 
751
  return updated_image_path
752
 
753
 
@@ -760,12 +847,13 @@ class NormalText2Image:
760
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
761
  "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
762
  torch_dtype=self.torch_dtype)
763
- self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
 
764
  self.pipe.to(device)
765
  self.seed = -1
766
  self.a_prompt = 'best quality, extremely detailed'
767
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
768
- ' fewer digits, cropped, worst quality, low quality'
769
 
770
  @prompts(name="Generate Image Condition On Normal Map",
771
  description="useful when you want to generate a new real image from both the user description and normal map. "
@@ -774,14 +862,16 @@ class NormalText2Image:
774
  "The input to this tool should be a comma separated string of two, "
775
  "representing the image_path and the user description")
776
  def inference(self, inputs):
777
- image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
 
778
  image = Image.open(image_path)
779
  self.seed = random.randint(0, 65535)
780
  seed_everything(self.seed)
781
  prompt = f'{instruct_text}, {self.a_prompt}'
782
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
783
  guidance_scale=9.0).images[0]
784
- updated_image_path = get_new_image_name(image_path, func_name="normal2image")
 
785
  image.save(updated_image_path)
786
  print(f"\nProcessed NormalText2Image, Input Normal: {image_path}, Input Text: {instruct_text}, "
787
  f"Output Image: {updated_image_path}")
@@ -793,7 +883,8 @@ class VisualQuestionAnswering:
793
  print(f"Initializing VisualQuestionAnswering to {device}")
794
  self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
795
  self.device = device
796
- self.processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
 
797
  self.model = BlipForQuestionAnswering.from_pretrained(
798
  "Salesforce/blip-vqa-base", torch_dtype=self.torch_dtype).to(self.device)
799
 
@@ -802,9 +893,11 @@ class VisualQuestionAnswering:
802
  "like: what is the background color of the last image, how many cats in this figure, what is in this figure. "
803
  "The input to this tool should be a comma separated string of two, representing the image_path and the question")
804
  def inference(self, inputs):
805
- image_path, question = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
 
806
  raw_image = Image.open(image_path).convert('RGB')
807
- inputs = self.processor(raw_image, question, return_tensors="pt").to(self.device, self.torch_dtype)
 
808
  out = self.model.generate(**inputs)
809
  answer = self.processor.decode(out[0], skip_special_tokens=True)
810
  print(f"\nProcessed VisualQuestionAnswering, Input Image: {image_path}, Input Question: {question}, "
@@ -817,10 +910,12 @@ class ConversationBot:
817
  # load_dict = {'VisualQuestionAnswering':'cuda:0', 'ImageCaptioning':'cuda:1',...}
818
  print(f"Initializing VisualChatGPT, load_dict={load_dict}")
819
  if 'ImageCaptioning' not in load_dict:
820
- raise ValueError("You have to load ImageCaptioning as a basic function for VisualChatGPT")
 
821
 
822
  self.llm = OpenAIChat(temperature=0)
823
- self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output')
 
824
 
825
  self.models = {}
826
  for class_name, device in load_dict.items():
@@ -831,7 +926,8 @@ class ConversationBot:
831
  for e in dir(instance):
832
  if e.startswith('inference'):
833
  func = getattr(instance, e)
834
- self.tools.append(Tool(name=func.name, description=func.description, func=func))
 
835
 
836
  self.agent = initialize_agent(
837
  self.tools,
@@ -844,10 +940,12 @@ class ConversationBot:
844
  'suffix': VISUAL_CHATGPT_SUFFIX}, )
845
 
846
  def run_text(self, text, state):
847
- self.agent.memory.buffer = cut_dialogue_history(self.agent.memory.buffer, keep_last_n_words=500)
 
848
  res = self.agent({"input": text})
849
  res['output'] = res['output'].replace("\\", "/")
850
- response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
 
851
  state = state + [(text, response)]
852
  print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n"
853
  f"Current Memory: {self.agent.memory.buffer}")
@@ -865,12 +963,15 @@ class ConversationBot:
865
  img = img.resize((width_new, height_new))
866
  img = img.convert('RGB')
867
  img.save(image_filename, "PNG")
868
- print(f"Resize image form {width}x{height} to {width_new}x{height_new}")
 
869
  description = self.models['ImageCaptioning'].inference(image_filename)
870
  Human_prompt = f'\nHuman: provide a figure named {image_filename}. The description is: {description}. This information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
871
  AI_prompt = "Received. "
872
- self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt
873
- state = state + [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)]
 
 
874
  print(f"\nProcessed run_image, Input image: {image_filename}\nCurrent state: {state}\n"
875
  f"Current Memory: {self.agent.memory.buffer}")
876
  return state, state, f'{txt} {image_filename} '
@@ -878,9 +979,11 @@ class ConversationBot:
878
 
879
  if __name__ == '__main__':
880
  parser = argparse.ArgumentParser()
881
- parser.add_argument('--load', type=str, default="ImageCaptioning_cuda:0,Text2Image_cuda:0")
 
882
  args = parser.parse_args()
883
- load_dict = {e.split('_')[0].strip(): e.split('_')[1].strip() for e in args.load.split(',')}
 
884
  bot = ConversationBot(load_dict=load_dict)
885
  with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo:
886
  chatbot = gr.Chatbot(elem_id="chatbot", label="Visual ChatGPT")
 
21
  from langchain.agents.initialize import initialize_agent
22
  from langchain.agents.tools import Tool
23
  from langchain.chains.conversation.memory import ConversationBufferMemory
24
+ from langchain.llms.openai import OpenAI
25
 
26
  VISUAL_CHATGPT_PREFIX = """Visual ChatGPT is designed to be able to assist with a wide range of text and visual related tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. Visual ChatGPT is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.
27
 
 
123
  def __init__(self, device):
124
  print(f"Initializing MaskFormer to {device}")
125
  self.device = device
126
+ self.processor = CLIPSegProcessor.from_pretrained(
127
+ "CIDAS/clipseg-rd64-refined")
128
+ self.model = CLIPSegForImageSegmentation.from_pretrained(
129
+ "CIDAS/clipseg-rd64-refined").to(device)
130
 
131
  def inference(self, image_path, text):
132
  threshold = 0.5
 
134
  padding = 20
135
  original_image = Image.open(image_path)
136
  image = original_image.resize((512, 512))
137
+ inputs = self.processor(
138
+ text=text, images=image, padding="max_length", return_tensors="pt").to(self.device)
139
  with torch.no_grad():
140
  outputs = self.model(**inputs)
141
  mask = torch.sigmoid(outputs[0]).squeeze().cpu().numpy() > threshold
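
  The model output above is turned into a boolean mask by a sigmoid followed by a fixed 0.5 threshold. A standalone illustration of that step, with made-up tensor values (not from the app):

  import torch
  logits = torch.tensor([[-2.0, 0.1], [3.0, -0.5]])
  # sigmoid maps logits to probabilities; > 0.5 keeps the likely-foreground pixels
  mask = (torch.sigmoid(logits) > 0.5).numpy()
  # -> [[False, True], [True, False]]
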
 
145
  true_indices = np.argwhere(mask)
146
  mask_array = np.zeros_like(mask, dtype=bool)
147
  for idx in true_indices:
148
+ padded_slice = tuple(
149
+ slice(max(0, i - padding), i + padding + 1) for i in idx)
150
  mask_array[padded_slice] = True
151
  visual_mask = (mask_array * 255).astype(np.uint8)
152
  image_mask = Image.fromarray(visual_mask)
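
  The loop above dilates the raw mask by padding pixels around every positive location before it is converted to an image. A self-contained sketch of the same slicing trick on a toy mask (values chosen only for illustration):

  import numpy as np
  mask = np.zeros((8, 8), dtype=bool)
  mask[4, 4] = True
  padding = 2
  padded = np.zeros_like(mask)
  for idx in np.argwhere(mask):
      # clamp at 0 on the low side, extend padding pixels on each side
      sl = tuple(slice(max(0, i - padding), i + padding + 1) for i in idx)
      padded[sl] = True  # rows 2..6, cols 2..6 become True (a 5x5 block)
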
 
169
  "The input to this tool should be a comma separated string of two, "
170
  "representing the image_path and the object need to be removed. ")
171
  def inference_remove(self, inputs):
172
+ image_path, to_be_removed_txt = inputs.split(
173
+ ",")[0], ','.join(inputs.split(',')[1:])
174
  return self.inference_replace(f"{image_path},{to_be_removed_txt},background")
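
  Most tools in this file parse their single string argument the same way: the part before the first comma is the image path, and everything after it (commas included) is the free-text part. A small sketch of that convention, using a hypothetical helper name that is not part of app.py:

  def split_tool_input(inputs):
      # keep everything after the first comma intact, commas and all
      image_path = inputs.split(",")[0]
      text = ','.join(inputs.split(',')[1:])
      return image_path, text

  # split_tool_input("image/cat.png,a red hat, cartoon style")
  # -> ('image/cat.png', 'a red hat, cartoon style')
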
175
 
176
  @prompts(name="Replace Something From The Photo",
 
185
  mask_image = self.mask_former.inference(image_path, to_be_replaced_txt)
186
  updated_image = self.inpaint(prompt=replace_with_txt, image=original_image.resize((512, 512)),
187
  mask_image=mask_image.resize((512, 512))).images[0]
188
+ updated_image_path = get_new_image_name(
189
+ image_path, func_name="replace-something")
190
  updated_image = updated_image.resize(original_size)
191
  updated_image.save(updated_image_path)
192
  print(
 
203
  self.pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained("timbrooks/instruct-pix2pix",
204
  safety_checker=None,
205
  torch_dtype=self.torch_dtype).to(device)
206
+ self.pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(
207
+ self.pipe.scheduler.config)
208
 
209
  @prompts(name="Instruct Image Using Text",
210
  description="useful when you want to the style of the image to be like the text. "
 
214
  def inference(self, inputs):
215
  """Change style of image."""
216
  print("===>Starting InstructPix2Pix Inference")
217
+ image_path, text = inputs.split(
218
+ ",")[0], ','.join(inputs.split(',')[1:])
219
  original_image = Image.open(image_path)
220
+ image = self.pipe(text, image=original_image,
221
+ num_inference_steps=40, image_guidance_scale=1.2).images[0]
222
+ updated_image_path = get_new_image_name(
223
+ image_path, func_name="pix2pix")
224
  image.save(updated_image_path)
225
  print(f"\nProcessed InstructPix2Pix, Input Image: {image_path}, Instruct Text: {text}, "
226
  f"Output Image: {updated_image_path}")
 
258
  print(f"Initializing ImageCaptioning to {device}")
259
  self.device = device
260
  self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
261
+ self.processor = BlipProcessor.from_pretrained(
262
+ "Salesforce/blip-image-captioning-base")
263
  self.model = BlipForConditionalGeneration.from_pretrained(
264
  "Salesforce/blip-image-captioning-base", torch_dtype=self.torch_dtype).to(self.device)
265
 
 
267
  description="useful when you want to know what is inside the photo. receives image_path as input. "
268
  "The input to this tool should be a string, representing the image_path. ")
269
  def inference(self, image_path):
270
+ inputs = self.processor(Image.open(image_path), return_tensors="pt").to(
271
+ self.device, self.torch_dtype)
272
  out = self.model.generate(**inputs)
273
  captions = self.processor.decode(out[0], skip_special_tokens=True)
274
+ print(
275
+ f"\nProcessed ImageCaptioning, Input Image: {image_path}, Output Text: {captions}")
276
  return captions
277
 
278
 
 
296
  canny = Image.fromarray(canny)
297
  updated_image_path = get_new_image_name(inputs, func_name="edge")
298
  canny.save(updated_image_path)
299
+ print(
300
+ f"\nProcessed Image2Canny, Input Image: {inputs}, Output Text: {updated_image_path}")
301
  return updated_image_path
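
  get_new_image_name is defined elsewhere in app.py and is not touched by this commit. It is assumed to build a new .png path next to the original that records the operation name plus a short random id, roughly like this sketch:

  import os
  import uuid

  def get_new_image_name_sketch(org_img_name, func_name="update"):
      # assumed behaviour only; the real helper lives outside this diff
      head, tail = os.path.split(org_img_name)
      short_id = str(uuid.uuid4())[:4]
      base = tail.split('.')[0]
      return os.path.join(head, f"{short_id}_{func_name}_{base}.png")

  # get_new_image_name_sketch("image/cat.png", func_name="edge")
  # -> e.g. "image/1a2b_edge_cat.png"
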
302
 
303
 
 
310
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
311
  "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
312
  torch_dtype=self.torch_dtype)
313
+ self.pipe.scheduler = UniPCMultistepScheduler.from_config(
314
+ self.pipe.scheduler.config)
315
  self.pipe.to(device)
316
  self.seed = -1
317
  self.a_prompt = 'best quality, extremely detailed'
318
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
319
+ 'fewer digits, cropped, worst quality, low quality'
320
 
321
  @prompts(name="Generate Image Condition On Canny Image",
322
  description="useful when you want to generate a new real image from both the user description and a canny image."
 
325
  "The input to this tool should be a comma separated string of two, "
326
  "representing the image_path and the user description. ")
327
  def inference(self, inputs):
328
+ image_path, instruct_text = inputs.split(
329
+ ",")[0], ','.join(inputs.split(',')[1:])
330
  image = Image.open(image_path)
331
  self.seed = random.randint(0, 65535)
332
  seed_everything(self.seed)
333
  prompt = f'{instruct_text}, {self.a_prompt}'
334
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
335
  guidance_scale=9.0).images[0]
336
+ updated_image_path = get_new_image_name(
337
+ image_path, func_name="canny2image")
338
  image.save(updated_image_path)
339
  print(f"\nProcessed CannyText2Image, Input Canny: {image_path}, Input Text: {instruct_text}, "
340
  f"Output Text: {updated_image_path}")
 
356
  mlsd = self.detector(image)
357
  updated_image_path = get_new_image_name(inputs, func_name="line-of")
358
  mlsd.save(updated_image_path)
359
+ print(
360
+ f"\nProcessed Image2Line, Input Image: {inputs}, Output Line: {updated_image_path}")
361
  return updated_image_path
362
 
363
 
 
371
  "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
372
  torch_dtype=self.torch_dtype
373
  )
374
+ self.pipe.scheduler = UniPCMultistepScheduler.from_config(
375
+ self.pipe.scheduler.config)
376
  self.pipe.to(device)
377
  self.seed = -1
378
  self.a_prompt = 'best quality, extremely detailed'
379
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
380
+ 'fewer digits, cropped, worst quality, low quality'
381
 
382
  @prompts(name="Generate Image Condition On Line Image",
383
  description="useful when you want to generate a new real image from both the user description "
 
387
  "The input to this tool should be a comma separated string of two, "
388
  "representing the image_path and the user description. ")
389
  def inference(self, inputs):
390
+ image_path, instruct_text = inputs.split(
391
+ ",")[0], ','.join(inputs.split(',')[1:])
392
  image = Image.open(image_path)
393
  self.seed = random.randint(0, 65535)
394
  seed_everything(self.seed)
395
  prompt = f'{instruct_text}, {self.a_prompt}'
396
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
397
  guidance_scale=9.0).images[0]
398
+ updated_image_path = get_new_image_name(
399
+ image_path, func_name="line2image")
400
  image.save(updated_image_path)
401
  print(f"\nProcessed LineText2Image, Input Line: {image_path}, Input Text: {instruct_text}, "
402
  f"Output Text: {updated_image_path}")
 
416
  def inference(self, inputs):
417
  image = Image.open(inputs)
418
  hed = self.detector(image)
419
+ updated_image_path = get_new_image_name(
420
+ inputs, func_name="hed-boundary")
421
  hed.save(updated_image_path)
422
+ print(
423
+ f"\nProcessed Image2Hed, Input Image: {inputs}, Output Hed: {updated_image_path}")
424
  return updated_image_path
425
 
426
 
 
434
  "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
435
  torch_dtype=self.torch_dtype
436
  )
437
+ self.pipe.scheduler = UniPCMultistepScheduler.from_config(
438
+ self.pipe.scheduler.config)
439
  self.pipe.to(device)
440
  self.seed = -1
441
  self.a_prompt = 'best quality, extremely detailed'
442
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
443
+ 'fewer digits, cropped, worst quality, low quality'
444
 
445
  @prompts(name="Generate Image Condition On Soft Hed Boundary Image",
446
  description="useful when you want to generate a new real image from both the user description "
 
450
  "The input to this tool should be a comma separated string of two, "
451
  "representing the image_path and the user description")
452
  def inference(self, inputs):
453
+ image_path, instruct_text = inputs.split(
454
+ ",")[0], ','.join(inputs.split(',')[1:])
455
  image = Image.open(image_path)
456
  self.seed = random.randint(0, 65535)
457
  seed_everything(self.seed)
458
  prompt = f'{instruct_text}, {self.a_prompt}'
459
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
460
  guidance_scale=9.0).images[0]
461
+ updated_image_path = get_new_image_name(
462
+ image_path, func_name="hed2image")
463
  image.save(updated_image_path)
464
  print(f"\nProcessed HedText2Image, Input Hed: {image_path}, Input Text: {instruct_text}, "
465
  f"Output Image: {updated_image_path}")
 
481
  scribble = self.detector(image, scribble=True)
482
  updated_image_path = get_new_image_name(inputs, func_name="scribble")
483
  scribble.save(updated_image_path)
484
+ print(
485
+ f"\nProcessed Image2Scribble, Input Image: {inputs}, Output Scribble: {updated_image_path}")
486
  return updated_image_path
487
 
488
 
 
496
  "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
497
  torch_dtype=self.torch_dtype
498
  )
499
+ self.pipe.scheduler = UniPCMultistepScheduler.from_config(
500
+ self.pipe.scheduler.config)
501
  self.pipe.to(device)
502
  self.seed = -1
503
  self.a_prompt = 'best quality, extremely detailed'
504
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
505
+ 'fewer digits, cropped, worst quality, low quality'
506
 
507
  @prompts(name="Generate Image Condition On Sketch Image",
508
  description="useful when you want to generate a new real image from both the user description and "
 
510
  "The input to this tool should be a comma separated string of two, "
511
  "representing the image_path and the user description")
512
  def inference(self, inputs):
513
+ image_path, instruct_text = inputs.split(
514
+ ",")[0], ','.join(inputs.split(',')[1:])
515
  image = Image.open(image_path)
516
  self.seed = random.randint(0, 65535)
517
  seed_everything(self.seed)
518
  prompt = f'{instruct_text}, {self.a_prompt}'
519
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
520
  guidance_scale=9.0).images[0]
521
+ updated_image_path = get_new_image_name(
522
+ image_path, func_name="scribble2image")
523
  image.save(updated_image_path)
524
  print(f"\nProcessed ScribbleText2Image, Input Scribble: {image_path}, Input Text: {instruct_text}, "
525
  f"Output Image: {updated_image_path}")
 
529
  class Image2Pose:
530
  def __init__(self, device):
531
  print("Initializing Image2Pose")
532
+ self.detector = OpenposeDetector.from_pretrained(
533
+ 'lllyasviel/ControlNet')
534
 
535
  @prompts(name="Pose Detection On Image",
536
  description="useful when you want to detect the human pose of the image. "
 
541
  pose = self.detector(image)
542
  updated_image_path = get_new_image_name(inputs, func_name="human-pose")
543
  pose.save(updated_image_path)
544
+ print(
545
+ f"\nProcessed Image2Pose, Input Image: {inputs}, Output Pose: {updated_image_path}")
546
  return updated_image_path
547
 
548
 
 
555
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
556
  "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
557
  torch_dtype=self.torch_dtype)
558
+ self.pipe.scheduler = UniPCMultistepScheduler.from_config(
559
+ self.pipe.scheduler.config)
560
  self.pipe.to(device)
561
  self.num_inference_steps = 20
562
  self.seed = -1
563
  self.unconditional_guidance_scale = 9.0
564
  self.a_prompt = 'best quality, extremely detailed'
565
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
566
+ ' fewer digits, cropped, worst quality, low quality'
567
 
568
  @prompts(name="Generate Image Condition On Pose Image",
569
  description="useful when you want to generate a new real image from both the user description "
 
573
  "The input to this tool should be a comma separated string of two, "
574
  "representing the image_path and the user description")
575
  def inference(self, inputs):
576
+ image_path, instruct_text = inputs.split(
577
+ ",")[0], ','.join(inputs.split(',')[1:])
578
  image = Image.open(image_path)
579
  self.seed = random.randint(0, 65535)
580
  seed_everything(self.seed)
581
  prompt = f'{instruct_text}, {self.a_prompt}'
582
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
583
  guidance_scale=9.0).images[0]
584
+ updated_image_path = get_new_image_name(
585
+ image_path, func_name="pose2image")
586
  image.save(updated_image_path)
587
  print(f"\nProcessed PoseText2Image, Input Pose: {image_path}, Input Text: {instruct_text}, "
588
  f"Output Image: {updated_image_path}")
 
592
  class Image2Seg:
593
  def __init__(self, device):
594
  print("Initializing Image2Seg")
595
+ self.image_processor = AutoImageProcessor.from_pretrained(
596
+ "openmmlab/upernet-convnext-small")
597
+ self.image_segmentor = UperNetForSemanticSegmentation.from_pretrained(
598
+ "openmmlab/upernet-convnext-small")
599
  self.ade_palette = [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50],
600
+ [4, 200, 3], [120, 120, 80], [
601
+ 140, 140, 140], [204, 5, 255],
602
+ [230, 230, 230], [4, 250, 7], [
603
+ 224, 5, 255], [235, 255, 7],
604
+ [150, 5, 61], [120, 120, 70], [
605
+ 8, 255, 51], [255, 6, 82],
606
+ [143, 255, 140], [204, 255, 4], [
607
+ 255, 51, 7], [204, 70, 3],
608
+ [0, 102, 200], [61, 230, 250], [
609
+ 255, 6, 51], [11, 102, 255],
610
+ [255, 7, 71], [255, 9, 224], [
611
+ 9, 7, 230], [220, 220, 220],
612
+ [255, 9, 92], [112, 9, 255], [
613
+ 8, 255, 214], [7, 255, 224],
614
+ [255, 184, 6], [10, 255, 71], [
615
+ 255, 41, 10], [7, 255, 255],
616
+ [224, 255, 8], [102, 8, 255], [
617
+ 255, 61, 6], [255, 194, 7],
618
+ [255, 122, 8], [0, 255, 20], [
619
+ 255, 8, 41], [255, 5, 153],
620
+ [6, 51, 255], [235, 12, 255], [
621
+ 160, 150, 20], [0, 163, 255],
622
+ [140, 140, 140], [250, 10, 15], [
623
+ 20, 255, 0], [31, 255, 0],
624
+ [255, 31, 0], [255, 224, 0], [
625
+ 153, 255, 0], [0, 0, 255],
626
+ [255, 71, 0], [0, 235, 255], [
627
+ 0, 173, 255], [31, 0, 255],
628
+ [11, 200, 200], [255, 82, 0], [
629
+ 0, 255, 245], [0, 61, 255],
630
+ [0, 255, 112], [0, 255, 133], [
631
+ 255, 0, 0], [255, 163, 0],
632
+ [255, 102, 0], [194, 255, 0], [
633
+ 0, 143, 255], [51, 255, 0],
634
+ [0, 82, 255], [0, 255, 41], [
635
+ 0, 255, 173], [10, 0, 255],
636
+ [173, 255, 0], [0, 255, 153], [
637
+ 255, 92, 0], [255, 0, 255],
638
+ [255, 0, 245], [255, 0, 102], [
639
+ 255, 173, 0], [255, 0, 20],
640
+ [255, 184, 184], [0, 31, 255], [
641
+ 0, 255, 61], [0, 71, 255],
642
+ [255, 0, 204], [0, 255, 194], [
643
+ 0, 255, 82], [0, 10, 255],
644
+ [0, 112, 255], [51, 0, 255], [
645
+ 0, 194, 255], [0, 122, 255],
646
+ [0, 255, 163], [255, 153, 0], [
647
+ 0, 255, 10], [255, 112, 0],
648
+ [143, 255, 0], [82, 0, 255], [
649
+ 163, 255, 0], [255, 235, 0],
650
+ [8, 184, 170], [133, 0, 255], [
651
+ 0, 255, 92], [184, 0, 255],
652
+ [255, 0, 31], [0, 184, 255], [
653
+ 0, 214, 255], [255, 0, 112],
654
+ [92, 255, 0], [0, 224, 255], [
655
+ 112, 224, 255], [70, 184, 160],
656
+ [163, 0, 255], [153, 0, 255], [
657
+ 71, 255, 0], [255, 0, 163],
658
+ [255, 204, 0], [255, 0, 143], [
659
+ 0, 255, 235], [133, 255, 0],
660
+ [255, 0, 235], [245, 0, 255], [
661
+ 255, 0, 122], [255, 245, 0],
662
+ [10, 190, 212], [214, 255, 0], [
663
+ 0, 204, 255], [20, 0, 255],
664
+ [255, 255, 0], [0, 153, 255], [
665
+ 0, 41, 255], [0, 255, 204],
666
+ [41, 0, 255], [41, 255, 0], [
667
+ 173, 0, 255], [0, 245, 255],
668
+ [71, 0, 255], [122, 0, 255], [
669
+ 0, 255, 184], [0, 92, 255],
670
+ [184, 255, 0], [0, 133, 255], [
671
+ 255, 214, 0], [25, 194, 194],
672
  [102, 255, 0], [92, 0, 255]]
673
 
674
  @prompts(name="Segmentation On Image",
 
678
  "The input to this tool should be a string, representing the image_path")
679
  def inference(self, inputs):
680
  image = Image.open(inputs)
681
+ pixel_values = self.image_processor(
682
+ image, return_tensors="pt").pixel_values
683
  with torch.no_grad():
684
  outputs = self.image_segmentor(pixel_values)
685
+ seg = self.image_processor.post_process_semantic_segmentation(
686
+ outputs, target_sizes=[image.size[::-1]])[0]
687
+ # height, width, 3
688
+ color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8)
689
  palette = np.array(self.ade_palette)
690
  for label, color in enumerate(palette):
691
  color_seg[seg == label, :] = color
692
  color_seg = color_seg.astype(np.uint8)
693
  segmentation = Image.fromarray(color_seg)
694
+ updated_image_path = get_new_image_name(
695
+ inputs, func_name="segmentation")
696
  segmentation.save(updated_image_path)
697
+ print(
698
+ f"\nProcessed Image2Pose, Input Image: {inputs}, Output Pose: {updated_image_path}")
699
  return updated_image_path
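
  The colouring loop above maps each class id in the segmentation map to its RGB entry in ade_palette. A tiny standalone example of that lookup, with a toy 2x2 label map and a three-colour palette:

  import numpy as np
  seg = np.array([[0, 1], [1, 2]])                       # 2x2 map of class ids
  palette = np.array([[120, 120, 120], [180, 120, 120], [6, 230, 230]])
  color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8)
  for label, color in enumerate(palette):
      color_seg[seg == label, :] = color                 # paint every pixel of this class
  # color_seg[0, 0] == [120, 120, 120]; color_seg[1, 1] == [6, 230, 230]
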
700
 
701
 
 
708
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
709
  "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
710
  torch_dtype=self.torch_dtype)
711
+ self.pipe.scheduler = UniPCMultistepScheduler.from_config(
712
+ self.pipe.scheduler.config)
713
  self.pipe.to(device)
714
  self.seed = -1
715
  self.a_prompt = 'best quality, extremely detailed'
716
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
717
+ ' fewer digits, cropped, worst quality, low quality'
718
 
719
  @prompts(name="Generate Image Condition On Segmentations",
720
  description="useful when you want to generate a new real image from both the user description and segmentations. "
 
723
  "The input to this tool should be a comma separated string of two, "
724
  "representing the image_path and the user description")
725
  def inference(self, inputs):
726
+ image_path, instruct_text = inputs.split(
727
+ ",")[0], ','.join(inputs.split(',')[1:])
728
  image = Image.open(image_path)
729
  self.seed = random.randint(0, 65535)
730
  seed_everything(self.seed)
731
  prompt = f'{instruct_text}, {self.a_prompt}'
732
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
733
  guidance_scale=9.0).images[0]
734
+ updated_image_path = get_new_image_name(
735
+ image_path, func_name="segment2image")
736
  image.save(updated_image_path)
737
  print(f"\nProcessed SegText2Image, Input Seg: {image_path}, Input Text: {instruct_text}, "
738
  f"Output Image: {updated_image_path}")
 
757
  depth = Image.fromarray(depth)
758
  updated_image_path = get_new_image_name(inputs, func_name="depth")
759
  depth.save(updated_image_path)
760
+ print(
761
+ f"\nProcessed Image2Depth, Input Image: {inputs}, Output Depth: {updated_image_path}")
762
  return updated_image_path
763
 
764
 
 
771
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
772
  "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
773
  torch_dtype=self.torch_dtype)
774
+ self.pipe.scheduler = UniPCMultistepScheduler.from_config(
775
+ self.pipe.scheduler.config)
776
  self.pipe.to(device)
777
  self.seed = -1
778
  self.a_prompt = 'best quality, extremely detailed'
779
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
780
+ ' fewer digits, cropped, worst quality, low quality'
781
 
782
  @prompts(name="Generate Image Condition On Depth",
783
  description="useful when you want to generate a new real image from both the user description and depth image. "
 
786
  "The input to this tool should be a comma separated string of two, "
787
  "representing the image_path and the user description")
788
  def inference(self, inputs):
789
+ image_path, instruct_text = inputs.split(
790
+ ",")[0], ','.join(inputs.split(',')[1:])
791
  image = Image.open(image_path)
792
  self.seed = random.randint(0, 65535)
793
  seed_everything(self.seed)
794
  prompt = f'{instruct_text}, {self.a_prompt}'
795
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
796
  guidance_scale=9.0).images[0]
797
+ updated_image_path = get_new_image_name(
798
+ image_path, func_name="depth2image")
799
  image.save(updated_image_path)
800
  print(f"\nProcessed DepthText2Image, Input Depth: {image_path}, Input Text: {instruct_text}, "
801
  f"Output Image: {updated_image_path}")
 
805
  class Image2Normal:
806
  def __init__(self, device):
807
  print("Initializing Image2Normal")
808
+ self.depth_estimator = pipeline(
809
+ "depth-estimation", model="Intel/dpt-hybrid-midas")
810
  self.bg_threhold = 0.4
811
 
812
  @prompts(name="Predict Normal Map On Image",
 
833
  image = image.resize(original_size)
834
  updated_image_path = get_new_image_name(inputs, func_name="normal-map")
835
  image.save(updated_image_path)
836
+ print(
837
+ f"\nProcessed Image2Normal, Input Image: {inputs}, Output Depth: {updated_image_path}")
838
  return updated_image_path
839
 
840
 
 
847
  self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
848
  "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
849
  torch_dtype=self.torch_dtype)
850
+ self.pipe.scheduler = UniPCMultistepScheduler.from_config(
851
+ self.pipe.scheduler.config)
852
  self.pipe.to(device)
853
  self.seed = -1
854
  self.a_prompt = 'best quality, extremely detailed'
855
  self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
856
+ ' fewer digits, cropped, worst quality, low quality'
857
 
858
  @prompts(name="Generate Image Condition On Normal Map",
859
  description="useful when you want to generate a new real image from both the user description and normal map. "
 
862
  "The input to this tool should be a comma separated string of two, "
863
  "representing the image_path and the user description")
864
  def inference(self, inputs):
865
+ image_path, instruct_text = inputs.split(
866
+ ",")[0], ','.join(inputs.split(',')[1:])
867
  image = Image.open(image_path)
868
  self.seed = random.randint(0, 65535)
869
  seed_everything(self.seed)
870
  prompt = f'{instruct_text}, {self.a_prompt}'
871
  image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
872
  guidance_scale=9.0).images[0]
873
+ updated_image_path = get_new_image_name(
874
+ image_path, func_name="normal2image")
875
  image.save(updated_image_path)
876
  print(f"\nProcessed NormalText2Image, Input Normal: {image_path}, Input Text: {instruct_text}, "
877
  f"Output Image: {updated_image_path}")
 
883
  print(f"Initializing VisualQuestionAnswering to {device}")
884
  self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
885
  self.device = device
886
+ self.processor = BlipProcessor.from_pretrained(
887
+ "Salesforce/blip-vqa-base")
888
  self.model = BlipForQuestionAnswering.from_pretrained(
889
  "Salesforce/blip-vqa-base", torch_dtype=self.torch_dtype).to(self.device)
890
 
 
893
  "like: what is the background color of the last image, how many cats in this figure, what is in this figure. "
894
  "The input to this tool should be a comma separated string of two, representing the image_path and the question")
895
  def inference(self, inputs):
896
+ image_path, question = inputs.split(
897
+ ",")[0], ','.join(inputs.split(',')[1:])
898
  raw_image = Image.open(image_path).convert('RGB')
899
+ inputs = self.processor(raw_image, question, return_tensors="pt").to(
900
+ self.device, self.torch_dtype)
901
  out = self.model.generate(**inputs)
902
  answer = self.processor.decode(out[0], skip_special_tokens=True)
903
  print(f"\nProcessed VisualQuestionAnswering, Input Image: {image_path}, Input Question: {question}, "
 
910
  # load_dict = {'VisualQuestionAnswering':'cuda:0', 'ImageCaptioning':'cuda:1',...}
911
  print(f"Initializing VisualChatGPT, load_dict={load_dict}")
912
  if 'ImageCaptioning' not in load_dict:
913
+ raise ValueError(
914
+ "You have to load ImageCaptioning as a basic function for VisualChatGPT")
915
 
916
  self.llm = OpenAIChat(temperature=0)
917
+ self.memory = ConversationBufferMemory(
918
+ memory_key="chat_history", output_key='output')
919
 
920
  self.models = {}
921
  for class_name, device in load_dict.items():
 
926
  for e in dir(instance):
927
  if e.startswith('inference'):
928
  func = getattr(instance, e)
929
+ self.tools.append(
930
+ Tool(name=func.name, description=func.description, func=func))
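
  The Tool registration above reads func.name and func.description, which the @prompts decorator used throughout this file is assumed to attach to each inference method. The decorator itself is defined earlier in app.py and is not part of this diff; its assumed shape:

  def prompts(name, description):
      # assumed sketch: tag the wrapped method with the metadata Tool() reads
      def decorator(func):
          func.name = name
          func.description = description
          return func
      return decorator
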
931
 
932
  self.agent = initialize_agent(
933
  self.tools,
 
940
  'suffix': VISUAL_CHATGPT_SUFFIX}, )
941
 
942
  def run_text(self, text, state):
943
+ self.agent.memory.buffer = cut_dialogue_history(
944
+ self.agent.memory.buffer, keep_last_n_words=500)
945
  res = self.agent({"input": text})
946
  res['output'] = res['output'].replace("\\", "/")
947
+ response = re.sub(
948
+ '(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output'])
949
  state = state + [(text, response)]
950
  print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n"
951
  f"Current Memory: {self.agent.memory.buffer}")
 
963
  img = img.resize((width_new, height_new))
964
  img = img.convert('RGB')
965
  img.save(image_filename, "PNG")
966
+ print(
967
+ f"Resize image form {width}x{height} to {width_new}x{height_new}")
968
  description = self.models['ImageCaptioning'].inference(image_filename)
969
  Human_prompt = f'\nHuman: provide a figure named {image_filename}. The description is: {description}. This information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
970
  AI_prompt = "Received. "
971
+ self.agent.memory.buffer = self.agent.memory.buffer + \
972
+ Human_prompt + 'AI: ' + AI_prompt
973
+ state = state + \
974
+ [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)]
975
  print(f"\nProcessed run_image, Input image: {image_filename}\nCurrent state: {state}\n"
976
  f"Current Memory: {self.agent.memory.buffer}")
977
  return state, state, f'{txt} {image_filename} '
 
979
 
980
  if __name__ == '__main__':
981
  parser = argparse.ArgumentParser()
982
+ parser.add_argument('--load', type=str,
983
+ default="ImageCaptioning_cuda:0,Text2Image_cuda:0")
984
  args = parser.parse_args()
985
+ load_dict = {e.split('_')[0].strip(): e.split(
986
+ '_')[1].strip() for e in args.load.split(',')}
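
  For reference, the default --load value parses into a plain dict mapping each tool class to its device; a worked example of the comprehension above, using only the default string:

  load = "ImageCaptioning_cuda:0,Text2Image_cuda:0"   # default value of --load
  parsed = {e.split('_')[0].strip(): e.split('_')[1].strip() for e in load.split(',')}
  # parsed == {'ImageCaptioning': 'cuda:0', 'Text2Image': 'cuda:0'}
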
987
  bot = ConversationBot(load_dict=load_dict)
988
  with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo:
989
  chatbot = gr.Chatbot(elem_id="chatbot", label="Visual ChatGPT")