AustingDong committed 6a0d13c (parent: 035a152)

    modified font, corrected model name

Files changed:
- app.py: +1 -1
- demo/cam.py: +7 -6
- demo/model_utils.py: +2 -2
app.py
@@ -286,7 +286,7 @@ with gr.Blocks() as demo:
             saliency_map_output = gr.Gallery(label="Saliency Map", height=300, columns=1)
 
         with gr.Column():
-            model_selector = gr.Dropdown(choices=["Clip", "ChartGemma-2B", "Janus-1B", "Janus-7B", "LLaVA-…
+            model_selector = gr.Dropdown(choices=["Clip", "ChartGemma-2B", "Janus-1B", "Janus-7B", "LLaVA-v1.6-Mistral-7B"], value="Clip", label="model")
             response_type = gr.Dropdown(choices=["Visualization only"], value="Visualization only", label="response_type")
             focus = gr.Dropdown(choices=["Visual Encoder"], value="Visual Encoder", label="focus")
             saliency_map_method = gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type")
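Why the display name matters: in apps like this, the dropdown's string value typically keys a checkpoint lookup, so the label must match the lookup key exactly. A minimal sketch of that pattern, assuming an illustrative mapping and model ids (this is not app.py's actual table):

# a minimal sketch, assuming the dropdown string keys a checkpoint lookup
MODEL_IDS = {
    "Clip": "openai/clip-vit-base-patch32",
    "ChartGemma-2B": "ahmed-masry/chartgemma",
    "Janus-1B": "deepseek-ai/Janus-Pro-1B",
    "Janus-7B": "deepseek-ai/Janus-Pro-7B",
    "LLaVA-v1.6-Mistral-7B": "llava-hf/llava-v1.6-mistral-7b-hf",
}

def resolve_model(model_name: str) -> str:
    # a mismatched display name here fails with KeyError at selection time
    return MODEL_IDS[model_name]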
demo/cam.py
@@ -11,12 +11,13 @@ from demo.modify_llama import *
 
 
 class AttentionGuidedCAM:
-    def __init__(self, model):
+    def __init__(self, model, register=True):
         self.model = model
         self.gradients = []
         self.activations = []
         self.hooks = []
-
+        if register:
+            self._register_hooks()
 
     def _register_hooks(self):
         """ Registers hooks to extract activations and gradients from ALL attention layers. """
@@ -309,7 +310,7 @@ class AttentionGuidedCAMJanus(AttentionGuidedCAM):
 class AttentionGuidedCAMLLaVA(AttentionGuidedCAM):
     def __init__(self, model, target_layers):
         self.target_layers = target_layers
-        super().__init__(model)
+        super().__init__(model, register=False)
         self._modify_layers()
         self._register_hooks_activations()
 
@@ -439,7 +440,7 @@ class AttentionGuidedCAMLLaVA(AttentionGuidedCAM):
 class AttentionGuidedCAMChartGemma(AttentionGuidedCAM):
     def __init__(self, model, target_layers):
         self.target_layers = target_layers
-        super().__init__(model)
+        super().__init__(model, register=False)
         self._modify_layers()
         self._register_hooks_activations()
 
@@ -473,7 +474,7 @@ class AttentionGuidedCAMChartGemma(AttentionGuidedCAM):
         outputs_raw = self.model(**inputs)
 
         self.model.zero_grad()
-        print(outputs_raw)
+        # print(outputs_raw)
         # loss = self.target_layers[-1].attention_map.sum()
         loss = outputs_raw.logits.max(dim=-1).values.sum()
         loss.backward()
@@ -616,7 +617,7 @@ def generate_gradcam(
     Returns:
         PIL.Image: The image overlaid with the Grad-CAM heatmap.
     """
-    print("Generating Grad-CAM with shape:", cam.shape)
+    # print("Generating Grad-CAM with shape:", cam.shape)
 
     if normalize:
         cam_min, cam_max = cam.min(), cam.max()
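Note on the first hunk: the new register flag lets the base class keep hook registration as the default, while subclasses that must patch layers before hooking (here the LLaVA and ChartGemma variants) opt out and attach hooks to their own targets. A minimal sketch of the pattern, with illustrative class and layer names rather than the repo's actual code:

import torch.nn as nn

class BaseCAM:
    def __init__(self, model, register=True):
        self.model = model
        self.activations, self.gradients, self.hooks = [], [], []
        if register:  # default path: hook modules up front
            self._register_hooks()

    def _register_hooks(self):
        for module in self.model.modules():
            if isinstance(module, nn.MultiheadAttention):  # illustrative target type
                self.hooks.append(module.register_forward_hook(self._save_activation))

    def _save_activation(self, module, inputs, output):
        self.activations.append(output)

    def remove_hooks(self):
        for h in self.hooks:
            h.remove()

class TargetedCAM(BaseCAM):
    def __init__(self, model, target_layers):
        self.target_layers = target_layers
        super().__init__(model, register=False)  # defer; hooks go on target_layers instead
        for layer in self.target_layers:
            self.hooks.append(layer.register_forward_hook(self._save_activation))

The remaining hunks only silence debug prints; the backward target, loss = outputs_raw.logits.max(dim=-1).values.sum(), is unchanged — summing the per-position max logit is a common Grad-CAM surrogate objective when there is no single class label to backpropagate from.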
demo/model_utils.py
@@ -45,7 +45,7 @@ class Clip_Utils(Model_Utils):
     @spaces.GPU(duration=120)
     def prepare_inputs(self, question_lst, image):
         image = Image.fromarray(image)
-        print("image_size: ", image.size)
+        # print("image_size: ", image.size)
         inputs = self.processor(text=question_lst, images=image, return_tensors="pt", padding=True)
         return inputs
 
@@ -228,7 +228,7 @@ class ChartGemma_Utils(Model_Utils):
 
 
 
-def add_title_to_image(image, title, font_size=…
+def add_title_to_image(image, title, font_size=50):
    """Adds a title above an image using PIL and textbbox()."""
    img_width, img_height = image.size
 
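For reference, a minimal sketch of what add_title_to_image's docstring describes — measuring the title with ImageDraw.textbbox() and pasting the image below a white title strip. The font file and padding are assumptions; the repo's actual implementation may differ:

from PIL import Image, ImageDraw, ImageFont

def add_title_to_image(image, title, font_size=50):
    """Adds a title above an image using PIL and textbbox()."""
    try:
        # assumed font file; substitute any .ttf available on the host
        font = ImageFont.truetype("DejaVuSans.ttf", font_size)
    except OSError:
        font = ImageFont.load_default()  # bitmap fallback; ignores font_size
    img_width, img_height = image.size
    measure = ImageDraw.Draw(image)  # used only to measure; nothing is drawn on `image`
    left, top, right, bottom = measure.textbbox((0, 0), title, font=font)
    text_w, text_h = int(right - left), int(bottom - top)
    pad = 10  # assumed padding around the title strip
    canvas = Image.new("RGB", (img_width, img_height + text_h + 2 * pad), "white")
    canvas.paste(image, (0, text_h + 2 * pad))
    ImageDraw.Draw(canvas).text(((img_width - text_w) // 2, pad), title, fill="black", font=font)
    return canvas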