AustingDong
committed on
Commit
·
ee8653b
1
Parent(s):
f59a9b2
add accumulate methods: Sum or Mult
Browse files- app.py +5 -5
- demo/visualization.py +10 -7
app.py
CHANGED
|
@@ -56,7 +56,7 @@ def multimodal_understanding(model_type,
|
|
| 56 |
activation_map_method,
|
| 57 |
visual_method,
|
| 58 |
image, question, seed, top_p, temperature, target_token_idx,
|
| 59 |
-
visualization_layer_min, visualization_layer_max, focus, response_type, chart_type):
|
| 60 |
# Clear CUDA cache before generating
|
| 61 |
gc.collect()
|
| 62 |
if torch.cuda.is_available():
|
|
@@ -160,7 +160,7 @@ def multimodal_understanding(model_type,
|
|
| 160 |
gradcam = VisualizationLLaVA(vl_gpt, target_layers)
|
| 161 |
elif model_name.split('-')[0] == "ChartGemma":
|
| 162 |
gradcam = VisualizationChartGemma(vl_gpt, target_layers)
|
| 163 |
-
cam_tensors, grid_size, start = gradcam.generate_cam(prepare_inputs, tokenizer, temperature, top_p, i, visual_method, focus)
|
| 164 |
cam_grid = cam_tensors.reshape(grid_size, grid_size)
|
| 165 |
cam_i = generate_gradcam(cam_grid, image)
|
| 166 |
cam_i = add_title_to_image(cam_i, input_ids_decoded[start + i])
|
|
@@ -168,7 +168,7 @@ def multimodal_understanding(model_type,
|
|
| 168 |
gradcam.remove_hooks()
|
| 169 |
i += 1
|
| 170 |
else:
|
| 171 |
-
cam_tensors, grid_size, start = gradcam.generate_cam(prepare_inputs, tokenizer, temperature, top_p, target_token_idx, visual_method, focus)
|
| 172 |
if target_token_idx != -1:
|
| 173 |
input_text_decoded = input_ids_decoded[start + target_token_idx]
|
| 174 |
for i, cam_tensor in enumerate(cam_tensors):
|
|
@@ -379,7 +379,7 @@ with gr.Blocks() as demo:
|
|
| 379 |
response_type = gr.Dropdown(choices=["Visualization only"], value="Visualization only", label="response_type")
|
| 380 |
focus = gr.Dropdown(choices=["Visual Encoder"], value="Visual Encoder", label="focus")
|
| 381 |
activation_map_method = gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="visualization type")
|
| 382 |
-
|
| 383 |
visual_method = gr.Dropdown(choices=["CLS", "max", "avg"], value="CLS", label="visual pooling method")
|
| 384 |
|
| 385 |
|
|
@@ -512,7 +512,7 @@ with gr.Blocks() as demo:
|
|
| 512 |
understanding_button.click(
|
| 513 |
multimodal_understanding,
|
| 514 |
inputs=[model_selector, activation_map_method, visual_method, image_input, question_input, und_seed_input, top_p, temperature, target_token_idx,
|
| 515 |
-
visualization_layers_min, visualization_layers_max, focus, response_type, chart_type],
|
| 516 |
outputs=[understanding_output, activation_map_output, understanding_target_token_decoded_output]
|
| 517 |
)
|
| 518 |
|
|
|
|
| 56 |
activation_map_method,
|
| 57 |
visual_method,
|
| 58 |
image, question, seed, top_p, temperature, target_token_idx,
|
| 59 |
+
visualization_layer_min, visualization_layer_max, focus, response_type, chart_type, accumulate_method):
|
| 60 |
# Clear CUDA cache before generating
|
| 61 |
gc.collect()
|
| 62 |
if torch.cuda.is_available():
|
|
|
|
| 160 |
gradcam = VisualizationLLaVA(vl_gpt, target_layers)
|
| 161 |
elif model_name.split('-')[0] == "ChartGemma":
|
| 162 |
gradcam = VisualizationChartGemma(vl_gpt, target_layers)
|
| 163 |
+
cam_tensors, grid_size, start = gradcam.generate_cam(prepare_inputs, tokenizer, temperature, top_p, i, visual_method, focus, accumulate_method)
|
| 164 |
cam_grid = cam_tensors.reshape(grid_size, grid_size)
|
| 165 |
cam_i = generate_gradcam(cam_grid, image)
|
| 166 |
cam_i = add_title_to_image(cam_i, input_ids_decoded[start + i])
|
|
|
|
| 168 |
gradcam.remove_hooks()
|
| 169 |
i += 1
|
| 170 |
else:
|
| 171 |
+
cam_tensors, grid_size, start = gradcam.generate_cam(prepare_inputs, tokenizer, temperature, top_p, target_token_idx, visual_method, focus, accumulate_method)
|
| 172 |
if target_token_idx != -1:
|
| 173 |
input_text_decoded = input_ids_decoded[start + target_token_idx]
|
| 174 |
for i, cam_tensor in enumerate(cam_tensors):
|
|
|
|
| 379 |
response_type = gr.Dropdown(choices=["Visualization only"], value="Visualization only", label="response_type")
|
| 380 |
focus = gr.Dropdown(choices=["Visual Encoder"], value="Visual Encoder", label="focus")
|
| 381 |
activation_map_method = gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="visualization type")
|
| 382 |
+
accumulate_method = gr.Dropdown(choices=["sum", "mult"], value="sum", label="layers accumulate method")
|
| 383 |
visual_method = gr.Dropdown(choices=["CLS", "max", "avg"], value="CLS", label="visual pooling method")
|
| 384 |
|
| 385 |
|
|
|
|
| 512 |
understanding_button.click(
|
| 513 |
multimodal_understanding,
|
| 514 |
inputs=[model_selector, activation_map_method, visual_method, image_input, question_input, und_seed_input, top_p, temperature, target_token_idx,
|
| 515 |
+
visualization_layers_min, visualization_layers_max, focus, response_type, chart_type, accumulate_method],
|
| 516 |
outputs=[understanding_output, activation_map_output, understanding_target_token_decoded_output]
|
| 517 |
)
|
| 518 |
|
demo/visualization.py
CHANGED
|
@@ -196,7 +196,7 @@ class Visualization:
|
|
| 196 |
cam_sum_lst.append(cam_sum)
|
| 197 |
return cam_sum_lst, grid_size
|
| 198 |
|
| 199 |
-
def
|
| 200 |
cam_sum_lst = []
|
| 201 |
for i in range(start_idx, cams[0].shape[1]):
|
| 202 |
cam_sum = None
|
|
@@ -217,7 +217,10 @@ class Visualization:
|
|
| 217 |
if cam_sum == None:
|
| 218 |
cam_sum = cam_reshaped
|
| 219 |
else:
|
| 220 |
-
|
|
|
|
|
|
|
|
|
|
| 221 |
|
| 222 |
cam_sum = (cam_sum - cam_sum.min()) / (cam_sum.max() - cam_sum.min())
|
| 223 |
cam_sum_lst.append(cam_sum)
|
|
@@ -316,7 +319,7 @@ class VisualizationJanus(Visualization):
|
|
| 316 |
self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]
|
| 317 |
|
| 318 |
@spaces.GPU(duration=120)
|
| 319 |
-
def generate_cam(self, input_tensor, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Visual Encoder"):
|
| 320 |
|
| 321 |
self.setup_grads()
|
| 322 |
|
|
@@ -368,7 +371,7 @@ class VisualizationLLaVA(Visualization):
|
|
| 368 |
self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]
|
| 369 |
|
| 370 |
@spaces.GPU(duration=120)
|
| 371 |
-
def generate_cam(self, inputs, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Visual Encoder"):
|
| 372 |
|
| 373 |
self.setup_grads()
|
| 374 |
self.forward_backward(inputs)
|
|
@@ -388,7 +391,7 @@ class VisualizationLLaVA(Visualization):
|
|
| 388 |
# Aggregate activations and gradients from ALL layers
|
| 389 |
start_idx = last + 1
|
| 390 |
cams = self.attn_guided_cam()
|
| 391 |
-
cam_sum_lst, grid_size = self.
|
| 392 |
|
| 393 |
return cam_sum_lst, grid_size, start_idx
|
| 394 |
|
|
@@ -424,7 +427,7 @@ class VisualizationChartGemma(Visualization):
|
|
| 424 |
self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]
|
| 425 |
|
| 426 |
@spaces.GPU(duration=120)
|
| 427 |
-
def generate_cam(self, inputs, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Visual Encoder"):
|
| 428 |
|
| 429 |
# Forward pass
|
| 430 |
self.setup_grads()
|
|
@@ -453,7 +456,7 @@ class VisualizationChartGemma(Visualization):
|
|
| 453 |
elif focus == "Language Model":
|
| 454 |
|
| 455 |
cams = self.attn_guided_cam()
|
| 456 |
-
cam_sum_lst, grid_size = self.
|
| 457 |
|
| 458 |
# cams shape: [layers, 1, seq_len, seq_len]
|
| 459 |
|
|
|
|
| 196 |
cam_sum_lst.append(cam_sum)
|
| 197 |
return cam_sum_lst, grid_size
|
| 198 |
|
| 199 |
+
def process_multiple_acc(self, cams, start_idx, images_seq_mask, normalize=False, accumulate_method="sum"):
|
| 200 |
cam_sum_lst = []
|
| 201 |
for i in range(start_idx, cams[0].shape[1]):
|
| 202 |
cam_sum = None
|
|
|
|
| 217 |
if cam_sum == None:
|
| 218 |
cam_sum = cam_reshaped
|
| 219 |
else:
|
| 220 |
+
if accumulate_method == "sum":
|
| 221 |
+
cam_sum += cam_reshaped
|
| 222 |
+
elif accumulate_method == "mult":
|
| 223 |
+
cam_sum *= cam_reshaped + 1
|
| 224 |
|
| 225 |
cam_sum = (cam_sum - cam_sum.min()) / (cam_sum.max() - cam_sum.min())
|
| 226 |
cam_sum_lst.append(cam_sum)
|
|
|
|
| 319 |
self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]
|
| 320 |
|
| 321 |
@spaces.GPU(duration=120)
|
| 322 |
+
def generate_cam(self, input_tensor, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Visual Encoder", accumulate_method="sum"):
|
| 323 |
|
| 324 |
self.setup_grads()
|
| 325 |
|
|
|
|
| 371 |
self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]
|
| 372 |
|
| 373 |
@spaces.GPU(duration=120)
|
| 374 |
+
def generate_cam(self, inputs, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Visual Encoder", accumulate_method="sum"):
|
| 375 |
|
| 376 |
self.setup_grads()
|
| 377 |
self.forward_backward(inputs)
|
|
|
|
| 391 |
# Aggregate activations and gradients from ALL layers
|
| 392 |
start_idx = last + 1
|
| 393 |
cams = self.attn_guided_cam()
|
| 394 |
+
cam_sum_lst, grid_size = self.process_multiple_acc(cams, start_idx, images_seq_mask, accumulate_method=accumulate_method)
|
| 395 |
|
| 396 |
return cam_sum_lst, grid_size, start_idx
|
| 397 |
|
|
|
|
| 427 |
self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]
|
| 428 |
|
| 429 |
@spaces.GPU(duration=120)
|
| 430 |
+
def generate_cam(self, inputs, tokenizer, temperature, top_p, target_token_idx=None, visual_method="softmax", focus="Visual Encoder", accumulate_method="sum"):
|
| 431 |
|
| 432 |
# Forward pass
|
| 433 |
self.setup_grads()
|
|
|
|
| 456 |
elif focus == "Language Model":
|
| 457 |
|
| 458 |
cams = self.attn_guided_cam()
|
| 459 |
+
cam_sum_lst, grid_size = self.process_multiple_acc(cams, start_idx, images_seq_mask, accumulate_method=accumulate_method)
|
| 460 |
|
| 461 |
# cams shape: [layers, 1, seq_len, seq_len]
|
| 462 |
|