Martijn van Beers committed adf3a47 (parent: 6395dfb)
Move text and examples into separate files

Files changed:
- app.py +15 -52
- description.md +9 -0
- entity_description.md +5 -0
- entity_examples.csv +3 -0
- examples.csv +12 -0
- footer.md +13 -0
app.py CHANGED

@@ -1,8 +1,9 @@
 import re
 import sys
+import pathlib
+import csv
 import gradio as gr
 
-# sys.path.append("../")
 sys.path.append("CLIP_explainability/Transformer-MM-Explainability/")
 
 import torch
@@ -24,6 +25,12 @@ clip.clip._MODELS = {
     "ViT-L/14@336px": "https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt",
 }
 
+def iter_file(filename):
+    with pathlib.Path(filename).open("r") as fh:
+        header = next(fh)
+        for line in fh:
+            yield line
+
 colour_map = {
     "N": "#f77189",
     "CARDINAL": "#f7764a",
@@ -91,17 +98,9 @@ def run_demo(*args):
 
 # Default demo:
 
-
-<br> <br>
-This demo shows attributions scores on both the image and the text input when presenting CLIP with a
-<text,image> pair. Attributions are computed as Gradient-weighted Attention Rollout (Chefer et al.,
-2021), and can be thought of as an estimate of the effective attention CLIP pays to its input when
-computing a multimodal representation. <span style="color:red">Warning:</span> Note that attribution
-methods such as the one from this demo can only give an estimate of the real underlying behavior
-of the model."""
-
+examples = list(csv.reader(iter_file("examples.csv")))
 with gr.Blocks(title="CLIP Grounding Explainability") as iface_default:
-    gr.Markdown(description)
+    gr.Markdown(pathlib.Path("description.md").read_text)
     with gr.Row():
         with gr.Column() as inputs:
             orig = gr.components.Image(type='pil', label="Original Image")
@@ -112,22 +111,7 @@ with gr.Blocks(title="CLIP Grounding Explainability") as iface_default:
         with gr.Column() as outputs:
             image = gr.components.Image(type='pil', label="Output Image")
             text = gr.components.HighlightedText(label="Text importance")
-    gr.Examples(
-        examples=[
-            ["example_images/London.png", "London Eye"],
-            ["example_images/London.png", "Big Ben"],
-            ["example_images/harrypotter.png", "Harry"],
-            ["example_images/harrypotter.png", "Hermione"],
-            ["example_images/harrypotter.png", "Ron"],
-            ["example_images/Amsterdam.png", "Amsterdam canal"],
-            ["example_images/Amsterdam.png", "Old buildings"],
-            ["example_images/Amsterdam.png", "Pink flowers"],
-            ["example_images/dogs_on_bed.png", "Two dogs"],
-            ["example_images/dogs_on_bed.png", "Book"],
-            ["example_images/dogs_on_bed.png", "Cat"]
-        ],
-        inputs=[orig, description]
-    )
+    gr.Examples(examples=examples, inputs=[orig, description])
     default_model.change(update_slider, inputs=default_model, outputs=default_layer)
     submit.click(run_demo, inputs=[orig, description, default_model, default_layer], outputs=[image, text])
 
@@ -181,13 +165,9 @@ def NER_demo(image, text, model_name):
     return labeled_text, gallery_images
 
 
-
-noun chunks, retrieved with the spaCy model. <span style="color:red">Warning:</span> Note
-that attribution methods such as the one from this demo can only give an estimate of the real
-underlying behavior of the model."""
-
+entity_examples = list(csv.reader(iter_file("entity_examples.csv")))
 with gr.Blocks(title="Entity Grounding explainability using CLIP") as iface_NER:
-    gr.Markdown(
+    gr.Markdown(pathlib.Path("entity_description.md").read_text)
     with gr.Row():
         with gr.Column() as inputs:
             img = gr.Image(type='pil', label="Original Image")
@@ -199,28 +179,11 @@ with gr.Blocks(title="Entity Grounding explainability using CLIP") as iface_NER:
             text = gr.components.HighlightedText(show_legend=True, color_map=colour_map, label="Noun chunks")
             gallery = gr.components.Gallery(type='pil', label="NER Entity explanations")
 
-    gr.Examples(
-        examples=[
-            ["example_images/London.png", "In this image we see Big Ben and the London Eye, on both sides of the river Thames."],
-            ["example_images/harrypotter.png", "Hermione, Harry and Ron in their school uniform"],
-        ],
-        inputs=[img, text],
-    )
+    gr.Examples(examples=entity_examples, inputs=[img, text])
     ner_model.change(update_slider, inputs=ner_model, outputs=ner_layer)
     submit.click(run_demo, inputs=[img, intext, ner_model, ner_layer], outputs=[text, gallery])
 
 demo_tabs = gr.TabbedInterface([iface_default, iface_NER], ["Default", "Entities"])
 with demo_tabs:
-    gr.Markdown("""
-    ### Acknowledgements
-    This demo was developed for the Interpretability & Explainability in AI course at the University of
-    Amsterdam. We would like to express our thanks to Jelle Zuidema, Jaap Jumelet, Tom Kersten, Christos
-    Athanasiadis, Peter Heemskerk, Zhi Zhang, and all the other TAs who helped us during this course.
-
-    ---
-    ### References
-    \[1\]: Chefer, H., Gur, S., & Wolf, L. (2021). Generic attention-model explainability for interpreting bi-modal and encoder-decoder transformers. <br>
-    \[2\]: Abnar, S., & Zuidema, W. (2020). Quantifying attention flow in transformers. arXiv preprint arXiv:2005.00928. <br>
-    \[3\]: [https://samiraabnar.github.io/articles/2020-04/attention_flow](https://samiraabnar.github.io/articles/2020-04/attention_flow) <br>
-    """)
+    gr.Markdown(pathlib.Path("footer.md").read_text)
     demo_tabs.launch(show_error=True)
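One detail of the new app.py worth flagging: `gr.Markdown(pathlib.Path("description.md").read_text)` passes the bound `read_text` method itself rather than its result. Gradio components generally accept a callable as their value and call it when the app loads, so this is expected to render the file contents; calling the method explicitly is the more literal spelling. A minimal sketch of the two variants (the file name is taken from the diff, the surrounding Blocks is illustrative):

```python
import pathlib

import gradio as gr

with gr.Blocks() as demo:
    # Evaluated once, when the Blocks graph is built: the value is a plain string.
    gr.Markdown(pathlib.Path("description.md").read_text())

    # Passed as a callable, as in this commit. Assumption: Gradio treats a callable
    # value as a factory and calls it when the app loads to obtain the text.
    gr.Markdown(pathlib.Path("description.md").read_text)
```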
description.md ADDED

@@ -0,0 +1,9 @@
+This demo is a copy of the demo CLIPGroundingExlainability built by Paul Hilders, Danilo de Goede and Piyush Bagad, as part of the course Interpretability and Explainability in AI (MSc AI, UvA, June 2022).
+
+
+This demo shows attribution scores on both the image and the text input when presenting CLIP with a
+<text,image> pair. Attributions are computed as Gradient-weighted Attention Rollout (Chefer et al.,
+2021), and can be thought of as an estimate of the effective attention CLIP pays to its input when
+computing a multimodal representation. <span style="color:red">Warning:</span> Note that attribution
+methods such as the one from this demo can only give an estimate of the real underlying behavior
+of the model.
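description.md names Gradient-weighted Attention Rollout (Chefer et al., 2021) as the attribution method. As background, a rough sketch of that computation is given below; the function name and tensor shapes are illustrative assumptions, not code from this Space, which adds the authors' Transformer-MM-Explainability code to its path instead:

```python
import torch

def gradient_weighted_rollout(attentions, gradients):
    """Sketch of Gradient-weighted Attention Rollout (Chefer et al., 2021).

    attentions, gradients: per-layer tensors of shape (heads, tokens, tokens),
    i.e. the attention maps and their gradients w.r.t. the model output.
    """
    num_tokens = attentions[0].shape[-1]
    relevance = torch.eye(num_tokens)                 # start from self-relevance
    for attn, grad in zip(attentions, gradients):
        cam = (grad * attn).clamp(min=0).mean(dim=0)  # gradient-weighted attention, positive part, head average
        relevance = relevance + cam @ relevance       # rollout step with the residual (identity) term
    return relevance
```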
entity_description.md ADDED

@@ -0,0 +1,5 @@
+Automatically generated CLIP grounding explanations for noun chunks,
+retrieved with the spaCy model.
+<span style="color:red">Warning:</span> Note that attribution methods
+such as the one from this demo can only give an estimate of the real
+underlying behavior of the model.
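entity_description.md says the noun chunks are retrieved with the spaCy model, but the diff does not show which pipeline app.py loads. As a hypothetical illustration only (model name assumed, output indicative), this is roughly how noun chunks would be extracted for the first sentence in entity_examples.csv below:

```python
import spacy

# Assumed pipeline; the model actually used by app.py is not visible in this diff.
nlp = spacy.load("en_core_web_sm")

doc = nlp("In this image we see Big Ben and the London Eye, on both sides of the river Thames.")
print([chunk.text for chunk in doc.noun_chunks])
# Indicative output: ['this image', 'we', 'Big Ben', 'the London Eye', 'both sides', 'the river Thames']
```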
entity_examples.csv ADDED

@@ -0,0 +1,3 @@
+image,text
+"example_images/London.png","In this image we see Big Ben and the London Eye, on both sides of the river Thames."
+"example_images/harrypotter.png","Hermione, Harry and Ron in their school uniform"
examples.csv ADDED

@@ -0,0 +1,12 @@
+image,text
+"example_images/London.png","London Eye"
+"example_images/London.png","Big Ben"
+"example_images/harrypotter.png","Harry"
+"example_images/harrypotter.png","Hermione"
+"example_images/harrypotter.png","Ron"
+"example_images/Amsterdam.png","Amsterdam canal"
+"example_images/Amsterdam.png","Old buildings"
+"example_images/Amsterdam.png","Pink flowers"
+"example_images/dogs_on_bed.png","Two dogs"
+"example_images/dogs_on_bed.png","Book"
+"example_images/dogs_on_bed.png","Cat"
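These rows are the same <image, text> pairs that were hard-coded in the `gr.Examples(...)` call removed from app.py: `iter_file` drops the `image,text` header and `csv.reader` then yields each row as a two-element list, the nested-list form that `gr.Examples` expects (the quoting also keeps the comma inside the London caption of entity_examples.csv within a single field). A small check of that round trip, with the expected rows copied from the removed inline list:

```python
import csv
import pathlib

def iter_file(filename):
    # Same helper the commit adds to app.py: skip the CSV header, yield the remaining lines.
    with pathlib.Path(filename).open("r") as fh:
        next(fh)
        for line in fh:
            yield line

examples = list(csv.reader(iter_file("examples.csv")))
assert examples[:2] == [
    ["example_images/London.png", "London Eye"],
    ["example_images/London.png", "Big Ben"],
]
```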
footer.md ADDED

@@ -0,0 +1,13 @@
+### Acknowledgements
+
+This demo was developed for the Interpretability & Explainability in AI course at the University of
+Amsterdam. We would like to express our thanks to Jelle Zuidema, Jaap Jumelet, Tom Kersten, Christos
+Athanasiadis, Peter Heemskerk, Zhi Zhang, and all the other TAs who helped us during this course.
+
+---
+
+### References
+
+\[1\]: Chefer, H., Gur, S., & Wolf, L. (2021). Generic attention-model explainability for interpreting bi-modal and encoder-decoder transformers. <br>
+\[2\]: Abnar, S., & Zuidema, W. (2020). Quantifying attention flow in transformers. arXiv preprint arXiv:2005.00928. <br>
+\[3\]: [https://samiraabnar.github.io/articles/2020-04/attention_flow](https://samiraabnar.github.io/articles/2020-04/attention_flow) <br>