ahmedbrs committed on
Commit
254fdf2
·
1 Parent(s): c2422f6
app.py ADDED
@@ -0,0 +1,423 @@
1
+ import gradio as gr
2
+ from PIL import Image
3
+ import torch
4
+ from torchvision.transforms import InterpolationMode
5
+
6
+ BICUBIC = InterpolationMode.BICUBIC
7
+ from utils import setup, get_similarity_map, display_segmented_sketch
8
+ from vpt.launch import default_argument_parser
9
+ from collections import OrderedDict
10
+ import numpy as np
11
+ import matplotlib.pyplot as plt
12
+ import models
13
+ import torchvision
14
+
15
+ args = default_argument_parser().parse_args()
16
+ cfg = setup(args)
17
+
18
+ device = "cpu" # "cuda" if torch.cuda.is_available() else "cpu"
19
+ Ours, preprocess = models.load("CS-ViT-B/16", device=device, cfg=cfg, train_bool=False)
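+ # "CS-" selects the modified CLIP-Surgery ViT-B/16 variant (see models/build_model.py)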
20
+ state_dict = torch.load("sketch_seg_best_miou.pth", map_location=device)
21
+
22
+ # The checkpoint was trained with DataParallel on 2 GPUs, so strip the "module." prefix before loading on a single device
23
+ new_state_dict = OrderedDict()
24
+ for k, v in state_dict.items():
25
+ name = k[7:] # remove `module.`
26
+ new_state_dict[name] = v
27
+ Ours.load_state_dict(new_state_dict)
28
+ Ours.eval()
29
+ print("Model loaded successfully")
30
+
31
+
32
+ def run(sketch, caption, threshold, seed=None):  # seed is unused; the default keeps the three-input Gradio callbacks working
33
+ # set the candidate classes here
34
+ classes = [caption]
35
+
36
+ colors = plt.get_cmap("tab10").colors
37
+ classes_colors = colors[3:len(classes) + 3]
38
+
39
+ sketch2 = sketch['composite']
40
+ # sketch2 = sketch2[:, :, 1:4]
41
+ sketch2 = np.array(sketch2)
42
+
43
+ pil_img = Image.fromarray(sketch2).convert('RGB')
44
+ sketch_tensor = preprocess(pil_img).unsqueeze(0).to(device)
45
+
46
+ # torchvision.utils.save_image(sketch_tensor, 'sketch_tensor.png')
47
+
48
+ with torch.no_grad():
49
+ text_features = models.encode_text_with_prompt_ensemble(Ours, classes, device, no_module=True)
50
+ redundant_features = models.encode_text_with_prompt_ensemble(Ours, [""], device, no_module=True)
51
+
52
+ num_of_tokens = 3
53
+ with torch.no_grad():
54
+ sketch_features = Ours.encode_image(sketch_tensor, layers=[12],
55
+ text_features=text_features - redundant_features, mode="test").squeeze(0)
56
+ sketch_features = sketch_features / sketch_features.norm(dim=1, keepdim=True)
57
+ similarity = sketch_features @ (text_features - redundant_features).t()
58
+ patches_similarity = similarity[0, num_of_tokens + 1:, :]
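+ # assumption: the first (num_of_tokens + 1) rows are the CLS token plus the learned prompt tokens, so only patch-token similarities are kept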
59
+ pixel_similarity = get_similarity_map(patches_similarity.unsqueeze(0), pil_img.size).cpu()
60
+ # visualize_attention_maps_with_tokens(pixel_similarity, classes)
61
+ pixel_similarity[pixel_similarity < threshold] = 0
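+ # zero out pixels below the confidence threshold set by the "Confidence" slider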
62
+ pixel_similarity_array = pixel_similarity.cpu().numpy().transpose(2, 0, 1)
63
+
64
+ display_segmented_sketch(pixel_similarity_array, sketch2, classes, classes_colors, live=True)
65
+
66
+ rgb_image = Image.open('output.png')
67
+
68
+ return rgb_image
69
+
70
+
71
+
72
+ scripts = """
73
+ async () => {
74
+ // START gallery format
75
+ // Get all image elements with the class "image_gallery"
76
+ var images = document.querySelectorAll('.image_gallery');
77
+ var originalParent = document.querySelector('#component-0');
78
+ // Create a new parent div element
79
+ var parentDiv = document.createElement('div');
80
+ var beforeDiv= document.querySelector('.table-wrap').parentElement;
81
+ parentDiv.id = "gallery_container";
82
+
83
+ // Loop through each image, append it to the parent div, and remove it from its original parent
84
+ images.forEach(function(image , index ) {
85
+ // Append the image to the parent div
86
+ parentDiv.appendChild(image);
87
+
88
+ // Add click event listener to each image
89
+ image.addEventListener('click', function() {
90
+ let nth_ch = index+1
91
+ document.querySelector('.tr-body:nth-child(' + nth_ch + ')').click()
92
+ console.log('.tr-body:nth-child(' + nth_ch + ')');
93
+ });
94
+
95
+ // Remove the image from its original parent
96
+ });
97
+
98
+
99
+ // Get a reference to the original parent of the images
100
+ var originalParent = document.querySelector('#component-0');
101
+
102
+ // Append the new parent div to the original parent
103
+ originalParent.insertBefore(parentDiv, beforeDiv);
104
+
105
+ // END gallery format
106
+
107
+ // START confidence span
108
+
109
+ // Get the selected div (replace 'selectedDivId' with the actual ID of your div)
110
+ var selectedDiv = document.querySelector("label[for='range_id_0'] > span")
111
+
112
+ // Get the text content of the div
113
+ var textContent = selectedDiv.textContent;
114
+
115
+ // Find the text before the first colon ':'
116
+ var colonIndex = textContent.indexOf(':');
117
+ var textBeforeColon = textContent.substring(0, colonIndex);
118
+
119
+ // Wrap the text before colon with a span element
120
+ var spanElement = document.createElement('span');
121
+ spanElement.textContent = textBeforeColon;
122
+
123
+ // Replace the original text with the modified text containing the span
124
+ selectedDiv.innerHTML = textContent.replace(textBeforeColon, spanElement.outerHTML);
125
+
126
+ // START format the column names :
127
+ // Get all header cells of the examples table
128
+ var elements = document.querySelectorAll('.tr-head > th');
129
+
130
+ // Iterate over each element
131
+ elements.forEach(function(element) {
132
+ // Get the text content of the element
133
+ var text = element.textContent.trim();
134
+
135
+ // Remove ":" from the text
136
+ var wordWithoutColon = text.replace(':', '');
137
+
138
+ // Split the text into words
139
+ var words = wordWithoutColon.split(' ');
140
+
141
+ // Keep only the first word
142
+ var firstWord = words[0];
143
+
144
+ // Set the text content of the element to the first word
145
+ element.textContent = firstWord;
146
+ });
147
+
148
+ document.querySelector('input[type=number]').disabled = true;
149
+
150
+
151
+ }
152
+ """
153
+
154
+ css="""
155
+
156
+ gradio-app {
157
+ background-color: white !important;
158
+ }
159
+
160
+ .white-bg {
161
+ background-color: white !important;
162
+ }
163
+
164
+ .gray-border {
165
+ border: 1px solid dimgrey !important;
166
+ }
167
+
168
+ .border-radius {
169
+ border-radius: 8px !important;
170
+ }
171
+
172
+ .black-text {
173
+ color : black !important;
174
+ }
175
+
176
+ th {
177
+ color : black !important;
178
+
179
+ }
180
+
181
+ tr {
182
+ background-color: white !important;
183
+ color: black !important;
184
+ }
185
+
186
+ td {
187
+ border-bottom : 1px solid black !important;
188
+ }
189
+
190
+ label[data-testid="block-label"] {
191
+ background: white;
192
+ color: black;
193
+ font-weight: bold;
194
+ }
195
+
196
+ .controls-wrap button:disabled {
197
+ color: gray !important;
198
+ background-color: white !important;
199
+ }
200
+
201
+ .controls-wrap button:not(:disabled) {
202
+ color: black !important;
203
+ background-color: white !important;
204
+
205
+ }
206
+
207
+ .source-wrap button {
208
+ color: black !important;
209
+ }
210
+
211
+ .toolbar-wrap button {
212
+ color: black !important;
213
+ }
214
+
215
+ .empty.wrap {
216
+ color: black !important;
217
+ }
218
+
219
+
220
+ textarea {
221
+ background-color : #f7f9f8 !important;
222
+ color : #afb0b1 !important
223
+ }
224
+
225
+
226
+ input[data-testid="number-input"] {
227
+ background-color : #f7f9f8 !important;
228
+ color : black !important
229
+ }
230
+
231
+ tr > th {
232
+ border-bottom : 1px solid black !important;
233
+ }
234
+
235
+ tr:hover {
236
+ background: #f7f9f8 !important;
237
+ }
238
+
239
+ #component-17{
240
+ justify-content: center !important;
241
+ }
242
+
243
+ #component-17 > button {
244
+ flex: none !important;
245
+ background-color : black !important;
246
+ font-weight: bold !important;
247
+
248
+ }
249
+
250
+ .bold {
251
+ font-weight: bold !important;
252
+ }
253
+
254
+ span[data-testid="block-info"]{
255
+ color: black !important;
256
+ font-weight: bold !important;
257
+ }
258
+
259
+ #component-14 > div {
260
+ background-color : white !important;
261
+
262
+ }
263
+
264
+ button[aria-label="Clear"] {
265
+ background-color : white !important;
266
+ color: black !important;
267
+
268
+ }
269
+
270
+ #gallery_container {
271
+ display: flex;
272
+ flex-wrap: wrap;
273
+ justify-content: start;
274
+ }
275
+
276
+ .image_gallery {
277
+ margin-bottom: 1rem;
278
+ margin-right: 1rem;
279
+ }
280
+
281
+ label[for='range_id_0'] > span > span {
282
+ text-decoration: underline;
283
+ }
284
+
285
+ label[for='range_id_0'] > span > span {
286
+ font-size: normal !important;
287
+ }
288
+
289
+ .underline {
290
+ text-decoration: underline;
291
+ }
292
+
293
+
294
+ .mt-mb-1{
295
+ margin-top: 1rem;
296
+ margin-bottom: 1rem;
297
+ }
298
+
299
+ #gallery_container + div {
300
+ visibility: hidden;
301
+ height: 10px;
302
+ }
303
+
304
+ input[type=number][disabled] {
305
+ background-color: rgb(247, 249, 248) !important;
306
+ color: black !important;
307
+ -webkit-text-fill-color: black !important;
308
+ }
309
+
310
+ #component-13 {
311
+ display: flex;
312
+ flex-direction: column;
313
+ align-items: center;
314
+ }
315
+
316
+ """
317
+
318
+
319
+ with gr.Blocks(js=scripts, css=css, theme='gstaff/xkcd') as demo:
320
+ gr.HTML("<h1 class='black-text'>Open Vocabulary Scene Sketch Semantic Understanding</h1>")
321
+ # gr.HTML("<div class='black-text'></div>")
322
+ gr.HTML("<div class='black-text'></div>")
323
+ gr.HTML("<div class='black-text'>Ahmed Bourouis, Judith Ellen Fan, Yulia Gryaditskaya</div>")
324
+ gr.HTML("<div class='black-text'>CVPR, 2024</div>")
325
+ gr.HTML("<a>Project page</a>")
326
+
327
+
328
+ # gr.Markdown( "Scene Sketch Semantic Segmentation.", elem_classes=["black-txt" , "h1"] )
329
+ # gr.Markdown( "Open Vocabulary Scene Sketch Semantic Understanding", elem_classes=["black-txt" , "p"] )
330
+ # gr.Markdown( "Open Vocabulary Scene Sketch Semantic Understanding", elem_classes=["black-txt" , "p"] )
331
+ # gr.Markdown( "")
332
+
333
+
334
+ with gr.Row():
335
+ with gr.Column():
336
+ # in_image = gr.Image( label="Sketch", type="pil", sources="upload" , height=512 )
337
+ in_canvas_image = gr.Sketchpad( brush=gr.Brush(colors=["#000000"], color_mode="fixed" , default_size=2),
338
+ elem_classes=["white-bg", "gray-border" , "border-radius" ,"own-shadow" ] ,
339
+ label="Sketch" , canvas_size=(512,512) , sources=['upload'],
340
+ interactive=True , layers= False, transforms=[] )
341
+ query_selector = 'button[aria-label="Upload button"]'
342
+
343
+ with gr.Row():
344
+
345
+ # segment_btn.click(fn=run, inputs=[in_image, in_textbox, in_slider], outputs=[out_image])
346
+ upload_draw_btn = gr.HTML(f"""
347
+ <div id="upload_draw_group" class="svelte-15lo0d8 stretch">
348
+ <button class="sm black-text white-bg gray-border border-radius own-shadow svelte-cmf5ev bold" id="upload_btn" onclick="return document.querySelector('.source-wrap button').click()"> Upload a new sketch</button>
349
+ <button class="sm black-text white-bg gray-border border-radius own-shadow svelte-cmf5ev bold" id="draw_btn" onclick="return document.querySelector('.controls-wrap button:nth-child(3)').click()"> Draw a new sketch</button>
350
+ </div>
351
+ """)
352
+ in_textbox = gr.Textbox( lines=3 , elem_classes=["white-bg", "gray-border" , "border-radius" ,"own-shadow" ] ,label="Caption your Sketch!", placeholder="Include the categories that you want the AI to segment. \n e.g. 'giraffe, clouds' or 'a boy flying a kite' ")
353
+
354
+ with gr.Column():
355
+ out_image = gr.Image(elem_classes=["white-bg", "gray-border" , "border-radius" ,"own-shadow" ] ,
356
+ type="pil", label="Segmented Sketch" ) #, height=512, width=512)
357
+ in_slider = gr.Slider(elem_classes=["white-bg", "gray-border" , "border-radius" ,"own-shadow" ] ,
358
+ label="Confidence: Adjust AI agent confidence in guessing categories",
359
+ value=0.6 , interactive=True, step=0.05, minimum=0, maximum=1)
360
+
361
+ with gr.Row():
362
+ segment_btn = gr.Button( 'Segment it !' , elem_classes=["white-bg", "gray-border" , "border-radius" ,"own-shadow" , 'bold' , 'mt-mb-1' ] , size="sm")
363
+ segment_btn.click(fn=run, inputs=[in_canvas_image , in_textbox , in_slider ], outputs=[out_image])
364
+ gallery_label = gr.HTML("<h3 class='black-text'> <span class='black-text underline'>Gallery:</span> you can drag and drop any of the example sketches below into the sketch field above </h3>")
365
+
366
+ gallery= gr.HTML(f"""
367
+ <div>
368
+ {gr.Image( elem_classes=["image_gallery"] , label="Sketch", show_download_button=False, show_label=False, type="pil", value='demo/sketch_1.png', height=200, width=200)}
369
+ {gr.Image( elem_classes=["image_gallery"] ,label="Sketch", show_download_button=False, show_label=False, type="pil", value='demo/sketch_2.png', height=200, width=200)}
370
+ {gr.Image( elem_classes=["image_gallery"] ,label="Sketch", show_download_button=False, show_label=False, type="pil", value='demo/sketch_3.png', height=200, width=200)}
371
+ {gr.Image( elem_classes=["image_gallery"] ,label="Sketch", show_download_button=False, show_label=False, type="pil", value='demo/000000004068.png', height=200, width=200)}
372
+ {gr.Image( elem_classes=["image_gallery"] ,label="Sketch", show_download_button=False, show_label=False, type="pil", value='demo/000000004546.png', height=200, width=200)}
373
+ {gr.Image( elem_classes=["image_gallery"] ,label="Sketch", show_download_button=False, show_label=False, type="pil", value='demo/000000005076.png', height=200, width=200)}
374
+ {gr.Image( elem_classes=["image_gallery"] ,label="Sketch", show_download_button=False, show_label=False, type="pil", value='demo/000000006336.png', height=200, width=200)}
375
+ {gr.Image( elem_classes=["image_gallery"] ,label="Sketch", show_download_button=False, show_label=False, type="pil", value='demo/000000011766.png', height=200, width=200)}
376
+ {gr.Image( elem_classes=["image_gallery"] ,label="Sketch", show_download_button=False, show_label=False, type="pil", value='demo/000000024458.png', height=200, width=200)}
377
+ {gr.Image( elem_classes=["image_gallery"] ,label="Sketch", show_download_button=False, show_label=False, type="pil", value='demo/000000024931.png', height=200, width=200)}
378
+ {gr.Image( elem_classes=["image_gallery"] ,label="Sketch", show_download_button=False, show_label=False, type="pil", value='demo/000000034214.png', height=200, width=200)}
379
+ {gr.Image( elem_classes=["image_gallery"] ,label="Sketch", show_download_button=False, show_label=False, type="pil", value='demo/000000260974.png', height=200, width=200)}
380
+ {gr.Image( elem_classes=["image_gallery"] ,label="Sketch", show_download_button=False, show_label=False, type="pil", value='demo/000000268340.png', height=200, width=200)}
381
+ {gr.Image( elem_classes=["image_gallery"] ,label="Sketch", show_download_button=False, show_label=False, type="pil", value='demo/000000305414.png', height=200, width=200)}
382
+ {gr.Image( elem_classes=["image_gallery"] ,label="Sketch", show_download_button=False, show_label=False, type="pil", value='demo/000000484246.png', height=200, width=200)}
383
+ {gr.Image( elem_classes=["image_gallery"] ,label="Sketch", show_download_button=False, show_label=False, type="pil", value='demo/000000549338.png', height=200, width=200)}
384
+ {gr.Image( elem_classes=["image_gallery"] ,label="Sketch", show_download_button=False, show_label=False, type="pil", value='demo/000000038116.png', height=200, width=200)}
385
+ {gr.Image( elem_classes=["image_gallery"] ,label="Sketch", show_download_button=False, show_label=False, type="pil", value='demo/000000221509.png', height=200, width=200)}
386
+ {gr.Image( elem_classes=["image_gallery"] ,label="Sketch", show_download_button=False, show_label=False, type="pil", value='demo/000000246066.png', height=200, width=200)}
387
+ {gr.Image( elem_classes=["image_gallery"] ,label="Sketch", show_download_button=False, show_label=False, type="pil", value='demo/000000001611.png', height=200, width=200)}
388
+ </div>
389
+ """)
390
+
391
+ examples = gr.Examples(
392
+ examples=[
393
+ ['demo/sketch_1.png', 'giraffe looking at you', 0.6],
394
+ ['demo/sketch_2.png', 'tree on the right', 0.6],
395
+ ['demo/sketch_3.png', 'a girl playing', 0.6],
396
+ ['demo/000000004068.png', 'car going so fast', 0.6],
397
+ ['demo/000000004546.png', 'mountains in the background', 0.6],
398
+ ['demo/000000005076.png', 'huge tree', 0.6],
399
+ ['demo/000000006336.png', 'three nice sheep', 0.6],
400
+ ['demo/000000011766.png', 'bird minding its own business', 0.6],
401
+ ['demo/000000024458.png', 'horse with a mask on', 0.6],
402
+ ['demo/000000024931.png', 'some random person', 0.6],
403
+ ['demo/000000034214.png', 'a cool kid on a skateboard', 0.6],
404
+ ['demo/000000260974.png', 'the chair on the left', 0.6],
405
+ ['demo/000000268340.png', 'stop sign', 0.6],
406
+ ['demo/000000305414.png', 'a lonely elephant roaming around', 0.6],
407
+ ['demo/000000484246.png', 'giraffe with a loong neck', 0.6],
408
+ ['demo/000000549338.png', 'two donkeys trying to be smart', 0.6],
409
+ ['demo/000000038116.png', 'a bat on the left', 0.6],
410
+ ['demo/000000221509.png', 'funny looking cow', 0.6],
411
+ ['demo/000000246066.png', 'bench in the park', 0.6],
412
+ ['demo/000000001611.png', 'trees in the background', 0.6]
413
+ ],
414
+ inputs=[in_canvas_image, in_textbox , in_slider],
415
+ fn=run,
416
+ # cache_examples=True,
417
+ )
418
+
419
+
420
+
421
+
422
+
423
+ demo.launch(share=False)
app_old.py ADDED
@@ -0,0 +1,92 @@
1
+
2
+ import gradio as gr
3
+ from PIL import Image
4
+ import torch
5
+ from torchvision.transforms import InterpolationMode
6
+ BICUBIC = InterpolationMode.BICUBIC
7
+ from utils import setup, get_similarity_map, display_segmented_sketch
8
+ from vpt.launch import default_argument_parser
9
+ from collections import OrderedDict
10
+ import numpy as np
11
+ import matplotlib.pyplot as plt
12
+ import models
13
+
14
+ args = default_argument_parser().parse_args()
15
+ cfg = setup(args)
16
+
17
+ device ="cpu"# "cuda" if torch.cuda.is_available() else "cpu"
18
+ Ours, preprocess = models.load("CS-ViT-B/16", device=device,cfg=cfg,train_bool=False)
19
+ state_dict = torch.load("sketch_seg_best_miou.pth", map_location=device)
20
+
21
+ # The checkpoint was trained with DataParallel on 2 GPUs, so strip the "module." prefix before loading on a single device
22
+ new_state_dict = OrderedDict()
23
+ for k, v in state_dict.items():
24
+ name = k[7:] # remove `module.`
25
+ new_state_dict[name] = v
26
+ Ours.load_state_dict(new_state_dict)
27
+ Ours.eval()
28
+ print("Model loaded successfully")
29
+
30
+ def run(sketch, caption, threshold):
31
+
32
+ # set the candidate classes here
33
+ classes = [caption]
34
+
35
+ colors = plt.get_cmap("tab10").colors
36
+ classes_colors = colors[2:len(classes)+2]
37
+
38
+ sketch = sketch['composite']
39
+ sketch = np.array(sketch)
40
+
41
+ pil_img = Image.fromarray(sketch).convert('RGB')
42
+ sketch_tensor = preprocess(pil_img).unsqueeze(0).to(device)
43
+
44
+ with torch.no_grad():
45
+ text_features = models.encode_text_with_prompt_ensemble(Ours, classes, device,no_module=True)
46
+ redundant_features = models.encode_text_with_prompt_ensemble(Ours, [""], device,no_module=True)
47
+
48
+ num_of_tokens = 3
49
+ with torch.no_grad():
50
+ sketch_features = Ours.encode_image(sketch_tensor,layers=[12],text_features=text_features-redundant_features,mode="test").squeeze(0)
51
+ sketch_features = sketch_features / sketch_features.norm(dim=1, keepdim=True)
52
+ similarity = sketch_features @ (text_features - redundant_features).t()
53
+ patches_similarity = similarity[0, num_of_tokens +1:, :]
54
+ pixel_similarity = get_similarity_map(patches_similarity.unsqueeze(0),pil_img.size).cpu()
55
+ # visualize_attention_maps_with_tokens(pixel_similarity, classes)
56
+ pixel_similarity[pixel_similarity<threshold] = 0
57
+ pixel_similarity_array = pixel_similarity.cpu().numpy().transpose(2,0,1)
58
+
59
+ display_segmented_sketch(pixel_similarity_array,sketch,classes,classes_colors,live=True)
60
+
61
+ rgb_image = Image.open('output.png')
62
+
63
+ return rgb_image
64
+
65
+
66
+ css=".gradio-container {background-color: black}"
67
+
68
+ demo = gr.Interface(
69
+ fn=run,
70
+ # js=js,
71
+ css=css,
72
+ theme="gstaff/sketch", #xkcd
73
+ description='Upload a sketch and find objects.'\
74
+ ' Check the example runs further down the page.',
75
+ inputs=[
76
+ gr.ImageEditor(
77
+ label="Sketch", type="pil",sources="upload"),
78
+
79
+ gr.Textbox(label="Caption", placeholder="Describe which objects to segment"),
80
+ gr.Slider(label="Threshold", value=0.6, step=0.05, minimum=0, maximum=1),
81
+ ],
82
+ outputs=[gr.Image(type="pil", label="Segmented Sketch") ],
83
+ allow_flagging=False,
84
+ examples=[
85
+ ['demo/sketch_1.png', 'giraffe standing', 0.6],
86
+ ['demo/sketch_2.png', 'tree', 0.6],
87
+ ['demo/sketch_3.png', 'person', 0.6],
88
+ ],
89
+ title="Scene Sketch Semantic Segmentation")
90
+
91
+ if __name__ == "__main__":
92
+ demo.launch()
demo/000000001611.png ADDED
demo/000000004068.png ADDED
demo/000000004546.png ADDED
demo/000000005076.png ADDED
demo/000000006336.png ADDED
demo/000000011766.png ADDED
demo/000000024458.png ADDED
demo/000000024931.png ADDED
demo/000000034214.png ADDED
demo/000000038116.png ADDED
demo/000000045280.png ADDED
demo/000000221509.png ADDED
demo/000000246066.png ADDED
demo/000000260974.png ADDED
demo/000000268340.png ADDED
demo/000000305414.png ADDED
demo/000000406874.png ADDED
demo/000000484246.png ADDED
demo/000000549338.png ADDED
demo/sketch_1.png ADDED
demo/sketch_2.png ADDED
demo/sketch_3.png ADDED
models/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .clip import *
models/auxilary.py ADDED
@@ -0,0 +1,449 @@
1
+ import torch
2
+ import warnings
3
+ from typing import Tuple, Optional
4
+
5
+ import torch
6
+ from torch import Tensor
7
+ from torch.nn.init import xavier_uniform_
8
+ from torch.nn.init import constant_
9
+ from torch.nn.init import xavier_normal_
10
+ from torch.nn.parameter import Parameter
11
+ from torch.nn import functional as F
12
+
13
+ # Local alias for F.pad. (The upstream PyTorch helper is named _pad because it takes an
14
+ # argument named pad, which would otherwise clobber the recursive reference to the pad
15
+ # function needed for __torch_function__ support; here it is simply aliased.)
16
+ pad = F.pad
17
+
18
+ # This class exists solely for Transformer; it has an annotation stating
19
+ # that bias is never None, which appeases TorchScript
20
+ class _LinearWithBias(torch.nn.Linear):
21
+ bias: Tensor
22
+
23
+ def __init__(self, in_features: int, out_features: int) -> None:
24
+ super().__init__(in_features, out_features, bias=True)
25
+
26
+ def multi_head_attention_forward(query: Tensor,
27
+ key: Tensor,
28
+ value: Tensor,
29
+ embed_dim_to_check: int,
30
+ num_heads: int,
31
+ in_proj_weight: Tensor,
32
+ in_proj_bias: Tensor,
33
+ bias_k: Optional[Tensor],
34
+ bias_v: Optional[Tensor],
35
+ add_zero_attn: bool,
36
+ dropout_p: float,
37
+ out_proj_weight: Tensor,
38
+ out_proj_bias: Tensor,
39
+ training: bool = True,
40
+ key_padding_mask: Optional[Tensor] = None,
41
+ need_weights: bool = True,
42
+ attn_mask: Optional[Tensor] = None,
43
+ use_separate_proj_weight: bool = False,
44
+ q_proj_weight: Optional[Tensor] = None,
45
+ k_proj_weight: Optional[Tensor] = None,
46
+ v_proj_weight: Optional[Tensor] = None,
47
+ static_k: Optional[Tensor] = None,
48
+ static_v: Optional[Tensor] = None,
49
+ attention_probs_forward_hook = None,
50
+ attention_probs_backwards_hook = None,
51
+ attention_keys_forward_hook = None,
52
+ ) -> Tuple[Tensor, Optional[Tensor]]:
53
+ if not torch.jit.is_scripting():
54
+ tens_ops = (query, key, value, in_proj_weight, in_proj_bias, bias_k, bias_v,
55
+ out_proj_weight, out_proj_bias)
56
+ if any([type(t) is not Tensor for t in tens_ops]) and F.has_torch_function(tens_ops):
57
+ return F.handle_torch_function(
58
+ multi_head_attention_forward, tens_ops, query, key, value,
59
+ embed_dim_to_check, num_heads, in_proj_weight, in_proj_bias,
60
+ bias_k, bias_v, add_zero_attn, dropout_p, out_proj_weight,
61
+ out_proj_bias, training=training, key_padding_mask=key_padding_mask,
62
+ need_weights=need_weights, attn_mask=attn_mask,
63
+ use_separate_proj_weight=use_separate_proj_weight,
64
+ q_proj_weight=q_proj_weight, k_proj_weight=k_proj_weight,
65
+ v_proj_weight=v_proj_weight, static_k=static_k, static_v=static_v)
66
+ tgt_len, bsz, embed_dim = query.size()
67
+ assert embed_dim == embed_dim_to_check
68
+ # allow MHA to have different sizes for the feature dimension
69
+ assert key.size(0) == value.size(0) and key.size(1) == value.size(1)
70
+
71
+ head_dim = embed_dim // num_heads
72
+ assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
73
+ scaling = float(head_dim) ** -0.5
74
+
75
+ if not use_separate_proj_weight:
76
+ if torch.equal(query, key) and torch.equal(key, value):
77
+ # self-attention
78
+ q, k, v = F.linear(query, in_proj_weight, in_proj_bias).chunk(3, dim=-1)
79
+
80
+ elif torch.equal(key, value):
81
+ # encoder-decoder attention
82
+ # This is inline in_proj function with in_proj_weight and in_proj_bias
83
+ _b = in_proj_bias
84
+ _start = 0
85
+ _end = embed_dim
86
+ _w = in_proj_weight[_start:_end, :]
87
+ if _b is not None:
88
+ _b = _b[_start:_end]
89
+ q = F.linear(query, _w, _b)
90
+
91
+ if key is None:
92
+ assert value is None
93
+ k = None
94
+ v = None
95
+ else:
96
+
97
+ # This is inline in_proj function with in_proj_weight and in_proj_bias
98
+ _b = in_proj_bias
99
+ _start = embed_dim
100
+ _end = None
101
+ _w = in_proj_weight[_start:, :]
102
+ if _b is not None:
103
+ _b = _b[_start:]
104
+ k, v = F.linear(key, _w, _b).chunk(2, dim=-1)
105
+
106
+ else:
107
+ # This is inline in_proj function with in_proj_weight and in_proj_bias
108
+ _b = in_proj_bias
109
+ _start = 0
110
+ _end = embed_dim
111
+ _w = in_proj_weight[_start:_end, :]
112
+ if _b is not None:
113
+ _b = _b[_start:_end]
114
+ q = F.linear(query, _w, _b)
115
+
116
+ # This is inline in_proj function with in_proj_weight and in_proj_bias
117
+ _b = in_proj_bias
118
+ _start = embed_dim
119
+ _end = embed_dim * 2
120
+ _w = in_proj_weight[_start:_end, :]
121
+ if _b is not None:
122
+ _b = _b[_start:_end]
123
+ k = F.linear(key, _w, _b)
124
+
125
+ # This is inline in_proj function with in_proj_weight and in_proj_bias
126
+ _b = in_proj_bias
127
+ _start = embed_dim * 2
128
+ _end = None
129
+ _w = in_proj_weight[_start:, :]
130
+ if _b is not None:
131
+ _b = _b[_start:]
132
+ v = F.linear(value, _w, _b)
133
+ else:
134
+ q_proj_weight_non_opt = torch.jit._unwrap_optional(q_proj_weight)
135
+ len1, len2 = q_proj_weight_non_opt.size()
136
+ assert len1 == embed_dim and len2 == query.size(-1)
137
+
138
+ k_proj_weight_non_opt = torch.jit._unwrap_optional(k_proj_weight)
139
+ len1, len2 = k_proj_weight_non_opt.size()
140
+ assert len1 == embed_dim and len2 == key.size(-1)
141
+
142
+ v_proj_weight_non_opt = torch.jit._unwrap_optional(v_proj_weight)
143
+ len1, len2 = v_proj_weight_non_opt.size()
144
+ assert len1 == embed_dim and len2 == value.size(-1)
145
+
146
+ if in_proj_bias is not None:
147
+ q = F.linear(query, q_proj_weight_non_opt, in_proj_bias[0:embed_dim])
148
+ k = F.linear(key, k_proj_weight_non_opt, in_proj_bias[embed_dim:(embed_dim * 2)])
149
+ v = F.linear(value, v_proj_weight_non_opt, in_proj_bias[(embed_dim * 2):])
150
+ else:
151
+ q = F.linear(query, q_proj_weight_non_opt, in_proj_bias)
152
+ k = F.linear(key, k_proj_weight_non_opt, in_proj_bias)
153
+ v = F.linear(value, v_proj_weight_non_opt, in_proj_bias)
154
+ q = q * scaling
155
+
156
+ if attn_mask is not None:
157
+ assert attn_mask.dtype == torch.float32 or attn_mask.dtype == torch.float64 or \
158
+ attn_mask.dtype == torch.float16 or attn_mask.dtype == torch.uint8 or attn_mask.dtype == torch.bool, \
159
+ 'Only float, byte, and bool types are supported for attn_mask, not {}'.format(attn_mask.dtype)
160
+ if attn_mask.dtype == torch.uint8:
161
+ warnings.warn("Byte tensor for attn_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
162
+ attn_mask = attn_mask.to(torch.bool)
163
+
164
+ if attn_mask.dim() == 2:
165
+ attn_mask = attn_mask.unsqueeze(0)
166
+ if list(attn_mask.size()) != [1, query.size(0), key.size(0)]:
167
+ raise RuntimeError('The size of the 2D attn_mask is not correct.')
168
+ elif attn_mask.dim() == 3:
169
+ if list(attn_mask.size()) != [bsz * num_heads, query.size(0), key.size(0)]:
170
+ raise RuntimeError('The size of the 3D attn_mask is not correct.')
171
+ else:
172
+ raise RuntimeError("attn_mask's dimension {} is not supported".format(attn_mask.dim()))
173
+ # attn_mask's dim is 3 now.
174
+
175
+ # convert ByteTensor key_padding_mask to bool
176
+ if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8:
177
+ warnings.warn("Byte tensor for key_padding_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
178
+ key_padding_mask = key_padding_mask.to(torch.bool)
179
+
180
+ if bias_k is not None and bias_v is not None:
181
+ if static_k is None and static_v is None:
182
+ k = torch.cat([k, bias_k.repeat(1, bsz, 1)])
183
+ v = torch.cat([v, bias_v.repeat(1, bsz, 1)])
184
+ if attn_mask is not None:
185
+ attn_mask = pad(attn_mask, (0, 1))
186
+ if key_padding_mask is not None:
187
+ key_padding_mask = pad(key_padding_mask, (0, 1))
188
+ else:
189
+ assert static_k is None, "bias cannot be added to static key."
190
+ assert static_v is None, "bias cannot be added to static value."
191
+ else:
192
+ assert bias_k is None
193
+ assert bias_v is None
194
+
195
+ if attention_keys_forward_hook is not None:
196
+ # print("from auxilary, k", k.shape)
197
+ attention_keys_forward_hook(k)
198
+ # k shape is [50, 5, 768]
199
+ q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
200
+ if k is not None:
201
+ k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
202
+ if v is not None:
203
+ v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
204
+ # k [60, 50, 64]
205
+
206
+ if static_k is not None:
207
+ assert static_k.size(0) == bsz * num_heads
208
+ assert static_k.size(2) == head_dim
209
+ k = static_k
210
+
211
+ if static_v is not None:
212
+ assert static_v.size(0) == bsz * num_heads
213
+ assert static_v.size(2) == head_dim
214
+ v = static_v
215
+
216
+ src_len = k.size(1)
217
+
218
+ if key_padding_mask is not None:
219
+ assert key_padding_mask.size(0) == bsz
220
+ assert key_padding_mask.size(1) == src_len
221
+
222
+ if add_zero_attn:
223
+ src_len += 1
224
+ k = torch.cat([k, torch.zeros((k.size(0), 1) + k.size()[2:], dtype=k.dtype, device=k.device)], dim=1)
225
+ v = torch.cat([v, torch.zeros((v.size(0), 1) + v.size()[2:], dtype=v.dtype, device=v.device)], dim=1)
226
+ if attn_mask is not None:
227
+ attn_mask = pad(attn_mask, (0, 1))
228
+ if key_padding_mask is not None:
229
+ key_padding_mask = pad(key_padding_mask, (0, 1))
230
+
231
+
232
+ attn_output_weights = torch.bmm(q, k.transpose(1, 2))
233
+ # q [60, 50, 64]
234
+ # k [60, 50, 64] k trans [60, 64, 50]
235
+ # attn_output_weights [60, 50, 50]
236
+
237
+ assert list(attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len]
238
+
239
+ if attn_mask is not None:
240
+ if attn_mask.dtype == torch.bool:
241
+ attn_output_weights.masked_fill_(attn_mask, float('-inf'))
242
+ else:
243
+ attn_output_weights += attn_mask
244
+
245
+ if key_padding_mask is not None:
246
+ attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
247
+ attn_output_weights = attn_output_weights.masked_fill(
248
+ key_padding_mask.unsqueeze(1).unsqueeze(2),
249
+ float('-inf'),
250
+ )
251
+ attn_output_weights = attn_output_weights.view(bsz * num_heads, tgt_len, src_len)
252
+
253
+ attn_output_weights = F.softmax(
254
+ attn_output_weights, dim=-1)
255
+ attn_output_weights = F.dropout(attn_output_weights, p=dropout_p, training=training)
256
+
257
+ # if attn_mask is not None:
258
+ # attn_mask_c = attn_mask.clone()
259
+ # attn_mask_c[:,0,:] = attn_mask[:,1,:]
260
+ # attn_mask_c[:,:,0] = attn_mask[:,:,1]
261
+ # attn_mask_c[:,0,0] = False
262
+ # attn_output_weights = attn_output_weights.masked_fill(attn_mask_c, 0)# *= (1 - attn_mask.half())
263
+ # print("attn_output_weights")
264
+ # print(attn_output_weights[0,8])
265
+ # print(attn_output_weights[0,:,8])
266
+ # use hooks for the attention weights if necessary
267
+ if attention_probs_forward_hook is not None and attention_probs_backwards_hook is not None:
268
+ attention_probs_forward_hook(attn_output_weights)
269
+ attn_output_weights.register_hook(attention_probs_backwards_hook)
270
+
271
+ # v shape [60, 50, 64], attn_output_weights [60, 50, 50]
272
+ attn_output = torch.bmm(attn_output_weights, v)
273
+ # attn_output", [60, 50, 64]
274
+
275
+ assert list(attn_output.size()) == [bsz * num_heads, tgt_len, head_dim]
276
+ # attn_output before [60, 50, 64]
277
+ attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
278
+ # attn_output [50, 5, 768]
279
+ attn_output = F.linear(attn_output, out_proj_weight, out_proj_bias)
280
+ # attn_output [50, 5, 768]
281
+ if need_weights:
282
+ # average attention weights over heads
283
+ attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
284
+ return attn_output, attn_output_weights.sum(dim=1) / num_heads
285
+ else:
286
+ return attn_output
287
+
288
+
289
+ class MultiheadAttention(torch.nn.Module):
290
+ r"""Allows the model to jointly attend to information
291
+ from different representation subspaces.
292
+ See reference: Attention Is All You Need
293
+
294
+ .. math::
295
+ \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
296
+ \text{where} head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)
297
+
298
+ Args:
299
+ embed_dim: total dimension of the model.
300
+ num_heads: parallel attention heads.
301
+ dropout: a Dropout layer on attn_output_weights. Default: 0.0.
302
+ bias: add bias as module parameter. Default: True.
303
+ add_bias_kv: add bias to the key and value sequences at dim=0.
304
+ add_zero_attn: add a new batch of zeros to the key and
305
+ value sequences at dim=1.
306
+ kdim: total number of features in key. Default: None.
307
+ vdim: total number of features in value. Default: None.
308
+
309
+ Note: if kdim and vdim are None, they will be set to embed_dim such that
310
+ query, key, and value have the same number of features.
311
+
312
+ Examples::
313
+
314
+ >>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
315
+ >>> attn_output, attn_output_weights = multihead_attn(query, key, value)
316
+ """
317
+ bias_k: Optional[torch.Tensor]
318
+ bias_v: Optional[torch.Tensor]
319
+
320
+ def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None, vdim=None):
321
+ super(MultiheadAttention, self).__init__()
322
+ self.embed_dim = embed_dim
323
+ self.kdim = kdim if kdim is not None else embed_dim
324
+ self.vdim = vdim if vdim is not None else embed_dim
325
+ self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
326
+
327
+ self.num_heads = num_heads
328
+ self.dropout = dropout
329
+ self.head_dim = embed_dim // num_heads
330
+ assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
331
+
332
+ if self._qkv_same_embed_dim is False:
333
+ self.q_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim))
334
+ self.k_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim))
335
+ self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim))
336
+ self.register_parameter('in_proj_weight', None)
337
+ else:
338
+ self.in_proj_weight = Parameter(torch.empty(3 * embed_dim, embed_dim))
339
+ self.register_parameter('q_proj_weight', None)
340
+ self.register_parameter('k_proj_weight', None)
341
+ self.register_parameter('v_proj_weight', None)
342
+
343
+ if bias:
344
+ self.in_proj_bias = Parameter(torch.empty(3 * embed_dim))
345
+ else:
346
+ self.register_parameter('in_proj_bias', None)
347
+ self.out_proj = _LinearWithBias(embed_dim, embed_dim)
348
+
349
+ if add_bias_kv:
350
+ self.bias_k = Parameter(torch.empty(1, 1, embed_dim))
351
+ self.bias_v = Parameter(torch.empty(1, 1, embed_dim))
352
+ else:
353
+ self.bias_k = self.bias_v = None
354
+
355
+ self.add_zero_attn = add_zero_attn
356
+
357
+ self._reset_parameters()
358
+
359
+ def _reset_parameters(self):
360
+ if self._qkv_same_embed_dim:
361
+ xavier_uniform_(self.in_proj_weight)
362
+ else:
363
+ xavier_uniform_(self.q_proj_weight)
364
+ xavier_uniform_(self.k_proj_weight)
365
+ xavier_uniform_(self.v_proj_weight)
366
+
367
+ if self.in_proj_bias is not None:
368
+ constant_(self.in_proj_bias, 0.)
369
+ constant_(self.out_proj.bias, 0.)
370
+ if self.bias_k is not None:
371
+ xavier_normal_(self.bias_k)
372
+ if self.bias_v is not None:
373
+ xavier_normal_(self.bias_v)
374
+
375
+ def __setstate__(self, state):
376
+ # Support loading old MultiheadAttention checkpoints generated by v1.1.0
377
+ if '_qkv_same_embed_dim' not in state:
378
+ state['_qkv_same_embed_dim'] = True
379
+
380
+ super(MultiheadAttention, self).__setstate__(state)
381
+
382
+ def forward(self, query, key, value, key_padding_mask=None,
383
+ need_weights=True, attn_mask=None, attention_probs_forward_hook=None,
384
+ attention_probs_backwards_hook=None, attention_keys_forward_hook=None):
385
+ r"""
386
+ Args:
387
+ query, key, value: map a query and a set of key-value pairs to an output.
388
+ See "Attention Is All You Need" for more details.
389
+ key_padding_mask: if provided, specified padding elements in the key will
390
+ be ignored by the attention. When given a binary mask and a value is True,
391
+ the corresponding value on the attention layer will be ignored. When given
392
+ a byte mask and a value is non-zero, the corresponding value on the attention
393
+ layer will be ignored
394
+ need_weights: output attn_output_weights.
395
+ attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
396
+ the batches while a 3D mask allows to specify a different mask for the entries of each batch.
397
+
398
+ Shape:
399
+ - Inputs:
400
+ - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
401
+ the embedding dimension.
402
+ - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
403
+ the embedding dimension.
404
+ - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
405
+ the embedding dimension.
406
+ - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
407
+ If a ByteTensor is provided, the non-zero positions will be ignored while the position
408
+ with the zero positions will be unchanged. If a BoolTensor is provided, the positions with the
409
+ value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
410
+ - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
411
+ 3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,
412
+ S is the source sequence length. attn_mask ensure that position i is allowed to attend the unmasked
413
+ positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
414
+ while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
415
+ is not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
416
+ is provided, it will be added to the attention weight.
417
+
418
+ - Outputs:
419
+ - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
420
+ E is the embedding dimension.
421
+ - attn_output_weights: :math:`(N, L, S)` where N is the batch size,
422
+ L is the target sequence length, S is the source sequence length.
423
+ """
424
+ if not self._qkv_same_embed_dim:
425
+ return multi_head_attention_forward(
426
+ query, key, value, self.embed_dim, self.num_heads,
427
+ self.in_proj_weight, self.in_proj_bias,
428
+ self.bias_k, self.bias_v, self.add_zero_attn,
429
+ self.dropout, self.out_proj.weight, self.out_proj.bias,
430
+ training=self.training,
431
+ key_padding_mask=key_padding_mask, need_weights=need_weights,
432
+ attn_mask=attn_mask, use_separate_proj_weight=True,
433
+ q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight,
434
+ v_proj_weight=self.v_proj_weight,
435
+ attention_probs_forward_hook=attention_probs_forward_hook,
436
+ attention_probs_backwards_hook=attention_probs_backwards_hook,
437
+ attention_keys_forward_hook=attention_keys_forward_hook)
438
+ else:
439
+ return multi_head_attention_forward(
440
+ query, key, value, self.embed_dim, self.num_heads,
441
+ self.in_proj_weight, self.in_proj_bias,
442
+ self.bias_k, self.bias_v, self.add_zero_attn,
443
+ self.dropout, self.out_proj.weight, self.out_proj.bias,
444
+ training=self.training,
445
+ key_padding_mask=key_padding_mask, need_weights=need_weights,
446
+ attn_mask=attn_mask,
447
+ attention_probs_forward_hook=attention_probs_forward_hook,
448
+ attention_probs_backwards_hook=attention_probs_backwards_hook,
449
+ attention_keys_forward_hook=attention_keys_forward_hook)
models/bpe_simple_vocab_16e6.txt.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a
3
+ size 1356917
models/build_model.py ADDED
@@ -0,0 +1,83 @@
1
+ from torch import nn
2
+ from .clip_model import CLIP
3
+ from .our_model import ModifiedCLIPSurgery
4
+
5
+
6
+ def convert_weights(model: nn.Module):
7
+ """Convert applicable model parameters to fp16"""
8
+
9
+ def _convert_weights_to_fp16(l):
10
+ if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
11
+ l.weight.data = l.weight.data.half()
12
+ if l.bias is not None:
13
+ l.bias.data = l.bias.data.half()
14
+
15
+ if isinstance(l, nn.MultiheadAttention):
16
+ for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
17
+ tensor = getattr(l, attr)
18
+ if tensor is not None:
19
+ tensor.data = tensor.data.half()
20
+
21
+ for name in ["text_projection", "proj"]:
22
+ if hasattr(l, name):
23
+ attr = getattr(l, name)
24
+ if attr is not None:
25
+ attr.data = attr.data.half()
26
+
27
+ model.apply(_convert_weights_to_fp16)
28
+
29
+
30
+ def build_model(name: str, state_dict: dict,cfg: dict,train_bool: bool):
31
+ vit = "visual.proj" in state_dict
32
+
33
+ if vit:
34
+ vision_width = state_dict["visual.conv1.weight"].shape[0]
35
+ vision_layers = len([k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")])
36
+ vision_patch_size = state_dict["visual.conv1.weight"].shape[-1]
37
+ grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5)
38
+ image_resolution = vision_patch_size * grid_size
39
+ else:
40
+ counts: list = [len(set(k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{b}"))) for b in [1, 2, 3, 4]]
41
+ vision_layers = tuple(counts)
42
+ vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
43
+ output_width = round((state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5)
44
+ vision_patch_size = None
45
+ assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0]
46
+ image_resolution = output_width * 32
47
+
48
+ embed_dim = state_dict["text_projection"].shape[1]
49
+ context_length = state_dict["positional_embedding"].shape[0]
50
+ vocab_size = state_dict["token_embedding.weight"].shape[0]
51
+ transformer_width = state_dict["ln_final.weight"].shape[0]
52
+ transformer_heads = transformer_width // 64
53
+ transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith(f"transformer.resblocks")))
54
+
55
+ if 'CS-' in name:
56
+ model = ModifiedCLIPSurgery(
57
+ embed_dim,
58
+ image_resolution, vision_layers, vision_width, vision_patch_size,
59
+ context_length, vocab_size, transformer_width, transformer_heads, transformer_layers,cfg,train_bool
60
+ )
61
+ else:
62
+ model = CLIP(
63
+ embed_dim,
64
+ image_resolution, vision_layers, vision_width, vision_patch_size,
65
+ context_length, vocab_size, transformer_width, transformer_heads, transformer_layers
66
+ )
67
+
68
+ for key in ["input_resolution", "context_length", "vocab_size"]:
69
+ if key in state_dict:
70
+ del state_dict[key]
71
+
72
+ model.load_state_dict(state_dict,strict=False)
73
+
74
+ if not cfg.ft_all:
75
+ train_params_list= cfg.MODEL.PROMPT.TRAINABLE_PARM.split(',')
76
+ for name, param in model.named_parameters():
77
+ param.requires_grad = any(str(t_param) in name for t_param in train_params_list)
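+ # only parameters whose names match an entry of cfg.MODEL.PROMPT.TRAINABLE_PARM remain trainable; everything else is frozen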
78
+
79
+ for name, param in model.named_parameters():
80
+ if "visual" not in name:
81
+ param.requires_grad = False
82
+
83
+ return model
models/ca.py ADDED
@@ -0,0 +1,165 @@
1
+ """
2
+
3
+ This code is borrowed from https://github.com/buptLinfy/ZSE-SBIR
4
+
5
+ """
6
+
7
+ import math
8
+ import copy
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+
14
+
15
+ def clones(module, N):
16
+ return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
17
+
18
+
19
+ class LayerNorm(nn.Module):
20
+ def __init__(self, features, eps=1e-6):
21
+ super(LayerNorm, self).__init__()
22
+ self.a = nn.Parameter(torch.ones(features))
23
+ self.b = nn.Parameter(torch.zeros(features))
24
+ self.eps = eps
25
+
26
+ def forward(self, x):
27
+ mean = x.mean(-1, keepdim=True)
28
+ std = x.std(-1, keepdim=True)
29
+ return self.a * (x - mean) / (std + self.eps) + self.b
30
+
31
+
32
+ class AddAndNorm(nn.Module):
33
+
34
+ def __init__(self, size, dropout):
35
+ super(AddAndNorm, self).__init__()
36
+ self.norm = LayerNorm(size)
37
+ self.dropout = nn.Dropout(dropout)
38
+
39
+ def forward(self, x, y):
40
+ return self.norm(x + self.dropout(y))
41
+
42
+
43
+ class EncoderLayer(nn.Module):
44
+ "Encoder is made up of self-attn and feed forward (defined below)"
45
+
46
+ def __init__(self, size, self_attn, feed_forward, dropout):
47
+ super(EncoderLayer, self).__init__()
48
+ self.self_attn = self_attn
49
+ self.feed_forward = feed_forward
50
+ self.sublayer = clones(AddAndNorm(size, dropout), 2)
51
+ self.size = size
52
+
53
+ def forward(self, q, k, v, mask):
54
+ x = self.sublayer[0](v, self.self_attn(q, k, v, mask))
55
+ x = self.sublayer[1](x, self.feed_forward(x))
56
+ return x
57
+
58
+
59
+ class Encoder(nn.Module):
60
+
61
+ def __init__(self, layer, N):
62
+ super(Encoder, self).__init__()
63
+ self.layers = clones(layer, N)
64
+ self.layer1 = clones(layer, N)
65
+ self.layer2 = clones(layer, N)
66
+
67
+ def forward(self, x_im, x_text, mask):
68
+ for layer1, layer2 in zip(self.layer1, self.layer2):
69
+ # exchange Q here
70
+ # layer1 processes sk
71
+ # x_text1 = layer1(x_text, x_im, x_text, mask)
72
+ # layer2 processes im
73
+ x_im = layer2(x_im, x_text, x_im, mask)
74
+ # x_sk = x_text1
75
+ return x_im
76
+
77
+ def attention(query, key, value, dropout=None, mask=None, pos=None):
78
+ """
79
+ dk = dv = dmodel/h = 64,h=8
80
+ """
81
+ d_k = query.size(-1)
82
+ scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
83
+ if mask is not None:
84
+ scores = scores.masked_fill(mask == 0, -1e9)
85
+
86
+ p_attn = F.softmax(scores, dim=-1)
87
+ if dropout is not None:
88
+ p_attn = dropout(p_attn)
89
+
90
+ return torch.matmul(p_attn, value), p_attn
91
+
92
+
93
+ class MultiHeadedAttention(nn.Module):
94
+ def __init__(self, h, d_model, dropout=0.1):
95
+ "Take in model size and number of heads."
96
+ super(MultiHeadedAttention, self).__init__()
97
+ assert d_model % h == 0
98
+ # We assume d_v always equals d_k
99
+ self.d_k = d_model // h
100
+ self.h = h
101
+ self.linears = clones(nn.Linear(d_model, d_model), 4)
102
+ self.attn = None
103
+ self.dropout = nn.Dropout(p=dropout)
104
+
105
+ def forward(self, query, key, value, mask=None):
106
+ """
107
+
108
+ :param query: size(batch,seq,512)
109
+ :param key:
110
+ :param value:
111
+ :param mask:
112
+ :return:
113
+ """
114
+ if mask is not None:
115
+ # Same mask applied to all h heads.
116
+ mask = mask.unsqueeze(1)
117
+ nbatches = query.size(0)
118
+
119
+ # 1) Do all the linear projections in batch from d_model => h x d_k
120
+ # size(batch,h,seq,dk)
121
+ query, key, value = \
122
+ [lin(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
123
+ for lin, x in zip(self.linears, (query, key, value))]
124
+
125
+ # 2) Apply attention on all the projected vectors in batch.
126
+ x, self.attn = attention(query, key, value, mask=mask,
127
+ dropout=self.dropout)
128
+
129
+ # 3) "Concat" using a view and apply a final linear.
130
+ x = x.transpose(1, 2).contiguous() \
131
+ .view(nbatches, -1, self.h * self.d_k)
132
+
133
+ return self.linears[-1](x)
134
+
135
+
136
+ class PositionwiseFeedForward(nn.Module):
137
+ """
138
+ d_model = 512
139
+ d_ff = 2048 (the value used in the paper)
140
+ """
141
+
142
+ def __init__(self, d_model, d_ff, dropout=0.1):
143
+ super(PositionwiseFeedForward, self).__init__()
144
+ self.w_1 = nn.Linear(d_model, d_ff)
145
+ self.w_2 = nn.Linear(d_ff, d_model)
146
+ self.dropout = nn.Dropout(dropout)
147
+
148
+ def forward(self, x):
149
+ return self.w_2(self.dropout(F.relu(self.w_1(x))))
150
+
151
+
152
+ class Cross_Attention(nn.Module):
153
+ def __init__(self, h=8, n=1, d_model=768, d_ff=1024, dropout=0.1): #(self, args, h=8, n=1, d_model=768, d_ff=1024, dropout=0.1):
154
+ super(Cross_Attention, self).__init__()
155
+ multi_head_attention = MultiHeadedAttention(h, d_model)
156
+ ffn = PositionwiseFeedForward(d_model, d_ff, dropout)
157
+ encoderLayer = EncoderLayer(d_model, multi_head_attention, ffn, dropout)
158
+ self.encoder = Encoder(encoderLayer, n)
159
+ self.text_projection = nn.Linear(512, d_model)
160
+
161
+ def forward(self, x_patch,x_text):
162
+ length = x_text.shape[0]
163
+ x_text = self.text_projection(x_text)
164
+ x_sketch = self.encoder(x_patch, x_text, None)  # no mask
165
+ return x_sketch
models/clip.py ADDED
@@ -0,0 +1,357 @@
1
+ import hashlib
2
+ import os
3
+ import urllib
4
+ import warnings
5
+ from typing import Union, List
6
+ from pkg_resources import packaging
7
+
8
+ import torch
9
+ from PIL import Image
10
+ from torchvision.transforms import Compose, Resize, ToTensor, Normalize
11
+ from tqdm import tqdm
12
+ import numpy as np
13
+
14
+ from .build_model import build_model
15
+ from .simple_tokenizer import SimpleTokenizer as _Tokenizer
16
+
17
+ from fvcore.common.config import CfgNode
18
+
19
+ try:
20
+ from torchvision.transforms import InterpolationMode
21
+ BICUBIC = InterpolationMode.BICUBIC
22
+ except ImportError:
23
+ BICUBIC = Image.BICUBIC
24
+
25
+
26
+ if packaging.version.parse(torch.__version__) < packaging.version.parse("1.7.1"):
27
+ warnings.warn("PyTorch version 1.7.1 or higher is recommended")
28
+
29
+
30
+ __all__ = ["available_models", "load", "tokenize", "encode_text_with_prompt_ensemble",
31
+ "get_similarity_map", "clip_feature_surgery", "similarity_map_to_points"]
32
+ _tokenizer = _Tokenizer()
33
+
34
+ _MODELS = {
35
+ "RN50": "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt",
36
+ "RN101": "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt",
37
+ "RN50x4": "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt",
38
+ "RN50x16": "https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/RN50x16.pt",
39
+ "RN50x64": "https://openaipublic.azureedge.net/clip/models/be1cfb55d75a9666199fb2206c106743da0f6468c9d327f3e0d0a543a9919d9c/RN50x64.pt",
40
+ "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
41
+ "ViT-B/16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt",
42
+ "ViT-L/14": "https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt",
43
+ "ViT-L/14@336px": "https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt",
44
+ "CS-RN50": "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt",
45
+ "CS-RN101": "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt",
46
+ "CS-RN50x4": "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt",
47
+ "CS-RN50x16": "https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/RN50x16.pt",
48
+ "CS-RN50x64": "https://openaipublic.azureedge.net/clip/models/be1cfb55d75a9666199fb2206c106743da0f6468c9d327f3e0d0a543a9919d9c/RN50x64.pt",
49
+ "CS-ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
50
+ "CS-ViT-B/16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt",
51
+ "CS-ViT-L/14": "https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt",
52
+ "CS-ViT-L/14@336px": "https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt",
53
+ }
54
+
55
+
56
+ def _download(url: str, root: str):
57
+ os.makedirs(root, exist_ok=True)
58
+ filename = os.path.basename(url)
59
+
60
+ expected_sha256 = url.split("/")[-2]
61
+ download_target = os.path.join(root, filename)
62
+
63
+ if os.path.exists(download_target) and not os.path.isfile(download_target):
64
+ raise RuntimeError(f"{download_target} exists and is not a regular file")
65
+
66
+ if os.path.isfile(download_target):
67
+ if hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256:
68
+ return download_target
69
+ else:
70
+ warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
71
+
72
+ with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
73
+ with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True, unit_divisor=1024) as loop:
74
+ while True:
75
+ buffer = source.read(8192)
76
+ if not buffer:
77
+ break
78
+
79
+ output.write(buffer)
80
+ loop.update(len(buffer))
81
+
82
+ if hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256:
83
+ raise RuntimeError(f"Model has been downloaded but the SHA256 checksum does not not match")
84
+
85
+ return download_target
86
+
87
+
88
+ def _convert_image_to_rgb(image):
89
+ return image.convert("RGB")
90
+
91
+
92
+ def _transform(n_px):
93
+ return Compose([
94
+ Resize((n_px, n_px), interpolation=BICUBIC),
95
+ #CenterCrop(n_px), # rm center crop to explain whole image
96
+ _convert_image_to_rgb,
97
+ ToTensor(),
98
+ Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
99
+ ])
100
+
101
+
102
+ def available_models() -> List[str]:
103
+ """Returns the names of available CLIP models"""
104
+ return list(_MODELS.keys())
105
+
106
+
107
+ def load(name: str, device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", jit: bool = False, download_root: str = None, cfg: CfgNode = None, train_bool: bool = True, LT: bool = False, groupvit: bool = False):
108
+ """Load a CLIP model
109
+
110
+ Parameters
111
+ ----------
112
+ name : str
113
+ A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict
114
+
115
+ device : Union[str, torch.device]
116
+ The device to put the loaded model
117
+
118
+ jit : bool
119
+ Whether to load the optimized JIT model or more hackable non-JIT model (default).
120
+
121
+ download_root: str
122
+ path to download the model files; by default, it uses "~/.cache/clip"
123
+
124
+ Returns
125
+ -------
126
+ model : torch.nn.Module
127
+ The CLIP model
128
+
129
+ preprocess : Callable[[PIL.Image], torch.Tensor]
130
+ A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input
131
+ """
132
+ if name in _MODELS:
133
+ model_path = _download(_MODELS[name], download_root or os.path.expanduser("~/.cache/clip"))
134
+ elif os.path.isfile(name):
135
+ model_path = name
136
+ else:
137
+ raise RuntimeError(f"Model {name} not found; available models = {available_models()}")
138
+
139
+ with open(model_path, 'rb') as opened_file:
140
+ try:
141
+ # loading JIT archive
142
+ model = torch.jit.load(opened_file, map_location=device if jit else "cpu").eval()
143
+ state_dict = None
144
+ except RuntimeError:
145
+ # loading saved state dict
146
+ if jit:
147
+ warnings.warn(f"File {model_path} is not a JIT archive. Loading as a state dict instead")
148
+ jit = False
149
+ state_dict = torch.load(opened_file, map_location="cpu")
150
+
151
+ # model_laion, preprocess_train, preprocess_val = open_clip.create_model_and_transforms('hf-hub:laion/CLIP-ViT-B-16-laion2B-s34B-b88K')
152
+ # laion_state_dict = model_laion.state_dict()
153
+
154
+
155
+ if not jit:
156
+ model = build_model(name, state_dict or model.state_dict(), cfg, train_bool).to(device)
157
+ # model = build_model(name, laion_state_dict,cfg,num_classes).to(device)
158
+ if str(device) == "cpu":
159
+ model.float()
160
+ return model, _transform(model.visual.input_resolution)
161
+
162
+ # patch the device names
163
+ device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[])
164
+ device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1]
165
+
166
+ def patch_device(module):
167
+ try:
168
+ graphs = [module.graph] if hasattr(module, "graph") else []
169
+ except RuntimeError:
170
+ graphs = []
171
+
172
+ if hasattr(module, "forward1"):
173
+ graphs.append(module.forward1.graph)
174
+
175
+ for graph in graphs:
176
+ for node in graph.findAllNodes("prim::Constant"):
177
+ if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"):
178
+ node.copyAttributes(device_node)
179
+
180
+ model.apply(patch_device)
181
+ patch_device(model.encode_image)
182
+ patch_device(model.encode_text)
183
+
184
+ # patch dtype to float32 on CPU
185
+ if str(device) == "cpu":
186
+ float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[])
187
+ float_input = list(float_holder.graph.findNode("aten::to").inputs())[1]
188
+ float_node = float_input.node()
189
+
190
+ def patch_float(module):
191
+ try:
192
+ graphs = [module.graph] if hasattr(module, "graph") else []
193
+ except RuntimeError:
194
+ graphs = []
195
+
196
+ if hasattr(module, "forward1"):
197
+ graphs.append(module.forward1.graph)
198
+
199
+ for graph in graphs:
200
+ for node in graph.findAllNodes("aten::to"):
201
+ inputs = list(node.inputs())
202
+ for i in [1, 2]: # dtype can be the second or third argument to aten::to()
203
+ if inputs[i].node()["value"] == 5:
204
+ inputs[i].node().copyAttributes(float_node)
205
+
206
+ model.apply(patch_float)
207
+ patch_float(model.encode_image)
208
+ patch_float(model.encode_text)
209
+
210
+ model.float()
211
+
212
+ return model, _transform(model.input_resolution.item())
213
+
214
+
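+ # Illustrative usage (a sketch, not part of the original file; `my_cfg` is a
+ # placeholder fvcore CfgNode carrying the MODEL.PROMPT settings these models expect):
+ #
+ #   model, preprocess = load("CS-ViT-B/16", device="cpu", cfg=my_cfg, train_bool=False)
+ #   img = preprocess(Image.open("example.png")).unsqueeze(0)   # -> [1, 3, n_px, n_px]
+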
215
+ def tokenize(texts: Union[str, List[str]], context_length: int = 77, truncate: bool = False) -> Union[torch.IntTensor, torch.LongTensor]:
216
+ """
217
+ Returns the tokenized representation of given input string(s)
218
+
219
+ Parameters
220
+ ----------
221
+ texts : Union[str, List[str]]
222
+ An input string or a list of input strings to tokenize
223
+
224
+ context_length : int
225
+ The context length to use; all CLIP models use 77 as the context length
226
+
227
+ truncate: bool
228
+ Whether to truncate the text in case its encoding is longer than the context length
229
+
230
+ Returns
231
+ -------
232
+ A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length].
233
+ We return LongTensor when torch version is <1.8.0, since older index_select requires indices to be long.
234
+ """
235
+ if isinstance(texts, str):
236
+ texts = [texts]
237
+
238
+ sot_token = _tokenizer.encoder["<|startoftext|>"]
239
+ eot_token = _tokenizer.encoder["<|endoftext|>"]
240
+ all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts]
241
+ if packaging.version.parse(torch.__version__) < packaging.version.parse("1.8.0"):
242
+ result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
243
+ else:
244
+ result = torch.zeros(len(all_tokens), context_length, dtype=torch.int)
245
+
246
+ for i, tokens in enumerate(all_tokens):
247
+ if len(tokens) > context_length:
248
+ if truncate:
249
+ tokens = tokens[:context_length]
250
+ tokens[-1] = eot_token
251
+ else:
252
+ raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}")
253
+ result[i, :len(tokens)] = torch.tensor(tokens)
254
+
255
+ return result
256
+
257
+
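+ # Illustrative usage (a sketch, not part of the original file): a single string
+ # yields a [1, 77] row of BPE ids, zero-padded after <|endoftext|>; a list of N
+ # strings yields [N, 77].
+ #
+ #   tokens = tokenize("a sketch of a cat")                 # shape: [1, 77]
+ #   tokens = tokenize(["a cat", "a dog"], truncate=True)   # shape: [2, 77]
+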
258
+ def encode_text_with_prompt_ensemble(model, texts, device, prompt_templates=None, no_module=False):
259
+
260
+ # using default prompt templates for ImageNet
261
+ if prompt_templates is None:
262
+ prompt_templates = ['a bad photo of a {}.', 'a photo of many {}.', 'a sculpture of a {}.', 'a photo of the hard to see {}.', 'a low resolution photo of the {}.', 'a rendering of a {}.', 'graffiti of a {}.', 'a bad photo of the {}.', 'a cropped photo of the {}.', 'a tattoo of a {}.', 'the embroidered {}.', 'a photo of a hard to see {}.', 'a bright photo of a {}.', 'a photo of a clean {}.', 'a photo of a dirty {}.', 'a dark photo of the {}.', 'a drawing of a {}.', 'a photo of my {}.', 'the plastic {}.', 'a photo of the cool {}.', 'a close-up photo of a {}.', 'a black and white photo of the {}.', 'a painting of the {}.', 'a painting of a {}.', 'a pixelated photo of the {}.', 'a sculpture of the {}.', 'a bright photo of the {}.', 'a cropped photo of a {}.', 'a plastic {}.', 'a photo of the dirty {}.', 'a jpeg corrupted photo of a {}.', 'a blurry photo of the {}.', 'a photo of the {}.', 'a good photo of the {}.', 'a rendering of the {}.', 'a {} in a video game.', 'a photo of one {}.', 'a doodle of a {}.', 'a close-up photo of the {}.', 'a photo of a {}.', 'the origami {}.', 'the {} in a video game.', 'a sketch of a {}.', 'a doodle of the {}.', 'a origami {}.', 'a low resolution photo of a {}.', 'the toy {}.', 'a rendition of the {}.', 'a photo of the clean {}.', 'a photo of a large {}.', 'a rendition of a {}.', 'a photo of a nice {}.', 'a photo of a weird {}.', 'a blurry photo of a {}.', 'a cartoon {}.', 'art of a {}.', 'a sketch of the {}.', 'a embroidered {}.', 'a pixelated photo of a {}.', 'itap of the {}.', 'a jpeg corrupted photo of the {}.', 'a good photo of a {}.', 'a plushie {}.', 'a photo of the nice {}.', 'a photo of the small {}.', 'a photo of the weird {}.', 'the cartoon {}.', 'art of the {}.', 'a drawing of the {}.', 'a photo of the large {}.', 'a black and white photo of a {}.', 'the plushie {}.', 'a dark photo of a {}.', 'itap of a {}.', 'graffiti of the {}.', 'a toy {}.', 'itap of my {}.', 'a photo of a cool {}.', 'a photo of a small {}.', 'a tattoo of the {}.', 'there is a {} in the scene.', 'there is the {} in the scene.', 'this is a {} in the scene.', 'this is the {} in the scene.', 'this is one {} in the scene.']
263
+
264
+ text_features = []
265
+ for t in texts:
266
+ prompted_t = [template.format(t) for template in prompt_templates]
267
+ prompted_t = tokenize(prompted_t).to(device)
268
+ if no_module:
269
+ class_embeddings = model.encode_text(prompted_t)
270
+ else:
271
+ class_embeddings = model.module.encode_text(prompted_t)
272
+ class_embeddings = class_embeddings.clone() / class_embeddings.norm(dim=-1, keepdim=True)
273
+ class_embedding = class_embeddings.mean(dim=0) # mean of all prompts, from [85,512] to [512]
274
+ # class_embedding /= class_embedding.norm()
275
+ class_embedding = class_embedding.clone() / class_embedding.norm() # change here
276
+ text_features.append(class_embedding)
277
+ text_features = torch.stack(text_features, dim=1).to(device).t()
278
+
279
+ return text_features
280
+
281
+
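+ # Illustrative usage (a sketch, not part of the original file): every class name is
+ # expanded into the templates above, encoded, L2-normalised and averaged, giving one
+ # row per class (512-d for the ViT-B/16 backbones used here).
+ #
+ #   text_feats = encode_text_with_prompt_ensemble(model, ["cat", "dog"], "cpu", no_module=True)
+ #   # text_feats.shape == [2, 512]
+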
282
+ def get_similarity_map(sm, shape):
283
+
284
+ # min-max norm
285
+ sm = (sm - sm.min(1, keepdim=True)[0]) / (sm.max(1, keepdim=True)[0] - sm.min(1, keepdim=True)[0]) # torch.Size([1, 196, 1])
286
+
287
+ # reshape
288
+ side = int(sm.shape[1] ** 0.5) # square output, side = 14
289
+ sm = sm.reshape(sm.shape[0], side, side, -1).permute(0, 3, 1, 2) # torch.Size([1, 1, 14, 14])
290
+
291
+ # interpolate
292
+ sm = torch.nn.functional.interpolate(sm, shape, mode='bilinear') # torch.Size([1, 1, 512, 512])
293
+ sm = sm.permute(0, 2, 3, 1) # torch.Size([1, 512, 512, 1])
294
+
295
+ return sm
296
+
297
+
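+ # Illustrative usage (a sketch, not part of the original file): patch-level scores
+ # [B, N, C] are min-max normalised, reshaped to the square patch grid and upsampled
+ # to the requested spatial size.
+ #
+ #   sim_map = get_similarity_map(patch_sims, (512, 512))   # patch_sims: [1, 196, C]
+ #   # sim_map.shape == [1, 512, 512, C]
+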
298
+ def clip_feature_surgery(image_features, text_features, redundant_feats=None, t=2):
299
+
300
+ if redundant_feats is not None:
301
+ similarity = image_features @ (text_features - redundant_feats).t() # torch.Size([1,197, 1])
302
+
303
+ else:
304
+ # weights to restrain influence of obvious classes on others
305
+ prob = image_features[:, :1, :] @ text_features.t() # torch.Size([1, 1, 512]) @ torch.Size([512, 59]) = torch.Size([1, 1, 59])
306
+ prob = (prob * 2).softmax(-1) #torch.Size([1, 1, 59])
307
+ w = prob / prob.mean(-1, keepdim=True) #torch.Size([1, 1, 59])
308
+
309
+ # element-wise multiplied features
310
+ b, n_t, n_i, c = image_features.shape[0], text_features.shape[0], image_features.shape[1], image_features.shape[2] # b = 1, n_t = 59, n_i = 197, c = 512
311
+ feats = image_features.reshape(b, n_i, 1, c) * text_features.reshape(1, 1, n_t, c) #torch.Size([1, 197, 59, 512])
312
+ feats *= w.reshape(1, 1, n_t, 1)
313
+ redundant_feats = feats.mean(2, keepdim=True) # along cls dim
314
+ feats = feats - redundant_feats
315
+
316
+ # sum the element-wise multiplied features as cosine similarity
317
+ similarity = feats.sum(-1)
318
+
319
+ return similarity
320
+
321
+
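+ # Illustrative usage (a sketch, not part of the original file): with image_features
+ # [B, 1+N, D] (cls token first) and text_features [C, D], both branches return
+ # per-token similarities of shape [B, 1+N, C].
+ #
+ #   sims = clip_feature_surgery(image_features, text_features)                   # [1, 197, C]
+ #   sims = clip_feature_surgery(image_features, text_features, redundant_feats)  # [1, 197, C]
+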
322
+ # sm shape N_t
323
+ def similarity_map_to_points(sm, shape, t=0.8, down_sample=2):
324
+ # sm.shape = [196]
325
+ # shape = [512, 512]
326
+ side = int(sm.shape[0] ** 0.5) # square root of 196 = 14
327
+ sm = sm.reshape(1, 1, side, side) # torch.Size([1, 1, 14, 14])
328
+
329
+ # down sample to smooth results
330
+ down_side = side // down_sample
331
+ sm = torch.nn.functional.interpolate(sm, (down_side, down_side), mode='bilinear')[0, 0, :, :] # torch.Size([7, 7])
332
+ h, w = sm.shape # 7, 7
333
+ sm = sm.reshape(-1) # torch.Size([49]), 7*7 = 49
334
+
335
+ sm = (sm - sm.min()) / (sm.max() - sm.min()) # min-max norm
336
+ rank = sm.sort(0)[1] # sort and get indices, torch.Size([49])
337
+ scale_h = float(shape[0]) / h # 512 / 7 = 73.14
338
+ scale_w = float(shape[1]) / w # 512 / 7 = 73.14
339
+
340
+ num = min((sm >= t).sum(), sm.shape[0] // 2)
341
+ labels = np.ones(num * 2).astype('uint8')
342
+ labels[num:] = 0
343
+ points = []
344
+
345
+ # positives
346
+ for idx in rank[-num:]:
347
+ x = min((idx % w + 0.5) * scale_w, shape[1] - 1) # +0.5 to center
348
+ y = min((idx // w + 0.5) * scale_h, shape[0] - 1)
349
+ points.append([int(x.item()), int(y.item())])
350
+
351
+ # negatives
352
+ for idx in rank[:num]:
353
+ x = min((idx % w + 0.5) * scale_w, shape[1] - 1)
354
+ y = min((idx // w + 0.5) * scale_h, shape[0] - 1)
355
+ points.append([int(x.item()), int(y.item())])
356
+
357
+ return points, labels
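+
+ # Illustrative usage (a sketch, not part of the original file): one class's 196 patch
+ # scores become point prompts in image coordinates; the first half of `labels` marks
+ # the positive points.
+ #
+ #   points, labels = similarity_map_to_points(sims[0, 1:, 0], (512, 512), t=0.8)
+ #   # points: list of [x, y] ints, labels: uint8 array of ones then zeros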
models/clip_model.py ADDED
@@ -0,0 +1,436 @@
1
+ from collections import OrderedDict
2
+ from typing import Tuple, Union
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from torch import nn
8
+ from .auxilary import *
9
+
10
+
11
+ class Bottleneck(nn.Module):
12
+ expansion = 4
13
+
14
+ def __init__(self, inplanes, planes, stride=1):
15
+ super().__init__()
16
+
17
+ # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
18
+ self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
19
+ self.bn1 = nn.BatchNorm2d(planes)
20
+ self.relu1 = nn.ReLU(inplace=True)
21
+
22
+ self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
23
+ self.bn2 = nn.BatchNorm2d(planes)
24
+ self.relu2 = nn.ReLU(inplace=True)
25
+
26
+ self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
27
+
28
+ self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
29
+ self.bn3 = nn.BatchNorm2d(planes * self.expansion)
30
+ self.relu3 = nn.ReLU(inplace=True)
31
+
32
+ self.downsample = None
33
+ self.stride = stride
34
+
35
+ if stride > 1 or inplanes != planes * Bottleneck.expansion:
36
+ # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
37
+ self.downsample = nn.Sequential(OrderedDict([
38
+ ("-1", nn.AvgPool2d(stride)),
39
+ ("0", nn.Conv2d(inplanes, planes * self.expansion, 1, stride=1, bias=False)),
40
+ ("1", nn.BatchNorm2d(planes * self.expansion))
41
+ ]))
42
+
43
+ def forward(self, x: torch.Tensor):
44
+ identity = x
45
+
46
+ out = self.relu1(self.bn1(self.conv1(x)))
47
+ out = self.relu2(self.bn2(self.conv2(out)))
48
+ out = self.avgpool(out)
49
+ out = self.bn3(self.conv3(out))
50
+
51
+ if self.downsample is not None:
52
+ identity = self.downsample(x)
53
+
54
+ out += identity
55
+ out = self.relu3(out)
56
+ return out
57
+
58
+
59
+ class AttentionPool2d(nn.Module):
60
+ def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
61
+ super().__init__()
62
+ self.positional_embedding = nn.Parameter(torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5)
63
+ self.k_proj = nn.Linear(embed_dim, embed_dim)
64
+ self.q_proj = nn.Linear(embed_dim, embed_dim)
65
+ self.v_proj = nn.Linear(embed_dim, embed_dim)
66
+ self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
67
+ self.num_heads = num_heads
68
+
69
+ def forward(self, x):
70
+ x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3]).permute(2, 0, 1) # NCHW -> (HW)NC
71
+ x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC
72
+
73
+ side = int((self.positional_embedding.shape[0] - 1) ** 0.5)
74
+ new_side = int((x.shape[0] - 1) ** 0.5)
75
+
76
+ # update the position embedding during inference for varied input size
77
+ if side != new_side:
78
+ new_pos = self.positional_embedding[1:, :].reshape(-1, side, side, x.shape[-1]).permute(0, 3, 1, 2)
79
+ new_pos = torch.nn.functional.interpolate(new_pos, (new_side, new_side), mode='bilinear')
80
+ new_pos = new_pos.reshape(-1, x.shape[-1], new_side * new_side).transpose(1, 2)
81
+ self.positional_embedding.data = torch.cat([self.positional_embedding[:1, :], new_pos[0]], 0)
82
+
83
+ x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC
84
+ x, _ = F.multi_head_attention_forward(
85
+ query=x, key=x, value=x,
86
+ embed_dim_to_check=x.shape[-1],
87
+ num_heads=self.num_heads,
88
+ q_proj_weight=self.q_proj.weight,
89
+ k_proj_weight=self.k_proj.weight,
90
+ v_proj_weight=self.v_proj.weight,
91
+ in_proj_weight=None,
92
+ in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
93
+ bias_k=None,
94
+ bias_v=None,
95
+ add_zero_attn=False,
96
+ dropout_p=0,
97
+ out_proj_weight=self.c_proj.weight,
98
+ out_proj_bias=self.c_proj.bias,
99
+ use_separate_proj_weight=True,
100
+ training=self.training,
101
+ need_weights=False
102
+ )
103
+
104
+ #return x[0]
105
+ return x.transpose(0, 1) # return both cls token and image tokens, B,N,C
106
+
107
+
108
+ class ModifiedResNet(nn.Module):
109
+ """
110
+ A ResNet class that is similar to torchvision's but contains the following changes:
111
+ - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
112
+ - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
113
+ - The final pooling layer is a QKV attention instead of an average pool
114
+ """
115
+
116
+ def __init__(self, layers, output_dim, heads, input_resolution=224, width=64):
117
+ super().__init__()
118
+ self.output_dim = output_dim
119
+ self.input_resolution = input_resolution
120
+
121
+ # the 3-layer stem
122
+ self.conv1 = nn.Conv2d(3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
123
+ self.bn1 = nn.BatchNorm2d(width // 2)
124
+ self.relu1 = nn.ReLU(inplace=True)
125
+ self.conv2 = nn.Conv2d(width // 2, width // 2, kernel_size=3, padding=1, bias=False)
126
+ self.bn2 = nn.BatchNorm2d(width // 2)
127
+ self.relu2 = nn.ReLU(inplace=True)
128
+ self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
129
+ self.bn3 = nn.BatchNorm2d(width)
130
+ self.relu3 = nn.ReLU(inplace=True)
131
+ self.avgpool = nn.AvgPool2d(2)
132
+
133
+ # residual layers
134
+ self._inplanes = width # this is a *mutable* variable used during construction
135
+ self.layer1 = self._make_layer(width, layers[0])
136
+ self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
137
+ self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
138
+ self.layer4 = self._make_layer(width * 8, layers[3], stride=2)
139
+
140
+ embed_dim = width * 32 # the ResNet feature dimension
141
+ self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, heads, output_dim)
142
+
143
+ def _make_layer(self, planes, blocks, stride=1):
144
+ layers = [Bottleneck(self._inplanes, planes, stride)]
145
+
146
+ self._inplanes = planes * Bottleneck.expansion
147
+ for _ in range(1, blocks):
148
+ layers.append(Bottleneck(self._inplanes, planes))
149
+
150
+ return nn.Sequential(*layers)
151
+
152
+ def forward(self, x):
153
+ def stem(x):
154
+ x = self.relu1(self.bn1(self.conv1(x)))
155
+ x = self.relu2(self.bn2(self.conv2(x)))
156
+ x = self.relu3(self.bn3(self.conv3(x)))
157
+ x = self.avgpool(x)
158
+ return x
159
+
160
+ x = x.type(self.conv1.weight.dtype)
161
+ x = stem(x)
162
+ x = self.layer1(x)
163
+ x = self.layer2(x)
164
+ x = self.layer3(x)
165
+ x = self.layer4(x)
166
+ x = self.attnpool(x)
167
+
168
+ return x
169
+
170
+
171
+ class LayerNorm(nn.LayerNorm):
172
+ """Subclass torch's LayerNorm to handle fp16."""
173
+
174
+ def forward(self, x: torch.Tensor):
175
+ orig_type = x.dtype
176
+ ret = super().forward(x.type(torch.float32))
177
+ return ret.type(orig_type)
178
+
179
+
180
+ class QuickGELU(nn.Module):
181
+ def forward(self, x: torch.Tensor):
182
+ return x * torch.sigmoid(1.702 * x)
183
+
184
+
185
+ class ResidualAttentionBlock(nn.Module):
186
+ def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None, need_weights: bool = False):
187
+ super().__init__()
188
+
189
+ self.attn = nn.MultiheadAttention(d_model, n_head)
190
+ self.ln_1 = LayerNorm(d_model)
191
+ self.mlp = nn.Sequential(OrderedDict([
192
+ ("c_fc", nn.Linear(d_model, d_model * 4)),
193
+ ("gelu", QuickGELU()),
194
+ ("c_proj", nn.Linear(d_model * 4, d_model))
195
+ ]))
196
+ self.ln_2 = LayerNorm(d_model)
197
+ self.attn_mask = attn_mask
198
+ self.need_weights = need_weights
199
+
200
+ self.attn_probs = None
201
+ self.attn_grad = None
202
+ self.attn_keys = None
203
+
204
+ def set_attn_probs(self, attn_probs):
205
+ self.attn_probs = attn_probs
206
+
207
+ def set_attn_keys(self, attn_keys):
208
+ self.attn_keys = attn_keys
209
+
210
+ def set_attn_grad(self, attn_grad):
211
+ self.attn_grad = attn_grad
212
+
213
+ # def attention(self, x: torch.Tensor):
214
+ # self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
215
+ # if self.need_weights == False:
216
+ # return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
217
+ # else:
218
+ # return self.attn(x, x, x, need_weights=True, attn_mask=self.attn_mask)
219
+
220
+ # def forward(self, x: torch.Tensor):
221
+ # if self.need_weights == False:
222
+ # x = x + self.attention(self.ln_1(x))
223
+ # x = x + self.mlp(self.ln_2(x))
224
+ # return x
225
+ # else:
226
+ # y, attn = self.attention(self.ln_1(x))
227
+ # x = x + y
228
+ # x = x + self.mlp(self.ln_2(x))
229
+ # return x
230
+
231
+ def attention(self, x: torch.Tensor, attn_mask: torch.Tensor = None, mode="train"):
232
+ if mode == "saliency":
233
+ return self.attn(x, x, x, need_weights=False, attn_mask=attn_mask, attention_probs_forward_hook=self.set_attn_probs,
234
+ attention_probs_backwards_hook=self.set_attn_grad, attention_keys_forward_hook=None)[0]
235
+ elif mode == "hook_keys":
236
+ return self.attn(x, x, x, need_weights=False, attn_mask=attn_mask, attention_probs_forward_hook=None,
237
+ attention_probs_backwards_hook=None, attention_keys_forward_hook=self.set_attn_keys)[0]
238
+ return self.attn(x, x, x, need_weights=False, attn_mask=attn_mask, attention_probs_forward_hook=None,
239
+ attention_probs_backwards_hook=None, attention_keys_forward_hook=None)[0]
240
+
241
+ # self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
242
+ # attn_mask = attn_mask.to(dtype=x.dtype, device=x.device) if attn_mask is not None else None
243
+
244
+
245
+ def forward(self, x: torch.Tensor, attn_mask=None, mode="train"):
246
+ x = x + self.attention(self.ln_1(x), attn_mask=attn_mask, mode=mode)
247
+ x = x + self.mlp(self.ln_2(x))
248
+ return x
249
+
250
+
251
+ class Transformer(nn.Module):
252
+ def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None, need_weights: bool = False):
253
+ super().__init__()
254
+ self.width = width
255
+ self.layers = layers
256
+ self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask, need_weights if i == layers - 1 else False) for i in range(layers)])
257
+
258
+ def forward(self, x: torch.Tensor, attn_mask=None, mode="train"):
259
+ for l in self.resblocks:
260
+ x = l(x, attn_mask=attn_mask, mode=mode)
261
262
+ return x
263
+ # return self.resblocks(x)
264
+
265
+
266
+ class VisionTransformer(nn.Module):
267
+ def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int):
268
+ super().__init__()
269
+ self.input_resolution = input_resolution
270
+ self.output_dim = output_dim
271
+ self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)
272
+
273
+ scale = width ** -0.5
274
+ self.class_embedding = nn.Parameter(scale * torch.randn(width))
275
+ self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width))
276
+ self.ln_pre = LayerNorm(width)
277
+
278
+ self.transformer = Transformer(width, layers, heads, need_weights=True)
279
+
280
+ self.ln_post = LayerNorm(width)
281
+ self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
282
+
283
+ def forward(self, x: torch.Tensor, attn_mask=None, mode="train"):
284
285
+ x = self.conv1(x) # shape = [*, width, grid, grid]
286
+ x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
287
+ x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
288
+ x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width]
289
+ x = x + self.positional_embedding.to(x.dtype)
290
+ x = self.ln_pre(x)
291
+
292
+ x = x.permute(1, 0, 2) # NLD -> LND
293
+ x = self.transformer(x, attn_mask, mode)
294
+ x = x.permute(1, 0, 2) # LND -> NLD
295
+
296
+ #x = self.ln_post(x[:, 0, :])
297
+ x = self.ln_post(x) # return both cls token and image tokens
298
+
299
+ if self.proj is not None:
300
+ x = x @ self.proj
301
+
302
+ return x
303
+
304
+
305
+ class CLIP(nn.Module):
306
+ def __init__(self,
307
+ embed_dim: int,
308
+ # vision
309
+ image_resolution: int,
310
+ vision_layers: Union[Tuple[int, int, int, int], int],
311
+ vision_width: int,
312
+ vision_patch_size: int,
313
+ # text
314
+ context_length: int,
315
+ vocab_size: int,
316
+ transformer_width: int,
317
+ transformer_heads: int,
318
+ transformer_layers: int
319
+ ):
320
+ super().__init__()
321
+
322
+ self.context_length = context_length
323
+
324
+ if isinstance(vision_layers, (tuple, list)):
325
+ vision_heads = vision_width * 32 // 64
326
+ self.visual = ModifiedResNet(
327
+ layers=vision_layers,
328
+ output_dim=embed_dim,
329
+ heads=vision_heads,
330
+ input_resolution=image_resolution,
331
+ width=vision_width
332
+ )
333
+ else:
334
+ vision_heads = vision_width // 64
335
+ self.visual = VisionTransformer(
336
+ input_resolution=image_resolution,
337
+ patch_size=vision_patch_size,
338
+ width=vision_width,
339
+ layers=vision_layers,
340
+ heads=vision_heads,
341
+ output_dim=embed_dim
342
+ )
343
+
344
+ self.transformer = Transformer(
345
+ width=transformer_width,
346
+ layers=transformer_layers,
347
+ heads=transformer_heads,
348
+ attn_mask=self.build_attention_mask()
349
+ )
350
+
351
+ self.vocab_size = vocab_size
352
+ self.token_embedding = nn.Embedding(vocab_size, transformer_width)
353
+ self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width))
354
+ self.ln_final = LayerNorm(transformer_width)
355
+
356
+ self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim))
357
+ self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
358
+
359
+ self.initialize_parameters()
360
+
361
+ def initialize_parameters(self):
362
+ nn.init.normal_(self.token_embedding.weight, std=0.02)
363
+ nn.init.normal_(self.positional_embedding, std=0.01)
364
+
365
+ if isinstance(self.visual, ModifiedResNet):
366
+ if self.visual.attnpool is not None:
367
+ std = self.visual.attnpool.c_proj.in_features ** -0.5
368
+ nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
369
+ nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
370
+ nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
371
+ nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)
372
+
373
+ for resnet_block in [self.visual.layer1, self.visual.layer2, self.visual.layer3, self.visual.layer4]:
374
+ for name, param in resnet_block.named_parameters():
375
+ if name.endswith("bn3.weight"):
376
+ nn.init.zeros_(param)
377
+
378
+ proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
379
+ attn_std = self.transformer.width ** -0.5
380
+ fc_std = (2 * self.transformer.width) ** -0.5
381
+ for block in self.transformer.resblocks:
382
+ nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
383
+ nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
384
+ nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
385
+ nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
386
+
387
+ if self.text_projection is not None:
388
+ nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)
389
+
390
+ def build_attention_mask(self):
391
+ # lazily create causal attention mask, with full attention between the vision tokens
392
+ # pytorch uses additive attention mask; fill with -inf
393
+ mask = torch.empty(self.context_length, self.context_length)
394
+ mask.fill_(float("-inf"))
395
+ mask.triu_(1) # zero out the lower diagonal
396
+ return mask
397
+
398
+ @property
399
+ def dtype(self):
400
+ return self.visual.conv1.weight.dtype
401
+
402
+ def encode_image(self, image):
403
+ return self.visual(image.type(self.dtype))
404
+
405
+ def encode_text(self, text):
406
+ x = self.token_embedding(text).type(self.dtype) # [batch_size, n_ctx, d_model]
407
+
408
+ x = x + self.positional_embedding.type(self.dtype)
409
+ x = x.permute(1, 0, 2) # NLD -> LND
410
+ x = self.transformer(x)
411
+ x = x.permute(1, 0, 2) # LND -> NLD
412
+ x = self.ln_final(x).type(self.dtype)
413
+
414
+ # x.shape = [batch_size, n_ctx, transformer.width]
415
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
416
+ x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
417
+
418
+ return x
419
+
420
+ def forward(self, image, text, return_logits=False):
421
+ image_features = self.encode_image(image)
422
+ text_features = self.encode_text(text)
423
+
424
+ # normalized features
425
+ patch_features = image_features / image_features.norm(dim=1, keepdim=True)
426
+ text_features = text_features / text_features.norm(dim=1, keepdim=True)
427
+ if return_logits:
428
+ logit_scale = self.logit_scale.exp()
429
+ sketch_features = patch_features.sum(dim=1)
430
+ sketch_features = sketch_features / sketch_features.norm(dim=1, keepdim=True)
431
+ logits_sketch = logit_scale * sketch_features @ text_features.t()
432
+ logits_text = logits_sketch.t()
433
+ return logits_sketch,logits_text
434
+
435
+ else:
436
+ return patch_features, text_features
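+
+ # Illustrative usage (a sketch, not part of the original file; `clip_model`, `images`
+ # and `captions` are placeholders):
+ #
+ #   patch_feats, text_feats = clip_model(images, tokenize(captions))
+ #   logits_sketch, logits_text = clip_model(images, tokenize(captions), return_logits=True)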
models/our_model.py ADDED
@@ -0,0 +1,604 @@
1
+ from collections import OrderedDict
2
+ from typing import Tuple, Union
3
+ import math
4
+ # import torchvision
5
+ import torch
6
+ import numpy as np
7
+ import torch
8
+ from torch import nn
9
+ # from torch.nn.modules.utils import _pair
10
+ from torch.nn import Dropout
11
+ from functools import reduce
12
+ from operator import mul
13
+ # from vpt.src.utils import logging
14
+ from .ca import Cross_Attention
15
+
16
+ # logger = logging.get_logger("visual_prompt")
17
+
18
+
19
+ class Bottleneck(nn.Module):
20
+ expansion = 4
21
+
22
+ def __init__(self, inplanes, planes, stride=1):
23
+ super().__init__()
24
+
25
+ # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
26
+ self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
27
+ self.bn1 = nn.BatchNorm2d(planes)
28
+ self.relu1 = nn.ReLU(inplace=True)
29
+
30
+ self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
31
+ self.bn2 = nn.BatchNorm2d(planes)
32
+ self.relu2 = nn.ReLU(inplace=True)
33
+
34
+ self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
35
+
36
+ self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
37
+ self.bn3 = nn.BatchNorm2d(planes * self.expansion)
38
+ self.relu3 = nn.ReLU(inplace=True)
39
+
40
+ self.downsample = None
41
+ self.stride = stride
42
+
43
+ if stride > 1 or inplanes != planes * Bottleneck.expansion:
44
+ # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
45
+ self.downsample = nn.Sequential(OrderedDict([
46
+ ("-1", nn.AvgPool2d(stride)),
47
+ ("0", nn.Conv2d(inplanes, planes * self.expansion, 1, stride=1, bias=False)),
48
+ ("1", nn.BatchNorm2d(planes * self.expansion))
49
+ ]))
50
+
51
+ def forward(self, x: torch.Tensor):
52
+ identity = x
53
+
54
+ out = self.relu1(self.bn1(self.conv1(x)))
55
+ out = self.relu2(self.bn2(self.conv2(out)))
56
+ out = self.avgpool(out)
57
+ out = self.bn3(self.conv3(out))
58
+
59
+ if self.downsample is not None:
60
+ identity = self.downsample(x)
61
+
62
+ out += identity
63
+ out = self.relu3(out)
64
+ return out
65
+
66
+ # implement attention module for v-v self-attention
67
+ class Attention(nn.Module):
68
+ def __init__(self, out_dim, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., settings=''):
69
+ super().__init__()
70
+ self.num_heads = num_heads
71
+ head_dim = dim // num_heads
72
+ self.scale = qk_scale or head_dim ** -0.5
73
+
74
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
75
+ self.attn_drop = nn.Dropout(attn_drop)
76
+ self.proj = nn.Linear(out_dim, dim)
77
+ self.proj_drop = nn.Dropout(proj_drop)
78
+ self.settings = settings
79
+
80
+ def forward(self, x):
81
+ B, N, C = x.shape
82
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
83
+ q, k, v = qkv[0], qkv[1], qkv[2]
84
+
85
+ # original self-attention for the original path
86
+ attn_ori = (q @ k.transpose(-2, -1)) * self.scale
87
+ attn_ori = attn_ori.softmax(dim=-1)
88
+ attn_ori = self.attn_drop(attn_ori)
89
+
90
+ # replace k & q by v
91
+ k = v
92
+ q = k
93
+
94
+ # ResNets have only one self-attention layer; normalization and a larger scale perform better there
95
+ if self.settings == 'resnet':
96
+ k = k / (k.norm(p=2, dim=-1, keepdim=True) + 1e-6)
97
+ q = k
98
+ scale = self.scale * 8
99
+ else:
100
+ scale = self.scale
101
+
102
+ # self-attention; a higher temperature performs better for ResNets
103
+ attn = (q @ k.transpose(-2, -1)) * scale
104
+ attn = (attn).softmax(dim=-1)
105
+ attn = self.attn_drop(attn)
106
+
107
+ x_ori = (attn_ori @ v).transpose(1, 2).reshape(B, N, C)
108
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C) # clip_surgery
109
+ #x = v.transpose(1, 2).reshape(B, N, C) # mask_clip
110
+ x = self.proj_drop(self.proj(x))
111
+ x_ori = self.proj_drop(self.proj(x_ori))
112
+ return [x, x_ori]
113
+
114
+
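+ # Descriptive note (not in the original file): the block above returns [x, x_ori],
+ # where `x` comes from value-value attention (the CLIP Surgery path) and `x_ori`
+ # keeps the original query-key attention; deeper blocks carry both paths as a list.
+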
115
+ class AttentionPool2d(nn.Module):
116
+ def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
117
+ super().__init__()
118
+ self.positional_embedding = nn.Parameter(torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5)
119
+ self.k_proj = nn.Linear(embed_dim, embed_dim)
120
+ self.q_proj = nn.Linear(embed_dim, embed_dim)
121
+ self.v_proj = nn.Linear(embed_dim, embed_dim)
122
+ self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
123
+ self.num_heads = num_heads
124
+
125
+ self.attn = None
126
+ self.embed_dim = embed_dim
127
+ self.num_heads = num_heads
128
+ self.output_dim = output_dim
129
+
130
+
131
+ def forward(self, x):
132
+ # reform transformer layer after init and load weights, using v only
133
+ if self.attn is None:
134
+ self.attn = Attention(self.output_dim, self.embed_dim, self.num_heads, True)
135
+ self.attn.qkv.weight = torch.nn.Parameter(torch.cat([self.v_proj.weight, self.v_proj.weight, self.v_proj.weight], 0))
136
+ self.attn.qkv.bias = torch.nn.Parameter(torch.cat([self.v_proj.bias, self.v_proj.bias, self.v_proj.bias]))
137
+ self.attn.proj.weight = self.c_proj.weight
138
+ self.attn.proj.bias = self.c_proj.bias
139
+
140
+ x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3]).permute(2, 0, 1) # NCHW -> (HW)NC
141
+ x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC
142
+
143
+ side = int((self.positional_embedding.shape[0] - 1) ** 0.5)
144
+ new_side = int((x.shape[0] - 1) ** 0.5)
145
+
146
+ # update the position embedding during inference for varied input size
147
+ if side != new_side:
148
+ new_pos = self.positional_embedding[1:, :].reshape(-1, side, side, x.shape[-1]).permute(0, 3, 1, 2)
149
+ new_pos = torch.nn.functional.interpolate(new_pos, (new_side, new_side), mode='bilinear')
150
+ new_pos = new_pos.reshape(-1, x.shape[-1], new_side * new_side).transpose(1, 2)
151
+ self.positional_embedding.data = torch.cat([self.positional_embedding[:1, :], new_pos[0]], 0)
152
+
153
+ x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC
154
+ x, x_ori = self.attn(x.transpose(0, 1))
155
+
156
+ # cls token from the original path, and img tokens from the new path
157
+ x[:, 0, :] = x_ori[:, 0, :]
158
+ return x
159
+
160
+
161
+ class ModifiedResNet(nn.Module):
162
+ """
163
+ A ResNet class that is similar to torchvision's but contains the following changes:
164
+ - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
165
+ - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
166
+ - The final pooling layer is a QKV attention instead of an average pool
167
+ """
168
+
169
+ def __init__(self, layers, output_dim, heads, input_resolution=224, width=64):
170
+ super().__init__()
171
+ self.output_dim = output_dim
172
+ self.input_resolution = input_resolution
173
+
174
+ # the 3-layer stem
175
+ self.conv1 = nn.Conv2d(3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
176
+ self.bn1 = nn.BatchNorm2d(width // 2)
177
+ self.relu1 = nn.ReLU(inplace=True)
178
+ self.conv2 = nn.Conv2d(width // 2, width // 2, kernel_size=3, padding=1, bias=False)
179
+ self.bn2 = nn.BatchNorm2d(width // 2)
180
+ self.relu2 = nn.ReLU(inplace=True)
181
+ self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
182
+ self.bn3 = nn.BatchNorm2d(width)
183
+ self.relu3 = nn.ReLU(inplace=True)
184
+ self.avgpool = nn.AvgPool2d(2)
185
+
186
+ # residual layers
187
+ self._inplanes = width # this is a *mutable* variable used during construction
188
+ self.layer1 = self._make_layer(width, layers[0])
189
+ self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
190
+ self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
191
+ self.layer4 = self._make_layer(width * 8, layers[3], stride=2)
192
+
193
+ embed_dim = width * 32 # the ResNet feature dimension
194
+ self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, heads, output_dim)
195
+
196
+ def _make_layer(self, planes, blocks, stride=1):
197
+ layers = [Bottleneck(self._inplanes, planes, stride)]
198
+
199
+ self._inplanes = planes * Bottleneck.expansion
200
+ for _ in range(1, blocks):
201
+ layers.append(Bottleneck(self._inplanes, planes))
202
+
203
+ return nn.Sequential(*layers)
204
+
205
+ def forward(self, x):
206
+ def stem(x):
207
+ x = self.relu1(self.bn1(self.conv1(x)))
208
+ x = self.relu2(self.bn2(self.conv2(x)))
209
+ x = self.relu3(self.bn3(self.conv3(x)))
210
+ x = self.avgpool(x)
211
+ return x
212
+
213
+ x = x.type(self.conv1.weight.dtype)
214
+ x = stem(x)
215
+ x = self.layer1(x)
216
+ x = self.layer2(x)
217
+ x = self.layer3(x)
218
+ x = self.layer4(x)
219
+ x = self.attnpool(x)
220
+
221
+ # shape BNC
222
+ return x
223
+
224
+
225
+ class LayerNorm(nn.LayerNorm):
226
+ """Subclass torch's LayerNorm to handle fp16."""
227
+
228
+ def forward(self, x: torch.Tensor):
229
+ orig_type = x.dtype
230
+ ret = super().forward(x.clone().type(torch.float32))
231
+ return ret.type(orig_type)
232
+
233
+
234
+ class QuickGELU(nn.Module):
235
+ def forward(self, x: torch.Tensor):
236
+ return x * torch.sigmoid(1.702 * x)
237
+
238
+
239
+ class ResidualAttentionBlock(nn.Module):
240
+ def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
241
+ super().__init__()
242
+
243
+ self.attn = nn.MultiheadAttention(d_model, n_head)
244
+ self.ln_1 = LayerNorm(d_model)
245
+ self.mlp = nn.Sequential(OrderedDict([
246
+ ("c_fc", nn.Linear(d_model, d_model * 4)),
247
+ ("gelu", QuickGELU()),
248
+ ("c_proj", nn.Linear(d_model * 4, d_model))
249
+ ]))
250
+ self.ln_2 = LayerNorm(d_model)
251
+ self.attn_mask = attn_mask
252
+ self.attn_probs = None
253
+ self.attn_grad = None
254
+ self.attn_keys = None
255
+
256
+ def set_attn_probs(self, attn_probs):
257
+ self.attn_probs = attn_probs
258
+
259
+ def set_attn_keys(self, attn_keys):
260
+ self.attn_keys = attn_keys
261
+
262
+ def set_attn_grad(self, attn_grad):
263
+ self.attn_grad = attn_grad
264
+
265
+ def attention(self, x: torch.Tensor, attn_mask: torch.Tensor = None, mode="train"):
266
+ self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
267
+ if isinstance(self.attn, Attention):
268
+ x = x.transpose(0, 1)
269
+ x, x_ori = self.attn(x)
270
+ return [x.transpose(0, 1), x_ori.transpose(0, 1)]
271
+ else:
272
+ return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
273
+
274
+ def forward(self, x, attn_mask: torch.Tensor = None, mode="train"):
275
+ # dual paths for blocks deeper than "d"
276
+ if isinstance(self.attn, Attention):
277
+ if isinstance(x, list):
278
+ x, x_ori = x
279
+ x_res = self.attention(self.ln_1(x_ori))
280
+ x_res, x_ori_res = x_res
281
+ x_ori += x_ori_res
282
+ x_ori = x_ori + self.mlp(self.ln_2(x_ori))
283
+ x += x_res # skip ffn for the new path
284
+ return [x, x_ori]
285
+
286
+ # start of dual path
287
+ else:
288
+ x_res = self.attention(self.ln_1(x))
289
+ if isinstance(x_res, list):
290
+ x_res, x_ori_res = x_res
291
+ x_ori = x + x_ori_res
292
+ x_ori = x_ori + self.mlp(self.ln_2(x_ori))
293
+ x += x_res
294
+ return [x, x_ori]
295
+
296
+ # single path before "d"
297
+ else:
298
+ x = x + self.attention(self.ln_1(x))
299
+ x = x + self.mlp(self.ln_2(x))
300
+ return x
301
+
302
+
303
+ class Transformer(nn.Module):
304
+ def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None, need_weights: bool = False):
305
+ super().__init__()
306
+ self.width = width
307
+ self.layers = layers
308
+ self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for i in range(layers)])
309
+ self.ca = Cross_Attention(d_model=768)
310
+
311
+ def forward(self, x: torch.Tensor,layers=12,text_bool=False,text_features=None,mode="train"):
312
+ for idx,l in enumerate(self.resblocks):
313
+ x=l(x)
314
+
315
+ if idx+1 == layers:
316
+ if text_bool:
317
+ return x
318
+
319
+ # implement cross attention between image tokens and text tokens
320
+ x_l = x[0]
321
+ x_ori_l = x[1]
322
+ text_features = text_features.unsqueeze(0).repeat(x_l.shape[0], 1, 1)
323
+ x_l = x_l.permute(1, 0, 2)
324
+ text_features = text_features.permute(1, 0, 2)
325
+
326
+ if mode == "test":
327
+ x_l = x_l.repeat(text_features.shape[0], 1, 1)
328
+ x_l_ca = self.ca(x_l, text_features)
329
+ x_l_ca = x_l_ca.permute(1, 0, 2)
330
+
331
+ x_ori_l = x_ori_l.permute(1, 0, 2)
332
+ if mode == "test":
333
+ x_ori_l = x_ori_l.repeat(text_features.shape[0], 1, 1)
334
+ x_ori_l_ca = self.ca(x_ori_l, text_features)
335
+ x_ori_l_ca = x_ori_l_ca.permute(1, 0, 2)
336
+
337
+ return [x_l_ca, x_ori_l_ca]
338
+
339
+
340
+ class PromptedVisionTransformer(nn.Module):
341
+ def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int,prompt_config:dict,train_bool:bool):
342
+ super().__init__()
343
+ self.train_bool = train_bool
344
+ self.patch_size = patch_size
345
+ self.input_resolution = input_resolution
346
+ self.output_dim = output_dim
347
+ self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)
348
+
349
+ scale = width ** -0.5
350
+ self.class_embedding = nn.Parameter(scale * torch.randn(width))
351
+ self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width))
352
+ self.ln_pre = LayerNorm(width)
353
+
354
+ self.transformer = Transformer(width, layers, heads, need_weights=True)
355
+ self.attn = None
356
+ self.embed_dim = width
357
+ self.num_heads = heads
358
+
359
+ self.ln_post = LayerNorm(width)
360
+ self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
361
+
362
+ self.prompt_config = prompt_config
363
+ self.prompt_dropout = Dropout(self.prompt_config.DROPOUT)
364
+
365
+ num_tokens = self.prompt_config.NUM_TOKENS
366
+ self.num_tokens = num_tokens # number of prompted tokens
367
+
368
+ # if project the prompt embeddings
369
+ if self.prompt_config.PROJECT > -1:
370
+ # only for prepend / add
371
+ prompt_dim = self.prompt_config.PROJECT
372
+ self.prompt_proj = nn.Linear(
373
+ prompt_dim, 768)
374
+ nn.init.kaiming_normal_(
375
+ self.prompt_proj.weight, a=0, mode='fan_out')
376
+ else:
377
+ prompt_dim = 768
378
+ self.prompt_proj = nn.Identity()
379
+
380
+ # initiate prompt:
381
+ if self.prompt_config.INITIATION == "random":
382
+ val = math.sqrt(6. / float(3 * reduce(mul, (patch_size,patch_size), 1) + prompt_dim)) # noqa
383
+
384
+ self.prompt_embeddings = nn.Parameter(torch.zeros(
385
+ 1, num_tokens, prompt_dim))
386
+ # xavier_uniform initialization
387
+ nn.init.uniform_(self.prompt_embeddings.data, -val, val)
388
+
389
+ if self.prompt_config.DEEP: # noqa
390
+ total_d_layer = 12-1 #config.transformer["num_layers"]-1
391
+ self.deep_prompt_embeddings = nn.Parameter(torch.zeros(
392
+ total_d_layer, num_tokens, prompt_dim))
393
+ # xavier_uniform initialization
394
+ nn.init.uniform_(self.deep_prompt_embeddings.data, -val, val)
395
+
396
+ else:
397
+ raise ValueError("Other initiation scheme is not supported")
398
+
399
+ if not self.train_bool:
400
+ if self.attn is None:
401
+ # apply architecture surgery on the last 6 blocks
402
+ for i in range(1, 7): # surgery 7, maskclip 2
403
+ self.attn = Attention(self.embed_dim, self.embed_dim, self.num_heads, True)
404
+ self.attn.qkv.weight.data = self.transformer.resblocks[-i].attn.in_proj_weight.clone()
405
+ self.attn.qkv.bias.data = self.transformer.resblocks[-i].attn.in_proj_bias.clone()
406
+ self.attn.proj.weight.data = self.transformer.resblocks[-i].attn.out_proj.weight.clone()
407
+ self.attn.proj.bias.data = self.transformer.resblocks[-i].attn.out_proj.bias.clone()
408
+ self.transformer.resblocks[-i].attn = self.attn
409
+
410
+ # @torch.no_grad()
411
+ def forward(self, x: torch.Tensor,layers: int = 12,text_features:torch.Tensor = None,mode:str = "test"):
412
+ if self.attn is None:
413
+ # apply architecture surgery on the last 6 blocks
414
+ for i in range(1, 7): # surgery 7, maskclip 2
415
+ self.attn = Attention(self.embed_dim, self.embed_dim, self.num_heads, True)
416
+ self.attn.qkv.weight.data = self.transformer.resblocks[-i].attn.in_proj_weight.clone()
417
+ self.attn.qkv.bias.data = self.transformer.resblocks[-i].attn.in_proj_bias.clone()
418
+ self.attn.proj.weight.data = self.transformer.resblocks[-i].attn.out_proj.weight.clone()
419
+ self.attn.proj.bias.data = self.transformer.resblocks[-i].attn.out_proj.bias.clone()
420
+ self.transformer.resblocks[-i].attn = self.attn
421
+ B = x.shape[0]
422
+
423
+ x = self.conv1(x) # shape = [*, width, grid, grid]
424
+
425
+ x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
426
+ x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] ,, torch.Size([B, 196, 768])
427
+ x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width]
428
+ side = int((self.positional_embedding.shape[0] - 1) ** 0.5)
429
+ new_side = int((x.shape[1] - 1) ** 0.5)
430
+ # update the position embedding during inference for varied input size
431
+ if side != new_side:
432
+ new_pos = self.positional_embedding[1:, :].reshape(-1, side, side, x.shape[-1]).permute(0, 3, 1, 2)
433
+ new_pos = torch.nn.functional.interpolate(new_pos, (new_side, new_side), mode='bilinear')
434
+ new_pos = new_pos.reshape(-1, x.shape[-1], new_side * new_side).transpose(1, 2)
435
+ self.positional_embedding.data = torch.cat([self.positional_embedding[:1, :], new_pos[0]], 0)
436
+
437
+ pos = self.positional_embedding.to(x.dtype)
438
+ x = x + pos # add positional embedding torch.Size([B, 197, 768])
439
+ # ADD VISUAL PROMPTS HERE
440
+ if self.num_tokens > 0:
441
+ x = torch.cat((
442
+ x[:, :1, :],
443
+ self.prompt_dropout(self.prompt_proj(self.prompt_embeddings).expand(B, -1, -1)),
444
+ x[:, 1:, :]
445
+ ), dim=1)
446
+ # (batch_size, cls_token + n_prompt + n_patches, hidden_dim)
447
+ x = self.ln_pre(x) # layer norm
448
+
449
+ x = x.permute(1, 0, 2) # NLD -> LND
450
+ if mode == "train":
451
+ x_multi = torch.zeros(len(layers),x.shape[1],x.shape[0],512).to(x.device)
452
+ elif mode == "test":
453
+ x_multi = torch.zeros(len(layers),text_features.shape[0],x.shape[0],512).to(x.device)
454
+ for d,layer in enumerate(layers):
455
+ x_l, x_ori_l = self.transformer(x,layers=layer,text_bool=False, text_features=text_features,mode = mode)
456
+ x_l[0, :, :] = x_ori_l[0, :, :] # clip_surgery
457
+ x_l = x_l.permute(1, 0, 2) # LND -> NLD
458
+
459
+ x_l = self.ln_post(x_l) # layer norm
460
+ x_l = x_l @ self.proj
461
+ x_multi[d] = x_l
462
+ return x_multi
463
+
464
+
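+ # Descriptive note (not in the original file): in "train" mode x_multi stacks one
+ # [batch, tokens, 512] feature map per requested layer; in "test" mode the batch axis
+ # instead holds one row of tokens per text class, produced by the cross-attention step.
+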
465
+ class ModifiedCLIPSurgery(nn.Module):
466
+ def __init__(self,
467
+ embed_dim: int,
468
+ # vision
469
+ image_resolution: int,
470
+ vision_layers: Union[Tuple[int, int, int, int], int],
471
+ vision_width: int,
472
+ vision_patch_size: int,
473
+ # text
474
+ context_length: int,
475
+ vocab_size: int,
476
+ transformer_width: int,
477
+ transformer_heads: int,
478
+ transformer_layers: int,
479
+ cfg:dict,
480
+ train_bool:bool,
481
+ ):
482
+ super().__init__()
483
+ if "prompt" in cfg.MODEL.TRANSFER_TYPE:
484
+ prompt_cfg = cfg.MODEL.PROMPT
485
+ else:
486
+ prompt_cfg = None
487
+
488
+ self.prompt_config = prompt_cfg
489
+ self.context_length = context_length
490
+
491
+ if isinstance(vision_layers, (tuple, list)):
492
+ vision_heads = vision_width * 32 // 64
493
+ self.visual = ModifiedResNet(
494
+ layers=vision_layers,
495
+ output_dim=embed_dim,
496
+ heads=vision_heads,
497
+ input_resolution=image_resolution,
498
+ width=vision_width
499
+ )
500
+ else:
501
+ vision_heads = vision_width // 64
502
+ self.visual = PromptedVisionTransformer(
503
+ input_resolution=image_resolution,
504
+ patch_size=vision_patch_size,
505
+ width=vision_width,
506
+ layers=vision_layers,
507
+ heads=vision_heads,
508
+ output_dim=embed_dim,
509
+ prompt_config=self.prompt_config,
510
+ train_bool=train_bool,
511
+ )
512
+
513
+ self.transformer = Transformer(
514
+ width=transformer_width,
515
+ layers=transformer_layers,
516
+ heads=transformer_heads,
517
+ attn_mask=self.build_attention_mask()
518
+ )
519
+
520
+ self.vocab_size = vocab_size
521
+ self.token_embedding = nn.Embedding(vocab_size, transformer_width)
522
+ self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width))
523
+ self.ln_final = LayerNorm(transformer_width)
524
+
525
+ self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim))
526
+ self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
527
+
528
+ self.initialize_parameters()
529
+
530
+ def initialize_parameters(self):
531
+ nn.init.normal_(self.token_embedding.weight, std=0.02)
532
+ nn.init.normal_(self.positional_embedding, std=0.01)
533
+ # skipped because self.visual is PromptedVisionTransformer
534
+ if isinstance(self.visual, ModifiedResNet):
535
+ if self.visual.attnpool is not None:
536
+ std = self.visual.attnpool.c_proj.in_features ** -0.5
537
+ nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
538
+ nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
539
+ nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
540
+ nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)
541
+
542
+ for resnet_block in [self.visual.layer1, self.visual.layer2, self.visual.layer3, self.visual.layer4]:
543
+ for name, param in resnet_block.named_parameters():
544
+ if name.endswith("bn3.weight"):
545
+ nn.init.zeros_(param)
546
+ proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
547
+ attn_std = self.transformer.width ** -0.5
548
+ fc_std = (2 * self.transformer.width) ** -0.5
549
+ for block in self.transformer.resblocks:
550
+ nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
551
+ nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
552
+ nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
553
+ nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
554
+
555
+ if self.text_projection is not None:
556
+ nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)
557
+
558
+ def build_attention_mask(self):
559
+ # lazily create causal attention mask, with full attention between the vision tokens
560
+ # pytorch uses additive attention mask; fill with -inf
561
+ mask = torch.empty(self.context_length, self.context_length)
562
+ mask.fill_(float("-inf"))
563
+ mask.triu_(1) # zero out the lower diagonal
564
+ return mask
565
+
566
+ @property
567
+ def dtype(self):
568
+ return self.visual.conv1.weight.dtype
569
+
570
+ def encode_image(self, image,layers:int=12,text_features=None,mode="test"):
571
+ return self.visual(image.type(self.dtype),layers=layers,text_features=text_features,mode=mode)
572
+
573
+ def encode_text(self, text):
574
+ text_bool=True
575
+ x = self.token_embedding(text).type(self.dtype) # [batch_size, n_ctx, d_model]
576
+ x = x + self.positional_embedding.type(self.dtype)
577
+ x = x.permute(1, 0, 2) # NLD -> LND
578
+ x = self.transformer(x,layers=12,text_bool=text_bool,text_features=None) # always get the last layer features for text
579
+ x = x.permute(1, 0, 2) # LND -> NLD
580
+ x = self.ln_final(x).type(self.dtype)
581
+
582
+ # x.shape = [batch_size, n_ctx, transformer.width]
583
+ x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
584
+
585
+ return x
586
+
587
+ def forward(self, image, text,layer_num=12,return_logits=False,mode="train"):
588
+
589
+ text_features = self.encode_text(text)
590
+ patch_features = self.encode_image(image,layers=layer_num,text_features=text_features,mode=mode).squeeze(0)
591
+
592
+ # normalized features
593
+ patch_features = patch_features / patch_features.norm(dim=1, keepdim=True)
594
+ text_features = text_features / text_features.norm(dim=1, keepdim=True)
595
+
596
+ if return_logits:
597
+ logit_scale = self.logit_scale.exp()
598
+ sketch_features = patch_features[:,0,:]
599
+ logits_sketch = logit_scale * sketch_features @ text_features.t()
600
+ logits_text = logits_sketch.t()
601
+ return logits_sketch,logits_text
602
+
603
+ else:
604
+ return patch_features,text_features
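For reference, the forward pass above either returns normalized patch/text features (the test-time path used to build dense similarity maps) or scaled contrastive logits. A minimal, illustrative call sketch, assuming `model` is an already-constructed instance of this class and `image`/`text` are a preprocessed sketch batch and tokenized captions:

    import torch

    with torch.no_grad():
        # dense features for per-patch similarity against the caption embeddings
        patch_features, text_features = model(image, text, layer_num=12, return_logits=False, mode="test")
        # or sketch-level contrastive logits (logit_scale * cosine similarities)
        logits_sketch, logits_text = model(image, text, layer_num=12, return_logits=True, mode="test")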
models/simple_tokenizer.py ADDED
@@ -0,0 +1,132 @@
1
+ import gzip
2
+ import html
3
+ import os
4
+ from functools import lru_cache
5
+
6
+ import ftfy
7
+ import regex as re
8
+
9
+
10
+ @lru_cache()
11
+ def default_bpe():
12
+ return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz")
13
+
14
+
15
+ @lru_cache()
16
+ def bytes_to_unicode():
17
+ """
18
+ Returns a list of utf-8 bytes and a corresponding list of unicode strings.
19
+ The reversible bpe codes work on unicode strings.
20
+ This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
21
+ When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
22
+ This is a significant percentage of your normal, say, 32K bpe vocab.
23
+ To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
24
+ This also avoids mapping to whitespace/control characters that the bpe code barfs on.
25
+ """
26
+ bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
27
+ cs = bs[:]
28
+ n = 0
29
+ for b in range(2**8):
30
+ if b not in bs:
31
+ bs.append(b)
32
+ cs.append(2**8+n)
33
+ n += 1
34
+ cs = [chr(n) for n in cs]
35
+ return dict(zip(bs, cs))
36
+
37
+
38
+ def get_pairs(word):
39
+ """Return set of symbol pairs in a word.
40
+ Word is represented as tuple of symbols (symbols being variable-length strings).
41
+ """
42
+ pairs = set()
43
+ prev_char = word[0]
44
+ for char in word[1:]:
45
+ pairs.add((prev_char, char))
46
+ prev_char = char
47
+ return pairs
48
+
49
+
50
+ def basic_clean(text):
51
+ text = ftfy.fix_text(text)
52
+ text = html.unescape(html.unescape(text))
53
+ return text.strip()
54
+
55
+
56
+ def whitespace_clean(text):
57
+ text = re.sub(r'\s+', ' ', text)
58
+ text = text.strip()
59
+ return text
60
+
61
+
62
+ class SimpleTokenizer(object):
63
+ def __init__(self, bpe_path: str = default_bpe()):
64
+ self.byte_encoder = bytes_to_unicode()
65
+ self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
66
+ merges = gzip.open(bpe_path).read().decode("utf-8").split('\n')
67
+ merges = merges[1:49152-256-2+1]
68
+ merges = [tuple(merge.split()) for merge in merges]
69
+ vocab = list(bytes_to_unicode().values())
70
+ vocab = vocab + [v+'</w>' for v in vocab]
71
+ for merge in merges:
72
+ vocab.append(''.join(merge))
73
+ vocab.extend(['<|startoftext|>', '<|endoftext|>'])
74
+ self.encoder = dict(zip(vocab, range(len(vocab))))
75
+ self.decoder = {v: k for k, v in self.encoder.items()}
76
+ self.bpe_ranks = dict(zip(merges, range(len(merges))))
77
+ self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}
78
+ self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)
79
+
80
+ def bpe(self, token):
81
+ if token in self.cache:
82
+ return self.cache[token]
83
+ word = tuple(token[:-1]) + ( token[-1] + '</w>',)
84
+ pairs = get_pairs(word)
85
+
86
+ if not pairs:
87
+ return token+'</w>'
88
+
89
+ while True:
90
+ bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
91
+ if bigram not in self.bpe_ranks:
92
+ break
93
+ first, second = bigram
94
+ new_word = []
95
+ i = 0
96
+ while i < len(word):
97
+ try:
98
+ j = word.index(first, i)
99
+ new_word.extend(word[i:j])
100
+ i = j
101
+ except ValueError: # "first" does not occur again; keep the rest of the word and stop
102
+ new_word.extend(word[i:])
103
+ break
104
+
105
+ if word[i] == first and i < len(word)-1 and word[i+1] == second:
106
+ new_word.append(first+second)
107
+ i += 2
108
+ else:
109
+ new_word.append(word[i])
110
+ i += 1
111
+ new_word = tuple(new_word)
112
+ word = new_word
113
+ if len(word) == 1:
114
+ break
115
+ else:
116
+ pairs = get_pairs(word)
117
+ word = ' '.join(word)
118
+ self.cache[token] = word
119
+ return word
120
+
121
+ def encode(self, text):
122
+ bpe_tokens = []
123
+ text = whitespace_clean(basic_clean(text)).lower()
124
+ for token in re.findall(self.pat, text):
125
+ token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
126
+ bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
127
+ return bpe_tokens
128
+
129
+ def decode(self, tokens):
130
+ text = ''.join([self.decoder[token] for token in tokens])
131
+ text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
132
+ return text
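A minimal usage sketch of the tokenizer above, assuming bpe_simple_vocab_16e6.txt.gz sits next to the module as default_bpe() expects:

    from models.simple_tokenizer import SimpleTokenizer

    tokenizer = SimpleTokenizer()
    ids = tokenizer.encode("a sketch of a tree")   # BPE token ids; start/end tokens are not added here
    text = tokenizer.decode(ids)                   # "a sketch of a tree " (</w> markers become spaces)
    print(ids, text)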
output.png ADDED
requirements.txt ADDED
@@ -0,0 +1,12 @@
1
+ torch
2
+ numpy
3
+ torchvision
4
+ matplotlib==3.7.1
5
+ ml-collections==0.1.1
6
+ pillow==9.5.0
7
+ simplejson
8
+ termcolor
9
+ iopath
10
+ ftfy
11
+ fvcore
12
+ regex
sketch_seg_best_miou.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69d9913b629680f044ae4dfcaccd08e85f7d98ae90db270b863e2a623e9b98bd
3
+ size 696369947
utils.py ADDED
@@ -0,0 +1,120 @@
1
+ import torch
2
+ import numpy as np
3
+ from torchvision.transforms import InterpolationMode
4
+ BICUBIC = InterpolationMode.BICUBIC
5
+ from vpt.src.configs.config import get_cfg
6
+ import os
7
+ from time import sleep
8
+ from random import randint
9
+ from vpt.src.utils.file_io import PathManager
10
+ import matplotlib.pyplot as plt
11
+ from matplotlib.colors import rgb_to_hsv, hsv_to_rgb
12
+ import warnings
13
+
14
+
15
+ warnings.filterwarnings("ignore")
16
+
17
+
18
+ def setup(args):
19
+ """
20
+ Create configs and perform basic setups.
21
+ """
22
+ cfg = get_cfg()
23
+ cfg.merge_from_file(args.config_file)
24
+ cfg.merge_from_list(args.opts)
25
+
26
+ output_dir = cfg.OUTPUT_DIR
27
+ lr = cfg.SOLVER.BASE_LR
28
+ wd = cfg.SOLVER.WEIGHT_DECAY
29
+ output_folder = os.path.join(
30
+ cfg.DATA.NAME, cfg.DATA.FEATURE, f"lr{lr}_wd{wd}")
31
+
32
+ # train cfg.RUN_N_TIMES times
33
+ count = 1
34
+ while count <= cfg.RUN_N_TIMES:
35
+ output_path = os.path.join(output_dir, output_folder, f"run{count}")
36
+ # pause for a random time, so concurrent processes with the same setting won't interfere with each other. # noqa
37
+ sleep(randint(3, 30))
38
+ if not PathManager.exists(output_path):
39
+ PathManager.mkdirs(output_path)
40
+ cfg.OUTPUT_DIR = output_path
41
+ break
42
+ else:
43
+ count += 1
44
+
45
+ cfg.freeze()
46
+ return cfg
47
+
48
+
49
+ def get_similarity_map(sm, shape):
50
+
51
+ # sm: torch.Size([1, 196, 1])
52
+ # min-max norm
53
+ sm = (sm - sm.min(1, keepdim=True)[0]) / (sm.max(1, keepdim=True)[0] - sm.min(1, keepdim=True)[0]) # torch.Size([1, 196, 1])
54
+
55
+ # reshape
56
+ side = int(sm.shape[1] ** 0.5) # square output, side = 14
57
+ sm = sm.reshape(sm.shape[0], side, side, -1).permute(0, 3, 1, 2)
58
+
59
+ # interpolate
60
+ sm = torch.nn.functional.interpolate(sm, shape, mode='bilinear')
61
+ sm = sm.permute(0, 2, 3, 1)
62
+
63
+ return sm.squeeze(0)
64
+
65
+
66
+ def display_segmented_sketch(pixel_similarity_array,binary_sketch,classes,classes_colors,save_path=None,live=False):
67
+ # Find the class index with the highest similarity for each pixel
68
+ class_indices = np.argmax(pixel_similarity_array, axis=0)
69
+ # Create an HSV image placeholder
70
+ hsv_image = np.zeros(class_indices.shape + (3,)) # Shape (512, 512, 3)
71
+ hsv_image[..., 2] = 1 # Set Value to 1 for a white base
72
+
73
+ # Set the hue and value channels
74
+ for i, color in enumerate(classes_colors):
75
+ rgb_color = np.array(color).reshape(1, 1, 3)
76
+ hsv_color = rgb_to_hsv(rgb_color)
77
+ mask = class_indices == i
78
+ if i < len(classes): # For recognized class indices, set color based on similarity
79
+ hsv_image[..., 0][mask] = hsv_color[0, 0, 0] # Hue
80
+ hsv_image[..., 1][mask] = pixel_similarity_array[i][mask] > 0 # Saturation
81
+ hsv_image[..., 2][mask] = pixel_similarity_array[i][mask] # Value
82
+ else: # Fallback for any remaining indices: set pixels to black
83
+ hsv_image[..., 0][mask] = 0 # Hue doesn't matter for black
84
+ hsv_image[..., 1][mask] = 0 # Saturation set to 0
85
+ hsv_image[..., 2][mask] = 0 # Value set to 0, making it black
86
+
87
+ mask_tensor_org = binary_sketch[:,:,0]/255
88
+ hsv_image[mask_tensor_org==1] = [0,0,1]
89
+
90
+ # Convert the HSV image back to RGB to display and save
91
+ rgb_image = hsv_to_rgb(hsv_image)
92
+
93
+ # # Calculate centroids and render class names
94
+ # for i, class_name in enumerate(classes):
95
+ # mask = class_indices == i
96
+ # if np.any(mask):
97
+ # y, x = np.nonzero(mask)
98
+ # centroid_x, centroid_y = np.mean(x), np.mean(y)
99
+ # plt.text(centroid_x, centroid_y, class_name, color=classes_colors[i], ha='center', va='center',fontsize=14, # color=classes_colors[i]
100
+ # bbox=dict(facecolor='lightgrey', edgecolor='none', boxstyle='round,pad=0.2', alpha=0.8))
101
+
102
+
103
+ # Display the image with class names
104
+ plt.imshow(rgb_image)
105
+ plt.axis('off')
106
+ plt.tight_layout()
107
+
108
+ if live:
109
+ plt.savefig('output.png', bbox_inches='tight', pad_inches=0)
110
+
111
+ else:
112
+ save_dir = "/".join(save_path.split("/")[:-1])
113
+ if save_dir !='':
114
+ if not os.path.exists(save_dir):
115
+ os.makedirs(save_dir)
116
+ plt.savefig(save_path, bbox_inches='tight', pad_inches=0)
117
+
118
+ else:
119
+ plt.show()
120
+
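get_similarity_map above turns a flat vector of patch similarities into a per-pixel map by min-max normalising, reshaping to the square patch grid and bilinearly upsampling. An illustrative sketch with dummy values (196 patches = a 14x14 grid, matching the comment above):

    import torch
    from utils import get_similarity_map

    patch_sim = torch.rand(1, 196, 1)                   # [batch, patches, classes]
    pixel_sim = get_similarity_map(patch_sim, (512, 512))
    print(pixel_sim.shape)                              # torch.Size([512, 512, 1])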
vpt/configs/base-prompt.yaml ADDED
@@ -0,0 +1,25 @@
1
+ NUM_GPUS: 1
2
+ NUM_SHARDS: 1
3
+ OUTPUT_DIR: ""
4
+ RUN_N_TIMES: 1
5
+ MODEL:
6
+ TRANSFER_TYPE: "prompt"
7
+ TYPE: "vit"
8
+ LINEAR:
9
+ MLP_SIZES: []
10
+ SOLVER:
11
+ SCHEDULER: "cosine"
12
+ PATIENCE: 300
13
+ LOSS: "softmax"
14
+ OPTIMIZER: "sgd"
15
+ MOMENTUM: 0.9
16
+ WEIGHT_DECAY: 0.0001
17
+ LOG_EVERY_N: 100
18
+ WARMUP_EPOCH: 10
19
+ TOTAL_EPOCH: 100
20
+ DATA:
21
+ NAME: ""
22
+ NUMBER_CLASSES: -1
23
+ DATAPATH: ""
24
+ FEATURE: "sup_vitb16_224"
25
+ BATCH_SIZE: 128
vpt/configs/prompt/cub.yaml ADDED
@@ -0,0 +1,12 @@
1
+ _BASE_: "../base-prompt.yaml"
2
+ RUN_N_TIMES: 1
3
+ DATA:
4
+ NAME: "CUB"
5
+ DATAPATH: "" #TODO: need to specify here
6
+ NUMBER_CLASSES: 200
7
+ MULTILABEL: False
8
+ MODEL:
9
+ TYPE: "vit"
10
+ SOLVER:
11
+ BASE_LR: 0.1
12
+ WEIGHT_DECAY: 0.01
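cub.yaml only overrides a handful of keys; everything else is inherited from base-prompt.yaml through fvcore's _BASE_ mechanism. An illustrative way to inspect the merged result:

    from vpt.src.configs.config import get_cfg

    cfg = get_cfg()
    cfg.merge_from_file("vpt/configs/prompt/cub.yaml")   # resolves _BASE_, then applies the overrides
    print(cfg.DATA.NAME, cfg.DATA.NUMBER_CLASSES)        # CUB 200
    print(cfg.SOLVER.SCHEDULER, cfg.SOLVER.BASE_LR)      # cosine 0.1 (scheduler comes from the base file)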
vpt/launch.py ADDED
@@ -0,0 +1,25 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ launch helper functions
4
+ """
5
+ import argparse
6
+
7
+
8
+ def default_argument_parser():
9
+ """
10
+ create a simple parser to wrap around config file
11
+ """
12
+ parser = argparse.ArgumentParser(description="visual-prompt")
13
+ parser.add_argument(
14
+ "--config-file", default="vpt/configs/prompt/cub.yaml", metavar="FILE", help="path to config file")
15
+ parser.add_argument(
16
+ "--train-type", default="", help="training types")
17
+ parser.add_argument(
18
+ "opts",
19
+ help="Modify config options using the command-line",
20
+ default=None,
21
+ nargs=argparse.REMAINDER,
22
+ )
23
+
24
+ return parser
25
+
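The trailing `opts` argument uses argparse.REMAINDER, so any leftover KEY VALUE pairs are collected as a flat list and later handed to cfg.merge_from_list (see setup() in utils.py above). An illustrative parse:

    from vpt.launch import default_argument_parser

    args = default_argument_parser().parse_args(
        ["--config-file", "vpt/configs/prompt/cub.yaml",
         "SOLVER.BASE_LR", "0.05", "DATA.BATCH_SIZE", "64"])
    print(args.config_file)   # vpt/configs/prompt/cub.yaml
    print(args.opts)          # ['SOLVER.BASE_LR', '0.05', 'DATA.BATCH_SIZE', '64']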
vpt/src/configs/config.py ADDED
@@ -0,0 +1,161 @@
1
+ #!/usr/bin/env python3
2
+
3
+ """Config system (based on Detectron's)."""
4
+
5
+ from .config_node import CfgNode
6
+
7
+
8
+ # Global config object
9
+ _C = CfgNode()
10
+ # Example usage:
11
+ # from configs.config import cfg
12
+
13
+ _C.DBG = False
14
+ _C.OUTPUT_DIR = "./output"
15
+ _C.RUN_N_TIMES = 5
16
+ # Perform benchmarking to select the fastest CUDNN algorithms to use
17
+ # Note that this may increase the memory usage and will likely not result
18
+ # in overall speedups when variable size inputs are used (e.g. COCO training)
19
+ _C.CUDNN_BENCHMARK = False
20
+
21
+ # Number of GPUs to use (applies to both training and testing)
22
+ _C.NUM_GPUS = 1
23
+ _C.NUM_SHARDS = 1
24
+
25
+ # Note that non-determinism may still be present due to non-deterministic
26
+ # operator implementations in GPU operator libraries
27
+ _C.SEED = None
28
+
29
+ # ----------------------------------------------------------------------
30
+ # Model options
31
+ # ----------------------------------------------------------------------
32
+ _C.MODEL = CfgNode()
33
+ _C.MODEL.TRANSFER_TYPE = "linear" # one of linear, end2end, prompt, adapter, side, partial-1, tinytl-bias
34
+ _C.MODEL.WEIGHT_PATH = "" # if resume from some checkpoint file
35
+ _C.MODEL.SAVE_CKPT = False
36
+
37
+ _C.MODEL.MODEL_ROOT = "" # root folder for pretrained model weights
38
+
39
+ _C.MODEL.TYPE = "vit"
40
+ _C.MODEL.MLP_NUM = 0
41
+
42
+ _C.MODEL.LINEAR = CfgNode()
43
+ _C.MODEL.LINEAR.MLP_SIZES = []
44
+ _C.MODEL.LINEAR.DROPOUT = 0.1
45
+
46
+ # ----------------------------------------------------------------------
47
+ # Prompt options
48
+ # ----------------------------------------------------------------------
49
+ _C.MODEL.PROMPT = CfgNode()
50
+ _C.MODEL.PROMPT.NUM_TOKENS = 3
51
+ _C.MODEL.PROMPT.LOCATION = "prepend"
52
+ # prompt initialization:
53
+ # (1) default "random"
54
+ # (2) "final-cls" use aggregated final [cls] embeddings from training dataset
55
+ # (3) "cls-nolastl": use first 12 cls embeddings (exclude the final output) for deep prompt
56
+ # (4) "cls-nofirstl": use last 12 cls embeddings (exclude the input to first layer)
57
+ _C.MODEL.PROMPT.INITIATION = "random" # "final-cls", "cls-first12"
58
+ _C.MODEL.PROMPT.CLSEMB_FOLDER = ""
59
+ _C.MODEL.PROMPT.CLSEMB_PATH = ""
60
+ _C.MODEL.PROMPT.PROJECT = -1 # "projection mlp hidden dim"
61
+ _C.MODEL.PROMPT.DEEP = False # "whether do deep prompt or not, only for prepend location"
62
+ _C.MODEL.PROMPT.LOG = "set_log" # log file for prompt
63
+
64
+
65
+ _C.MODEL.PROMPT.NUM_DEEP_LAYERS = None # if set to be an int, then do partial-deep prompt tuning
66
+ _C.MODEL.PROMPT.REVERSE_DEEP = False # if to only update last n layers, not the input layer
67
+ _C.MODEL.PROMPT.DEEP_SHARED = False # if true, all deep layers will use the same prompt emb
68
+ _C.MODEL.PROMPT.FORWARD_DEEP_NOEXPAND = False # if true, will not expand input sequence for layers without prompt
69
+ _C.MODEL.PROMPT.HEAD = False # if true, will add a trainable head to the model
70
+ _C.MODEL.PROMPT.HEAD_CLASS = False # if true, will add a trainable classification head to the model
71
+ # _C.MODEL.PROMPT.TRAINABLE_PARM is a list of strings, each string is a name of a parameter
72
+ _C.MODEL.PROMPT.TRAINABLE_PARM = "prompt,head" # if not empty, will only train the parameters in this list
73
+ _C.WANDB = True
74
+ _C.margin = 0.5
75
+ _C.threshold = 0.4
76
+ _C.learning_rate = 1e-5
77
+ _C.ft_all = True
78
+ _C.max_classes = 3
79
+ _C.bz = 16
80
+ _C.save_every = 5
81
+
82
+ _C.checkpoint_path = "checkpoint/sketch_seg_best_miou.pth"
83
+ _C.sketch_path = 'demo/sketch_1.png'
84
+ _C.output_path = "/output"
85
+ # _C.classes = ['tree','bench','grass']
86
+
87
+ # how to get the output emb for cls head:
88
+ # original: follow the original backbone choice,
89
+ # img_pool: image patch pool only
90
+ # prompt_pool: prompt embd pool only
91
+ # imgprompt_pool: pool everything but the cls token
92
+ _C.MODEL.PROMPT.VIT_POOL_TYPE = "original"
93
+ _C.MODEL.PROMPT.DROPOUT = 0.1
94
+ _C.MODEL.PROMPT.SAVE_FOR_EACH_EPOCH = False
95
+ # ----------------------------------------------------------------------
96
+ # adapter options
97
+ # ----------------------------------------------------------------------
98
+ _C.MODEL.ADAPTER = CfgNode()
99
+ _C.MODEL.ADAPTER.REDUCATION_FACTOR = 8
100
+ _C.MODEL.ADAPTER.STYLE = "Pfeiffer"
101
+
102
+ # ----------------------------------------------------------------------
103
+ # Solver options
104
+ # ----------------------------------------------------------------------
105
+ _C.SOLVER = CfgNode()
106
+ _C.SOLVER.LOSS = "softmax"
107
+ _C.SOLVER.LOSS_ALPHA = 0.01
108
+
109
+ _C.SOLVER.OPTIMIZER = "sgd" # or "adamw"
110
+ _C.SOLVER.MOMENTUM = 0.9
111
+ _C.SOLVER.WEIGHT_DECAY = 0.0001
112
+ _C.SOLVER.WEIGHT_DECAY_BIAS = 0
113
+
114
+ _C.SOLVER.PATIENCE = 300
115
+
116
+
117
+ _C.SOLVER.SCHEDULER = "cosine"
118
+
119
+ _C.SOLVER.BASE_LR = 0.01
120
+ _C.SOLVER.BIAS_MULTIPLIER = 1. # for prompt + bias
121
+
122
+ _C.SOLVER.WARMUP_EPOCH = 5
123
+ _C.SOLVER.TOTAL_EPOCH = 30
124
+ _C.SOLVER.LOG_EVERY_N = 1000
125
+
126
+
127
+ _C.SOLVER.DBG_TRAINABLE = False # if True, will print the name of trainable params
128
+
129
+ # ----------------------------------------------------------------------
130
+ # Dataset options
131
+ # ----------------------------------------------------------------------
132
+ _C.DATA = CfgNode()
133
+
134
+ _C.DATA.NAME = ""
135
+ _C.DATA.DATAPATH = ""
136
+ _C.DATA.FEATURE = "" # e.g. inat2021_supervised
137
+
138
+ _C.DATA.PERCENTAGE = 1.0
139
+ _C.DATA.NUMBER_CLASSES = -1
140
+ _C.DATA.MULTILABEL = False
141
+ _C.DATA.CLASS_WEIGHTS_TYPE = "none"
142
+
143
+ _C.DATA.CROPSIZE = 224 # or 384
144
+
145
+ _C.DATA.NO_TEST = False
146
+ _C.DATA.BATCH_SIZE = 32
147
+ # Number of data loader workers per training process
148
+ _C.DATA.NUM_WORKERS = 4
149
+ # Load data to pinned host memory
150
+ _C.DATA.PIN_MEMORY = True
151
+
152
+ _C.DIST_BACKEND = "nccl"
153
+ _C.DIST_INIT_PATH = "env://"
154
+ _C.DIST_INIT_FILE = ""
155
+
156
+
157
+ def get_cfg():
158
+ """
159
+ Get a copy of the default config.
160
+ """
161
+ return _C.clone()
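get_cfg() hands back a clone, so callers can edit and then freeze their own copy without touching the module-level defaults. A small sketch:

    from vpt.src.configs.config import get_cfg

    cfg = get_cfg()
    print(cfg.MODEL.PROMPT.NUM_TOKENS, cfg.SOLVER.OPTIMIZER)   # 3 sgd
    cfg.MODEL.PROMPT.NUM_TOKENS = 6                            # allowed while the config is still mutable
    cfg.freeze()                                               # further assignments now raise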
vpt/src/configs/config_node.py ADDED
@@ -0,0 +1,26 @@
1
+ #!/usr/bin/env python3
2
+
3
+ """Config system (based on Detectron's)."""
4
+
5
+ from fvcore.common.config import CfgNode as _CfgNode
6
+ from ..utils.file_io import PathManager
7
+
8
+
9
+ class CfgNode(_CfgNode):
10
+ """
11
+ The same as `fvcore.common.config.CfgNode`, but different in:
12
+
13
+ support for manifold paths
14
+ """
15
+
16
+ @classmethod
17
+ def _open_cfg(cls, filename):
18
+ return PathManager.open(filename, "r")
19
+
20
+ def dump(self, *args, **kwargs):
21
+ """
22
+ Returns:
23
+ str: a yaml string representation of the config
24
+ """
25
+ # to make it show up in docs
26
+ return super().dump(*args, **kwargs)
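The practical difference from the fvcore base class is that config files are opened through PathManager, so local paths and any registered remote handlers go through the same interface; dump() is a plain YAML serialisation. For example:

    from vpt.src.configs.config import get_cfg

    cfg = get_cfg()
    with open("config_snapshot.yaml", "w") as f:
        f.write(cfg.dump())   # YAML string of the current config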
vpt/src/configs/vit_configs.py ADDED
@@ -0,0 +1,102 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Copyright (c) Meta Platforms, Inc. All Rights Reserved
4
+ https://github.com/jeonsworld/ViT-pytorch/blob/main/models/configs.py
5
+ """
6
+ import ml_collections
7
+
8
+
9
+ def get_testing():
10
+ """Returns a minimal configuration for testing."""
11
+ config = ml_collections.ConfigDict()
12
+ config.patches = ml_collections.ConfigDict({'size': (16, 16)})
13
+ config.hidden_size = 1
14
+ config.transformer = ml_collections.ConfigDict()
15
+ config.transformer.mlp_dim = 1
16
+ config.transformer.num_heads = 1
17
+ config.transformer.num_layers = 1
18
+ config.transformer.attention_dropout_rate = 0.0
19
+ config.transformer.dropout_rate = 0.1
20
+ config.classifier = 'token'
21
+ config.representation_size = None
22
+ return config
23
+
24
+
25
+ def get_b16_config():
26
+ """Returns the ViT-B/16 configuration."""
27
+ config = ml_collections.ConfigDict()
28
+ config.patches = ml_collections.ConfigDict({'size': (16, 16)})
29
+ config.hidden_size = 768
30
+ config.transformer = ml_collections.ConfigDict()
31
+ config.transformer.mlp_dim = 3072
32
+ config.transformer.num_heads = 12
33
+ config.transformer.num_layers = 12
34
+ config.transformer.attention_dropout_rate = 0.0
35
+ config.transformer.dropout_rate = 0.1
36
+ config.classifier = 'token'
37
+ config.representation_size = None
38
+ return config
39
+
40
+
41
+ def get_r50_b16_config():
42
+ """Returns the Resnet50 + ViT-B/16 configuration."""
43
+ config = get_b16_config()
44
+ del config.patches.size
45
+ config.patches.grid = (14, 14)
46
+ config.resnet = ml_collections.ConfigDict()
47
+ config.resnet.num_layers = (3, 4, 9)
48
+ config.resnet.width_factor = 1
49
+ return config
50
+
51
+
52
+ def get_b32_config():
53
+ """Returns the ViT-B/32 configuration."""
54
+ config = get_b16_config()
55
+ config.patches.size = (32, 32)
56
+ return config
57
+
58
+
59
+ def get_b8_config():
60
+ """Returns the ViT-B/32 configuration."""
61
+ config = get_b16_config()
62
+ config.patches.size = (8, 8)
63
+ return config
64
+
65
+
66
+ def get_l16_config():
67
+ """Returns the ViT-L/16 configuration."""
68
+ config = ml_collections.ConfigDict()
69
+ config.patches = ml_collections.ConfigDict({'size': (16, 16)})
70
+ config.hidden_size = 1024
71
+ config.transformer = ml_collections.ConfigDict()
72
+ config.transformer.mlp_dim = 4096
73
+ config.transformer.num_heads = 16
74
+ config.transformer.num_layers = 24
75
+ config.transformer.attention_dropout_rate = 0.0
76
+ config.transformer.dropout_rate = 0.1
77
+ config.classifier = 'token'
78
+ config.representation_size = None
79
+ return config
80
+
81
+
82
+ def get_l32_config():
83
+ """Returns the ViT-L/32 configuration."""
84
+ config = get_l16_config()
85
+ config.patches.size = (32, 32)
86
+ return config
87
+
88
+
89
+ def get_h14_config():
90
+ """Returns the ViT-L/16 configuration."""
91
+ config = ml_collections.ConfigDict()
92
+ config.patches = ml_collections.ConfigDict({'size': (14, 14)})
93
+ config.hidden_size = 1280
94
+ config.transformer = ml_collections.ConfigDict()
95
+ config.transformer.mlp_dim = 5120
96
+ config.transformer.num_heads = 16
97
+ config.transformer.num_layers = 32
98
+ config.transformer.attention_dropout_rate = 0.0
99
+ config.transformer.dropout_rate = 0.1
100
+ config.classifier = 'token'
101
+ config.representation_size = None
102
+ return config
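get_b16_config() describes the backbone referenced elsewhere in this repo as sup_vitb16_224: 16x16 patches at the 224px crop size give a 14x14 grid, i.e. the 196 patch tokens that get_similarity_map reshapes above. A quick check:

    from vpt.src.configs.vit_configs import get_b16_config

    config = get_b16_config()
    patch = config.patches.size[0]           # 16
    grid = 224 // patch                      # 14
    print(config.hidden_size, grid * grid)   # 768 196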
vpt/src/utils/distributed.py ADDED
@@ -0,0 +1,168 @@
1
+ #!/usr/bin/env python3
2
+
3
+ """Distributed helpers."""
4
+
5
+ import torch
6
+ import torch.distributed as dist
7
+ _LOCAL_PROCESS_GROUP = None
8
+
9
+
10
+ def get_world_size() -> int:
11
+ if not dist.is_available():
12
+ return 1
13
+ if not dist.is_initialized():
14
+ return 1
15
+ return dist.get_world_size()
16
+
17
+
18
+ def get_rank() -> int:
19
+ if not dist.is_available():
20
+ return 0
21
+ if not dist.is_initialized():
22
+ return 0
23
+ return dist.get_rank()
24
+
25
+
26
+ def is_master_process(num_gpus=8):
27
+ """
28
+ Determines if the current process is the master process.
29
+ """
30
+ if torch.distributed.is_initialized():
31
+ return dist.get_rank() % num_gpus == 0
32
+ else:
33
+ return True
34
+
35
+
36
+ def run(
37
+ local_rank,
38
+ num_proc,
39
+ func,
40
+ init_method,
41
+ shard_id,
42
+ num_shards,
43
+ backend,
44
+ cfg,
45
+ args,
46
+ ):
47
+ """
48
+ Runs a function from a child process.
49
+ Args:
50
+ local_rank (int): rank of the current process on the current machine.
51
+ num_proc (int): number of processes per machine.
52
+ func (function): function to execute on each of the processes.
53
+ init_method (string): method to initialize the distributed training.
54
+ TCP initialization: requiring a network address reachable from all
55
+ processes followed by the port.
56
+ Shared file-system initialization: makes use of a file system that
57
+ is shared and visible from all machines. The URL should start with
58
+ file:// and contain a path to a non-existent file on a shared file
59
+ system.
60
+ shard_id (int): the rank of the current machine.
61
+ num_shards (int): number of overall machines for the distributed
62
+ training job.
63
+ backend (string): three distributed backends ('nccl', 'gloo', 'mpi') are
64
+ supported, each with different capabilities. Details can be found
65
+ here:
66
+ https://pytorch.org/docs/stable/distributed.html
67
+ cfg (CfgNode): configs. Details can be found in
68
+ loco/config/defaults.py
69
+ """
70
+ # Initialize the process group.
71
+ # shard_id = get_rank()
72
+ world_size = num_proc * num_shards
73
+ rank = shard_id * num_proc + local_rank
74
+
75
+ try:
76
+ torch.distributed.init_process_group(
77
+ backend=backend,
78
+ init_method=init_method,
79
+ world_size=world_size,
80
+ rank=rank,
81
+ )
82
+ except Exception as e:
83
+ raise e
84
+
85
+ torch.cuda.set_device(local_rank)
86
+ func(cfg, args)
87
+
88
+
89
+ def destroy_process_group():
90
+ """Destroys the default process group."""
91
+ torch.distributed.destroy_process_group()
92
+
93
+
94
+ def scaled_all_reduce(cfg, tensors):
95
+ """Performs the scaled all_reduce operation on the provided tensors.
96
+
97
+ The input tensors are modified in-place. Currently supports only the sum
98
+ reduction operator. The reduced values are scaled by the inverse size of
99
+ the process group (equivalent to cfg.NUM_GPUS).
100
+ """
101
+ # Queue the reductions
102
+ reductions = []
103
+ for tensor in tensors:
104
+ reduction = torch.distributed.all_reduce(tensor, async_op=True)
105
+ reductions.append(reduction)
106
+ # Wait for reductions to finish
107
+ for reduction in reductions:
108
+ reduction.wait()
109
+ # Scale the results
110
+ for tensor in tensors:
111
+ tensor.mul_(1.0 / cfg.NUM_GPUS / cfg.NUM_SHARDS)
112
+ return tensors
113
+
114
+
115
+ def cat_all_gather(tensors):
116
+ """Performs the concatenated all_gather operation on the provided tensors.
117
+ """
118
+ tensors_gather = [
119
+ torch.ones_like(tensors)
120
+ for _ in range(torch.distributed.get_world_size())
121
+ ]
122
+ torch.distributed.all_gather(tensors_gather, tensors, async_op=False)
123
+
124
+ output = torch.cat(tensors_gather, dim=0)
125
+ return output
126
+
127
+
128
+ def local_cat_all_gather(tensors):
129
+ """Performs the concatenated all_gather operation on the provided tensors.
130
+ """
131
+ tensors_gather = [
132
+ torch.ones_like(tensors)
133
+ for _ in range(get_local_size())
134
+ ]
135
+ torch.distributed.all_gather(
136
+ tensors_gather,
137
+ tensors,
138
+ async_op=False,
139
+ group=_LOCAL_PROCESS_GROUP,
140
+ )
141
+ output = torch.cat(tensors_gather, dim=0)
142
+ return output
143
+
144
+
145
+ def get_local_size():
146
+ """
147
+ Returns:
148
+ The size of the per-machine process group,
149
+ i.e. the number of processes per machine.
150
+ """
151
+ if not dist.is_available():
152
+ return 1
153
+ if not dist.is_initialized():
154
+ return 1
155
+ return dist.get_world_size(group=_LOCAL_PROCESS_GROUP)
156
+
157
+
158
+ def get_local_rank():
159
+ """
160
+ Returns:
161
+ The rank of the current process within the local (per-machine) process group.
162
+ """
163
+ if not dist.is_available():
164
+ return 0
165
+ if not dist.is_initialized():
166
+ return 0
167
+ assert _LOCAL_PROCESS_GROUP is not None
168
+ return dist.get_rank(group=_LOCAL_PROCESS_GROUP)
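The query helpers above fall back to sensible single-process values when torch.distributed has never been initialised, which keeps single-device runs working without any process-group setup. Illustrative behaviour in a plain Python process:

    from vpt.src.utils import distributed as du

    print(du.get_world_size())      # 1
    print(du.get_rank())            # 0
    print(du.is_master_process())   # True, so logging and printing stay enabled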
vpt/src/utils/file_io.py ADDED
@@ -0,0 +1,11 @@
1
+ #!/usr/bin/env python3
2
+
3
+ """
4
+ Project specific pathmanagers for a project as recommended by Detectron2
5
+ """
6
+ from iopath.common.file_io import PathManager as PathManagerBase
7
+ from iopath.common.file_io import HTTPURLHandler
8
+
9
+
10
+ PathManager = PathManagerBase()
11
+ PathManager.register_handler(HTTPURLHandler())
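This PathManager instance is the one used by the config and logging code above; with HTTPURLHandler registered it can also read http(s) URLs through the same open() call. A small local-path sketch:

    from vpt.src.utils.file_io import PathManager

    PathManager.mkdirs("output/demo")
    with PathManager.open("output/demo/notes.txt", "w") as f:
        f.write("hello")
    print(PathManager.exists("output/demo/notes.txt"))   # True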
vpt/src/utils/logging.py ADDED
@@ -0,0 +1,186 @@
1
+ #!/usr/bin/env python3
2
+
3
+ """Logging."""
4
+
5
+ import builtins
6
+ import decimal
7
+ import functools
8
+ import logging
9
+ import simplejson
10
+ import sys
11
+ import os
12
+ from termcolor import colored
13
+
14
+ from .distributed import is_master_process
15
+ from .file_io import PathManager
16
+
17
+ # Show filename and line number in logs
18
+ _FORMAT = "[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s"
19
+
20
+
21
+ def _suppress_print():
22
+ """Suppresses printing from the current process."""
23
+
24
+ def print_pass(*objects, sep=" ", end="\n", file=sys.stdout, flush=False):
25
+ pass
26
+
27
+ builtins.print = print_pass
28
+
29
+
30
+ # cache the opened file object, so that different calls to `setup_logger`
31
+ # with the same file name can safely write to the same file.
32
+ @functools.lru_cache(maxsize=None)
33
+ def _cached_log_stream(filename):
34
+ return PathManager.open(filename, "a")
35
+
36
+
37
+ @functools.lru_cache() # so that calling setup_logger multiple times won't add many handlers # noqa
38
+ def setup_logging(
39
+ num_gpu, num_shards, output="", name="visual_prompt", color=True):
40
+ """Sets up the logging."""
41
+ # Enable logging only for the master process
42
+ if is_master_process(num_gpu):
43
+ # Clear the root logger to prevent any existing logging config
44
+ # (e.g. set by another module) from messing with our setup
45
+ logging.root.handlers = []
46
+ # Configure logging
47
+ logging.basicConfig(
48
+ level=logging.INFO, format=_FORMAT, stream=sys.stdout
49
+ )
50
+ else:
51
+ _suppress_print()
52
+
53
+ if name is None:
54
+ name = __name__
55
+ logger = logging.getLogger(name)
56
+ # remove any lingering handler
57
+ logger.handlers.clear()
58
+
59
+ logger.setLevel(logging.INFO)
60
+ logger.propagate = False
61
+
62
+ plain_formatter = logging.Formatter(
63
+ "[%(asctime)s][%(levelname)s] %(name)s: %(lineno)4d: %(message)s",
64
+ datefmt="%m/%d %H:%M:%S",
65
+ )
66
+ if color:
67
+ formatter = _ColorfulFormatter(
68
+ colored("[%(asctime)s %(name)s]: ", "green") + "%(message)s",
69
+ datefmt="%m/%d %H:%M:%S",
70
+ root_name=name,
71
+ abbrev_name=str(name),
72
+ )
73
+ else:
74
+ formatter = plain_formatter
75
+
76
+ if is_master_process(num_gpu):
77
+ ch = logging.StreamHandler(stream=sys.stdout)
78
+ ch.setLevel(logging.DEBUG)
79
+ ch.setFormatter(formatter)
80
+ logger.addHandler(ch)
81
+
82
+ if is_master_process(num_gpu * num_shards):
83
+ if len(output) > 0:
84
+ if output.endswith(".txt") or output.endswith(".log"):
85
+ filename = output
86
+ else:
87
+ filename = os.path.join(output, "logs.txt")
88
+
89
+ PathManager.mkdirs(os.path.dirname(filename))
90
+
91
+ fh = logging.StreamHandler(_cached_log_stream(filename))
92
+ fh.setLevel(logging.DEBUG)
93
+ fh.setFormatter(plain_formatter)
94
+ logger.addHandler(fh)
95
+ return logger
96
+
97
+
98
+ def setup_single_logging(name, output=""):
99
+ """Sets up the logging."""
100
+ # Enable logging only for the master process
101
+ # Clear the root logger to prevent any existing logging config
102
+ # (e.g. set by another module) from messing with our setup
103
+ logging.root.handlers = []
104
+ # Configure logging
105
+ logging.basicConfig(
106
+ level=logging.INFO, format=_FORMAT, stream=sys.stdout
107
+ )
108
+
109
+ if len(name) == 0:
110
+ name = __name__
111
+ logger = logging.getLogger(name)
112
+ logger.setLevel(logging.INFO)
113
+ logger.propagate = False
114
+
115
+ plain_formatter = logging.Formatter(
116
+ "[%(asctime)s][%(levelname)s] %(name)s: %(lineno)4d: %(message)s",
117
+ datefmt="%m/%d %H:%M:%S",
118
+ )
119
+ formatter = _ColorfulFormatter(
120
+ colored("[%(asctime)s %(name)s]: ", "green") + "%(message)s",
121
+ datefmt="%m/%d %H:%M:%S",
122
+ root_name=name,
123
+ abbrev_name=str(name),
124
+ )
125
+
126
+ ch = logging.StreamHandler(stream=sys.stdout)
127
+ ch.setLevel(logging.DEBUG)
128
+ ch.setFormatter(formatter)
129
+ logger.addHandler(ch)
130
+
131
+ if len(output) > 0:
132
+ if output.endswith(".txt") or output.endswith(".log"):
133
+ filename = output
134
+ else:
135
+ filename = os.path.join(output, "logs.txt")
136
+
137
+ PathManager.mkdirs(os.path.dirname(filename))
138
+
139
+ fh = logging.StreamHandler(_cached_log_stream(filename))
140
+ fh.setLevel(logging.DEBUG)
141
+ fh.setFormatter(plain_formatter)
142
+ logger.addHandler(fh)
143
+
144
+ return logger
145
+
146
+
147
+ def get_logger(name):
148
+ """Retrieves the logger."""
149
+ return logging.getLogger(name)
150
+
151
+
152
+ def log_json_stats(stats, sort_keys=True):
153
+ """Logs json stats."""
154
+ # It seems that in Python >= 3.6 json.encoder.FLOAT_REPR has no effect
155
+ # Use decimal+string as a workaround for having fixed length values in logs
156
+ logger = get_logger(__name__)
157
+ stats = {
158
+ k: decimal.Decimal("{:.6f}".format(v)) if isinstance(v, float) else v
159
+ for k, v in stats.items()
160
+ }
161
+ json_stats = simplejson.dumps(stats, sort_keys=True, use_decimal=True)
162
+ if stats["_type"] == "test_epoch" or stats["_type"] == "train_epoch":
163
+ logger.info("json_stats: {:s}".format(json_stats))
164
+ else:
165
+ logger.info("{:s}".format(json_stats))
166
+
167
+
168
+ class _ColorfulFormatter(logging.Formatter):
169
+ # from detectron2
170
+ def __init__(self, *args, **kwargs):
171
+ self._root_name = kwargs.pop("root_name") + "."
172
+ self._abbrev_name = kwargs.pop("abbrev_name", "")
173
+ if len(self._abbrev_name):
174
+ self._abbrev_name = self._abbrev_name + "."
175
+ super(_ColorfulFormatter, self).__init__(*args, **kwargs)
176
+
177
+ def formatMessage(self, record: logging.LogRecord) -> str:
178
+ record.name = record.name.replace(self._root_name, self._abbrev_name)
179
+ log = super(_ColorfulFormatter, self).formatMessage(record)
180
+ if record.levelno == logging.WARNING:
181
+ prefix = colored("WARNING", "red", attrs=["blink"])
182
+ elif record.levelno == logging.ERROR or record.levelno == logging.CRITICAL:
183
+ prefix = colored("ERROR", "red", attrs=["blink", "underline"])
184
+ else:
185
+ return log
186
+ return prefix + " " + log
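A minimal usage sketch of the logging setup above (single process, writing both to stdout and to a log file; the directory is created through PathManager):

    from vpt.src.utils.logging import setup_logging, get_logger

    logger = setup_logging(num_gpu=1, num_shards=1, output="output/demo", name="visual_prompt")
    logger.info("starting")                        # coloured stdout line plus a plain copy in output/demo/logs.txt
    assert get_logger("visual_prompt") is logger   # other modules can fetch the same logger by name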