In [25]:
import os
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt

# DinoV2

In [26]:
# need to change TORCH_HOME env variable to specify pretrained model should go in shared folder, not home directory
os.environ['TORCH_HOME'] = '/fsx/proj-fmri/shared/cache/dinov2'
dinov2_model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14')
# remove initial image patching
dinov2_model.patch_embed = nn.Identity()
dinov2_model.patch_embed = nn.Identity()

Using cache found in /fsx/proj-fmri/shared/cache/dinov2/hub/facebookresearch_dinov2_main


In [27]:
dinov2_model.to("cuda")
input = torch.randn((2,33,1024)).to("cuda")

for block in dinov2_model.blocks: input = block(input)
input = dinov2_model.norm(input)

print(input.shape)

torch.Size([2, 33, 1024])


# eva

In [28]:
from urllib.request import urlopen
from PIL import Image
import timm

img = Image.open(urlopen(
 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png'
))

model = timm.create_model(
 "eva02_enormous_patch14_clip_224.laion2b",
 pretrained=True,
 num_classes=0, # remove classifier nn.Linear
)
model = model.eval()

In [39]:
# get model specific transforms (normalization, resize)
data_config = timm.data.resolve_model_data_config(model)
transforms = timm.data.create_transform(**data_config, is_training=False)

output = model(transforms(img).unsqueeze(0)) # output is (batch_size, num_features) shaped tensor
print(output.shape)
# or equivalently (without needing to set num_classes=0)

output = model.forward_features(transforms(img).unsqueeze(0))
# output is unpooled, a (1, 257, 768) shaped tensor
print(output.shape)

output = model.forward_head(output, pre_logits=True)
# output is a (1, num_features) shaped tensor
print(output.shape)

ValueError: You have to specify pixel_values

In [None]:
model.forward_features(transforms(img).unsqueeze(0)).shape

# Eva02-clip

In [29]:
import timm 
# couldnt figure out how to load pretrained model from shared folder rather than home directory using timm...
eva02_model = timm.create_model("eva02_enormous_patch14_clip_224.laion2b", pretrained=True)
# eva02_model.head_drop = nn.Identity()
# eva02_model.head = nn.Identity()

In [30]:
eva02_model(torch.randn((2,3,224,224))).shape

torch.Size([2, 1024])

In [31]:
image = torch.randn((2,3,224,224))

input = eva02_model.patch_embed(image)
input = eva02_model.pos_drop(input)
for block in eva02_model.blocks: input = block(input)
input = eva02_model.norm(input)
input = eva02_model.fc_norm(input)
input = eva02_model.head_drop(input)
input = eva02_model.head(input)

print(input.shape)

torch.Size([2, 256, 1024])


In [32]:
help(eva02_model)

Help on Eva in module timm.models.eva object:

class Eva(torch.nn.modules.module.Module)
 | Eva(img_size: Union[int, Tuple[int, int]] = 224, patch_size: Union[int, Tuple[int, int]] = 16, in_chans: int = 3, num_classes: int = 1000, global_pool: str = 'avg', embed_dim: int = 768, depth: int = 12, num_heads: int = 12, qkv_bias: bool = True, qkv_fused: bool = True, mlp_ratio: float = 4.0, swiglu_mlp: bool = False, scale_mlp: bool = False, scale_attn_inner: bool = False, drop_rate: float = 0.0, pos_drop_rate: float = 0.0, patch_drop_rate: float = 0.0, proj_drop_rate: float = 0.0, attn_drop_rate: float = 0.0, drop_path_rate: float = 0.0, norm_layer: Callable = , init_values: Optional[float] = None, class_token: bool = True, use_abs_pos_emb: bool = True, use_rot_pos_emb: bool = False, use_post_norm: bool = False, ref_feat_shape: Union[int, Tuple[int, int], NoneType] = None, head_init_scale: float = 0.001)
 | 
 | Eva Vision Transformer w/ Abs & Rotary Pos Embed
 | 
 | This class implements the

# DETR

In [33]:
from transformers import DetrImageProcessor, DetrForObjectDetection
import torch
from PIL import Image
import requests

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50", cache_dir='/fsx/proj-fmri/shared/cache')
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50", cache_dir='/fsx/proj-fmri/shared/cache')

[2023-08-28 01:51:14,033] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


In [34]:
inputs = processor(images=image, return_tensors="pt")
outputs = model(**inputs)

In [35]:
# convert outputs (bounding boxes and class logits) to COCO API
# let's only keep detections with score > 0.9
target_sizes = torch.tensor([image.size[::-1]])
results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.9)[0]

for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
 box = [round(i, 2) for i in box.tolist()]
 print(
 f"Detected {model.config.id2label[label.item()]} with confidence "
 f"{round(score.item(), 3)} at location {box}"
 )

Detected remote with confidence 0.998 at location [40.16, 70.81, 175.55, 117.98]
Detected remote with confidence 0.996 at location [333.24, 72.55, 368.33, 187.66]
Detected couch with confidence 0.995 at location [-0.02, 1.15, 639.73, 473.76]
Detected cat with confidence 0.999 at location [13.24, 52.05, 314.02, 470.93]
Detected cat with confidence 0.999 at location [345.4, 23.85, 640.37, 368.72]


In [36]:
processor

DetrImageProcessor {
 "do_normalize": true,
 "do_pad": true,
 "do_rescale": true,
 "do_resize": true,
 "feature_extractor_type": "DetrFeatureExtractor",
 "format": "coco_detection",
 "image_mean": [
 0.485,
 0.456,
 0.406
 ],
 "image_processor_type": "DetrImageProcessor",
 "image_std": [
 0.229,
 0.224,
 0.225
 ],
 "resample": 2,
 "rescale_factor": 0.00392156862745098,
 "size": {
 "longest_edge": 1333,
 "shortest_edge": 800
 }
}

# CLIPSeg

In [37]:
from transformers import CLIPSegProcessor, CLIPSegForImageSegmentation

processor = CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd16",cache_dir='/fsx/proj-fmri/shared/cache')
model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd16",cache_dir='/fsx/proj-fmri/shared/cache')

In [38]:
image.shape

AttributeError: 'JpegImageFile' object has no attribute 'shape'

In [None]:
from PIL import Image
import requests
import h5py

# url = "https://unsplash.com/photos/8Nc_oQsc2qQ/download?ixid=MnwxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNjcxMjAwNzI0&force=true&w=640"
# image = Image.open(requests.get(url, stream=True).raw)

image_path = "/fsx/proj-fmri/shared/mindeyev2_dataset/coco_images_224_float16.hdf5"
with h5py.File(image_path, 'r') as file:
 image = file['images'][0]
image = np.moveaxis(image, 0, -1).astype(np.float32)
plt.imshow(image)

prompts = ["person","animal","object","background"]
import torch

# Rescale to [0, 255]
array = (image * 255).astype(np.uint8)

# Convert to PIL image
image = Image.fromarray(array)

inputs = processor(text=prompts, images=[image] * len(prompts), padding="max_length", return_tensors="pt")
# predict
with torch.no_grad():
 outputs = model(**inputs)
preds = outputs.logits.unsqueeze(1)
print(preds.shape)

In [None]:
preds = ((preds[0] + preds[1] + preds[2] + preds[-1].max() - preds[-1]) / 4)[None]
preds.shape

In [None]:
_, ax = plt.subplots(1, len(prompts) + 1, figsize=(3*(len(prompts) + 1), 4))
[a.axis('off') for a in ax.flatten()]
ax[0].imshow(image)
[ax[i+1].imshow(torch.sigmoid(preds[i][0])) for i in range(1)];
# [ax[i+1].text(0, -15, prompt) for i, prompt in enumerate(prompts)];