{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "ef9e1556-7840-4004-b181-a2c97ac2ab17",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b6f12dd4-f3aa-4981-b604-b72e67229011",
   "metadata": {},
   "source": [
    "# DinoV2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "2a604617-b602-4503-b288-e9828684505e",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using cache found in /fsx/proj-fmri/shared/cache/dinov2/hub/facebookresearch_dinov2_main\n"
     ]
    }
   ],
   "source": [
    "# Point TORCH_HOME at the shared folder so torch.hub caches the pretrained model there, not in the home directory.\n",
    "os.environ['TORCH_HOME'] = '/fsx/proj-fmri/shared/cache/dinov2'\n",
    "dinov2_model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14')\n",
    "# Remove the initial image patching: the patch-embedding layer becomes a no-op.\n",
    "dinov2_model.patch_embed = nn.Identity()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "32da913d-d931-4967-a5e8-bd40c21d1ad9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "torch.Size([2, 33, 1024])\n"
     ]
    }
   ],
   "source": [
    "dinov2_model.to(\"cuda\")\n",
    "# Random stand-in fed straight to the transformer blocks; presumably (batch, tokens, embed_dim) -- TODO confirm.\n",
    "# NOTE: `input` shadows the Python builtin; the name is kept in case later cells rely on it.\n",
    "input = torch.randn((2,33,1024)).to(\"cuda\")\n",
    "\n",
    "# Inference only: no_grad avoids recording an autograd graph through every block.\n",
    "with torch.no_grad():\n",
    "    for block in dinov2_model.blocks:\n",
    "        input = block(input)\n",
    "    input = dinov2_model.norm(input)\n",
    "\n",
    "print(input.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "febe89c0-06d0-4309-b378-a8d58b99bf4c",
   "metadata": {},
   "source": [
    "# eva"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "690204d0-13d7-452b-97af-14d144800e81",
   "metadata": {},
   "outputs": [],
   "source": [
    "from urllib.request import urlopen\n",
    "from PIL import Image\n",
    "import timm\n",
    "\n",
    "img = Image.open(urlopen(\n",
    " 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png'\n",
"))\n", "\n", "model = timm.create_model(\n", " \"eva02_enormous_patch14_clip_224.laion2b\",\n", " pretrained=True,\n", " num_classes=0, # remove classifier nn.Linear\n", ")\n", "model = model.eval()" ] }, { "cell_type": "code", "execution_count": 39, "id": "035e3e9d-86c9-4ddf-b760-7b78dded7d2e", "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "You have to specify pixel_values", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[39], line 5\u001b[0m\n\u001b[1;32m 2\u001b[0m data_config \u001b[38;5;241m=\u001b[39m timm\u001b[38;5;241m.\u001b[39mdata\u001b[38;5;241m.\u001b[39mresolve_model_data_config(model)\n\u001b[1;32m 3\u001b[0m transforms \u001b[38;5;241m=\u001b[39m "metadata": {}, "outputs": [], "source": [ "model.forward_features(transforms(img).unsqueeze(0)).shape" ] }, { "cell_type": "markdown", "id": "6546c673-f3ab-4d43-a051-cab20e782bab", "metadata": {}, "source": [ "# Eva02-clip" ] }, { "cell_type": "code", "execution_count": 29, "id": "dfbc95de-9af9-4583-98fc-b8061114ef64", "metadata": {}, "outputs": [], "source": [ "import timm \n", "# couldnt figure out how to load pretrained model from shared folder rather than home directory using timm...\n", "eva02_model = timm.create_model(\"eva02_enormous_patch14_clip_224.laion2b\", pretrained=True)\n", "# eva02_model.head_drop = nn.Identity()\n", "# eva02_model.head = nn.Identity()" ] }, { "cell_type": "code", "execution_count": 30, "id": "97e3ea29-ae6b-4bd2-b3d7-17839098a6e4", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([2, 1024])" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "eva02_model(torch.randn((2,3,224,224))).shape" ] Please specify in `size['longest_edge'] instead`.\n" ] } ], "source": [ "from transformers import DetrImageProcessor, 
DetrForObjectDetection\n",
    "import torch\n",
    "from PIL import Image\n",
    "import requests\n",
    "\n",
    "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n",
    "# Fail fast on a stalled connection or an HTTP error instead of handing PIL a broken stream.\n",
    "response = requests.get(url, stream=True, timeout=30)\n",
    "response.raise_for_status()\n",
    "image = Image.open(response.raw)\n",
    "\n",
    "processor = DetrImageProcessor.from_pretrained(\"facebook/detr-resnet-50\", cache_dir='/fsx/proj-fmri/shared/cache')\n",
    "model = DetrForObjectDetection.from_pretrained(\"facebook/detr-resnet-50\", cache_dir='/fsx/proj-fmri/shared/cache')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "1d5aa2d7-4868-4751-8d90-7c52be028cd9",
   "metadata": {},
   "outputs": [],
   "source": [
    "inputs = processor(images=image, return_tensors=\"pt\")\n",
    "# Inference only: no_grad keeps the forward pass from recording an autograd graph.\n",
    "with torch.no_grad():\n",
    "    outputs = model(**inputs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "ae6bafc6-cee4-4e59-b7ba-12efc2a65b74",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Detected remote with confidence 0.998 at location [40.16, 70.81, 175.55, 117.98]\n",
      "Detected remote with confidence 0.996 at location [333.24, 72.55, 368.33, 187.66]\n",
      "Detected couch with confidence 0.995 at location [-0.02, 1.15, 639.73, 473.76]\n",
      "Detected cat with confidence 0.999 at location [13.24, 52.05, 314.02, 470.93]\n",
      "Detected cat with confidence 0.999 at location [345.4, 23.85, 640.37, 368.72]\n"
     ]
    }
   ],
   "source": [
    "# convert outputs (bounding boxes and class logits) to COCO API\n",
    "# let's only keep detections with score > 0.9\n",
    "# PIL's size is (width, height); it is reversed before being passed as target_sizes.\n",
    "target_sizes = torch.tensor([image.size[::-1]])\n",
    "results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.9)[0]\n",
    "\n",
    "for score, label, box in zip(results[\"scores\"], results[\"labels\"], results[\"boxes\"]):\n",
    "    box = [round(i, 2) for i in box.tolist()]\n",
    "    print(\n",
    "        f\"Detected {model.config.id2label[label.item()]} with confidence \"\n",
    "        f\"{round(score.item(), 3)} at location {box}\"\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "6dcc5934-79d4-4062-8b32-e42b3ebcdc0f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DetrImageProcessor {\n",
       " \"do_normalize\": true,\n",
       " \"do_pad\": true,\n",
       " \"do_rescale\": true,\n",
       " \"do_resize\": true,\n",
       " \"feature_extractor_type\": \"DetrFeatureExtractor\",\n",
       " \"format\": \"coco_detection\",\n",
       " \"image_mean\": [\n",
       " 0.485,\n",
       " 0.456,\n",
       " 0.406\n",
       " ],\n",
       " \"image_processor_type\": \"DetrImageProcessor\",\n",
       " \"image_std\": [\n",
       " 0.229,\n",
       " 0.224,\n",
       " 0.225\n",
       " ],\n",
       " \"resample\": 2,\n",
       " \"rescale_factor\": 0.00392156862745098,\n",
       " \"size\": {\n",
       " \"longest_edge\": 1333,\n",
       " \"shortest_edge\": 800\n",
       " }\n",
       "}"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "processor"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "db1d89cc-b432-473e-af69-d81c435ac731",
   "metadata": {},
   "source": [
    "# CLIPSeg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "15db14d1-ee4d-4429-9286-054c4498293b",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import CLIPSegProcessor, CLIPSegForImageSegmentation\n",
    "\n",
    "# NOTE: this rebinds `processor` and `model`, which held the DETR objects until here.\n",
    "processor = CLIPSegProcessor.from_pretrained(\"CIDAS/clipseg-rd16\",cache_dir='/fsx/proj-fmri/shared/cache')\n",
    "model = CLIPSegForImageSegmentation.from_pretrained(\"CIDAS/clipseg-rd16\",cache_dir='/fsx/proj-fmri/shared/cache')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4aa225d4-5a3b-4dbb-ae57-dea2872ff492",
   "metadata": {},
   "outputs": [],
   "source": [
    "# `image` is still a PIL image here; PIL exposes `.size` (width, height), not `.shape`.\n",
    "image.size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ad7e2daf-0c7c-4fec-b29e-9ba47a037c6b",
   "metadata": {},
   "outputs": [],
   "source": [
    "from PIL import Image\n",
    "import requests\n",
    "import h5py\n",
    "import torch\n",
    "\n",
    "image_path = \"/fsx/proj-fmri/shared/mindeyev2_dataset/coco_images_224_float16.hdf5\"\n",
    "with h5py.File(image_path, 'r') as file:\n",
    "    image_chw = file['images'][0]\n",
    "# Move axis 0 to the end for plotting; presumably (C, H, W) -> (H, W, C) -- TODO confirm.\n",
    "image_float = np.moveaxis(image_chw, 0, -1).astype(np.float32)\n",
    "plt.imshow(image_float)\n",
    "\n",
    "prompts = [\"person\",\"animal\",\"object\",\"background\"]\n",
    "\n",
    "# Rescale to [0, 255] (assumes stored pixels lie in [0, 1] -- TODO confirm);\n",
    "# clip first so any out-of-range value cannot wrap around in the uint8 cast.\n",
    "array = np.clip(image_float * 255, 0, 255).astype(np.uint8)\n",
    "\n",
    "# Convert to PIL image\n",
    "image = Image.fromarray(array)\n",
    "\n",
    "# One copy of the image per text prompt.\n",
    "inputs = processor(text=prompts, images=[image] * len(prompts), padding=\"max_length\", return_tensors=\"pt\")\n",
    "# predict\n",
    "with torch.no_grad():\n",
    "    outputs = model(**inputs)\n",
    "# Keep the raw per-prompt logits under their own name so the fusion cell below can be re-run safely.\n",
    "prompt_logits = outputs.logits.unsqueeze(1)\n",
    "print(prompt_logits.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "131eb5b7-2f16-4a79-8402-edc1a1d8c348",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fuse the prompt maps: sum every non-background map with the inverted background map\n",
    "# (background.max() - background), then average over all prompts.\n",
    "# Assumes the last entry of `prompts` is the background prompt.\n",
    "background = prompt_logits[-1]\n",
    "foreground = prompt_logits[:-1].sum(dim=0)\n",
    "preds = ((foreground + background.max() - background) / len(prompts))[None]\n",
    "preds.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e2bf99e7-064d-4c22-997f-aa1a35dbab82",
   "metadata": {},
   "outputs": [],
   "source": [
    "_, ax = plt.subplots(1, len(prompts) + 1, figsize=(3*(len(prompts) + 1), 4))\n",
    "[a.axis('off') for a in ax.flatten()]\n",
    "ax[0].imshow(image)\n",
    "[ax[i+1].imshow(torch.sigmoid(preds[i][0])) for i in range(1)];\n",
    "# [ax[i+1].text(0, -15, 