recoilme committed on
Commit
0c1ffd6
·
1 Parent(s): 5a2f01c
1.png CHANGED
README.md CHANGED
@@ -16,12 +16,22 @@ waifu is a free text-to-image model that can efficiently generate images in 80 l
16
  (2) [**Linear DiT**](https://github.com/NVlabs/Sana): we use a 1.6B DiT transformer with linear attention. \
17
  (3) [**MEXMA-SigLIP**](https://huggingface.co/visheratin/mexma-siglip): MEXMA-SigLIP is a model that combines the [MEXMA](https://huggingface.co/facebook/MEXMA) multilingual text encoder and an image encoder from the [SigLIP](https://huggingface.co/timm/ViT-SO400M-14-SigLIP-384) model. This gives us a high-performance CLIP model for 80 languages. \
18
  (4) Other: we use the Flow-Euler sampler, the Adafactor-Fused optimizer and bf16 precision for training, and combine efficient caption labeling (MoonDream, CogVLM, human, GPT) with Danbooru tags to accelerate convergence.
20
 
21
  ## Example
22
 
23
  ```bash
24
- # install diffusers from source
25
  pip install git+https://github.com/huggingface/diffusers
26
  ```
27
 
@@ -60,7 +70,8 @@ for img in image:
60
 
61
  ## Donations
62
 
63
- We are a small GPU poor group of enthusiasts (current train budget ~$2k)
 
64
 
65
  Please contact us if you can provide GPUs for training.
66
 
 
16
  (2) [**Linear DiT**](https://github.com/NVlabs/Sana): we use a 1.6B DiT transformer with linear attention. \
17
  (3) [**MEXMA-SigLIP**](https://huggingface.co/visheratin/mexma-siglip): MEXMA-SigLIP is a model that combines the [MEXMA](https://huggingface.co/facebook/MEXMA) multilingual text encoder and an image encoder from the [SigLIP](https://huggingface.co/timm/ViT-SO400M-14-SigLIP-384) model. This gives us a high-performance CLIP model for 80 languages (a short usage sketch follows this list). \
18
  (4) Other: we use the Flow-Euler sampler, the Adafactor-Fused optimizer and bf16 precision for training, and combine efficient caption labeling (MoonDream, CogVLM, human, GPT) with Danbooru tags to accelerate convergence.
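For reference, here is a minimal sketch of the image-embedding side of MEXMA-SigLIP, the multilingual CLIP from point (3). The `encode_images` call mirrors the one used in `Untitled.ipynb` in this repo, and loading through `AutoModel` with `trust_remote_code` follows the MEXMA-SigLIP model card; the `AutoImageProcessor` preprocessing here is an assumption, so check the model card for the exact API.

```py
# Rough sketch: image embeddings from MEXMA-SigLIP (multilingual CLIP).
# `encode_images` mirrors Untitled.ipynb; AutoImageProcessor is an assumption here.
import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModel

model = AutoModel.from_pretrained(
    "visheratin/mexma-siglip", torch_dtype=torch.bfloat16, trust_remote_code=True
).to("cuda").eval()
processor = AutoImageProcessor.from_pretrained("visheratin/mexma-siglip")

pixels = processor(images=Image.open("1.png"), return_tensors="pt")["pixel_values"]
pixels = pixels.to("cuda", dtype=torch.bfloat16)

with torch.inference_mode():
    image_embeddings = model.encode_images(pixels, normalize=True)  # [1, 1152]
print(image_embeddings.shape)

# Prompts in any of the 80 supported languages go through the MEXMA text tower
# analogously; see the model card for the text-encoding helper.
```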
20
 
21
+ ## Pros
22
+ - Small model that can be trained on a consumer GPU; training is fast.
23
+ - Supports multiple languages and demonstrates good prompt adherence.
24
+ - Utilizes the best 16-channel VAE (Variational Autoencoder).
25
+
26
+ ## Cons
27
+ - Trained on only 2 million images (low-budget model, approximately $3,000).
28
+ - Training dataset consists primarily of anime and illustrations (only about 1% realistic images).
29
+ - Low resolution only for now (512px).
30
 
31
  ## Example
32
 
33
  ```bash
34
+ # First, install the latest diffusers from source
35
  pip install git+https://github.com/huggingface/diffusers
36
  ```
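The full generation example continues below in this README (the `for img in image:` hunk saves the outputs). For orientation only, here is a rough sketch of the usual diffusers flow; it assumes the checkpoint loads through `SanaPipeline`, and the repo id below is a placeholder.

```py
# Sketch only; the README's own example may differ.
# Assumptions: the checkpoint loads via diffusers' SanaPipeline,
# and "AiArtLab/waifu" is a placeholder repo id.
import torch
from diffusers import SanaPipeline

pipe = SanaPipeline.from_pretrained("AiArtLab/waifu", torch_dtype=torch.bfloat16)
pipe.to("cuda")

image = pipe(
    prompt="a cute anime girl with silver hair, watercolor style",
    height=512,                # the model is low-res (512px) for now
    width=512,
    guidance_scale=4.5,
    num_inference_steps=20,
).images                       # a list of PIL images

for i, img in enumerate(image):
    img.save(f"waifu_{i}.png")
```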
37
 
 
70
 
71
  ## Donations
72
 
73
+ We are a small, GPU-poor group of enthusiasts (current training budget: ~$3k).
74
+ ![image](./low.png)
75
 
76
  Please contact us if you can provide GPUs for training.
77
 
Untitled.ipynb CHANGED
@@ -292,7 +292,7 @@
292
  },
293
  {
294
  "cell_type": "code",
295
- "execution_count": 6,
296
  "id": "5d85c3c7-3b5b-40c8-bc2b-6b4d344287d9",
297
  "metadata": {},
298
  "outputs": [
@@ -300,8 +300,7 @@
300
  "name": "stdout",
301
  "output_type": "stream",
302
  "text": [
303
- "tensor([[-0.0287, -0.0082, 0.0444, ..., -0.0011, 0.0306, 0.0251]],\n",
304
- " device='cuda:0', dtype=torch.bfloat16)\n"
305
  ]
306
  }
307
  ],
@@ -328,10 +327,10 @@
328
  "\n",
329
  "# Get the image embeddings\n",
330
  "with torch.inference_mode():\n",
331
- " image_embeddings = model.encode_images(img, normalize=True)\n",
332
  "\n",
333
  "# Print the embeddings\n",
334
- "print(image_embeddings)"
335
  ]
336
  },
337
  {
@@ -617,61 +616,133 @@
617
  },
618
  {
619
  "cell_type": "code",
620
- "execution_count": null,
621
  "id": "6fc2606b-cf1c-488a-a8fc-d98a4abcc8c0",
622
  "metadata": {},
623
  "outputs": [
624
  {
625
- "data": {
626
- "application/vnd.jupyter.widget-view+json": {
627
- "model_id": "9d4f5c2ccfbe4b2382b4943a7cfc51b3",
628
- "version_major": 2,
629
- "version_minor": 0
630
- },
631
- "text/plain": [
632
- "open_clip_model.safetensors: 0%| | 0.00/3.51G [00:00<?, ?B/s]"
633
- ]
634
- },
635
- "metadata": {},
636
- "output_type": "display_data"
637
  }
638
  ],
639
  "source": [
640
- "from transformers import AutoProcessor, AutoModel\n",
641
- "from PIL import Image\n",
642
- "import requests\n",
643
  "import torch\n",
644
- "\n",
645
- "# Load the model and processor\n",
646
- "#model = AutoModel.from_pretrained(\"timm/ViT-SO400M-14-SigLIP-384\")\n",
647
- "#processor = AutoProcessor.from_pretrained(\"timm/ViT-SO400M-14-SigLIP-384\")\n",
648
- "\n",
649
  "from open_clip import create_model_from_pretrained, get_tokenizer # works on open-clip-torch>=2.23.0, timm>=0.9.8\n",
650
  "\n",
651
- "model, processor = create_model_from_pretrained('hf-hub:timm/ViT-SO400M-14-SigLIP-384')\n",
 
652
  "\n",
653
  "\n",
654
- "# Load the image\n",
655
- "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n",
656
- "image = Image.open(requests.get(url, stream=True).raw)\n",
657
  "\n",
658
- "# Process the image\n",
659
- "inputs = processor(images=image, return_tensors=\"pt\")\n",
660
  "\n",
661
- "# Extract the embeddings\n",
662
- "with torch.no_grad():\n",
663
- " outputs = model.get_image_features(**inputs)\n",
664
  "\n",
665
- "# Image embeddings (shape: [batch_size, embedding_dim])\n",
666
- "image_embeddings = outputs.squeeze(0) # drop batch_size if it equals 1\n",
667
- "print(\"Shape of image embeddings:\", image_embeddings.shape)"
668
  ]
669
  },
670
  {
671
  "cell_type": "code",
672
- "execution_count": null,
673
  "id": "e7b5d910-de0e-4f41-8d4e-7e4501aa33f4",
674
  "metadata": {},
675
  "outputs": [],
676
  "source": []
677
  }
 
292
  },
293
  {
294
  "cell_type": "code",
295
+ "execution_count": 10,
296
  "id": "5d85c3c7-3b5b-40c8-bc2b-6b4d344287d9",
297
  "metadata": {},
298
  "outputs": [
 
300
  "name": "stdout",
301
  "output_type": "stream",
302
  "text": [
303
+ "torch.Size([1, 1152])\n"
 
304
  ]
305
  }
306
  ],
 
327
  "\n",
328
  "# Get the image embeddings\n",
329
  "with torch.inference_mode():\n",
330
+ " image_embeddings = model.encode_images(img, normalize=False)\n",
331
  "\n",
332
  "# Print the embeddings\n",
333
+ "print(image_embeddings.shape)"
334
  ]
335
  },
336
  {
 
616
  },
617
  {
618
  "cell_type": "code",
619
+ "execution_count": 6,
620
  "id": "6fc2606b-cf1c-488a-a8fc-d98a4abcc8c0",
621
  "metadata": {},
622
  "outputs": [
623
  {
624
+ "name": "stderr",
625
+ "output_type": "stream",
626
+ "text": [
627
+ "/tmp/ipykernel_19418/3674156061.py:18: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
628
+ " with torch.no_grad(), torch.cuda.amp.autocast():\n"
629
+ ]
630
+ },
631
+ {
632
+ "name": "stdout",
633
+ "output_type": "stream",
634
+ "text": [
635
+ "torch.Size([1, 1152])\n",
636
+ "tensor([[-0.0550, 0.1304, 0.1885, ..., -0.1434, -0.4676, 0.1461]])\n",
637
+ "Label probabilities: [('a dog', 0.0), ('a cat', 0.0), ('a donut', 0.0), ('a beignet', 0.517)]\n"
638
+ ]
639
+ },
640
+ {
641
+ "name": "stderr",
642
+ "output_type": "stream",
643
+ "text": [
644
+ "/tmp/ipykernel_19418/3674156061.py:31: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
645
+ " with torch.no_grad(), torch.cuda.amp.autocast():\n"
646
+ ]
647
+ },
648
+ {
649
+ "name": "stdout",
650
+ "output_type": "stream",
651
+ "text": [
652
+ "All patches shape: torch.Size([1, 1152])\n"
653
+ ]
654
  }
655
  ],
656
  "source": [
 
 
 
657
  "import torch\n",
658
+ "import torch.nn.functional as F\n",
659
+ "from urllib.request import urlopen\n",
660
+ "from PIL import Image\n",
 
 
661
  "from open_clip import create_model_from_pretrained, get_tokenizer # works on open-clip-torch>=2.23.0, timm>=0.9.8\n",
662
  "\n",
663
+ "model, preprocess = create_model_from_pretrained('hf-hub:timm/ViT-SO400M-14-SigLIP-384')\n",
664
+ "tokenizer = get_tokenizer('hf-hub:timm/ViT-SO400M-14-SigLIP-384')\n",
665
  "\n",
666
+ "image = Image.open(urlopen(\n",
667
+ " 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png'\n",
668
+ "))\n",
669
+ "image = preprocess(image).unsqueeze(0)\n",
670
  "\n",
671
+ "labels_list = [\"a dog\", \"a cat\", \"a donut\", \"a beignet\"]\n",
672
+ "text = tokenizer(labels_list, context_length=model.context_length)\n",
 
673
  "\n",
674
+ "with torch.no_grad(), torch.cuda.amp.autocast():\n",
675
+ " image_features = model.encode_image(image)\n",
676
+ " print(image_features.shape)\n",
677
+ " print(image_features)\n",
678
+ " text_features = model.encode_text(text)\n",
679
+ " image_features = F.normalize(image_features, dim=-1)\n",
680
+ " text_features = F.normalize(text_features, dim=-1)\n",
681
  "\n",
682
+ " text_probs = torch.sigmoid(image_features @ text_features.T * model.logit_scale.exp() + model.logit_bias)\n",
 
 
683
  "\n",
684
+ "zipped_list = list(zip(labels_list, [round(p.item(), 3) for p in text_probs[0]]))\n",
685
+ "print(\"Label probabilities: \", zipped_list)\n",
686
+ "\n",
687
+ "with torch.no_grad(), torch.cuda.amp.autocast():\n",
688
+ " # Get the hidden states of all patches (note: model.visual() actually returns the pooled embedding)\n",
689
+ " outputs = model.visual(image) # pooled image embedding: [batch_size, hidden_dim]\n",
690
+ " print(\"All patches shape:\", outputs.shape) # prints torch.Size([1, 1152]), see the output above\n",
691
+ " #all_patch_embeddings = outputs[:, 1:, :] # ignore the [CLS] token\n",
692
+ " #print(\"All patches shape:\", all_patch_embeddings.shape) # e.g. [1, 256, 1152]\n"
693
  ]
694
  },
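A side note on the `FutureWarning` captured in the cell outputs above: `torch.cuda.amp.autocast()` is deprecated, and the replacement named in the warning is a drop-in change.

```py
# Non-deprecated autocast spelling (drop-in for torch.cuda.amp.autocast() above).
with torch.no_grad(), torch.amp.autocast("cuda"):
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
```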
695
  {
696
  "cell_type": "code",
697
+ "execution_count": 7,
698
  "id": "e7b5d910-de0e-4f41-8d4e-7e4501aa33f4",
699
  "metadata": {},
700
+ "outputs": [
701
+ {
702
+ "name": "stderr",
703
+ "output_type": "stream",
704
+ "text": [
705
+ "/tmp/ipykernel_19418/2526917774.py:1: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
706
+ " with torch.no_grad(), torch.cuda.amp.autocast():\n"
707
+ ]
708
+ },
709
+ {
710
+ "ename": "AttributeError",
711
+ "evalue": "'TimmModel' object has no attribute 'patch_embed'",
712
+ "output_type": "error",
713
+ "traceback": [
714
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
715
+ "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
716
+ "Cell \u001b[0;32mIn[7], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad(), torch\u001b[38;5;241m.\u001b[39mcuda\u001b[38;5;241m.\u001b[39mamp\u001b[38;5;241m.\u001b[39mautocast():\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m# Извлекаем патчи и позиционные эмбеддинги\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m x \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvisual\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpatch_embed\u001b[49m(image) \u001b[38;5;66;03m# [1, num_patches, 1152]\u001b[39;00m\n\u001b[1;32m 4\u001b[0m x \u001b[38;5;241m=\u001b[39m model\u001b[38;5;241m.\u001b[39mvisual\u001b[38;5;241m.\u001b[39mpos_drop(x \u001b[38;5;241m+\u001b[39m model\u001b[38;5;241m.\u001b[39mvisual\u001b[38;5;241m.\u001b[39mpos_embed)\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m# Проход через трансформерные блоки\u001b[39;00m\n",
717
+ "File \u001b[0;32m~/.local/lib/python3.11/site-packages/torch/nn/modules/module.py:1931\u001b[0m, in \u001b[0;36mModule.__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 1929\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m modules:\n\u001b[1;32m 1930\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m modules[name]\n\u001b[0;32m-> 1931\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mAttributeError\u001b[39;00m(\n\u001b[1;32m 1932\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(\u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m object has no attribute \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1933\u001b[0m )\n",
718
+ "\u001b[0;31mAttributeError\u001b[0m: 'TimmModel' object has no attribute 'patch_embed'"
719
+ ]
720
+ }
721
+ ],
722
+ "source": [
723
+ "with torch.no_grad(), torch.cuda.amp.autocast():\n",
724
+ " # Extract the patches and positional embeddings\n",
725
+ " x = model.visual.patch_embed(image) # [1, num_patches, 1152]\n",
726
+ " x = model.visual.pos_drop(x + model.visual.pos_embed)\n",
727
+ " \n",
728
+ " # Pass through the transformer blocks\n",
729
+ " for blk in model.visual.blocks:\n",
730
+ " x = blk(x)\n",
731
+ " \n",
732
+ " # Apply LayerNorm (if present)\n",
733
+ " if hasattr(model.visual, \"norm\"):\n",
734
+ " x = model.visual.norm(x)\n",
735
+ " \n",
736
+ " # Now x contains all the patches\n",
737
+ " print(\"All patches shape:\", x.shape)\n",
738
+ " # Example output: torch.Size([1, 756, 1152])"
739
+ ]
740
+ },
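The `AttributeError` in this cell comes from the fact that open_clip wraps the timm ViT in a `TimmModel`, so modules such as `patch_embed` live on the backbone at `model.visual.trunk` rather than on `model.visual` itself. Below is a minimal sketch of getting per-patch features through that backbone, under the assumption that the underlying timm ViT exposes `forward_features`; the exact patch count depends on the model's grid.

```py
# Sketch: per-patch SigLIP features via the timm backbone that open_clip wraps.
with torch.no_grad(), torch.amp.autocast("cuda"):
    trunk = model.visual.trunk                 # underlying timm ViT-SO400M-14-SigLIP-384
    tokens = trunk.forward_features(image)     # [1, num_patches, 1152]; SigLIP ViTs have no [CLS] token
    print("All patches shape:", tokens.shape)
```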
741
+ {
742
+ "cell_type": "code",
743
+ "execution_count": null,
744
+ "id": "29ecd610-7121-4c39-80cf-5021b80f6431",
745
+ "metadata": {},
746
  "outputs": [],
747
  "source": []
748
  }
low.png ADDED
promo.png CHANGED

Git LFS Details

  • SHA256: e504ece8b1e057d831a79edb019c5a554a9231004a23b727c37e9f8938f94cd2
  • Pointer size: 132 Bytes
  • Size of remote file: 2.22 MB

Git LFS Details

  • SHA256: e1818acd3fe47e95196093b8228bacf49401c75bf9c0cf340b4045c0f4a5cc14
  • Pointer size: 132 Bytes
  • Size of remote file: 9.52 MB
test.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
transformer/diffusion_pytorch_model.fp16.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:16780a90997af6f2b640ac53e5e69a8a6273c49552d9d84aff4cb3d9a4006409
3
  size 3203093344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58fcdcbbb4ebdb16298ce420ee5277b89d52be4d0432ad93eea8ddbce4b3cf86
3
  size 3203093344
waifu.png CHANGED