recoilme committed on
Commit
0c1ffd6
·
1 Parent(s): 5a2f01c
1.png CHANGED
README.md CHANGED
@@ -16,12 +16,22 @@ waifu is a free text-to-image model that can efficiently generate images in 80 l
16
  (2) [**Linear DiT**](https://github.com/NVlabs/Sana): we use a 1.6B DiT transformer with linear attention. \
17
  (3) [**MEXMA-SigLIP**](https://huggingface.co/visheratin/mexma-siglip): MEXMA-SigLIP is a model that combines the [MEXMA](https://huggingface.co/facebook/MEXMA) multilingual text encoder and an image encoder from the [SigLIP](https://huggingface.co/timm/ViT-SO400M-14-SigLIP-384) model. This gives us a high-performance CLIP model for 80 languages. \
18
  (4) Other: we use the Flow-Euler sampler, the Adafactor-Fused optimizer and bf16 precision for training, and combine efficient caption labeling (MoonDream, CogVLM, human, GPT) with Danbooru tags to accelerate convergence.
20
 
21
  ## Example
22
 
23
  ```bash
24
- # install diffusers from source
25
  pip install git+https://github.com/huggingface/diffusers
26
  ```
27
 
@@ -60,7 +70,8 @@ for img in image:
60
 
61
  ## Donations
62
 
63
- We are a small GPU poor group of enthusiasts (current train budget ~$2k)
 
64
 
65
  Please contact us if you can provide GPUs for training.
66
 
 
16
  (2) [**Linear DiT**](https://github.com/NVlabs/Sana): we use a 1.6B DiT transformer with linear attention. \
17
  (3) [**MEXMA-SigLIP**](https://huggingface.co/visheratin/mexma-siglip): MEXMA-SigLIP is a model that combines the [MEXMA](https://huggingface.co/facebook/MEXMA) multilingual text encoder and an image encoder from the [SigLIP](https://huggingface.co/timm/ViT-SO400M-14-SigLIP-384) model. This gives us a high-performance CLIP model for 80 languages (a short usage sketch follows this list). \
18
  (4) Other: we use the Flow-Euler sampler, the Adafactor-Fused optimizer and bf16 precision for training, and combine efficient caption labeling (MoonDream, CogVLM, human, GPT) with Danbooru tags to accelerate convergence.
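For reference, here is a minimal sketch of the image-embedding side of MEXMA-SigLIP, the multilingual CLIP from point (3). The `encode_images` call mirrors the one used in `Untitled.ipynb` in this repo, and loading through `AutoModel` with `trust_remote_code` follows the MEXMA-SigLIP model card; the `AutoImageProcessor` preprocessing here is an assumption, so check the model card for the exact API.

```py
# Rough sketch: image embeddings from MEXMA-SigLIP (multilingual CLIP).
# `encode_images` mirrors Untitled.ipynb; AutoImageProcessor is an assumption here.
import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModel

model = AutoModel.from_pretrained(
    "visheratin/mexma-siglip", torch_dtype=torch.bfloat16, trust_remote_code=True
).to("cuda").eval()
processor = AutoImageProcessor.from_pretrained("visheratin/mexma-siglip")

pixels = processor(images=Image.open("1.png"), return_tensors="pt")["pixel_values"]
pixels = pixels.to("cuda", dtype=torch.bfloat16)

with torch.inference_mode():
    image_embeddings = model.encode_images(pixels, normalize=True)  # [1, 1152]
print(image_embeddings.shape)

# Prompts in any of the 80 supported languages go through the MEXMA text tower
# analogously; see the model card for the text-encoding helper.
```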
20
 
21
+ ## Pros
22
+ - Small model that can be trained on a consumer GPU; training is fast.
23
+ - Supports multiple languages and demonstrates good prompt adherence.
24
+ - Utilizes the best 16-channel VAE (Variational Autoencoder).
25
+
26
+ ## Cons
27
+ - Trained on only 2 million images (low-budget model, approximately $3,000).
28
+ - Training dataset consists primarily of anime and illustrations (only about 1% realistic images).
29
+ - Low resolution only for now (512px).
30
 
31
  ## Example
32
 
33
  ```bash
34
+ # First, install the latest diffusers from source
35
  pip install git+https://github.com/huggingface/diffusers
36
  ```
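The full generation example continues below in this README (the `for img in image:` hunk saves the outputs). For orientation only, here is a rough sketch of the usual diffusers flow; it assumes the checkpoint loads through `SanaPipeline`, and the repo id below is a placeholder.

```py
# Sketch only; the README's own example may differ.
# Assumptions: the checkpoint loads via diffusers' SanaPipeline,
# and "AiArtLab/waifu" is a placeholder repo id.
import torch
from diffusers import SanaPipeline

pipe = SanaPipeline.from_pretrained("AiArtLab/waifu", torch_dtype=torch.bfloat16)
pipe.to("cuda")

image = pipe(
    prompt="a cute anime girl with silver hair, watercolor style",
    height=512,                # the model is low-res (512px) for now
    width=512,
    guidance_scale=4.5,
    num_inference_steps=20,
).images                       # a list of PIL images

for i, img in enumerate(image):
    img.save(f"waifu_{i}.png")
```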
37
 
 
70
 
71
  ## Donations
72
 
73
+ We are a small, GPU-poor group of enthusiasts (current training budget: ~$3k).
74
+ ![image](./low.png)
75
 
76
  Please contact us if you can provide GPUs for training.
77
 
Untitled.ipynb CHANGED
@@ -292,7 +292,7 @@
292
  },
293
  {
294
  "cell_type": "code",
295
- "execution_count": 6,
296
  "id": "5d85c3c7-3b5b-40c8-bc2b-6b4d344287d9",
297
  "metadata": {},
298
  "outputs": [
@@ -300,8 +300,7 @@
300
  "name": "stdout",
301
  "output_type": "stream",
302
  "text": [
303
- "tensor([[-0.0287, -0.0082, 0.0444, ..., -0.0011, 0.0306, 0.0251]],\n",
304
- " device='cuda:0', dtype=torch.bfloat16)\n"
305
  ]
306
  }
307
  ],
@@ -328,10 +327,10 @@
328
  "\n",
329
  "# Get the image embeddings\n",
330
  "with torch.inference_mode():\n",
331
- " image_embeddings = model.encode_images(img, normalize=True)\n",
332
  "\n",
333
  "# Print the embeddings\n",
334
- "print(image_embeddings)"
335
  ]
336
  },
337
  {
@@ -617,61 +616,133 @@
617
  },
618
  {
619
  "cell_type": "code",
620
- "execution_count": null,
621
  "id": "6fc2606b-cf1c-488a-a8fc-d98a4abcc8c0",
622
  "metadata": {},
623
  "outputs": [
624
  {
625
- "data": {
626
- "application/vnd.jupyter.widget-view+json": {
627
- "model_id": "9d4f5c2ccfbe4b2382b4943a7cfc51b3",
628
- "version_major": 2,
629
- "version_minor": 0
630
- },
631
- "text/plain": [
632
- "open_clip_model.safetensors: 0%| | 0.00/3.51G [00:00<?, ?B/s]"
633
- ]
634
- },
635
- "metadata": {},
636
- "output_type": "display_data"
637
  }
638
  ],
639
  "source": [
640
- "from transformers import AutoProcessor, AutoModel\n",
641
- "from PIL import Image\n",
642
- "import requests\n",
643
  "import torch\n",
644
- "\n",
645
- "# Load the model and processor\n",
646
- "#model = AutoModel.from_pretrained(\"timm/ViT-SO400M-14-SigLIP-384\")\n",
647
- "#processor = AutoProcessor.from_pretrained(\"timm/ViT-SO400M-14-SigLIP-384\")\n",
648
- "\n",
649
  "from open_clip import create_model_from_pretrained, get_tokenizer # works on open-clip-torch>=2.23.0, timm>=0.9.8\n",
650
  "\n",
651
- "model, processor = create_model_from_pretrained('hf-hub:timm/ViT-SO400M-14-SigLIP-384')\n",
 
652
  "\n",
653
  "\n",
654
- "# Load the image\n",
655
- "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n",
656
- "image = Image.open(requests.get(url, stream=True).raw)\n",
657
  "\n",
658
- "# Process the image\n",
659
- "inputs = processor(images=image, return_tensors=\"pt\")\n",
660
  "\n",
661
- "# Extract the embeddings\n",
662
- "with torch.no_grad():\n",
663
- " outputs = model.get_image_features(**inputs)\n",
664
  "\n",
665
- "# Image embeddings (shape: [batch_size, embedding_dim])\n",
666
- "image_embeddings = outputs.squeeze(0) # drop batch_size if it equals 1\n",
667
- "print(\"Shape of image embeddings:\", image_embeddings.shape)"
668
  ]
669
  },
670
  {
671
  "cell_type": "code",
672
- "execution_count": null,
673
  "id": "e7b5d910-de0e-4f41-8d4e-7e4501aa33f4",
674
  "metadata": {},
675
  "outputs": [],
676
  "source": []
677
  }
 
292
  },
293
  {
294
  "cell_type": "code",
295
+ "execution_count": 10,
296
  "id": "5d85c3c7-3b5b-40c8-bc2b-6b4d344287d9",
297
  "metadata": {},
298
  "outputs": [
 
300
  "name": "stdout",
301
  "output_type": "stream",
302
  "text": [
303
+ "torch.Size([1, 1152])\n"
 
304
  ]
305
  }
306
  ],
 
327
  "\n",
328
  "# Get the image embeddings\n",
329
  "with torch.inference_mode():\n",
330
+ " image_embeddings = model.encode_images(img, normalize=False)\n",
331
  "\n",
332
  "# Print the embeddings\n",
333
+ "print(image_embeddings.shape)"
334
  ]
335
  },
336
  {
 
616
  },
617
  {
618
  "cell_type": "code",
619
+ "execution_count": 6,
620
  "id": "6fc2606b-cf1c-488a-a8fc-d98a4abcc8c0",
621
  "metadata": {},
622
  "outputs": [
623
  {
624
+ "name": "stderr",
625
+ "output_type": "stream",
626
+ "text": [
627
+ "/tmp/ipykernel_19418/3674156061.py:18: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
628
+ " with torch.no_grad(), torch.cuda.amp.autocast():\n"
629
+ ]
630
+ },
631
+ {
632
+ "name": "stdout",
633
+ "output_type": "stream",
634
+ "text": [
635
+ "torch.Size([1, 1152])\n",
636
+ "tensor([[-0.0550, 0.1304, 0.1885, ..., -0.1434, -0.4676, 0.1461]])\n",
637
+ "Label probabilities: [('a dog', 0.0), ('a cat', 0.0), ('a donut', 0.0), ('a beignet', 0.517)]\n"
638
+ ]
639
+ },
640
+ {
641
+ "name": "stderr",
642
+ "output_type": "stream",
643
+ "text": [
644
+ "/tmp/ipykernel_19418/3674156061.py:31: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
645
+ " with torch.no_grad(), torch.cuda.amp.autocast():\n"
646
+ ]
647
+ },
648
+ {
649
+ "name": "stdout",
650
+ "output_type": "stream",
651
+ "text": [
652
+ "All patches shape: torch.Size([1, 1152])\n"
653
+ ]
654
  }
655
  ],
656
  "source": [
 
 
 
657
  "import torch\n",
658
+ "import torch.nn.functional as F\n",
659
+ "from urllib.request import urlopen\n",
660
+ "from PIL import Image\n",
 
 
661
  "from open_clip import create_model_from_pretrained, get_tokenizer # works on open-clip-torch>=2.23.0, timm>=0.9.8\n",
662
  "\n",
663
+ "model, preprocess = create_model_from_pretrained('hf-hub:timm/ViT-SO400M-14-SigLIP-384')\n",
664
+ "tokenizer = get_tokenizer('hf-hub:timm/ViT-SO400M-14-SigLIP-384')\n",
665
  "\n",
666
+ "image = Image.open(urlopen(\n",
667
+ " 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png'\n",
668
+ "))\n",
669
+ "image = preprocess(image).unsqueeze(0)\n",
670
  "\n",
671
+ "labels_list = [\"a dog\", \"a cat\", \"a donut\", \"a beignet\"]\n",
672
+ "text = tokenizer(labels_list, context_length=model.context_length)\n",
 
673
  "\n",
674
+ "with torch.no_grad(), torch.cuda.amp.autocast():\n",
675
+ " image_features = model.encode_image(image)\n",
676
+ " print(image_features.shape)\n",
677
+ " print(image_features)\n",
678
+ " text_features = model.encode_text(text)\n",
679
+ " image_features = F.normalize(image_features, dim=-1)\n",
680
+ " text_features = F.normalize(text_features, dim=-1)\n",
681
  "\n",
682
+ " text_probs = torch.sigmoid(image_features @ text_features.T * model.logit_scale.exp() + model.logit_bias)\n",
 
 
683
  "\n",
684
+ "zipped_list = list(zip(labels_list, [round(p.item(), 3) for p in text_probs[0]]))\n",
685
+ "print(\"Label probabilities: \", zipped_list)\n",
686
+ "\n",
687
+ "with torch.no_grad(), torch.cuda.amp.autocast():\n",
688
+ " # Get the hidden states of all patches (note: model.visual() actually returns the pooled embedding)\n",
689
+ " outputs = model.visual(image) # pooled image embedding: [batch_size, hidden_dim]\n",
690
+ " print(\"All patches shape:\", outputs.shape) # prints torch.Size([1, 1152]), see the output above\n",
691
+ " #all_patch_embeddings = outputs[:, 1:, :] # ignore the [CLS] token\n",
692
+ " #print(\"All patches shape:\", all_patch_embeddings.shape) # e.g. [1, 256, 1152]\n"
693
  ]
694
  },
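A side note on the `FutureWarning` captured in the cell outputs above: `torch.cuda.amp.autocast()` is deprecated, and the replacement named in the warning is a drop-in change.

```py
# Non-deprecated autocast spelling (drop-in for torch.cuda.amp.autocast() above).
with torch.no_grad(), torch.amp.autocast("cuda"):
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
```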
695
  {
696
  "cell_type": "code",
697
+ "execution_count": 7,
698
  "id": "e7b5d910-de0e-4f41-8d4e-7e4501aa33f4",
699
  "metadata": {},
700
+ "outputs": [
701
+ {
702
+ "name": "stderr",
703
+ "output_type": "stream",
704
+ "text": [
705
+ "/tmp/ipykernel_19418/2526917774.py:1: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
706
+ " with torch.no_grad(), torch.cuda.amp.autocast():\n"
707
+ ]
708
+ },
709
+ {
710
+ "ename": "AttributeError",
711
+ "evalue": "'TimmModel' object has no attribute 'patch_embed'",
712
+ "output_type": "error",
713
+ "traceback": [
714
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
715
+ "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
716
+ "Cell \u001b[0;32mIn[7], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad(), torch\u001b[38;5;241m.\u001b[39mcuda\u001b[38;5;241m.\u001b[39mamp\u001b[38;5;241m.\u001b[39mautocast():\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m# Извлекаем патчи и позиционные эмбеддинги\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m x \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvisual\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpatch_embed\u001b[49m(image) \u001b[38;5;66;03m# [1, num_patches, 1152]\u001b[39;00m\n\u001b[1;32m 4\u001b[0m x \u001b[38;5;241m=\u001b[39m model\u001b[38;5;241m.\u001b[39mvisual\u001b[38;5;241m.\u001b[39mpos_drop(x \u001b[38;5;241m+\u001b[39m model\u001b[38;5;241m.\u001b[39mvisual\u001b[38;5;241m.\u001b[39mpos_embed)\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m# Проход через трансформерные блоки\u001b[39;00m\n",
717
+ "File \u001b[0;32m~/.local/lib/python3.11/site-packages/torch/nn/modules/module.py:1931\u001b[0m, in \u001b[0;36mModule.__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 1929\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m modules:\n\u001b[1;32m 1930\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m modules[name]\n\u001b[0;32m-> 1931\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mAttributeError\u001b[39;00m(\n\u001b[1;32m 1932\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(\u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m object has no attribute \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1933\u001b[0m )\n",
718
+ "\u001b[0;31mAttributeError\u001b[0m: 'TimmModel' object has no attribute 'patch_embed'"
719
+ ]
720
+ }
721
+ ],
722
+ "source": [
723
+ "with torch.no_grad(), torch.cuda.amp.autocast():\n",
724
+ " # Extract the patches and positional embeddings\n",
725
+ " x = model.visual.patch_embed(image) # [1, num_patches, 1152]\n",
726
+ " x = model.visual.pos_drop(x + model.visual.pos_embed)\n",
727
+ " \n",
728
+ " # Pass through the transformer blocks\n",
729
+ " for blk in model.visual.blocks:\n",
730
+ " x = blk(x)\n",
731
+ " \n",
732
+ " # Apply LayerNorm (if present)\n",
733
+ " if hasattr(model.visual, \"norm\"):\n",
734
+ " x = model.visual.norm(x)\n",
735
+ " \n",
736
+ " # Now x contains all the patches\n",
737
+ " print(\"All patches shape:\", x.shape)\n",
738
+ " # Example output: torch.Size([1, 756, 1152])"
739
+ ]
740
+ },
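The `AttributeError` in this cell comes from the fact that open_clip wraps the timm ViT in a `TimmModel`, so modules such as `patch_embed` live on the backbone at `model.visual.trunk` rather than on `model.visual` itself. Below is a minimal sketch of getting per-patch features through that backbone, under the assumption that the underlying timm ViT exposes `forward_features`; the exact patch count depends on the model's grid.

```py
# Sketch: per-patch SigLIP features via the timm backbone that open_clip wraps.
with torch.no_grad(), torch.amp.autocast("cuda"):
    trunk = model.visual.trunk                 # underlying timm ViT-SO400M-14-SigLIP-384
    tokens = trunk.forward_features(image)     # [1, num_patches, 1152]; SigLIP ViTs have no [CLS] token
    print("All patches shape:", tokens.shape)
```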
741
+ {
742
+ "cell_type": "code",
743
+ "execution_count": null,
744
+ "id": "29ecd610-7121-4c39-80cf-5021b80f6431",
745
+ "metadata": {},
746
  "outputs": [],
747
  "source": []
748
  }
low.png ADDED
promo.png CHANGED

Git LFS Details

  • SHA256: e504ece8b1e057d831a79edb019c5a554a9231004a23b727c37e9f8938f94cd2
  • Pointer size: 132 Bytes
  • Size of remote file: 2.22 MB

Git LFS Details

  • SHA256: e1818acd3fe47e95196093b8228bacf49401c75bf9c0cf340b4045c0f4a5cc14
  • Pointer size: 132 Bytes
  • Size of remote file: 9.52 MB
test.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
transformer/diffusion_pytorch_model.fp16.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:16780a90997af6f2b640ac53e5e69a8a6273c49552d9d84aff4cb3d9a4006409
3
  size 3203093344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58fcdcbbb4ebdb16298ce420ee5277b89d52be4d0432ad93eea8ddbce4b3cf86
3
  size 3203093344
waifu.png CHANGED