promo
- 1.png +0 -0
- README.md +13 -2
- Untitled.ipynb +110 -39
- low.png +0 -0
- promo.png +2 -2
- test.ipynb +0 -0
- transformer/diffusion_pytorch_model.fp16.safetensors +1 -1
- waifu.png +0 -0
1.png
CHANGED
README.md
CHANGED
@@ -16,12 +16,22 @@ waifu is a free text-to-image model that can efficiently generate images in 80 languages
 (2) [**Linear DiT**](https://github.com/NVlabs/Sana): we use a 1.6B DiT transformer with linear attention. \
 (3) [**MEXMA-SigLIP**](https://huggingface.co/visheratin/mexma-siglip): MEXMA-SigLIP is a model that combines the [MEXMA](https://huggingface.co/facebook/MEXMA) multilingual text encoder and an image encoder from the [SigLIP](https://huggingface.co/timm/ViT-SO400M-14-SigLIP-384) model. This allows us to get a high-performance CLIP model for 80 languages. \
 (4) Other: we use the Flow-Euler sampler, the Adafactor-Fused optimizer, and bf16 precision for training, and combine efficient caption labeling (MoonDream, CogVLM, human, GPT) with danbooru tags to accelerate convergence.
 
+## Pros
+- Small model that can be trained on a common GPU; fast training process.
+- Supports multiple languages and demonstrates good prompt adherence.
+- Utilizes the best 16-channel VAE (Variational Autoencoder).
+
+## Cons
+- Trained on only 2 million images (low-budget model, approximately $3,000).
+- Training dataset consists primarily of anime and illustrations (only about 1% realistic images).
+- Low resolution only for now (512px).
 
 ## Example
 
 ```py
-# install diffusers from source
+# First, install the latest diffusers from source
 pip install git+https://github.com/huggingface/diffusers
 ```
 
@@ -60,7 +70,8 @@ for img in image:
 
 ## Donations
 
-We are a small GPU poor group of enthusiasts (current train budget ~$
+We are a small, GPU-poor group of enthusiasts (current training budget: ~$3k).
+
 
 Please contact us if you can provide GPUs for training.
 
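The Example hunk above shows only the install line; the actual generation code (the `for img in image:` loop named in the second hunk header) lies outside this diff. For orientation, here is a rough sketch of how such a checkpoint is typically driven from diffusers — the repo id, prompt, and generation settings are placeholders I introduced, not values taken from this commit, and the README's own example should be preferred:

```py
# Rough sketch, not from this commit: repo id, prompt, and settings are placeholders.
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "AiArtLab/waifu",            # hypothetical repo id
    torch_dtype=torch.bfloat16,  # the README mentions bf16 training
    trust_remote_code=True,      # in case the repo ships a custom pipeline
)
pipe.to("cuda")

image = pipe(
    prompt="1girl, cherry blossoms, watercolor",
    width=512,                   # the README notes 512px is the current resolution
    height=512,
    num_inference_steps=20,
).images

for img in image:                # same loop shape as the README example
    img.save("sample.png")
```

Loading through the generic `DiffusionPipeline` entry point (with `trust_remote_code=True`) avoids guessing the exact pipeline class, since this model pairs a Sana-style Linear DiT with a MEXMA-SigLIP text encoder.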
Untitled.ipynb
CHANGED
@@ -292,7 +292,7 @@
 },
 {
 "cell_type": "code",
-"execution_count":
+"execution_count": 10,
 "id": "5d85c3c7-3b5b-40c8-bc2b-6b4d344287d9",
 "metadata": {},
 "outputs": [
@@ -300,8 +300,7 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"
-" device='cuda:0', dtype=torch.bfloat16)\n"
+"torch.Size([1, 1152])\n"
 ]
 }
 ],
@@ -328,10 +327,10 @@
 "\n",
 "# Get the image embeddings\n",
 "with torch.inference_mode():\n",
-"    image_embeddings = model.encode_images(img, normalize=
+"    image_embeddings = model.encode_images(img, normalize=False)\n",
 "\n",
 "# Print the embeddings\n",
-"print(image_embeddings)"
+"print(image_embeddings.shape)"
 ]
 },
 {
@@ -617,61 +616,133 @@
 },
 {
 "cell_type": "code",
-"execution_count":
+"execution_count": 6,
 "id": "6fc2606b-cf1c-488a-a8fc-d98a4abcc8c0",
 "metadata": {},
 "outputs": [
 {
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"/tmp/ipykernel_19418/3674156061.py:18: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
+"  with torch.no_grad(), torch.cuda.amp.autocast():\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"torch.Size([1, 1152])\n",
+"tensor([[-0.0550, 0.1304, 0.1885, ..., -0.1434, -0.4676, 0.1461]])\n",
+"Label probabilities: [('a dog', 0.0), ('a cat', 0.0), ('a donut', 0.0), ('a beignet', 0.517)]\n"
+]
+},
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"/tmp/ipykernel_19418/3674156061.py:31: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
+"  with torch.no_grad(), torch.cuda.amp.autocast():\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"All patches shape: torch.Size([1, 1152])\n"
+]
 }
 ],
 "source": [
-"from transformers import AutoProcessor, AutoModel\n",
-"from PIL import Image\n",
-"import requests\n",
 "import torch\n",
-"\n",
-"#processor = AutoProcessor.from_pretrained(\"timm/ViT-SO400M-14-SigLIP-384\")\n",
-"\n",
+"import torch.nn.functional as F\n",
+"from urllib.request import urlopen\n",
+"from PIL import Image\n",
 "from open_clip import create_model_from_pretrained, get_tokenizer # works on open-clip-torch>=2.23.0, timm>=0.9.8\n",
 "\n",
-"model,
+"model, preprocess = create_model_from_pretrained('hf-hub:timm/ViT-SO400M-14-SigLIP-384')\n",
+"tokenizer = get_tokenizer('hf-hub:timm/ViT-SO400M-14-SigLIP-384')\n",
 "\n",
+"image = Image.open(urlopen(\n",
+"    'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png'\n",
+"))\n",
+"image = preprocess(image).unsqueeze(0)\n",
 "\n",
-"image = Image.open(requests.get(url, stream=True).raw)\n",
+"labels_list = [\"a dog\", \"a cat\", \"a donut\", \"a beignet\"]\n",
+"text = tokenizer(labels_list, context_length=model.context_length)\n",
 "\n",
+"with torch.no_grad(), torch.cuda.amp.autocast():\n",
+"    image_features = model.encode_image(image)\n",
+"    print(image_features.shape)\n",
+"    print(image_features)\n",
+"    text_features = model.encode_text(text)\n",
+"    image_features = F.normalize(image_features, dim=-1)\n",
+"    text_features = F.normalize(text_features, dim=-1)\n",
 "\n",
-"with torch.no_grad():\n",
-"    outputs = model.get_image_features(**inputs)\n",
+"    text_probs = torch.sigmoid(image_features @ text_features.T * model.logit_scale.exp() + model.logit_bias)\n",
 "\n",
+"zipped_list = list(zip(labels_list, [round(p.item(), 3) for p in text_probs[0]]))\n",
+"print(\"Label probabilities: \", zipped_list)\n",
+"\n",
+"with torch.no_grad(), torch.cuda.amp.autocast():\n",
+"    # Get the hidden states of all patches\n",
+"    outputs = model.visual(image) # [batch_size, num_patches + 1, hidden_dim]\n",
+"    print(\"All patches shape:\", outputs.shape) # Example: [1, 256, 1152]\n",
+"    #all_patch_embeddings = outputs[:, 1:, :] # Ignore the [CLS] token\n",
+"    #print(\"All patches shape:\", all_patch_embeddings.shape) # Example: [1, 256, 1152]\n"
 ]
 },
 {
 "cell_type": "code",
-"execution_count":
+"execution_count": 7,
 "id": "e7b5d910-de0e-4f41-8d4e-7e4501aa33f4",
 "metadata": {},
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"/tmp/ipykernel_19418/2526917774.py:1: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
+"  with torch.no_grad(), torch.cuda.amp.autocast():\n"
+]
+},
+{
+"ename": "AttributeError",
+"evalue": "'TimmModel' object has no attribute 'patch_embed'",
+"output_type": "error",
+"traceback": [
+"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
+"Cell \u001b[0;32mIn[7], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad(), torch\u001b[38;5;241m.\u001b[39mcuda\u001b[38;5;241m.\u001b[39mamp\u001b[38;5;241m.\u001b[39mautocast():\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m# Извлекаем патчи и позиционные эмбеддинги\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m x \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvisual\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpatch_embed\u001b[49m(image) \u001b[38;5;66;03m# [1, num_patches, 1152]\u001b[39;00m\n\u001b[1;32m 4\u001b[0m x \u001b[38;5;241m=\u001b[39m model\u001b[38;5;241m.\u001b[39mvisual\u001b[38;5;241m.\u001b[39mpos_drop(x \u001b[38;5;241m+\u001b[39m model\u001b[38;5;241m.\u001b[39mvisual\u001b[38;5;241m.\u001b[39mpos_embed)\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m# Проход через трансформерные блоки\u001b[39;00m\n",
+"File \u001b[0;32m~/.local/lib/python3.11/site-packages/torch/nn/modules/module.py:1931\u001b[0m, in \u001b[0;36mModule.__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 1929\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m modules:\n\u001b[1;32m 1930\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m modules[name]\n\u001b[0;32m-> 1931\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mAttributeError\u001b[39;00m(\n\u001b[1;32m 1932\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(\u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m object has no attribute \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1933\u001b[0m )\n",
+"\u001b[0;31mAttributeError\u001b[0m: 'TimmModel' object has no attribute 'patch_embed'"
+]
+}
+],
+"source": [
+"with torch.no_grad(), torch.cuda.amp.autocast():\n",
+"    # Extract the patch and positional embeddings\n",
+"    x = model.visual.patch_embed(image) # [1, num_patches, 1152]\n",
+"    x = model.visual.pos_drop(x + model.visual.pos_embed)\n",
+"    \n",
+"    # Pass through the transformer blocks\n",
+"    for blk in model.visual.blocks:\n",
+"        x = blk(x)\n",
+"    \n",
+"    # Apply LayerNorm (if present)\n",
+"    if hasattr(model.visual, \"norm\"):\n",
+"        x = model.visual.norm(x)\n",
+"    \n",
+"    # Now x contains all the patches\n",
+"    print(\"All patches shape:\", x.shape)\n",
+"    # Example output: torch.Size([1, 756, 1152])"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "29ecd610-7121-4c39-80cf-5021b80f6431",
+"metadata": {},
 "outputs": [],
 "source": []
 }
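The last executed cell above fails with `AttributeError: 'TimmModel' object has no attribute 'patch_embed'` because open_clip wraps the timm backbone: `model.visual` is a `TimmModel`, and the actual `VisionTransformer` (with `patch_embed`, `blocks`, `norm`) sits one level deeper. Below is a minimal sketch of the same patch-token extraction routed through the timm trunk — the `.trunk` attribute and the `forward_features` call are my assumptions about open_clip/timm internals, not something this notebook verifies:

```py
# Minimal sketch, assuming open_clip's SigLIP vision tower is a TimmModel
# that keeps the underlying timm VisionTransformer in `.trunk`.
import torch
from urllib.request import urlopen
from PIL import Image
from open_clip import create_model_from_pretrained

model, preprocess = create_model_from_pretrained('hf-hub:timm/ViT-SO400M-14-SigLIP-384')

image = Image.open(urlopen(
    'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png'
))
image = preprocess(image).unsqueeze(0)

vit = model.visual.trunk                    # assumed: the timm VisionTransformer
with torch.no_grad():
    tokens = vit.forward_features(image)    # per-patch tokens before pooling
print("All patches shape:", tokens.shape)   # e.g. [1, num_patches, 1152]
```

If this holds, there is no need to re-implement the block loop by hand: `forward_features` already runs `patch_embed`, the transformer blocks, and the final norm, and SigLIP ViTs use attention pooling rather than a [CLS] token, so every returned token should be a patch.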
low.png
ADDED
promo.png
CHANGED
test.ipynb
CHANGED
The diff for this file is too large to render.
transformer/diffusion_pytorch_model.fp16.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:58fcdcbbb4ebdb16298ce420ee5277b89d52be4d0432ad93eea8ddbce4b3cf86
 size 3203093344
waifu.png
CHANGED