yiren98 committed
Commit abd09b6 · verified · 1 parent: 12ae7b3

Upload 98 files

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
Files changed (50)
  1. .gitattributes +25 -1
  2. images/datasets/claysculpture.png +3 -0
  3. images/datasets/claytoys.png +3 -0
  4. images/datasets/cook.png +3 -0
  5. images/datasets/emoji.png +3 -0
  6. images/datasets/fabrictoys.png +3 -0
  7. images/datasets/icon.png +3 -0
  8. images/datasets/illustration.png +3 -0
  9. images/datasets/inkpainting.png +3 -0
  10. images/datasets/jadecarving.png +3 -0
  11. images/datasets/landscape.png +3 -0
  12. images/datasets/lego.png +3 -0
  13. images/datasets/linedraw.png +3 -0
  14. images/datasets/oilpainting.png +3 -0
  15. images/datasets/painting.png +3 -0
  16. images/datasets/pencilsketch.png +3 -0
  17. images/datasets/portrait.png +3 -0
  18. images/datasets/sandart.png +3 -0
  19. images/datasets/sketch.png +3 -0
  20. images/datasets/transformer.png +3 -0
  21. images/datasets/woodsculpture.png +3 -0
  22. images/datasets/zbrush.png +3 -0
  23. images/i2i.png +3 -0
  24. images/oneshot.png +3 -0
  25. images/t2i.png +3 -0
  26. images/teaser.png +3 -0
  27. library/__init__.py +0 -0
  28. library/adafactor_fused.py +138 -0
  29. library/attention_processors.py +227 -0
  30. library/config_util.py +716 -0
  31. library/custom_offloading_utils.py +227 -0
  32. library/custom_train_functions.py +559 -0
  33. library/deepspeed_utils.py +139 -0
  34. library/device_utils.py +84 -0
  35. library/flux_models.py +1237 -0
  36. library/flux_train_utils.py +582 -0
  37. library/flux_train_utils_recraft.py +659 -0
  38. library/flux_utils.py +472 -0
  39. library/huggingface_util.py +84 -0
  40. library/hypernetwork.py +223 -0
  41. library/ipex/__init__.py +180 -0
  42. library/ipex/attention.py +177 -0
  43. library/ipex/diffusers.py +312 -0
  44. library/ipex/gradscaler.py +183 -0
  45. library/ipex/hijacks.py +313 -0
  46. library/lpw_stable_diffusion.py +1233 -0
  47. library/model_util.py +1356 -0
  48. library/original_unet.py +1919 -0
  49. library/sai_model_spec.py +334 -0
  50. library/sd3_models.py +1413 -0
.gitattributes CHANGED
@@ -43,4 +43,28 @@ asy_results
  recraft_results
  drop
  SplitAsy
- example*
+ example*images/datasets/claysculpture.png filter=lfs diff=lfs merge=lfs -text
+ images/datasets/claytoys.png filter=lfs diff=lfs merge=lfs -text
+ images/datasets/cook.png filter=lfs diff=lfs merge=lfs -text
+ images/datasets/emoji.png filter=lfs diff=lfs merge=lfs -text
+ images/datasets/fabrictoys.png filter=lfs diff=lfs merge=lfs -text
+ images/datasets/icon.png filter=lfs diff=lfs merge=lfs -text
+ images/datasets/illustration.png filter=lfs diff=lfs merge=lfs -text
+ images/datasets/inkpainting.png filter=lfs diff=lfs merge=lfs -text
+ images/datasets/jadecarving.png filter=lfs diff=lfs merge=lfs -text
+ images/datasets/landscape.png filter=lfs diff=lfs merge=lfs -text
+ images/datasets/lego.png filter=lfs diff=lfs merge=lfs -text
+ images/datasets/linedraw.png filter=lfs diff=lfs merge=lfs -text
+ images/datasets/oilpainting.png filter=lfs diff=lfs merge=lfs -text
+ images/datasets/painting.png filter=lfs diff=lfs merge=lfs -text
+ images/datasets/pencilsketch.png filter=lfs diff=lfs merge=lfs -text
+ images/datasets/portrait.png filter=lfs diff=lfs merge=lfs -text
+ images/datasets/sandart.png filter=lfs diff=lfs merge=lfs -text
+ images/datasets/sketch.png filter=lfs diff=lfs merge=lfs -text
+ images/datasets/transformer.png filter=lfs diff=lfs merge=lfs -text
+ images/datasets/woodsculpture.png filter=lfs diff=lfs merge=lfs -text
+ images/datasets/zbrush.png filter=lfs diff=lfs merge=lfs -text
+ images/i2i.png filter=lfs diff=lfs merge=lfs -text
+ images/oneshot.png filter=lfs diff=lfs merge=lfs -text
+ images/t2i.png filter=lfs diff=lfs merge=lfs -text
+ images/teaser.png filter=lfs diff=lfs merge=lfs -text
images/datasets/claysculpture.png ADDED

Git LFS Details

  • SHA256: f566abc3b8197f1bdb491947352fa7984d1ffa99bcc91e43fade76116d3232eb
  • Pointer size: 131 Bytes
  • Size of remote file: 392 kB
images/datasets/claytoys.png ADDED

Git LFS Details

  • SHA256: 86de33d83a349ff4319ec99f4edf7121453e93175fd43e477cdc7a71e03ad5ec
  • Pointer size: 131 Bytes
  • Size of remote file: 805 kB
images/datasets/cook.png ADDED

Git LFS Details

  • SHA256: 4fd01153e9ae01670bb57f48729adf46afcb4463a12f7f685747c73892373eaa
  • Pointer size: 132 Bytes
  • Size of remote file: 1.11 MB
images/datasets/emoji.png ADDED

Git LFS Details

  • SHA256: bc4d1abe5cad6f9f61838473a5449fcc65c361792479a50434b2a77b7d177b76
  • Pointer size: 131 Bytes
  • Size of remote file: 135 kB
images/datasets/fabrictoys.png ADDED

Git LFS Details

  • SHA256: 72274d9a4fdae9b66081e4508618f7c88eaf749a24881574af4190df5c6c3159
  • Pointer size: 131 Bytes
  • Size of remote file: 619 kB
images/datasets/icon.png ADDED

Git LFS Details

  • SHA256: b9886684d28e956d041bdf835f0d6365ea2dbd6298140f10faa8c076ce7215f3
  • Pointer size: 131 Bytes
  • Size of remote file: 158 kB
images/datasets/illustration.png ADDED

Git LFS Details

  • SHA256: 26f14d9ff97810012a809cbd4163c0662768c819b19f9f77622c233cdaaef4e8
  • Pointer size: 131 Bytes
  • Size of remote file: 238 kB
images/datasets/inkpainting.png ADDED

Git LFS Details

  • SHA256: b513e1b03984a127d424ef8e89ce687f2497e030f71efbdfd5501a71509ef961
  • Pointer size: 132 Bytes
  • Size of remote file: 1.23 MB
images/datasets/jadecarving.png ADDED

Git LFS Details

  • SHA256: 53d8976040d3329bed1811568f453ae6095683699d6dd962369220079ef738f9
  • Pointer size: 131 Bytes
  • Size of remote file: 905 kB
images/datasets/landscape.png ADDED

Git LFS Details

  • SHA256: 0972b0332c1e1dc9a7a01681d682625b2ab42c09579550bbaa43271dc5833880
  • Pointer size: 131 Bytes
  • Size of remote file: 209 kB
images/datasets/lego.png ADDED

Git LFS Details

  • SHA256: 63a87beac48dff4e1110e8c40c6bbe8899253bcf784725ea9183940c427a379a
  • Pointer size: 131 Bytes
  • Size of remote file: 557 kB
images/datasets/linedraw.png ADDED

Git LFS Details

  • SHA256: 70aada932b271a04b2a9e2adf78cacc2a5541e19ebeb01a33fdb225a3957fa00
  • Pointer size: 132 Bytes
  • Size of remote file: 1.57 MB
images/datasets/oilpainting.png ADDED

Git LFS Details

  • SHA256: 6316f86c9b8e28d1210574f3b8e041d15548c91ff0838e8ad3502c35fb60db9f
  • Pointer size: 132 Bytes
  • Size of remote file: 1.39 MB
images/datasets/painting.png ADDED

Git LFS Details

  • SHA256: b1a10a776f880c6fd217cec659374d81fc3a7bbccf0a11dd5a2155b7113d83d8
  • Pointer size: 131 Bytes
  • Size of remote file: 680 kB
images/datasets/pencilsketch.png ADDED

Git LFS Details

  • SHA256: 884ac2f80d63aef5b8fa54e8e37d428eb137fe5b37c3695b5d57eb1af79a0b7e
  • Pointer size: 132 Bytes
  • Size of remote file: 1.4 MB
images/datasets/portrait.png ADDED

Git LFS Details

  • SHA256: 6153919f4ed6ca2b18c88f75ffc44d8af37490cf615636931e710c73e01de015
  • Pointer size: 131 Bytes
  • Size of remote file: 606 kB
images/datasets/sandart.png ADDED

Git LFS Details

  • SHA256: 41d064a27b3ad7759784de84d7f2a5a6284b697d63a2e7e6f6aa54fc34c748dc
  • Pointer size: 132 Bytes
  • Size of remote file: 1.49 MB
images/datasets/sketch.png ADDED

Git LFS Details

  • SHA256: 84810c3716f1174ae994544ffc5fa166e0a14aee7d45e23db3fd9a4f31772728
  • Pointer size: 131 Bytes
  • Size of remote file: 490 kB
images/datasets/transformer.png ADDED

Git LFS Details

  • SHA256: af20605f443e31c9ad0ada9ee41a6f06364f77fef21c11756fc64878e217e63f
  • Pointer size: 131 Bytes
  • Size of remote file: 377 kB
images/datasets/woodsculpture.png ADDED

Git LFS Details

  • SHA256: a9b43a434bcc423ca659327665957e38d4855c0701839f0d2222b3f7621fa1d2
  • Pointer size: 132 Bytes
  • Size of remote file: 1.32 MB
images/datasets/zbrush.png ADDED

Git LFS Details

  • SHA256: 43547722eb67c24356a1f6c28485a0ec2cd2513a3b99579b697d2966c908892e
  • Pointer size: 131 Bytes
  • Size of remote file: 316 kB
images/i2i.png ADDED

Git LFS Details

  • SHA256: 78a94a6ba5fceea6e2f6e5a85dece91ce700b73c15884d1a7e8d435c71a1d5cd
  • Pointer size: 131 Bytes
  • Size of remote file: 555 kB
images/oneshot.png ADDED

Git LFS Details

  • SHA256: a14fb05ad2b648876021f2fa87044bc5693b99e8977920022c38e137a3122e61
  • Pointer size: 131 Bytes
  • Size of remote file: 572 kB
images/t2i.png ADDED

Git LFS Details

  • SHA256: 91445e3d371ec49fc47f828f97662597a065163b6893f457d1700404d3272e0c
  • Pointer size: 132 Bytes
  • Size of remote file: 1.27 MB
images/teaser.png ADDED

Git LFS Details

  • SHA256: 5bc7be63232429c28d5633a1654344de04dba2e13e163574f56bea6887a6402f
  • Pointer size: 132 Bytes
  • Size of remote file: 1.98 MB
library/__init__.py ADDED
File without changes
library/adafactor_fused.py ADDED
@@ -0,0 +1,138 @@
import math
import torch
from transformers import Adafactor

# stochastic rounding for bfloat16
# The implementation was provided by 2kpr. Thank you very much!

def copy_stochastic_(target: torch.Tensor, source: torch.Tensor):
    """
    copies source into target using stochastic rounding

    Args:
        target: the target tensor with dtype=bfloat16
        source: the source tensor with dtype=float32
    """
    # create a random 16 bit integer
    result = torch.randint_like(source, dtype=torch.int32, low=0, high=(1 << 16))

    # add the random number to the lower 16 bit of the mantissa
    result.add_(source.view(dtype=torch.int32))

    # mask off the lower 16 bit of the mantissa
    result.bitwise_and_(-65536)  # -65536 = FFFF0000 as a signed int32

    # copy the higher 16 bit into the target tensor
    target.copy_(result.view(dtype=torch.float32))

    del result


@torch.no_grad()
def adafactor_step_param(self, p, group):
    if p.grad is None:
        return
    grad = p.grad
    if grad.dtype in {torch.float16, torch.bfloat16}:
        grad = grad.float()
    if grad.is_sparse:
        raise RuntimeError("Adafactor does not support sparse gradients.")

    state = self.state[p]
    grad_shape = grad.shape

    factored, use_first_moment = Adafactor._get_options(group, grad_shape)
    # State Initialization
    if len(state) == 0:
        state["step"] = 0

        if use_first_moment:
            # Exponential moving average of gradient values
            state["exp_avg"] = torch.zeros_like(grad)
        if factored:
            state["exp_avg_sq_row"] = torch.zeros(grad_shape[:-1]).to(grad)
            state["exp_avg_sq_col"] = torch.zeros(grad_shape[:-2] + grad_shape[-1:]).to(grad)
        else:
            state["exp_avg_sq"] = torch.zeros_like(grad)

        state["RMS"] = 0
    else:
        if use_first_moment:
            state["exp_avg"] = state["exp_avg"].to(grad)
        if factored:
            state["exp_avg_sq_row"] = state["exp_avg_sq_row"].to(grad)
            state["exp_avg_sq_col"] = state["exp_avg_sq_col"].to(grad)
        else:
            state["exp_avg_sq"] = state["exp_avg_sq"].to(grad)

    p_data_fp32 = p
    if p.dtype in {torch.float16, torch.bfloat16}:
        p_data_fp32 = p_data_fp32.float()

    state["step"] += 1
    state["RMS"] = Adafactor._rms(p_data_fp32)
    lr = Adafactor._get_lr(group, state)

    beta2t = 1.0 - math.pow(state["step"], group["decay_rate"])
    update = (grad**2) + group["eps"][0]
    if factored:
        exp_avg_sq_row = state["exp_avg_sq_row"]
        exp_avg_sq_col = state["exp_avg_sq_col"]

        exp_avg_sq_row.mul_(beta2t).add_(update.mean(dim=-1), alpha=(1.0 - beta2t))
        exp_avg_sq_col.mul_(beta2t).add_(update.mean(dim=-2), alpha=(1.0 - beta2t))

        # Approximation of exponential moving average of square of gradient
        update = Adafactor._approx_sq_grad(exp_avg_sq_row, exp_avg_sq_col)
        update.mul_(grad)
    else:
        exp_avg_sq = state["exp_avg_sq"]

        exp_avg_sq.mul_(beta2t).add_(update, alpha=(1.0 - beta2t))
        update = exp_avg_sq.rsqrt().mul_(grad)

    update.div_((Adafactor._rms(update) / group["clip_threshold"]).clamp_(min=1.0))
    update.mul_(lr)

    if use_first_moment:
        exp_avg = state["exp_avg"]
        exp_avg.mul_(group["beta1"]).add_(update, alpha=(1 - group["beta1"]))
        update = exp_avg

    if group["weight_decay"] != 0:
        p_data_fp32.add_(p_data_fp32, alpha=(-group["weight_decay"] * lr))

    p_data_fp32.add_(-update)

    # if p.dtype in {torch.float16, torch.bfloat16}:
    #     p.copy_(p_data_fp32)

    if p.dtype == torch.bfloat16:
        copy_stochastic_(p, p_data_fp32)
    elif p.dtype == torch.float16:
        p.copy_(p_data_fp32)


@torch.no_grad()
def adafactor_step(self, closure=None):
    """
    Performs a single optimization step

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        loss = closure()

    for group in self.param_groups:
        for p in group["params"]:
            adafactor_step_param(self, p, group)

    return loss


def patch_adafactor_fused(optimizer: Adafactor):
    optimizer.step_param = adafactor_step_param.__get__(optimizer)
    optimizer.step = adafactor_step.__get__(optimizer)
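The patch above rebinds step_param and step onto an existing transformers.Adafactor instance so a training loop can update parameters one at a time (for example from a fused backward pass), with stochastic-rounding copies for bf16 weights. The snippet below is a minimal usage sketch only; the toy model, hyperparameters, and loop are illustrative assumptions, not this repository's training code.

import torch
import torch.nn as nn
from transformers import Adafactor

from library.adafactor_fused import patch_adafactor_fused

# Toy bf16 model and optimizer; real scripts build these from the training config.
model = nn.Linear(16, 16).to(torch.bfloat16)
optimizer = Adafactor(model.parameters(), lr=1e-3, relative_step=False, scale_parameter=False)

# Bind the fused step functions onto this optimizer instance.
patch_adafactor_fused(optimizer)

x = torch.randn(4, 16, dtype=torch.bfloat16)
loss = model(x).float().pow(2).mean()
loss.backward()

# Update each parameter individually and free its gradient right away,
# which is the point of the step_param variant.
for group in optimizer.param_groups:
    for p in group["params"]:
        if p.grad is not None:
            optimizer.step_param(p, group)
            p.grad = None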
library/attention_processors.py ADDED
@@ -0,0 +1,227 @@
import math
from typing import Any
from einops import rearrange
import torch
from diffusers.models.attention_processor import Attention


# flash attention forwards and backwards

# https://arxiv.org/abs/2205.14135

EPSILON = 1e-6


class FlashAttentionFunction(torch.autograd.function.Function):
    @staticmethod
    @torch.no_grad()
    def forward(ctx, q, k, v, mask, causal, q_bucket_size, k_bucket_size):
        """Algorithm 2 in the paper"""

        device = q.device
        dtype = q.dtype
        max_neg_value = -torch.finfo(q.dtype).max
        qk_len_diff = max(k.shape[-2] - q.shape[-2], 0)

        o = torch.zeros_like(q)
        all_row_sums = torch.zeros((*q.shape[:-1], 1), dtype=dtype, device=device)
        all_row_maxes = torch.full(
            (*q.shape[:-1], 1), max_neg_value, dtype=dtype, device=device
        )

        scale = q.shape[-1] ** -0.5

        if mask is None:
            mask = (None,) * math.ceil(q.shape[-2] / q_bucket_size)
        else:
            mask = rearrange(mask, "b n -> b 1 1 n")
            mask = mask.split(q_bucket_size, dim=-1)

        row_splits = zip(
            q.split(q_bucket_size, dim=-2),
            o.split(q_bucket_size, dim=-2),
            mask,
            all_row_sums.split(q_bucket_size, dim=-2),
            all_row_maxes.split(q_bucket_size, dim=-2),
        )

        for ind, (qc, oc, row_mask, row_sums, row_maxes) in enumerate(row_splits):
            q_start_index = ind * q_bucket_size - qk_len_diff

            col_splits = zip(
                k.split(k_bucket_size, dim=-2),
                v.split(k_bucket_size, dim=-2),
            )

            for k_ind, (kc, vc) in enumerate(col_splits):
                k_start_index = k_ind * k_bucket_size

                attn_weights = (
                    torch.einsum("... i d, ... j d -> ... i j", qc, kc) * scale
                )

                if row_mask is not None:
                    attn_weights.masked_fill_(~row_mask, max_neg_value)

                if causal and q_start_index < (k_start_index + k_bucket_size - 1):
                    causal_mask = torch.ones(
                        (qc.shape[-2], kc.shape[-2]), dtype=torch.bool, device=device
                    ).triu(q_start_index - k_start_index + 1)
                    attn_weights.masked_fill_(causal_mask, max_neg_value)

                block_row_maxes = attn_weights.amax(dim=-1, keepdims=True)
                attn_weights -= block_row_maxes
                exp_weights = torch.exp(attn_weights)

                if row_mask is not None:
                    exp_weights.masked_fill_(~row_mask, 0.0)

                block_row_sums = exp_weights.sum(dim=-1, keepdims=True).clamp(
                    min=EPSILON
                )

                new_row_maxes = torch.maximum(block_row_maxes, row_maxes)

                exp_values = torch.einsum(
                    "... i j, ... j d -> ... i d", exp_weights, vc
                )

                exp_row_max_diff = torch.exp(row_maxes - new_row_maxes)
                exp_block_row_max_diff = torch.exp(block_row_maxes - new_row_maxes)

                new_row_sums = (
                    exp_row_max_diff * row_sums
                    + exp_block_row_max_diff * block_row_sums
                )

                oc.mul_((row_sums / new_row_sums) * exp_row_max_diff).add_(
                    (exp_block_row_max_diff / new_row_sums) * exp_values
                )

                row_maxes.copy_(new_row_maxes)
                row_sums.copy_(new_row_sums)

        ctx.args = (causal, scale, mask, q_bucket_size, k_bucket_size)
        ctx.save_for_backward(q, k, v, o, all_row_sums, all_row_maxes)

        return o

    @staticmethod
    @torch.no_grad()
    def backward(ctx, do):
        """Algorithm 4 in the paper"""

        causal, scale, mask, q_bucket_size, k_bucket_size = ctx.args
        q, k, v, o, l, m = ctx.saved_tensors

        device = q.device

        max_neg_value = -torch.finfo(q.dtype).max
        qk_len_diff = max(k.shape[-2] - q.shape[-2], 0)

        dq = torch.zeros_like(q)
        dk = torch.zeros_like(k)
        dv = torch.zeros_like(v)

        row_splits = zip(
            q.split(q_bucket_size, dim=-2),
            o.split(q_bucket_size, dim=-2),
            do.split(q_bucket_size, dim=-2),
            mask,
            l.split(q_bucket_size, dim=-2),
            m.split(q_bucket_size, dim=-2),
            dq.split(q_bucket_size, dim=-2),
        )

        for ind, (qc, oc, doc, row_mask, lc, mc, dqc) in enumerate(row_splits):
            q_start_index = ind * q_bucket_size - qk_len_diff

            col_splits = zip(
                k.split(k_bucket_size, dim=-2),
                v.split(k_bucket_size, dim=-2),
                dk.split(k_bucket_size, dim=-2),
                dv.split(k_bucket_size, dim=-2),
            )

            for k_ind, (kc, vc, dkc, dvc) in enumerate(col_splits):
                k_start_index = k_ind * k_bucket_size

                attn_weights = (
                    torch.einsum("... i d, ... j d -> ... i j", qc, kc) * scale
                )

                if causal and q_start_index < (k_start_index + k_bucket_size - 1):
                    causal_mask = torch.ones(
                        (qc.shape[-2], kc.shape[-2]), dtype=torch.bool, device=device
                    ).triu(q_start_index - k_start_index + 1)
                    attn_weights.masked_fill_(causal_mask, max_neg_value)

                exp_attn_weights = torch.exp(attn_weights - mc)

                if row_mask is not None:
                    exp_attn_weights.masked_fill_(~row_mask, 0.0)

                p = exp_attn_weights / lc

                dv_chunk = torch.einsum("... i j, ... i d -> ... j d", p, doc)
                dp = torch.einsum("... i d, ... j d -> ... i j", doc, vc)

                D = (doc * oc).sum(dim=-1, keepdims=True)
                ds = p * scale * (dp - D)

                dq_chunk = torch.einsum("... i j, ... j d -> ... i d", ds, kc)
                dk_chunk = torch.einsum("... i j, ... i d -> ... j d", ds, qc)

                dqc.add_(dq_chunk)
                dkc.add_(dk_chunk)
                dvc.add_(dv_chunk)

        return dq, dk, dv, None, None, None, None


class FlashAttnProcessor:
    def __call__(
        self,
        attn: Attention,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
    ) -> Any:
        q_bucket_size = 512
        k_bucket_size = 1024

        h = attn.heads
        q = attn.to_q(hidden_states)

        encoder_hidden_states = (
            encoder_hidden_states
            if encoder_hidden_states is not None
            else hidden_states
        )
        encoder_hidden_states = encoder_hidden_states.to(hidden_states.dtype)

        if hasattr(attn, "hypernetwork") and attn.hypernetwork is not None:
            context_k, context_v = attn.hypernetwork.forward(
                hidden_states, encoder_hidden_states
            )
            context_k = context_k.to(hidden_states.dtype)
            context_v = context_v.to(hidden_states.dtype)
        else:
            context_k = encoder_hidden_states
            context_v = encoder_hidden_states

        k = attn.to_k(context_k)
        v = attn.to_v(context_v)
        del encoder_hidden_states, hidden_states

        q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v))

        out = FlashAttentionFunction.apply(
            q, k, v, attention_mask, False, q_bucket_size, k_bucket_size
        )

        out = rearrange(out, "b h n d -> b n (h d)")

        out = attn.to_out[0](out)
        out = attn.to_out[1](out)
        return out
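As a quick sanity check of the bucketed attention above, FlashAttentionFunction can be exercised directly on random tensors and compared with plain softmax attention. The shapes and bucket sizes below are arbitrary demo values, and the sketch assumes einops and diffusers are installed so the module imports.

import torch

from library.attention_processors import FlashAttentionFunction

# (batch, heads, sequence length, head dim) -- arbitrary demo shapes.
q = torch.randn(2, 8, 1024, 64)
k = torch.randn(2, 8, 1024, 64)
v = torch.randn(2, 8, 1024, 64)

# mask=None, causal=False, q_bucket_size=512, k_bucket_size=1024,
# matching the defaults FlashAttnProcessor passes in.
out = FlashAttentionFunction.apply(q, k, v, None, False, 512, 1024)

# Reference: ordinary softmax attention; the chunked version should agree
# up to floating-point error.
ref = torch.softmax(q @ k.transpose(-1, -2) * q.shape[-1] ** -0.5, dim=-1) @ v
print(out.shape, (out - ref).abs().max().item())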
library/config_util.py ADDED
@@ -0,0 +1,716 @@
1
+ import argparse
2
+ from dataclasses import (
3
+ asdict,
4
+ dataclass,
5
+ )
6
+ import functools
7
+ import random
8
+ from textwrap import dedent, indent
9
+ import json
10
+ from pathlib import Path
11
+
12
+ # from toolz import curry
13
+ from typing import Dict, List, Optional, Sequence, Tuple, Union
14
+
15
+ import toml
16
+ import voluptuous
17
+ from voluptuous import (
18
+ Any,
19
+ ExactSequence,
20
+ MultipleInvalid,
21
+ Object,
22
+ Required,
23
+ Schema,
24
+ )
25
+ from transformers import CLIPTokenizer
26
+
27
+ from . import train_util
28
+ from .train_util import (
29
+ DreamBoothSubset,
30
+ FineTuningSubset,
31
+ ControlNetSubset,
32
+ DreamBoothDataset,
33
+ FineTuningDataset,
34
+ ControlNetDataset,
35
+ DatasetGroup,
36
+ )
37
+ from .utils import setup_logging
38
+
39
+ setup_logging()
40
+ import logging
41
+
42
+ logger = logging.getLogger(__name__)
43
+
44
+
45
+ def add_config_arguments(parser: argparse.ArgumentParser):
46
+ parser.add_argument(
47
+ "--dataset_config", type=Path, default=None, help="config file for detail settings / 詳細な設定用の設定ファイル"
48
+ )
49
+
50
+
51
+ # TODO: inherit Params class in Subset, Dataset
52
+
53
+
54
+ @dataclass
55
+ class BaseSubsetParams:
56
+ image_dir: Optional[str] = None
57
+ num_repeats: int = 1
58
+ shuffle_caption: bool = False
59
+ caption_separator: str = (",",)
60
+ keep_tokens: int = 0
61
+ keep_tokens_separator: str = (None,)
62
+ secondary_separator: Optional[str] = None
63
+ enable_wildcard: bool = False
64
+ color_aug: bool = False
65
+ flip_aug: bool = False
66
+ face_crop_aug_range: Optional[Tuple[float, float]] = None
67
+ random_crop: bool = False
68
+ caption_prefix: Optional[str] = None
69
+ caption_suffix: Optional[str] = None
70
+ caption_dropout_rate: float = 0.0
71
+ caption_dropout_every_n_epochs: int = 0
72
+ caption_tag_dropout_rate: float = 0.0
73
+ token_warmup_min: int = 1
74
+ token_warmup_step: float = 0
75
+ custom_attributes: Optional[Dict[str, Any]] = None
76
+
77
+
78
+ @dataclass
79
+ class DreamBoothSubsetParams(BaseSubsetParams):
80
+ is_reg: bool = False
81
+ class_tokens: Optional[str] = None
82
+ caption_extension: str = ".caption"
83
+ cache_info: bool = False
84
+ alpha_mask: bool = False
85
+
86
+
87
+ @dataclass
88
+ class FineTuningSubsetParams(BaseSubsetParams):
89
+ metadata_file: Optional[str] = None
90
+ alpha_mask: bool = False
91
+
92
+
93
+ @dataclass
94
+ class ControlNetSubsetParams(BaseSubsetParams):
95
+ conditioning_data_dir: str = None
96
+ caption_extension: str = ".caption"
97
+ cache_info: bool = False
98
+
99
+
100
+ @dataclass
101
+ class BaseDatasetParams:
102
+ resolution: Optional[Tuple[int, int]] = None
103
+ network_multiplier: float = 1.0
104
+ debug_dataset: bool = False
105
+
106
+
107
+ @dataclass
108
+ class DreamBoothDatasetParams(BaseDatasetParams):
109
+ batch_size: int = 1
110
+ enable_bucket: bool = False
111
+ min_bucket_reso: int = 256
112
+ max_bucket_reso: int = 1024
113
+ bucket_reso_steps: int = 64
114
+ bucket_no_upscale: bool = False
115
+ prior_loss_weight: float = 1.0
116
+
117
+
118
+ @dataclass
119
+ class FineTuningDatasetParams(BaseDatasetParams):
120
+ batch_size: int = 1
121
+ enable_bucket: bool = False
122
+ min_bucket_reso: int = 256
123
+ max_bucket_reso: int = 1024
124
+ bucket_reso_steps: int = 64
125
+ bucket_no_upscale: bool = False
126
+
127
+
128
+ @dataclass
129
+ class ControlNetDatasetParams(BaseDatasetParams):
130
+ batch_size: int = 1
131
+ enable_bucket: bool = False
132
+ min_bucket_reso: int = 256
133
+ max_bucket_reso: int = 1024
134
+ bucket_reso_steps: int = 64
135
+ bucket_no_upscale: bool = False
136
+
137
+
138
+ @dataclass
139
+ class SubsetBlueprint:
140
+ params: Union[DreamBoothSubsetParams, FineTuningSubsetParams]
141
+
142
+
143
+ @dataclass
144
+ class DatasetBlueprint:
145
+ is_dreambooth: bool
146
+ is_controlnet: bool
147
+ params: Union[DreamBoothDatasetParams, FineTuningDatasetParams]
148
+ subsets: Sequence[SubsetBlueprint]
149
+
150
+
151
+ @dataclass
152
+ class DatasetGroupBlueprint:
153
+ datasets: Sequence[DatasetBlueprint]
154
+
155
+
156
+ @dataclass
157
+ class Blueprint:
158
+ dataset_group: DatasetGroupBlueprint
159
+
160
+
161
+ class ConfigSanitizer:
162
+ # @curry
163
+ @staticmethod
164
+ def __validate_and_convert_twodim(klass, value: Sequence) -> Tuple:
165
+ Schema(ExactSequence([klass, klass]))(value)
166
+ return tuple(value)
167
+
168
+ # @curry
169
+ @staticmethod
170
+ def __validate_and_convert_scalar_or_twodim(klass, value: Union[float, Sequence]) -> Tuple:
171
+ Schema(Any(klass, ExactSequence([klass, klass])))(value)
172
+ try:
173
+ Schema(klass)(value)
174
+ return (value, value)
175
+ except:
176
+ return ConfigSanitizer.__validate_and_convert_twodim(klass, value)
177
+
178
+ # subset schema
179
+ SUBSET_ASCENDABLE_SCHEMA = {
180
+ "color_aug": bool,
181
+ "face_crop_aug_range": functools.partial(__validate_and_convert_twodim.__func__, float),
182
+ "flip_aug": bool,
183
+ "num_repeats": int,
184
+ "random_crop": bool,
185
+ "shuffle_caption": bool,
186
+ "keep_tokens": int,
187
+ "keep_tokens_separator": str,
188
+ "secondary_separator": str,
189
+ "caption_separator": str,
190
+ "enable_wildcard": bool,
191
+ "token_warmup_min": int,
192
+ "token_warmup_step": Any(float, int),
193
+ "caption_prefix": str,
194
+ "caption_suffix": str,
195
+ "custom_attributes": dict,
196
+ }
197
+ # DO means DropOut
198
+ DO_SUBSET_ASCENDABLE_SCHEMA = {
199
+ "caption_dropout_every_n_epochs": int,
200
+ "caption_dropout_rate": Any(float, int),
201
+ "caption_tag_dropout_rate": Any(float, int),
202
+ }
203
+ # DB means DreamBooth
204
+ DB_SUBSET_ASCENDABLE_SCHEMA = {
205
+ "caption_extension": str,
206
+ "class_tokens": str,
207
+ "cache_info": bool,
208
+ }
209
+ DB_SUBSET_DISTINCT_SCHEMA = {
210
+ Required("image_dir"): str,
211
+ "is_reg": bool,
212
+ "alpha_mask": bool,
213
+ }
214
+ # FT means FineTuning
215
+ FT_SUBSET_DISTINCT_SCHEMA = {
216
+ Required("metadata_file"): str,
217
+ "image_dir": str,
218
+ "alpha_mask": bool,
219
+ }
220
+ CN_SUBSET_ASCENDABLE_SCHEMA = {
221
+ "caption_extension": str,
222
+ "cache_info": bool,
223
+ }
224
+ CN_SUBSET_DISTINCT_SCHEMA = {
225
+ Required("image_dir"): str,
226
+ Required("conditioning_data_dir"): str,
227
+ }
228
+
229
+ # datasets schema
230
+ DATASET_ASCENDABLE_SCHEMA = {
231
+ "batch_size": int,
232
+ "bucket_no_upscale": bool,
233
+ "bucket_reso_steps": int,
234
+ "enable_bucket": bool,
235
+ "max_bucket_reso": int,
236
+ "min_bucket_reso": int,
237
+ "resolution": functools.partial(__validate_and_convert_scalar_or_twodim.__func__, int),
238
+ "network_multiplier": float,
239
+ }
240
+
241
+ # options handled by argparse but not handled by user config
242
+ ARGPARSE_SPECIFIC_SCHEMA = {
243
+ "debug_dataset": bool,
244
+ "max_token_length": Any(None, int),
245
+ "prior_loss_weight": Any(float, int),
246
+ }
247
+ # for handling default None value of argparse
248
+ ARGPARSE_NULLABLE_OPTNAMES = [
249
+ "face_crop_aug_range",
250
+ "resolution",
251
+ ]
252
+ # prepare map because option name may differ among argparse and user config
253
+ ARGPARSE_OPTNAME_TO_CONFIG_OPTNAME = {
254
+ "train_batch_size": "batch_size",
255
+ "dataset_repeats": "num_repeats",
256
+ }
257
+
258
+ def __init__(self, support_dreambooth: bool, support_finetuning: bool, support_controlnet: bool, support_dropout: bool) -> None:
259
+ assert support_dreambooth or support_finetuning or support_controlnet, (
260
+ "Neither DreamBooth mode nor fine tuning mode nor controlnet mode specified. Please specify one mode or more."
261
+ + " / DreamBooth モードか fine tuning モードか controlnet モードのどれも指定されていません。1つ以上指定してください。"
262
+ )
263
+
264
+ self.db_subset_schema = self.__merge_dict(
265
+ self.SUBSET_ASCENDABLE_SCHEMA,
266
+ self.DB_SUBSET_DISTINCT_SCHEMA,
267
+ self.DB_SUBSET_ASCENDABLE_SCHEMA,
268
+ self.DO_SUBSET_ASCENDABLE_SCHEMA if support_dropout else {},
269
+ )
270
+
271
+ self.ft_subset_schema = self.__merge_dict(
272
+ self.SUBSET_ASCENDABLE_SCHEMA,
273
+ self.FT_SUBSET_DISTINCT_SCHEMA,
274
+ self.DO_SUBSET_ASCENDABLE_SCHEMA if support_dropout else {},
275
+ )
276
+
277
+ self.cn_subset_schema = self.__merge_dict(
278
+ self.SUBSET_ASCENDABLE_SCHEMA,
279
+ self.CN_SUBSET_DISTINCT_SCHEMA,
280
+ self.CN_SUBSET_ASCENDABLE_SCHEMA,
281
+ self.DO_SUBSET_ASCENDABLE_SCHEMA if support_dropout else {},
282
+ )
283
+
284
+ self.db_dataset_schema = self.__merge_dict(
285
+ self.DATASET_ASCENDABLE_SCHEMA,
286
+ self.SUBSET_ASCENDABLE_SCHEMA,
287
+ self.DB_SUBSET_ASCENDABLE_SCHEMA,
288
+ self.DO_SUBSET_ASCENDABLE_SCHEMA if support_dropout else {},
289
+ {"subsets": [self.db_subset_schema]},
290
+ )
291
+
292
+ self.ft_dataset_schema = self.__merge_dict(
293
+ self.DATASET_ASCENDABLE_SCHEMA,
294
+ self.SUBSET_ASCENDABLE_SCHEMA,
295
+ self.DO_SUBSET_ASCENDABLE_SCHEMA if support_dropout else {},
296
+ {"subsets": [self.ft_subset_schema]},
297
+ )
298
+
299
+ self.cn_dataset_schema = self.__merge_dict(
300
+ self.DATASET_ASCENDABLE_SCHEMA,
301
+ self.SUBSET_ASCENDABLE_SCHEMA,
302
+ self.CN_SUBSET_ASCENDABLE_SCHEMA,
303
+ self.DO_SUBSET_ASCENDABLE_SCHEMA if support_dropout else {},
304
+ {"subsets": [self.cn_subset_schema]},
305
+ )
306
+
307
+ if support_dreambooth and support_finetuning:
308
+
309
+ def validate_flex_dataset(dataset_config: dict):
310
+ subsets_config = dataset_config.get("subsets", [])
311
+
312
+ if support_controlnet and all(["conditioning_data_dir" in subset for subset in subsets_config]):
313
+ return Schema(self.cn_dataset_schema)(dataset_config)
314
+ # check dataset meets FT style
315
+ # NOTE: all FT subsets should have "metadata_file"
316
+ elif all(["metadata_file" in subset for subset in subsets_config]):
317
+ return Schema(self.ft_dataset_schema)(dataset_config)
318
+ # check dataset meets DB style
319
+ # NOTE: all DB subsets should have no "metadata_file"
320
+ elif all(["metadata_file" not in subset for subset in subsets_config]):
321
+ return Schema(self.db_dataset_schema)(dataset_config)
322
+ else:
323
+ raise voluptuous.Invalid(
324
+ "DreamBooth subset and fine tuning subset cannot be mixed in the same dataset. Please split them into separate datasets. / DreamBoothのサブセットとfine tuninのサブセットを同一のデータセットに混在させることはできません。別々のデータセットに分割してください。"
325
+ )
326
+
327
+ self.dataset_schema = validate_flex_dataset
328
+ elif support_dreambooth:
329
+ if support_controlnet:
330
+ self.dataset_schema = self.cn_dataset_schema
331
+ else:
332
+ self.dataset_schema = self.db_dataset_schema
333
+ elif support_finetuning:
334
+ self.dataset_schema = self.ft_dataset_schema
335
+ elif support_controlnet:
336
+ self.dataset_schema = self.cn_dataset_schema
337
+
338
+ self.general_schema = self.__merge_dict(
339
+ self.DATASET_ASCENDABLE_SCHEMA,
340
+ self.SUBSET_ASCENDABLE_SCHEMA,
341
+ self.DB_SUBSET_ASCENDABLE_SCHEMA if support_dreambooth else {},
342
+ self.CN_SUBSET_ASCENDABLE_SCHEMA if support_controlnet else {},
343
+ self.DO_SUBSET_ASCENDABLE_SCHEMA if support_dropout else {},
344
+ )
345
+
346
+ self.user_config_validator = Schema(
347
+ {
348
+ "general": self.general_schema,
349
+ "datasets": [self.dataset_schema],
350
+ }
351
+ )
352
+
353
+ self.argparse_schema = self.__merge_dict(
354
+ self.general_schema,
355
+ self.ARGPARSE_SPECIFIC_SCHEMA,
356
+ {optname: Any(None, self.general_schema[optname]) for optname in self.ARGPARSE_NULLABLE_OPTNAMES},
357
+ {a_name: self.general_schema[c_name] for a_name, c_name in self.ARGPARSE_OPTNAME_TO_CONFIG_OPTNAME.items()},
358
+ )
359
+
360
+ self.argparse_config_validator = Schema(Object(self.argparse_schema), extra=voluptuous.ALLOW_EXTRA)
361
+
362
+ def sanitize_user_config(self, user_config: dict) -> dict:
363
+ try:
364
+ return self.user_config_validator(user_config)
365
+ except MultipleInvalid:
366
+ # TODO: エラー発生時のメッセージをわかりやすくする
367
+ logger.error("Invalid user config / ユーザ設定の形式が正しくないようです")
368
+ raise
369
+
370
+ # NOTE: In nature, argument parser result is not needed to be sanitize
371
+ # However this will help us to detect program bug
372
+ def sanitize_argparse_namespace(self, argparse_namespace: argparse.Namespace) -> argparse.Namespace:
373
+ try:
374
+ return self.argparse_config_validator(argparse_namespace)
375
+ except MultipleInvalid:
376
+ # XXX: this should be a bug
377
+ logger.error(
378
+ "Invalid cmdline parsed arguments. This should be a bug. / コマンドラインのパース結果が正しくないようです。プログラムのバグの可能性が高いです。"
379
+ )
380
+ raise
381
+
382
+ # NOTE: value would be overwritten by latter dict if there is already the same key
383
+ @staticmethod
384
+ def __merge_dict(*dict_list: dict) -> dict:
385
+ merged = {}
386
+ for schema in dict_list:
387
+ # merged |= schema
388
+ for k, v in schema.items():
389
+ merged[k] = v
390
+ return merged
391
+
392
+
393
+ class BlueprintGenerator:
394
+ BLUEPRINT_PARAM_NAME_TO_CONFIG_OPTNAME = {}
395
+
396
+ def __init__(self, sanitizer: ConfigSanitizer):
397
+ self.sanitizer = sanitizer
398
+
399
+ # runtime_params is for parameters which is only configurable on runtime, such as tokenizer
400
+ def generate(self, user_config: dict, argparse_namespace: argparse.Namespace, **runtime_params) -> Blueprint:
401
+ sanitized_user_config = self.sanitizer.sanitize_user_config(user_config)
402
+ sanitized_argparse_namespace = self.sanitizer.sanitize_argparse_namespace(argparse_namespace)
403
+
404
+ # convert argparse namespace to dict like config
405
+ # NOTE: it is ok to have extra entries in dict
406
+ optname_map = self.sanitizer.ARGPARSE_OPTNAME_TO_CONFIG_OPTNAME
407
+ argparse_config = {
408
+ optname_map.get(optname, optname): value for optname, value in vars(sanitized_argparse_namespace).items()
409
+ }
410
+
411
+ general_config = sanitized_user_config.get("general", {})
412
+
413
+ dataset_blueprints = []
414
+ for dataset_config in sanitized_user_config.get("datasets", []):
415
+ # NOTE: if subsets have no "metadata_file", these are DreamBooth datasets/subsets
416
+ subsets = dataset_config.get("subsets", [])
417
+ is_dreambooth = all(["metadata_file" not in subset for subset in subsets])
418
+ is_controlnet = all(["conditioning_data_dir" in subset for subset in subsets])
419
+ if is_controlnet:
420
+ subset_params_klass = ControlNetSubsetParams
421
+ dataset_params_klass = ControlNetDatasetParams
422
+ elif is_dreambooth:
423
+ subset_params_klass = DreamBoothSubsetParams
424
+ dataset_params_klass = DreamBoothDatasetParams
425
+ else:
426
+ subset_params_klass = FineTuningSubsetParams
427
+ dataset_params_klass = FineTuningDatasetParams
428
+
429
+ subset_blueprints = []
430
+ for subset_config in subsets:
431
+ params = self.generate_params_by_fallbacks(
432
+ subset_params_klass, [subset_config, dataset_config, general_config, argparse_config, runtime_params]
433
+ )
434
+ subset_blueprints.append(SubsetBlueprint(params))
435
+
436
+ params = self.generate_params_by_fallbacks(
437
+ dataset_params_klass, [dataset_config, general_config, argparse_config, runtime_params]
438
+ )
439
+ dataset_blueprints.append(DatasetBlueprint(is_dreambooth, is_controlnet, params, subset_blueprints))
440
+
441
+ dataset_group_blueprint = DatasetGroupBlueprint(dataset_blueprints)
442
+
443
+ return Blueprint(dataset_group_blueprint)
444
+
445
+ @staticmethod
446
+ def generate_params_by_fallbacks(param_klass, fallbacks: Sequence[dict]):
447
+ name_map = BlueprintGenerator.BLUEPRINT_PARAM_NAME_TO_CONFIG_OPTNAME
448
+ search_value = BlueprintGenerator.search_value
449
+ default_params = asdict(param_klass())
450
+ param_names = default_params.keys()
451
+
452
+ params = {name: search_value(name_map.get(name, name), fallbacks, default_params.get(name)) for name in param_names}
453
+
454
+ return param_klass(**params)
455
+
456
+ @staticmethod
457
+ def search_value(key: str, fallbacks: Sequence[dict], default_value=None):
458
+ for cand in fallbacks:
459
+ value = cand.get(key)
460
+ if value is not None:
461
+ return value
462
+
463
+ return default_value
464
+
465
+
466
+ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlueprint):
467
+ datasets: List[Union[DreamBoothDataset, FineTuningDataset, ControlNetDataset]] = []
468
+
469
+ for dataset_blueprint in dataset_group_blueprint.datasets:
470
+ if dataset_blueprint.is_controlnet:
471
+ subset_klass = ControlNetSubset
472
+ dataset_klass = ControlNetDataset
473
+ elif dataset_blueprint.is_dreambooth:
474
+ subset_klass = DreamBoothSubset
475
+ dataset_klass = DreamBoothDataset
476
+ else:
477
+ subset_klass = FineTuningSubset
478
+ dataset_klass = FineTuningDataset
479
+
480
+ subsets = [subset_klass(**asdict(subset_blueprint.params)) for subset_blueprint in dataset_blueprint.subsets]
481
+ dataset = dataset_klass(subsets=subsets, **asdict(dataset_blueprint.params))
482
+ datasets.append(dataset)
483
+
484
+ # print info
485
+ info = ""
486
+ for i, dataset in enumerate(datasets):
487
+ is_dreambooth = isinstance(dataset, DreamBoothDataset)
488
+ is_controlnet = isinstance(dataset, ControlNetDataset)
489
+ info += dedent(
490
+ f"""\
491
+ [Dataset {i}]
492
+ batch_size: {dataset.batch_size}
493
+ resolution: {(dataset.width, dataset.height)}
494
+ enable_bucket: {dataset.enable_bucket}
495
+ network_multiplier: {dataset.network_multiplier}
496
+ """
497
+ )
498
+
499
+ if dataset.enable_bucket:
500
+ info += indent(
501
+ dedent(
502
+ f"""\
503
+ min_bucket_reso: {dataset.min_bucket_reso}
504
+ max_bucket_reso: {dataset.max_bucket_reso}
505
+ bucket_reso_steps: {dataset.bucket_reso_steps}
506
+ bucket_no_upscale: {dataset.bucket_no_upscale}
507
+ \n"""
508
+ ),
509
+ " ",
510
+ )
511
+ else:
512
+ info += "\n"
513
+
514
+ for j, subset in enumerate(dataset.subsets):
515
+ info += indent(
516
+ dedent(
517
+ f"""\
518
+ [Subset {j} of Dataset {i}]
519
+ image_dir: "{subset.image_dir}"
520
+ image_count: {subset.img_count}
521
+ num_repeats: {subset.num_repeats}
522
+ shuffle_caption: {subset.shuffle_caption}
523
+ keep_tokens: {subset.keep_tokens}
524
+ keep_tokens_separator: {subset.keep_tokens_separator}
525
+ caption_separator: {subset.caption_separator}
526
+ secondary_separator: {subset.secondary_separator}
527
+ enable_wildcard: {subset.enable_wildcard}
528
+ caption_dropout_rate: {subset.caption_dropout_rate}
529
+ caption_dropout_every_n_epochs: {subset.caption_dropout_every_n_epochs}
530
+ caption_tag_dropout_rate: {subset.caption_tag_dropout_rate}
531
+ caption_prefix: {subset.caption_prefix}
532
+ caption_suffix: {subset.caption_suffix}
533
+ color_aug: {subset.color_aug}
534
+ flip_aug: {subset.flip_aug}
535
+ face_crop_aug_range: {subset.face_crop_aug_range}
536
+ random_crop: {subset.random_crop}
537
+ token_warmup_min: {subset.token_warmup_min}
538
+ token_warmup_step: {subset.token_warmup_step}
539
+ alpha_mask: {subset.alpha_mask}
540
+ custom_attributes: {subset.custom_attributes}
541
+ """
542
+ ),
543
+ " ",
544
+ )
545
+
546
+ if is_dreambooth:
547
+ info += indent(
548
+ dedent(
549
+ f"""\
550
+ is_reg: {subset.is_reg}
551
+ class_tokens: {subset.class_tokens}
552
+ caption_extension: {subset.caption_extension}
553
+ \n"""
554
+ ),
555
+ " ",
556
+ )
557
+ elif not is_controlnet:
558
+ info += indent(
559
+ dedent(
560
+ f"""\
561
+ metadata_file: {subset.metadata_file}
562
+ \n"""
563
+ ),
564
+ " ",
565
+ )
566
+
567
+ logger.info(f"{info}")
568
+
569
+ # make buckets first because it determines the length of dataset
570
+ # and set the same seed for all datasets
571
+ seed = random.randint(0, 2**31) # actual seed is seed + epoch_no
572
+ for i, dataset in enumerate(datasets):
573
+ logger.info(f"[Dataset {i}]")
574
+ dataset.make_buckets()
575
+ dataset.set_seed(seed)
576
+
577
+ return DatasetGroup(datasets)
578
+
579
+
580
+ def generate_dreambooth_subsets_config_by_subdirs(train_data_dir: Optional[str] = None, reg_data_dir: Optional[str] = None):
581
+ def extract_dreambooth_params(name: str) -> Tuple[int, str]:
582
+ tokens = name.split("_")
583
+ try:
584
+ n_repeats = int(tokens[0])
585
+ except ValueError as e:
586
+ logger.warning(f"ignore directory without repeats / 繰り返し回数のないディレクトリを無視します: {name}")
587
+ return 0, ""
588
+ caption_by_folder = "_".join(tokens[1:])
589
+ return n_repeats, caption_by_folder
590
+
591
+ def generate(base_dir: Optional[str], is_reg: bool):
592
+ if base_dir is None:
593
+ return []
594
+
595
+ base_dir: Path = Path(base_dir)
596
+ if not base_dir.is_dir():
597
+ return []
598
+
599
+ subsets_config = []
600
+ for subdir in base_dir.iterdir():
601
+ if not subdir.is_dir():
602
+ continue
603
+
604
+ num_repeats, class_tokens = extract_dreambooth_params(subdir.name)
605
+ if num_repeats < 1:
606
+ continue
607
+
608
+ subset_config = {"image_dir": str(subdir), "num_repeats": num_repeats, "is_reg": is_reg, "class_tokens": class_tokens}
609
+ subsets_config.append(subset_config)
610
+
611
+ return subsets_config
612
+
613
+ subsets_config = []
614
+ subsets_config += generate(train_data_dir, False)
615
+ subsets_config += generate(reg_data_dir, True)
616
+
617
+ return subsets_config
618
+
619
+
620
+ def generate_controlnet_subsets_config_by_subdirs(
621
+ train_data_dir: Optional[str] = None, conditioning_data_dir: Optional[str] = None, caption_extension: str = ".txt"
622
+ ):
623
+ def generate(base_dir: Optional[str]):
624
+ if base_dir is None:
625
+ return []
626
+
627
+ base_dir: Path = Path(base_dir)
628
+ if not base_dir.is_dir():
629
+ return []
630
+
631
+ subsets_config = []
632
+ subset_config = {
633
+ "image_dir": train_data_dir,
634
+ "conditioning_data_dir": conditioning_data_dir,
635
+ "caption_extension": caption_extension,
636
+ "num_repeats": 1,
637
+ }
638
+ subsets_config.append(subset_config)
639
+
640
+ return subsets_config
641
+
642
+ subsets_config = []
643
+ subsets_config += generate(train_data_dir)
644
+
645
+ return subsets_config
646
+
647
+
648
+ def load_user_config(file: str) -> dict:
649
+ file: Path = Path(file)
650
+ if not file.is_file():
651
+ raise ValueError(f"file not found / ファイルが見つかりません: {file}")
652
+
653
+ if file.name.lower().endswith(".json"):
654
+ try:
655
+ with open(file, "r") as f:
656
+ config = json.load(f)
657
+ except Exception:
658
+ logger.error(
659
+ f"Error on parsing JSON config file. Please check the format. / JSON 形式の設定ファイルの読み込みに失敗しました。文法が正しいか確認してください。: {file}"
660
+ )
661
+ raise
662
+ elif file.name.lower().endswith(".toml"):
663
+ try:
664
+ config = toml.load(file)
665
+ except Exception:
666
+ logger.error(
667
+ f"Error on parsing TOML config file. Please check the format. / TOML 形式の設定ファイルの読み込みに失敗しました。文法が正しいか確認してください。: {file}"
668
+ )
669
+ raise
670
+ else:
671
+ raise ValueError(f"not supported config file format / 対応していない設定ファイルの形式です: {file}")
672
+
673
+ return config
674
+
675
+
676
+ # for config test
677
+ if __name__ == "__main__":
678
+ parser = argparse.ArgumentParser()
679
+ parser.add_argument("--support_dreambooth", action="store_true")
680
+ parser.add_argument("--support_finetuning", action="store_true")
681
+ parser.add_argument("--support_controlnet", action="store_true")
682
+ parser.add_argument("--support_dropout", action="store_true")
683
+ parser.add_argument("dataset_config")
684
+ config_args, remain = parser.parse_known_args()
685
+
686
+ parser = argparse.ArgumentParser()
687
+ train_util.add_dataset_arguments(
688
+ parser, config_args.support_dreambooth, config_args.support_finetuning, config_args.support_dropout
689
+ )
690
+ train_util.add_training_arguments(parser, config_args.support_dreambooth)
691
+ argparse_namespace = parser.parse_args(remain)
692
+ train_util.prepare_dataset_args(argparse_namespace, config_args.support_finetuning)
693
+
694
+ logger.info("[argparse_namespace]")
695
+ logger.info(f"{vars(argparse_namespace)}")
696
+
697
+ user_config = load_user_config(config_args.dataset_config)
698
+
699
+ logger.info("")
700
+ logger.info("[user_config]")
701
+ logger.info(f"{user_config}")
702
+
703
+ sanitizer = ConfigSanitizer(
704
+ config_args.support_dreambooth, config_args.support_finetuning, config_args.support_controlnet, config_args.support_dropout
705
+ )
706
+ sanitized_user_config = sanitizer.sanitize_user_config(user_config)
707
+
708
+ logger.info("")
709
+ logger.info("[sanitized_user_config]")
710
+ logger.info(f"{sanitized_user_config}")
711
+
712
+ blueprint = BlueprintGenerator(sanitizer).generate(user_config, argparse_namespace)
713
+
714
+ logger.info("")
715
+ logger.info("[blueprint]")
716
+ logger.info(f"{blueprint}")
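To show how these schemas are consumed, here is a small sketch that runs an inline TOML dataset config through ConfigSanitizer. The directory path and class tokens are placeholders, and the four booleans passed to ConfigSanitizer are only one plausible combination; real training scripts read the file named by --dataset_config via load_user_config and then build datasets with BlueprintGenerator and generate_dataset_group_by_blueprint.

import toml

from library.config_util import ConfigSanitizer

# Minimal DreamBooth-style config; the path and class tokens are placeholders.
user_config = toml.loads(
    """
    [general]
    enable_bucket = true
    resolution = 512

    [[datasets]]
    batch_size = 2

    [[datasets.subsets]]
    image_dir = "/path/to/train_images"
    class_tokens = "sks style"
    num_repeats = 10
    """
)

# (support_dreambooth, support_finetuning, support_controlnet, support_dropout)
sanitizer = ConfigSanitizer(True, True, False, True)
sanitized = sanitizer.sanitize_user_config(user_config)
print(sanitized)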
library/custom_offloading_utils.py ADDED
@@ -0,0 +1,227 @@
from concurrent.futures import ThreadPoolExecutor
import time
from typing import Optional
import torch
import torch.nn as nn

from library.device_utils import clean_memory_on_device


def synchronize_device(device: torch.device):
    if device.type == "cuda":
        torch.cuda.synchronize()
    elif device.type == "xpu":
        torch.xpu.synchronize()
    elif device.type == "mps":
        torch.mps.synchronize()


def swap_weight_devices_cuda(device: torch.device, layer_to_cpu: nn.Module, layer_to_cuda: nn.Module):
    assert layer_to_cpu.__class__ == layer_to_cuda.__class__

    weight_swap_jobs = []

    # This is not working for all cases (e.g. SD3), so we need to find the corresponding modules
    # for module_to_cpu, module_to_cuda in zip(layer_to_cpu.modules(), layer_to_cuda.modules()):
    #     print(module_to_cpu.__class__, module_to_cuda.__class__)
    #     if hasattr(module_to_cpu, "weight") and module_to_cpu.weight is not None:
    #         weight_swap_jobs.append((module_to_cpu, module_to_cuda, module_to_cpu.weight.data, module_to_cuda.weight.data))

    modules_to_cpu = {k: v for k, v in layer_to_cpu.named_modules()}
    for module_to_cuda_name, module_to_cuda in layer_to_cuda.named_modules():
        if hasattr(module_to_cuda, "weight") and module_to_cuda.weight is not None:
            module_to_cpu = modules_to_cpu.get(module_to_cuda_name, None)
            if module_to_cpu is not None and module_to_cpu.weight.shape == module_to_cuda.weight.shape:
                weight_swap_jobs.append((module_to_cpu, module_to_cuda, module_to_cpu.weight.data, module_to_cuda.weight.data))
            else:
                if module_to_cuda.weight.data.device.type != device.type:
                    # print(
                    #     f"Module {module_to_cuda_name} not found in CPU model or shape mismatch, so not swapping and moving to device"
                    # )
                    module_to_cuda.weight.data = module_to_cuda.weight.data.to(device)

    torch.cuda.current_stream().synchronize()  # this prevents the illegal loss value

    stream = torch.cuda.Stream()
    with torch.cuda.stream(stream):
        # cuda to cpu
        for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs:
            cuda_data_view.record_stream(stream)
            module_to_cpu.weight.data = cuda_data_view.data.to("cpu", non_blocking=True)

        stream.synchronize()

        # cpu to cuda
        for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs:
            cuda_data_view.copy_(module_to_cuda.weight.data, non_blocking=True)
            module_to_cuda.weight.data = cuda_data_view

    stream.synchronize()
    torch.cuda.current_stream().synchronize()  # this prevents the illegal loss value


def swap_weight_devices_no_cuda(device: torch.device, layer_to_cpu: nn.Module, layer_to_cuda: nn.Module):
    """
    not tested
    """
    assert layer_to_cpu.__class__ == layer_to_cuda.__class__

    weight_swap_jobs = []
    for module_to_cpu, module_to_cuda in zip(layer_to_cpu.modules(), layer_to_cuda.modules()):
        if hasattr(module_to_cpu, "weight") and module_to_cpu.weight is not None:
            weight_swap_jobs.append((module_to_cpu, module_to_cuda, module_to_cpu.weight.data, module_to_cuda.weight.data))

    # device to cpu
    for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs:
        module_to_cpu.weight.data = cuda_data_view.data.to("cpu", non_blocking=True)

    synchronize_device(device)

    # cpu to device
    for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs:
        cuda_data_view.copy_(module_to_cuda.weight.data, non_blocking=True)
        module_to_cuda.weight.data = cuda_data_view

    synchronize_device(device)


def weighs_to_device(layer: nn.Module, device: torch.device):
    for module in layer.modules():
        if hasattr(module, "weight") and module.weight is not None:
            module.weight.data = module.weight.data.to(device, non_blocking=True)


class Offloader:
    """
    common offloading class
    """

    def __init__(self, num_blocks: int, blocks_to_swap: int, device: torch.device, debug: bool = False):
        self.num_blocks = num_blocks
        self.blocks_to_swap = blocks_to_swap
        self.device = device
        self.debug = debug

        self.thread_pool = ThreadPoolExecutor(max_workers=1)
        self.futures = {}
        self.cuda_available = device.type == "cuda"

    def swap_weight_devices(self, block_to_cpu: nn.Module, block_to_cuda: nn.Module):
        if self.cuda_available:
            swap_weight_devices_cuda(self.device, block_to_cpu, block_to_cuda)
        else:
            swap_weight_devices_no_cuda(self.device, block_to_cpu, block_to_cuda)

    def _submit_move_blocks(self, blocks, block_idx_to_cpu, block_idx_to_cuda):
        def move_blocks(bidx_to_cpu, block_to_cpu, bidx_to_cuda, block_to_cuda):
            if self.debug:
                start_time = time.perf_counter()
                print(f"Move block {bidx_to_cpu} to CPU and block {bidx_to_cuda} to {'CUDA' if self.cuda_available else 'device'}")

            self.swap_weight_devices(block_to_cpu, block_to_cuda)

            if self.debug:
                print(f"Moved blocks {bidx_to_cpu} and {bidx_to_cuda} in {time.perf_counter()-start_time:.2f}s")
            return bidx_to_cpu, bidx_to_cuda  # , event

        block_to_cpu = blocks[block_idx_to_cpu]
        block_to_cuda = blocks[block_idx_to_cuda]

        self.futures[block_idx_to_cuda] = self.thread_pool.submit(
            move_blocks, block_idx_to_cpu, block_to_cpu, block_idx_to_cuda, block_to_cuda
        )

    def _wait_blocks_move(self, block_idx):
        if block_idx not in self.futures:
            return

        if self.debug:
            print(f"Wait for block {block_idx}")
            start_time = time.perf_counter()

        future = self.futures.pop(block_idx)
        _, bidx_to_cuda = future.result()

        assert block_idx == bidx_to_cuda, f"Block index mismatch: {block_idx} != {bidx_to_cuda}"

        if self.debug:
            print(f"Waited for block {block_idx}: {time.perf_counter()-start_time:.2f}s")


class ModelOffloader(Offloader):
    """
    supports forward offloading
    """

    def __init__(self, blocks: list[nn.Module], num_blocks: int, blocks_to_swap: int, device: torch.device, debug: bool = False):
        super().__init__(num_blocks, blocks_to_swap, device, debug)

        # register backward hooks
        self.remove_handles = []
        for i, block in enumerate(blocks):
            hook = self.create_backward_hook(blocks, i)
            if hook is not None:
                handle = block.register_full_backward_hook(hook)
                self.remove_handles.append(handle)

    def __del__(self):
        for handle in self.remove_handles:
            handle.remove()

    def create_backward_hook(self, blocks: list[nn.Module], block_index: int) -> Optional[callable]:
        # -1 for 0-based index
        num_blocks_propagated = self.num_blocks - block_index - 1
        swapping = num_blocks_propagated > 0 and num_blocks_propagated <= self.blocks_to_swap
        waiting = block_index > 0 and block_index <= self.blocks_to_swap

        if not swapping and not waiting:
            return None

        # create hook
        block_idx_to_cpu = self.num_blocks - num_blocks_propagated
        block_idx_to_cuda = self.blocks_to_swap - num_blocks_propagated
        block_idx_to_wait = block_index - 1

        def backward_hook(module, grad_input, grad_output):
            if self.debug:
                print(f"Backward hook for block {block_index}")

            if swapping:
                self._submit_move_blocks(blocks, block_idx_to_cpu, block_idx_to_cuda)
            if waiting:
                self._wait_blocks_move(block_idx_to_wait)
            return None

        return backward_hook

    def prepare_block_devices_before_forward(self, blocks: list[nn.Module]):
        if self.blocks_to_swap is None or self.blocks_to_swap == 0:
            return

        if self.debug:
            print("Prepare block devices before forward")

        for b in blocks[0 : self.num_blocks - self.blocks_to_swap]:
            b.to(self.device)
            weighs_to_device(b, self.device)  # make sure weights are on device

        for b in blocks[self.num_blocks - self.blocks_to_swap :]:
            b.to(self.device)  # move block to device first
            weighs_to_device(b, "cpu")  # make sure weights are on cpu

        synchronize_device(self.device)
        clean_memory_on_device(self.device)

    def wait_for_block(self, block_idx: int):
        if self.blocks_to_swap is None or self.blocks_to_swap == 0:
            return
        self._wait_blocks_move(block_idx)

    def submit_move_blocks(self, blocks: list[nn.Module], block_idx: int):
        if self.blocks_to_swap is None or self.blocks_to_swap == 0:
            return
        if block_idx >= self.blocks_to_swap:
            return
        block_idx_to_cpu = block_idx
        block_idx_to_cuda = self.num_blocks - self.blocks_to_swap + block_idx
        self._submit_move_blocks(blocks, block_idx_to_cpu, block_idx_to_cuda)
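For orientation, the sketch below shows the intended calling pattern around a stack of blocks: prepare devices once before the forward pass, wait for a block just before running it, and submit a swap for it afterwards so later blocks stream in from CPU. The toy linear blocks, the block counts, and the CUDA device are assumptions for illustration; the real call sites are the model forward implementations, not shown here.

import torch
import torch.nn as nn

from library.custom_offloading_utils import ModelOffloader

device = torch.device("cuda")  # the swapping path sketched here targets CUDA

# Stand-in "transformer blocks"; real models pass their own block list.
blocks = nn.ModuleList([nn.Linear(64, 64) for _ in range(8)])
num_blocks = len(blocks)
blocks_to_swap = 4  # keep the last 4 blocks' weights on CPU until needed

offloader = ModelOffloader(blocks, num_blocks, blocks_to_swap, device, debug=True)
offloader.prepare_block_devices_before_forward(blocks)

x = torch.randn(2, 64, device=device)
for i, block in enumerate(blocks):
    offloader.wait_for_block(i)              # ensure this block's weights are on the GPU
    x = block(x)
    offloader.submit_move_blocks(blocks, i)  # start moving a finished block back to CPU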
library/custom_train_functions.py ADDED
@@ -0,0 +1,559 @@
1
+ import torch
2
+ import argparse
3
+ import random
4
+ import re
5
+ from typing import List, Optional, Union
6
+ from .utils import setup_logging
7
+
8
+ setup_logging()
9
+ import logging
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ def prepare_scheduler_for_custom_training(noise_scheduler, device):
15
+ if hasattr(noise_scheduler, "all_snr"):
16
+ return
17
+
18
+ alphas_cumprod = noise_scheduler.alphas_cumprod
19
+ sqrt_alphas_cumprod = torch.sqrt(alphas_cumprod)
20
+ sqrt_one_minus_alphas_cumprod = torch.sqrt(1.0 - alphas_cumprod)
21
+ alpha = sqrt_alphas_cumprod
22
+ sigma = sqrt_one_minus_alphas_cumprod
23
+ all_snr = (alpha / sigma) ** 2
24
+
25
+ noise_scheduler.all_snr = all_snr.to(device)
26
+
27
+
28
+ def fix_noise_scheduler_betas_for_zero_terminal_snr(noise_scheduler):
29
+ # fix beta: zero terminal SNR
30
+ logger.info(f"fix noise scheduler betas: https://arxiv.org/abs/2305.08891")
31
+
32
+ def enforce_zero_terminal_snr(betas):
33
+ # Convert betas to alphas_bar_sqrt
34
+ alphas = 1 - betas
35
+ alphas_bar = alphas.cumprod(0)
36
+ alphas_bar_sqrt = alphas_bar.sqrt()
37
+
38
+ # Store old values.
39
+ alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
40
+ alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
41
+ # Shift so last timestep is zero.
42
+ alphas_bar_sqrt -= alphas_bar_sqrt_T
43
+ # Scale so first timestep is back to old value.
44
+ alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
45
+
46
+ # Convert alphas_bar_sqrt to betas
47
+ alphas_bar = alphas_bar_sqrt**2
48
+ alphas = alphas_bar[1:] / alphas_bar[:-1]
49
+ alphas = torch.cat([alphas_bar[0:1], alphas])
50
+ betas = 1 - alphas
51
+ return betas
52
+
53
+ betas = noise_scheduler.betas
54
+ betas = enforce_zero_terminal_snr(betas)
55
+ alphas = 1.0 - betas
56
+ alphas_cumprod = torch.cumprod(alphas, dim=0)
57
+
58
+ # logger.info(f"original: {noise_scheduler.betas}")
59
+ # logger.info(f"fixed: {betas}")
60
+
61
+ noise_scheduler.betas = betas
62
+ noise_scheduler.alphas = alphas
63
+ noise_scheduler.alphas_cumprod = alphas_cumprod
64
+
65
+
66
+ def apply_snr_weight(loss, timesteps, noise_scheduler, gamma, v_prediction=False):
67
+ snr = torch.stack([noise_scheduler.all_snr[t] for t in timesteps])
68
+ min_snr_gamma = torch.minimum(snr, torch.full_like(snr, gamma))
69
+ if v_prediction:
70
+ snr_weight = torch.div(min_snr_gamma, snr + 1).float().to(loss.device)
71
+ else:
72
+ snr_weight = torch.div(min_snr_gamma, snr).float().to(loss.device)
73
+ loss = loss * snr_weight
74
+ return loss
75
+
76
+
77
+ def scale_v_prediction_loss_like_noise_prediction(loss, timesteps, noise_scheduler):
78
+ scale = get_snr_scale(timesteps, noise_scheduler)
79
+ loss = loss * scale
80
+ return loss
81
+
82
+
83
+ def get_snr_scale(timesteps, noise_scheduler):
84
+ snr_t = torch.stack([noise_scheduler.all_snr[t] for t in timesteps]) # batch_size
85
+ snr_t = torch.minimum(snr_t, torch.ones_like(snr_t) * 1000) # if timestep is 0, snr_t is inf, so limit it to 1000
86
+ scale = snr_t / (snr_t + 1)
87
+ # # show debug info
88
+ # logger.info(f"timesteps: {timesteps}, snr_t: {snr_t}, scale: {scale}")
89
+ return scale
90
+
91
+
92
+ def add_v_prediction_like_loss(loss, timesteps, noise_scheduler, v_pred_like_loss):
93
+ scale = get_snr_scale(timesteps, noise_scheduler)
94
+ # logger.info(f"add v-prediction like loss: {v_pred_like_loss}, scale: {scale}, loss: {loss}, time: {timesteps}")
95
+ loss = loss + loss / scale * v_pred_like_loss
96
+ return loss
97
+
98
+
99
+ def apply_debiased_estimation(loss, timesteps, noise_scheduler, v_prediction=False):
100
+ snr_t = torch.stack([noise_scheduler.all_snr[t] for t in timesteps]) # batch_size
101
+ snr_t = torch.minimum(snr_t, torch.ones_like(snr_t) * 1000) # if timestep is 0, snr_t is inf, so limit it to 1000
102
+ if v_prediction:
103
+ weight = 1 / (snr_t + 1)
104
+ else:
105
+ weight = 1 / torch.sqrt(snr_t)
106
+ loss = weight * loss
107
+ return loss
108
+
109
+
110
+ # TODO: this logic is duplicated in train_util; consolidate it into one of the two
111
+
112
+
113
+ def add_custom_train_arguments(parser: argparse.ArgumentParser, support_weighted_captions: bool = True):
114
+ parser.add_argument(
115
+ "--min_snr_gamma",
116
+ type=float,
117
+ default=None,
118
+ help="gamma for reducing the weight of high loss timesteps. Lower numbers have stronger effect. 5 is recommended by paper. / 低いタイムステップでの高いlossに対して重みを減らすためのgamma値、低いほど効果が強く、論文では5が推奨",
119
+ )
120
+ parser.add_argument(
121
+ "--scale_v_pred_loss_like_noise_pred",
122
+ action="store_true",
123
+ help="scale v-prediction loss like noise prediction loss / v-prediction lossをnoise prediction lossと同じようにスケーリングする",
124
+ )
125
+ parser.add_argument(
126
+ "--v_pred_like_loss",
127
+ type=float,
128
+ default=None,
129
+ help="add v-prediction like loss multiplied by this value / v-prediction lossをこの値をかけたものをlossに加算する",
130
+ )
131
+ parser.add_argument(
132
+ "--debiased_estimation_loss",
133
+ action="store_true",
134
+ help="debiased estimation loss / debiased estimation loss",
135
+ )
136
+ if support_weighted_captions:
137
+ parser.add_argument(
138
+ "--weighted_captions",
139
+ action="store_true",
140
+ default=False,
141
+ help="Enable weighted captions in the standard style (token:1.3). No commas inside parens, or shuffle/dropout may break the decoder. / 「[token]」、「(token)」「(token:1.3)」のような重み付きキャプションを有効にする。カンマを括弧内に入れるとシャッフルやdropoutで重みづけがおかしくなるので注意",
142
+ )
143
+
144
+
145
+ re_attention = re.compile(
146
+ r"""
147
+ \\\(|
148
+ \\\)|
149
+ \\\[|
150
+ \\]|
151
+ \\\\|
152
+ \\|
153
+ \(|
154
+ \[|
155
+ :([+-]?[.\d]+)\)|
156
+ \)|
157
+ ]|
158
+ [^\\()\[\]:]+|
159
+ :
160
+ """,
161
+ re.X,
162
+ )
163
+
164
+
165
+ def parse_prompt_attention(text):
166
+ """
167
+ Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
168
+ Accepted tokens are:
169
+ (abc) - increases attention to abc by a multiplier of 1.1
170
+ (abc:3.12) - increases attention to abc by a multiplier of 3.12
171
+ [abc] - decreases attention to abc by a multiplier of 1.1
172
+ \( - literal character '('
173
+ \[ - literal character '['
174
+ \) - literal character ')'
175
+ \] - literal character ']'
176
+ \\ - literal character '\'
177
+ anything else - just text
178
+ >>> parse_prompt_attention('normal text')
179
+ [['normal text', 1.0]]
180
+ >>> parse_prompt_attention('an (important) word')
181
+ [['an ', 1.0], ['important', 1.1], [' word', 1.0]]
182
+ >>> parse_prompt_attention('(unbalanced')
183
+ [['unbalanced', 1.1]]
184
+ >>> parse_prompt_attention('\(literal\]')
185
+ [['(literal]', 1.0]]
186
+ >>> parse_prompt_attention('(unnecessary)(parens)')
187
+ [['unnecessaryparens', 1.1]]
188
+ >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
189
+ [['a ', 1.0],
190
+ ['house', 1.5730000000000004],
191
+ [' ', 1.1],
192
+ ['on', 1.0],
193
+ [' a ', 1.1],
194
+ ['hill', 0.55],
195
+ [', sun, ', 1.1],
196
+ ['sky', 1.4641000000000006],
197
+ ['.', 1.1]]
198
+ """
199
+
200
+ res = []
201
+ round_brackets = []
202
+ square_brackets = []
203
+
204
+ round_bracket_multiplier = 1.1
205
+ square_bracket_multiplier = 1 / 1.1
206
+
207
+ def multiply_range(start_position, multiplier):
208
+ for p in range(start_position, len(res)):
209
+ res[p][1] *= multiplier
210
+
211
+ for m in re_attention.finditer(text):
212
+ text = m.group(0)
213
+ weight = m.group(1)
214
+
215
+ if text.startswith("\\"):
216
+ res.append([text[1:], 1.0])
217
+ elif text == "(":
218
+ round_brackets.append(len(res))
219
+ elif text == "[":
220
+ square_brackets.append(len(res))
221
+ elif weight is not None and len(round_brackets) > 0:
222
+ multiply_range(round_brackets.pop(), float(weight))
223
+ elif text == ")" and len(round_brackets) > 0:
224
+ multiply_range(round_brackets.pop(), round_bracket_multiplier)
225
+ elif text == "]" and len(square_brackets) > 0:
226
+ multiply_range(square_brackets.pop(), square_bracket_multiplier)
227
+ else:
228
+ res.append([text, 1.0])
229
+
230
+ for pos in round_brackets:
231
+ multiply_range(pos, round_bracket_multiplier)
232
+
233
+ for pos in square_brackets:
234
+ multiply_range(pos, square_bracket_multiplier)
235
+
236
+ if len(res) == 0:
237
+ res = [["", 1.0]]
238
+
239
+ # merge runs of identical weights
240
+ i = 0
241
+ while i + 1 < len(res):
242
+ if res[i][1] == res[i + 1][1]:
243
+ res[i][0] += res[i + 1][0]
244
+ res.pop(i + 1)
245
+ else:
246
+ i += 1
247
+
248
+ return res
249
+
250
+
251
+ def get_prompts_with_weights(tokenizer, prompt: List[str], max_length: int):
252
+ r"""
253
+ Tokenize a list of prompts and return its tokens with weights of each token.
254
+
255
+ No padding, starting or ending token is included.
256
+ """
257
+ tokens = []
258
+ weights = []
259
+ truncated = False
260
+ for text in prompt:
261
+ texts_and_weights = parse_prompt_attention(text)
262
+ text_token = []
263
+ text_weight = []
264
+ for word, weight in texts_and_weights:
265
+ # tokenize and discard the starting and the ending token
266
+ token = tokenizer(word).input_ids[1:-1]
267
+ text_token += token
268
+ # copy the weight by length of token
269
+ text_weight += [weight] * len(token)
270
+ # stop if the text is too long (longer than truncation limit)
271
+ if len(text_token) > max_length:
272
+ truncated = True
273
+ break
274
+ # truncate
275
+ if len(text_token) > max_length:
276
+ truncated = True
277
+ text_token = text_token[:max_length]
278
+ text_weight = text_weight[:max_length]
279
+ tokens.append(text_token)
280
+ weights.append(text_weight)
281
+ if truncated:
282
+ logger.warning("Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples")
283
+ return tokens, weights
284
+
285
+
286
+ def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, no_boseos_middle=True, chunk_length=77):
287
+ r"""
288
+ Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length.
289
+ """
290
+ max_embeddings_multiples = (max_length - 2) // (chunk_length - 2)
291
+ weights_length = max_length if no_boseos_middle else max_embeddings_multiples * chunk_length
292
+ for i in range(len(tokens)):
293
+ tokens[i] = [bos] + tokens[i] + [eos] * (max_length - 1 - len(tokens[i]))
294
+ if no_boseos_middle:
295
+ weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i]))
296
+ else:
297
+ w = []
298
+ if len(weights[i]) == 0:
299
+ w = [1.0] * weights_length
300
+ else:
301
+ for j in range(max_embeddings_multiples):
302
+ w.append(1.0) # weight for starting token in this chunk
303
+ w += weights[i][j * (chunk_length - 2) : min(len(weights[i]), (j + 1) * (chunk_length - 2))]
304
+ w.append(1.0) # weight for ending token in this chunk
305
+ w += [1.0] * (weights_length - len(w))
306
+ weights[i] = w[:]
307
+
308
+ return tokens, weights
309
+
310
+
311
+ def get_unweighted_text_embeddings(
312
+ tokenizer,
313
+ text_encoder,
314
+ text_input: torch.Tensor,
315
+ chunk_length: int,
316
+ clip_skip: int,
317
+ eos: int,
318
+ pad: int,
319
+ no_boseos_middle: Optional[bool] = True,
320
+ ):
321
+ """
322
+ When the length of tokens is a multiple of the capacity of the text encoder,
323
+ it should be split into chunks and sent to the text encoder individually.
324
+ """
325
+ max_embeddings_multiples = (text_input.shape[1] - 2) // (chunk_length - 2)
326
+ if max_embeddings_multiples > 1:
327
+ text_embeddings = []
328
+ for i in range(max_embeddings_multiples):
329
+ # extract the i-th chunk
330
+ text_input_chunk = text_input[:, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2].clone()
331
+
332
+ # cover the head and the tail by the starting and the ending tokens
333
+ text_input_chunk[:, 0] = text_input[0, 0]
334
+ if pad == eos: # v1
335
+ text_input_chunk[:, -1] = text_input[0, -1]
336
+ else: # v2
337
+ for j in range(len(text_input_chunk)):
338
+ if text_input_chunk[j, -1] != eos and text_input_chunk[j, -1] != pad:  # a normal (non-EOS, non-PAD) token is at the end
339
+ text_input_chunk[j, -1] = eos
340
+ if text_input_chunk[j, 1] == pad:  # only BOS, the rest is PAD
341
+ text_input_chunk[j, 1] = eos
342
+
343
+ if clip_skip is None or clip_skip == 1:
344
+ text_embedding = text_encoder(text_input_chunk)[0]
345
+ else:
346
+ enc_out = text_encoder(text_input_chunk, output_hidden_states=True, return_dict=True)
347
+ text_embedding = enc_out["hidden_states"][-clip_skip]
348
+ text_embedding = text_encoder.text_model.final_layer_norm(text_embedding)
349
+
350
+ if no_boseos_middle:
351
+ if i == 0:
352
+ # discard the ending token
353
+ text_embedding = text_embedding[:, :-1]
354
+ elif i == max_embeddings_multiples - 1:
355
+ # discard the starting token
356
+ text_embedding = text_embedding[:, 1:]
357
+ else:
358
+ # discard both starting and ending tokens
359
+ text_embedding = text_embedding[:, 1:-1]
360
+
361
+ text_embeddings.append(text_embedding)
362
+ text_embeddings = torch.concat(text_embeddings, axis=1)
363
+ else:
364
+ if clip_skip is None or clip_skip == 1:
365
+ text_embeddings = text_encoder(text_input)[0]
366
+ else:
367
+ enc_out = text_encoder(text_input, output_hidden_states=True, return_dict=True)
368
+ text_embeddings = enc_out["hidden_states"][-clip_skip]
369
+ text_embeddings = text_encoder.text_model.final_layer_norm(text_embeddings)
370
+ return text_embeddings
371
+
372
+
373
+ def get_weighted_text_embeddings(
374
+ tokenizer,
375
+ text_encoder,
376
+ prompt: Union[str, List[str]],
377
+ device,
378
+ max_embeddings_multiples: Optional[int] = 3,
379
+ no_boseos_middle: Optional[bool] = False,
380
+ clip_skip=None,
381
+ ):
382
+ r"""
383
+ Prompts can be assigned with local weights using brackets. For example,
384
+ prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful',
385
+ and the embedding tokens corresponding to the words get multiplied by a constant, 1.1.
386
+
387
+ Also, to regularize the embedding, the weighted embedding is scaled to preserve the original mean.
388
+
389
+ Args:
390
+ prompt (`str` or `List[str]`):
391
+ The prompt or prompts to guide the image generation.
392
+ max_embeddings_multiples (`int`, *optional*, defaults to `3`):
393
+ The max multiple length of prompt embeddings compared to the max output length of text encoder.
394
+ no_boseos_middle (`bool`, *optional*, defaults to `False`):
395
+ If the length of the text tokens is a multiple of the text encoder's capacity, whether to keep the starting and
396
+ ending tokens in each middle chunk.
397
+ skip_parsing (`bool`, *optional*, defaults to `False`):
398
+ Skip the parsing of brackets.
399
+ skip_weighting (`bool`, *optional*, defaults to `False`):
400
+ Skip the weighting. When the parsing is skipped, it is forced True.
401
+ """
402
+ max_length = (tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
403
+ if isinstance(prompt, str):
404
+ prompt = [prompt]
405
+
406
+ prompt_tokens, prompt_weights = get_prompts_with_weights(tokenizer, prompt, max_length - 2)
407
+
408
+ # round up the longest length of tokens to a multiple of (model_max_length - 2)
409
+ max_length = max([len(token) for token in prompt_tokens])
410
+
411
+ max_embeddings_multiples = min(
412
+ max_embeddings_multiples,
413
+ (max_length - 1) // (tokenizer.model_max_length - 2) + 1,
414
+ )
415
+ max_embeddings_multiples = max(1, max_embeddings_multiples)
416
+ max_length = (tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
417
+
418
+ # pad the length of tokens and weights
419
+ bos = tokenizer.bos_token_id
420
+ eos = tokenizer.eos_token_id
421
+ pad = tokenizer.pad_token_id
422
+ prompt_tokens, prompt_weights = pad_tokens_and_weights(
423
+ prompt_tokens,
424
+ prompt_weights,
425
+ max_length,
426
+ bos,
427
+ eos,
428
+ no_boseos_middle=no_boseos_middle,
429
+ chunk_length=tokenizer.model_max_length,
430
+ )
431
+ prompt_tokens = torch.tensor(prompt_tokens, dtype=torch.long, device=device)
432
+
433
+ # get the embeddings
434
+ text_embeddings = get_unweighted_text_embeddings(
435
+ tokenizer,
436
+ text_encoder,
437
+ prompt_tokens,
438
+ tokenizer.model_max_length,
439
+ clip_skip,
440
+ eos,
441
+ pad,
442
+ no_boseos_middle=no_boseos_middle,
443
+ )
444
+ prompt_weights = torch.tensor(prompt_weights, dtype=text_embeddings.dtype, device=device)
445
+
446
+ # assign weights to the prompts and normalize in the sense of mean
447
+ previous_mean = text_embeddings.float().mean(axis=[-2, -1]).to(text_embeddings.dtype)
448
+ text_embeddings = text_embeddings * prompt_weights.unsqueeze(-1)
449
+ current_mean = text_embeddings.float().mean(axis=[-2, -1]).to(text_embeddings.dtype)
450
+ text_embeddings = text_embeddings * (previous_mean / current_mean).unsqueeze(-1).unsqueeze(-1)
451
+
452
+ return text_embeddings
453
+
454
+
455
+ # https://wandb.ai/johnowhitaker/multires_noise/reports/Multi-Resolution-Noise-for-Diffusion-Model-Training--VmlldzozNjYyOTU2
456
+ def pyramid_noise_like(noise, device, iterations=6, discount=0.4):
457
+ b, c, w, h = noise.shape # EDIT: w and h get over-written, rename for a different variant!
458
+ u = torch.nn.Upsample(size=(w, h), mode="bilinear").to(device)
459
+ for i in range(iterations):
460
+ r = random.random() * 2 + 2 # Rather than always going 2x,
461
+ wn, hn = max(1, int(w / (r**i))), max(1, int(h / (r**i)))
462
+ noise += u(torch.randn(b, c, wn, hn).to(device)) * discount**i
463
+ if wn == 1 or hn == 1:
464
+ break # Lowest resolution is 1x1
465
+ return noise / noise.std() # Scaled back to roughly unit variance
466
+
467
+
468
+ # https://www.crosslabs.org//blog/diffusion-with-offset-noise
469
+ def apply_noise_offset(latents, noise, noise_offset, adaptive_noise_scale):
470
+ if noise_offset is None:
471
+ return noise
472
+ if adaptive_noise_scale is not None:
473
+ # latent shape: (batch_size, channels, height, width)
474
+ # abs mean value for each channel
475
+ latent_mean = torch.abs(latents.mean(dim=(2, 3), keepdim=True))
476
+
477
+ # multiply adaptive noise scale to the mean value and add it to the noise offset
478
+ noise_offset = noise_offset + adaptive_noise_scale * latent_mean
479
+ noise_offset = torch.clamp(noise_offset, 0.0, None)  # in case the adaptive noise scale is negative
480
+
481
+ noise = noise + noise_offset * torch.randn((latents.shape[0], latents.shape[1], 1, 1), device=latents.device)
482
+ return noise
483
+
484
+
485
+ def apply_masked_loss(loss, batch):
486
+ if "conditioning_images" in batch:
487
+ # conditioning image is -1 to 1. we need to convert it to 0 to 1
488
+ mask_image = batch["conditioning_images"].to(dtype=loss.dtype)[:, 0].unsqueeze(1) # use R channel
489
+ mask_image = mask_image / 2 + 0.5
490
+ # print(f"conditioning_image: {mask_image.shape}")
491
+ elif "alpha_masks" in batch and batch["alpha_masks"] is not None:
492
+ # alpha mask is 0 to 1
493
+ mask_image = batch["alpha_masks"].to(dtype=loss.dtype).unsqueeze(1) # add channel dimension
494
+ # print(f"mask_image: {mask_image.shape}, {mask_image.mean()}")
495
+ else:
496
+ return loss
497
+
498
+ # resize to the same size as the loss
499
+ mask_image = torch.nn.functional.interpolate(mask_image, size=loss.shape[2:], mode="area")
500
+ loss = loss * mask_image
501
+ return loss
502
+
503
+
504
+ """
505
+ ##########################################
506
+ # Perlin Noise
507
+ def rand_perlin_2d(device, shape, res, fade=lambda t: 6 * t**5 - 15 * t**4 + 10 * t**3):
508
+ delta = (res[0] / shape[0], res[1] / shape[1])
509
+ d = (shape[0] // res[0], shape[1] // res[1])
510
+
511
+ grid = (
512
+ torch.stack(
513
+ torch.meshgrid(torch.arange(0, res[0], delta[0], device=device), torch.arange(0, res[1], delta[1], device=device)),
514
+ dim=-1,
515
+ )
516
+ % 1
517
+ )
518
+ angles = 2 * torch.pi * torch.rand(res[0] + 1, res[1] + 1, device=device)
519
+ gradients = torch.stack((torch.cos(angles), torch.sin(angles)), dim=-1)
520
+
521
+ tile_grads = (
522
+ lambda slice1, slice2: gradients[slice1[0] : slice1[1], slice2[0] : slice2[1]]
523
+ .repeat_interleave(d[0], 0)
524
+ .repeat_interleave(d[1], 1)
525
+ )
526
+ dot = lambda grad, shift: (
527
+ torch.stack((grid[: shape[0], : shape[1], 0] + shift[0], grid[: shape[0], : shape[1], 1] + shift[1]), dim=-1)
528
+ * grad[: shape[0], : shape[1]]
529
+ ).sum(dim=-1)
530
+
531
+ n00 = dot(tile_grads([0, -1], [0, -1]), [0, 0])
532
+ n10 = dot(tile_grads([1, None], [0, -1]), [-1, 0])
533
+ n01 = dot(tile_grads([0, -1], [1, None]), [0, -1])
534
+ n11 = dot(tile_grads([1, None], [1, None]), [-1, -1])
535
+ t = fade(grid[: shape[0], : shape[1]])
536
+ return 1.414 * torch.lerp(torch.lerp(n00, n10, t[..., 0]), torch.lerp(n01, n11, t[..., 0]), t[..., 1])
537
+
538
+
539
+ def rand_perlin_2d_octaves(device, shape, res, octaves=1, persistence=0.5):
540
+ noise = torch.zeros(shape, device=device)
541
+ frequency = 1
542
+ amplitude = 1
543
+ for _ in range(octaves):
544
+ noise += amplitude * rand_perlin_2d(device, shape, (frequency * res[0], frequency * res[1]))
545
+ frequency *= 2
546
+ amplitude *= persistence
547
+ return noise
548
+
549
+
550
+ def perlin_noise(noise, device, octaves):
551
+ _, c, w, h = noise.shape
552
+ perlin = lambda: rand_perlin_2d_octaves(device, (w, h), (4, 4), octaves)
553
+ noise_perlin = []
554
+ for _ in range(c):
555
+ noise_perlin.append(perlin())
556
+ noise_perlin = torch.stack(noise_perlin).unsqueeze(0) # (1, c, w, h)
557
+ noise += noise_perlin # broadcast for each batch
558
+ return noise / noise.std() # Scaled back to roughly unit variance
559
+ """
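Usage note (not part of the commit): a minimal sketch exercising the min-SNR helpers above in isolation. The SimpleNamespace scheduler with a linear beta schedule is a stand-in for a real diffusers DDPMScheduler; prepare_scheduler_for_custom_training only needs its alphas_cumprod attribute.

import types
import torch
from library.custom_train_functions import prepare_scheduler_for_custom_training, apply_snr_weight

# Stand-in scheduler: only alphas_cumprod is needed to derive per-timestep SNR.
betas = torch.linspace(1e-4, 0.02, 1000)
scheduler = types.SimpleNamespace(alphas_cumprod=torch.cumprod(1.0 - betas, dim=0))

prepare_scheduler_for_custom_training(scheduler, torch.device("cpu"))  # attaches scheduler.all_snr

timesteps = torch.randint(0, 1000, (4,))
per_sample_loss = torch.rand(4)  # placeholder per-sample loss values
weighted = apply_snr_weight(per_sample_loss, timesteps, scheduler, gamma=5.0)
print(weighted)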
library/deepspeed_utils.py ADDED
@@ -0,0 +1,139 @@
1
+ import os
2
+ import argparse
3
+ import torch
4
+ from accelerate import DeepSpeedPlugin, Accelerator
5
+
6
+ from .utils import setup_logging
7
+
8
+ setup_logging()
9
+ import logging
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ def add_deepspeed_arguments(parser: argparse.ArgumentParser):
15
+ # DeepSpeed Arguments. https://huggingface.co/docs/accelerate/usage_guides/deepspeed
16
+ parser.add_argument("--deepspeed", action="store_true", help="enable deepspeed training")
17
+ parser.add_argument("--zero_stage", type=int, default=2, choices=[0, 1, 2, 3], help="Possible options are 0,1,2,3.")
18
+ parser.add_argument(
19
+ "--offload_optimizer_device",
20
+ type=str,
21
+ default=None,
22
+ choices=[None, "cpu", "nvme"],
23
+ help="Possible options are none|cpu|nvme. Only applicable with ZeRO Stages 2 and 3.",
24
+ )
25
+ parser.add_argument(
26
+ "--offload_optimizer_nvme_path",
27
+ type=str,
28
+ default=None,
29
+ help="Possible options are /nvme|/local_nvme. Only applicable with ZeRO Stage 3.",
30
+ )
31
+ parser.add_argument(
32
+ "--offload_param_device",
33
+ type=str,
34
+ default=None,
35
+ choices=[None, "cpu", "nvme"],
36
+ help="Possible options are none|cpu|nvme. Only applicable with ZeRO Stage 3.",
37
+ )
38
+ parser.add_argument(
39
+ "--offload_param_nvme_path",
40
+ type=str,
41
+ default=None,
42
+ help="Possible options are /nvme|/local_nvme. Only applicable with ZeRO Stage 3.",
43
+ )
44
+ parser.add_argument(
45
+ "--zero3_init_flag",
46
+ action="store_true",
47
+ help="Flag to indicate whether to enable `deepspeed.zero.Init` for constructing massive models. "
48
+ "Only applicable with ZeRO Stage-3.",
49
+ )
50
+ parser.add_argument(
51
+ "--zero3_save_16bit_model",
52
+ action="store_true",
53
+ help="Flag to indicate whether to save 16-bit model. Only applicable with ZeRO Stage-3.",
54
+ )
55
+ parser.add_argument(
56
+ "--fp16_master_weights_and_gradients",
57
+ action="store_true",
58
+ help="fp16_master_weights_and_gradients requires an optimizer that supports keeping fp16 master weights and gradients while keeping the optimizer states in fp32.",
59
+ )
60
+
61
+
62
+ def prepare_deepspeed_args(args: argparse.Namespace):
63
+ if not args.deepspeed:
64
+ return
65
+
66
+ # To avoid RuntimeError: DataLoader worker exited unexpectedly with exit code 1.
67
+ args.max_data_loader_n_workers = 1
68
+
69
+
70
+ def prepare_deepspeed_plugin(args: argparse.Namespace):
71
+ if not args.deepspeed:
72
+ return None
73
+
74
+ try:
75
+ import deepspeed
76
+ except ImportError as e:
77
+ logger.error(
78
+ "DeepSpeed is not installed. Please install it in your environment with the following command: DS_BUILD_OPS=0 pip install deepspeed"
79
+ )
80
+ exit(1)
81
+
82
+ deepspeed_plugin = DeepSpeedPlugin(
83
+ zero_stage=args.zero_stage,
84
+ gradient_accumulation_steps=args.gradient_accumulation_steps,
85
+ gradient_clipping=args.max_grad_norm,
86
+ offload_optimizer_device=args.offload_optimizer_device,
87
+ offload_optimizer_nvme_path=args.offload_optimizer_nvme_path,
88
+ offload_param_device=args.offload_param_device,
89
+ offload_param_nvme_path=args.offload_param_nvme_path,
90
+ zero3_init_flag=args.zero3_init_flag,
91
+ zero3_save_16bit_model=args.zero3_save_16bit_model,
92
+ )
93
+ deepspeed_plugin.deepspeed_config["train_micro_batch_size_per_gpu"] = args.train_batch_size
94
+ deepspeed_plugin.deepspeed_config["train_batch_size"] = (
95
+ args.train_batch_size * args.gradient_accumulation_steps * int(os.environ["WORLD_SIZE"])
96
+ )
97
+ deepspeed_plugin.set_mixed_precision(args.mixed_precision)
98
+ if args.mixed_precision.lower() == "fp16":
99
+ deepspeed_plugin.deepspeed_config["fp16"]["initial_scale_power"] = 0 # preventing overflow.
100
+ if args.full_fp16 or args.fp16_master_weights_and_gradients:
101
+ if args.offload_optimizer_device == "cpu" and args.zero_stage == 2:
102
+ deepspeed_plugin.deepspeed_config["fp16"]["fp16_master_weights_and_grads"] = True
103
+ logger.info("[DeepSpeed] full fp16 enabled.")
104
+ else:
105
+ logger.info(
106
+ "[DeepSpeed] full fp16 / fp16_master_weights_and_grads is currently only supported with ZeRO-Offload and DeepSpeedCPUAdam on ZeRO stage 2."
107
+ )
108
+
109
+ if args.offload_optimizer_device is not None:
110
+ logger.info("[DeepSpeed] start to manually build cpu_adam.")
111
+ deepspeed.ops.op_builder.CPUAdamBuilder().load()
112
+ logger.info("[DeepSpeed] building cpu_adam done.")
113
+
114
+ return deepspeed_plugin
115
+
116
+
117
+ # Accelerate library does not support multiple models for deepspeed. So, we need to wrap multiple models into a single model.
118
+ def prepare_deepspeed_model(args: argparse.Namespace, **models):
119
+ # remove None from models
120
+ models = {k: v for k, v in models.items() if v is not None}
121
+
122
+ class DeepSpeedWrapper(torch.nn.Module):
123
+ def __init__(self, **kw_models) -> None:
124
+ super().__init__()
125
+ self.models = torch.nn.ModuleDict()
126
+
127
+ for key, model in kw_models.items():
128
+ if isinstance(model, list):
129
+ model = torch.nn.ModuleList(model)
130
+ assert isinstance(
131
+ model, torch.nn.Module
132
+ ), f"model must be an instance of torch.nn.Module, but got {key} is {type(model)}"
133
+ self.models.update(torch.nn.ModuleDict({key: model}))
134
+
135
+ def get_models(self):
136
+ return self.models
137
+
138
+ ds_model = DeepSpeedWrapper(**models)
139
+ return ds_model
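Usage note (not part of the commit): a rough sketch of how these helpers would be wired into argument parsing and Accelerator construction. The extra argparse flags below are assumptions standing in for options that the training scripts define elsewhere; prepare_deepspeed_plugin reads them from args.

import argparse
from accelerate import Accelerator
from library import deepspeed_utils

parser = argparse.ArgumentParser()
deepspeed_utils.add_deepspeed_arguments(parser)
# Assumed stand-ins for flags normally added by the training scripts.
parser.add_argument("--train_batch_size", type=int, default=1)
parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
parser.add_argument("--max_grad_norm", type=float, default=1.0)
parser.add_argument("--mixed_precision", type=str, default="no")
parser.add_argument("--full_fp16", action="store_true")
args = parser.parse_args()

deepspeed_utils.prepare_deepspeed_args(args)             # forces a single dataloader worker when --deepspeed is set
plugin = deepspeed_utils.prepare_deepspeed_plugin(args)  # returns None unless --deepspeed is passed
accelerator = Accelerator(
    gradient_accumulation_steps=args.gradient_accumulation_steps,
    mixed_precision=args.mixed_precision,
    deepspeed_plugin=plugin,
)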
library/device_utils.py ADDED
@@ -0,0 +1,84 @@
1
+ import functools
2
+ import gc
3
+
4
+ import torch
5
+
6
+ try:
7
+ HAS_CUDA = torch.cuda.is_available()
8
+ except Exception:
9
+ HAS_CUDA = False
10
+
11
+ try:
12
+ HAS_MPS = torch.backends.mps.is_available()
13
+ except Exception:
14
+ HAS_MPS = False
15
+
16
+ try:
17
+ import intel_extension_for_pytorch as ipex # noqa
18
+
19
+ HAS_XPU = torch.xpu.is_available()
20
+ except Exception:
21
+ HAS_XPU = False
22
+
23
+
24
+ def clean_memory():
25
+ gc.collect()
26
+ if HAS_CUDA:
27
+ torch.cuda.empty_cache()
28
+ if HAS_XPU:
29
+ torch.xpu.empty_cache()
30
+ if HAS_MPS:
31
+ torch.mps.empty_cache()
32
+
33
+
34
+ def clean_memory_on_device(device: torch.device):
35
+ r"""
36
+ Clean memory on the specified device, will be called from training scripts.
37
+ """
38
+ gc.collect()
39
+
40
+ # device may "cuda" or "cuda:0", so we need to check the type of device
41
+ if device.type == "cuda":
42
+ torch.cuda.empty_cache()
43
+ if device.type == "xpu":
44
+ torch.xpu.empty_cache()
45
+ if device.type == "mps":
46
+ torch.mps.empty_cache()
47
+
48
+
49
+ @functools.lru_cache(maxsize=None)
50
+ def get_preferred_device() -> torch.device:
51
+ r"""
52
+ Do not call this function from training scripts. Use accelerator.device instead.
53
+ """
54
+ if HAS_CUDA:
55
+ device = torch.device("cuda")
56
+ elif HAS_XPU:
57
+ device = torch.device("xpu")
58
+ elif HAS_MPS:
59
+ device = torch.device("mps")
60
+ else:
61
+ device = torch.device("cpu")
62
+ print(f"get_preferred_device() -> {device}")
63
+ return device
64
+
65
+
66
+ def init_ipex():
67
+ """
68
+ Apply IPEX's hijacks of the torch.cuda API using `library.ipex.ipex_init`.
69
+
70
+ This function should run right after importing torch and before doing anything else.
71
+
72
+ If IPEX is not available, this function does nothing.
73
+ """
74
+ try:
75
+ if HAS_XPU:
76
+ from library.ipex import ipex_init
77
+
78
+ is_initialized, error_message = ipex_init()
79
+ if not is_initialized:
80
+ print("failed to initialize ipex:", error_message)
81
+ else:
82
+ return
83
+ except Exception as e:
84
+ print("failed to initialize ipex:", e)
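Usage note (not part of the commit): a minimal sketch of how the helpers above compose, assuming the script runs from the repository root so that the library package is importable.

import torch
from library.device_utils import init_ipex, get_preferred_device, clean_memory_on_device

init_ipex()  # no-op unless an Intel XPU with IPEX is present

device = get_preferred_device()  # prefers cuda, then xpu, then mps, then cpu
x = torch.randn(1024, 1024, device=device)
del x
clean_memory_on_device(device)  # gc.collect() plus the matching backend's empty_cache()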
library/flux_models.py ADDED
@@ -0,0 +1,1237 @@
1
+ # copy from FLUX repo: https://github.com/black-forest-labs/flux
2
+ # license: Apache-2.0 License
3
+
4
+
5
+ from concurrent.futures import Future, ThreadPoolExecutor
6
+ from dataclasses import dataclass
7
+ import math
8
+ import os
9
+ import time
10
+ from typing import Dict, List, Optional, Union
11
+
12
+ from library import utils
13
+ from library.device_utils import init_ipex, clean_memory_on_device
14
+
15
+ init_ipex()
16
+
17
+ import torch
18
+ from einops import rearrange
19
+ from torch import Tensor, nn
20
+ from torch.utils.checkpoint import checkpoint
21
+ from library import custom_offloading_utils
22
+
23
+ # USE_REENTRANT = True
24
+
25
+
26
+ @dataclass
27
+ class FluxParams:
28
+ in_channels: int
29
+ vec_in_dim: int
30
+ context_in_dim: int
31
+ hidden_size: int
32
+ mlp_ratio: float
33
+ num_heads: int
34
+ depth: int
35
+ depth_single_blocks: int
36
+ axes_dim: list[int]
37
+ theta: int
38
+ qkv_bias: bool
39
+ guidance_embed: bool
40
+
41
+
42
+ # region autoencoder
43
+
44
+
45
+ @dataclass
46
+ class AutoEncoderParams:
47
+ resolution: int
48
+ in_channels: int
49
+ ch: int
50
+ out_ch: int
51
+ ch_mult: list[int]
52
+ num_res_blocks: int
53
+ z_channels: int
54
+ scale_factor: float
55
+ shift_factor: float
56
+
57
+
58
+ def swish(x: Tensor) -> Tensor:
59
+ return x * torch.sigmoid(x)
60
+
61
+
62
+ class AttnBlock(nn.Module):
63
+ def __init__(self, in_channels: int):
64
+ super().__init__()
65
+ self.in_channels = in_channels
66
+
67
+ self.norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
68
+
69
+ self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1)
70
+ self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1)
71
+ self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1)
72
+ self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1)
73
+
74
+ def attention(self, h_: Tensor) -> Tensor:
75
+ h_ = self.norm(h_)
76
+ q = self.q(h_)
77
+ k = self.k(h_)
78
+ v = self.v(h_)
79
+
80
+ b, c, h, w = q.shape
81
+ q = rearrange(q, "b c h w -> b 1 (h w) c").contiguous()
82
+ k = rearrange(k, "b c h w -> b 1 (h w) c").contiguous()
83
+ v = rearrange(v, "b c h w -> b 1 (h w) c").contiguous()
84
+ h_ = nn.functional.scaled_dot_product_attention(q, k, v)
85
+
86
+ return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b)
87
+
88
+ def forward(self, x: Tensor) -> Tensor:
89
+ return x + self.proj_out(self.attention(x))
90
+
91
+
92
+ class ResnetBlock(nn.Module):
93
+ def __init__(self, in_channels: int, out_channels: int):
94
+ super().__init__()
95
+ self.in_channels = in_channels
96
+ out_channels = in_channels if out_channels is None else out_channels
97
+ self.out_channels = out_channels
98
+
99
+ self.norm1 = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
100
+ self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
101
+ self.norm2 = nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=1e-6, affine=True)
102
+ self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
103
+ if self.in_channels != self.out_channels:
104
+ self.nin_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
105
+
106
+ def forward(self, x):
107
+ h = x
108
+ h = self.norm1(h)
109
+ h = swish(h)
110
+ h = self.conv1(h)
111
+
112
+ h = self.norm2(h)
113
+ h = swish(h)
114
+ h = self.conv2(h)
115
+
116
+ if self.in_channels != self.out_channels:
117
+ x = self.nin_shortcut(x)
118
+
119
+ return x + h
120
+
121
+
122
+ class Downsample(nn.Module):
123
+ def __init__(self, in_channels: int):
124
+ super().__init__()
125
+ # no asymmetric padding in torch conv, must do it ourselves
126
+ self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
127
+
128
+ def forward(self, x: Tensor):
129
+ pad = (0, 1, 0, 1)
130
+ x = nn.functional.pad(x, pad, mode="constant", value=0)
131
+ x = self.conv(x)
132
+ return x
133
+
134
+
135
+ class Upsample(nn.Module):
136
+ def __init__(self, in_channels: int):
137
+ super().__init__()
138
+ self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
139
+
140
+ def forward(self, x: Tensor):
141
+ x = nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
142
+ x = self.conv(x)
143
+ return x
144
+
145
+
146
+ class Encoder(nn.Module):
147
+ def __init__(
148
+ self,
149
+ resolution: int,
150
+ in_channels: int,
151
+ ch: int,
152
+ ch_mult: list[int],
153
+ num_res_blocks: int,
154
+ z_channels: int,
155
+ ):
156
+ super().__init__()
157
+ self.ch = ch
158
+ self.num_resolutions = len(ch_mult)
159
+ self.num_res_blocks = num_res_blocks
160
+ self.resolution = resolution
161
+ self.in_channels = in_channels
162
+ # downsampling
163
+ self.conv_in = nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1)
164
+
165
+ curr_res = resolution
166
+ in_ch_mult = (1,) + tuple(ch_mult)
167
+ self.in_ch_mult = in_ch_mult
168
+ self.down = nn.ModuleList()
169
+ block_in = self.ch
170
+ for i_level in range(self.num_resolutions):
171
+ block = nn.ModuleList()
172
+ attn = nn.ModuleList()
173
+ block_in = ch * in_ch_mult[i_level]
174
+ block_out = ch * ch_mult[i_level]
175
+ for _ in range(self.num_res_blocks):
176
+ block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
177
+ block_in = block_out
178
+ down = nn.Module()
179
+ down.block = block
180
+ down.attn = attn
181
+ if i_level != self.num_resolutions - 1:
182
+ down.downsample = Downsample(block_in)
183
+ curr_res = curr_res // 2
184
+ self.down.append(down)
185
+
186
+ # middle
187
+ self.mid = nn.Module()
188
+ self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
189
+ self.mid.attn_1 = AttnBlock(block_in)
190
+ self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
191
+
192
+ # end
193
+ self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
194
+ self.conv_out = nn.Conv2d(block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1)
195
+
196
+ def forward(self, x: Tensor) -> Tensor:
197
+ # downsampling
198
+ hs = [self.conv_in(x)]
199
+ for i_level in range(self.num_resolutions):
200
+ for i_block in range(self.num_res_blocks):
201
+ h = self.down[i_level].block[i_block](hs[-1])
202
+ if len(self.down[i_level].attn) > 0:
203
+ h = self.down[i_level].attn[i_block](h)
204
+ hs.append(h)
205
+ if i_level != self.num_resolutions - 1:
206
+ hs.append(self.down[i_level].downsample(hs[-1]))
207
+
208
+ # middle
209
+ h = hs[-1]
210
+ h = self.mid.block_1(h)
211
+ h = self.mid.attn_1(h)
212
+ h = self.mid.block_2(h)
213
+ # end
214
+ h = self.norm_out(h)
215
+ h = swish(h)
216
+ h = self.conv_out(h)
217
+ return h
218
+
219
+
220
+ class Decoder(nn.Module):
221
+ def __init__(
222
+ self,
223
+ ch: int,
224
+ out_ch: int,
225
+ ch_mult: list[int],
226
+ num_res_blocks: int,
227
+ in_channels: int,
228
+ resolution: int,
229
+ z_channels: int,
230
+ ):
231
+ super().__init__()
232
+ self.ch = ch
233
+ self.num_resolutions = len(ch_mult)
234
+ self.num_res_blocks = num_res_blocks
235
+ self.resolution = resolution
236
+ self.in_channels = in_channels
237
+ self.ffactor = 2 ** (self.num_resolutions - 1)
238
+
239
+ # compute in_ch_mult, block_in and curr_res at lowest res
240
+ block_in = ch * ch_mult[self.num_resolutions - 1]
241
+ curr_res = resolution // 2 ** (self.num_resolutions - 1)
242
+ self.z_shape = (1, z_channels, curr_res, curr_res)
243
+
244
+ # z to block_in
245
+ self.conv_in = nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1)
246
+
247
+ # middle
248
+ self.mid = nn.Module()
249
+ self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
250
+ self.mid.attn_1 = AttnBlock(block_in)
251
+ self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
252
+
253
+ # upsampling
254
+ self.up = nn.ModuleList()
255
+ for i_level in reversed(range(self.num_resolutions)):
256
+ block = nn.ModuleList()
257
+ attn = nn.ModuleList()
258
+ block_out = ch * ch_mult[i_level]
259
+ for _ in range(self.num_res_blocks + 1):
260
+ block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
261
+ block_in = block_out
262
+ up = nn.Module()
263
+ up.block = block
264
+ up.attn = attn
265
+ if i_level != 0:
266
+ up.upsample = Upsample(block_in)
267
+ curr_res = curr_res * 2
268
+ self.up.insert(0, up) # prepend to get consistent order
269
+
270
+ # end
271
+ self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
272
+ self.conv_out = nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)
273
+
274
+ def forward(self, z: Tensor) -> Tensor:
275
+ # z to block_in
276
+ h = self.conv_in(z)
277
+
278
+ # middle
279
+ h = self.mid.block_1(h)
280
+ h = self.mid.attn_1(h)
281
+ h = self.mid.block_2(h)
282
+
283
+ # upsampling
284
+ for i_level in reversed(range(self.num_resolutions)):
285
+ for i_block in range(self.num_res_blocks + 1):
286
+ h = self.up[i_level].block[i_block](h)
287
+ if len(self.up[i_level].attn) > 0:
288
+ h = self.up[i_level].attn[i_block](h)
289
+ if i_level != 0:
290
+ h = self.up[i_level].upsample(h)
291
+
292
+ # end
293
+ h = self.norm_out(h)
294
+ h = swish(h)
295
+ h = self.conv_out(h)
296
+ return h
297
+
298
+
299
+ class DiagonalGaussian(nn.Module):
300
+ def __init__(self, sample: bool = True, chunk_dim: int = 1):
301
+ super().__init__()
302
+ self.sample = sample
303
+ self.chunk_dim = chunk_dim
304
+
305
+ def forward(self, z: Tensor) -> Tensor:
306
+ mean, logvar = torch.chunk(z, 2, dim=self.chunk_dim)
307
+ if self.sample:
308
+ std = torch.exp(0.5 * logvar)
309
+ return mean + std * torch.randn_like(mean)
310
+ else:
311
+ return mean
312
+
313
+
314
+ class AutoEncoder(nn.Module):
315
+ def __init__(self, params: AutoEncoderParams):
316
+ super().__init__()
317
+ self.encoder = Encoder(
318
+ resolution=params.resolution,
319
+ in_channels=params.in_channels,
320
+ ch=params.ch,
321
+ ch_mult=params.ch_mult,
322
+ num_res_blocks=params.num_res_blocks,
323
+ z_channels=params.z_channels,
324
+ )
325
+ self.decoder = Decoder(
326
+ resolution=params.resolution,
327
+ in_channels=params.in_channels,
328
+ ch=params.ch,
329
+ out_ch=params.out_ch,
330
+ ch_mult=params.ch_mult,
331
+ num_res_blocks=params.num_res_blocks,
332
+ z_channels=params.z_channels,
333
+ )
334
+ self.reg = DiagonalGaussian()
335
+
336
+ self.scale_factor = params.scale_factor
337
+ self.shift_factor = params.shift_factor
338
+
339
+ @property
340
+ def device(self) -> torch.device:
341
+ return next(self.parameters()).device
342
+
343
+ @property
344
+ def dtype(self) -> torch.dtype:
345
+ return next(self.parameters()).dtype
346
+
347
+ def encode(self, x: Tensor) -> Tensor:
348
+ z = self.reg(self.encoder(x))
349
+ z = self.scale_factor * (z - self.shift_factor)
350
+ return z
351
+
352
+ def decode(self, z: Tensor) -> Tensor:
353
+ z = z / self.scale_factor + self.shift_factor
354
+ return self.decoder(z)
355
+
356
+ def forward(self, x: Tensor) -> Tensor:
357
+ return self.decode(self.encode(x))
358
+
359
+
360
+ # endregion
361
+ # region config
362
+
363
+
364
+ @dataclass
365
+ class ModelSpec:
366
+ params: FluxParams
367
+ ae_params: AutoEncoderParams
368
+ ckpt_path: str | None
369
+ ae_path: str | None
370
+ # repo_id: str | None
371
+ # repo_flow: str | None
372
+ # repo_ae: str | None
373
+
374
+
375
+ configs = {
376
+ "dev": ModelSpec(
377
+ # repo_id="black-forest-labs/FLUX.1-dev",
378
+ # repo_flow="flux1-dev.sft",
379
+ # repo_ae="ae.sft",
380
+ ckpt_path=None, # os.getenv("FLUX_DEV"),
381
+ params=FluxParams(
382
+ in_channels=64,
383
+ vec_in_dim=768,
384
+ context_in_dim=4096,
385
+ hidden_size=3072,
386
+ mlp_ratio=4.0,
387
+ num_heads=24,
388
+ depth=19,
389
+ depth_single_blocks=38,
390
+ axes_dim=[16, 56, 56],
391
+ theta=10_000,
392
+ qkv_bias=True,
393
+ guidance_embed=True,
394
+ ),
395
+ ae_path=None, # os.getenv("AE"),
396
+ ae_params=AutoEncoderParams(
397
+ resolution=256,
398
+ in_channels=3,
399
+ ch=128,
400
+ out_ch=3,
401
+ ch_mult=[1, 2, 4, 4],
402
+ num_res_blocks=2,
403
+ z_channels=16,
404
+ scale_factor=0.3611,
405
+ shift_factor=0.1159,
406
+ ),
407
+ ),
408
+ "schnell": ModelSpec(
409
+ # repo_id="black-forest-labs/FLUX.1-schnell",
410
+ # repo_flow="flux1-schnell.sft",
411
+ # repo_ae="ae.sft",
412
+ ckpt_path=None, # os.getenv("FLUX_SCHNELL"),
413
+ params=FluxParams(
414
+ in_channels=64,
415
+ vec_in_dim=768,
416
+ context_in_dim=4096,
417
+ hidden_size=3072,
418
+ mlp_ratio=4.0,
419
+ num_heads=24,
420
+ depth=19,
421
+ depth_single_blocks=38,
422
+ axes_dim=[16, 56, 56],
423
+ theta=10_000,
424
+ qkv_bias=True,
425
+ guidance_embed=False,
426
+ ),
427
+ ae_path=None, # os.getenv("AE"),
428
+ ae_params=AutoEncoderParams(
429
+ resolution=256,
430
+ in_channels=3,
431
+ ch=128,
432
+ out_ch=3,
433
+ ch_mult=[1, 2, 4, 4],
434
+ num_res_blocks=2,
435
+ z_channels=16,
436
+ scale_factor=0.3611,
437
+ shift_factor=0.1159,
438
+ ),
439
+ ),
440
+ }
441
+
442
+
443
+ # endregion
444
+
445
+ # region math
446
+
447
+
448
+ def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, attn_mask: Optional[Tensor] = None) -> Tensor:
449
+ q, k = apply_rope(q, k, pe)
450
+
451
+ x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
452
+ x = rearrange(x, "B H L D -> B L (H D)")
453
+
454
+ return x
455
+
456
+
457
+ def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
458
+ assert dim % 2 == 0
459
+ scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
460
+ omega = 1.0 / (theta**scale)
461
+ out = torch.einsum("...n,d->...nd", pos, omega)
462
+ out = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1)
463
+ out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2)
464
+ return out.float()
465
+
466
+
467
+ def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor) -> tuple[Tensor, Tensor]:
468
+ xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
469
+ xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
470
+ xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
471
+ xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
472
+ return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
473
+
474
+
475
+ # endregion
476
+
477
+
478
+ # region layers
479
+
480
+
481
+ # for cpu_offload_checkpointing
482
+
483
+
484
+ def to_cuda(x):
485
+ if isinstance(x, torch.Tensor):
486
+ return x.cuda()
487
+ elif isinstance(x, (list, tuple)):
488
+ return [to_cuda(elem) for elem in x]
489
+ elif isinstance(x, dict):
490
+ return {k: to_cuda(v) for k, v in x.items()}
491
+ else:
492
+ return x
493
+
494
+
495
+ def to_cpu(x):
496
+ if isinstance(x, torch.Tensor):
497
+ return x.cpu()
498
+ elif isinstance(x, (list, tuple)):
499
+ return [to_cpu(elem) for elem in x]
500
+ elif isinstance(x, dict):
501
+ return {k: to_cpu(v) for k, v in x.items()}
502
+ else:
503
+ return x
504
+
505
+
506
+ class EmbedND(nn.Module):
507
+ def __init__(self, dim: int, theta: int, axes_dim: list[int]):
508
+ super().__init__()
509
+ self.dim = dim
510
+ self.theta = theta
511
+ self.axes_dim = axes_dim
512
+
513
+ def forward(self, ids: Tensor) -> Tensor:
514
+ n_axes = ids.shape[-1]
515
+ emb = torch.cat(
516
+ [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
517
+ dim=-3,
518
+ )
519
+
520
+ return emb.unsqueeze(1)
521
+
522
+
523
+ def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
524
+ """
525
+ Create sinusoidal timestep embeddings.
526
+ :param t: a 1-D Tensor of N indices, one per batch element.
527
+ These may be fractional.
528
+ :param dim: the dimension of the output.
529
+ :param max_period: controls the minimum frequency of the embeddings.
530
+ :return: an (N, D) Tensor of positional embeddings.
531
+ """
532
+ t = time_factor * t
533
+ half = dim // 2
534
+ freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(t.device)
535
+
536
+ args = t[:, None].float() * freqs[None]
537
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
538
+ if dim % 2:
539
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
540
+ if torch.is_floating_point(t):
541
+ embedding = embedding.to(t)
542
+ return embedding
543
+
544
+
545
+ class MLPEmbedder(nn.Module):
546
+ def __init__(self, in_dim: int, hidden_dim: int):
547
+ super().__init__()
548
+ self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
549
+ self.silu = nn.SiLU()
550
+ self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)
551
+
552
+ self.gradient_checkpointing = False
553
+
554
+ def enable_gradient_checkpointing(self):
555
+ self.gradient_checkpointing = True
556
+
557
+ def disable_gradient_checkpointing(self):
558
+ self.gradient_checkpointing = False
559
+
560
+ def _forward(self, x: Tensor) -> Tensor:
561
+ return self.out_layer(self.silu(self.in_layer(x)))
562
+
563
+ def forward(self, *args, **kwargs):
564
+ if self.training and self.gradient_checkpointing:
565
+ return checkpoint(self._forward, *args, use_reentrant=False, **kwargs)
566
+ else:
567
+ return self._forward(*args, **kwargs)
568
+
569
+ # def forward(self, x):
570
+ # if self.training and self.gradient_checkpointing:
571
+ # def create_custom_forward(func):
572
+ # def custom_forward(*inputs):
573
+ # return func(*inputs)
574
+ # return custom_forward
575
+ # return torch.utils.checkpoint.checkpoint(create_custom_forward(self._forward), x, use_reentrant=USE_REENTRANT)
576
+ # else:
577
+ # return self._forward(x)
578
+
579
+
580
+ class RMSNorm(torch.nn.Module):
581
+ def __init__(self, dim: int):
582
+ super().__init__()
583
+ self.scale = nn.Parameter(torch.ones(dim))
584
+
585
+ def forward(self, x: Tensor):
586
+ x_dtype = x.dtype
587
+ x = x.float()
588
+ rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6)
589
+ # return (x * rrms).to(dtype=x_dtype) * self.scale
590
+ return ((x * rrms) * self.scale.float()).to(dtype=x_dtype)
591
+
592
+
593
+ class QKNorm(torch.nn.Module):
594
+ def __init__(self, dim: int):
595
+ super().__init__()
596
+ self.query_norm = RMSNorm(dim)
597
+ self.key_norm = RMSNorm(dim)
598
+
599
+ def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple[Tensor, Tensor]:
600
+ q = self.query_norm(q)
601
+ k = self.key_norm(k)
602
+ return q.to(v), k.to(v)
603
+
604
+
605
+ class SelfAttention(nn.Module):
606
+ def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False):
607
+ super().__init__()
608
+ self.num_heads = num_heads
609
+ head_dim = dim // num_heads
610
+
611
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
612
+ self.norm = QKNorm(head_dim)
613
+ self.proj = nn.Linear(dim, dim)
614
+
615
+ # this is not called from DoubleStreamBlock/SingleStreamBlock because they use the attention function directly
616
+ def forward(self, x: Tensor, pe: Tensor) -> Tensor:
617
+ qkv = self.qkv(x)
618
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
619
+ q, k = self.norm(q, k, v)
620
+ x = attention(q, k, v, pe=pe)
621
+ x = self.proj(x)
622
+ return x
623
+
624
+
625
+ @dataclass
626
+ class ModulationOut:
627
+ shift: Tensor
628
+ scale: Tensor
629
+ gate: Tensor
630
+
631
+
632
+ class Modulation(nn.Module):
633
+ def __init__(self, dim: int, double: bool):
634
+ super().__init__()
635
+ self.is_double = double
636
+ self.multiplier = 6 if double else 3
637
+ self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)
638
+
639
+ def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut | None]:
640
+ out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(self.multiplier, dim=-1)
641
+
642
+ return (
643
+ ModulationOut(*out[:3]),
644
+ ModulationOut(*out[3:]) if self.is_double else None,
645
+ )
646
+
647
+
648
+ class DoubleStreamBlock(nn.Module):
649
+ def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False):
650
+ super().__init__()
651
+
652
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
653
+ self.num_heads = num_heads
654
+ self.hidden_size = hidden_size
655
+ self.img_mod = Modulation(hidden_size, double=True)
656
+ self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
657
+ self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
658
+
659
+ self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
660
+ self.img_mlp = nn.Sequential(
661
+ nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
662
+ nn.GELU(approximate="tanh"),
663
+ nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
664
+ )
665
+
666
+ self.txt_mod = Modulation(hidden_size, double=True)
667
+ self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
668
+ self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
669
+
670
+ self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
671
+ self.txt_mlp = nn.Sequential(
672
+ nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
673
+ nn.GELU(approximate="tanh"),
674
+ nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
675
+ )
676
+
677
+ self.gradient_checkpointing = False
678
+ self.cpu_offload_checkpointing = False
679
+
680
+ def enable_gradient_checkpointing(self, cpu_offload: bool = False):
681
+ self.gradient_checkpointing = True
682
+ self.cpu_offload_checkpointing = cpu_offload
683
+
684
+ def disable_gradient_checkpointing(self):
685
+ self.gradient_checkpointing = False
686
+ self.cpu_offload_checkpointing = False
687
+
688
+ def _forward(
689
+ self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, txt_attention_mask: Optional[Tensor] = None
690
+ ) -> tuple[Tensor, Tensor]:
691
+ img_mod1, img_mod2 = self.img_mod(vec)
692
+ txt_mod1, txt_mod2 = self.txt_mod(vec)
693
+
694
+ # prepare image for attention
695
+ img_modulated = self.img_norm1(img)
696
+ img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
697
+ img_qkv = self.img_attn.qkv(img_modulated)
698
+ img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
699
+ img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
700
+
701
+ # prepare txt for attention
702
+ txt_modulated = self.txt_norm1(txt)
703
+ txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
704
+ txt_qkv = self.txt_attn.qkv(txt_modulated)
705
+ txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
706
+ txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
707
+
708
+ # run actual attention
709
+ q = torch.cat((txt_q, img_q), dim=2)
710
+ k = torch.cat((txt_k, img_k), dim=2)
711
+ v = torch.cat((txt_v, img_v), dim=2)
712
+
713
+ # make attention mask if not None
714
+ attn_mask = None
715
+ if txt_attention_mask is not None:
716
+ # F.scaled_dot_product_attention expects attn_mask to be bool for binary mask
717
+ attn_mask = txt_attention_mask.to(torch.bool) # b, seq_len
718
+ attn_mask = torch.cat(
719
+ (attn_mask, torch.ones(attn_mask.shape[0], img.shape[1], device=attn_mask.device, dtype=torch.bool)), dim=1
720
+ ) # b, seq_len + img_len
721
+
722
+ # broadcast attn_mask to all heads
723
+ attn_mask = attn_mask[:, None, None, :].expand(-1, q.shape[1], q.shape[2], -1)
724
+
725
+ attn = attention(q, k, v, pe=pe, attn_mask=attn_mask)
726
+ txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]
727
+
728
+ # calculate the img blocks
729
+ img = img + img_mod1.gate * self.img_attn.proj(img_attn)
730
+ img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift)
731
+
732
+ # calculate the txt blocks
733
+ txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
734
+ txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift)
735
+ return img, txt
736
+
737
+ def forward(
738
+ self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, txt_attention_mask: Optional[Tensor] = None
739
+ ) -> tuple[Tensor, Tensor]:
740
+ if self.training and self.gradient_checkpointing:
741
+ if not self.cpu_offload_checkpointing:
742
+ return checkpoint(self._forward, img, txt, vec, pe, txt_attention_mask, use_reentrant=False)
743
+ # cpu offload checkpointing
744
+
745
+ def create_custom_forward(func):
746
+ def custom_forward(*inputs):
747
+ cuda_inputs = to_cuda(inputs)
748
+ outputs = func(*cuda_inputs)
749
+ return to_cpu(outputs)
750
+
751
+ return custom_forward
752
+
753
+ return torch.utils.checkpoint.checkpoint(
754
+ create_custom_forward(self._forward), img, txt, vec, pe, txt_attention_mask, use_reentrant=False
755
+ )
756
+
757
+ else:
758
+ return self._forward(img, txt, vec, pe, txt_attention_mask)
759
+
760
+
761
+ class SingleStreamBlock(nn.Module):
762
+ """
763
+ A DiT block with parallel linear layers as described in
764
+ https://arxiv.org/abs/2302.05442 and adapted modulation interface.
765
+ """
766
+
767
+ def __init__(
768
+ self,
769
+ hidden_size: int,
770
+ num_heads: int,
771
+ mlp_ratio: float = 4.0,
772
+ qk_scale: float | None = None,
773
+ ):
774
+ super().__init__()
775
+ self.hidden_dim = hidden_size
776
+ self.num_heads = num_heads
777
+ head_dim = hidden_size // num_heads
778
+ self.scale = qk_scale or head_dim**-0.5
779
+
780
+ self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
781
+ # qkv and mlp_in
782
+ self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
783
+ # proj and mlp_out
784
+ self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size)
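+ # linear1 fuses the attention qkv projection (3 * hidden_size) with the MLP input
+ # projection (mlp_hidden_dim); linear2 fuses the attention output and MLP output
+ # projections. Illustrative sizes (hypothetical hidden_size=3072, mlp_ratio=4.0):
+ # linear1: 3072 -> 21504 (9216 + 12288), linear2: 15360 (3072 + 12288) -> 3072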
785
+
786
+ self.norm = QKNorm(head_dim)
787
+
788
+ self.hidden_size = hidden_size
789
+ self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
790
+
791
+ self.mlp_act = nn.GELU(approximate="tanh")
792
+ self.modulation = Modulation(hidden_size, double=False)
793
+
794
+ self.gradient_checkpointing = False
795
+ self.cpu_offload_checkpointing = False
796
+
797
+ def enable_gradient_checkpointing(self, cpu_offload: bool = False):
798
+ self.gradient_checkpointing = True
799
+ self.cpu_offload_checkpointing = cpu_offload
800
+
801
+ def disable_gradient_checkpointing(self):
802
+ self.gradient_checkpointing = False
803
+ self.cpu_offload_checkpointing = False
804
+
805
+ def _forward(self, x: Tensor, vec: Tensor, pe: Tensor, txt_attention_mask: Optional[Tensor] = None) -> Tensor:
806
+ mod, _ = self.modulation(vec)
807
+ x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
808
+ qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
809
+
810
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
811
+ q, k = self.norm(q, k, v)
812
+
813
+ # make attention mask if not None
814
+ attn_mask = None
815
+ if txt_attention_mask is not None:
816
+ # F.scaled_dot_product_attention expects attn_mask to be bool for binary mask
817
+ attn_mask = txt_attention_mask.to(torch.bool) # b, seq_len
818
+ attn_mask = torch.cat(
819
+ (
820
+ attn_mask,
821
+ torch.ones(
822
+ attn_mask.shape[0], x.shape[1] - txt_attention_mask.shape[1], device=attn_mask.device, dtype=torch.bool
823
+ ),
824
+ ),
825
+ dim=1,
826
+ ) # b, seq_len + img_len = x_len
827
+
828
+ # broadcast attn_mask to all heads
829
+ attn_mask = attn_mask[:, None, None, :].expand(-1, q.shape[1], q.shape[2], -1)
830
+
831
+ # compute attention
832
+ attn = attention(q, k, v, pe=pe, attn_mask=attn_mask)
833
+
834
+ # compute activation in mlp stream, cat again and run second linear layer
835
+ output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
836
+ return x + mod.gate * output
837
+
838
+ def forward(self, x: Tensor, vec: Tensor, pe: Tensor, txt_attention_mask: Optional[Tensor] = None) -> Tensor:
839
+ if self.training and self.gradient_checkpointing:
840
+ if not self.cpu_offload_checkpointing:
841
+ return checkpoint(self._forward, x, vec, pe, txt_attention_mask, use_reentrant=False)
842
+
843
+ # cpu offload checkpointing
844
+
845
+ def create_custom_forward(func):
846
+ def custom_forward(*inputs):
847
+ cuda_inputs = to_cuda(inputs)
848
+ outputs = func(*cuda_inputs)
849
+ return to_cpu(outputs)
850
+
851
+ return custom_forward
852
+
853
+ return torch.utils.checkpoint.checkpoint(
854
+ create_custom_forward(self._forward), x, vec, pe, txt_attention_mask, use_reentrant=False
855
+ )
856
+ else:
857
+ return self._forward(x, vec, pe, txt_attention_mask)
858
+
859
+
860
+ class LastLayer(nn.Module):
861
+ def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
862
+ super().__init__()
863
+ self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
864
+ self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
865
+ self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True))
866
+
867
+ def forward(self, x: Tensor, vec: Tensor) -> Tensor:
868
+ shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
869
+ x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
870
+ x = self.linear(x)
871
+ return x
872
+
873
+
874
+ # endregion
875
+
876
+
877
+ class Flux(nn.Module):
878
+ """
879
+ Transformer model for flow matching on sequences.
880
+ """
881
+
882
+ def __init__(self, params: FluxParams):
883
+ super().__init__()
884
+
885
+ self.params = params
886
+ self.in_channels = params.in_channels
887
+ self.out_channels = self.in_channels
888
+ if params.hidden_size % params.num_heads != 0:
889
+ raise ValueError(f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}")
890
+ pe_dim = params.hidden_size // params.num_heads
891
+ if sum(params.axes_dim) != pe_dim:
892
+ raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}")
893
+ self.hidden_size = params.hidden_size
894
+ self.num_heads = params.num_heads
895
+ self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim)
896
+ self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
897
+ self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
898
+ self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size)
899
+ self.guidance_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if params.guidance_embed else nn.Identity()
900
+ self.txt_in = nn.Linear(params.context_in_dim, self.hidden_size)
901
+
902
+ self.double_blocks = nn.ModuleList(
903
+ [
904
+ DoubleStreamBlock(
905
+ self.hidden_size,
906
+ self.num_heads,
907
+ mlp_ratio=params.mlp_ratio,
908
+ qkv_bias=params.qkv_bias,
909
+ )
910
+ for _ in range(params.depth)
911
+ ]
912
+ )
913
+
914
+ self.single_blocks = nn.ModuleList(
915
+ [
916
+ SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio)
917
+ for _ in range(params.depth_single_blocks)
918
+ ]
919
+ )
920
+
921
+ self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
922
+
923
+ self.gradient_checkpointing = False
924
+ self.cpu_offload_checkpointing = False
925
+ self.blocks_to_swap = None
926
+
927
+ self.offloader_double = None
928
+ self.offloader_single = None
929
+ self.num_double_blocks = len(self.double_blocks)
930
+ self.num_single_blocks = len(self.single_blocks)
931
+
932
+ @property
933
+ def device(self):
934
+ return next(self.parameters()).device
935
+
936
+ @property
937
+ def dtype(self):
938
+ return next(self.parameters()).dtype
939
+
940
+ def enable_gradient_checkpointing(self, cpu_offload: bool = False):
941
+ self.gradient_checkpointing = True
942
+ self.cpu_offload_checkpointing = cpu_offload
943
+
944
+ self.time_in.enable_gradient_checkpointing()
945
+ self.vector_in.enable_gradient_checkpointing()
946
+ if self.guidance_in.__class__ != nn.Identity:
947
+ self.guidance_in.enable_gradient_checkpointing()
948
+
949
+ for block in self.double_blocks + self.single_blocks:
950
+ block.enable_gradient_checkpointing(cpu_offload=cpu_offload)
951
+
952
+ print(f"FLUX: Gradient checkpointing enabled. CPU offload: {cpu_offload}")
953
+
954
+ def disable_gradient_checkpointing(self):
955
+ self.gradient_checkpointing = False
956
+ self.cpu_offload_checkpointing = False
957
+
958
+ self.time_in.disable_gradient_checkpointing()
959
+ self.vector_in.disable_gradient_checkpointing()
960
+ if self.guidance_in.__class__ != nn.Identity:
961
+ self.guidance_in.disable_gradient_checkpointing()
962
+
963
+ for block in self.double_blocks + self.single_blocks:
964
+ block.disable_gradient_checkpointing()
965
+
966
+ print("FLUX: Gradient checkpointing disabled.")
967
+
968
+ def enable_block_swap(self, num_blocks: int, device: torch.device):
969
+ self.blocks_to_swap = num_blocks
970
+ double_blocks_to_swap = num_blocks // 2
971
+ single_blocks_to_swap = (num_blocks - double_blocks_to_swap) * 2
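+ # illustrative split (hypothetical num_blocks=6): 6 // 2 = 3 double blocks and
+ # (6 - 3) * 2 = 6 single blocks are swapped; a double block holds roughly twice the
+ # parameters of a single block, so the two halves offload a comparable amount of memory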
972
+
973
+ assert double_blocks_to_swap <= self.num_double_blocks - 2 and single_blocks_to_swap <= self.num_single_blocks - 2, (
974
+ f"Cannot swap more than {self.num_double_blocks - 2} double blocks and {self.num_single_blocks - 2} single blocks. "
975
+ f"Requested {double_blocks_to_swap} double blocks and {single_blocks_to_swap} single blocks."
976
+ )
977
+
978
+ self.offloader_double = custom_offloading_utils.ModelOffloader(
979
+ self.double_blocks, self.num_double_blocks, double_blocks_to_swap, device # , debug=True
980
+ )
981
+ self.offloader_single = custom_offloading_utils.ModelOffloader(
982
+ self.single_blocks, self.num_single_blocks, single_blocks_to_swap, device # , debug=True
983
+ )
984
+ print(
985
+ f"FLUX: Block swap enabled. Swapping {num_blocks} blocks, double blocks: {double_blocks_to_swap}, single blocks: {single_blocks_to_swap}."
986
+ )
987
+
988
+ def move_to_device_except_swap_blocks(self, device: torch.device):
989
+ # assume model is on cpu. do not move blocks to device to reduce temporary memory usage
990
+ if self.blocks_to_swap:
991
+ save_double_blocks = self.double_blocks
992
+ save_single_blocks = self.single_blocks
993
+ self.double_blocks = None
994
+ self.single_blocks = None
995
+
996
+ self.to(device)
997
+
998
+ if self.blocks_to_swap:
999
+ self.double_blocks = save_double_blocks
1000
+ self.single_blocks = save_single_blocks
1001
+
1002
+ def prepare_block_swap_before_forward(self):
1003
+ if self.blocks_to_swap is None or self.blocks_to_swap == 0:
1004
+ return
1005
+ self.offloader_double.prepare_block_devices_before_forward(self.double_blocks)
1006
+ self.offloader_single.prepare_block_devices_before_forward(self.single_blocks)
1007
+
1008
+ def forward(
1009
+ self,
1010
+ img: Tensor,
1011
+ img_ids: Tensor,
1012
+ txt: Tensor,
1013
+ txt_ids: Tensor,
1014
+ timesteps: Tensor,
1015
+ y: Tensor,
1016
+ guidance: Tensor | None = None,
1017
+ txt_attention_mask: Tensor | None = None,
1018
+ ) -> Tensor:
1019
+ if img.ndim != 3 or txt.ndim != 3:
1020
+ raise ValueError("Input img and txt tensors must have 3 dimensions.")
1021
+
1022
+ # running on sequences img
1023
+ img = self.img_in(img)
1024
+ vec = self.time_in(timestep_embedding(timesteps, 256))
1025
+ if self.params.guidance_embed:
1026
+ if guidance is None:
1027
+ raise ValueError("Didn't get guidance strength for guidance distilled model.")
1028
+ vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
1029
+ vec = vec + self.vector_in(y)
1030
+ txt = self.txt_in(txt)
1031
+
1032
+ ids = torch.cat((txt_ids, img_ids), dim=1)
1033
+ pe = self.pe_embedder(ids)
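+ # positional (RoPE) embeddings are computed once for the concatenated text + image id
+ # sequence and shared by all double and single blocks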
1034
+
1035
+ if not self.blocks_to_swap:
1036
+ for block in self.double_blocks:
1037
+ img, txt = block(img=img, txt=txt, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask)
1038
+ img = torch.cat((txt, img), 1)
1039
+ for block in self.single_blocks:
1040
+ img = block(img, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask)
1041
+ else:
1042
+ for block_idx, block in enumerate(self.double_blocks):
1043
+ self.offloader_double.wait_for_block(block_idx)
1044
+
1045
+ img, txt = block(img=img, txt=txt, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask)
1046
+
1047
+ self.offloader_double.submit_move_blocks(self.double_blocks, block_idx)
1048
+
1049
+ img = torch.cat((txt, img), 1)
1050
+
1051
+ for block_idx, block in enumerate(self.single_blocks):
1052
+ self.offloader_single.wait_for_block(block_idx)
1053
+
1054
+ img = block(img, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask)
1055
+
1056
+ self.offloader_single.submit_move_blocks(self.single_blocks, block_idx)
1057
+
1058
+ img = img[:, txt.shape[1] :, ...]
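+ # drop the text tokens that were prepended before the single blocks; only the image
+ # tokens are passed to the final projection layer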
1059
+
1060
+ if self.training and self.cpu_offload_checkpointing:
1061
+ img = img.to(self.device)
1062
+ vec = vec.to(self.device)
1063
+
1064
+ img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels)
1065
+
1066
+ return img
1067
+
1068
+
1069
+ """
1070
+ class FluxUpper(nn.Module):
1071
+ ""
1072
+ Transformer model for flow matching on sequences.
1073
+ ""
1074
+
1075
+ def __init__(self, params: FluxParams):
1076
+ super().__init__()
1077
+
1078
+ self.params = params
1079
+ self.in_channels = params.in_channels
1080
+ self.out_channels = self.in_channels
1081
+ if params.hidden_size % params.num_heads != 0:
1082
+ raise ValueError(f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}")
1083
+ pe_dim = params.hidden_size // params.num_heads
1084
+ if sum(params.axes_dim) != pe_dim:
1085
+ raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}")
1086
+ self.hidden_size = params.hidden_size
1087
+ self.num_heads = params.num_heads
1088
+ self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim)
1089
+ self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
1090
+ self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
1091
+ self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size)
1092
+ self.guidance_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if params.guidance_embed else nn.Identity()
1093
+ self.txt_in = nn.Linear(params.context_in_dim, self.hidden_size)
1094
+
1095
+ self.double_blocks = nn.ModuleList(
1096
+ [
1097
+ DoubleStreamBlock(
1098
+ self.hidden_size,
1099
+ self.num_heads,
1100
+ mlp_ratio=params.mlp_ratio,
1101
+ qkv_bias=params.qkv_bias,
1102
+ )
1103
+ for _ in range(params.depth)
1104
+ ]
1105
+ )
1106
+
1107
+ self.gradient_checkpointing = False
1108
+
1109
+ @property
1110
+ def device(self):
1111
+ return next(self.parameters()).device
1112
+
1113
+ @property
1114
+ def dtype(self):
1115
+ return next(self.parameters()).dtype
1116
+
1117
+ def enable_gradient_checkpointing(self):
1118
+ self.gradient_checkpointing = True
1119
+
1120
+ self.time_in.enable_gradient_checkpointing()
1121
+ self.vector_in.enable_gradient_checkpointing()
1122
+ if self.guidance_in.__class__ != nn.Identity:
1123
+ self.guidance_in.enable_gradient_checkpointing()
1124
+
1125
+ for block in self.double_blocks:
1126
+ block.enable_gradient_checkpointing()
1127
+
1128
+ print("FLUX: Gradient checkpointing enabled.")
1129
+
1130
+ def disable_gradient_checkpointing(self):
1131
+ self.gradient_checkpointing = False
1132
+
1133
+ self.time_in.disable_gradient_checkpointing()
1134
+ self.vector_in.disable_gradient_checkpointing()
1135
+ if self.guidance_in.__class__ != nn.Identity:
1136
+ self.guidance_in.disable_gradient_checkpointing()
1137
+
1138
+ for block in self.double_blocks:
1139
+ block.disable_gradient_checkpointing()
1140
+
1141
+ print("FLUX: Gradient checkpointing disabled.")
1142
+
1143
+ def forward(
1144
+ self,
1145
+ img: Tensor,
1146
+ img_ids: Tensor,
1147
+ txt: Tensor,
1148
+ txt_ids: Tensor,
1149
+ timesteps: Tensor,
1150
+ y: Tensor,
1151
+ guidance: Tensor | None = None,
1152
+ txt_attention_mask: Tensor | None = None,
1153
+ ) -> Tensor:
1154
+ if img.ndim != 3 or txt.ndim != 3:
1155
+ raise ValueError("Input img and txt tensors must have 3 dimensions.")
1156
+
1157
+ # running on sequences img
1158
+ img = self.img_in(img)
1159
+ vec = self.time_in(timestep_embedding(timesteps, 256))
1160
+ if self.params.guidance_embed:
1161
+ if guidance is None:
1162
+ raise ValueError("Didn't get guidance strength for guidance distilled model.")
1163
+ vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
1164
+ vec = vec + self.vector_in(y)
1165
+ txt = self.txt_in(txt)
1166
+
1167
+ ids = torch.cat((txt_ids, img_ids), dim=1)
1168
+ pe = self.pe_embedder(ids)
1169
+
1170
+ for block in self.double_blocks:
1171
+ img, txt = block(img=img, txt=txt, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask)
1172
+
1173
+ return img, txt, vec, pe
1174
+
1175
+
1176
+ class FluxLower(nn.Module):
1177
+ ""
1178
+ Transformer model for flow matching on sequences.
1179
+ ""
1180
+
1181
+ def __init__(self, params: FluxParams):
1182
+ super().__init__()
1183
+ self.hidden_size = params.hidden_size
1184
+ self.num_heads = params.num_heads
1185
+ self.out_channels = params.in_channels
1186
+
1187
+ self.single_blocks = nn.ModuleList(
1188
+ [
1189
+ SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio)
1190
+ for _ in range(params.depth_single_blocks)
1191
+ ]
1192
+ )
1193
+
1194
+ self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
1195
+
1196
+ self.gradient_checkpointing = False
1197
+
1198
+ @property
1199
+ def device(self):
1200
+ return next(self.parameters()).device
1201
+
1202
+ @property
1203
+ def dtype(self):
1204
+ return next(self.parameters()).dtype
1205
+
1206
+ def enable_gradient_checkpointing(self):
1207
+ self.gradient_checkpointing = True
1208
+
1209
+ for block in self.single_blocks:
1210
+ block.enable_gradient_checkpointing()
1211
+
1212
+ print("FLUX: Gradient checkpointing enabled.")
1213
+
1214
+ def disable_gradient_checkpointing(self):
1215
+ self.gradient_checkpointing = False
1216
+
1217
+ for block in self.single_blocks:
1218
+ block.disable_gradient_checkpointing()
1219
+
1220
+ print("FLUX: Gradient checkpointing disabled.")
1221
+
1222
+ def forward(
1223
+ self,
1224
+ img: Tensor,
1225
+ txt: Tensor,
1226
+ vec: Tensor | None = None,
1227
+ pe: Tensor | None = None,
1228
+ txt_attention_mask: Tensor | None = None,
1229
+ ) -> Tensor:
1230
+ img = torch.cat((txt, img), 1)
1231
+ for block in self.single_blocks:
1232
+ img = block(img, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask)
1233
+ img = img[:, txt.shape[1] :, ...]
1234
+
1235
+ img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels)
1236
+ return img
1237
+ """
library/flux_train_utils.py ADDED
@@ -0,0 +1,582 @@
1
+ import argparse
2
+ import math
3
+ import os
4
+ import numpy as np
5
+ import toml
6
+ import json
7
+ import time
8
+ from typing import Callable, Dict, List, Optional, Tuple, Union
9
+
10
+ import torch
11
+ from accelerate import Accelerator, PartialState
12
+ from transformers import CLIPTextModel
13
+ from tqdm import tqdm
14
+ from PIL import Image
15
+ from safetensors.torch import save_file
16
+
17
+ from library import flux_models, flux_utils, strategy_base, train_util
18
+ from library.device_utils import init_ipex, clean_memory_on_device
19
+
20
+ init_ipex()
21
+
22
+ from .utils import setup_logging, mem_eff_save_file
23
+
24
+ setup_logging()
25
+ import logging
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
+ # region sample images
31
+
32
+
33
+ def sample_images(
34
+ accelerator: Accelerator,
35
+ args: argparse.Namespace,
36
+ epoch,
37
+ steps,
38
+ flux,
39
+ ae,
40
+ text_encoders,
41
+ sample_prompts_te_outputs,
42
+ prompt_replacement=None,
43
+ ):
44
+ if steps == 0:
45
+ if not args.sample_at_first:
46
+ return
47
+ else:
48
+ if args.sample_every_n_steps is None and args.sample_every_n_epochs is None:
49
+ return
50
+ if args.sample_every_n_epochs is not None:
51
+ # sample_every_n_steps is ignored
52
+ if epoch is None or epoch % args.sample_every_n_epochs != 0:
53
+ return
54
+ else:
55
+ if steps % args.sample_every_n_steps != 0 or epoch is not None: # steps is not divisible or end of epoch
56
+ return
57
+
58
+ logger.info("")
59
+ logger.info(f"generating sample images at step / サンプル画像生成 ステップ: {steps}")
60
+ if not os.path.isfile(args.sample_prompts) and sample_prompts_te_outputs is None:
61
+ logger.error(f"No prompt file / プロンプトファイルがありません: {args.sample_prompts}")
62
+ return
63
+
64
+ distributed_state = PartialState() # for multi gpu distributed inference. this is a singleton, so it's safe to use it here
65
+
66
+ # unwrap unet and text_encoder(s)
67
+ flux = accelerator.unwrap_model(flux)
68
+ if text_encoders is not None:
69
+ text_encoders = [accelerator.unwrap_model(te) for te in text_encoders]
70
+ # print([(te.parameters().__next__().device if te is not None else None) for te in text_encoders])
71
+
72
+ prompts = train_util.load_prompts(args.sample_prompts)
73
+
74
+ save_dir = args.output_dir + "/sample"
75
+ os.makedirs(save_dir, exist_ok=True)
76
+
77
+ # save random state to restore later
78
+ rng_state = torch.get_rng_state()
79
+ cuda_rng_state = None
80
+ try:
81
+ cuda_rng_state = torch.cuda.get_rng_state() if torch.cuda.is_available() else None
82
+ except Exception:
83
+ pass
84
+
85
+ if distributed_state.num_processes <= 1:
86
+ # If only one device is available, just use the original prompt list. We don't need to care about the distribution of prompts.
87
+ with torch.no_grad(), accelerator.autocast():
88
+ for prompt_dict in prompts:
89
+ sample_image_inference(
90
+ accelerator,
91
+ args,
92
+ flux,
93
+ text_encoders,
94
+ ae,
95
+ save_dir,
96
+ prompt_dict,
97
+ epoch,
98
+ steps,
99
+ sample_prompts_te_outputs,
100
+ prompt_replacement,
101
+ )
102
+ else:
103
+ # Creating list with N elements, where each element is a list of prompt_dicts, and N is the number of processes available (number of devices available)
104
+ # prompt_dicts are assigned to lists based on order of processes, to attempt to time the image creation time to match enum order. Probably only works when steps and sampler are identical.
105
+ per_process_prompts = [] # list of lists
106
+ for i in range(distributed_state.num_processes):
107
+ per_process_prompts.append(prompts[i :: distributed_state.num_processes])
108
+
109
+ with torch.no_grad():
110
+ with distributed_state.split_between_processes(per_process_prompts) as prompt_dict_lists:
111
+ for prompt_dict in prompt_dict_lists[0]:
112
+ sample_image_inference(
113
+ accelerator,
114
+ args,
115
+ flux,
116
+ text_encoders,
117
+ ae,
118
+ save_dir,
119
+ prompt_dict,
120
+ epoch,
121
+ steps,
122
+ sample_prompts_te_outputs,
123
+ prompt_replacement,
124
+ )
125
+
126
+ torch.set_rng_state(rng_state)
127
+ if cuda_rng_state is not None:
128
+ torch.cuda.set_rng_state(cuda_rng_state)
129
+
130
+ clean_memory_on_device(accelerator.device)
131
+
132
+
133
+ def sample_image_inference(
134
+ accelerator: Accelerator,
135
+ args: argparse.Namespace,
136
+ flux: flux_models.Flux,
137
+ text_encoders: Optional[List[CLIPTextModel]],
138
+ ae: flux_models.AutoEncoder,
139
+ save_dir,
140
+ prompt_dict,
141
+ epoch,
142
+ steps,
143
+ sample_prompts_te_outputs,
144
+ prompt_replacement,
145
+ ):
146
+ assert isinstance(prompt_dict, dict)
147
+ # negative_prompt = prompt_dict.get("negative_prompt")
148
+ sample_steps = prompt_dict.get("sample_steps", 20)
149
+ width = prompt_dict.get("width", 512)
150
+ height = prompt_dict.get("height", 512)
151
+ scale = prompt_dict.get("scale", 3.5)
152
+ seed = prompt_dict.get("seed")
153
+ # controlnet_image = prompt_dict.get("controlnet_image")
154
+ prompt: str = prompt_dict.get("prompt", "")
155
+ # sampler_name: str = prompt_dict.get("sample_sampler", args.sample_sampler)
156
+
157
+ if prompt_replacement is not None:
158
+ prompt = prompt.replace(prompt_replacement[0], prompt_replacement[1])
159
+ # if negative_prompt is not None:
160
+ # negative_prompt = negative_prompt.replace(prompt_replacement[0], prompt_replacement[1])
161
+
162
+ if seed is not None:
163
+ torch.manual_seed(seed)
164
+ torch.cuda.manual_seed(seed)
165
+ else:
166
+ # True random sample image generation
167
+ torch.seed()
168
+ torch.cuda.seed()
169
+
170
+ # if negative_prompt is None:
171
+ # negative_prompt = ""
172
+
173
+ height = max(64, height - height % 16) # round to divisible by 16
174
+ width = max(64, width - width % 16) # round to divisible by 16
175
+ logger.info(f"prompt: {prompt}")
176
+ # logger.info(f"negative_prompt: {negative_prompt}")
177
+ logger.info(f"height: {height}")
178
+ logger.info(f"width: {width}")
179
+ logger.info(f"sample_steps: {sample_steps}")
180
+ logger.info(f"scale: {scale}")
181
+ # logger.info(f"sample_sampler: {sampler_name}")
182
+ if seed is not None:
183
+ logger.info(f"seed: {seed}")
184
+
185
+ # encode prompts
186
+ tokenize_strategy = strategy_base.TokenizeStrategy.get_strategy()
187
+ encoding_strategy = strategy_base.TextEncodingStrategy.get_strategy()
188
+
189
+ text_encoder_conds = []
190
+ if sample_prompts_te_outputs and prompt in sample_prompts_te_outputs:
191
+ text_encoder_conds = sample_prompts_te_outputs[prompt]
192
+ print(f"Using cached text encoder outputs for prompt: {prompt}")
193
+ if text_encoders is not None:
194
+ print(f"Encoding prompt: {prompt}")
195
+ tokens_and_masks = tokenize_strategy.tokenize(prompt)
196
+ # strategy has apply_t5_attn_mask option
197
+ encoded_text_encoder_conds = encoding_strategy.encode_tokens(tokenize_strategy, text_encoders, tokens_and_masks)
198
+
199
+ # if text_encoder_conds is not cached, use encoded_text_encoder_conds
200
+ if len(text_encoder_conds) == 0:
201
+ text_encoder_conds = encoded_text_encoder_conds
202
+ else:
203
+ # if encoded_text_encoder_conds is not None, update cached text_encoder_conds
204
+ for i in range(len(encoded_text_encoder_conds)):
205
+ if encoded_text_encoder_conds[i] is not None:
206
+ text_encoder_conds[i] = encoded_text_encoder_conds[i]
207
+
208
+ l_pooled, t5_out, txt_ids, t5_attn_mask = text_encoder_conds
209
+
210
+ # sample image
211
+ weight_dtype = ae.dtype # TODO: pass dtype as an argument
212
+ packed_latent_height = height // 16
213
+ packed_latent_width = width // 16
214
+ noise = torch.randn(
215
+ 1,
216
+ packed_latent_height * packed_latent_width,
217
+ 16 * 2 * 2,
218
+ device=accelerator.device,
219
+ dtype=weight_dtype,
220
+ generator=torch.Generator(device=accelerator.device).manual_seed(seed) if seed is not None else None,
221
+ )
222
+ timesteps = get_schedule(sample_steps, noise.shape[1], shift=True) # FLUX.1 dev -> shift=True
223
+ img_ids = flux_utils.prepare_img_ids(1, packed_latent_height, packed_latent_width).to(accelerator.device, weight_dtype)
224
+ t5_attn_mask = t5_attn_mask.to(accelerator.device) if args.apply_t5_attn_mask else None
225
+
226
+ with accelerator.autocast(), torch.no_grad():
227
+ x = denoise(flux, noise, img_ids, t5_out, txt_ids, l_pooled, timesteps=timesteps, guidance=scale, t5_attn_mask=t5_attn_mask)
228
+
229
+ x = x.float()
230
+ x = flux_utils.unpack_latents(x, packed_latent_height, packed_latent_width)
231
+
232
+ # latent to image
233
+ clean_memory_on_device(accelerator.device)
234
+ org_vae_device = ae.device # will be on cpu
235
+ ae.to(accelerator.device) # distributed_state.device is same as accelerator.device
236
+ with accelerator.autocast(), torch.no_grad():
237
+ x = ae.decode(x)
238
+ ae.to(org_vae_device)
239
+ clean_memory_on_device(accelerator.device)
240
+
241
+ x = x.clamp(-1, 1)
242
+ x = x.permute(0, 2, 3, 1)
243
+ image = Image.fromarray((127.5 * (x + 1.0)).float().cpu().numpy().astype(np.uint8)[0])
244
+
245
+ # adding accelerator.wait_for_everyone() here should sync up and ensure that sample images are saved in the same order as the original prompt list
246
+ # but adding 'enum' to the filename should be enough
247
+
248
+ ts_str = time.strftime("%Y%m%d%H%M%S", time.localtime())
249
+ num_suffix = f"e{epoch:06d}" if epoch is not None else f"{steps:06d}"
250
+ seed_suffix = "" if seed is None else f"_{seed}"
251
+ i: int = prompt_dict["enum"]
252
+ img_filename = f"{'' if args.output_name is None else args.output_name + '_'}{num_suffix}_{i:02d}_{ts_str}{seed_suffix}.png"
253
+ image.save(os.path.join(save_dir, img_filename))
254
+
255
+ # send images to wandb if enabled
256
+ if "wandb" in [tracker.name for tracker in accelerator.trackers]:
257
+ wandb_tracker = accelerator.get_tracker("wandb")
258
+
259
+ import wandb
260
+
261
+ # not to commit images to avoid inconsistency between training and logging steps
262
+ wandb_tracker.log({f"sample_{i}": wandb.Image(image, caption=prompt)}, commit=False) # positive prompt as a caption
263
+
264
+
265
+ def time_shift(mu: float, sigma: float, t: torch.Tensor):
266
+ return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
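+ # note: with sigma=1 this equals the discrete flow shift t -> (shift * t) / (1 + (shift - 1) * t)
+ # with shift = exp(mu); larger mu keeps sampled timesteps closer to the noisy end (t near 1)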
267
+
268
+
269
+ def get_lin_function(x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15) -> Callable[[float], float]:
270
+ m = (y2 - y1) / (x2 - x1)
271
+ b = y1 - m * x1
272
+ return lambda x: m * x + b
273
+
274
+
275
+ def get_schedule(
276
+ num_steps: int,
277
+ image_seq_len: int,
278
+ base_shift: float = 0.5,
279
+ max_shift: float = 1.15,
280
+ shift: bool = True,
281
+ ) -> list[float]:
282
+ # extra step for zero
283
+ timesteps = torch.linspace(1, 0, num_steps + 1)
284
+
285
+ # shifting the schedule to favor high timesteps for higher signal images
286
+ if shift:
287
+ # estimate mu via linear interpolation between two points
288
+ mu = get_lin_function(y1=base_shift, y2=max_shift)(image_seq_len)
289
+ timesteps = time_shift(mu, 1.0, timesteps)
290
+
291
+ return timesteps.tolist()
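+ # illustrative example (hypothetical values): get_schedule(20, 4096) returns 21 values
+ # decreasing from 1.0 to 0.0; with shift=True, mu = get_lin_function()(4096) = 1.15, so
+ # early steps stay closer to 1.0 than the plain linear schedule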
292
+
293
+
294
+ def denoise(
295
+ model: flux_models.Flux,
296
+ img: torch.Tensor,
297
+ img_ids: torch.Tensor,
298
+ txt: torch.Tensor,
299
+ txt_ids: torch.Tensor,
300
+ vec: torch.Tensor,
301
+ timesteps: list[float],
302
+ guidance: float = 4.0,
303
+ t5_attn_mask: Optional[torch.Tensor] = None,
304
+ ):
305
+ # this is ignored for schnell
306
+ guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype)
307
+ for t_curr, t_prev in zip(tqdm(timesteps[:-1]), timesteps[1:]):
308
+ t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device)
309
+ model.prepare_block_swap_before_forward()
310
+ pred = model(
311
+ img=img,
312
+ img_ids=img_ids,
313
+ txt=txt,
314
+ txt_ids=txt_ids,
315
+ y=vec,
316
+ timesteps=t_vec,
317
+ guidance=guidance_vec,
318
+ txt_attention_mask=t5_attn_mask,
319
+ )
320
+
321
+ img = img + (t_prev - t_curr) * pred
322
+
323
+ model.prepare_block_swap_before_forward()
324
+ return img
325
+
326
+
327
+ # endregion
328
+
329
+
330
+ # region train
331
+ def get_sigmas(noise_scheduler, timesteps, device, n_dim=4, dtype=torch.float32):
332
+ sigmas = noise_scheduler.sigmas.to(device=device, dtype=dtype)
333
+ schedule_timesteps = noise_scheduler.timesteps.to(device)
334
+ timesteps = timesteps.to(device)
335
+ step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
336
+
337
+ sigma = sigmas[step_indices].flatten()
338
+ while len(sigma.shape) < n_dim:
339
+ sigma = sigma.unsqueeze(-1)
340
+ return sigma
341
+
342
+
343
+ def compute_density_for_timestep_sampling(
344
+ weighting_scheme: str, batch_size: int, logit_mean: float = None, logit_std: float = None, mode_scale: float = None
345
+ ):
346
+ """Compute the density for sampling the timesteps when doing SD3 training.
347
+
348
+ Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528.
349
+
350
+ SD3 paper reference: https://arxiv.org/abs/2403.03206v1.
351
+ """
352
+ if weighting_scheme == "logit_normal":
353
+ # See 3.1 in the SD3 paper ($rf/lognorm(0.00,1.00)$).
354
+ u = torch.normal(mean=logit_mean, std=logit_std, size=(batch_size,), device="cpu")
355
+ u = torch.nn.functional.sigmoid(u)
356
+ elif weighting_scheme == "mode":
357
+ u = torch.rand(size=(batch_size,), device="cpu")
358
+ u = 1 - u - mode_scale * (torch.cos(math.pi * u / 2) ** 2 - 1 + u)
359
+ else:
360
+ u = torch.rand(size=(batch_size,), device="cpu")
361
+ return u
362
+
363
+
364
+ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None):
365
+ """Computes loss weighting scheme for SD3 training.
366
+
367
+ Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528.
368
+
369
+ SD3 paper reference: https://arxiv.org/abs/2403.03206v1.
370
+ """
371
+ if weighting_scheme == "sigma_sqrt":
372
+ weighting = (sigmas**-2.0).float()
373
+ elif weighting_scheme == "cosmap":
374
+ bot = 1 - 2 * sigmas + 2 * sigmas**2
375
+ weighting = 2 / (math.pi * bot)
376
+ else:
377
+ weighting = torch.ones_like(sigmas)
378
+ return weighting
379
+
380
+
381
+ def get_noisy_model_input_and_timesteps(
382
+ args, noise_scheduler, latents, noise, device, dtype
383
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
384
+ bsz, _, h, w = latents.shape
385
+ sigmas = None
386
+
387
+ if args.timestep_sampling == "uniform" or args.timestep_sampling == "sigmoid":
388
+ # Simple random t-based noise sampling
389
+ if args.timestep_sampling == "sigmoid":
390
+ # https://github.com/XLabs-AI/x-flux/tree/main
391
+ t = torch.sigmoid(args.sigmoid_scale * torch.randn((bsz,), device=device))
392
+ else:
393
+ t = torch.rand((bsz,), device=device)
394
+
395
+ timesteps = t * 1000.0
396
+ t = t.view(-1, 1, 1, 1)
397
+ noisy_model_input = (1 - t) * latents + t * noise
398
+ elif args.timestep_sampling == "shift":
399
+ shift = args.discrete_flow_shift
400
+ logits_norm = torch.randn(bsz, device=device)
401
+ logits_norm = logits_norm * args.sigmoid_scale # larger scale for more uniform sampling
402
+ timesteps = logits_norm.sigmoid()
403
+ timesteps = (timesteps * shift) / (1 + (shift - 1) * timesteps)
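+ # illustrative example (hypothetical values): with discrete_flow_shift=3.0 and a raw
+ # sigmoid sample of 0.5, the shifted timestep is (0.5 * 3) / (1 + 2 * 0.5) = 0.75, i.e.
+ # sampling is biased toward noisier timesteps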
404
+
405
+ t = timesteps.view(-1, 1, 1, 1)
406
+ timesteps = timesteps * 1000.0
407
+ noisy_model_input = (1 - t) * latents + t * noise
408
+ elif args.timestep_sampling == "flux_shift":
409
+ logits_norm = torch.randn(bsz, device=device)
410
+ logits_norm = logits_norm * args.sigmoid_scale # larger scale for more uniform sampling
411
+ timesteps = logits_norm.sigmoid()
412
+ mu = get_lin_function(y1=0.5, y2=1.15)((h // 2) * (w // 2))
413
+ timesteps = time_shift(mu, 1.0, timesteps)
414
+
415
+ t = timesteps.view(-1, 1, 1, 1)
416
+ timesteps = timesteps * 1000.0
417
+ noisy_model_input = (1 - t) * latents + t * noise
418
+ else:
419
+ # Sample a random timestep for each image
420
+ # for weighting schemes where we sample timesteps non-uniformly
421
+ u = compute_density_for_timestep_sampling(
422
+ weighting_scheme=args.weighting_scheme,
423
+ batch_size=bsz,
424
+ logit_mean=args.logit_mean,
425
+ logit_std=args.logit_std,
426
+ mode_scale=args.mode_scale,
427
+ )
428
+ indices = (u * noise_scheduler.config.num_train_timesteps).long()
429
+ timesteps = noise_scheduler.timesteps[indices].to(device=device)
430
+
431
+ # Add noise according to flow matching.
432
+ sigmas = get_sigmas(noise_scheduler, timesteps, device, n_dim=latents.ndim, dtype=dtype)
433
+ noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents
434
+
435
+ return noisy_model_input, timesteps, sigmas
436
+
437
+
438
+ def apply_model_prediction_type(args, model_pred, noisy_model_input, sigmas):
439
+ weighting = None
440
+ if args.model_prediction_type == "raw":
441
+ pass
442
+ elif args.model_prediction_type == "additive":
443
+ # add the model_pred to the noisy_model_input
444
+ model_pred = model_pred + noisy_model_input
445
+ elif args.model_prediction_type == "sigma_scaled":
446
+ # apply sigma scaling
447
+ model_pred = model_pred * (-sigmas) + noisy_model_input
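+ # for flow matching with noisy_model_input = (1 - sigma) * latents + sigma * noise and a
+ # velocity-style prediction of roughly (noise - latents), noisy_model_input - sigma * pred
+ # recovers an estimate of the clean latents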
448
+
449
+ # these weighting schemes use a uniform timestep sampling
450
+ # and instead post-weight the loss
451
+ weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas)
452
+
453
+ return model_pred, weighting
454
+
455
+
456
+ def save_models(
457
+ ckpt_path: str,
458
+ flux: flux_models.Flux,
459
+ sai_metadata: Optional[dict],
460
+ save_dtype: Optional[torch.dtype] = None,
461
+ use_mem_eff_save: bool = False,
462
+ ):
463
+ state_dict = {}
464
+
465
+ def update_sd(prefix, sd):
466
+ for k, v in sd.items():
467
+ key = prefix + k
468
+ if save_dtype is not None and v.dtype != save_dtype:
469
+ v = v.detach().clone().to("cpu").to(save_dtype)
470
+ state_dict[key] = v
471
+
472
+ update_sd("", flux.state_dict())
473
+
474
+ if not use_mem_eff_save:
475
+ save_file(state_dict, ckpt_path, metadata=sai_metadata)
476
+ else:
477
+ mem_eff_save_file(state_dict, ckpt_path, metadata=sai_metadata)
478
+
479
+
480
+ def save_flux_model_on_train_end(
481
+ args: argparse.Namespace, save_dtype: torch.dtype, epoch: int, global_step: int, flux: flux_models.Flux
482
+ ):
483
+ def sd_saver(ckpt_file, epoch_no, global_step):
484
+ sai_metadata = train_util.get_sai_model_spec(None, args, False, False, False, is_stable_diffusion_ckpt=True, flux="dev")
485
+ save_models(ckpt_file, flux, sai_metadata, save_dtype, args.mem_eff_save)
486
+
487
+ train_util.save_sd_model_on_train_end_common(args, True, True, epoch, global_step, sd_saver, None)
488
+
489
+
490
+ # epoch-end and stepwise saving are unified here because the metadata includes epoch/step and the arguments are identical
491
+ # on_epoch_end: True to save at the end of an epoch, False to save after a number of steps
492
+ def save_flux_model_on_epoch_end_or_stepwise(
493
+ args: argparse.Namespace,
494
+ on_epoch_end: bool,
495
+ accelerator,
496
+ save_dtype: torch.dtype,
497
+ epoch: int,
498
+ num_train_epochs: int,
499
+ global_step: int,
500
+ flux: flux_models.Flux,
501
+ ):
502
+ def sd_saver(ckpt_file, epoch_no, global_step):
503
+ sai_metadata = train_util.get_sai_model_spec(None, args, False, False, False, is_stable_diffusion_ckpt=True, flux="dev")
504
+ save_models(ckpt_file, flux, sai_metadata, save_dtype, args.mem_eff_save)
505
+
506
+ train_util.save_sd_model_on_epoch_end_or_stepwise_common(
507
+ args,
508
+ on_epoch_end,
509
+ accelerator,
510
+ True,
511
+ True,
512
+ epoch,
513
+ num_train_epochs,
514
+ global_step,
515
+ sd_saver,
516
+ None,
517
+ )
518
+
519
+
520
+ # endregion
521
+
522
+
523
+ def add_flux_train_arguments(parser: argparse.ArgumentParser):
524
+ parser.add_argument(
525
+ "--clip_l",
526
+ type=str,
527
+ help="path to clip_l (*.sft or *.safetensors), should be float16 / clip_lのパス(*.sftまたは*.safetensors)、float16が前提",
528
+ )
529
+ parser.add_argument(
530
+ "--t5xxl",
531
+ type=str,
532
+ help="path to t5xxl (*.sft or *.safetensors), should be float16 / t5xxlのパス(*.sftまたは*.safetensors)、float16が前提",
533
+ )
534
+ parser.add_argument("--ae", type=str, help="path to ae (*.sft or *.safetensors) / aeのパス(*.sftまたは*.safetensors)")
535
+ parser.add_argument(
536
+ "--t5xxl_max_token_length",
537
+ type=int,
538
+ default=None,
539
+ help="maximum token length for T5-XXL. if omitted, 256 for schnell and 512 for dev"
540
+ " / T5-XXLの最大トークン長。省略された場合、schnellの場合は256、devの場合は512",
541
+ )
542
+ parser.add_argument(
543
+ "--apply_t5_attn_mask",
544
+ action="store_true",
545
+ help="apply attention mask to T5-XXL encode and FLUX double blocks / T5-XXLエンコードとFLUXダブルブロックにアテンションマスクを適用する",
546
+ )
547
+
548
+ parser.add_argument(
549
+ "--guidance_scale",
550
+ type=float,
551
+ default=3.5,
552
+ help="guidance scale for sampling; the FLUX.1 dev variant is a guidance-distilled model",
553
+ )
554
+
555
+ parser.add_argument(
556
+ "--timestep_sampling",
557
+ choices=["sigma", "uniform", "sigmoid", "shift", "flux_shift"],
558
+ default="sigma",
559
+ help="Method to sample timesteps: sigma-based, uniform random, sigmoid of random normal, shift of sigmoid and FLUX.1 shifting."
560
+ " / タイムステップをサンプリングする方法:sigma、random uniform、random normalのsigmoid、sigmoidのシフト、FLUX.1のシフト。",
561
+ )
562
+ parser.add_argument(
563
+ "--sigmoid_scale",
564
+ type=float,
565
+ default=1.0,
566
+ help='Scale factor for sigmoid timestep sampling (only used when timestep-sampling is "sigmoid"). / sigmoidタイムステップサンプリングの倍率(timestep-samplingが"sigmoid"の場合のみ有効)。',
567
+ )
568
+ parser.add_argument(
569
+ "--model_prediction_type",
570
+ choices=["raw", "additive", "sigma_scaled"],
571
+ default="sigma_scaled",
572
+ help="How to interpret and process the model prediction: "
573
+ "raw (use as is), additive (add to noisy input), sigma_scaled (apply sigma scaling)."
574
+ " / モデル予測の解釈と処理方法:"
575
+ "raw(そのまま使用)、additive(ノイズ入力に加算)、sigma_scaled(シグマスケーリングを適用)。",
576
+ )
577
+ parser.add_argument(
578
+ "--discrete_flow_shift",
579
+ type=float,
580
+ default=3.0,
581
+ help="Discrete flow shift for the Euler Discrete Scheduler, default is 3.0. / Euler Discrete Schedulerの離散フローシフト、デフォルトは3.0。",
582
+ )
library/flux_train_utils_recraft.py ADDED
@@ -0,0 +1,659 @@
1
+ import argparse
2
+ import math
3
+ import os
4
+ import numpy as np
5
+ import toml
6
+ import json
7
+ import time
8
+ from typing import Callable, Dict, List, Optional, Tuple, Union
9
+ import pdb
10
+
11
+ import torch
12
+ from accelerate import Accelerator, PartialState
13
+ from transformers import CLIPTextModel
14
+ from tqdm import tqdm
15
+ from PIL import Image
16
+ from safetensors.torch import save_file
17
+
18
+ from library import flux_models, flux_utils, strategy_base, train_util
19
+ from library.device_utils import init_ipex, clean_memory_on_device
20
+
21
+ init_ipex()
22
+
23
+ from .utils import setup_logging, mem_eff_save_file
24
+
25
+ setup_logging()
26
+ import logging
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
+ # region sample images
32
+
33
+ def sample_images(
34
+ accelerator: Accelerator,
35
+ args: argparse.Namespace,
36
+ epoch,
37
+ steps,
38
+ flux,
39
+ ae,
40
+ text_encoders,
41
+ sample_prompts_te_outputs,
42
+ prompt_replacement=None,
43
+ sample_images_ae_outputs=None
44
+ ):
45
+ if steps == 0:
46
+ if not args.sample_at_first:
47
+ return
48
+ else:
49
+ if args.sample_every_n_steps is None and args.sample_every_n_epochs is None:
50
+ return
51
+ if args.sample_every_n_epochs is not None:
52
+ # sample_every_n_steps is ignored
53
+ if epoch is None or epoch % args.sample_every_n_epochs != 0:
54
+ return
55
+ else:
56
+ if steps % args.sample_every_n_steps != 0 or epoch is not None: # steps is not divisible or end of epoch
57
+ return
58
+
59
+ logger.info("")
60
+ logger.info(f"generating sample images at step / サンプル画像生成 ステップ: {steps}")
61
+ if not os.path.isfile(args.sample_prompts) and sample_prompts_te_outputs is None:
62
+ logger.error(f"No prompt file / プロンプトファイルがありません: {args.sample_prompts}")
63
+ return
64
+
65
+ distributed_state = PartialState() # for multi gpu distributed inference. this is a singleton, so it's safe to use it here
66
+
67
+ # unwrap unet and text_encoder(s)
68
+ flux = accelerator.unwrap_model(flux)
69
+ if text_encoders is not None:
70
+ text_encoders = [accelerator.unwrap_model(te) for te in text_encoders]
71
+ # print([(te.parameters().__next__().device if te is not None else None) for te in text_encoders])
72
+
73
+ prompts = train_util.load_prompts(args.sample_prompts)
74
+
75
+ save_dir = args.output_dir + "/sample"
76
+ os.makedirs(save_dir, exist_ok=True)
77
+
78
+ # save random state to restore later
79
+ rng_state = torch.get_rng_state()
80
+ cuda_rng_state = None
81
+ try:
82
+ cuda_rng_state = torch.cuda.get_rng_state() if torch.cuda.is_available() else None
83
+ except Exception:
84
+ pass
85
+
86
+ if distributed_state.num_processes <= 1:
87
+ # If only one device is available, just use the original prompt list. We don't need to care about the distribution of prompts.
88
+ with torch.no_grad(), accelerator.autocast():
89
+ for prompt_dict in prompts:
90
+ sample_image_inference(
91
+ accelerator,
92
+ args,
93
+ flux,
94
+ text_encoders,
95
+ ae,
96
+ save_dir,
97
+ prompt_dict,
98
+ epoch,
99
+ steps,
100
+ sample_prompts_te_outputs,
101
+ prompt_replacement,
102
+ sample_images_ae_outputs
103
+ )
104
+ else:
105
+ # Creating list with N elements, where each element is a list of prompt_dicts, and N is the number of processes available (number of devices available)
106
+ # prompt_dicts are assigned to lists based on order of processes, to attempt to time the image creation time to match enum order. Probably only works when steps and sampler are identical.
107
+ per_process_prompts = [] # list of lists
108
+ for i in range(distributed_state.num_processes):
109
+ per_process_prompts.append(prompts[i :: distributed_state.num_processes])
110
+
111
+ with torch.no_grad():
112
+ with distributed_state.split_between_processes(per_process_prompts) as prompt_dict_lists:
113
+ for prompt_dict in prompt_dict_lists[0]:
114
+ sample_image_inference(
115
+ accelerator,
116
+ args,
117
+ flux,
118
+ text_encoders,
119
+ ae,
120
+ save_dir,
121
+ prompt_dict,
122
+ epoch,
123
+ steps,
124
+ sample_prompts_te_outputs,
125
+ prompt_replacement,
126
+ sample_images_ae_outputs
127
+ )
128
+
129
+ torch.set_rng_state(rng_state)
130
+ if cuda_rng_state is not None:
131
+ torch.cuda.set_rng_state(cuda_rng_state)
132
+
133
+ clean_memory_on_device(accelerator.device)
134
+
135
+
136
+ def sample_image_inference(
137
+ accelerator: Accelerator,
138
+ args: argparse.Namespace,
139
+ flux: flux_models.Flux,
140
+ text_encoders: Optional[List[CLIPTextModel]],
141
+ ae: flux_models.AutoEncoder,
142
+ save_dir,
143
+ prompt_dict,
144
+ epoch,
145
+ steps,
146
+ sample_prompts_te_outputs,
147
+ prompt_replacement,
148
+ sample_images_ae_outputs
149
+ ):
150
+ assert isinstance(prompt_dict, dict)
151
+ # negative_prompt = prompt_dict.get("negative_prompt")
152
+ sample_steps = prompt_dict.get("sample_steps", 20)
153
+ width = prompt_dict.get("width", 1024) if args.frame_num==4 else prompt_dict.get("width", 1056)
154
+ height = prompt_dict.get("height", 1024) if args.frame_num==4 else prompt_dict.get("height", 1056)
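+ # default canvas sizes (assumed to match the latent grids used in denoise()): 1024 for the
+ # 2x2 frame sheet (frame_num == 4), 1056 for the 3x3 sheet (1056 = 3 * 352, divisible by 16)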
155
+ scale = prompt_dict.get("scale", 1.0)
156
+ seed = prompt_dict.get("seed")
157
+ # controlnet_image = prompt_dict.get("controlnet_image")
158
+ prompt: str = prompt_dict.get("prompt", "")
159
+ # sampler_name: str = prompt_dict.get("sample_sampler", args.sample_sampler)
160
+
161
+ if prompt_replacement is not None:
162
+ prompt = prompt.replace(prompt_replacement[0], prompt_replacement[1])
163
+ # if negative_prompt is not None:
164
+ # negative_prompt = negative_prompt.replace(prompt_replacement[0], prompt_replacement[1])
165
+
166
+ if seed is not None:
167
+ torch.manual_seed(seed)
168
+ torch.cuda.manual_seed(seed)
169
+ else:
170
+ # True random sample image generation
171
+ torch.seed()
172
+ torch.cuda.seed()
173
+
174
+ # if negative_prompt is None:
175
+ # negative_prompt = ""
176
+
177
+ height = max(64, height - height % 16) # round to divisible by 16
178
+ width = max(64, width - width % 16) # round to divisible by 16
179
+ logger.info(f"prompt: {prompt}")
180
+ # logger.info(f"negative_prompt: {negative_prompt}")
181
+ logger.info(f"height: {height}")
182
+ logger.info(f"width: {width}")
183
+ logger.info(f"sample_steps: {sample_steps}")
184
+ logger.info(f"scale: {scale}")
185
+ # logger.info(f"sample_sampler: {sampler_name}")
186
+ if seed is not None:
187
+ logger.info(f"seed: {seed}")
188
+
189
+ # encode prompts
190
+ tokenize_strategy = strategy_base.TokenizeStrategy.get_strategy()
191
+ encoding_strategy = strategy_base.TextEncodingStrategy.get_strategy()
192
+
193
+ text_encoder_conds = []
194
+ if sample_prompts_te_outputs and prompt in sample_prompts_te_outputs:
195
+ text_encoder_conds = sample_prompts_te_outputs[prompt]
196
+ print(f"Using cached text encoder outputs for prompt: {prompt}")
197
+ if text_encoders is not None:
198
+ print(f"Encoding prompt: {prompt}")
199
+ tokens_and_masks = tokenize_strategy.tokenize(prompt)
200
+ # strategy has apply_t5_attn_mask option
201
+ encoded_text_encoder_conds = encoding_strategy.encode_tokens(tokenize_strategy, text_encoders, tokens_and_masks)
202
+
203
+ # if text_encoder_conds is not cached, use encoded_text_encoder_conds
204
+ if len(text_encoder_conds) == 0:
205
+ text_encoder_conds = encoded_text_encoder_conds
206
+ else:
207
+ # if encoded_text_encoder_conds is not None, update cached text_encoder_conds
208
+ for i in range(len(encoded_text_encoder_conds)):
209
+ if encoded_text_encoder_conds[i] is not None:
210
+ text_encoder_conds[i] = encoded_text_encoder_conds[i]
211
+
212
+ if sample_images_ae_outputs and prompt in sample_images_ae_outputs:
213
+ ae_outputs = sample_images_ae_outputs[prompt]
214
+ else:
215
+ ae_outputs = None
216
+
217
+ l_pooled, t5_out, txt_ids, t5_attn_mask = text_encoder_conds
218
+
219
+ # sample image
220
+ weight_dtype = ae.dtype # TODO: pass dtype as an argument
221
+ packed_latent_height = height // 16
222
+ packed_latent_width = width // 16
223
+ noise = torch.randn(
224
+ 1,
225
+ packed_latent_height * packed_latent_width,
226
+ 16 * 2 * 2,
227
+ device=accelerator.device,
228
+ dtype=weight_dtype,
229
+ generator=torch.Generator(device=accelerator.device).manual_seed(seed) if seed is not None else None,
230
+ )
231
+ timesteps = get_schedule(sample_steps, noise.shape[1], shift=True) # FLUX.1 dev -> shift=True
232
+ img_ids = flux_utils.prepare_img_ids(1, packed_latent_height, packed_latent_width).to(accelerator.device, weight_dtype)
233
+ t5_attn_mask = t5_attn_mask.to(accelerator.device) if args.apply_t5_attn_mask else None
234
+
235
+ with accelerator.autocast(), torch.no_grad():
236
+ x = denoise(args, flux, noise, img_ids, t5_out, txt_ids, l_pooled, timesteps=timesteps, guidance=scale, t5_attn_mask=t5_attn_mask, ae_outputs=ae_outputs)
237
+
238
+ x = x.float()
239
+ x = flux_utils.unpack_latents(x, packed_latent_height, packed_latent_width)
240
+
241
+ # latent to image
242
+ clean_memory_on_device(accelerator.device)
243
+ org_vae_device = ae.device # will be on cpu
244
+ ae.to(accelerator.device) # distributed_state.device is same as accelerator.device
245
+ with accelerator.autocast(), torch.no_grad():
246
+ x = ae.decode(x)
247
+ ae.to(org_vae_device)
248
+ clean_memory_on_device(accelerator.device)
249
+
250
+ x = x.clamp(-1, 1)
251
+ x = x.permute(0, 2, 3, 1)
252
+ image = Image.fromarray((127.5 * (x + 1.0)).float().cpu().numpy().astype(np.uint8)[0])
253
+
254
+ # adding accelerator.wait_for_everyone() here should sync up and ensure that sample images are saved in the same order as the original prompt list
255
+ # but adding 'enum' to the filename should be enough
256
+
257
+ ts_str = time.strftime("%Y%m%d%H%M%S", time.localtime())
258
+ num_suffix = f"e{epoch:06d}" if epoch is not None else f"{steps:06d}"
259
+ seed_suffix = "" if seed is None else f"_{seed}"
260
+ i: int = prompt_dict["enum"]
261
+ img_filename = f"{'' if args.output_name is None else args.output_name + '_'}{num_suffix}_{i:02d}_{ts_str}{seed_suffix}.png"
262
+ image.save(os.path.join(save_dir, img_filename))
263
+
264
+ # send images to wandb if enabled
265
+ if "wandb" in [tracker.name for tracker in accelerator.trackers]:
266
+ wandb_tracker = accelerator.get_tracker("wandb")
267
+
268
+ import wandb
269
+ # not to commit images to avoid inconsistency between training and logging steps
270
+ wandb_tracker.log(
271
+ {f"sample_{i}": wandb.Image(
272
+ image,
273
+ caption=prompt # positive prompt as a caption
274
+ )},
275
+ commit=False
276
+ )
277
+
278
+
279
+ def time_shift(mu: float, sigma: float, t: torch.Tensor):
280
+ return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
281
+
282
+
283
+ def get_lin_function(x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15) -> Callable[[float], float]:
284
+ m = (y2 - y1) / (x2 - x1)
285
+ b = y1 - m * x1
286
+ return lambda x: m * x + b
287
+
288
+
289
+ def get_schedule(
290
+ num_steps: int,
291
+ image_seq_len: int,
292
+ base_shift: float = 0.5,
293
+ max_shift: float = 1.15,
294
+ shift: bool = True,
295
+ ) -> list[float]:
296
+ # extra step for zero
297
+ timesteps = torch.linspace(1, 0, num_steps + 1)
298
+
299
+ # shifting the schedule to favor high timesteps for higher signal images
300
+ if shift:
301
+ # estimate mu via linear interpolation between two points
302
+ mu = get_lin_function(y1=base_shift, y2=max_shift)(image_seq_len)
303
+ timesteps = time_shift(mu, 1.0, timesteps)
304
+
305
+ return timesteps.tolist()
306
+
307
+
308
+ def denoise(
309
+ args: argparse.Namespace,
310
+ model: flux_models.Flux,
311
+ img: torch.Tensor,
312
+ img_ids: torch.Tensor,
313
+ txt: torch.Tensor,
314
+ txt_ids: torch.Tensor,
315
+ vec: torch.Tensor,
316
+ timesteps: list[float],
317
+ guidance: float = 4.0,
318
+ t5_attn_mask: Optional[torch.Tensor] = None,
319
+ ae_outputs: torch.Tensor = None,
320
+ ):
321
+ # this is ignored for schnell
322
+ guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype)
323
+ img_ids = img_ids.to(img.device)
324
+ txt_ids = txt_ids.to(img.device)
325
+ vec = vec.to(img.device)
326
+ txt = txt.to(img.device)
327
+
328
+ for t_curr, t_prev in zip(tqdm(timesteps[:-1]), timesteps[1:]):
329
+ t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device)
330
+ model.prepare_block_swap_before_forward()
331
+ if args.frame_num == 4:
332
+ packed_latent_height, packed_latent_width = ae_outputs.shape[2]*2 // 2, ae_outputs.shape[3]*2 // 2
333
+ img = flux_utils.unpack_latents(img, packed_latent_height, packed_latent_width)
334
+ img[:,:, img.shape[2] // 2: img.shape[2], :img.shape[3] // 2] = ae_outputs
335
+ else:
336
+ packed_latent_height, packed_latent_width = ae_outputs.shape[2]*3 // 2, ae_outputs.shape[3]*3 // 2
337
+ img = flux_utils.unpack_latents(img, packed_latent_height, packed_latent_width)
338
+ img[:,:, 2*img.shape[2] // 3: img.shape[2], 2*img.shape[3] // 3:img.shape[3]] = ae_outputs
339
+
340
+ img = flux_utils.pack_latents(img)
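+ # at every denoising step the encoded reference latents (ae_outputs) are written back
+ # into a fixed region of the unpacked latent grid (the lower-left quadrant of a 2x2
+ # sheet when frame_num == 4, otherwise the lower-right cell of a 3x3 sheet) before the
+ # latents are re-packed, so only the remaining cells are actually generated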
341
+ pred = model(
342
+ img=img,
343
+ img_ids=img_ids,
344
+ txt=txt,
345
+ txt_ids=txt_ids,
346
+ y=vec,
347
+ timesteps=t_vec,
348
+ guidance=guidance_vec,
349
+ txt_attention_mask=t5_attn_mask,
350
+ )
351
+
352
+ img = img + (t_prev - t_curr) * pred
353
+
354
+ model.prepare_block_swap_before_forward()
355
+ return img
356
+
357
+
358
+ # endregion
359
+
360
+
361
+ # region train
362
+ def get_sigmas(noise_scheduler, timesteps, device, n_dim=4, dtype=torch.float32):
363
+ sigmas = noise_scheduler.sigmas.to(device=device, dtype=dtype)
364
+ schedule_timesteps = noise_scheduler.timesteps.to(device)
365
+ timesteps = timesteps.to(device)
366
+ step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
367
+
368
+ sigma = sigmas[step_indices].flatten()
369
+ while len(sigma.shape) < n_dim:
370
+ sigma = sigma.unsqueeze(-1)
371
+ return sigma
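+
+ # Note: for 4D latents this returns the sigma of each sampled timestep reshaped to
+ # [bsz, 1, 1, 1] so that it broadcasts over the channel and spatial dimensions.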
372
+
373
+
374
+ def compute_density_for_timestep_sampling(
375
+ weighting_scheme: str, batch_size: int, logit_mean: float = None, logit_std: float = None, mode_scale: float = None
376
+ ):
377
+ """Compute the density for sampling the timesteps when doing SD3 training.
378
+
379
+ Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528.
380
+
381
+ SD3 paper reference: https://arxiv.org/abs/2403.03206v1.
382
+ """
383
+ if weighting_scheme == "logit_normal":
384
+ # See 3.1 in the SD3 paper ($rf/lognorm(0.00,1.00)$).
385
+ u = torch.normal(mean=logit_mean, std=logit_std, size=(batch_size,), device="cpu")
386
+ u = torch.nn.functional.sigmoid(u)
387
+ elif weighting_scheme == "mode":
388
+ u = torch.rand(size=(batch_size,), device="cpu")
389
+ u = 1 - u - mode_scale * (torch.cos(math.pi * u / 2) ** 2 - 1 + u)
390
+ else:
391
+ u = torch.rand(size=(batch_size,), device="cpu")
392
+ return u
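+
+ # Example (illustrative sketch): with the "logit_normal" scheme, u is sigmoid(N(logit_mean, logit_std)),
+ # i.e. concentrated around sigmoid(logit_mean) and bounded to (0, 1):
+ #   u = compute_density_for_timestep_sampling("logit_normal", batch_size=4, logit_mean=0.0, logit_std=1.0)
+ #   indices = (u * noise_scheduler.config.num_train_timesteps).long()  # as done below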
393
+
394
+
395
+ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None):
396
+ """Computes loss weighting scheme for SD3 training.
397
+
398
+ Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528.
399
+
400
+ SD3 paper reference: https://arxiv.org/abs/2403.03206v1.
401
+ """
402
+ if weighting_scheme == "sigma_sqrt":
403
+ weighting = (sigmas**-2.0).float()
404
+ elif weighting_scheme == "cosmap":
405
+ bot = 1 - 2 * sigmas + 2 * sigmas**2
406
+ weighting = 2 / (math.pi * bot)
407
+ else:
408
+ weighting = torch.ones_like(sigmas)
409
+ return weighting
410
+
411
+
412
+ def get_noisy_model_input_and_timesteps(
413
+ args, noise_scheduler, latents, noise, device, dtype
414
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
415
+ bsz, _, h, w = latents.shape
416
+ sigmas = None
417
+
418
+ if args.timestep_sampling == "uniform" or args.timestep_sampling == "sigmoid":
419
+ # Simple random t-based noise sampling
420
+ if args.timestep_sampling == "sigmoid":
421
+ # https://github.com/XLabs-AI/x-flux/tree/main
422
+ t = torch.sigmoid(args.sigmoid_scale * torch.randn((bsz,), device=device))
423
+ else:
424
+ t = torch.rand((bsz,), device=device)
425
+
426
+ timesteps = t * 1000.0
427
+ t = t.view(-1, 1, 1, 1)
428
+ noisy_model_input = (1 - t) * latents + t * noise
429
+ elif args.timestep_sampling == "shift":
430
+ shift = args.discrete_flow_shift
431
+ logits_norm = torch.randn(bsz, device=device)
432
+ logits_norm = logits_norm * args.sigmoid_scale # larger scale for more uniform sampling
433
+ timesteps = logits_norm.sigmoid()
434
+ timesteps = (timesteps * shift) / (1 + (shift - 1) * timesteps)
435
+
436
+ t = timesteps.view(-1, 1, 1, 1)
437
+ timesteps = timesteps * 1000.0
438
+ noisy_model_input = (1 - t) * latents + t * noise
439
+ elif args.timestep_sampling == "flux_shift":
440
+ logits_norm = torch.randn(bsz, device=device)
441
+ logits_norm = logits_norm * args.sigmoid_scale # larger scale for more uniform sampling
442
+ timesteps = logits_norm.sigmoid()
443
+ mu = get_lin_function(y1=0.5, y2=1.15)((h // 2) * (w // 2))
444
+ timesteps = time_shift(mu, 1.0, timesteps)
445
+
446
+ t = timesteps.view(-1, 1, 1, 1)
447
+ timesteps = timesteps * 1000.0
448
+ noisy_model_input = (1 - t) * latents + t * noise
449
+ else:
450
+ # Sample a random timestep for each image
451
+ # for weighting schemes where we sample timesteps non-uniformly
452
+ u = compute_density_for_timestep_sampling(
453
+ weighting_scheme=args.weighting_scheme,
454
+ batch_size=bsz,
455
+ logit_mean=args.logit_mean,
456
+ logit_std=args.logit_std,
457
+ mode_scale=args.mode_scale,
458
+ )
459
+ indices = (u * noise_scheduler.config.num_train_timesteps).long()
460
+ timesteps = noise_scheduler.timesteps[indices].to(device=device)
461
+
462
+ # Add noise according to flow matching.
463
+ sigmas = get_sigmas(noise_scheduler, timesteps, device, n_dim=latents.ndim, dtype=dtype)
464
+ noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents
465
+
466
+ # replace part of the noisy input region with the original latents
467
+ h, w = noisy_model_input.shape[2], noisy_model_input.shape[3]
468
+ # import pdb; pdb.set_trace()
469
+ if args.frame_num == 4:
470
+ noisy_model_input[:, :, h//2 : h, w//2 : w] = latents[:, :, h//2:h, w//2:w]
471
+ else:
472
+ noisy_model_input[:, :, 2*h//3 : h, 2*w//3 : w] = latents[:, :, 2*h//3:h, 2*w//3:w]
473
+
474
+
475
+ return noisy_model_input, timesteps, sigmas
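+
+ # Note: every branch above builds the flow-matching interpolation
+ #   x_t = (1 - t) * latents + t * noise   (equivalently sigmas * noise + (1 - sigmas) * latents),
+ # and then the conditioning region of x_t is overwritten with the clean latents, so only the
+ # generated portion of the grid is actually noised.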
476
+
477
+
478
+ def apply_model_prediction_type(args, model_pred, noisy_model_input, sigmas):
479
+ weighting = None
480
+ if args.model_prediction_type == "raw":
481
+ pass
482
+ elif args.model_prediction_type == "additive":
483
+ # add the model_pred to the noisy_model_input
484
+ model_pred = model_pred + noisy_model_input
485
+ elif args.model_prediction_type == "sigma_scaled":
486
+ # apply sigma scaling
487
+ model_pred = model_pred * (-sigmas) + noisy_model_input
488
+
489
+ # these weighting schemes use a uniform timestep sampling
490
+ # and instead post-weight the loss
491
+ weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas)
492
+
493
+ return model_pred, weighting
494
+
495
+
496
+ def save_models(
497
+ ckpt_path: str,
498
+ flux: flux_models.Flux,
499
+ sai_metadata: Optional[dict],
500
+ save_dtype: Optional[torch.dtype] = None,
501
+ use_mem_eff_save: bool = False,
502
+ ):
503
+ state_dict = {}
504
+
505
+ def update_sd(prefix, sd):
506
+ for k, v in sd.items():
507
+ key = prefix + k
508
+ if save_dtype is not None and v.dtype != save_dtype:
509
+ v = v.detach().clone().to("cpu").to(save_dtype)
510
+ state_dict[key] = v
511
+
512
+ update_sd("", flux.state_dict())
513
+
514
+ if not use_mem_eff_save:
515
+ save_file(state_dict, ckpt_path, metadata=sai_metadata)
516
+ else:
517
+ mem_eff_save_file(state_dict, ckpt_path, metadata=sai_metadata)
518
+
519
+
520
+ def save_flux_model_on_train_end(
521
+ args: argparse.Namespace, save_dtype: torch.dtype, epoch: int, global_step: int, flux: flux_models.Flux
522
+ ):
523
+ def sd_saver(ckpt_file, epoch_no, global_step):
524
+ sai_metadata = train_util.get_sai_model_spec(None, args, False, False, False, is_stable_diffusion_ckpt=True, flux="dev")
525
+ save_models(ckpt_file, flux, sai_metadata, save_dtype, args.mem_eff_save)
526
+
527
+ train_util.save_sd_model_on_train_end_common(args, True, True, epoch, global_step, sd_saver, None)
528
+
529
+
530
+ # epoch- and step-based saving are merged here because the metadata includes epoch/step and the arguments are identical
531
+ # on_epoch_end: True = at the end of an epoch, False = after the given number of steps
532
+ def save_flux_model_on_epoch_end_or_stepwise(
533
+ args: argparse.Namespace,
534
+ on_epoch_end: bool,
535
+ accelerator,
536
+ save_dtype: torch.dtype,
537
+ epoch: int,
538
+ num_train_epochs: int,
539
+ global_step: int,
540
+ flux: flux_models.Flux,
541
+ ):
542
+ def sd_saver(ckpt_file, epoch_no, global_step):
543
+ sai_metadata = train_util.get_sai_model_spec(None, args, False, False, False, is_stable_diffusion_ckpt=True, flux="dev")
544
+ save_models(ckpt_file, flux, sai_metadata, save_dtype, args.mem_eff_save)
545
+
546
+ train_util.save_sd_model_on_epoch_end_or_stepwise_common(
547
+ args,
548
+ on_epoch_end,
549
+ accelerator,
550
+ True,
551
+ True,
552
+ epoch,
553
+ num_train_epochs,
554
+ global_step,
555
+ sd_saver,
556
+ None,
557
+ )
558
+
559
+
560
+ # endregion
561
+
562
+
563
+ def add_flux_train_arguments(parser: argparse.ArgumentParser):
564
+ parser.add_argument(
565
+ "--clip_l",
566
+ type=str,
567
+ help="path to clip_l (*.sft or *.safetensors), should be float16 / clip_lのパス(*.sftまたは*.safetensors)、float16が前提",
568
+ )
569
+ parser.add_argument(
570
+ "--t5xxl",
571
+ type=str,
572
+ help="path to t5xxl (*.sft or *.safetensors), should be float16 / t5xxlのパス(*.sftまたは*.safetensors)、float16が前提",
573
+ )
574
+ parser.add_argument("--ae", type=str, help="path to ae (*.sft or *.safetensors) / aeのパス(*.sftまたは*.safetensors)")
575
+ parser.add_argument(
576
+ "--t5xxl_max_token_length",
577
+ type=int,
578
+ default=None,
579
+ help="maximum token length for T5-XXL. if omitted, 256 for schnell and 512 for dev"
580
+ " / T5-XXLの最大トークン長。省略された場合、schnellの場合は256、devの場合は512",
581
+ )
582
+ parser.add_argument(
583
+ "--apply_t5_attn_mask",
584
+ action="store_true",
585
+ help="apply attention mask to T5-XXL encode and FLUX double blocks / T5-XXLエンコードとFLUXダブルブロックにアテンションマスクを適用する",
586
+ )
587
+ parser.add_argument(
588
+ "--cache_text_encoder_outputs", action="store_true", help="cache text encoder outputs / text encoderの出力をキャッシュする"
589
+ )
590
+ parser.add_argument(
591
+ "--cache_text_encoder_outputs_to_disk",
592
+ action="store_true",
593
+ help="cache text encoder outputs to disk / text encoderの出力をディスクにキャッシュする",
594
+ )
595
+ parser.add_argument(
596
+ "--text_encoder_batch_size",
597
+ type=int,
598
+ default=None,
599
+ help="text encoder batch size (default: None, use dataset's batch size)"
600
+ + " / text encoderのバッチサイズ(デフォルト: None, データセットのバッチサイズを使用)",
601
+ )
602
+ parser.add_argument(
603
+ "--disable_mmap_load_safetensors",
604
+ action="store_true",
605
+ help="disable mmap load for safetensors. Speed up model loading in WSL environment / safetensorsのmmapロードを無効にする。WSL環境等でモデル読み込みを高速化できる",
606
+ )
607
+
608
+ # copy from Diffusers
609
+ parser.add_argument(
610
+ "--weighting_scheme",
611
+ type=str,
612
+ default="none",
613
+ choices=["sigma_sqrt", "logit_normal", "mode", "cosmap", "none"],
614
+ )
615
+ parser.add_argument(
616
+ "--logit_mean", type=float, default=0.0, help="mean to use when using the `'logit_normal'` weighting scheme."
617
+ )
618
+ parser.add_argument("--logit_std", type=float, default=1.0, help="std to use when using the `'logit_normal'` weighting scheme.")
619
+ parser.add_argument(
620
+ "--mode_scale",
621
+ type=float,
622
+ default=1.29,
623
+ help="Scale of mode weighting scheme. Only effective when using the `'mode'` as the `weighting_scheme`.",
624
+ )
625
+ parser.add_argument(
626
+ "--guidance_scale",
627
+ type=float,
628
+ default=3.5,
629
+ help="the FLUX.1 dev variant is a guidance distilled model",
630
+ )
631
+
632
+ parser.add_argument(
633
+ "--timestep_sampling",
634
+ choices=["sigma", "uniform", "sigmoid", "shift", "flux_shift"],
635
+ default="sigma",
636
+ help="Method to sample timesteps: sigma-based, uniform random, sigmoid of random normal, shift of sigmoid and FLUX.1 shifting."
637
+ " / タイムステップをサンプリングする方法:sigma、random uniform、random normalのsigmoid、sigmoidのシフト、FLUX.1のシフト。",
638
+ )
639
+ parser.add_argument(
640
+ "--sigmoid_scale",
641
+ type=float,
642
+ default=1.0,
643
+ help='Scale factor for sigmoid timestep sampling (only used when timestep-sampling is "sigmoid"). / sigmoidタイムステップサンプリングの倍率(timestep-samplingが"sigmoid"の場合のみ有効)。',
644
+ )
645
+ parser.add_argument(
646
+ "--model_prediction_type",
647
+ choices=["raw", "additive", "sigma_scaled"],
648
+ default="sigma_scaled",
649
+ help="How to interpret and process the model prediction: "
650
+ "raw (use as is), additive (add to noisy input), sigma_scaled (apply sigma scaling)."
651
+ " / モデル予測の解釈と処理方法:"
652
+ "raw(そのまま使用)、additive(ノイズ入力に加算)、sigma_scaled(シグマスケーリングを適用)。",
653
+ )
654
+ parser.add_argument(
655
+ "--discrete_flow_shift",
656
+ type=float,
657
+ default=3.0,
658
+ help="Discrete flow shift for the Euler Discrete Scheduler, default is 3.0. / Euler Discrete Schedulerの離散フローシフト、デフォルトは3.0。",
659
+ )
library/flux_utils.py ADDED
@@ -0,0 +1,472 @@
1
+ from dataclasses import replace
2
+ import json
3
+ import os
4
+ from typing import List, Optional, Tuple, Union
5
+ import einops
6
+ import torch
7
+
8
+ from safetensors.torch import load_file
9
+ from safetensors import safe_open
10
+ from accelerate import init_empty_weights
11
+ from transformers import CLIPTextModel, CLIPConfig, T5EncoderModel, T5Config
12
+
13
+ from library.utils import setup_logging
14
+
15
+ setup_logging()
16
+ import logging
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ from library import flux_models
21
+ from library.utils import load_safetensors
22
+
23
+ MODEL_VERSION_FLUX_V1 = "flux1"
24
+ MODEL_NAME_DEV = "dev"
25
+ MODEL_NAME_SCHNELL = "schnell"
26
+
27
+
28
+ def analyze_checkpoint_state(ckpt_path: str) -> Tuple[bool, bool, Tuple[int, int], List[str]]:
29
+ """
30
+ Analyze the checkpoint state and return whether it is Diffusers or BFL, dev or schnell, and the number of blocks.
31
+
32
+ Args:
33
+ ckpt_path (str): Path to the checkpoint file or directory.
34
+
35
+ Returns:
36
+ Tuple[bool, bool, Tuple[int, int], List[str]]:
37
+ - bool: Flag indicating whether the checkpoint is in Diffusers format.
38
+ - bool: Flag indicating whether the model is schnell.
39
+ - Tuple[int, int]: The numbers of double and single blocks.
40
+ - List[str]: The list of keys contained in the checkpoint.
41
+ """
42
+ # check the state dict: Diffusers or BFL, dev or schnell, number of blocks
43
+ logger.info(f"Checking the state dict: Diffusers or BFL, dev or schnell")
44
+
45
+ if os.path.isdir(ckpt_path): # if ckpt_path is a directory, it is Diffusers
46
+ ckpt_path = os.path.join(ckpt_path, "transformer", "diffusion_pytorch_model-00001-of-00003.safetensors")
47
+ if "00001-of-00003" in ckpt_path:
48
+ ckpt_paths = [ckpt_path.replace("00001-of-00003", f"0000{i}-of-00003") for i in range(1, 4)]
49
+ else:
50
+ ckpt_paths = [ckpt_path]
51
+
52
+ keys = []
53
+ for ckpt_path in ckpt_paths:
54
+ with safe_open(ckpt_path, framework="pt") as f:
55
+ keys.extend(f.keys())
56
+
57
+ # if the key has annoying prefix, remove it
58
+ if keys[0].startswith("model.diffusion_model."):
59
+ keys = [key.replace("model.diffusion_model.", "") for key in keys]
60
+
61
+ is_diffusers = "transformer_blocks.0.attn.add_k_proj.bias" in keys
62
+ is_schnell = not ("guidance_in.in_layer.bias" in keys or "time_text_embed.guidance_embedder.linear_1.bias" in keys)
63
+
64
+ # check number of double and single blocks
65
+ if not is_diffusers:
66
+ max_double_block_index = max(
67
+ [int(key.split(".")[1]) for key in keys if key.startswith("double_blocks.") and key.endswith(".img_attn.proj.bias")]
68
+ )
69
+ max_single_block_index = max(
70
+ [int(key.split(".")[1]) for key in keys if key.startswith("single_blocks.") and key.endswith(".modulation.lin.bias")]
71
+ )
72
+ else:
73
+ max_double_block_index = max(
74
+ [
75
+ int(key.split(".")[1])
76
+ for key in keys
77
+ if key.startswith("transformer_blocks.") and key.endswith(".attn.add_k_proj.bias")
78
+ ]
79
+ )
80
+ max_single_block_index = max(
81
+ [
82
+ int(key.split(".")[1])
83
+ for key in keys
84
+ if key.startswith("single_transformer_blocks.") and key.endswith(".attn.to_k.bias")
85
+ ]
86
+ )
87
+
88
+ num_double_blocks = max_double_block_index + 1
89
+ num_single_blocks = max_single_block_index + 1
90
+
91
+ return is_diffusers, is_schnell, (num_double_blocks, num_single_blocks), ckpt_paths
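+
+ # Example (illustrative sketch, the path is hypothetical):
+ #   is_diffusers, is_schnell, (n_double, n_single), paths = analyze_checkpoint_state("flux1-dev.safetensors")
+ #   # a stock FLUX.1 dev checkpoint in BFL format yields (False, False, (19, 38), [path])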
92
+
93
+
94
+ def load_flow_model(
95
+ ckpt_path: str, dtype: Optional[torch.dtype], device: Union[str, torch.device], disable_mmap: bool = False
96
+ ) -> Tuple[bool, flux_models.Flux]:
97
+ is_diffusers, is_schnell, (num_double_blocks, num_single_blocks), ckpt_paths = analyze_checkpoint_state(ckpt_path)
98
+ name = MODEL_NAME_DEV if not is_schnell else MODEL_NAME_SCHNELL
99
+
100
+ # build model
101
+ logger.info(f"Building Flux model {name} from {'Diffusers' if is_diffusers else 'BFL'} checkpoint")
102
+ with torch.device("meta"):
103
+ params = flux_models.configs[name].params
104
+
105
+ # set the number of blocks
106
+ if params.depth != num_double_blocks:
107
+ logger.info(f"Setting the number of double blocks from {params.depth} to {num_double_blocks}")
108
+ params = replace(params, depth=num_double_blocks)
109
+ if params.depth_single_blocks != num_single_blocks:
110
+ logger.info(f"Setting the number of single blocks from {params.depth_single_blocks} to {num_single_blocks}")
111
+ params = replace(params, depth_single_blocks=num_single_blocks)
112
+
113
+ model = flux_models.Flux(params)
114
+ if dtype is not None:
115
+ model = model.to(dtype)
116
+
117
+ # load_sft doesn't support torch.device
118
+ logger.info(f"Loading state dict from {ckpt_path}")
119
+ sd = {}
120
+ for ckpt_path in ckpt_paths:
121
+ sd.update(load_safetensors(ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype))
122
+
123
+ # convert Diffusers to BFL
124
+ if is_diffusers:
125
+ logger.info("Converting Diffusers to BFL")
126
+ sd = convert_diffusers_sd_to_bfl(sd, num_double_blocks, num_single_blocks)
127
+ logger.info("Converted Diffusers to BFL")
128
+
129
+ # if the key has annoying prefix, remove it
130
+ for key in list(sd.keys()):
131
+ new_key = key.replace("model.diffusion_model.", "")
132
+ if new_key == key:
133
+ break # the model doesn't have annoying prefix
134
+ sd[new_key] = sd.pop(key)
135
+
136
+ info = model.load_state_dict(sd, strict=False, assign=True)
137
+ logger.info(f"Loaded Flux: {info}")
138
+ return is_schnell, model
139
+
140
+
141
+ def load_ae(
142
+ ckpt_path: str, dtype: torch.dtype, device: Union[str, torch.device], disable_mmap: bool = False
143
+ ) -> flux_models.AutoEncoder:
144
+ logger.info("Building AutoEncoder")
145
+ with torch.device("meta"):
146
+ # dev and schnell have the same AE params
147
+ ae = flux_models.AutoEncoder(flux_models.configs[MODEL_NAME_DEV].ae_params).to(dtype)
148
+
149
+ logger.info(f"Loading state dict from {ckpt_path}")
150
+ sd = load_safetensors(ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype)
151
+ info = ae.load_state_dict(sd, strict=False, assign=True)
152
+ logger.info(f"Loaded AE: {info}")
153
+ return ae
154
+
155
+
156
+ def load_clip_l(
157
+ ckpt_path: Optional[str],
158
+ dtype: torch.dtype,
159
+ device: Union[str, torch.device],
160
+ disable_mmap: bool = False,
161
+ state_dict: Optional[dict] = None,
162
+ ) -> CLIPTextModel:
163
+ logger.info("Building CLIP-L")
164
+ CLIPL_CONFIG = {
165
+ "_name_or_path": "clip-vit-large-patch14/",
166
+ "architectures": ["CLIPModel"],
167
+ "initializer_factor": 1.0,
168
+ "logit_scale_init_value": 2.6592,
169
+ "model_type": "clip",
170
+ "projection_dim": 768,
171
+ # "text_config": {
172
+ "_name_or_path": "",
173
+ "add_cross_attention": False,
174
+ "architectures": None,
175
+ "attention_dropout": 0.0,
176
+ "bad_words_ids": None,
177
+ "bos_token_id": 0,
178
+ "chunk_size_feed_forward": 0,
179
+ "cross_attention_hidden_size": None,
180
+ "decoder_start_token_id": None,
181
+ "diversity_penalty": 0.0,
182
+ "do_sample": False,
183
+ "dropout": 0.0,
184
+ "early_stopping": False,
185
+ "encoder_no_repeat_ngram_size": 0,
186
+ "eos_token_id": 2,
187
+ "finetuning_task": None,
188
+ "forced_bos_token_id": None,
189
+ "forced_eos_token_id": None,
190
+ "hidden_act": "quick_gelu",
191
+ "hidden_size": 768,
192
+ "id2label": {"0": "LABEL_0", "1": "LABEL_1"},
193
+ "initializer_factor": 1.0,
194
+ "initializer_range": 0.02,
195
+ "intermediate_size": 3072,
196
+ "is_decoder": False,
197
+ "is_encoder_decoder": False,
198
+ "label2id": {"LABEL_0": 0, "LABEL_1": 1},
199
+ "layer_norm_eps": 1e-05,
200
+ "length_penalty": 1.0,
201
+ "max_length": 20,
202
+ "max_position_embeddings": 77,
203
+ "min_length": 0,
204
+ "model_type": "clip_text_model",
205
+ "no_repeat_ngram_size": 0,
206
+ "num_attention_heads": 12,
207
+ "num_beam_groups": 1,
208
+ "num_beams": 1,
209
+ "num_hidden_layers": 12,
210
+ "num_return_sequences": 1,
211
+ "output_attentions": False,
212
+ "output_hidden_states": False,
213
+ "output_scores": False,
214
+ "pad_token_id": 1,
215
+ "prefix": None,
216
+ "problem_type": None,
217
+ "projection_dim": 768,
218
+ "pruned_heads": {},
219
+ "remove_invalid_values": False,
220
+ "repetition_penalty": 1.0,
221
+ "return_dict": True,
222
+ "return_dict_in_generate": False,
223
+ "sep_token_id": None,
224
+ "task_specific_params": None,
225
+ "temperature": 1.0,
226
+ "tie_encoder_decoder": False,
227
+ "tie_word_embeddings": True,
228
+ "tokenizer_class": None,
229
+ "top_k": 50,
230
+ "top_p": 1.0,
231
+ "torch_dtype": None,
232
+ "torchscript": False,
233
+ "transformers_version": "4.16.0.dev0",
234
+ "use_bfloat16": False,
235
+ "vocab_size": 49408,
236
+ "hidden_act": "gelu",
237
+ "hidden_size": 1280,
238
+ "intermediate_size": 5120,
239
+ "num_attention_heads": 20,
240
+ "num_hidden_layers": 32,
241
+ # },
242
+ # "text_config_dict": {
243
+ "hidden_size": 768,
244
+ "intermediate_size": 3072,
245
+ "num_attention_heads": 12,
246
+ "num_hidden_layers": 12,
247
+ "projection_dim": 768,
248
+ # },
249
+ # "torch_dtype": "float32",
250
+ # "transformers_version": None,
251
+ }
252
+ config = CLIPConfig(**CLIPL_CONFIG)
253
+ with init_empty_weights():
254
+ clip = CLIPTextModel._from_config(config)
255
+
256
+ if state_dict is not None:
257
+ sd = state_dict
258
+ else:
259
+ logger.info(f"Loading state dict from {ckpt_path}")
260
+ sd = load_safetensors(ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype)
261
+ info = clip.load_state_dict(sd, strict=False, assign=True)
262
+ logger.info(f"Loaded CLIP-L: {info}")
263
+ return clip
264
+
265
+
266
+ def load_t5xxl(
267
+ ckpt_path: str,
268
+ dtype: Optional[torch.dtype],
269
+ device: Union[str, torch.device],
270
+ disable_mmap: bool = False,
271
+ state_dict: Optional[dict] = None,
272
+ ) -> T5EncoderModel:
273
+ T5_CONFIG_JSON = """
274
+ {
275
+ "architectures": [
276
+ "T5EncoderModel"
277
+ ],
278
+ "classifier_dropout": 0.0,
279
+ "d_ff": 10240,
280
+ "d_kv": 64,
281
+ "d_model": 4096,
282
+ "decoder_start_token_id": 0,
283
+ "dense_act_fn": "gelu_new",
284
+ "dropout_rate": 0.1,
285
+ "eos_token_id": 1,
286
+ "feed_forward_proj": "gated-gelu",
287
+ "initializer_factor": 1.0,
288
+ "is_encoder_decoder": true,
289
+ "is_gated_act": true,
290
+ "layer_norm_epsilon": 1e-06,
291
+ "model_type": "t5",
292
+ "num_decoder_layers": 24,
293
+ "num_heads": 64,
294
+ "num_layers": 24,
295
+ "output_past": true,
296
+ "pad_token_id": 0,
297
+ "relative_attention_max_distance": 128,
298
+ "relative_attention_num_buckets": 32,
299
+ "tie_word_embeddings": false,
300
+ "torch_dtype": "float16",
301
+ "transformers_version": "4.41.2",
302
+ "use_cache": true,
303
+ "vocab_size": 32128
304
+ }
305
+ """
306
+ config = json.loads(T5_CONFIG_JSON)
307
+ config = T5Config(**config)
308
+ with init_empty_weights():
309
+ t5xxl = T5EncoderModel._from_config(config)
310
+
311
+ if state_dict is not None:
312
+ sd = state_dict
313
+ else:
314
+ logger.info(f"Loading state dict from {ckpt_path}")
315
+ sd = load_safetensors(ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype)
316
+ info = t5xxl.load_state_dict(sd, strict=False, assign=True)
317
+ logger.info(f"Loaded T5xxl: {info}")
318
+ return t5xxl
319
+
320
+
321
+ def get_t5xxl_actual_dtype(t5xxl: T5EncoderModel) -> torch.dtype:
322
+ # nn.Embedding is the first layer, but it could be casted to bfloat16 or float32
323
+ return t5xxl.encoder.block[0].layer[0].SelfAttention.q.weight.dtype
324
+
325
+
326
+ def prepare_img_ids(batch_size: int, packed_latent_height: int, packed_latent_width: int):
327
+ img_ids = torch.zeros(packed_latent_height, packed_latent_width, 3)
328
+ img_ids[..., 1] = img_ids[..., 1] + torch.arange(packed_latent_height)[:, None]
329
+ img_ids[..., 2] = img_ids[..., 2] + torch.arange(packed_latent_width)[None, :]
330
+ img_ids = einops.repeat(img_ids, "h w c -> b (h w) c", b=batch_size)
331
+ return img_ids
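+
+ # Example (illustrative sketch): prepare_img_ids(1, 2, 2) returns a [1, 4, 3] tensor whose last
+ # two channels enumerate the (row, col) position of each packed latent token:
+ #   [[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 1, 1]]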
332
+
333
+
334
+ def unpack_latents(x: torch.Tensor, packed_latent_height: int, packed_latent_width: int) -> torch.Tensor:
335
+ """
336
+ x: [b (h w) (c ph pw)] -> [b c (h ph) (w pw)], ph=2, pw=2
337
+ """
338
+ x = einops.rearrange(x, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=packed_latent_height, w=packed_latent_width, ph=2, pw=2)
339
+ return x
340
+
341
+
342
+ def pack_latents(x: torch.Tensor) -> torch.Tensor:
343
+ """
344
+ x: [b c (h ph) (w pw)] -> [b (h w) (c ph pw)], ph=2, pw=2
345
+ """
346
+ x = einops.rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
347
+ return x
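+
+ # Example (illustrative sketch): pack_latents and unpack_latents are inverses. A latent of shape
+ # [1, 16, 128, 128] packs to [1, 64 * 64, 16 * 2 * 2] = [1, 4096, 64], and
+ # unpack_latents(packed, 64, 64) restores the original [1, 16, 128, 128] layout.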
348
+
349
+
350
+ # region Diffusers
351
+
352
+ NUM_DOUBLE_BLOCKS = 19
353
+ NUM_SINGLE_BLOCKS = 38
354
+
355
+ BFL_TO_DIFFUSERS_MAP = {
356
+ "time_in.in_layer.weight": ["time_text_embed.timestep_embedder.linear_1.weight"],
357
+ "time_in.in_layer.bias": ["time_text_embed.timestep_embedder.linear_1.bias"],
358
+ "time_in.out_layer.weight": ["time_text_embed.timestep_embedder.linear_2.weight"],
359
+ "time_in.out_layer.bias": ["time_text_embed.timestep_embedder.linear_2.bias"],
360
+ "vector_in.in_layer.weight": ["time_text_embed.text_embedder.linear_1.weight"],
361
+ "vector_in.in_layer.bias": ["time_text_embed.text_embedder.linear_1.bias"],
362
+ "vector_in.out_layer.weight": ["time_text_embed.text_embedder.linear_2.weight"],
363
+ "vector_in.out_layer.bias": ["time_text_embed.text_embedder.linear_2.bias"],
364
+ "guidance_in.in_layer.weight": ["time_text_embed.guidance_embedder.linear_1.weight"],
365
+ "guidance_in.in_layer.bias": ["time_text_embed.guidance_embedder.linear_1.bias"],
366
+ "guidance_in.out_layer.weight": ["time_text_embed.guidance_embedder.linear_2.weight"],
367
+ "guidance_in.out_layer.bias": ["time_text_embed.guidance_embedder.linear_2.bias"],
368
+ "txt_in.weight": ["context_embedder.weight"],
369
+ "txt_in.bias": ["context_embedder.bias"],
370
+ "img_in.weight": ["x_embedder.weight"],
371
+ "img_in.bias": ["x_embedder.bias"],
372
+ "double_blocks.().img_mod.lin.weight": ["norm1.linear.weight"],
373
+ "double_blocks.().img_mod.lin.bias": ["norm1.linear.bias"],
374
+ "double_blocks.().txt_mod.lin.weight": ["norm1_context.linear.weight"],
375
+ "double_blocks.().txt_mod.lin.bias": ["norm1_context.linear.bias"],
376
+ "double_blocks.().img_attn.qkv.weight": ["attn.to_q.weight", "attn.to_k.weight", "attn.to_v.weight"],
377
+ "double_blocks.().img_attn.qkv.bias": ["attn.to_q.bias", "attn.to_k.bias", "attn.to_v.bias"],
378
+ "double_blocks.().txt_attn.qkv.weight": ["attn.add_q_proj.weight", "attn.add_k_proj.weight", "attn.add_v_proj.weight"],
379
+ "double_blocks.().txt_attn.qkv.bias": ["attn.add_q_proj.bias", "attn.add_k_proj.bias", "attn.add_v_proj.bias"],
380
+ "double_blocks.().img_attn.norm.query_norm.scale": ["attn.norm_q.weight"],
381
+ "double_blocks.().img_attn.norm.key_norm.scale": ["attn.norm_k.weight"],
382
+ "double_blocks.().txt_attn.norm.query_norm.scale": ["attn.norm_added_q.weight"],
383
+ "double_blocks.().txt_attn.norm.key_norm.scale": ["attn.norm_added_k.weight"],
384
+ "double_blocks.().img_mlp.0.weight": ["ff.net.0.proj.weight"],
385
+ "double_blocks.().img_mlp.0.bias": ["ff.net.0.proj.bias"],
386
+ "double_blocks.().img_mlp.2.weight": ["ff.net.2.weight"],
387
+ "double_blocks.().img_mlp.2.bias": ["ff.net.2.bias"],
388
+ "double_blocks.().txt_mlp.0.weight": ["ff_context.net.0.proj.weight"],
389
+ "double_blocks.().txt_mlp.0.bias": ["ff_context.net.0.proj.bias"],
390
+ "double_blocks.().txt_mlp.2.weight": ["ff_context.net.2.weight"],
391
+ "double_blocks.().txt_mlp.2.bias": ["ff_context.net.2.bias"],
392
+ "double_blocks.().img_attn.proj.weight": ["attn.to_out.0.weight"],
393
+ "double_blocks.().img_attn.proj.bias": ["attn.to_out.0.bias"],
394
+ "double_blocks.().txt_attn.proj.weight": ["attn.to_add_out.weight"],
395
+ "double_blocks.().txt_attn.proj.bias": ["attn.to_add_out.bias"],
396
+ "single_blocks.().modulation.lin.weight": ["norm.linear.weight"],
397
+ "single_blocks.().modulation.lin.bias": ["norm.linear.bias"],
398
+ "single_blocks.().linear1.weight": ["attn.to_q.weight", "attn.to_k.weight", "attn.to_v.weight", "proj_mlp.weight"],
399
+ "single_blocks.().linear1.bias": ["attn.to_q.bias", "attn.to_k.bias", "attn.to_v.bias", "proj_mlp.bias"],
400
+ "single_blocks.().linear2.weight": ["proj_out.weight"],
401
+ "single_blocks.().norm.query_norm.scale": ["attn.norm_q.weight"],
402
+ "single_blocks.().norm.key_norm.scale": ["attn.norm_k.weight"],
403
+ "single_blocks.().linear2.weight": ["proj_out.weight"],
404
+ "single_blocks.().linear2.bias": ["proj_out.bias"],
405
+ "final_layer.linear.weight": ["proj_out.weight"],
406
+ "final_layer.linear.bias": ["proj_out.bias"],
407
+ "final_layer.adaLN_modulation.1.weight": ["norm_out.linear.weight"],
408
+ "final_layer.adaLN_modulation.1.bias": ["norm_out.linear.bias"],
409
+ }
410
+
411
+
412
+ def make_diffusers_to_bfl_map(num_double_blocks: int, num_single_blocks: int) -> dict[str, tuple[int, str]]:
413
+ # make reverse map from diffusers map
414
+ diffusers_to_bfl_map = {} # key: diffusers_key, value: (index, bfl_key)
415
+ for b in range(num_double_blocks):
416
+ for key, weights in BFL_TO_DIFFUSERS_MAP.items():
417
+ if key.startswith("double_blocks."):
418
+ block_prefix = f"transformer_blocks.{b}."
419
+ for i, weight in enumerate(weights):
420
+ diffusers_to_bfl_map[f"{block_prefix}{weight}"] = (i, key.replace("()", f"{b}"))
421
+ for b in range(num_single_blocks):
422
+ for key, weights in BFL_TO_DIFFUSERS_MAP.items():
423
+ if key.startswith("single_blocks."):
424
+ block_prefix = f"single_transformer_blocks.{b}."
425
+ for i, weight in enumerate(weights):
426
+ diffusers_to_bfl_map[f"{block_prefix}{weight}"] = (i, key.replace("()", f"{b}"))
427
+ for key, weights in BFL_TO_DIFFUSERS_MAP.items():
428
+ if not (key.startswith("double_blocks.") or key.startswith("single_blocks.")):
429
+ for i, weight in enumerate(weights):
430
+ diffusers_to_bfl_map[weight] = (i, key)
431
+ return diffusers_to_bfl_map
432
+
433
+
434
+ def convert_diffusers_sd_to_bfl(
435
+ diffusers_sd: dict[str, torch.Tensor], num_double_blocks: int = NUM_DOUBLE_BLOCKS, num_single_blocks: int = NUM_SINGLE_BLOCKS
436
+ ) -> dict[str, torch.Tensor]:
437
+ diffusers_to_bfl_map = make_diffusers_to_bfl_map(num_double_blocks, num_single_blocks)
438
+
439
+ # iterate over three safetensors files to reduce memory usage
440
+ flux_sd = {}
441
+ for diffusers_key, tensor in diffusers_sd.items():
442
+ if diffusers_key in diffusers_to_bfl_map:
443
+ index, bfl_key = diffusers_to_bfl_map[diffusers_key]
444
+ if bfl_key not in flux_sd:
445
+ flux_sd[bfl_key] = []
446
+ flux_sd[bfl_key].append((index, tensor))
447
+ else:
448
+ logger.error(f"Error: Key not found in diffusers_to_bfl_map: {diffusers_key}")
449
+ raise KeyError(f"Key not found in diffusers_to_bfl_map: {diffusers_key}")
450
+
451
+ # concat tensors if multiple tensors are mapped to a single key, sort by index
452
+ for key, values in flux_sd.items():
453
+ if len(values) == 1:
454
+ flux_sd[key] = values[0][1]
455
+ else:
456
+ flux_sd[key] = torch.cat([value[1] for value in sorted(values, key=lambda x: x[0])])
457
+
458
+ # special case for final_layer.adaLN_modulation.1.weight and final_layer.adaLN_modulation.1.bias
459
+ def swap_scale_shift(weight):
460
+ shift, scale = weight.chunk(2, dim=0)
461
+ new_weight = torch.cat([scale, shift], dim=0)
462
+ return new_weight
463
+
464
+ if "final_layer.adaLN_modulation.1.weight" in flux_sd:
465
+ flux_sd["final_layer.adaLN_modulation.1.weight"] = swap_scale_shift(flux_sd["final_layer.adaLN_modulation.1.weight"])
466
+ if "final_layer.adaLN_modulation.1.bias" in flux_sd:
467
+ flux_sd["final_layer.adaLN_modulation.1.bias"] = swap_scale_shift(flux_sd["final_layer.adaLN_modulation.1.bias"])
468
+
469
+ return flux_sd
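+
+ # Example (illustrative sketch): the Diffusers tensors transformer_blocks.0.attn.to_q/to_k/to_v.weight
+ # all map to double_blocks.0.img_attn.qkv.weight with indices 0, 1, 2 in BFL_TO_DIFFUSERS_MAP,
+ # so the loop above concatenates them along dim 0 in q, k, v order.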
470
+
471
+
472
+ # endregion
library/huggingface_util.py ADDED
@@ -0,0 +1,84 @@
1
+ from typing import Union, BinaryIO
2
+ from huggingface_hub import HfApi
3
+ from pathlib import Path
4
+ import argparse
5
+ import os
6
+ from library.utils import fire_in_thread
7
+ from library.utils import setup_logging
8
+ setup_logging()
9
+ import logging
10
+ logger = logging.getLogger(__name__)
11
+
12
+ def exists_repo(repo_id: str, repo_type: str, revision: str = "main", token: str = None):
13
+ api = HfApi(
14
+ token=token,
15
+ )
16
+ try:
17
+ api.repo_info(repo_id=repo_id, revision=revision, repo_type=repo_type)
18
+ return True
19
+ except:
20
+ return False
21
+
22
+
23
+ def upload(
24
+ args: argparse.Namespace,
25
+ src: Union[str, Path, bytes, BinaryIO],
26
+ dest_suffix: str = "",
27
+ force_sync_upload: bool = False,
28
+ ):
29
+ repo_id = args.huggingface_repo_id
30
+ repo_type = args.huggingface_repo_type
31
+ token = args.huggingface_token
32
+ path_in_repo = args.huggingface_path_in_repo + dest_suffix if args.huggingface_path_in_repo is not None else None
33
+ private = args.huggingface_repo_visibility is None or args.huggingface_repo_visibility != "public"
34
+ api = HfApi(token=token)
35
+ if not exists_repo(repo_id=repo_id, repo_type=repo_type, token=token):
36
+ try:
37
+ api.create_repo(repo_id=repo_id, repo_type=repo_type, private=private)
38
+ except Exception as e: # RepositoryNotFoundError has been confirmed, but other exceptions may occur, so catch broadly
39
+ logger.error("===========================================")
40
+ logger.error(f"failed to create HuggingFace repo / HuggingFaceのリポジトリの作成に失敗しました : {e}")
41
+ logger.error("===========================================")
42
+
43
+ is_folder = (type(src) == str and os.path.isdir(src)) or (isinstance(src, Path) and src.is_dir())
44
+
45
+ def uploader():
46
+ try:
47
+ if is_folder:
48
+ api.upload_folder(
49
+ repo_id=repo_id,
50
+ repo_type=repo_type,
51
+ folder_path=src,
52
+ path_in_repo=path_in_repo,
53
+ )
54
+ else:
55
+ api.upload_file(
56
+ repo_id=repo_id,
57
+ repo_type=repo_type,
58
+ path_or_fileobj=src,
59
+ path_in_repo=path_in_repo,
60
+ )
61
+ except Exception as e: # RuntimeError has been confirmed, but other exceptions may occur, so catch broadly
62
+ logger.error("===========================================")
63
+ logger.error(f"failed to upload to HuggingFace / HuggingFaceへのアップロードに失敗しました : {e}")
64
+ logger.error("===========================================")
65
+
66
+ if args.async_upload and not force_sync_upload:
67
+ fire_in_thread(uploader)
68
+ else:
69
+ uploader()
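+
+ # Example (illustrative sketch, the values are hypothetical): with args.huggingface_repo_id = "user/my-model",
+ # args.huggingface_repo_type = "model" and args.async_upload = True,
+ # upload(args, "output/last.safetensors") creates the repo if it does not exist and pushes the file
+ # in a background thread via fire_in_thread; otherwise the upload runs synchronously.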
70
+
71
+
72
+ def list_dir(
73
+ repo_id: str,
74
+ subfolder: str,
75
+ repo_type: str,
76
+ revision: str = "main",
77
+ token: str = None,
78
+ ):
79
+ api = HfApi(
80
+ token=token,
81
+ )
82
+ repo_info = api.repo_info(repo_id=repo_id, revision=revision, repo_type=repo_type)
83
+ file_list = [file for file in repo_info.siblings if file.rfilename.startswith(subfolder)]
84
+ return file_list
library/hypernetwork.py ADDED
@@ -0,0 +1,223 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from diffusers.models.attention_processor import (
4
+ Attention,
5
+ AttnProcessor2_0,
6
+ SlicedAttnProcessor,
7
+ XFormersAttnProcessor
8
+ )
9
+
10
+ try:
11
+ import xformers.ops
12
+ except:
13
+ xformers = None
14
+
15
+
16
+ loaded_networks = []
17
+
18
+
19
+ def apply_single_hypernetwork(
20
+ hypernetwork, hidden_states, encoder_hidden_states
21
+ ):
22
+ context_k, context_v = hypernetwork.forward(hidden_states, encoder_hidden_states)
23
+ return context_k, context_v
24
+
25
+
26
+ def apply_hypernetworks(context_k, context_v, layer=None):
27
+ if len(loaded_networks) == 0:
28
+ return context_v, context_v
29
+ for hypernetwork in loaded_networks:
30
+ context_k, context_v = hypernetwork.forward(context_k, context_v)
31
+
32
+ context_k = context_k.to(dtype=context_k.dtype)
33
+ context_v = context_v.to(dtype=context_k.dtype)
34
+
35
+ return context_k, context_v
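+
+ # Note: when no hypernetworks are loaded, both returned contexts fall back to context_v
+ # (the encoder hidden states passed by the callers below), which reproduces plain cross-attention
+ # for the key and value projections.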
36
+
37
+
38
+
39
+ def xformers_forward(
40
+ self: XFormersAttnProcessor,
41
+ attn: Attention,
42
+ hidden_states: torch.Tensor,
43
+ encoder_hidden_states: torch.Tensor = None,
44
+ attention_mask: torch.Tensor = None,
45
+ ):
46
+ batch_size, sequence_length, _ = (
47
+ hidden_states.shape
48
+ if encoder_hidden_states is None
49
+ else encoder_hidden_states.shape
50
+ )
51
+
52
+ attention_mask = attn.prepare_attention_mask(
53
+ attention_mask, sequence_length, batch_size
54
+ )
55
+
56
+ query = attn.to_q(hidden_states)
57
+
58
+ if encoder_hidden_states is None:
59
+ encoder_hidden_states = hidden_states
60
+ elif attn.norm_cross:
61
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
62
+
63
+ context_k, context_v = apply_hypernetworks(hidden_states, encoder_hidden_states)
64
+
65
+ key = attn.to_k(context_k)
66
+ value = attn.to_v(context_v)
67
+
68
+ query = attn.head_to_batch_dim(query).contiguous()
69
+ key = attn.head_to_batch_dim(key).contiguous()
70
+ value = attn.head_to_batch_dim(value).contiguous()
71
+
72
+ hidden_states = xformers.ops.memory_efficient_attention(
73
+ query,
74
+ key,
75
+ value,
76
+ attn_bias=attention_mask,
77
+ op=self.attention_op,
78
+ scale=attn.scale,
79
+ )
80
+ hidden_states = hidden_states.to(query.dtype)
81
+ hidden_states = attn.batch_to_head_dim(hidden_states)
82
+
83
+ # linear proj
84
+ hidden_states = attn.to_out[0](hidden_states)
85
+ # dropout
86
+ hidden_states = attn.to_out[1](hidden_states)
87
+ return hidden_states
88
+
89
+
90
+ def sliced_attn_forward(
91
+ self: SlicedAttnProcessor,
92
+ attn: Attention,
93
+ hidden_states: torch.Tensor,
94
+ encoder_hidden_states: torch.Tensor = None,
95
+ attention_mask: torch.Tensor = None,
96
+ ):
97
+ batch_size, sequence_length, _ = (
98
+ hidden_states.shape
99
+ if encoder_hidden_states is None
100
+ else encoder_hidden_states.shape
101
+ )
102
+ attention_mask = attn.prepare_attention_mask(
103
+ attention_mask, sequence_length, batch_size
104
+ )
105
+
106
+ query = attn.to_q(hidden_states)
107
+ dim = query.shape[-1]
108
+ query = attn.head_to_batch_dim(query)
109
+
110
+ if encoder_hidden_states is None:
111
+ encoder_hidden_states = hidden_states
112
+ elif attn.norm_cross:
113
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
114
+
115
+ context_k, context_v = apply_hypernetworks(hidden_states, encoder_hidden_states)
116
+
117
+ key = attn.to_k(context_k)
118
+ value = attn.to_v(context_v)
119
+ key = attn.head_to_batch_dim(key)
120
+ value = attn.head_to_batch_dim(value)
121
+
122
+ batch_size_attention, query_tokens, _ = query.shape
123
+ hidden_states = torch.zeros(
124
+ (batch_size_attention, query_tokens, dim // attn.heads),
125
+ device=query.device,
126
+ dtype=query.dtype,
127
+ )
128
+
129
+ for i in range(batch_size_attention // self.slice_size):
130
+ start_idx = i * self.slice_size
131
+ end_idx = (i + 1) * self.slice_size
132
+
133
+ query_slice = query[start_idx:end_idx]
134
+ key_slice = key[start_idx:end_idx]
135
+ attn_mask_slice = (
136
+ attention_mask[start_idx:end_idx] if attention_mask is not None else None
137
+ )
138
+
139
+ attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice)
140
+
141
+ attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx])
142
+
143
+ hidden_states[start_idx:end_idx] = attn_slice
144
+
145
+ hidden_states = attn.batch_to_head_dim(hidden_states)
146
+
147
+ # linear proj
148
+ hidden_states = attn.to_out[0](hidden_states)
149
+ # dropout
150
+ hidden_states = attn.to_out[1](hidden_states)
151
+
152
+ return hidden_states
153
+
154
+
155
+ def v2_0_forward(
156
+ self: AttnProcessor2_0,
157
+ attn: Attention,
158
+ hidden_states,
159
+ encoder_hidden_states=None,
160
+ attention_mask=None,
161
+ ):
162
+ batch_size, sequence_length, _ = (
163
+ hidden_states.shape
164
+ if encoder_hidden_states is None
165
+ else encoder_hidden_states.shape
166
+ )
167
+ inner_dim = hidden_states.shape[-1]
168
+
169
+ if attention_mask is not None:
170
+ attention_mask = attn.prepare_attention_mask(
171
+ attention_mask, sequence_length, batch_size
172
+ )
173
+ # scaled_dot_product_attention expects attention_mask shape to be
174
+ # (batch, heads, source_length, target_length)
175
+ attention_mask = attention_mask.view(
176
+ batch_size, attn.heads, -1, attention_mask.shape[-1]
177
+ )
178
+
179
+ query = attn.to_q(hidden_states)
180
+
181
+ if encoder_hidden_states is None:
182
+ encoder_hidden_states = hidden_states
183
+ elif attn.norm_cross:
184
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
185
+
186
+ context_k, context_v = apply_hypernetworks(hidden_states, encoder_hidden_states)
187
+
188
+ key = attn.to_k(context_k)
189
+ value = attn.to_v(context_v)
190
+
191
+ head_dim = inner_dim // attn.heads
192
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
193
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
194
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
195
+
196
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
197
+ # TODO: add support for attn.scale when we move to Torch 2.1
198
+ hidden_states = F.scaled_dot_product_attention(
199
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
200
+ )
201
+
202
+ hidden_states = hidden_states.transpose(1, 2).reshape(
203
+ batch_size, -1, attn.heads * head_dim
204
+ )
205
+ hidden_states = hidden_states.to(query.dtype)
206
+
207
+ # linear proj
208
+ hidden_states = attn.to_out[0](hidden_states)
209
+ # dropout
210
+ hidden_states = attn.to_out[1](hidden_states)
211
+ return hidden_states
212
+
213
+
214
+ def replace_attentions_for_hypernetwork():
215
+ import diffusers.models.attention_processor
216
+
217
+ diffusers.models.attention_processor.XFormersAttnProcessor.__call__ = (
218
+ xformers_forward
219
+ )
220
+ diffusers.models.attention_processor.SlicedAttnProcessor.__call__ = (
221
+ sliced_attn_forward
222
+ )
223
+ diffusers.models.attention_processor.AttnProcessor2_0.__call__ = v2_0_forward
library/ipex/__init__.py ADDED
@@ -0,0 +1,180 @@
1
+ import os
2
+ import sys
3
+ import contextlib
4
+ import torch
5
+ import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import
6
+ from .hijacks import ipex_hijacks
7
+
8
+ # pylint: disable=protected-access, missing-function-docstring, line-too-long
9
+
10
+ def ipex_init(): # pylint: disable=too-many-statements
11
+ try:
12
+ if hasattr(torch, "cuda") and hasattr(torch.cuda, "is_xpu_hijacked") and torch.cuda.is_xpu_hijacked:
13
+ return True, "Skipping IPEX hijack"
14
+ else:
15
+ # Replace cuda with xpu:
16
+ torch.cuda.current_device = torch.xpu.current_device
17
+ torch.cuda.current_stream = torch.xpu.current_stream
18
+ torch.cuda.device = torch.xpu.device
19
+ torch.cuda.device_count = torch.xpu.device_count
20
+ torch.cuda.device_of = torch.xpu.device_of
21
+ torch.cuda.get_device_name = torch.xpu.get_device_name
22
+ torch.cuda.get_device_properties = torch.xpu.get_device_properties
23
+ torch.cuda.init = torch.xpu.init
24
+ torch.cuda.is_available = torch.xpu.is_available
25
+ torch.cuda.is_initialized = torch.xpu.is_initialized
26
+ torch.cuda.is_current_stream_capturing = lambda: False
27
+ torch.cuda.set_device = torch.xpu.set_device
28
+ torch.cuda.stream = torch.xpu.stream
29
+ torch.cuda.synchronize = torch.xpu.synchronize
30
+ torch.cuda.Event = torch.xpu.Event
31
+ torch.cuda.Stream = torch.xpu.Stream
32
+ torch.cuda.FloatTensor = torch.xpu.FloatTensor
33
+ torch.Tensor.cuda = torch.Tensor.xpu
34
+ torch.Tensor.is_cuda = torch.Tensor.is_xpu
35
+ torch.nn.Module.cuda = torch.nn.Module.xpu
36
+ torch.UntypedStorage.cuda = torch.UntypedStorage.xpu
37
+ torch.cuda._initialization_lock = torch.xpu.lazy_init._initialization_lock
38
+ torch.cuda._initialized = torch.xpu.lazy_init._initialized
39
+ torch.cuda._lazy_seed_tracker = torch.xpu.lazy_init._lazy_seed_tracker
40
+ torch.cuda._queued_calls = torch.xpu.lazy_init._queued_calls
41
+ torch.cuda._tls = torch.xpu.lazy_init._tls
42
+ torch.cuda.threading = torch.xpu.lazy_init.threading
43
+ torch.cuda.traceback = torch.xpu.lazy_init.traceback
44
+ torch.cuda.Optional = torch.xpu.Optional
45
+ torch.cuda.__cached__ = torch.xpu.__cached__
46
+ torch.cuda.__loader__ = torch.xpu.__loader__
47
+ torch.cuda.ComplexFloatStorage = torch.xpu.ComplexFloatStorage
48
+ torch.cuda.Tuple = torch.xpu.Tuple
49
+ torch.cuda.streams = torch.xpu.streams
50
+ torch.cuda._lazy_new = torch.xpu._lazy_new
51
+ torch.cuda.FloatStorage = torch.xpu.FloatStorage
52
+ torch.cuda.Any = torch.xpu.Any
53
+ torch.cuda.__doc__ = torch.xpu.__doc__
54
+ torch.cuda.default_generators = torch.xpu.default_generators
55
+ torch.cuda.HalfTensor = torch.xpu.HalfTensor
56
+ torch.cuda._get_device_index = torch.xpu._get_device_index
57
+ torch.cuda.__path__ = torch.xpu.__path__
58
+ torch.cuda.Device = torch.xpu.Device
59
+ torch.cuda.IntTensor = torch.xpu.IntTensor
60
+ torch.cuda.ByteStorage = torch.xpu.ByteStorage
61
+ torch.cuda.set_stream = torch.xpu.set_stream
62
+ torch.cuda.BoolStorage = torch.xpu.BoolStorage
63
+ torch.cuda.os = torch.xpu.os
64
+ torch.cuda.torch = torch.xpu.torch
65
+ torch.cuda.BFloat16Storage = torch.xpu.BFloat16Storage
66
+ torch.cuda.Union = torch.xpu.Union
67
+ torch.cuda.DoubleTensor = torch.xpu.DoubleTensor
68
+ torch.cuda.ShortTensor = torch.xpu.ShortTensor
69
+ torch.cuda.LongTensor = torch.xpu.LongTensor
70
+ torch.cuda.IntStorage = torch.xpu.IntStorage
71
+ torch.cuda.LongStorage = torch.xpu.LongStorage
72
+ torch.cuda.__annotations__ = torch.xpu.__annotations__
73
+ torch.cuda.__package__ = torch.xpu.__package__
74
+ torch.cuda.__builtins__ = torch.xpu.__builtins__
75
+ torch.cuda.CharTensor = torch.xpu.CharTensor
76
+ torch.cuda.List = torch.xpu.List
77
+ torch.cuda._lazy_init = torch.xpu._lazy_init
78
+ torch.cuda.BFloat16Tensor = torch.xpu.BFloat16Tensor
79
+ torch.cuda.DoubleStorage = torch.xpu.DoubleStorage
80
+ torch.cuda.ByteTensor = torch.xpu.ByteTensor
81
+ torch.cuda.StreamContext = torch.xpu.StreamContext
82
+ torch.cuda.ComplexDoubleStorage = torch.xpu.ComplexDoubleStorage
83
+ torch.cuda.ShortStorage = torch.xpu.ShortStorage
84
+ torch.cuda._lazy_call = torch.xpu._lazy_call
85
+ torch.cuda.HalfStorage = torch.xpu.HalfStorage
86
+ torch.cuda.random = torch.xpu.random
87
+ torch.cuda._device = torch.xpu._device
88
+ torch.cuda.classproperty = torch.xpu.classproperty
89
+ torch.cuda.__name__ = torch.xpu.__name__
90
+ torch.cuda._device_t = torch.xpu._device_t
91
+ torch.cuda.warnings = torch.xpu.warnings
92
+ torch.cuda.__spec__ = torch.xpu.__spec__
93
+ torch.cuda.BoolTensor = torch.xpu.BoolTensor
94
+ torch.cuda.CharStorage = torch.xpu.CharStorage
95
+ torch.cuda.__file__ = torch.xpu.__file__
96
+ torch.cuda._is_in_bad_fork = torch.xpu.lazy_init._is_in_bad_fork
97
+ # torch.cuda.is_current_stream_capturing = torch.xpu.is_current_stream_capturing
98
+
99
+ # Memory:
100
+ torch.cuda.memory = torch.xpu.memory
101
+ if 'linux' in sys.platform and "WSL2" in os.popen("uname -a").read():
102
+ torch.xpu.empty_cache = lambda: None
103
+ torch.cuda.empty_cache = torch.xpu.empty_cache
104
+ torch.cuda.memory_stats = torch.xpu.memory_stats
105
+ torch.cuda.memory_summary = torch.xpu.memory_summary
106
+ torch.cuda.memory_snapshot = torch.xpu.memory_snapshot
107
+ torch.cuda.memory_allocated = torch.xpu.memory_allocated
108
+ torch.cuda.max_memory_allocated = torch.xpu.max_memory_allocated
109
+ torch.cuda.memory_reserved = torch.xpu.memory_reserved
110
+ torch.cuda.memory_cached = torch.xpu.memory_reserved
111
+ torch.cuda.max_memory_reserved = torch.xpu.max_memory_reserved
112
+ torch.cuda.max_memory_cached = torch.xpu.max_memory_reserved
113
+ torch.cuda.reset_peak_memory_stats = torch.xpu.reset_peak_memory_stats
114
+ torch.cuda.reset_max_memory_cached = torch.xpu.reset_peak_memory_stats
115
+ torch.cuda.reset_max_memory_allocated = torch.xpu.reset_peak_memory_stats
116
+ torch.cuda.memory_stats_as_nested_dict = torch.xpu.memory_stats_as_nested_dict
117
+ torch.cuda.reset_accumulated_memory_stats = torch.xpu.reset_accumulated_memory_stats
118
+
119
+ # RNG:
120
+ torch.cuda.get_rng_state = torch.xpu.get_rng_state
121
+ torch.cuda.get_rng_state_all = torch.xpu.get_rng_state_all
122
+ torch.cuda.set_rng_state = torch.xpu.set_rng_state
123
+ torch.cuda.set_rng_state_all = torch.xpu.set_rng_state_all
124
+ torch.cuda.manual_seed = torch.xpu.manual_seed
125
+ torch.cuda.manual_seed_all = torch.xpu.manual_seed_all
126
+ torch.cuda.seed = torch.xpu.seed
127
+ torch.cuda.seed_all = torch.xpu.seed_all
128
+ torch.cuda.initial_seed = torch.xpu.initial_seed
129
+
130
+ # AMP:
131
+ torch.cuda.amp = torch.xpu.amp
132
+ torch.is_autocast_enabled = torch.xpu.is_autocast_xpu_enabled
133
+ torch.get_autocast_gpu_dtype = torch.xpu.get_autocast_xpu_dtype
134
+
135
+ if not hasattr(torch.cuda.amp, "common"):
136
+ torch.cuda.amp.common = contextlib.nullcontext()
137
+ torch.cuda.amp.common.amp_definitely_not_available = lambda: False
138
+
139
+ try:
140
+ torch.cuda.amp.GradScaler = torch.xpu.amp.GradScaler
141
+ except Exception: # pylint: disable=broad-exception-caught
142
+ try:
143
+ from .gradscaler import gradscaler_init # pylint: disable=import-outside-toplevel, import-error
144
+ gradscaler_init()
145
+ torch.cuda.amp.GradScaler = torch.xpu.amp.GradScaler
146
+ except Exception: # pylint: disable=broad-exception-caught
147
+ torch.cuda.amp.GradScaler = ipex.cpu.autocast._grad_scaler.GradScaler
148
+
149
+ # C
150
+ torch._C._cuda_getCurrentRawStream = ipex._C._getCurrentStream
151
+ ipex._C._DeviceProperties.multi_processor_count = ipex._C._DeviceProperties.gpu_subslice_count
152
+ ipex._C._DeviceProperties.major = 2024
153
+ ipex._C._DeviceProperties.minor = 0
154
+
155
+ # Fix functions with ipex:
156
+ torch.cuda.mem_get_info = lambda device=None: [(torch.xpu.get_device_properties(device).total_memory - torch.xpu.memory_reserved(device)), torch.xpu.get_device_properties(device).total_memory]
157
+ torch._utils._get_available_device_type = lambda: "xpu"
158
+ torch.has_cuda = True
159
+ torch.cuda.has_half = True
160
+ torch.cuda.is_bf16_supported = lambda *args, **kwargs: True
161
+ torch.cuda.is_fp16_supported = lambda *args, **kwargs: True
162
+ torch.backends.cuda.is_built = lambda *args, **kwargs: True
163
+ torch.version.cuda = "12.1"
164
+ torch.cuda.get_device_capability = lambda *args, **kwargs: [12,1]
165
+ torch.cuda.get_device_properties.major = 12
166
+ torch.cuda.get_device_properties.minor = 1
167
+ torch.cuda.ipc_collect = lambda *args, **kwargs: None
168
+ torch.cuda.utilization = lambda *args, **kwargs: 0
169
+
170
+ ipex_hijacks()
171
+ if not torch.xpu.has_fp64_dtype() or os.environ.get('IPEX_FORCE_ATTENTION_SLICE', None) is not None:
172
+ try:
173
+ from .diffusers import ipex_diffusers
174
+ ipex_diffusers()
175
+ except Exception: # pylint: disable=broad-exception-caught
176
+ pass
177
+ torch.cuda.is_xpu_hijacked = True
178
+ except Exception as e:
179
+ return False, e
180
+ return True, None
library/ipex/attention.py ADDED
@@ -0,0 +1,177 @@
1
+ import os
2
+ import torch
3
+ import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import
4
+ from functools import cache
5
+
6
+ # pylint: disable=protected-access, missing-function-docstring, line-too-long
7
+
8
+ # ARC GPUs can't allocate more than 4GB to a single block so we slice the attention layers
9
+
10
+ sdpa_slice_trigger_rate = float(os.environ.get('IPEX_SDPA_SLICE_TRIGGER_RATE', 4))
11
+ attention_slice_rate = float(os.environ.get('IPEX_ATTENTION_SLICE_RATE', 4))
12
+
13
+ # Find something divisible with the input_tokens
14
+ @cache
15
+ def find_slice_size(slice_size, slice_block_size):
16
+ while (slice_size * slice_block_size) > attention_slice_rate:
17
+ slice_size = slice_size // 2
18
+ if slice_size <= 1:
19
+ slice_size = 1
20
+ break
21
+ return slice_size
22
+
23
+ # Find slice sizes for SDPA
24
+ @cache
25
+ def find_sdpa_slice_sizes(query_shape, query_element_size):
26
+ if len(query_shape) == 3:
27
+ batch_size_attention, query_tokens, shape_three = query_shape
28
+ shape_four = 1
29
+ else:
30
+ batch_size_attention, query_tokens, shape_three, shape_four = query_shape
31
+
32
+ slice_block_size = query_tokens * shape_three * shape_four / 1024 / 1024 * query_element_size
33
+ block_size = batch_size_attention * slice_block_size
34
+
35
+ split_slice_size = batch_size_attention
36
+ split_2_slice_size = query_tokens
37
+ split_3_slice_size = shape_three
38
+
39
+ do_split = False
40
+ do_split_2 = False
41
+ do_split_3 = False
42
+
43
+ if block_size > sdpa_slice_trigger_rate:
44
+ do_split = True
45
+ split_slice_size = find_slice_size(split_slice_size, slice_block_size)
46
+ if split_slice_size * slice_block_size > attention_slice_rate:
47
+ slice_2_block_size = split_slice_size * shape_three * shape_four / 1024 / 1024 * query_element_size
48
+ do_split_2 = True
49
+ split_2_slice_size = find_slice_size(split_2_slice_size, slice_2_block_size)
50
+ if split_2_slice_size * slice_2_block_size > attention_slice_rate:
51
+ slice_3_block_size = split_slice_size * split_2_slice_size * shape_four / 1024 / 1024 * query_element_size
52
+ do_split_3 = True
53
+ split_3_slice_size = find_slice_size(split_3_slice_size, slice_3_block_size)
54
+
55
+ return do_split, do_split_2, do_split_3, split_slice_size, split_2_slice_size, split_3_slice_size
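+
+ # Example (illustrative sketch, shapes are hypothetical): for an fp16 query of shape [8, 4096, 512]
+ # (element_size 2), slice_block_size = 4096 * 512 * 1 / 1024 / 1024 * 2 = 4 and
+ # block_size = 8 * 4 = 32 > sdpa_slice_trigger_rate (default 4), so the batch dimension is split
+ # down to split_slice_size = 1, at which point 1 * 4 <= attention_slice_rate and no further
+ # splitting along the remaining dimensions is needed.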
56
+
57
+ # Find slice sizes for BMM
58
+ @cache
59
+ def find_bmm_slice_sizes(input_shape, input_element_size, mat2_shape):
60
+ batch_size_attention, input_tokens, mat2_atten_shape = input_shape[0], input_shape[1], mat2_shape[2]
61
+ slice_block_size = input_tokens * mat2_atten_shape / 1024 / 1024 * input_element_size
62
+ block_size = batch_size_attention * slice_block_size
63
+
64
+ split_slice_size = batch_size_attention
65
+ split_2_slice_size = input_tokens
66
+ split_3_slice_size = mat2_atten_shape
67
+
68
+ do_split = False
69
+ do_split_2 = False
70
+ do_split_3 = False
71
+
72
+ if block_size > attention_slice_rate:
73
+ do_split = True
74
+ split_slice_size = find_slice_size(split_slice_size, slice_block_size)
75
+ if split_slice_size * slice_block_size > attention_slice_rate:
76
+ slice_2_block_size = split_slice_size * mat2_atten_shape / 1024 / 1024 * input_element_size
77
+ do_split_2 = True
78
+ split_2_slice_size = find_slice_size(split_2_slice_size, slice_2_block_size)
79
+ if split_2_slice_size * slice_2_block_size > attention_slice_rate:
80
+ slice_3_block_size = split_slice_size * split_2_slice_size / 1024 / 1024 * input_element_size
81
+ do_split_3 = True
82
+ split_3_slice_size = find_slice_size(split_3_slice_size, slice_3_block_size)
83
+
84
+ return do_split, do_split_2, do_split_3, split_slice_size, split_2_slice_size, split_3_slice_size
85
+
86
+
87
+ original_torch_bmm = torch.bmm
88
+ def torch_bmm_32_bit(input, mat2, *, out=None):
89
+ if input.device.type != "xpu":
90
+ return original_torch_bmm(input, mat2, out=out)
91
+ do_split, do_split_2, do_split_3, split_slice_size, split_2_slice_size, split_3_slice_size = find_bmm_slice_sizes(input.shape, input.element_size(), mat2.shape)
92
+
93
+ # Slice BMM
94
+ if do_split:
95
+ batch_size_attention, input_tokens, mat2_atten_shape = input.shape[0], input.shape[1], mat2.shape[2]
96
+ hidden_states = torch.zeros(input.shape[0], input.shape[1], mat2.shape[2], device=input.device, dtype=input.dtype)
97
+ for i in range(batch_size_attention // split_slice_size):
98
+ start_idx = i * split_slice_size
99
+ end_idx = (i + 1) * split_slice_size
100
+ if do_split_2:
101
+ for i2 in range(input_tokens // split_2_slice_size): # pylint: disable=invalid-name
102
+ start_idx_2 = i2 * split_2_slice_size
103
+ end_idx_2 = (i2 + 1) * split_2_slice_size
104
+ if do_split_3:
105
+ for i3 in range(mat2_atten_shape // split_3_slice_size): # pylint: disable=invalid-name
106
+ start_idx_3 = i3 * split_3_slice_size
107
+ end_idx_3 = (i3 + 1) * split_3_slice_size
108
+ hidden_states[start_idx:end_idx, start_idx_2:end_idx_2, start_idx_3:end_idx_3] = original_torch_bmm(
109
+ input[start_idx:end_idx, start_idx_2:end_idx_2, start_idx_3:end_idx_3],
110
+ mat2[start_idx:end_idx, start_idx_2:end_idx_2, start_idx_3:end_idx_3],
111
+ out=out
112
+ )
113
+ else:
114
+ hidden_states[start_idx:end_idx, start_idx_2:end_idx_2] = original_torch_bmm(
115
+ input[start_idx:end_idx, start_idx_2:end_idx_2],
116
+ mat2[start_idx:end_idx, start_idx_2:end_idx_2],
117
+ out=out
118
+ )
119
+ else:
120
+ hidden_states[start_idx:end_idx] = original_torch_bmm(
121
+ input[start_idx:end_idx],
122
+ mat2[start_idx:end_idx],
123
+ out=out
124
+ )
125
+ torch.xpu.synchronize(input.device)
126
+ else:
127
+ return original_torch_bmm(input, mat2, out=out)
128
+ return hidden_states
129
+
130
+ original_scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention
131
+ def scaled_dot_product_attention_32_bit(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, **kwargs):
132
+ if query.device.type != "xpu":
133
+ return original_scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=dropout_p, is_causal=is_causal, **kwargs)
134
+ do_split, do_split_2, do_split_3, split_slice_size, split_2_slice_size, split_3_slice_size = find_sdpa_slice_sizes(query.shape, query.element_size())
135
+
136
+ # Slice SDPA
137
+ if do_split:
138
+ batch_size_attention, query_tokens, shape_three = query.shape[0], query.shape[1], query.shape[2]
139
+ hidden_states = torch.zeros(query.shape, device=query.device, dtype=query.dtype)
140
+ for i in range(batch_size_attention // split_slice_size):
141
+ start_idx = i * split_slice_size
142
+ end_idx = (i + 1) * split_slice_size
143
+ if do_split_2:
144
+ for i2 in range(query_tokens // split_2_slice_size): # pylint: disable=invalid-name
145
+ start_idx_2 = i2 * split_2_slice_size
146
+ end_idx_2 = (i2 + 1) * split_2_slice_size
147
+ if do_split_3:
148
+ for i3 in range(shape_three // split_3_slice_size): # pylint: disable=invalid-name
149
+ start_idx_3 = i3 * split_3_slice_size
150
+ end_idx_3 = (i3 + 1) * split_3_slice_size
151
+ hidden_states[start_idx:end_idx, start_idx_2:end_idx_2, start_idx_3:end_idx_3] = original_scaled_dot_product_attention(
152
+ query[start_idx:end_idx, start_idx_2:end_idx_2, start_idx_3:end_idx_3],
153
+ key[start_idx:end_idx, start_idx_2:end_idx_2, start_idx_3:end_idx_3],
154
+ value[start_idx:end_idx, start_idx_2:end_idx_2, start_idx_3:end_idx_3],
155
+ attn_mask=attn_mask[start_idx:end_idx, start_idx_2:end_idx_2, start_idx_3:end_idx_3] if attn_mask is not None else attn_mask,
156
+ dropout_p=dropout_p, is_causal=is_causal, **kwargs
157
+ )
158
+ else:
159
+ hidden_states[start_idx:end_idx, start_idx_2:end_idx_2] = original_scaled_dot_product_attention(
160
+ query[start_idx:end_idx, start_idx_2:end_idx_2],
161
+ key[start_idx:end_idx, start_idx_2:end_idx_2],
162
+ value[start_idx:end_idx, start_idx_2:end_idx_2],
163
+ attn_mask=attn_mask[start_idx:end_idx, start_idx_2:end_idx_2] if attn_mask is not None else attn_mask,
164
+ dropout_p=dropout_p, is_causal=is_causal, **kwargs
165
+ )
166
+ else:
167
+ hidden_states[start_idx:end_idx] = original_scaled_dot_product_attention(
168
+ query[start_idx:end_idx],
169
+ key[start_idx:end_idx],
170
+ value[start_idx:end_idx],
171
+ attn_mask=attn_mask[start_idx:end_idx] if attn_mask is not None else attn_mask,
172
+ dropout_p=dropout_p, is_causal=is_causal, **kwargs
173
+ )
174
+ torch.xpu.synchronize(query.device)
175
+ else:
176
+ return original_scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=dropout_p, is_causal=is_causal, **kwargs)
177
+ return hidden_states
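To make the slicing arithmetic above concrete, here is a small self-contained illustration; the tensor shape, fp16 element size, and the 4.0 threshold are assumptions for the example (4 is the default IPEX_ATTENTION_SLICE_RATE used in the diffusers patch below; attention.py's own trigger constants are defined earlier in the file, outside this excerpt). Block sizes are tokens * dims * element_size scaled by 1/1024/1024, and each nested split halves a dimension until the per-slice size drops under the threshold.

# Hedged illustration of the slicing math behind find_sdpa_slice_sizes / find_slice_size.
# All numbers below are made-up example values, not taken from this commit.
def halve_until_under(slice_size, slice_block_size, threshold=4.0):
    # mirrors find_slice_size: halve until slice_size * slice_block_size <= threshold
    while slice_size * slice_block_size > threshold:
        slice_size //= 2
        if slice_size <= 1:
            return 1
    return slice_size

batch, tokens, heads, head_dim = 16, 4096, 40, 64                          # hypothetical fp16 query
element_size = 2                                                           # bytes per fp16 element
slice_block_size = tokens * heads * head_dim / 1024 / 1024 * element_size  # 20.0 per batch row
print(batch * slice_block_size)                    # 320.0 -> far above the 4.0 threshold, so do_split
print(halve_until_under(batch, slice_block_size))  # 1 -> batch slicing alone is not enough,
                                                   # so the second-level (token) split also triggers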
library/ipex/diffusers.py ADDED
@@ -0,0 +1,312 @@
1
+ import os
2
+ import torch
3
+ import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import
4
+ import diffusers #0.24.0 # pylint: disable=import-error
5
+ from diffusers.models.attention_processor import Attention
6
+ from diffusers.utils import USE_PEFT_BACKEND
7
+ from functools import cache
8
+
9
+ # pylint: disable=protected-access, missing-function-docstring, line-too-long
10
+
11
+ attention_slice_rate = float(os.environ.get('IPEX_ATTENTION_SLICE_RATE', 4))
12
+
13
+ @cache
14
+ def find_slice_size(slice_size, slice_block_size):
15
+ while (slice_size * slice_block_size) > attention_slice_rate:
16
+ slice_size = slice_size // 2
17
+ if slice_size <= 1:
18
+ slice_size = 1
19
+ break
20
+ return slice_size
21
+
22
+ @cache
23
+ def find_attention_slice_sizes(query_shape, query_element_size, query_device_type, slice_size=None):
24
+ if len(query_shape) == 3:
25
+ batch_size_attention, query_tokens, shape_three = query_shape
26
+ shape_four = 1
27
+ else:
28
+ batch_size_attention, query_tokens, shape_three, shape_four = query_shape
29
+ if slice_size is not None:
30
+ batch_size_attention = slice_size
31
+
32
+ slice_block_size = query_tokens * shape_three * shape_four / 1024 / 1024 * query_element_size
33
+ block_size = batch_size_attention * slice_block_size
34
+
35
+ split_slice_size = batch_size_attention
36
+ split_2_slice_size = query_tokens
37
+ split_3_slice_size = shape_three
38
+
39
+ do_split = False
40
+ do_split_2 = False
41
+ do_split_3 = False
42
+
43
+ if query_device_type != "xpu":
44
+ return do_split, do_split_2, do_split_3, split_slice_size, split_2_slice_size, split_3_slice_size
45
+
46
+ if block_size > attention_slice_rate:
47
+ do_split = True
48
+ split_slice_size = find_slice_size(split_slice_size, slice_block_size)
49
+ if split_slice_size * slice_block_size > attention_slice_rate:
50
+ slice_2_block_size = split_slice_size * shape_three * shape_four / 1024 / 1024 * query_element_size
51
+ do_split_2 = True
52
+ split_2_slice_size = find_slice_size(split_2_slice_size, slice_2_block_size)
53
+ if split_2_slice_size * slice_2_block_size > attention_slice_rate:
54
+ slice_3_block_size = split_slice_size * split_2_slice_size * shape_four / 1024 / 1024 * query_element_size
55
+ do_split_3 = True
56
+ split_3_slice_size = find_slice_size(split_3_slice_size, slice_3_block_size)
57
+
58
+ return do_split, do_split_2, do_split_3, split_slice_size, split_2_slice_size, split_3_slice_size
59
+
60
+ class SlicedAttnProcessor: # pylint: disable=too-few-public-methods
61
+ r"""
62
+ Processor for implementing sliced attention.
63
+
64
+ Args:
65
+ slice_size (`int`, *optional*):
66
+ The number of steps to compute attention. Uses as many slices as `attention_head_dim // slice_size`, and
67
+ `attention_head_dim` must be a multiple of the `slice_size`.
68
+ """
69
+
70
+ def __init__(self, slice_size):
71
+ self.slice_size = slice_size
72
+
73
+ def __call__(self, attn: Attention, hidden_states: torch.FloatTensor,
74
+ encoder_hidden_states=None, attention_mask=None) -> torch.FloatTensor: # pylint: disable=too-many-statements, too-many-locals, too-many-branches
75
+
76
+ residual = hidden_states
77
+
78
+ input_ndim = hidden_states.ndim
79
+
80
+ if input_ndim == 4:
81
+ batch_size, channel, height, width = hidden_states.shape
82
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
83
+
84
+ batch_size, sequence_length, _ = (
85
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
86
+ )
87
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
88
+
89
+ if attn.group_norm is not None:
90
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
91
+
92
+ query = attn.to_q(hidden_states)
93
+ dim = query.shape[-1]
94
+ query = attn.head_to_batch_dim(query)
95
+
96
+ if encoder_hidden_states is None:
97
+ encoder_hidden_states = hidden_states
98
+ elif attn.norm_cross:
99
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
100
+
101
+ key = attn.to_k(encoder_hidden_states)
102
+ value = attn.to_v(encoder_hidden_states)
103
+ key = attn.head_to_batch_dim(key)
104
+ value = attn.head_to_batch_dim(value)
105
+
106
+ batch_size_attention, query_tokens, shape_three = query.shape
107
+ hidden_states = torch.zeros(
108
+ (batch_size_attention, query_tokens, dim // attn.heads), device=query.device, dtype=query.dtype
109
+ )
110
+
111
+ ####################################################################
112
+ # ARC GPUs can't allocate more than 4GB to a single block, so slice it:
113
+ _, do_split_2, do_split_3, split_slice_size, split_2_slice_size, split_3_slice_size = find_attention_slice_sizes(query.shape, query.element_size(), query.device.type, slice_size=self.slice_size)
114
+
115
+ for i in range(batch_size_attention // split_slice_size):
116
+ start_idx = i * split_slice_size
117
+ end_idx = (i + 1) * split_slice_size
118
+ if do_split_2:
119
+ for i2 in range(query_tokens // split_2_slice_size): # pylint: disable=invalid-name
120
+ start_idx_2 = i2 * split_2_slice_size
121
+ end_idx_2 = (i2 + 1) * split_2_slice_size
122
+ if do_split_3:
123
+ for i3 in range(shape_three // split_3_slice_size): # pylint: disable=invalid-name
124
+ start_idx_3 = i3 * split_3_slice_size
125
+ end_idx_3 = (i3 + 1) * split_3_slice_size
126
+
127
+ query_slice = query[start_idx:end_idx, start_idx_2:end_idx_2, start_idx_3:end_idx_3]
128
+ key_slice = key[start_idx:end_idx, start_idx_2:end_idx_2, start_idx_3:end_idx_3]
129
+ attn_mask_slice = attention_mask[start_idx:end_idx, start_idx_2:end_idx_2, start_idx_3:end_idx_3] if attention_mask is not None else None
130
+
131
+ attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice)
132
+ del query_slice
133
+ del key_slice
134
+ del attn_mask_slice
135
+ attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx, start_idx_2:end_idx_2, start_idx_3:end_idx_3])
136
+
137
+ hidden_states[start_idx:end_idx, start_idx_2:end_idx_2, start_idx_3:end_idx_3] = attn_slice
138
+ del attn_slice
139
+ else:
140
+ query_slice = query[start_idx:end_idx, start_idx_2:end_idx_2]
141
+ key_slice = key[start_idx:end_idx, start_idx_2:end_idx_2]
142
+ attn_mask_slice = attention_mask[start_idx:end_idx, start_idx_2:end_idx_2] if attention_mask is not None else None
143
+
144
+ attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice)
145
+ del query_slice
146
+ del key_slice
147
+ del attn_mask_slice
148
+ attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx, start_idx_2:end_idx_2])
149
+
150
+ hidden_states[start_idx:end_idx, start_idx_2:end_idx_2] = attn_slice
151
+ del attn_slice
152
+ torch.xpu.synchronize(query.device)
153
+ else:
154
+ query_slice = query[start_idx:end_idx]
155
+ key_slice = key[start_idx:end_idx]
156
+ attn_mask_slice = attention_mask[start_idx:end_idx] if attention_mask is not None else None
157
+
158
+ attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice)
159
+ del query_slice
160
+ del key_slice
161
+ del attn_mask_slice
162
+ attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx])
163
+
164
+ hidden_states[start_idx:end_idx] = attn_slice
165
+ del attn_slice
166
+ ####################################################################
167
+
168
+ hidden_states = attn.batch_to_head_dim(hidden_states)
169
+
170
+ # linear proj
171
+ hidden_states = attn.to_out[0](hidden_states)
172
+ # dropout
173
+ hidden_states = attn.to_out[1](hidden_states)
174
+
175
+ if input_ndim == 4:
176
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
177
+
178
+ if attn.residual_connection:
179
+ hidden_states = hidden_states + residual
180
+
181
+ hidden_states = hidden_states / attn.rescale_output_factor
182
+
183
+ return hidden_states
184
+
185
+
186
+ class AttnProcessor:
187
+ r"""
188
+ Default processor for performing attention-related computations.
189
+ """
190
+
191
+ def __call__(self, attn: Attention, hidden_states: torch.FloatTensor,
192
+ encoder_hidden_states=None, attention_mask=None,
193
+ temb=None, scale: float = 1.0) -> torch.Tensor: # pylint: disable=too-many-statements, too-many-locals, too-many-branches
194
+
195
+ residual = hidden_states
196
+
197
+ args = () if USE_PEFT_BACKEND else (scale,)
198
+
199
+ if attn.spatial_norm is not None:
200
+ hidden_states = attn.spatial_norm(hidden_states, temb)
201
+
202
+ input_ndim = hidden_states.ndim
203
+
204
+ if input_ndim == 4:
205
+ batch_size, channel, height, width = hidden_states.shape
206
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
207
+
208
+ batch_size, sequence_length, _ = (
209
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
210
+ )
211
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
212
+
213
+ if attn.group_norm is not None:
214
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
215
+
216
+ query = attn.to_q(hidden_states, *args)
217
+
218
+ if encoder_hidden_states is None:
219
+ encoder_hidden_states = hidden_states
220
+ elif attn.norm_cross:
221
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
222
+
223
+ key = attn.to_k(encoder_hidden_states, *args)
224
+ value = attn.to_v(encoder_hidden_states, *args)
225
+
226
+ query = attn.head_to_batch_dim(query)
227
+ key = attn.head_to_batch_dim(key)
228
+ value = attn.head_to_batch_dim(value)
229
+
230
+ ####################################################################
231
+ # ARC GPUs can't allocate more than 4GB to a single block, so slice it:
232
+ batch_size_attention, query_tokens, shape_three = query.shape[0], query.shape[1], query.shape[2]
233
+ hidden_states = torch.zeros(query.shape, device=query.device, dtype=query.dtype)
234
+ do_split, do_split_2, do_split_3, split_slice_size, split_2_slice_size, split_3_slice_size = find_attention_slice_sizes(query.shape, query.element_size(), query.device.type)
235
+
236
+ if do_split:
237
+ for i in range(batch_size_attention // split_slice_size):
238
+ start_idx = i * split_slice_size
239
+ end_idx = (i + 1) * split_slice_size
240
+ if do_split_2:
241
+ for i2 in range(query_tokens // split_2_slice_size): # pylint: disable=invalid-name
242
+ start_idx_2 = i2 * split_2_slice_size
243
+ end_idx_2 = (i2 + 1) * split_2_slice_size
244
+ if do_split_3:
245
+ for i3 in range(shape_three // split_3_slice_size): # pylint: disable=invalid-name
246
+ start_idx_3 = i3 * split_3_slice_size
247
+ end_idx_3 = (i3 + 1) * split_3_slice_size
248
+
249
+ query_slice = query[start_idx:end_idx, start_idx_2:end_idx_2, start_idx_3:end_idx_3]
250
+ key_slice = key[start_idx:end_idx, start_idx_2:end_idx_2, start_idx_3:end_idx_3]
251
+ attn_mask_slice = attention_mask[start_idx:end_idx, start_idx_2:end_idx_2, start_idx_3:end_idx_3] if attention_mask is not None else None
252
+
253
+ attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice)
254
+ del query_slice
255
+ del key_slice
256
+ del attn_mask_slice
257
+ attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx, start_idx_2:end_idx_2, start_idx_3:end_idx_3])
258
+
259
+ hidden_states[start_idx:end_idx, start_idx_2:end_idx_2, start_idx_3:end_idx_3] = attn_slice
260
+ del attn_slice
261
+ else:
262
+ query_slice = query[start_idx:end_idx, start_idx_2:end_idx_2]
263
+ key_slice = key[start_idx:end_idx, start_idx_2:end_idx_2]
264
+ attn_mask_slice = attention_mask[start_idx:end_idx, start_idx_2:end_idx_2] if attention_mask is not None else None
265
+
266
+ attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice)
267
+ del query_slice
268
+ del key_slice
269
+ del attn_mask_slice
270
+ attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx, start_idx_2:end_idx_2])
271
+
272
+ hidden_states[start_idx:end_idx, start_idx_2:end_idx_2] = attn_slice
273
+ del attn_slice
274
+ else:
275
+ query_slice = query[start_idx:end_idx]
276
+ key_slice = key[start_idx:end_idx]
277
+ attn_mask_slice = attention_mask[start_idx:end_idx] if attention_mask is not None else None
278
+
279
+ attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice)
280
+ del query_slice
281
+ del key_slice
282
+ del attn_mask_slice
283
+ attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx])
284
+
285
+ hidden_states[start_idx:end_idx] = attn_slice
286
+ del attn_slice
287
+ torch.xpu.synchronize(query.device)
288
+ else:
289
+ attention_probs = attn.get_attention_scores(query, key, attention_mask)
290
+ hidden_states = torch.bmm(attention_probs, value)
291
+ ####################################################################
292
+ hidden_states = attn.batch_to_head_dim(hidden_states)
293
+
294
+ # linear proj
295
+ hidden_states = attn.to_out[0](hidden_states, *args)
296
+ # dropout
297
+ hidden_states = attn.to_out[1](hidden_states)
298
+
299
+ if input_ndim == 4:
300
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
301
+
302
+ if attn.residual_connection:
303
+ hidden_states = hidden_states + residual
304
+
305
+ hidden_states = hidden_states / attn.rescale_output_factor
306
+
307
+ return hidden_states
308
+
309
+ def ipex_diffusers():
310
+ # ARC GPUs can't allocate more than 4GB to a single block:
311
+ diffusers.models.attention_processor.SlicedAttnProcessor = SlicedAttnProcessor
312
+ diffusers.models.attention_processor.AttnProcessor = AttnProcessor
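A minimal usage sketch for the patch above, assuming the module is importable as library.ipex.diffusers; the model id, dtype, and pipeline choice are placeholders, not part of this commit. Because ipex_diffusers() rebinds AttnProcessor and SlicedAttnProcessor inside diffusers.models.attention_processor, it has to run before the pipeline is built so that attention slicing picks up the XPU-aware classes.

# Illustrative only: import path, model id and dtype are assumptions.
import torch
from diffusers import StableDiffusionPipeline
from library.ipex.diffusers import ipex_diffusers

ipex_diffusers()  # swap in the XPU-sliced attention processors defined above

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("xpu")
pipe.enable_attention_slicing()  # slicing now routes through the patched SlicedAttnProcessor
image = pipe("a watercolor landscape").images[0]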
library/ipex/gradscaler.py ADDED
@@ -0,0 +1,183 @@
1
+ from collections import defaultdict
2
+ import torch
3
+ import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import
4
+ import intel_extension_for_pytorch._C as core # pylint: disable=import-error, unused-import
5
+
6
+ # pylint: disable=protected-access, missing-function-docstring, line-too-long
7
+
8
+ device_supports_fp64 = torch.xpu.has_fp64_dtype()
9
+ OptState = ipex.cpu.autocast._grad_scaler.OptState
10
+ _MultiDeviceReplicator = ipex.cpu.autocast._grad_scaler._MultiDeviceReplicator
11
+ _refresh_per_optimizer_state = ipex.cpu.autocast._grad_scaler._refresh_per_optimizer_state
12
+
13
+ def _unscale_grads_(self, optimizer, inv_scale, found_inf, allow_fp16): # pylint: disable=unused-argument
14
+ per_device_inv_scale = _MultiDeviceReplicator(inv_scale)
15
+ per_device_found_inf = _MultiDeviceReplicator(found_inf)
16
+
17
+ # To set up _amp_foreach_non_finite_check_and_unscale_, split grads by device and dtype.
18
+ # There could be hundreds of grads, so we'd like to iterate through them just once.
19
+ # However, we don't know their devices or dtypes in advance.
20
+
21
+ # https://stackoverflow.com/questions/5029934/defaultdict-of-defaultdict
22
+ # Google says mypy struggles with defaultdicts type annotations.
23
+ per_device_and_dtype_grads = defaultdict(lambda: defaultdict(list)) # type: ignore[var-annotated]
24
+ # sync grad to master weight
25
+ if hasattr(optimizer, "sync_grad"):
26
+ optimizer.sync_grad()
27
+ with torch.no_grad():
28
+ for group in optimizer.param_groups:
29
+ for param in group["params"]:
30
+ if param.grad is None:
31
+ continue
32
+ if (not allow_fp16) and param.grad.dtype == torch.float16:
33
+ raise ValueError("Attempting to unscale FP16 gradients.")
34
+ if param.grad.is_sparse:
35
+ # is_coalesced() == False means the sparse grad has values with duplicate indices.
36
+ # coalesce() deduplicates indices and adds all values that have the same index.
37
+ # For scaled fp16 values, there's a good chance coalescing will cause overflow,
38
+ # so we should check the coalesced _values().
39
+ if param.grad.dtype is torch.float16:
40
+ param.grad = param.grad.coalesce()
41
+ to_unscale = param.grad._values()
42
+ else:
43
+ to_unscale = param.grad
44
+
45
+ # -: is there a way to split by device and dtype without appending in the inner loop?
46
+ to_unscale = to_unscale.to("cpu")
47
+ per_device_and_dtype_grads[to_unscale.device][
48
+ to_unscale.dtype
49
+ ].append(to_unscale)
50
+
51
+ for _, per_dtype_grads in per_device_and_dtype_grads.items():
52
+ for grads in per_dtype_grads.values():
53
+ core._amp_foreach_non_finite_check_and_unscale_(
54
+ grads,
55
+ per_device_found_inf.get("cpu"),
56
+ per_device_inv_scale.get("cpu"),
57
+ )
58
+
59
+ return per_device_found_inf._per_device_tensors
60
+
61
+ def unscale_(self, optimizer):
62
+ """
63
+ Divides ("unscales") the optimizer's gradient tensors by the scale factor.
64
+ :meth:`unscale_` is optional, serving cases where you need to
65
+ :ref:`modify or inspect gradients<working-with-unscaled-gradients>`
66
+ between the backward pass(es) and :meth:`step`.
67
+ If :meth:`unscale_` is not called explicitly, gradients will be unscaled automatically during :meth:`step`.
68
+ Simple example, using :meth:`unscale_` to enable clipping of unscaled gradients::
69
+ ...
70
+ scaler.scale(loss).backward()
71
+ scaler.unscale_(optimizer)
72
+ torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
73
+ scaler.step(optimizer)
74
+ scaler.update()
75
+ Args:
76
+ optimizer (torch.optim.Optimizer): Optimizer that owns the gradients to be unscaled.
77
+ .. warning::
78
+ :meth:`unscale_` should only be called once per optimizer per :meth:`step` call,
79
+ and only after all gradients for that optimizer's assigned parameters have been accumulated.
80
+ Calling :meth:`unscale_` twice for a given optimizer between each :meth:`step` triggers a RuntimeError.
81
+ .. warning::
82
+ :meth:`unscale_` may unscale sparse gradients out of place, replacing the ``.grad`` attribute.
83
+ """
84
+ if not self._enabled:
85
+ return
86
+
87
+ self._check_scale_growth_tracker("unscale_")
88
+
89
+ optimizer_state = self._per_optimizer_states[id(optimizer)]
90
+
91
+ if optimizer_state["stage"] is OptState.UNSCALED: # pylint: disable=no-else-raise
92
+ raise RuntimeError(
93
+ "unscale_() has already been called on this optimizer since the last update()."
94
+ )
95
+ elif optimizer_state["stage"] is OptState.STEPPED:
96
+ raise RuntimeError("unscale_() is being called after step().")
97
+
98
+ # FP32 division can be imprecise for certain compile options, so we carry out the reciprocal in FP64.
99
+ assert self._scale is not None
100
+ if device_supports_fp64:
101
+ inv_scale = self._scale.double().reciprocal().float()
102
+ else:
103
+ inv_scale = self._scale.to("cpu").double().reciprocal().float().to(self._scale.device)
104
+ found_inf = torch.full(
105
+ (1,), 0.0, dtype=torch.float32, device=self._scale.device
106
+ )
107
+
108
+ optimizer_state["found_inf_per_device"] = self._unscale_grads_(
109
+ optimizer, inv_scale, found_inf, False
110
+ )
111
+ optimizer_state["stage"] = OptState.UNSCALED
112
+
113
+ def update(self, new_scale=None):
114
+ """
115
+ Updates the scale factor.
116
+ If any optimizer steps were skipped the scale is multiplied by ``backoff_factor``
117
+ to reduce it. If ``growth_interval`` unskipped iterations occurred consecutively,
118
+ the scale is multiplied by ``growth_factor`` to increase it.
119
+ Passing ``new_scale`` sets the new scale value manually. (``new_scale`` is not
120
+ used directly, it's used to fill GradScaler's internal scale tensor. So if
121
+ ``new_scale`` was a tensor, later in-place changes to that tensor will not further
122
+ affect the scale GradScaler uses internally.)
123
+ Args:
124
+ new_scale (float or :class:`torch.FloatTensor`, optional, default=None): New scale factor.
125
+ .. warning::
126
+ :meth:`update` should only be called at the end of the iteration, after ``scaler.step(optimizer)`` has
127
+ been invoked for all optimizers used this iteration.
128
+ """
129
+ if not self._enabled:
130
+ return
131
+
132
+ _scale, _growth_tracker = self._check_scale_growth_tracker("update")
133
+
134
+ if new_scale is not None:
135
+ # Accept a new user-defined scale.
136
+ if isinstance(new_scale, float):
137
+ self._scale.fill_(new_scale) # type: ignore[union-attr]
138
+ else:
139
+ reason = "new_scale should be a float or a 1-element torch.FloatTensor with requires_grad=False."
140
+ assert isinstance(new_scale, torch.FloatTensor), reason # type: ignore[attr-defined]
141
+ assert new_scale.numel() == 1, reason
142
+ assert new_scale.requires_grad is False, reason
143
+ self._scale.copy_(new_scale) # type: ignore[union-attr]
144
+ else:
145
+ # Consume shared inf/nan data collected from optimizers to update the scale.
146
+ # If all found_inf tensors are on the same device as self._scale, this operation is asynchronous.
147
+ found_infs = [
148
+ found_inf.to(device="cpu", non_blocking=True)
149
+ for state in self._per_optimizer_states.values()
150
+ for found_inf in state["found_inf_per_device"].values()
151
+ ]
152
+
153
+ assert len(found_infs) > 0, "No inf checks were recorded prior to update."
154
+
155
+ found_inf_combined = found_infs[0]
156
+ if len(found_infs) > 1:
157
+ for i in range(1, len(found_infs)):
158
+ found_inf_combined += found_infs[i]
159
+
160
+ to_device = _scale.device
161
+ _scale = _scale.to("cpu")
162
+ _growth_tracker = _growth_tracker.to("cpu")
163
+
164
+ core._amp_update_scale_(
165
+ _scale,
166
+ _growth_tracker,
167
+ found_inf_combined,
168
+ self._growth_factor,
169
+ self._backoff_factor,
170
+ self._growth_interval,
171
+ )
172
+
173
+ _scale = _scale.to(to_device)
174
+ _growth_tracker = _growth_tracker.to(to_device)
175
+ # To prepare for next iteration, clear the data collected from optimizers this iteration.
176
+ self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)
177
+
178
+ def gradscaler_init():
179
+ torch.xpu.amp.GradScaler = ipex.cpu.autocast._grad_scaler.GradScaler
180
+ torch.xpu.amp.GradScaler._unscale_grads_ = _unscale_grads_
181
+ torch.xpu.amp.GradScaler.unscale_ = unscale_
182
+ torch.xpu.amp.GradScaler.update = update
183
+ return torch.xpu.amp.GradScaler
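For context, a hedged sketch of how the patched scaler is meant to be driven once gradscaler_init() has run; the model, optimizer, and loss function are placeholders. The call pattern is the ordinary GradScaler one described in the unscale_ docstring above, only the inf-check and scale-update bookkeeping is routed through CPU tensors.

# Illustrative training step; model, optimizer and loss_fn are placeholders.
import torch
from library.ipex.gradscaler import gradscaler_init

GradScaler = gradscaler_init()   # patched torch.xpu.amp.GradScaler
scaler = GradScaler()

def train_step(model, optimizer, loss_fn, batch, target):
    optimizer.zero_grad(set_to_none=True)
    with torch.autocast(device_type="xpu", dtype=torch.float16):
        loss = loss_fn(model(batch), target)
    scaler.scale(loss).backward()
    scaler.unscale_(optimizer)                               # optional: expose unscaled grads
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # e.g. for gradient clipping
    scaler.step(optimizer)
    scaler.update()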
library/ipex/hijacks.py ADDED
@@ -0,0 +1,313 @@
1
+ import os
2
+ from functools import wraps
3
+ from contextlib import nullcontext
4
+ import torch
5
+ import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import
6
+ import numpy as np
7
+
8
+ device_supports_fp64 = torch.xpu.has_fp64_dtype()
9
+
10
+ # pylint: disable=protected-access, missing-function-docstring, line-too-long, unnecessary-lambda, no-else-return
11
+
12
+ class DummyDataParallel(torch.nn.Module): # pylint: disable=missing-class-docstring, unused-argument, too-few-public-methods
13
+ def __new__(cls, module, device_ids=None, output_device=None, dim=0): # pylint: disable=unused-argument
14
+ if isinstance(device_ids, list) and len(device_ids) > 1:
15
+ print("IPEX backend doesn't support DataParallel on multiple XPU devices")
16
+ return module.to("xpu")
17
+
18
+ def return_null_context(*args, **kwargs): # pylint: disable=unused-argument
19
+ return nullcontext()
20
+
21
+ @property
22
+ def is_cuda(self):
23
+ return self.device.type == 'xpu' or self.device.type == 'cuda'
24
+
25
+ def check_device(device):
26
+ return bool((isinstance(device, torch.device) and device.type == "cuda") or (isinstance(device, str) and "cuda" in device) or isinstance(device, int))
27
+
28
+ def return_xpu(device):
29
+ return f"xpu:{device.split(':')[-1]}" if isinstance(device, str) and ":" in device else f"xpu:{device}" if isinstance(device, int) else torch.device("xpu") if isinstance(device, torch.device) else "xpu"
30
+
31
+
32
+ # Autocast
33
+ original_autocast_init = torch.amp.autocast_mode.autocast.__init__
34
+ @wraps(torch.amp.autocast_mode.autocast.__init__)
35
+ def autocast_init(self, device_type, dtype=None, enabled=True, cache_enabled=None):
36
+ if device_type == "cuda":
37
+ return original_autocast_init(self, device_type="xpu", dtype=dtype, enabled=enabled, cache_enabled=cache_enabled)
38
+ else:
39
+ return original_autocast_init(self, device_type=device_type, dtype=dtype, enabled=enabled, cache_enabled=cache_enabled)
40
+
41
+ # Latent Antialias CPU Offload:
42
+ original_interpolate = torch.nn.functional.interpolate
43
+ @wraps(torch.nn.functional.interpolate)
44
+ def interpolate(tensor, size=None, scale_factor=None, mode='nearest', align_corners=None, recompute_scale_factor=None, antialias=False): # pylint: disable=too-many-arguments
45
+ if antialias or align_corners is not None or mode == 'bicubic':
46
+ return_device = tensor.device
47
+ return_dtype = tensor.dtype
48
+ return original_interpolate(tensor.to("cpu", dtype=torch.float32), size=size, scale_factor=scale_factor, mode=mode,
49
+ align_corners=align_corners, recompute_scale_factor=recompute_scale_factor, antialias=antialias).to(return_device, dtype=return_dtype)
50
+ else:
51
+ return original_interpolate(tensor, size=size, scale_factor=scale_factor, mode=mode,
52
+ align_corners=align_corners, recompute_scale_factor=recompute_scale_factor, antialias=antialias)
53
+
54
+
55
+ # Diffusers Float64 (Alchemist GPUs don't support 64-bit):
56
+ original_from_numpy = torch.from_numpy
57
+ @wraps(torch.from_numpy)
58
+ def from_numpy(ndarray):
59
+ if ndarray.dtype == float:
60
+ return original_from_numpy(ndarray.astype('float32'))
61
+ else:
62
+ return original_from_numpy(ndarray)
63
+
64
+ original_as_tensor = torch.as_tensor
65
+ @wraps(torch.as_tensor)
66
+ def as_tensor(data, dtype=None, device=None):
67
+ if check_device(device):
68
+ device = return_xpu(device)
69
+ if isinstance(data, np.ndarray) and data.dtype == float and not (
70
+ (isinstance(device, torch.device) and device.type == "cpu") or (isinstance(device, str) and "cpu" in device)):
71
+ return original_as_tensor(data, dtype=torch.float32, device=device)
72
+ else:
73
+ return original_as_tensor(data, dtype=dtype, device=device)
74
+
75
+
76
+ if device_supports_fp64 and os.environ.get('IPEX_FORCE_ATTENTION_SLICE', None) is None:
77
+ original_torch_bmm = torch.bmm
78
+ original_scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention
79
+ else:
80
+ # 32 bit attention workarounds for Alchemist:
81
+ try:
82
+ from .attention import torch_bmm_32_bit as original_torch_bmm
83
+ from .attention import scaled_dot_product_attention_32_bit as original_scaled_dot_product_attention
84
+ except Exception: # pylint: disable=broad-exception-caught
85
+ original_torch_bmm = torch.bmm
86
+ original_scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention
87
+
88
+
89
+ # Data Type Errors:
90
+ @wraps(torch.bmm)
91
+ def torch_bmm(input, mat2, *, out=None):
92
+ if input.dtype != mat2.dtype:
93
+ mat2 = mat2.to(input.dtype)
94
+ return original_torch_bmm(input, mat2, out=out)
95
+
96
+ @wraps(torch.nn.functional.scaled_dot_product_attention)
97
+ def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False):
98
+ if query.dtype != key.dtype:
99
+ key = key.to(dtype=query.dtype)
100
+ if query.dtype != value.dtype:
101
+ value = value.to(dtype=query.dtype)
102
+ if attn_mask is not None and query.dtype != attn_mask.dtype:
103
+ attn_mask = attn_mask.to(dtype=query.dtype)
104
+ return original_scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=dropout_p, is_causal=is_causal)
105
+
106
+ # A1111 FP16
107
+ original_functional_group_norm = torch.nn.functional.group_norm
108
+ @wraps(torch.nn.functional.group_norm)
109
+ def functional_group_norm(input, num_groups, weight=None, bias=None, eps=1e-05):
110
+ if weight is not None and input.dtype != weight.data.dtype:
111
+ input = input.to(dtype=weight.data.dtype)
112
+ if bias is not None and weight is not None and bias.data.dtype != weight.data.dtype:
113
+ bias.data = bias.data.to(dtype=weight.data.dtype)
114
+ return original_functional_group_norm(input, num_groups, weight=weight, bias=bias, eps=eps)
115
+
116
+ # A1111 BF16
117
+ original_functional_layer_norm = torch.nn.functional.layer_norm
118
+ @wraps(torch.nn.functional.layer_norm)
119
+ def functional_layer_norm(input, normalized_shape, weight=None, bias=None, eps=1e-05):
120
+ if weight is not None and input.dtype != weight.data.dtype:
121
+ input = input.to(dtype=weight.data.dtype)
122
+ if bias is not None and weight is not None and bias.data.dtype != weight.data.dtype:
123
+ bias.data = bias.data.to(dtype=weight.data.dtype)
124
+ return original_functional_layer_norm(input, normalized_shape, weight=weight, bias=bias, eps=eps)
125
+
126
+ # Training
127
+ original_functional_linear = torch.nn.functional.linear
128
+ @wraps(torch.nn.functional.linear)
129
+ def functional_linear(input, weight, bias=None):
130
+ if input.dtype != weight.data.dtype:
131
+ input = input.to(dtype=weight.data.dtype)
132
+ if bias is not None and bias.data.dtype != weight.data.dtype:
133
+ bias.data = bias.data.to(dtype=weight.data.dtype)
134
+ return original_functional_linear(input, weight, bias=bias)
135
+
136
+ original_functional_conv2d = torch.nn.functional.conv2d
137
+ @wraps(torch.nn.functional.conv2d)
138
+ def functional_conv2d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
139
+ if input.dtype != weight.data.dtype:
140
+ input = input.to(dtype=weight.data.dtype)
141
+ if bias is not None and bias.data.dtype != weight.data.dtype:
142
+ bias.data = bias.data.to(dtype=weight.data.dtype)
143
+ return original_functional_conv2d(input, weight, bias=bias, stride=stride, padding=padding, dilation=dilation, groups=groups)
144
+
145
+ # A1111 Embedding BF16
146
+ original_torch_cat = torch.cat
147
+ @wraps(torch.cat)
148
+ def torch_cat(tensor, *args, **kwargs):
149
+ if len(tensor) == 3 and (tensor[0].dtype != tensor[1].dtype or tensor[2].dtype != tensor[1].dtype):
150
+ return original_torch_cat([tensor[0].to(tensor[1].dtype), tensor[1], tensor[2].to(tensor[1].dtype)], *args, **kwargs)
151
+ else:
152
+ return original_torch_cat(tensor, *args, **kwargs)
153
+
154
+ # SwinIR BF16:
155
+ original_functional_pad = torch.nn.functional.pad
156
+ @wraps(torch.nn.functional.pad)
157
+ def functional_pad(input, pad, mode='constant', value=None):
158
+ if mode == 'reflect' and input.dtype == torch.bfloat16:
159
+ return original_functional_pad(input.to(torch.float32), pad, mode=mode, value=value).to(dtype=torch.bfloat16)
160
+ else:
161
+ return original_functional_pad(input, pad, mode=mode, value=value)
162
+
163
+
164
+ original_torch_tensor = torch.tensor
165
+ @wraps(torch.tensor)
166
+ def torch_tensor(data, *args, dtype=None, device=None, **kwargs):
167
+ if check_device(device):
168
+ device = return_xpu(device)
169
+ if not device_supports_fp64:
170
+ if (isinstance(device, torch.device) and device.type == "xpu") or (isinstance(device, str) and "xpu" in device):
171
+ if dtype == torch.float64:
172
+ dtype = torch.float32
173
+ elif dtype is None and (hasattr(data, "dtype") and (data.dtype == torch.float64 or data.dtype == float)):
174
+ dtype = torch.float32
175
+ return original_torch_tensor(data, *args, dtype=dtype, device=device, **kwargs)
176
+
177
+ original_Tensor_to = torch.Tensor.to
178
+ @wraps(torch.Tensor.to)
179
+ def Tensor_to(self, device=None, *args, **kwargs):
180
+ if check_device(device):
181
+ return original_Tensor_to(self, return_xpu(device), *args, **kwargs)
182
+ else:
183
+ return original_Tensor_to(self, device, *args, **kwargs)
184
+
185
+ original_Tensor_cuda = torch.Tensor.cuda
186
+ @wraps(torch.Tensor.cuda)
187
+ def Tensor_cuda(self, device=None, *args, **kwargs):
188
+ if check_device(device):
189
+ return original_Tensor_cuda(self, return_xpu(device), *args, **kwargs)
190
+ else:
191
+ return original_Tensor_cuda(self, device, *args, **kwargs)
192
+
193
+ original_Tensor_pin_memory = torch.Tensor.pin_memory
194
+ @wraps(torch.Tensor.pin_memory)
195
+ def Tensor_pin_memory(self, device=None, *args, **kwargs):
196
+ if device is None:
197
+ device = "xpu"
198
+ if check_device(device):
199
+ return original_Tensor_pin_memory(self, return_xpu(device), *args, **kwargs)
200
+ else:
201
+ return original_Tensor_pin_memory(self, device, *args, **kwargs)
202
+
203
+ original_UntypedStorage_init = torch.UntypedStorage.__init__
204
+ @wraps(torch.UntypedStorage.__init__)
205
+ def UntypedStorage_init(*args, device=None, **kwargs):
206
+ if check_device(device):
207
+ return original_UntypedStorage_init(*args, device=return_xpu(device), **kwargs)
208
+ else:
209
+ return original_UntypedStorage_init(*args, device=device, **kwargs)
210
+
211
+ original_UntypedStorage_cuda = torch.UntypedStorage.cuda
212
+ @wraps(torch.UntypedStorage.cuda)
213
+ def UntypedStorage_cuda(self, device=None, *args, **kwargs):
214
+ if check_device(device):
215
+ return original_UntypedStorage_cuda(self, return_xpu(device), *args, **kwargs)
216
+ else:
217
+ return original_UntypedStorage_cuda(self, device, *args, **kwargs)
218
+
219
+ original_torch_empty = torch.empty
220
+ @wraps(torch.empty)
221
+ def torch_empty(*args, device=None, **kwargs):
222
+ if check_device(device):
223
+ return original_torch_empty(*args, device=return_xpu(device), **kwargs)
224
+ else:
225
+ return original_torch_empty(*args, device=device, **kwargs)
226
+
227
+ original_torch_randn = torch.randn
228
+ @wraps(torch.randn)
229
+ def torch_randn(*args, device=None, dtype=None, **kwargs):
230
+ if dtype == bytes:
231
+ dtype = None
232
+ if check_device(device):
233
+ return original_torch_randn(*args, device=return_xpu(device), **kwargs)
234
+ else:
235
+ return original_torch_randn(*args, device=device, **kwargs)
236
+
237
+ original_torch_ones = torch.ones
238
+ @wraps(torch.ones)
239
+ def torch_ones(*args, device=None, **kwargs):
240
+ if check_device(device):
241
+ return original_torch_ones(*args, device=return_xpu(device), **kwargs)
242
+ else:
243
+ return original_torch_ones(*args, device=device, **kwargs)
244
+
245
+ original_torch_zeros = torch.zeros
246
+ @wraps(torch.zeros)
247
+ def torch_zeros(*args, device=None, **kwargs):
248
+ if check_device(device):
249
+ return original_torch_zeros(*args, device=return_xpu(device), **kwargs)
250
+ else:
251
+ return original_torch_zeros(*args, device=device, **kwargs)
252
+
253
+ original_torch_linspace = torch.linspace
254
+ @wraps(torch.linspace)
255
+ def torch_linspace(*args, device=None, **kwargs):
256
+ if check_device(device):
257
+ return original_torch_linspace(*args, device=return_xpu(device), **kwargs)
258
+ else:
259
+ return original_torch_linspace(*args, device=device, **kwargs)
260
+
261
+ original_torch_Generator = torch.Generator
262
+ @wraps(torch.Generator)
263
+ def torch_Generator(device=None):
264
+ if check_device(device):
265
+ return original_torch_Generator(return_xpu(device))
266
+ else:
267
+ return original_torch_Generator(device)
268
+
269
+ original_torch_load = torch.load
270
+ @wraps(torch.load)
271
+ def torch_load(f, map_location=None, *args, **kwargs):
272
+ if map_location is None:
273
+ map_location = "xpu"
274
+ if check_device(map_location):
275
+ return original_torch_load(f, *args, map_location=return_xpu(map_location), **kwargs)
276
+ else:
277
+ return original_torch_load(f, *args, map_location=map_location, **kwargs)
278
+
279
+
280
+ # Hijack Functions:
281
+ def ipex_hijacks():
282
+ torch.tensor = torch_tensor
283
+ torch.Tensor.to = Tensor_to
284
+ torch.Tensor.cuda = Tensor_cuda
285
+ torch.Tensor.pin_memory = Tensor_pin_memory
286
+ torch.UntypedStorage.__init__ = UntypedStorage_init
287
+ torch.UntypedStorage.cuda = UntypedStorage_cuda
288
+ torch.empty = torch_empty
289
+ torch.randn = torch_randn
290
+ torch.ones = torch_ones
291
+ torch.zeros = torch_zeros
292
+ torch.linspace = torch_linspace
293
+ torch.Generator = torch_Generator
294
+ torch.load = torch_load
295
+
296
+ torch.backends.cuda.sdp_kernel = return_null_context
297
+ torch.nn.DataParallel = DummyDataParallel
298
+ torch.UntypedStorage.is_cuda = is_cuda
299
+ torch.amp.autocast_mode.autocast.__init__ = autocast_init
300
+
301
+ torch.nn.functional.scaled_dot_product_attention = scaled_dot_product_attention
302
+ torch.nn.functional.group_norm = functional_group_norm
303
+ torch.nn.functional.layer_norm = functional_layer_norm
304
+ torch.nn.functional.linear = functional_linear
305
+ torch.nn.functional.conv2d = functional_conv2d
306
+ torch.nn.functional.interpolate = interpolate
307
+ torch.nn.functional.pad = functional_pad
308
+
309
+ torch.bmm = torch_bmm
310
+ torch.cat = torch_cat
311
+ if not device_supports_fp64:
312
+ torch.from_numpy = from_numpy
313
+ torch.as_tensor = as_tensor
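Putting the pieces together, a rough outline of the expected initialization order; the package __init__ that actually wires this up is not shown in this excerpt, so treat the snippet as an assumption-laden sketch rather than a copy of it. The torch-level hijacks go first so that everything imported afterwards sees the XPU-flavored functions, and the diffusers attention patch is mainly relevant on GPUs without FP64 support, the same condition the 32-bit attention fallback above keys on.

# Sketch only; assumes the modules are importable as library.ipex.* as laid out in this commit.
import torch
import intel_extension_for_pytorch as ipex  # noqa: F401  (registers torch.xpu)

from library.ipex.hijacks import ipex_hijacks

def ipex_setup():
    ipex_hijacks()                       # redirect cuda-targeted calls to xpu
    if not torch.xpu.has_fp64_dtype():   # Alchemist/ARC-class GPUs: also slice attention
        from library.ipex.diffusers import ipex_diffusers
        ipex_diffusers()

ipex_setup()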
library/lpw_stable_diffusion.py ADDED
@@ -0,0 +1,1233 @@
1
+ # copied from https://github.com/huggingface/diffusers/blob/main/examples/community/lpw_stable_diffusion.py
2
+ # and modified to support SD2.x
3
+
4
+ import inspect
5
+ import re
6
+ from typing import Callable, List, Optional, Union
7
+
8
+ import numpy as np
9
+ import PIL.Image
10
+ import torch
11
+ from packaging import version
12
+ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
13
+
14
+ import diffusers
15
+ from diffusers import SchedulerMixin, StableDiffusionPipeline
16
+ from diffusers.models import AutoencoderKL, UNet2DConditionModel
17
+ from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
18
+ from diffusers.utils import logging
19
+
20
+ try:
21
+ from diffusers.utils import PIL_INTERPOLATION
22
+ except ImportError:
23
+ if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
24
+ PIL_INTERPOLATION = {
25
+ "linear": PIL.Image.Resampling.BILINEAR,
26
+ "bilinear": PIL.Image.Resampling.BILINEAR,
27
+ "bicubic": PIL.Image.Resampling.BICUBIC,
28
+ "lanczos": PIL.Image.Resampling.LANCZOS,
29
+ "nearest": PIL.Image.Resampling.NEAREST,
30
+ }
31
+ else:
32
+ PIL_INTERPOLATION = {
33
+ "linear": PIL.Image.LINEAR,
34
+ "bilinear": PIL.Image.BILINEAR,
35
+ "bicubic": PIL.Image.BICUBIC,
36
+ "lanczos": PIL.Image.LANCZOS,
37
+ "nearest": PIL.Image.NEAREST,
38
+ }
39
+ # ------------------------------------------------------------------------------
40
+
41
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
42
+
43
+ re_attention = re.compile(
44
+ r"""
45
+ \\\(|
46
+ \\\)|
47
+ \\\[|
48
+ \\]|
49
+ \\\\|
50
+ \\|
51
+ \(|
52
+ \[|
53
+ :([+-]?[.\d]+)\)|
54
+ \)|
55
+ ]|
56
+ [^\\()\[\]:]+|
57
+ :
58
+ """,
59
+ re.X,
60
+ )
61
+
62
+
63
+ def parse_prompt_attention(text):
64
+ """
65
+ Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
66
+ Accepted tokens are:
67
+ (abc) - increases attention to abc by a multiplier of 1.1
68
+ (abc:3.12) - increases attention to abc by a multiplier of 3.12
69
+ [abc] - decreases attention to abc by a multiplier of 1.1
70
+ \( - literal character '('
71
+ \[ - literal character '['
72
+ \) - literal character ')'
73
+ \] - literal character ']'
74
+ \\ - literal character '\'
75
+ anything else - just text
76
+ >>> parse_prompt_attention('normal text')
77
+ [['normal text', 1.0]]
78
+ >>> parse_prompt_attention('an (important) word')
79
+ [['an ', 1.0], ['important', 1.1], [' word', 1.0]]
80
+ >>> parse_prompt_attention('(unbalanced')
81
+ [['unbalanced', 1.1]]
82
+ >>> parse_prompt_attention('\(literal\]')
83
+ [['(literal]', 1.0]]
84
+ >>> parse_prompt_attention('(unnecessary)(parens)')
85
+ [['unnecessaryparens', 1.1]]
86
+ >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
87
+ [['a ', 1.0],
88
+ ['house', 1.5730000000000004],
89
+ [' ', 1.1],
90
+ ['on', 1.0],
91
+ [' a ', 1.1],
92
+ ['hill', 0.55],
93
+ [', sun, ', 1.1],
94
+ ['sky', 1.4641000000000006],
95
+ ['.', 1.1]]
96
+ """
97
+
98
+ res = []
99
+ round_brackets = []
100
+ square_brackets = []
101
+
102
+ round_bracket_multiplier = 1.1
103
+ square_bracket_multiplier = 1 / 1.1
104
+
105
+ def multiply_range(start_position, multiplier):
106
+ for p in range(start_position, len(res)):
107
+ res[p][1] *= multiplier
108
+
109
+ for m in re_attention.finditer(text):
110
+ text = m.group(0)
111
+ weight = m.group(1)
112
+
113
+ if text.startswith("\\"):
114
+ res.append([text[1:], 1.0])
115
+ elif text == "(":
116
+ round_brackets.append(len(res))
117
+ elif text == "[":
118
+ square_brackets.append(len(res))
119
+ elif weight is not None and len(round_brackets) > 0:
120
+ multiply_range(round_brackets.pop(), float(weight))
121
+ elif text == ")" and len(round_brackets) > 0:
122
+ multiply_range(round_brackets.pop(), round_bracket_multiplier)
123
+ elif text == "]" and len(square_brackets) > 0:
124
+ multiply_range(square_brackets.pop(), square_bracket_multiplier)
125
+ else:
126
+ res.append([text, 1.0])
127
+
128
+ for pos in round_brackets:
129
+ multiply_range(pos, round_bracket_multiplier)
130
+
131
+ for pos in square_brackets:
132
+ multiply_range(pos, square_bracket_multiplier)
133
+
134
+ if len(res) == 0:
135
+ res = [["", 1.0]]
136
+
137
+ # merge runs of identical weights
138
+ i = 0
139
+ while i + 1 < len(res):
140
+ if res[i][1] == res[i + 1][1]:
141
+ res[i][0] += res[i + 1][0]
142
+ res.pop(i + 1)
143
+ else:
144
+ i += 1
145
+
146
+ return res
147
+
148
+
149
+ def get_prompts_with_weights(pipe: StableDiffusionPipeline, prompt: List[str], max_length: int):
150
+ r"""
151
+ Tokenize a list of prompts and return its tokens with weights of each token.
152
+
153
+ No padding, starting or ending token is included.
154
+ """
155
+ tokens = []
156
+ weights = []
157
+ truncated = False
158
+ for text in prompt:
159
+ texts_and_weights = parse_prompt_attention(text)
160
+ text_token = []
161
+ text_weight = []
162
+ for word, weight in texts_and_weights:
163
+ # tokenize and discard the starting and the ending token
164
+ token = pipe.tokenizer(word).input_ids[1:-1]
165
+ text_token += token
166
+ # copy the weight by length of token
167
+ text_weight += [weight] * len(token)
168
+ # stop if the text is too long (longer than truncation limit)
169
+ if len(text_token) > max_length:
170
+ truncated = True
171
+ break
172
+ # truncate
173
+ if len(text_token) > max_length:
174
+ truncated = True
175
+ text_token = text_token[:max_length]
176
+ text_weight = text_weight[:max_length]
177
+ tokens.append(text_token)
178
+ weights.append(text_weight)
179
+ if truncated:
180
+ logger.warning("Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples")
181
+ return tokens, weights
182
+
183
+
184
+ def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, no_boseos_middle=True, chunk_length=77):
185
+ r"""
186
+ Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length.
187
+ """
188
+ max_embeddings_multiples = (max_length - 2) // (chunk_length - 2)
189
+ weights_length = max_length if no_boseos_middle else max_embeddings_multiples * chunk_length
190
+ for i in range(len(tokens)):
191
+ tokens[i] = [bos] + tokens[i] + [eos] * (max_length - 1 - len(tokens[i]))
192
+ if no_boseos_middle:
193
+ weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i]))
194
+ else:
195
+ w = []
196
+ if len(weights[i]) == 0:
197
+ w = [1.0] * weights_length
198
+ else:
199
+ for j in range(max_embeddings_multiples):
200
+ w.append(1.0) # weight for starting token in this chunk
201
+ w += weights[i][j * (chunk_length - 2) : min(len(weights[i]), (j + 1) * (chunk_length - 2))]
202
+ w.append(1.0) # weight for ending token in this chunk
203
+ w += [1.0] * (weights_length - len(w))
204
+ weights[i] = w[:]
205
+
206
+ return tokens, weights
207
+
208
+
209
+ def get_unweighted_text_embeddings(
210
+ pipe: StableDiffusionPipeline,
211
+ text_input: torch.Tensor,
212
+ chunk_length: int,
213
+ clip_skip: int,
214
+ eos: int,
215
+ pad: int,
216
+ no_boseos_middle: Optional[bool] = True,
217
+ ):
218
+ """
219
+ When the length of tokens is a multiple of the capacity of the text encoder,
220
+ it should be split into chunks and sent to the text encoder individually.
221
+ """
222
+ max_embeddings_multiples = (text_input.shape[1] - 2) // (chunk_length - 2)
223
+ if max_embeddings_multiples > 1:
224
+ text_embeddings = []
225
+ for i in range(max_embeddings_multiples):
226
+ # extract the i-th chunk
227
+ text_input_chunk = text_input[:, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2].clone()
228
+
229
+ # cover the head and the tail by the starting and the ending tokens
230
+ text_input_chunk[:, 0] = text_input[0, 0]
231
+ if pad == eos: # v1
232
+ text_input_chunk[:, -1] = text_input[0, -1]
233
+ else: # v2
234
+ for j in range(len(text_input_chunk)):
235
+ if text_input_chunk[j, -1] != eos and text_input_chunk[j, -1] != pad:  # the last token is a regular character
236
+ text_input_chunk[j, -1] = eos
237
+ if text_input_chunk[j, 1] == pad:  # only BOS, the rest is PAD
238
+ text_input_chunk[j, 1] = eos
239
+
240
+ if clip_skip is None or clip_skip == 1:
241
+ text_embedding = pipe.text_encoder(text_input_chunk)[0]
242
+ else:
243
+ enc_out = pipe.text_encoder(text_input_chunk, output_hidden_states=True, return_dict=True)
244
+ text_embedding = enc_out["hidden_states"][-clip_skip]
245
+ text_embedding = pipe.text_encoder.text_model.final_layer_norm(text_embedding)
246
+
247
+ if no_boseos_middle:
248
+ if i == 0:
249
+ # discard the ending token
250
+ text_embedding = text_embedding[:, :-1]
251
+ elif i == max_embeddings_multiples - 1:
252
+ # discard the starting token
253
+ text_embedding = text_embedding[:, 1:]
254
+ else:
255
+ # discard both starting and ending tokens
256
+ text_embedding = text_embedding[:, 1:-1]
257
+
258
+ text_embeddings.append(text_embedding)
259
+ text_embeddings = torch.concat(text_embeddings, axis=1)
260
+ else:
261
+ if clip_skip is None or clip_skip == 1:
262
+ text_embeddings = pipe.text_encoder(text_input)[0]
263
+ else:
264
+ enc_out = pipe.text_encoder(text_input, output_hidden_states=True, return_dict=True)
265
+ text_embeddings = enc_out["hidden_states"][-clip_skip]
266
+ text_embeddings = pipe.text_encoder.text_model.final_layer_norm(text_embeddings)
267
+ return text_embeddings
268
+
269
+
270
+ def get_weighted_text_embeddings(
271
+ pipe: StableDiffusionPipeline,
272
+ prompt: Union[str, List[str]],
273
+ uncond_prompt: Optional[Union[str, List[str]]] = None,
274
+ max_embeddings_multiples: Optional[int] = 3,
275
+ no_boseos_middle: Optional[bool] = False,
276
+ skip_parsing: Optional[bool] = False,
277
+ skip_weighting: Optional[bool] = False,
278
+ clip_skip=None,
279
+ ):
280
+ r"""
281
+ Prompts can be assigned local weights using brackets. For example,
282
+ prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful',
283
+ and the embedding tokens corresponding to the words get multiplied by a constant, 1.1.
284
+
285
+ Also, to regularize the embedding, the weighted embedding is scaled to preserve the original mean.
286
+
287
+ Args:
288
+ pipe (`StableDiffusionPipeline`):
289
+ Pipe to provide access to the tokenizer and the text encoder.
290
+ prompt (`str` or `List[str]`):
291
+ The prompt or prompts to guide the image generation.
292
+ uncond_prompt (`str` or `List[str]`):
293
+ The unconditional prompt or prompts to guide the image generation. If an unconditional prompt
294
+ is provided, the embeddings of prompt and uncond_prompt are concatenated.
295
+ max_embeddings_multiples (`int`, *optional*, defaults to `3`):
296
+ The max multiple length of prompt embeddings compared to the max output length of text encoder.
297
+ no_boseos_middle (`bool`, *optional*, defaults to `False`):
298
+ If the length of the text tokens is a multiple of the text encoder capacity, whether to keep the starting and
299
+ ending tokens in each of the middle chunks.
300
+ skip_parsing (`bool`, *optional*, defaults to `False`):
301
+ Skip the parsing of brackets.
302
+ skip_weighting (`bool`, *optional*, defaults to `False`):
303
+ Skip the weighting. When the parsing is skipped, it is forced True.
304
+ """
305
+ max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
306
+ if isinstance(prompt, str):
307
+ prompt = [prompt]
308
+
309
+ if not skip_parsing:
310
+ prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, max_length - 2)
311
+ if uncond_prompt is not None:
312
+ if isinstance(uncond_prompt, str):
313
+ uncond_prompt = [uncond_prompt]
314
+ uncond_tokens, uncond_weights = get_prompts_with_weights(pipe, uncond_prompt, max_length - 2)
315
+ else:
316
+ prompt_tokens = [token[1:-1] for token in pipe.tokenizer(prompt, max_length=max_length, truncation=True).input_ids]
317
+ prompt_weights = [[1.0] * len(token) for token in prompt_tokens]
318
+ if uncond_prompt is not None:
319
+ if isinstance(uncond_prompt, str):
320
+ uncond_prompt = [uncond_prompt]
321
+ uncond_tokens = [
322
+ token[1:-1] for token in pipe.tokenizer(uncond_prompt, max_length=max_length, truncation=True).input_ids
323
+ ]
324
+ uncond_weights = [[1.0] * len(token) for token in uncond_tokens]
325
+
326
+ # round up the longest length of tokens to a multiple of (model_max_length - 2)
327
+ max_length = max([len(token) for token in prompt_tokens])
328
+ if uncond_prompt is not None:
329
+ max_length = max(max_length, max([len(token) for token in uncond_tokens]))
330
+
331
+ max_embeddings_multiples = min(
332
+ max_embeddings_multiples,
333
+ (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1,
334
+ )
335
+ max_embeddings_multiples = max(1, max_embeddings_multiples)
336
+ max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
337
+
338
+ # pad the length of tokens and weights
339
+ bos = pipe.tokenizer.bos_token_id
340
+ eos = pipe.tokenizer.eos_token_id
341
+ pad = pipe.tokenizer.pad_token_id
342
+ prompt_tokens, prompt_weights = pad_tokens_and_weights(
343
+ prompt_tokens,
344
+ prompt_weights,
345
+ max_length,
346
+ bos,
347
+ eos,
348
+ no_boseos_middle=no_boseos_middle,
349
+ chunk_length=pipe.tokenizer.model_max_length,
350
+ )
351
+ prompt_tokens = torch.tensor(prompt_tokens, dtype=torch.long, device=pipe.device)
352
+ if uncond_prompt is not None:
353
+ uncond_tokens, uncond_weights = pad_tokens_and_weights(
354
+ uncond_tokens,
355
+ uncond_weights,
356
+ max_length,
357
+ bos,
358
+ eos,
359
+ no_boseos_middle=no_boseos_middle,
360
+ chunk_length=pipe.tokenizer.model_max_length,
361
+ )
362
+ uncond_tokens = torch.tensor(uncond_tokens, dtype=torch.long, device=pipe.device)
363
+
364
+ # get the embeddings
365
+ text_embeddings = get_unweighted_text_embeddings(
366
+ pipe,
367
+ prompt_tokens,
368
+ pipe.tokenizer.model_max_length,
369
+ clip_skip,
370
+ eos,
371
+ pad,
372
+ no_boseos_middle=no_boseos_middle,
373
+ )
374
+ prompt_weights = torch.tensor(prompt_weights, dtype=text_embeddings.dtype, device=pipe.device)
375
+ if uncond_prompt is not None:
376
+ uncond_embeddings = get_unweighted_text_embeddings(
377
+ pipe,
378
+ uncond_tokens,
379
+ pipe.tokenizer.model_max_length,
380
+ clip_skip,
381
+ eos,
382
+ pad,
383
+ no_boseos_middle=no_boseos_middle,
384
+ )
385
+ uncond_weights = torch.tensor(uncond_weights, dtype=uncond_embeddings.dtype, device=pipe.device)
386
+
387
+ # assign weights to the prompts and normalize in the sense of mean
388
+ # TODO: should we normalize by chunk or in a whole (current implementation)?
389
+ if (not skip_parsing) and (not skip_weighting):
390
+ previous_mean = text_embeddings.float().mean(axis=[-2, -1]).to(text_embeddings.dtype)
391
+ text_embeddings *= prompt_weights.unsqueeze(-1)
392
+ current_mean = text_embeddings.float().mean(axis=[-2, -1]).to(text_embeddings.dtype)
393
+ text_embeddings *= (previous_mean / current_mean).unsqueeze(-1).unsqueeze(-1)
394
+ if uncond_prompt is not None:
395
+ previous_mean = uncond_embeddings.float().mean(axis=[-2, -1]).to(uncond_embeddings.dtype)
396
+ uncond_embeddings *= uncond_weights.unsqueeze(-1)
397
+ current_mean = uncond_embeddings.float().mean(axis=[-2, -1]).to(uncond_embeddings.dtype)
398
+ uncond_embeddings *= (previous_mean / current_mean).unsqueeze(-1).unsqueeze(-1)
399
+
400
+ if uncond_prompt is not None:
401
+ return text_embeddings, uncond_embeddings
402
+ return text_embeddings, None
403
+
404
+
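Editorial sketch (not part of the uploaded file): the core of the weighting step inside `get_weighted_text_embeddings` above is "multiply each token embedding by its weight, then rescale so the overall mean is unchanged". A minimal, self-contained illustration with dummy tensors, assuming only `torch` is installed:

import torch

embeddings = torch.randn(1, 77, 768)  # stand-in for CLIP hidden states: (batch, tokens, dim)
weights = torch.ones(1, 77)
weights[0, 2:5] = 1.1                 # tokens inside one "(...)" group get weight 1.1

previous_mean = embeddings.float().mean(dim=[-2, -1])
weighted = embeddings * weights.unsqueeze(-1)
current_mean = weighted.float().mean(dim=[-2, -1])
weighted = weighted * (previous_mean / current_mean).unsqueeze(-1).unsqueeze(-1)
# the rescaling restores the original mean, so weighting only changes the relative emphasis of tokens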
405
+ def preprocess_image(image):
406
+ w, h = image.size
407
+ w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32
408
+ image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])
409
+ image = np.array(image).astype(np.float32) / 255.0
410
+ image = image[None].transpose(0, 3, 1, 2)
411
+ image = torch.from_numpy(image)
412
+ return 2.0 * image - 1.0
413
+
414
+
415
+ def preprocess_mask(mask, scale_factor=8):
416
+ mask = mask.convert("L")
417
+ w, h = mask.size
418
+ w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32
419
+ mask = mask.resize((w // scale_factor, h // scale_factor), resample=PIL_INTERPOLATION["nearest"])
420
+ mask = np.array(mask).astype(np.float32) / 255.0
421
+ mask = np.tile(mask, (4, 1, 1))
422
+     mask = mask[None].transpose(0, 1, 2, 3)  # add a batch dimension; this transpose is an identity permutation (no-op)
423
+ mask = 1 - mask # repaint white, keep black
424
+ mask = torch.from_numpy(mask)
425
+ return mask
426
+
427
+
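Editorial sketch (not part of the uploaded file): expected shapes and value ranges of the two preprocessing helpers above, using synthetic PIL images and assuming the helpers (and the module's `PIL_INTERPOLATION` import) are in scope:

import numpy as np
from PIL import Image

img = Image.fromarray(np.zeros((512, 512, 3), dtype=np.uint8))   # dummy RGB image
msk = Image.fromarray(np.full((512, 512), 255, dtype=np.uint8))  # dummy all-white mask

x = preprocess_image(img)  # torch.FloatTensor of shape (1, 3, 512, 512), values in [-1, 1]
m = preprocess_mask(msk)   # shape (1, 4, 64, 64); white pixels become 0, i.e. the region is repainted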
428
+ def prepare_controlnet_image(
429
+ image: PIL.Image.Image,
430
+ width: int,
431
+ height: int,
432
+ batch_size: int,
433
+ num_images_per_prompt: int,
434
+ device: torch.device,
435
+ dtype: torch.dtype,
436
+ do_classifier_free_guidance: bool = False,
437
+ guess_mode: bool = False,
438
+ ):
439
+ if not isinstance(image, torch.Tensor):
440
+ if isinstance(image, PIL.Image.Image):
441
+ image = [image]
442
+
443
+ if isinstance(image[0], PIL.Image.Image):
444
+ images = []
445
+
446
+ for image_ in image:
447
+ image_ = image_.convert("RGB")
448
+ image_ = image_.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])
449
+ image_ = np.array(image_)
450
+ image_ = image_[None, :]
451
+ images.append(image_)
452
+
453
+ image = images
454
+
455
+ image = np.concatenate(image, axis=0)
456
+ image = np.array(image).astype(np.float32) / 255.0
457
+ image = image.transpose(0, 3, 1, 2)
458
+ image = torch.from_numpy(image)
459
+ elif isinstance(image[0], torch.Tensor):
460
+ image = torch.cat(image, dim=0)
461
+
462
+ image_batch_size = image.shape[0]
463
+
464
+ if image_batch_size == 1:
465
+ repeat_by = batch_size
466
+ else:
467
+ # image batch size is the same as prompt batch size
468
+ repeat_by = num_images_per_prompt
469
+
470
+ image = image.repeat_interleave(repeat_by, dim=0)
471
+
472
+ image = image.to(device=device, dtype=dtype)
473
+
474
+ if do_classifier_free_guidance and not guess_mode:
475
+ image = torch.cat([image] * 2)
476
+
477
+ return image
478
+
479
+
480
+ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
481
+ r"""
482
+     Pipeline for text-to-image generation using Stable Diffusion without a token length limit, with support for
483
+     parsing weights in the prompt.
484
+
485
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
486
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
487
+
488
+ Args:
489
+ vae ([`AutoencoderKL`]):
490
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
491
+ text_encoder ([`CLIPTextModel`]):
492
+ Frozen text-encoder. Stable Diffusion uses the text portion of
493
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
494
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
495
+ tokenizer (`CLIPTokenizer`):
496
+ Tokenizer of class
497
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
498
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
499
+ scheduler ([`SchedulerMixin`]):
500
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
501
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
502
+ safety_checker ([`StableDiffusionSafetyChecker`]):
503
+ Classification module that estimates whether generated images could be considered offensive or harmful.
504
+ Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
505
+ feature_extractor ([`CLIPFeatureExtractor`]):
506
+ Model that extracts features from generated images to be used as inputs for the `safety_checker`.
507
+ """
508
+
509
+ # if version.parse(version.parse(diffusers.__version__).base_version) >= version.parse("0.9.0"):
510
+
511
+ def __init__(
512
+ self,
513
+ vae: AutoencoderKL,
514
+ text_encoder: CLIPTextModel,
515
+ tokenizer: CLIPTokenizer,
516
+ unet: UNet2DConditionModel,
517
+ scheduler: SchedulerMixin,
518
+ # clip_skip: int,
519
+ safety_checker: StableDiffusionSafetyChecker,
520
+ feature_extractor: CLIPFeatureExtractor,
521
+ requires_safety_checker: bool = True,
522
+ image_encoder: CLIPVisionModelWithProjection = None,
523
+ clip_skip: int = 1,
524
+ ):
525
+ super().__init__(
526
+ vae=vae,
527
+ text_encoder=text_encoder,
528
+ tokenizer=tokenizer,
529
+ unet=unet,
530
+ scheduler=scheduler,
531
+ safety_checker=safety_checker,
532
+ feature_extractor=feature_extractor,
533
+ requires_safety_checker=requires_safety_checker,
534
+ image_encoder=image_encoder,
535
+ )
536
+ self.custom_clip_skip = clip_skip
537
+ self.__init__additional__()
538
+
539
+ def __init__additional__(self):
540
+ if not hasattr(self, "vae_scale_factor"):
541
+ setattr(self, "vae_scale_factor", 2 ** (len(self.vae.config.block_out_channels) - 1))
542
+
543
+ @property
544
+ def _execution_device(self):
545
+ r"""
546
+ Returns the device on which the pipeline's models will be executed. After calling
547
+ `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
548
+ hooks.
549
+ """
550
+ if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
551
+ return self.device
552
+ for module in self.unet.modules():
553
+ if (
554
+ hasattr(module, "_hf_hook")
555
+ and hasattr(module._hf_hook, "execution_device")
556
+ and module._hf_hook.execution_device is not None
557
+ ):
558
+ return torch.device(module._hf_hook.execution_device)
559
+ return self.device
560
+
561
+ def _encode_prompt(
562
+ self,
563
+ prompt,
564
+ device,
565
+ num_images_per_prompt,
566
+ do_classifier_free_guidance,
567
+ negative_prompt,
568
+ max_embeddings_multiples,
569
+ ):
570
+ r"""
571
+ Encodes the prompt into text encoder hidden states.
572
+
573
+ Args:
574
+ prompt (`str` or `list(int)`):
575
+ prompt to be encoded
576
+ device: (`torch.device`):
577
+ torch device
578
+ num_images_per_prompt (`int`):
579
+ number of images that should be generated per prompt
580
+ do_classifier_free_guidance (`bool`):
581
+ whether to use classifier free guidance or not
582
+ negative_prompt (`str` or `List[str]`):
583
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
584
+ if `guidance_scale` is less than `1`).
585
+ max_embeddings_multiples (`int`, *optional*, defaults to `3`):
586
+ The max multiple length of prompt embeddings compared to the max output length of text encoder.
587
+ """
588
+ batch_size = len(prompt) if isinstance(prompt, list) else 1
589
+
590
+ if negative_prompt is None:
591
+ negative_prompt = [""] * batch_size
592
+ elif isinstance(negative_prompt, str):
593
+ negative_prompt = [negative_prompt] * batch_size
594
+ if batch_size != len(negative_prompt):
595
+ raise ValueError(
596
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
597
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
598
+ " the batch size of `prompt`."
599
+ )
600
+
601
+ text_embeddings, uncond_embeddings = get_weighted_text_embeddings(
602
+ pipe=self,
603
+ prompt=prompt,
604
+ uncond_prompt=negative_prompt if do_classifier_free_guidance else None,
605
+ max_embeddings_multiples=max_embeddings_multiples,
606
+ clip_skip=self.custom_clip_skip,
607
+ )
608
+ bs_embed, seq_len, _ = text_embeddings.shape
609
+ text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
610
+ text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
611
+
612
+ if do_classifier_free_guidance:
613
+ bs_embed, seq_len, _ = uncond_embeddings.shape
614
+ uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
615
+ uncond_embeddings = uncond_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
616
+ text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
617
+
618
+ return text_embeddings
619
+
620
+ def check_inputs(self, prompt, height, width, strength, callback_steps):
621
+ if not isinstance(prompt, str) and not isinstance(prompt, list):
622
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
623
+
624
+ if strength < 0 or strength > 1:
625
+             raise ValueError(f"The value of strength should be in [0.0, 1.0] but is {strength}")
626
+
627
+ if height % 8 != 0 or width % 8 != 0:
628
+ logger.info(f'{height} {width}')
629
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
630
+
631
+ if (callback_steps is None) or (
632
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
633
+ ):
634
+ raise ValueError(
635
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type" f" {type(callback_steps)}."
636
+ )
637
+
638
+ def get_timesteps(self, num_inference_steps, strength, device, is_text2img):
639
+ if is_text2img:
640
+ return self.scheduler.timesteps.to(device), num_inference_steps
641
+ else:
642
+ # get the original timestep using init_timestep
643
+ offset = self.scheduler.config.get("steps_offset", 0)
644
+ init_timestep = int(num_inference_steps * strength) + offset
645
+ init_timestep = min(init_timestep, num_inference_steps)
646
+
647
+ t_start = max(num_inference_steps - init_timestep + offset, 0)
648
+ timesteps = self.scheduler.timesteps[t_start:].to(device)
649
+ return timesteps, num_inference_steps - t_start
650
+
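    # Editorial note (not in the original): worked example for get_timesteps above.
    # For img2img with num_inference_steps=50, strength=0.8 and scheduler steps_offset=0:
    #   init_timestep = int(50 * 0.8) + 0 = 40, t_start = max(50 - 40 + 0, 0) = 10,
    # so the method returns (self.scheduler.timesteps[10:], 40) and only the last 40
    # of the 50 scheduled timesteps are actually run.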
651
+ def run_safety_checker(self, image, device, dtype):
652
+ if self.safety_checker is not None:
653
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
654
+ image, has_nsfw_concept = self.safety_checker(images=image, clip_input=safety_checker_input.pixel_values.to(dtype))
655
+ else:
656
+ has_nsfw_concept = None
657
+ return image, has_nsfw_concept
658
+
659
+ def decode_latents(self, latents):
660
+ latents = 1 / 0.18215 * latents
661
+ image = self.vae.decode(latents).sample
662
+ image = (image / 2 + 0.5).clamp(0, 1)
663
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
664
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
665
+ return image
666
+
667
+ def prepare_extra_step_kwargs(self, generator, eta):
668
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
669
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
670
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
671
+ # and should be between [0, 1]
672
+
673
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
674
+ extra_step_kwargs = {}
675
+ if accepts_eta:
676
+ extra_step_kwargs["eta"] = eta
677
+
678
+ # check if the scheduler accepts generator
679
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
680
+ if accepts_generator:
681
+ extra_step_kwargs["generator"] = generator
682
+ return extra_step_kwargs
683
+
684
+ def prepare_latents(self, image, timestep, batch_size, height, width, dtype, device, generator, latents=None):
685
+ if image is None:
686
+ shape = (
687
+ batch_size,
688
+ self.unet.in_channels,
689
+ height // self.vae_scale_factor,
690
+ width // self.vae_scale_factor,
691
+ )
692
+
693
+ if latents is None:
694
+ if device.type == "mps":
695
+ # randn does not work reproducibly on mps
696
+ latents = torch.randn(shape, generator=generator, device="cpu", dtype=dtype).to(device)
697
+ else:
698
+ latents = torch.randn(shape, generator=generator, device=device, dtype=dtype)
699
+ else:
700
+ if latents.shape != shape:
701
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
702
+ latents = latents.to(device)
703
+
704
+ # scale the initial noise by the standard deviation required by the scheduler
705
+ latents = latents * self.scheduler.init_noise_sigma
706
+ return latents, None, None
707
+ else:
708
+ init_latent_dist = self.vae.encode(image).latent_dist
709
+ init_latents = init_latent_dist.sample(generator=generator)
710
+ init_latents = 0.18215 * init_latents
711
+ init_latents = torch.cat([init_latents] * batch_size, dim=0)
712
+ init_latents_orig = init_latents
713
+ shape = init_latents.shape
714
+
715
+ # add noise to latents using the timesteps
716
+ if device.type == "mps":
717
+ noise = torch.randn(shape, generator=generator, device="cpu", dtype=dtype).to(device)
718
+ else:
719
+ noise = torch.randn(shape, generator=generator, device=device, dtype=dtype)
720
+ latents = self.scheduler.add_noise(init_latents, noise, timestep)
721
+ return latents, init_latents_orig, noise
722
+
723
+ @torch.no_grad()
724
+ def __call__(
725
+ self,
726
+ prompt: Union[str, List[str]],
727
+ negative_prompt: Optional[Union[str, List[str]]] = None,
728
+ image: Union[torch.FloatTensor, PIL.Image.Image] = None,
729
+ mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
730
+ height: int = 512,
731
+ width: int = 512,
732
+ num_inference_steps: int = 50,
733
+ guidance_scale: float = 7.5,
734
+ strength: float = 0.8,
735
+ num_images_per_prompt: Optional[int] = 1,
736
+ eta: float = 0.0,
737
+ generator: Optional[torch.Generator] = None,
738
+ latents: Optional[torch.FloatTensor] = None,
739
+ max_embeddings_multiples: Optional[int] = 3,
740
+ output_type: Optional[str] = "pil",
741
+ return_dict: bool = True,
742
+ controlnet=None,
743
+ controlnet_image=None,
744
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
745
+ is_cancelled_callback: Optional[Callable[[], bool]] = None,
746
+ callback_steps: int = 1,
747
+ ):
748
+ r"""
749
+ Function invoked when calling the pipeline for generation.
750
+
751
+ Args:
752
+ prompt (`str` or `List[str]`):
753
+ The prompt or prompts to guide the image generation.
754
+ negative_prompt (`str` or `List[str]`, *optional*):
755
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
756
+ if `guidance_scale` is less than `1`).
757
+ image (`torch.FloatTensor` or `PIL.Image.Image`):
758
+ `Image`, or tensor representing an image batch, that will be used as the starting point for the
759
+ process.
760
+ mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
761
+ `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
762
+ replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
763
+ PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
764
+ contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
765
+ height (`int`, *optional*, defaults to 512):
766
+ The height in pixels of the generated image.
767
+ width (`int`, *optional*, defaults to 512):
768
+ The width in pixels of the generated image.
769
+ num_inference_steps (`int`, *optional*, defaults to 50):
770
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
771
+ expense of slower inference.
772
+ guidance_scale (`float`, *optional*, defaults to 7.5):
773
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
774
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
775
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
776
+                 1`. Higher guidance scale encourages generating images that are closely linked to the text `prompt`,
777
+ usually at the expense of lower image quality.
778
+ strength (`float`, *optional*, defaults to 0.8):
779
+ Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
780
+ `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
781
+ number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
782
+ noise will be maximum and the denoising process will run for the full number of iterations specified in
783
+ `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
784
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
785
+ The number of images to generate per prompt.
786
+ eta (`float`, *optional*, defaults to 0.0):
787
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
788
+ [`schedulers.DDIMScheduler`], will be ignored for others.
789
+ generator (`torch.Generator`, *optional*):
790
+ A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
791
+ deterministic.
792
+ latents (`torch.FloatTensor`, *optional*):
793
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
794
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
795
+                 tensor will be generated by sampling using the supplied random `generator`.
796
+ max_embeddings_multiples (`int`, *optional*, defaults to `3`):
797
+ The max multiple length of prompt embeddings compared to the max output length of text encoder.
798
+ output_type (`str`, *optional*, defaults to `"pil"`):
799
+                 The output format of the generated image. Choose between
800
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
801
+ return_dict (`bool`, *optional*, defaults to `True`):
802
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
803
+ plain tuple.
804
+ controlnet (`diffusers.ControlNetModel`, *optional*):
805
+ A controlnet model to be used for the inference. If not provided, controlnet will be disabled.
806
+ controlnet_image (`torch.FloatTensor` or `PIL.Image.Image`, *optional*):
807
+ `Image`, or tensor representing an image batch, to be used as the starting point for the controlnet
808
+ inference.
809
+ callback (`Callable`, *optional*):
810
+ A function that will be called every `callback_steps` steps during inference. The function will be
811
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
812
+ is_cancelled_callback (`Callable`, *optional*):
813
+ A function that will be called every `callback_steps` steps during inference. If the function returns
814
+ `True`, the inference will be cancelled.
815
+ callback_steps (`int`, *optional*, defaults to 1):
816
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
817
+ called at every step.
818
+
819
+ Returns:
820
+             `None` if cancelled by `is_cancelled_callback`; otherwise the denoised latents as a `torch.FloatTensor`.
+             Unlike the standard Stable Diffusion pipeline, this `__call__` does not decode the latents or run the
+             `safety_checker`; use `latents_to_image` to convert the returned latents into `PIL.Image.Image`s.
826
+ """
827
+ if controlnet is not None and controlnet_image is None:
828
+ raise ValueError("controlnet_image must be provided if controlnet is not None.")
829
+
830
+ # 0. Default height and width to unet
831
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
832
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
833
+
834
+ # 1. Check inputs. Raise error if not correct
835
+ self.check_inputs(prompt, height, width, strength, callback_steps)
836
+
837
+ # 2. Define call parameters
838
+ batch_size = 1 if isinstance(prompt, str) else len(prompt)
839
+ device = self._execution_device
840
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
841
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
842
+ # corresponds to doing no classifier free guidance.
843
+ do_classifier_free_guidance = guidance_scale > 1.0
844
+
845
+ # 3. Encode input prompt
846
+ text_embeddings = self._encode_prompt(
847
+ prompt,
848
+ device,
849
+ num_images_per_prompt,
850
+ do_classifier_free_guidance,
851
+ negative_prompt,
852
+ max_embeddings_multiples,
853
+ )
854
+ dtype = text_embeddings.dtype
855
+
856
+ # 4. Preprocess image and mask
857
+ if isinstance(image, PIL.Image.Image):
858
+ image = preprocess_image(image)
859
+ if image is not None:
860
+ image = image.to(device=self.device, dtype=dtype)
861
+ if isinstance(mask_image, PIL.Image.Image):
862
+ mask_image = preprocess_mask(mask_image, self.vae_scale_factor)
863
+ if mask_image is not None:
864
+ mask = mask_image.to(device=self.device, dtype=dtype)
865
+ mask = torch.cat([mask] * batch_size * num_images_per_prompt)
866
+ else:
867
+ mask = None
868
+
869
+ if controlnet_image is not None:
870
+ controlnet_image = prepare_controlnet_image(
871
+ controlnet_image, width, height, batch_size, 1, self.device, controlnet.dtype, do_classifier_free_guidance, False
872
+ )
873
+
874
+ # 5. set timesteps
875
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
876
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device, image is None)
877
+ latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
878
+
879
+ # 6. Prepare latent variables
880
+ latents, init_latents_orig, noise = self.prepare_latents(
881
+ image,
882
+ latent_timestep,
883
+ batch_size * num_images_per_prompt,
884
+ height,
885
+ width,
886
+ dtype,
887
+ device,
888
+ generator,
889
+ latents,
890
+ )
891
+
892
+ # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
893
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
894
+
895
+ # 8. Denoising loop
896
+ for i, t in enumerate(self.progress_bar(timesteps)):
897
+ # expand the latents if we are doing classifier free guidance
898
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
899
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
900
+
901
+ unet_additional_args = {}
902
+ if controlnet is not None:
903
+ down_block_res_samples, mid_block_res_sample = controlnet(
904
+ latent_model_input,
905
+ t,
906
+ encoder_hidden_states=text_embeddings,
907
+ controlnet_cond=controlnet_image,
908
+ conditioning_scale=1.0,
909
+ guess_mode=False,
910
+ return_dict=False,
911
+ )
912
+ unet_additional_args["down_block_additional_residuals"] = down_block_res_samples
913
+ unet_additional_args["mid_block_additional_residual"] = mid_block_res_sample
914
+
915
+ # predict the noise residual
916
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings, **unet_additional_args).sample
917
+
918
+ # perform guidance
919
+ if do_classifier_free_guidance:
920
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
921
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
922
+
923
+ # compute the previous noisy sample x_t -> x_t-1
924
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
925
+
926
+ if mask is not None:
927
+ # masking
928
+ init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, torch.tensor([t]))
929
+ latents = (init_latents_proper * mask) + (latents * (1 - mask))
930
+
931
+ # call the callback, if provided
932
+ if i % callback_steps == 0:
933
+ if callback is not None:
934
+ callback(i, t, latents)
935
+ if is_cancelled_callback is not None and is_cancelled_callback():
936
+ return None
937
+
938
+ return latents
939
+
940
+ def latents_to_image(self, latents):
941
+ # 9. Post-processing
942
+ image = self.decode_latents(latents.to(self.vae.dtype))
943
+ image = self.numpy_to_pil(image)
944
+ return image
945
+
946
+ def text2img(
947
+ self,
948
+ prompt: Union[str, List[str]],
949
+ negative_prompt: Optional[Union[str, List[str]]] = None,
950
+ height: int = 512,
951
+ width: int = 512,
952
+ num_inference_steps: int = 50,
953
+ guidance_scale: float = 7.5,
954
+ num_images_per_prompt: Optional[int] = 1,
955
+ eta: float = 0.0,
956
+ generator: Optional[torch.Generator] = None,
957
+ latents: Optional[torch.FloatTensor] = None,
958
+ max_embeddings_multiples: Optional[int] = 3,
959
+ output_type: Optional[str] = "pil",
960
+ return_dict: bool = True,
961
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
962
+ is_cancelled_callback: Optional[Callable[[], bool]] = None,
963
+ callback_steps: int = 1,
964
+ ):
965
+ r"""
966
+ Function for text-to-image generation.
967
+ Args:
968
+ prompt (`str` or `List[str]`):
969
+ The prompt or prompts to guide the image generation.
970
+ negative_prompt (`str` or `List[str]`, *optional*):
971
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
972
+ if `guidance_scale` is less than `1`).
973
+ height (`int`, *optional*, defaults to 512):
974
+ The height in pixels of the generated image.
975
+ width (`int`, *optional*, defaults to 512):
976
+ The width in pixels of the generated image.
977
+ num_inference_steps (`int`, *optional*, defaults to 50):
978
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
979
+ expense of slower inference.
980
+ guidance_scale (`float`, *optional*, defaults to 7.5):
981
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
982
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
983
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
984
+                 1`. Higher guidance scale encourages generating images that are closely linked to the text `prompt`,
985
+ usually at the expense of lower image quality.
986
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
987
+ The number of images to generate per prompt.
988
+ eta (`float`, *optional*, defaults to 0.0):
989
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
990
+ [`schedulers.DDIMScheduler`], will be ignored for others.
991
+ generator (`torch.Generator`, *optional*):
992
+ A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
993
+ deterministic.
994
+ latents (`torch.FloatTensor`, *optional*):
995
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
996
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
997
+                 tensor will be generated by sampling using the supplied random `generator`.
998
+ max_embeddings_multiples (`int`, *optional*, defaults to `3`):
999
+ The max multiple length of prompt embeddings compared to the max output length of text encoder.
1000
+ output_type (`str`, *optional*, defaults to `"pil"`):
1001
+                 The output format of the generated image. Choose between
1002
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
1003
+ return_dict (`bool`, *optional*, defaults to `True`):
1004
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
1005
+ plain tuple.
1006
+ callback (`Callable`, *optional*):
1007
+ A function that will be called every `callback_steps` steps during inference. The function will be
1008
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
1009
+ is_cancelled_callback (`Callable`, *optional*):
1010
+ A function that will be called every `callback_steps` steps during inference. If the function returns
1011
+ `True`, the inference will be cancelled.
1012
+ callback_steps (`int`, *optional*, defaults to 1):
1013
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
1014
+ called at every step.
1015
+ Returns:
1016
+             The denoised latents as a `torch.FloatTensor` (or `None` if cancelled by `is_cancelled_callback`);
+             decode them with `latents_to_image` to obtain `PIL.Image.Image`s.
1021
+ """
1022
+ return self.__call__(
1023
+ prompt=prompt,
1024
+ negative_prompt=negative_prompt,
1025
+ height=height,
1026
+ width=width,
1027
+ num_inference_steps=num_inference_steps,
1028
+ guidance_scale=guidance_scale,
1029
+ num_images_per_prompt=num_images_per_prompt,
1030
+ eta=eta,
1031
+ generator=generator,
1032
+ latents=latents,
1033
+ max_embeddings_multiples=max_embeddings_multiples,
1034
+ output_type=output_type,
1035
+ return_dict=return_dict,
1036
+ callback=callback,
1037
+ is_cancelled_callback=is_cancelled_callback,
1038
+ callback_steps=callback_steps,
1039
+ )
1040
+
1041
+ def img2img(
1042
+ self,
1043
+ image: Union[torch.FloatTensor, PIL.Image.Image],
1044
+ prompt: Union[str, List[str]],
1045
+ negative_prompt: Optional[Union[str, List[str]]] = None,
1046
+ strength: float = 0.8,
1047
+ num_inference_steps: Optional[int] = 50,
1048
+ guidance_scale: Optional[float] = 7.5,
1049
+ num_images_per_prompt: Optional[int] = 1,
1050
+ eta: Optional[float] = 0.0,
1051
+ generator: Optional[torch.Generator] = None,
1052
+ max_embeddings_multiples: Optional[int] = 3,
1053
+ output_type: Optional[str] = "pil",
1054
+ return_dict: bool = True,
1055
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
1056
+ is_cancelled_callback: Optional[Callable[[], bool]] = None,
1057
+ callback_steps: int = 1,
1058
+ ):
1059
+ r"""
1060
+ Function for image-to-image generation.
1061
+ Args:
1062
+ image (`torch.FloatTensor` or `PIL.Image.Image`):
1063
+ `Image`, or tensor representing an image batch, that will be used as the starting point for the
1064
+ process.
1065
+ prompt (`str` or `List[str]`):
1066
+ The prompt or prompts to guide the image generation.
1067
+ negative_prompt (`str` or `List[str]`, *optional*):
1068
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
1069
+ if `guidance_scale` is less than `1`).
1070
+ strength (`float`, *optional*, defaults to 0.8):
1071
+ Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
1072
+ `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
1073
+ number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
1074
+ noise will be maximum and the denoising process will run for the full number of iterations specified in
1075
+ `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
1076
+ num_inference_steps (`int`, *optional*, defaults to 50):
1077
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
1078
+ expense of slower inference. This parameter will be modulated by `strength`.
1079
+ guidance_scale (`float`, *optional*, defaults to 7.5):
1080
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
1081
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
1082
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
1083
+                 1`. Higher guidance scale encourages generating images that are closely linked to the text `prompt`,
1084
+ usually at the expense of lower image quality.
1085
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
1086
+ The number of images to generate per prompt.
1087
+ eta (`float`, *optional*, defaults to 0.0):
1088
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
1089
+ [`schedulers.DDIMScheduler`], will be ignored for others.
1090
+ generator (`torch.Generator`, *optional*):
1091
+ A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
1092
+ deterministic.
1093
+ max_embeddings_multiples (`int`, *optional*, defaults to `3`):
1094
+ The max multiple length of prompt embeddings compared to the max output length of text encoder.
1095
+ output_type (`str`, *optional*, defaults to `"pil"`):
1096
+                 The output format of the generated image. Choose between
1097
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
1098
+ return_dict (`bool`, *optional*, defaults to `True`):
1099
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
1100
+ plain tuple.
1101
+ callback (`Callable`, *optional*):
1102
+ A function that will be called every `callback_steps` steps during inference. The function will be
1103
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
1104
+ is_cancelled_callback (`Callable`, *optional*):
1105
+ A function that will be called every `callback_steps` steps during inference. If the function returns
1106
+ `True`, the inference will be cancelled.
1107
+ callback_steps (`int`, *optional*, defaults to 1):
1108
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
1109
+ called at every step.
1110
+ Returns:
1111
+             The denoised latents as a `torch.FloatTensor` (or `None` if cancelled by `is_cancelled_callback`);
+             decode them with `latents_to_image` to obtain `PIL.Image.Image`s.
1116
+ """
1117
+ return self.__call__(
1118
+ prompt=prompt,
1119
+ negative_prompt=negative_prompt,
1120
+ image=image,
1121
+ num_inference_steps=num_inference_steps,
1122
+ guidance_scale=guidance_scale,
1123
+ strength=strength,
1124
+ num_images_per_prompt=num_images_per_prompt,
1125
+ eta=eta,
1126
+ generator=generator,
1127
+ max_embeddings_multiples=max_embeddings_multiples,
1128
+ output_type=output_type,
1129
+ return_dict=return_dict,
1130
+ callback=callback,
1131
+ is_cancelled_callback=is_cancelled_callback,
1132
+ callback_steps=callback_steps,
1133
+ )
1134
+
1135
+ def inpaint(
1136
+ self,
1137
+ image: Union[torch.FloatTensor, PIL.Image.Image],
1138
+ mask_image: Union[torch.FloatTensor, PIL.Image.Image],
1139
+ prompt: Union[str, List[str]],
1140
+ negative_prompt: Optional[Union[str, List[str]]] = None,
1141
+ strength: float = 0.8,
1142
+ num_inference_steps: Optional[int] = 50,
1143
+ guidance_scale: Optional[float] = 7.5,
1144
+ num_images_per_prompt: Optional[int] = 1,
1145
+ eta: Optional[float] = 0.0,
1146
+ generator: Optional[torch.Generator] = None,
1147
+ max_embeddings_multiples: Optional[int] = 3,
1148
+ output_type: Optional[str] = "pil",
1149
+ return_dict: bool = True,
1150
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
1151
+ is_cancelled_callback: Optional[Callable[[], bool]] = None,
1152
+ callback_steps: int = 1,
1153
+ ):
1154
+ r"""
1155
+ Function for inpaint.
1156
+ Args:
1157
+ image (`torch.FloatTensor` or `PIL.Image.Image`):
1158
+ `Image`, or tensor representing an image batch, that will be used as the starting point for the
1159
+ process. This is the image whose masked region will be inpainted.
1160
+ mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
1161
+ `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
1162
+ replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
1163
+ PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
1164
+ contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
1165
+ prompt (`str` or `List[str]`):
1166
+ The prompt or prompts to guide the image generation.
1167
+ negative_prompt (`str` or `List[str]`, *optional*):
1168
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
1169
+ if `guidance_scale` is less than `1`).
1170
+ strength (`float`, *optional*, defaults to 0.8):
1171
+ Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
1172
+ is 1, the denoising process will be run on the masked area for the full number of iterations specified
1173
+ in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more
1174
+ noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
1175
+ num_inference_steps (`int`, *optional*, defaults to 50):
1176
+ The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
1177
+ the expense of slower inference. This parameter will be modulated by `strength`, as explained above.
1178
+ guidance_scale (`float`, *optional*, defaults to 7.5):
1179
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
1180
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
1181
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
1182
+                 1`. Higher guidance scale encourages generating images that are closely linked to the text `prompt`,
1183
+ usually at the expense of lower image quality.
1184
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
1185
+ The number of images to generate per prompt.
1186
+ eta (`float`, *optional*, defaults to 0.0):
1187
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
1188
+ [`schedulers.DDIMScheduler`], will be ignored for others.
1189
+ generator (`torch.Generator`, *optional*):
1190
+ A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
1191
+ deterministic.
1192
+ max_embeddings_multiples (`int`, *optional*, defaults to `3`):
1193
+ The max multiple length of prompt embeddings compared to the max output length of text encoder.
1194
+ output_type (`str`, *optional*, defaults to `"pil"`):
1195
+                 The output format of the generated image. Choose between
1196
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
1197
+ return_dict (`bool`, *optional*, defaults to `True`):
1198
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
1199
+ plain tuple.
1200
+ callback (`Callable`, *optional*):
1201
+ A function that will be called every `callback_steps` steps during inference. The function will be
1202
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
1203
+ is_cancelled_callback (`Callable`, *optional*):
1204
+ A function that will be called every `callback_steps` steps during inference. If the function returns
1205
+ `True`, the inference will be cancelled.
1206
+ callback_steps (`int`, *optional*, defaults to 1):
1207
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
1208
+ called at every step.
1209
+ Returns:
1210
+             The denoised latents as a `torch.FloatTensor` (or `None` if cancelled by `is_cancelled_callback`);
+             decode them with `latents_to_image` to obtain `PIL.Image.Image`s.
1215
+ """
1216
+ return self.__call__(
1217
+ prompt=prompt,
1218
+ negative_prompt=negative_prompt,
1219
+ image=image,
1220
+ mask_image=mask_image,
1221
+ num_inference_steps=num_inference_steps,
1222
+ guidance_scale=guidance_scale,
1223
+ strength=strength,
1224
+ num_images_per_prompt=num_images_per_prompt,
1225
+ eta=eta,
1226
+ generator=generator,
1227
+ max_embeddings_multiples=max_embeddings_multiples,
1228
+ output_type=output_type,
1229
+ return_dict=return_dict,
1230
+ callback=callback,
1231
+ is_cancelled_callback=is_cancelled_callback,
1232
+ callback_steps=callback_steps,
1233
+ )
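Editorial usage sketch (not part of the uploaded file). It assumes a recent `diffusers` install, a CUDA GPU, and access to the `runwayml/stable-diffusion-v1-5` weights; the pipeline is assembled from a stock `StableDiffusionPipeline`'s components. Note that `__call__` (and therefore `text2img`/`img2img`/`inpaint`) returns latents here, so `latents_to_image` is used to decode them:

import torch
from diffusers import StableDiffusionPipeline

base = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
pipe = StableDiffusionLongPromptWeightingPipeline(
    vae=base.vae,
    text_encoder=base.text_encoder,
    tokenizer=base.tokenizer,
    unet=base.unet,
    scheduler=base.scheduler,
    safety_checker=base.safety_checker,
    feature_extractor=base.feature_extractor,
    requires_safety_checker=False,
    clip_skip=1,
).to("cuda")

# "(cherry blossoms)" raises the weight of those tokens; the prompt may also exceed 77 tokens
latents = pipe.text2img("a watercolor landscape with (cherry blossoms), highly detailed")
image = pipe.latents_to_image(latents)[0]
image.save("lpw_sample.png")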
library/model_util.py ADDED
@@ -0,0 +1,1356 @@
1
+ # v1: split from train_db_fixed.py.
2
+ # v2: support safetensors
3
+
4
+ import math
5
+ import os
6
+
7
+ import torch
8
+ from library.device_utils import init_ipex
9
+ init_ipex()
10
+
11
+ import diffusers
12
+ from transformers import CLIPTextModel, CLIPTokenizer, CLIPTextConfig, logging
13
+ from diffusers import AutoencoderKL, DDIMScheduler, StableDiffusionPipeline # , UNet2DConditionModel
14
+ from safetensors.torch import load_file, save_file
15
+ from library.original_unet import UNet2DConditionModel
16
+ from library.utils import setup_logging
17
+ setup_logging()
18
+ import logging
19
+ logger = logging.getLogger(__name__)
20
+
21
+ # model parameters for the Diffusers version of Stable Diffusion
22
+ NUM_TRAIN_TIMESTEPS = 1000
23
+ BETA_START = 0.00085
24
+ BETA_END = 0.0120
25
+
26
+ UNET_PARAMS_MODEL_CHANNELS = 320
27
+ UNET_PARAMS_CHANNEL_MULT = [1, 2, 4, 4]
28
+ UNET_PARAMS_ATTENTION_RESOLUTIONS = [4, 2, 1]
29
+ UNET_PARAMS_IMAGE_SIZE = 64 # fixed from old invalid value `32`
30
+ UNET_PARAMS_IN_CHANNELS = 4
31
+ UNET_PARAMS_OUT_CHANNELS = 4
32
+ UNET_PARAMS_NUM_RES_BLOCKS = 2
33
+ UNET_PARAMS_CONTEXT_DIM = 768
34
+ UNET_PARAMS_NUM_HEADS = 8
35
+ # UNET_PARAMS_USE_LINEAR_PROJECTION = False
36
+
37
+ VAE_PARAMS_Z_CHANNELS = 4
38
+ VAE_PARAMS_RESOLUTION = 256
39
+ VAE_PARAMS_IN_CHANNELS = 3
40
+ VAE_PARAMS_OUT_CH = 3
41
+ VAE_PARAMS_CH = 128
42
+ VAE_PARAMS_CH_MULT = [1, 2, 4, 4]
43
+ VAE_PARAMS_NUM_RES_BLOCKS = 2
44
+
45
+ # V2
46
+ V2_UNET_PARAMS_ATTENTION_HEAD_DIM = [5, 10, 20, 20]
47
+ V2_UNET_PARAMS_CONTEXT_DIM = 1024
48
+ # V2_UNET_PARAMS_USE_LINEAR_PROJECTION = True
49
+
50
+ # reference models used to load the Diffusers configuration
51
+ DIFFUSERS_REF_MODEL_ID_V1 = "runwayml/stable-diffusion-v1-5"
52
+ DIFFUSERS_REF_MODEL_ID_V2 = "stabilityai/stable-diffusion-2-1"
53
+
54
+
55
+ # region conversion code: Stable Diffusion -> Diffusers
56
+ # copied from convert_original_stable_diffusion_to_diffusers and modified (ASL 2.0)
57
+
58
+
59
+ def shave_segments(path, n_shave_prefix_segments=1):
60
+ """
61
+ Removes segments. Positive values shave the first segments, negative shave the last segments.
62
+ """
63
+ if n_shave_prefix_segments >= 0:
64
+ return ".".join(path.split(".")[n_shave_prefix_segments:])
65
+ else:
66
+ return ".".join(path.split(".")[:n_shave_prefix_segments])
67
+
68
+
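# Editorial example (not in the original): shave_segments drops or keeps leading
# dot-separated segments of a key, e.g.
#   shave_segments("input_blocks.0.0.weight", 1)  -> "0.0.weight"
#   shave_segments("input_blocks.0.0.weight", -1) -> "input_blocks.0.0"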
69
+ def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
70
+ """
71
+ Updates paths inside resnets to the new naming scheme (local renaming)
72
+ """
73
+ mapping = []
74
+ for old_item in old_list:
75
+ new_item = old_item.replace("in_layers.0", "norm1")
76
+ new_item = new_item.replace("in_layers.2", "conv1")
77
+
78
+ new_item = new_item.replace("out_layers.0", "norm2")
79
+ new_item = new_item.replace("out_layers.3", "conv2")
80
+
81
+ new_item = new_item.replace("emb_layers.1", "time_emb_proj")
82
+ new_item = new_item.replace("skip_connection", "conv_shortcut")
83
+
84
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
85
+
86
+ mapping.append({"old": old_item, "new": new_item})
87
+
88
+ return mapping
89
+
90
+
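# Editorial example (not in the original): renew_resnet_paths performs the local
# LDM -> Diffusers renaming, e.g.
#   renew_resnet_paths(["input_blocks.1.0.in_layers.0.weight"])
#   -> [{"old": "input_blocks.1.0.in_layers.0.weight",
#        "new": "input_blocks.1.0.norm1.weight"}]
# The block-level prefix ("input_blocks.1.0" -> "down_blocks.0.resnets.0") is applied
# later by assign_to_checkpoint via its additional_replacements argument.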
91
+ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
92
+ """
93
+ Updates paths inside resnets to the new naming scheme (local renaming)
94
+ """
95
+ mapping = []
96
+ for old_item in old_list:
97
+ new_item = old_item
98
+
99
+ new_item = new_item.replace("nin_shortcut", "conv_shortcut")
100
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
101
+
102
+ mapping.append({"old": old_item, "new": new_item})
103
+
104
+ return mapping
105
+
106
+
107
+ def renew_attention_paths(old_list, n_shave_prefix_segments=0):
108
+ """
109
+ Updates paths inside attentions to the new naming scheme (local renaming)
110
+ """
111
+ mapping = []
112
+ for old_item in old_list:
113
+ new_item = old_item
114
+
115
+ # new_item = new_item.replace('norm.weight', 'group_norm.weight')
116
+ # new_item = new_item.replace('norm.bias', 'group_norm.bias')
117
+
118
+ # new_item = new_item.replace('proj_out.weight', 'proj_attn.weight')
119
+ # new_item = new_item.replace('proj_out.bias', 'proj_attn.bias')
120
+
121
+ # new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
122
+
123
+ mapping.append({"old": old_item, "new": new_item})
124
+
125
+ return mapping
126
+
127
+
128
+ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
129
+ """
130
+ Updates paths inside attentions to the new naming scheme (local renaming)
131
+ """
132
+ mapping = []
133
+ for old_item in old_list:
134
+ new_item = old_item
135
+
136
+ new_item = new_item.replace("norm.weight", "group_norm.weight")
137
+ new_item = new_item.replace("norm.bias", "group_norm.bias")
138
+
139
+ if diffusers.__version__ < "0.17.0":
140
+ new_item = new_item.replace("q.weight", "query.weight")
141
+ new_item = new_item.replace("q.bias", "query.bias")
142
+
143
+ new_item = new_item.replace("k.weight", "key.weight")
144
+ new_item = new_item.replace("k.bias", "key.bias")
145
+
146
+ new_item = new_item.replace("v.weight", "value.weight")
147
+ new_item = new_item.replace("v.bias", "value.bias")
148
+
149
+ new_item = new_item.replace("proj_out.weight", "proj_attn.weight")
150
+ new_item = new_item.replace("proj_out.bias", "proj_attn.bias")
151
+ else:
152
+ new_item = new_item.replace("q.weight", "to_q.weight")
153
+ new_item = new_item.replace("q.bias", "to_q.bias")
154
+
155
+ new_item = new_item.replace("k.weight", "to_k.weight")
156
+ new_item = new_item.replace("k.bias", "to_k.bias")
157
+
158
+ new_item = new_item.replace("v.weight", "to_v.weight")
159
+ new_item = new_item.replace("v.bias", "to_v.bias")
160
+
161
+ new_item = new_item.replace("proj_out.weight", "to_out.0.weight")
162
+ new_item = new_item.replace("proj_out.bias", "to_out.0.bias")
163
+
164
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
165
+
166
+ mapping.append({"old": old_item, "new": new_item})
167
+
168
+ return mapping
169
+
170
+
171
+ def assign_to_checkpoint(
172
+ paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
173
+ ):
174
+ """
175
+ This does the final conversion step: take locally converted weights and apply a global renaming
176
+ to them. It splits attention layers, and takes into account additional replacements
177
+ that may arise.
178
+
179
+ Assigns the weights to the new checkpoint.
180
+ """
181
+ assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
182
+
183
+ # Splits the attention layers into three variables.
184
+ if attention_paths_to_split is not None:
185
+ for path, path_map in attention_paths_to_split.items():
186
+ old_tensor = old_checkpoint[path]
187
+ channels = old_tensor.shape[0] // 3
188
+
189
+ target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
190
+
191
+ num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
192
+
193
+ old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
194
+ query, key, value = old_tensor.split(channels // num_heads, dim=1)
195
+
196
+ checkpoint[path_map["query"]] = query.reshape(target_shape)
197
+ checkpoint[path_map["key"]] = key.reshape(target_shape)
198
+ checkpoint[path_map["value"]] = value.reshape(target_shape)
199
+
200
+ for path in paths:
201
+ new_path = path["new"]
202
+
203
+ # These have already been assigned
204
+ if attention_paths_to_split is not None and new_path in attention_paths_to_split:
205
+ continue
206
+
207
+ # Global renaming happens here
208
+ new_path = new_path.replace("middle_block.0", "mid_block.resnets.0")
209
+ new_path = new_path.replace("middle_block.1", "mid_block.attentions.0")
210
+ new_path = new_path.replace("middle_block.2", "mid_block.resnets.1")
211
+
212
+ if additional_replacements is not None:
213
+ for replacement in additional_replacements:
214
+ new_path = new_path.replace(replacement["old"], replacement["new"])
215
+
216
+ # proj_attn.weight has to be converted from conv 1D to linear
217
+ reshaping = False
218
+ if diffusers.__version__ < "0.17.0":
219
+ if "proj_attn.weight" in new_path:
220
+ reshaping = True
221
+ else:
222
+ if ".attentions." in new_path and ".0.to_" in new_path and old_checkpoint[path["old"]].ndim > 2:
223
+ reshaping = True
224
+
225
+ if reshaping:
226
+ checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0, 0]
227
+ else:
228
+ checkpoint[new_path] = old_checkpoint[path["old"]]
229
+
230
+
231
+ def conv_attn_to_linear(checkpoint):
232
+ keys = list(checkpoint.keys())
233
+ attn_keys = ["query.weight", "key.weight", "value.weight"]
234
+ for key in keys:
235
+ if ".".join(key.split(".")[-2:]) in attn_keys:
236
+ if checkpoint[key].ndim > 2:
237
+ checkpoint[key] = checkpoint[key][:, :, 0, 0]
238
+ elif "proj_attn.weight" in key:
239
+ if checkpoint[key].ndim > 2:
240
+ checkpoint[key] = checkpoint[key][:, :, 0]
241
+
242
+
243
+ def linear_transformer_to_conv(checkpoint):
244
+ keys = list(checkpoint.keys())
245
+ tf_keys = ["proj_in.weight", "proj_out.weight"]
246
+ for key in keys:
247
+ if ".".join(key.split(".")[-2:]) in tf_keys:
248
+ if checkpoint[key].ndim == 2:
249
+ checkpoint[key] = checkpoint[key].unsqueeze(2).unsqueeze(2)
250
+
251
+
252
+ def convert_ldm_unet_checkpoint(v2, checkpoint, config):
253
+ """
254
+ Takes a state dict and a config, and returns a converted checkpoint.
255
+ """
256
+
257
+ # extract state_dict for UNet
258
+ unet_state_dict = {}
259
+ unet_key = "model.diffusion_model."
260
+ keys = list(checkpoint.keys())
261
+ for key in keys:
262
+ if key.startswith(unet_key):
263
+ unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)
264
+
265
+ new_checkpoint = {}
266
+
267
+ new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
268
+ new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
269
+ new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
270
+ new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
271
+
272
+ new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
273
+ new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
274
+
275
+ new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
276
+ new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
277
+ new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
278
+ new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
279
+
280
+ # Retrieves the keys for the input blocks only
281
+ num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
282
+ input_blocks = {
283
+ layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}." in key] for layer_id in range(num_input_blocks)
284
+ }
285
+
286
+ # Retrieves the keys for the middle blocks only
287
+ num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
288
+ middle_blocks = {
289
+ layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}." in key] for layer_id in range(num_middle_blocks)
290
+ }
291
+
292
+ # Retrieves the keys for the output blocks only
293
+ num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
294
+ output_blocks = {
295
+ layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}." in key] for layer_id in range(num_output_blocks)
296
+ }
297
+
298
+ for i in range(1, num_input_blocks):
299
+ block_id = (i - 1) // (config["layers_per_block"] + 1)
300
+ layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
301
+
302
+ resnets = [key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key]
303
+ attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
304
+
305
+ if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
306
+ new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
307
+ f"input_blocks.{i}.0.op.weight"
308
+ )
309
+ new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(f"input_blocks.{i}.0.op.bias")
310
+
311
+ paths = renew_resnet_paths(resnets)
312
+ meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
313
+ assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config)
314
+
315
+ if len(attentions):
316
+ paths = renew_attention_paths(attentions)
317
+ meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}
318
+ assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config)
319
+
320
+ resnet_0 = middle_blocks[0]
321
+ attentions = middle_blocks[1]
322
+ resnet_1 = middle_blocks[2]
323
+
324
+ resnet_0_paths = renew_resnet_paths(resnet_0)
325
+ assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
326
+
327
+ resnet_1_paths = renew_resnet_paths(resnet_1)
328
+ assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
329
+
330
+ attentions_paths = renew_attention_paths(attentions)
331
+ meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
332
+ assign_to_checkpoint(attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config)
333
+
334
+ for i in range(num_output_blocks):
335
+ block_id = i // (config["layers_per_block"] + 1)
336
+ layer_in_block_id = i % (config["layers_per_block"] + 1)
337
+ output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
338
+ output_block_list = {}
339
+
340
+ for layer in output_block_layers:
341
+ layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
342
+ if layer_id in output_block_list:
343
+ output_block_list[layer_id].append(layer_name)
344
+ else:
345
+ output_block_list[layer_id] = [layer_name]
346
+
347
+ if len(output_block_list) > 1:
348
+ resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
349
+ attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
350
+
351
+ resnet_0_paths = renew_resnet_paths(resnets)
352
+ paths = renew_resnet_paths(resnets)
353
+
354
+ meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
355
+ assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config)
356
+
357
+ # original:
358
+ # if ["conv.weight", "conv.bias"] in output_block_list.values():
359
+ # index = list(output_block_list.values()).index(["conv.weight", "conv.bias"])
360
+
361
+ # make this independent of the order of bias and weight; there is probably a better way
362
+ for l in output_block_list.values():
363
+ l.sort()
364
+
365
+ if ["conv.bias", "conv.weight"] in output_block_list.values():
366
+ index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
367
+ new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
368
+ f"output_blocks.{i}.{index}.conv.bias"
369
+ ]
370
+ new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
371
+ f"output_blocks.{i}.{index}.conv.weight"
372
+ ]
373
+
374
+ # Clear attentions as they have been attributed above.
375
+ if len(attentions) == 2:
376
+ attentions = []
377
+
378
+ if len(attentions):
379
+ paths = renew_attention_paths(attentions)
380
+ meta_path = {
381
+ "old": f"output_blocks.{i}.1",
382
+ "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
383
+ }
384
+ assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config)
385
+ else:
386
+ resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
387
+ for path in resnet_0_paths:
388
+ old_path = ".".join(["output_blocks", str(i), path["old"]])
389
+ new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
390
+
391
+ new_checkpoint[new_path] = unet_state_dict[old_path]
392
+
393
+ # in SD v2, the 1x1 conv2d has been replaced with a linear layer
394
+ # the Diffusers side was mistakenly left as conv2d, so conversion is required
395
+ if v2 and not config.get("use_linear_projection", False):
396
+ linear_transformer_to_conv(new_checkpoint)
397
+
398
+ return new_checkpoint
399
+
400
+
401
+ def convert_ldm_vae_checkpoint(checkpoint, config):
402
+ # extract state dict for VAE
403
+ vae_state_dict = {}
404
+ vae_key = "first_stage_model."
405
+ keys = list(checkpoint.keys())
406
+ for key in keys:
407
+ if key.startswith(vae_key):
408
+ vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key)
409
+ # if len(vae_state_dict) == 0:
410
+ # # the checkpoint passed in is not one loaded from a .ckpt but a VAE state_dict
411
+ # vae_state_dict = checkpoint
412
+
413
+ new_checkpoint = {}
414
+
415
+ new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
416
+ new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
417
+ new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
418
+ new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
419
+ new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
420
+ new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
421
+
422
+ new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
423
+ new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
424
+ new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
425
+ new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
426
+ new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
427
+ new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
428
+
429
+ new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
430
+ new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
431
+ new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
432
+ new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
433
+
434
+ # Retrieves the keys for the encoder down blocks only
435
+ num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
436
+ down_blocks = {layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)}
437
+
438
+ # Retrieves the keys for the decoder up blocks only
439
+ num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
440
+ up_blocks = {layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)}
441
+
442
+ for i in range(num_down_blocks):
443
+ resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
444
+
445
+ if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
446
+ new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
447
+ f"encoder.down.{i}.downsample.conv.weight"
448
+ )
449
+ new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
450
+ f"encoder.down.{i}.downsample.conv.bias"
451
+ )
452
+
453
+ paths = renew_vae_resnet_paths(resnets)
454
+ meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
455
+ assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
456
+
457
+ mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
458
+ num_mid_res_blocks = 2
459
+ for i in range(1, num_mid_res_blocks + 1):
460
+ resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
461
+
462
+ paths = renew_vae_resnet_paths(resnets)
463
+ meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
464
+ assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
465
+
466
+ mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
467
+ paths = renew_vae_attention_paths(mid_attentions)
468
+ meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
469
+ assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
470
+ conv_attn_to_linear(new_checkpoint)
471
+
472
+ for i in range(num_up_blocks):
473
+ block_id = num_up_blocks - 1 - i
474
+ resnets = [key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key]
475
+
476
+ if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
477
+ new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
478
+ f"decoder.up.{block_id}.upsample.conv.weight"
479
+ ]
480
+ new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
481
+ f"decoder.up.{block_id}.upsample.conv.bias"
482
+ ]
483
+
484
+ paths = renew_vae_resnet_paths(resnets)
485
+ meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
486
+ assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
487
+
488
+ mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
489
+ num_mid_res_blocks = 2
490
+ for i in range(1, num_mid_res_blocks + 1):
491
+ resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
492
+
493
+ paths = renew_vae_resnet_paths(resnets)
494
+ meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
495
+ assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
496
+
497
+ mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
498
+ paths = renew_vae_attention_paths(mid_attentions)
499
+ meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
500
+ assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
501
+ conv_attn_to_linear(new_checkpoint)
502
+ return new_checkpoint
503
+
504
+
505
+ def create_unet_diffusers_config(v2, use_linear_projection_in_v2=False):
506
+ """
507
+ Creates a config for the diffusers based on the config of the LDM model.
508
+ """
509
+ # unet_params = original_config.model.params.unet_config.params
510
+
511
+ block_out_channels = [UNET_PARAMS_MODEL_CHANNELS * mult for mult in UNET_PARAMS_CHANNEL_MULT]
512
+
513
+ down_block_types = []
514
+ resolution = 1
515
+ for i in range(len(block_out_channels)):
516
+ block_type = "CrossAttnDownBlock2D" if resolution in UNET_PARAMS_ATTENTION_RESOLUTIONS else "DownBlock2D"
517
+ down_block_types.append(block_type)
518
+ if i != len(block_out_channels) - 1:
519
+ resolution *= 2
520
+
521
+ up_block_types = []
522
+ for i in range(len(block_out_channels)):
523
+ block_type = "CrossAttnUpBlock2D" if resolution in UNET_PARAMS_ATTENTION_RESOLUTIONS else "UpBlock2D"
524
+ up_block_types.append(block_type)
525
+ resolution //= 2
526
+
527
+ config = dict(
528
+ sample_size=UNET_PARAMS_IMAGE_SIZE,
529
+ in_channels=UNET_PARAMS_IN_CHANNELS,
530
+ out_channels=UNET_PARAMS_OUT_CHANNELS,
531
+ down_block_types=tuple(down_block_types),
532
+ up_block_types=tuple(up_block_types),
533
+ block_out_channels=tuple(block_out_channels),
534
+ layers_per_block=UNET_PARAMS_NUM_RES_BLOCKS,
535
+ cross_attention_dim=UNET_PARAMS_CONTEXT_DIM if not v2 else V2_UNET_PARAMS_CONTEXT_DIM,
536
+ attention_head_dim=UNET_PARAMS_NUM_HEADS if not v2 else V2_UNET_PARAMS_ATTENTION_HEAD_DIM,
537
+ # use_linear_projection=UNET_PARAMS_USE_LINEAR_PROJECTION if not v2 else V2_UNET_PARAMS_USE_LINEAR_PROJECTION,
538
+ )
539
+ if v2 and use_linear_projection_in_v2:
540
+ config["use_linear_projection"] = True
541
+
542
+ return config
543
+
544
+
545
+ def create_vae_diffusers_config():
546
+ """
547
+ Creates a config for the diffusers based on the config of the LDM model.
548
+ """
549
+ # vae_params = original_config.model.params.first_stage_config.params.ddconfig
550
+ # _ = original_config.model.params.first_stage_config.params.embed_dim
551
+ block_out_channels = [VAE_PARAMS_CH * mult for mult in VAE_PARAMS_CH_MULT]
552
+ down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels)
553
+ up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels)
554
+
555
+ config = dict(
556
+ sample_size=VAE_PARAMS_RESOLUTION,
557
+ in_channels=VAE_PARAMS_IN_CHANNELS,
558
+ out_channels=VAE_PARAMS_OUT_CH,
559
+ down_block_types=tuple(down_block_types),
560
+ up_block_types=tuple(up_block_types),
561
+ block_out_channels=tuple(block_out_channels),
562
+ latent_channels=VAE_PARAMS_Z_CHANNELS,
563
+ layers_per_block=VAE_PARAMS_NUM_RES_BLOCKS,
564
+ )
565
+ return config
566
+
567
+
568
+ def convert_ldm_clip_checkpoint_v1(checkpoint):
569
+ keys = list(checkpoint.keys())
570
+ text_model_dict = {}
571
+ for key in keys:
572
+ if key.startswith("cond_stage_model.transformer"):
573
+ text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key]
574
+
575
+ # remove position_ids for newer versions of transformers, which otherwise raise an error :(
576
+ if "text_model.embeddings.position_ids" in text_model_dict:
577
+ text_model_dict.pop("text_model.embeddings.position_ids")
578
+
579
+ return text_model_dict
580
+
581
+
582
+ def convert_ldm_clip_checkpoint_v2(checkpoint, max_length):
583
+ # the key layout is annoyingly different!
584
+ def convert_key(key):
585
+ if not key.startswith("cond_stage_model"):
586
+ return None
587
+
588
+ # common conversion
589
+ key = key.replace("cond_stage_model.model.transformer.", "text_model.encoder.")
590
+ key = key.replace("cond_stage_model.model.", "text_model.")
591
+
592
+ if "resblocks" in key:
593
+ # resblocks conversion
594
+ key = key.replace(".resblocks.", ".layers.")
595
+ if ".ln_" in key:
596
+ key = key.replace(".ln_", ".layer_norm")
597
+ elif ".mlp." in key:
598
+ key = key.replace(".c_fc.", ".fc1.")
599
+ key = key.replace(".c_proj.", ".fc2.")
600
+ elif ".attn.out_proj" in key:
601
+ key = key.replace(".attn.out_proj.", ".self_attn.out_proj.")
602
+ elif ".attn.in_proj" in key:
603
+ key = None # special case, handled later
604
+ else:
605
+ raise ValueError(f"unexpected key in SD: {key}")
606
+ elif ".positional_embedding" in key:
607
+ key = key.replace(".positional_embedding", ".embeddings.position_embedding.weight")
608
+ elif ".text_projection" in key:
609
+ key = None # apparently not used???
610
+ elif ".logit_scale" in key:
611
+ key = None # apparently not used???
612
+ elif ".token_embedding" in key:
613
+ key = key.replace(".token_embedding.weight", ".embeddings.token_embedding.weight")
614
+ elif ".ln_final" in key:
615
+ key = key.replace(".ln_final", ".final_layer_norm")
616
+ return key
617
+
618
+ keys = list(checkpoint.keys())
619
+ new_sd = {}
620
+ for key in keys:
621
+ # remove resblocks 23
622
+ if ".resblocks.23." in key:
623
+ continue
624
+ new_key = convert_key(key)
625
+ if new_key is None:
626
+ continue
627
+ new_sd[new_key] = checkpoint[key]
628
+
629
+ # convert attention weights
630
+ for key in keys:
631
+ if ".resblocks.23." in key:
632
+ continue
633
+ if ".resblocks" in key and ".attn.in_proj_" in key:
634
+ # split into three (q, k, v)
635
+ values = torch.chunk(checkpoint[key], 3)
636
+
637
+ key_suffix = ".weight" if "weight" in key else ".bias"
638
+ key_pfx = key.replace("cond_stage_model.model.transformer.resblocks.", "text_model.encoder.layers.")
639
+ key_pfx = key_pfx.replace("_weight", "")
640
+ key_pfx = key_pfx.replace("_bias", "")
641
+ key_pfx = key_pfx.replace(".attn.in_proj", ".self_attn.")
642
+ new_sd[key_pfx + "q_proj" + key_suffix] = values[0]
643
+ new_sd[key_pfx + "k_proj" + key_suffix] = values[1]
644
+ new_sd[key_pfx + "v_proj" + key_suffix] = values[2]
645
+
646
+ # rename or add position_ids
647
+ ANOTHER_POSITION_IDS_KEY = "text_model.encoder.text_model.embeddings.position_ids"
648
+ if ANOTHER_POSITION_IDS_KEY in new_sd:
649
+ # waifu diffusion v1.4
650
+ position_ids = new_sd[ANOTHER_POSITION_IDS_KEY]
651
+ del new_sd[ANOTHER_POSITION_IDS_KEY]
652
+ else:
653
+ position_ids = torch.Tensor([list(range(max_length))]).to(torch.int64)
654
+
655
+ new_sd["text_model.embeddings.position_ids"] = position_ids
656
+ return new_sd
657
+
658
+
659
+ # endregion
660
+
661
+
662
+ # region conversion code: Diffusers -> StableDiffusion
663
+ # copied and modified from convert_diffusers_to_original_stable_diffusion (ASL 2.0)
664
+
665
+
666
+ def conv_transformer_to_linear(checkpoint):
667
+ keys = list(checkpoint.keys())
668
+ tf_keys = ["proj_in.weight", "proj_out.weight"]
669
+ for key in keys:
670
+ if ".".join(key.split(".")[-2:]) in tf_keys:
671
+ if checkpoint[key].ndim > 2:
672
+ checkpoint[key] = checkpoint[key][:, :, 0, 0]
673
+
674
+
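+ # Hypothetical round-trip check (illustrative only, not part of the conversion pipeline):
+ # a 1x1 conv weight of shape (out, in, 1, 1) and a linear weight of shape (out, in)
+ # hold the same values, which is all that linear_transformer_to_conv /
+ # conv_transformer_to_linear rely on.
+ def _example_transformer_proj_roundtrip():
+     sd = {"proj_in.weight": torch.randn(320, 320)}
+     original = sd["proj_in.weight"].clone()
+     linear_transformer_to_conv(sd)  # (320, 320) -> (320, 320, 1, 1)
+     assert sd["proj_in.weight"].shape == (320, 320, 1, 1)
+     conv_transformer_to_linear(sd)  # back to (320, 320)
+     assert torch.equal(sd["proj_in.weight"], original)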
675
+ def convert_unet_state_dict_to_sd(v2, unet_state_dict):
676
+ unet_conversion_map = [
677
+ # (stable-diffusion, HF Diffusers)
678
+ ("time_embed.0.weight", "time_embedding.linear_1.weight"),
679
+ ("time_embed.0.bias", "time_embedding.linear_1.bias"),
680
+ ("time_embed.2.weight", "time_embedding.linear_2.weight"),
681
+ ("time_embed.2.bias", "time_embedding.linear_2.bias"),
682
+ ("input_blocks.0.0.weight", "conv_in.weight"),
683
+ ("input_blocks.0.0.bias", "conv_in.bias"),
684
+ ("out.0.weight", "conv_norm_out.weight"),
685
+ ("out.0.bias", "conv_norm_out.bias"),
686
+ ("out.2.weight", "conv_out.weight"),
687
+ ("out.2.bias", "conv_out.bias"),
688
+ ]
689
+
690
+ unet_conversion_map_resnet = [
691
+ # (stable-diffusion, HF Diffusers)
692
+ ("in_layers.0", "norm1"),
693
+ ("in_layers.2", "conv1"),
694
+ ("out_layers.0", "norm2"),
695
+ ("out_layers.3", "conv2"),
696
+ ("emb_layers.1", "time_emb_proj"),
697
+ ("skip_connection", "conv_shortcut"),
698
+ ]
699
+
700
+ unet_conversion_map_layer = []
701
+ for i in range(4):
702
+ # loop over downblocks/upblocks
703
+
704
+ for j in range(2):
705
+ # loop over resnets/attentions for downblocks
706
+ hf_down_res_prefix = f"down_blocks.{i}.resnets.{j}."
707
+ sd_down_res_prefix = f"input_blocks.{3*i + j + 1}.0."
708
+ unet_conversion_map_layer.append((sd_down_res_prefix, hf_down_res_prefix))
709
+
710
+ if i < 3:
711
+ # no attention layers in down_blocks.3
712
+ hf_down_atn_prefix = f"down_blocks.{i}.attentions.{j}."
713
+ sd_down_atn_prefix = f"input_blocks.{3*i + j + 1}.1."
714
+ unet_conversion_map_layer.append((sd_down_atn_prefix, hf_down_atn_prefix))
715
+
716
+ for j in range(3):
717
+ # loop over resnets/attentions for upblocks
718
+ hf_up_res_prefix = f"up_blocks.{i}.resnets.{j}."
719
+ sd_up_res_prefix = f"output_blocks.{3*i + j}.0."
720
+ unet_conversion_map_layer.append((sd_up_res_prefix, hf_up_res_prefix))
721
+
722
+ if i > 0:
723
+ # no attention layers in up_blocks.0
724
+ hf_up_atn_prefix = f"up_blocks.{i}.attentions.{j}."
725
+ sd_up_atn_prefix = f"output_blocks.{3*i + j}.1."
726
+ unet_conversion_map_layer.append((sd_up_atn_prefix, hf_up_atn_prefix))
727
+
728
+ if i < 3:
729
+ # no downsample in down_blocks.3
730
+ hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0.conv."
731
+ sd_downsample_prefix = f"input_blocks.{3*(i+1)}.0.op."
732
+ unet_conversion_map_layer.append((sd_downsample_prefix, hf_downsample_prefix))
733
+
734
+ # no upsample in up_blocks.3
735
+ hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0."
736
+ sd_upsample_prefix = f"output_blocks.{3*i + 2}.{1 if i == 0 else 2}."
737
+ unet_conversion_map_layer.append((sd_upsample_prefix, hf_upsample_prefix))
738
+
739
+ hf_mid_atn_prefix = "mid_block.attentions.0."
740
+ sd_mid_atn_prefix = "middle_block.1."
741
+ unet_conversion_map_layer.append((sd_mid_atn_prefix, hf_mid_atn_prefix))
742
+
743
+ for j in range(2):
744
+ hf_mid_res_prefix = f"mid_block.resnets.{j}."
745
+ sd_mid_res_prefix = f"middle_block.{2*j}."
746
+ unet_conversion_map_layer.append((sd_mid_res_prefix, hf_mid_res_prefix))
747
+
748
+ # buyer beware: this is a *brittle* function,
749
+ # and correct output requires that all of these pieces interact in
750
+ # the exact order in which I have arranged them.
751
+ mapping = {k: k for k in unet_state_dict.keys()}
752
+ for sd_name, hf_name in unet_conversion_map:
753
+ mapping[hf_name] = sd_name
754
+ for k, v in mapping.items():
755
+ if "resnets" in k:
756
+ for sd_part, hf_part in unet_conversion_map_resnet:
757
+ v = v.replace(hf_part, sd_part)
758
+ mapping[k] = v
759
+ for k, v in mapping.items():
760
+ for sd_part, hf_part in unet_conversion_map_layer:
761
+ v = v.replace(hf_part, sd_part)
762
+ mapping[k] = v
763
+ new_state_dict = {v: unet_state_dict[k] for k, v in mapping.items()}
764
+
765
+ if v2:
766
+ conv_transformer_to_linear(new_state_dict)
767
+
768
+ return new_state_dict
769
+
770
+
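+ # Hypothetical single-key sketch (illustrative only; the tensor is a dummy): shows how
+ # convert_unet_state_dict_to_sd maps a Diffusers UNet key back to the original
+ # Stable Diffusion layout.
+ def _example_convert_unet_state_dict_to_sd():
+     hf_sd = {"time_embedding.linear_1.weight": torch.zeros(1280, 320)}
+     sd_sd = convert_unet_state_dict_to_sd(False, hf_sd)
+     assert "time_embed.0.weight" in sd_sd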
771
+ def controlnet_conversion_map():
772
+ unet_conversion_map = [
773
+ ("time_embed.0.weight", "time_embedding.linear_1.weight"),
774
+ ("time_embed.0.bias", "time_embedding.linear_1.bias"),
775
+ ("time_embed.2.weight", "time_embedding.linear_2.weight"),
776
+ ("time_embed.2.bias", "time_embedding.linear_2.bias"),
777
+ ("input_blocks.0.0.weight", "conv_in.weight"),
778
+ ("input_blocks.0.0.bias", "conv_in.bias"),
779
+ ("middle_block_out.0.weight", "controlnet_mid_block.weight"),
780
+ ("middle_block_out.0.bias", "controlnet_mid_block.bias"),
781
+ ]
782
+
783
+ unet_conversion_map_resnet = [
784
+ ("in_layers.0", "norm1"),
785
+ ("in_layers.2", "conv1"),
786
+ ("out_layers.0", "norm2"),
787
+ ("out_layers.3", "conv2"),
788
+ ("emb_layers.1", "time_emb_proj"),
789
+ ("skip_connection", "conv_shortcut"),
790
+ ]
791
+
792
+ unet_conversion_map_layer = []
793
+ for i in range(4):
794
+ for j in range(2):
795
+ hf_down_res_prefix = f"down_blocks.{i}.resnets.{j}."
796
+ sd_down_res_prefix = f"input_blocks.{3*i + j + 1}.0."
797
+ unet_conversion_map_layer.append((sd_down_res_prefix, hf_down_res_prefix))
798
+
799
+ if i < 3:
800
+ hf_down_atn_prefix = f"down_blocks.{i}.attentions.{j}."
801
+ sd_down_atn_prefix = f"input_blocks.{3*i + j + 1}.1."
802
+ unet_conversion_map_layer.append((sd_down_atn_prefix, hf_down_atn_prefix))
803
+
804
+ if i < 3:
805
+ hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0.conv."
806
+ sd_downsample_prefix = f"input_blocks.{3*(i+1)}.0.op."
807
+ unet_conversion_map_layer.append((sd_downsample_prefix, hf_downsample_prefix))
808
+
809
+ hf_mid_atn_prefix = "mid_block.attentions.0."
810
+ sd_mid_atn_prefix = "middle_block.1."
811
+ unet_conversion_map_layer.append((sd_mid_atn_prefix, hf_mid_atn_prefix))
812
+
813
+ for j in range(2):
814
+ hf_mid_res_prefix = f"mid_block.resnets.{j}."
815
+ sd_mid_res_prefix = f"middle_block.{2*j}."
816
+ unet_conversion_map_layer.append((sd_mid_res_prefix, hf_mid_res_prefix))
817
+
818
+ controlnet_cond_embedding_names = ["conv_in"] + [f"blocks.{i}" for i in range(6)] + ["conv_out"]
819
+ for i, hf_prefix in enumerate(controlnet_cond_embedding_names):
820
+ hf_prefix = f"controlnet_cond_embedding.{hf_prefix}."
821
+ sd_prefix = f"input_hint_block.{i*2}."
822
+ unet_conversion_map_layer.append((sd_prefix, hf_prefix))
823
+
824
+ for i in range(12):
825
+ hf_prefix = f"controlnet_down_blocks.{i}."
826
+ sd_prefix = f"zero_convs.{i}.0."
827
+ unet_conversion_map_layer.append((sd_prefix, hf_prefix))
828
+
829
+ return unet_conversion_map, unet_conversion_map_resnet, unet_conversion_map_layer
830
+
831
+
832
+ def convert_controlnet_state_dict_to_sd(controlnet_state_dict):
833
+ unet_conversion_map, unet_conversion_map_resnet, unet_conversion_map_layer = controlnet_conversion_map()
834
+
835
+ mapping = {k: k for k in controlnet_state_dict.keys()}
836
+ for sd_name, diffusers_name in unet_conversion_map:
837
+ mapping[diffusers_name] = sd_name
838
+ for k, v in mapping.items():
839
+ if "resnets" in k:
840
+ for sd_part, diffusers_part in unet_conversion_map_resnet:
841
+ v = v.replace(diffusers_part, sd_part)
842
+ mapping[k] = v
843
+ for k, v in mapping.items():
844
+ for sd_part, diffusers_part in unet_conversion_map_layer:
845
+ v = v.replace(diffusers_part, sd_part)
846
+ mapping[k] = v
847
+ new_state_dict = {v: controlnet_state_dict[k] for k, v in mapping.items()}
848
+ return new_state_dict
849
+
850
+
851
+ def convert_controlnet_state_dict_to_diffusers(controlnet_state_dict):
852
+ unet_conversion_map, unet_conversion_map_resnet, unet_conversion_map_layer = controlnet_conversion_map()
853
+
854
+ mapping = {k: k for k in controlnet_state_dict.keys()}
855
+ for sd_name, diffusers_name in unet_conversion_map:
856
+ mapping[sd_name] = diffusers_name
857
+ for k, v in mapping.items():
858
+ for sd_part, diffusers_part in unet_conversion_map_layer:
859
+ v = v.replace(sd_part, diffusers_part)
860
+ mapping[k] = v
861
+ for k, v in mapping.items():
862
+ if "resnets" in v:
863
+ for sd_part, diffusers_part in unet_conversion_map_resnet:
864
+ v = v.replace(sd_part, diffusers_part)
865
+ mapping[k] = v
866
+ new_state_dict = {v: controlnet_state_dict[k] for k, v in mapping.items()}
867
+ return new_state_dict
868
+
869
+
870
+ # ================#
871
+ # VAE Conversion #
872
+ # ================#
873
+
874
+
875
+ def reshape_weight_for_sd(w):
876
+ # convert HF linear weights to SD conv2d weights
877
+ return w.reshape(*w.shape, 1, 1)
878
+
879
+
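+ # Hypothetical sanity check (illustrative only): the HF attention projection is a Linear
+ # with weight (out, in); the SD VAE stores the same values as a 1x1 Conv2d weight of
+ # shape (out, in, 1, 1), which is exactly what reshape_weight_for_sd produces.
+ def _example_reshape_weight_for_sd():
+     w = torch.randn(512, 512)
+     w_sd = reshape_weight_for_sd(w)
+     assert w_sd.shape == (512, 512, 1, 1)
+     assert torch.equal(w_sd[:, :, 0, 0], w)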
880
+ def convert_vae_state_dict(vae_state_dict):
881
+ vae_conversion_map = [
882
+ # (stable-diffusion, HF Diffusers)
883
+ ("nin_shortcut", "conv_shortcut"),
884
+ ("norm_out", "conv_norm_out"),
885
+ ("mid.attn_1.", "mid_block.attentions.0."),
886
+ ]
887
+
888
+ for i in range(4):
889
+ # down_blocks have two resnets
890
+ for j in range(2):
891
+ hf_down_prefix = f"encoder.down_blocks.{i}.resnets.{j}."
892
+ sd_down_prefix = f"encoder.down.{i}.block.{j}."
893
+ vae_conversion_map.append((sd_down_prefix, hf_down_prefix))
894
+
895
+ if i < 3:
896
+ hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0."
897
+ sd_downsample_prefix = f"down.{i}.downsample."
898
+ vae_conversion_map.append((sd_downsample_prefix, hf_downsample_prefix))
899
+
900
+ hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0."
901
+ sd_upsample_prefix = f"up.{3-i}.upsample."
902
+ vae_conversion_map.append((sd_upsample_prefix, hf_upsample_prefix))
903
+
904
+ # up_blocks have three resnets
905
+ # also, up blocks in hf are numbered in reverse from sd
906
+ for j in range(3):
907
+ hf_up_prefix = f"decoder.up_blocks.{i}.resnets.{j}."
908
+ sd_up_prefix = f"decoder.up.{3-i}.block.{j}."
909
+ vae_conversion_map.append((sd_up_prefix, hf_up_prefix))
910
+
911
+ # this part accounts for mid blocks in both the encoder and the decoder
912
+ for i in range(2):
913
+ hf_mid_res_prefix = f"mid_block.resnets.{i}."
914
+ sd_mid_res_prefix = f"mid.block_{i+1}."
915
+ vae_conversion_map.append((sd_mid_res_prefix, hf_mid_res_prefix))
916
+
917
+ if diffusers.__version__ < "0.17.0":
918
+ vae_conversion_map_attn = [
919
+ # (stable-diffusion, HF Diffusers)
920
+ ("norm.", "group_norm."),
921
+ ("q.", "query."),
922
+ ("k.", "key."),
923
+ ("v.", "value."),
924
+ ("proj_out.", "proj_attn."),
925
+ ]
926
+ else:
927
+ vae_conversion_map_attn = [
928
+ # (stable-diffusion, HF Diffusers)
929
+ ("norm.", "group_norm."),
930
+ ("q.", "to_q."),
931
+ ("k.", "to_k."),
932
+ ("v.", "to_v."),
933
+ ("proj_out.", "to_out.0."),
934
+ ]
935
+
936
+ mapping = {k: k for k in vae_state_dict.keys()}
937
+ for k, v in mapping.items():
938
+ for sd_part, hf_part in vae_conversion_map:
939
+ v = v.replace(hf_part, sd_part)
940
+ mapping[k] = v
941
+ for k, v in mapping.items():
942
+ if "attentions" in k:
943
+ for sd_part, hf_part in vae_conversion_map_attn:
944
+ v = v.replace(hf_part, sd_part)
945
+ mapping[k] = v
946
+ new_state_dict = {v: vae_state_dict[k] for k, v in mapping.items()}
947
+ weights_to_convert = ["q", "k", "v", "proj_out"]
948
+ for k, v in new_state_dict.items():
949
+ for weight_name in weights_to_convert:
950
+ if f"mid.attn_1.{weight_name}.weight" in k:
951
+ # logger.info(f"Reshaping {k} for SD format: shape {v.shape} -> {v.shape} x 1 x 1")
952
+ new_state_dict[k] = reshape_weight_for_sd(v)
953
+
954
+ return new_state_dict
955
+
956
+
957
+ # endregion
958
+
959
+ # region custom model loading/saving, etc.
960
+
961
+
962
+ def is_safetensors(path):
963
+ return os.path.splitext(path)[1].lower() == ".safetensors"
964
+
965
+
966
+ def load_checkpoint_with_text_encoder_conversion(ckpt_path, device="cpu"):
967
+ # handle models whose text encoder is stored in a different format (without 'text_model')
968
+ TEXT_ENCODER_KEY_REPLACEMENTS = [
969
+ ("cond_stage_model.transformer.embeddings.", "cond_stage_model.transformer.text_model.embeddings."),
970
+ ("cond_stage_model.transformer.encoder.", "cond_stage_model.transformer.text_model.encoder."),
971
+ ("cond_stage_model.transformer.final_layer_norm.", "cond_stage_model.transformer.text_model.final_layer_norm."),
972
+ ]
973
+
974
+ if is_safetensors(ckpt_path):
975
+ checkpoint = None
976
+ state_dict = load_file(ckpt_path) # , device) # may cause an error
977
+ else:
978
+ checkpoint = torch.load(ckpt_path, map_location=device)
979
+ if "state_dict" in checkpoint:
980
+ state_dict = checkpoint["state_dict"]
981
+ else:
982
+ state_dict = checkpoint
983
+ checkpoint = None
984
+
985
+ key_reps = []
986
+ for rep_from, rep_to in TEXT_ENCODER_KEY_REPLACEMENTS:
987
+ for key in state_dict.keys():
988
+ if key.startswith(rep_from):
989
+ new_key = rep_to + key[len(rep_from) :]
990
+ key_reps.append((key, new_key))
991
+
992
+ for key, new_key in key_reps:
993
+ state_dict[new_key] = state_dict[key]
994
+ del state_dict[key]
995
+
996
+ return checkpoint, state_dict
997
+
998
+
999
+ # TODO the behavior of the dtype option is questionable and should be checked; whether the text_encoder can be created in the specified format is unverified
1000
+ def load_models_from_stable_diffusion_checkpoint(v2, ckpt_path, device="cpu", dtype=None, unet_use_linear_projection_in_v2=True):
1001
+ _, state_dict = load_checkpoint_with_text_encoder_conversion(ckpt_path, device)
1002
+
1003
+ # Convert the UNet2DConditionModel model.
1004
+ unet_config = create_unet_diffusers_config(v2, unet_use_linear_projection_in_v2)
1005
+ converted_unet_checkpoint = convert_ldm_unet_checkpoint(v2, state_dict, unet_config)
1006
+
1007
+ unet = UNet2DConditionModel(**unet_config).to(device)
1008
+ info = unet.load_state_dict(converted_unet_checkpoint)
1009
+ logger.info(f"loading u-net: {info}")
1010
+
1011
+ # Convert the VAE model.
1012
+ vae_config = create_vae_diffusers_config()
1013
+ converted_vae_checkpoint = convert_ldm_vae_checkpoint(state_dict, vae_config)
1014
+
1015
+ vae = AutoencoderKL(**vae_config).to(device)
1016
+ info = vae.load_state_dict(converted_vae_checkpoint)
1017
+ logger.info(f"loading vae: {info}")
1018
+
1019
+ # convert text_model
1020
+ if v2:
1021
+ converted_text_encoder_checkpoint = convert_ldm_clip_checkpoint_v2(state_dict, 77)
1022
+ cfg = CLIPTextConfig(
1023
+ vocab_size=49408,
1024
+ hidden_size=1024,
1025
+ intermediate_size=4096,
1026
+ num_hidden_layers=23,
1027
+ num_attention_heads=16,
1028
+ max_position_embeddings=77,
1029
+ hidden_act="gelu",
1030
+ layer_norm_eps=1e-05,
1031
+ dropout=0.0,
1032
+ attention_dropout=0.0,
1033
+ initializer_range=0.02,
1034
+ initializer_factor=1.0,
1035
+ pad_token_id=1,
1036
+ bos_token_id=0,
1037
+ eos_token_id=2,
1038
+ model_type="clip_text_model",
1039
+ projection_dim=512,
1040
+ torch_dtype="float32",
1041
+ transformers_version="4.25.0.dev0",
1042
+ )
1043
+ text_model = CLIPTextModel._from_config(cfg)
1044
+ info = text_model.load_state_dict(converted_text_encoder_checkpoint)
1045
+ else:
1046
+ converted_text_encoder_checkpoint = convert_ldm_clip_checkpoint_v1(state_dict)
1047
+
1048
+ # logging.set_verbosity_error() # don't show annoying warning
1049
+ # text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14").to(device)
1050
+ # logging.set_verbosity_warning()
1051
+ # logger.info(f"config: {text_model.config}")
1052
+ cfg = CLIPTextConfig(
1053
+ vocab_size=49408,
1054
+ hidden_size=768,
1055
+ intermediate_size=3072,
1056
+ num_hidden_layers=12,
1057
+ num_attention_heads=12,
1058
+ max_position_embeddings=77,
1059
+ hidden_act="quick_gelu",
1060
+ layer_norm_eps=1e-05,
1061
+ dropout=0.0,
1062
+ attention_dropout=0.0,
1063
+ initializer_range=0.02,
1064
+ initializer_factor=1.0,
1065
+ pad_token_id=1,
1066
+ bos_token_id=0,
1067
+ eos_token_id=2,
1068
+ model_type="clip_text_model",
1069
+ projection_dim=768,
1070
+ torch_dtype="float32",
1071
+ )
1072
+ text_model = CLIPTextModel._from_config(cfg)
1073
+ info = text_model.load_state_dict(converted_text_encoder_checkpoint)
1074
+ logger.info(f"loading text encoder: {info}")
1075
+
1076
+ return text_model, vae, unet
1077
+
1078
+
1079
+ def get_model_version_str_for_sd1_sd2(v2, v_parameterization):
1080
+ # only for reference
1081
+ version_str = "sd"
1082
+ if v2:
1083
+ version_str += "_v2"
1084
+ else:
1085
+ version_str += "_v1"
1086
+ if v_parameterization:
1087
+ version_str += "_v"
1088
+ return version_str
1089
+
1090
+
1091
+ def convert_text_encoder_state_dict_to_sd_v2(checkpoint, make_dummy_weights=False):
1092
+ def convert_key(key):
1093
+ # remove position_ids
1094
+ if ".position_ids" in key:
1095
+ return None
1096
+
1097
+ # common
1098
+ key = key.replace("text_model.encoder.", "transformer.")
1099
+ key = key.replace("text_model.", "")
1100
+ if "layers" in key:
1101
+ # resblocks conversion
1102
+ key = key.replace(".layers.", ".resblocks.")
1103
+ if ".layer_norm" in key:
1104
+ key = key.replace(".layer_norm", ".ln_")
1105
+ elif ".mlp." in key:
1106
+ key = key.replace(".fc1.", ".c_fc.")
1107
+ key = key.replace(".fc2.", ".c_proj.")
1108
+ elif ".self_attn.out_proj" in key:
1109
+ key = key.replace(".self_attn.out_proj.", ".attn.out_proj.")
1110
+ elif ".self_attn." in key:
1111
+ key = None # special case, handled later
1112
+ else:
1113
+ raise ValueError(f"unexpected key in Diffusers model: {key}")
1114
+ elif ".position_embedding" in key:
1115
+ key = key.replace("embeddings.position_embedding.weight", "positional_embedding")
1116
+ elif ".token_embedding" in key:
1117
+ key = key.replace("embeddings.token_embedding.weight", "token_embedding.weight")
1118
+ elif "final_layer_norm" in key:
1119
+ key = key.replace("final_layer_norm", "ln_final")
1120
+ return key
1121
+
1122
+ keys = list(checkpoint.keys())
1123
+ new_sd = {}
1124
+ for key in keys:
1125
+ new_key = convert_key(key)
1126
+ if new_key is None:
1127
+ continue
1128
+ new_sd[new_key] = checkpoint[key]
1129
+
1130
+ # convert attention weights
1131
+ for key in keys:
1132
+ if "layers" in key and "q_proj" in key:
1133
+ # concatenate the three (q, k, v)
1134
+ key_q = key
1135
+ key_k = key.replace("q_proj", "k_proj")
1136
+ key_v = key.replace("q_proj", "v_proj")
1137
+
1138
+ value_q = checkpoint[key_q]
1139
+ value_k = checkpoint[key_k]
1140
+ value_v = checkpoint[key_v]
1141
+ value = torch.cat([value_q, value_k, value_v])
1142
+
1143
+ new_key = key.replace("text_model.encoder.layers.", "transformer.resblocks.")
1144
+ new_key = new_key.replace(".self_attn.q_proj.", ".attn.in_proj_")
1145
+ new_sd[new_key] = value
1146
+
1147
+ # whether to fabricate dummy weights for the last layer etc.
1148
+ if make_dummy_weights:
1149
+ logger.info("make dummy weights for resblock.23, text_projection and logit scale.")
1150
+ keys = list(new_sd.keys())
1151
+ for key in keys:
1152
+ if key.startswith("transformer.resblocks.22."):
1153
+ new_sd[key.replace(".22.", ".23.")] = new_sd[key].clone() # without cloning, saving with safetensors fails
1154
+
1155
+ # create weights that are not included in Diffusers
1156
+ new_sd["text_projection"] = torch.ones((1024, 1024), dtype=new_sd[keys[0]].dtype, device=new_sd[keys[0]].device)
1157
+ new_sd["logit_scale"] = torch.tensor(1)
1158
+
1159
+ return new_sd
1160
+
1161
+
1162
+ def save_stable_diffusion_checkpoint(
1163
+ v2, output_file, text_encoder, unet, ckpt_path, epochs, steps, metadata, save_dtype=None, vae=None
1164
+ ):
1165
+ if ckpt_path is not None:
1166
+ # refer to epoch/step; also reload the checkpoint including the VAE, e.g. when the VAE is not in memory
1167
+ checkpoint, state_dict = load_checkpoint_with_text_encoder_conversion(ckpt_path)
1168
+ if checkpoint is None: # safetensors or a state_dict-only ckpt
1169
+ checkpoint = {}
1170
+ strict = False
1171
+ else:
1172
+ strict = True
1173
+ if "state_dict" in state_dict:
1174
+ del state_dict["state_dict"]
1175
+ else:
1176
+ # create from scratch
1177
+ assert vae is not None, "VAE is required to save a checkpoint without a given checkpoint"
1178
+ checkpoint = {}
1179
+ state_dict = {}
1180
+ strict = False
1181
+
1182
+ def update_sd(prefix, sd):
1183
+ for k, v in sd.items():
1184
+ key = prefix + k
1185
+ assert not strict or key in state_dict, f"Illegal key in save SD: {key}"
1186
+ if save_dtype is not None:
1187
+ v = v.detach().clone().to("cpu").to(save_dtype)
1188
+ state_dict[key] = v
1189
+
1190
+ # Convert the UNet model
1191
+ unet_state_dict = convert_unet_state_dict_to_sd(v2, unet.state_dict())
1192
+ update_sd("model.diffusion_model.", unet_state_dict)
1193
+
1194
+ # Convert the text encoder model
1195
+ if v2:
1196
+ make_dummy = ckpt_path is None # if there is no reference checkpoint, insert dummy weights, e.g. build the last layer by duplicating the previous one
1197
+ text_enc_dict = convert_text_encoder_state_dict_to_sd_v2(text_encoder.state_dict(), make_dummy)
1198
+ update_sd("cond_stage_model.model.", text_enc_dict)
1199
+ else:
1200
+ text_enc_dict = text_encoder.state_dict()
1201
+ update_sd("cond_stage_model.transformer.", text_enc_dict)
1202
+
1203
+ # Convert the VAE
1204
+ if vae is not None:
1205
+ vae_dict = convert_vae_state_dict(vae.state_dict())
1206
+ update_sd("first_stage_model.", vae_dict)
1207
+
1208
+ # Put together new checkpoint
1209
+ key_count = len(state_dict.keys())
1210
+ new_ckpt = {"state_dict": state_dict}
1211
+
1212
+ # epoch and global_step are sometimes not int
1213
+ try:
1214
+ if "epoch" in checkpoint:
1215
+ epochs += checkpoint["epoch"]
1216
+ if "global_step" in checkpoint:
1217
+ steps += checkpoint["global_step"]
1218
+ except:
1219
+ pass
1220
+
1221
+ new_ckpt["epoch"] = epochs
1222
+ new_ckpt["global_step"] = steps
1223
+
1224
+ if is_safetensors(output_file):
1225
+ # TODO should non-Tensor values be removed from the dict?
1226
+ save_file(state_dict, output_file, metadata)
1227
+ else:
1228
+ torch.save(new_ckpt, output_file)
1229
+
1230
+ return key_count
1231
+
1232
+
1233
+ def save_diffusers_checkpoint(v2, output_dir, text_encoder, unet, pretrained_model_name_or_path, vae=None, use_safetensors=False):
1234
+ if pretrained_model_name_or_path is None:
1235
+ # load default settings for v1/v2
1236
+ if v2:
1237
+ pretrained_model_name_or_path = DIFFUSERS_REF_MODEL_ID_V2
1238
+ else:
1239
+ pretrained_model_name_or_path = DIFFUSERS_REF_MODEL_ID_V1
1240
+
1241
+ scheduler = DDIMScheduler.from_pretrained(pretrained_model_name_or_path, subfolder="scheduler")
1242
+ tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_name_or_path, subfolder="tokenizer")
1243
+ if vae is None:
1244
+ vae = AutoencoderKL.from_pretrained(pretrained_model_name_or_path, subfolder="vae")
1245
+
1246
+ # original U-Net cannot be saved, so we need to convert it to the Diffusers version
1247
+ # TODO this consumes a lot of memory
1248
+ diffusers_unet = diffusers.UNet2DConditionModel.from_pretrained(pretrained_model_name_or_path, subfolder="unet")
1249
+ diffusers_unet.load_state_dict(unet.state_dict())
1250
+
1251
+ pipeline = StableDiffusionPipeline(
1252
+ unet=diffusers_unet,
1253
+ text_encoder=text_encoder,
1254
+ vae=vae,
1255
+ scheduler=scheduler,
1256
+ tokenizer=tokenizer,
1257
+ safety_checker=None,
1258
+ feature_extractor=None,
1259
+ requires_safety_checker=None,
1260
+ )
1261
+ pipeline.save_pretrained(output_dir, safe_serialization=use_safetensors)
1262
+
1263
+
1264
+ VAE_PREFIX = "first_stage_model."
1265
+
1266
+
1267
+ def load_vae(vae_id, dtype):
1268
+ logger.info(f"load VAE: {vae_id}")
1269
+ if os.path.isdir(vae_id) or not os.path.isfile(vae_id):
1270
+ # Diffusers local/remote
1271
+ try:
1272
+ vae = AutoencoderKL.from_pretrained(vae_id, subfolder=None, torch_dtype=dtype)
1273
+ except EnvironmentError as e:
1274
+ logger.error(f"exception occurs in loading vae: {e}")
1275
+ logger.error("retry with subfolder='vae'")
1276
+ vae = AutoencoderKL.from_pretrained(vae_id, subfolder="vae", torch_dtype=dtype)
1277
+ return vae
1278
+
1279
+ # local
1280
+ vae_config = create_vae_diffusers_config()
1281
+
1282
+ if vae_id.endswith(".bin"):
1283
+ # SD 1.5 VAE on Huggingface
1284
+ converted_vae_checkpoint = torch.load(vae_id, map_location="cpu")
1285
+ else:
1286
+ # StableDiffusion
1287
+ vae_model = load_file(vae_id, "cpu") if is_safetensors(vae_id) else torch.load(vae_id, map_location="cpu")
1288
+ vae_sd = vae_model["state_dict"] if "state_dict" in vae_model else vae_model
1289
+
1290
+ # vae only or full model
1291
+ full_model = False
1292
+ for vae_key in vae_sd:
1293
+ if vae_key.startswith(VAE_PREFIX):
1294
+ full_model = True
1295
+ break
1296
+ if not full_model:
1297
+ sd = {}
1298
+ for key, value in vae_sd.items():
1299
+ sd[VAE_PREFIX + key] = value
1300
+ vae_sd = sd
1301
+ del sd
1302
+
1303
+ # Convert the VAE model.
1304
+ converted_vae_checkpoint = convert_ldm_vae_checkpoint(vae_sd, vae_config)
1305
+
1306
+ vae = AutoencoderKL(**vae_config)
1307
+ vae.load_state_dict(converted_vae_checkpoint)
1308
+ return vae
1309
+
1310
+
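+ # Hypothetical usage sketch (illustrative only): load a Diffusers-format VAE from the
+ # Hugging Face Hub in fp16. The repository id is an assumption made for this example
+ # and downloading it requires network access.
+ def _example_load_vae():
+     return load_vae("stabilityai/sd-vae-ft-mse", torch.float16)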
1311
+ # endregion
1312
+
1313
+
1314
+ def make_bucket_resolutions(max_reso, min_size=256, max_size=1024, divisible=64):
1315
+ max_width, max_height = max_reso
1316
+ max_area = max_width * max_height
1317
+
1318
+ resos = set()
1319
+
1320
+ width = int(math.sqrt(max_area) // divisible) * divisible
1321
+ resos.add((width, width))
1322
+
1323
+ width = min_size
1324
+ while width <= max_size:
1325
+ height = min(max_size, int((max_area // width) // divisible) * divisible)
1326
+ if height >= min_size:
1327
+ resos.add((width, height))
1328
+ resos.add((height, width))
1329
+
1330
+ # # make additional resos
1331
+ # if width >= height and width - divisible >= min_size:
1332
+ # resos.add((width - divisible, height))
1333
+ # resos.add((height, width - divisible))
1334
+ # if height >= width and height - divisible >= min_size:
1335
+ # resos.add((width, height - divisible))
1336
+ # resos.add((height - divisible, width))
1337
+
1338
+ width += divisible
1339
+
1340
+ resos = list(resos)
1341
+ resos.sort()
1342
+ return resos
1343
+
1344
+
1345
+ if __name__ == "__main__":
1346
+ resos = make_bucket_resolutions((512, 768))
1347
+ logger.info(f"{len(resos)}")
1348
+ logger.info(f"{resos}")
1349
+ aspect_ratios = [w / h for w, h in resos]
1350
+ logger.info(f"{aspect_ratios}")
1351
+
1352
+ ars = set()
1353
+ for ar in aspect_ratios:
1354
+ if ar in ars:
1355
+ logger.error(f"error! duplicate ar: {ar}")
1356
+ ars.add(ar)
library/original_unet.py ADDED
@@ -0,0 +1,1919 @@
1
+ # bring over only the parts needed for Stable Diffusion from Diffusers 0.10.2
2
+ # unnecessary parts are removed by condition branching, etc.
3
+ # most of the code is copied from Diffusers
4
+ # as a constraint, the model's state_dict must be in the same format as that of Diffusers 0.10.2
5
+
6
+ # Copy from Diffusers 0.10.2 for Stable Diffusion. Most of the code is copied from Diffusers.
7
+ # Unnecessary parts are deleted by condition branching.
8
+ # As a constraint, the state_dict of the model must be in the same format as that of Diffusers 0.10.2
9
+
10
+ """
11
+ Differences between v1.5 and v2.1:
12
+ - attention_head_dim is an int vs. a list[int]
13
+ - cross_attention_dim is 768 vs. 1024
14
+ - use_linear_projection: absent (=False, 1.5) vs. present (true)
15
+ - upcast_attention is False (1.5) vs. True (2.1)
16
+ - (the following can probably be ignored)
17
+ - sample_size is 64 vs. 96
18
+ - whether dual_cross_attention is present
19
+ - whether num_class_embeds is present
20
+ - whether only_cross_attention is present
21
+
22
+ v1.5
23
+ {
24
+ "_class_name": "UNet2DConditionModel",
25
+ "_diffusers_version": "0.6.0",
26
+ "act_fn": "silu",
27
+ "attention_head_dim": 8,
28
+ "block_out_channels": [
29
+ 320,
30
+ 640,
31
+ 1280,
32
+ 1280
33
+ ],
34
+ "center_input_sample": false,
35
+ "cross_attention_dim": 768,
36
+ "down_block_types": [
37
+ "CrossAttnDownBlock2D",
38
+ "CrossAttnDownBlock2D",
39
+ "CrossAttnDownBlock2D",
40
+ "DownBlock2D"
41
+ ],
42
+ "downsample_padding": 1,
43
+ "flip_sin_to_cos": true,
44
+ "freq_shift": 0,
45
+ "in_channels": 4,
46
+ "layers_per_block": 2,
47
+ "mid_block_scale_factor": 1,
48
+ "norm_eps": 1e-05,
49
+ "norm_num_groups": 32,
50
+ "out_channels": 4,
51
+ "sample_size": 64,
52
+ "up_block_types": [
53
+ "UpBlock2D",
54
+ "CrossAttnUpBlock2D",
55
+ "CrossAttnUpBlock2D",
56
+ "CrossAttnUpBlock2D"
57
+ ]
58
+ }
59
+
60
+ v2.1
61
+ {
62
+ "_class_name": "UNet2DConditionModel",
63
+ "_diffusers_version": "0.10.0.dev0",
64
+ "act_fn": "silu",
65
+ "attention_head_dim": [
66
+ 5,
67
+ 10,
68
+ 20,
69
+ 20
70
+ ],
71
+ "block_out_channels": [
72
+ 320,
73
+ 640,
74
+ 1280,
75
+ 1280
76
+ ],
77
+ "center_input_sample": false,
78
+ "cross_attention_dim": 1024,
79
+ "down_block_types": [
80
+ "CrossAttnDownBlock2D",
81
+ "CrossAttnDownBlock2D",
82
+ "CrossAttnDownBlock2D",
83
+ "DownBlock2D"
84
+ ],
85
+ "downsample_padding": 1,
86
+ "dual_cross_attention": false,
87
+ "flip_sin_to_cos": true,
88
+ "freq_shift": 0,
89
+ "in_channels": 4,
90
+ "layers_per_block": 2,
91
+ "mid_block_scale_factor": 1,
92
+ "norm_eps": 1e-05,
93
+ "norm_num_groups": 32,
94
+ "num_class_embeds": null,
95
+ "only_cross_attention": false,
96
+ "out_channels": 4,
97
+ "sample_size": 96,
98
+ "up_block_types": [
99
+ "UpBlock2D",
100
+ "CrossAttnUpBlock2D",
101
+ "CrossAttnUpBlock2D",
102
+ "CrossAttnUpBlock2D"
103
+ ],
104
+ "use_linear_projection": true,
105
+ "upcast_attention": true
106
+ }
107
+ """
108
+
109
+ import math
110
+ from types import SimpleNamespace
111
+ from typing import Dict, Optional, Tuple, Union
112
+ import torch
113
+ from torch import nn
114
+ from torch.nn import functional as F
115
+ from einops import rearrange
116
+ from library.utils import setup_logging
117
+ setup_logging()
118
+ import logging
119
+ logger = logging.getLogger(__name__)
120
+
121
+ BLOCK_OUT_CHANNELS: Tuple[int] = (320, 640, 1280, 1280)
122
+ TIMESTEP_INPUT_DIM = BLOCK_OUT_CHANNELS[0]
123
+ TIME_EMBED_DIM = BLOCK_OUT_CHANNELS[0] * 4
124
+ IN_CHANNELS: int = 4
125
+ OUT_CHANNELS: int = 4
126
+ LAYERS_PER_BLOCK: int = 2
127
+ LAYERS_PER_BLOCK_UP: int = LAYERS_PER_BLOCK + 1
128
+ TIME_EMBED_FLIP_SIN_TO_COS: bool = True
129
+ TIME_EMBED_FREQ_SHIFT: int = 0
130
+ NORM_GROUPS: int = 32
131
+ NORM_EPS: float = 1e-5
132
+ TRANSFORMER_NORM_NUM_GROUPS = 32
133
+
134
+ DOWN_BLOCK_TYPES = ["CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D"]
135
+ UP_BLOCK_TYPES = ["UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"]
136
+
137
+
138
+ # region memory efficient attention
139
+
140
+ # CrossAttention that uses FlashAttention
141
+ # based on https://github.com/lucidrains/memory-efficient-attention-pytorch/blob/main/memory_efficient_attention_pytorch/flash_attention.py
142
+ # LICENSE MIT https://github.com/lucidrains/memory-efficient-attention-pytorch/blob/main/LICENSE
143
+
144
+ # constants
145
+
146
+ EPSILON = 1e-6
147
+
148
+ # helper functions
149
+
150
+
151
+ def exists(val):
152
+ return val is not None
153
+
154
+
155
+ def default(val, d):
156
+ return val if exists(val) else d
157
+
158
+
159
+ # flash attention forwards and backwards
160
+
161
+ # https://arxiv.org/abs/2205.14135
162
+
163
+
164
+ class FlashAttentionFunction(torch.autograd.Function):
165
+ @staticmethod
166
+ @torch.no_grad()
167
+ def forward(ctx, q, k, v, mask, causal, q_bucket_size, k_bucket_size):
168
+ """Algorithm 2 in the paper"""
169
+
170
+ device = q.device
171
+ dtype = q.dtype
172
+ max_neg_value = -torch.finfo(q.dtype).max
173
+ qk_len_diff = max(k.shape[-2] - q.shape[-2], 0)
174
+
175
+ o = torch.zeros_like(q)
176
+ all_row_sums = torch.zeros((*q.shape[:-1], 1), dtype=dtype, device=device)
177
+ all_row_maxes = torch.full((*q.shape[:-1], 1), max_neg_value, dtype=dtype, device=device)
178
+
179
+ scale = q.shape[-1] ** -0.5
180
+
181
+ if not exists(mask):
182
+ mask = (None,) * math.ceil(q.shape[-2] / q_bucket_size)
183
+ else:
184
+ mask = rearrange(mask, "b n -> b 1 1 n")
185
+ mask = mask.split(q_bucket_size, dim=-1)
186
+
187
+ row_splits = zip(
188
+ q.split(q_bucket_size, dim=-2),
189
+ o.split(q_bucket_size, dim=-2),
190
+ mask,
191
+ all_row_sums.split(q_bucket_size, dim=-2),
192
+ all_row_maxes.split(q_bucket_size, dim=-2),
193
+ )
194
+
195
+ for ind, (qc, oc, row_mask, row_sums, row_maxes) in enumerate(row_splits):
196
+ q_start_index = ind * q_bucket_size - qk_len_diff
197
+
198
+ col_splits = zip(
199
+ k.split(k_bucket_size, dim=-2),
200
+ v.split(k_bucket_size, dim=-2),
201
+ )
202
+
203
+ for k_ind, (kc, vc) in enumerate(col_splits):
204
+ k_start_index = k_ind * k_bucket_size
205
+
206
+ attn_weights = torch.einsum("... i d, ... j d -> ... i j", qc, kc) * scale
207
+
208
+ if exists(row_mask):
209
+ attn_weights.masked_fill_(~row_mask, max_neg_value)
210
+
211
+ if causal and q_start_index < (k_start_index + k_bucket_size - 1):
212
+ causal_mask = torch.ones((qc.shape[-2], kc.shape[-2]), dtype=torch.bool, device=device).triu(
213
+ q_start_index - k_start_index + 1
214
+ )
215
+ attn_weights.masked_fill_(causal_mask, max_neg_value)
216
+
217
+ block_row_maxes = attn_weights.amax(dim=-1, keepdims=True)
218
+ attn_weights -= block_row_maxes
219
+ exp_weights = torch.exp(attn_weights)
220
+
221
+ if exists(row_mask):
222
+ exp_weights.masked_fill_(~row_mask, 0.0)
223
+
224
+ block_row_sums = exp_weights.sum(dim=-1, keepdims=True).clamp(min=EPSILON)
225
+
226
+ new_row_maxes = torch.maximum(block_row_maxes, row_maxes)
227
+
228
+ exp_values = torch.einsum("... i j, ... j d -> ... i d", exp_weights, vc)
229
+
230
+ exp_row_max_diff = torch.exp(row_maxes - new_row_maxes)
231
+ exp_block_row_max_diff = torch.exp(block_row_maxes - new_row_maxes)
232
+
233
+ new_row_sums = exp_row_max_diff * row_sums + exp_block_row_max_diff * block_row_sums
234
+
235
+ oc.mul_((row_sums / new_row_sums) * exp_row_max_diff).add_((exp_block_row_max_diff / new_row_sums) * exp_values)
236
+
237
+ row_maxes.copy_(new_row_maxes)
238
+ row_sums.copy_(new_row_sums)
239
+
240
+ ctx.args = (causal, scale, mask, q_bucket_size, k_bucket_size)
241
+ ctx.save_for_backward(q, k, v, o, all_row_sums, all_row_maxes)
242
+
243
+ return o
244
+
245
+ @staticmethod
246
+ @torch.no_grad()
247
+ def backward(ctx, do):
248
+ """Algorithm 4 in the paper"""
249
+
250
+ causal, scale, mask, q_bucket_size, k_bucket_size = ctx.args
251
+ q, k, v, o, l, m = ctx.saved_tensors
252
+
253
+ device = q.device
254
+
255
+ max_neg_value = -torch.finfo(q.dtype).max
256
+ qk_len_diff = max(k.shape[-2] - q.shape[-2], 0)
257
+
258
+ dq = torch.zeros_like(q)
259
+ dk = torch.zeros_like(k)
260
+ dv = torch.zeros_like(v)
261
+
262
+ row_splits = zip(
263
+ q.split(q_bucket_size, dim=-2),
264
+ o.split(q_bucket_size, dim=-2),
265
+ do.split(q_bucket_size, dim=-2),
266
+ mask,
267
+ l.split(q_bucket_size, dim=-2),
268
+ m.split(q_bucket_size, dim=-2),
269
+ dq.split(q_bucket_size, dim=-2),
270
+ )
271
+
272
+ for ind, (qc, oc, doc, row_mask, lc, mc, dqc) in enumerate(row_splits):
273
+ q_start_index = ind * q_bucket_size - qk_len_diff
274
+
275
+ col_splits = zip(
276
+ k.split(k_bucket_size, dim=-2),
277
+ v.split(k_bucket_size, dim=-2),
278
+ dk.split(k_bucket_size, dim=-2),
279
+ dv.split(k_bucket_size, dim=-2),
280
+ )
281
+
282
+ for k_ind, (kc, vc, dkc, dvc) in enumerate(col_splits):
283
+ k_start_index = k_ind * k_bucket_size
284
+
285
+ attn_weights = torch.einsum("... i d, ... j d -> ... i j", qc, kc) * scale
286
+
287
+ if causal and q_start_index < (k_start_index + k_bucket_size - 1):
288
+ causal_mask = torch.ones((qc.shape[-2], kc.shape[-2]), dtype=torch.bool, device=device).triu(
289
+ q_start_index - k_start_index + 1
290
+ )
291
+ attn_weights.masked_fill_(causal_mask, max_neg_value)
292
+
293
+ exp_attn_weights = torch.exp(attn_weights - mc)
294
+
295
+ if exists(row_mask):
296
+ exp_attn_weights.masked_fill_(~row_mask, 0.0)
297
+
298
+ p = exp_attn_weights / lc
299
+
300
+ dv_chunk = torch.einsum("... i j, ... i d -> ... j d", p, doc)
301
+ dp = torch.einsum("... i d, ... j d -> ... i j", doc, vc)
302
+
303
+ D = (doc * oc).sum(dim=-1, keepdims=True)
304
+ ds = p * scale * (dp - D)
305
+
306
+ dq_chunk = torch.einsum("... i j, ... j d -> ... i d", ds, kc)
307
+ dk_chunk = torch.einsum("... i j, ... i d -> ... j d", ds, qc)
308
+
309
+ dqc.add_(dq_chunk)
310
+ dkc.add_(dk_chunk)
311
+ dvc.add_(dv_chunk)
312
+
313
+ return dq, dk, dv, None, None, None, None
314
+
315
+
316
+ # endregion
317
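A minimal usage sketch of the FlashAttentionFunction defined above, mirroring how CrossAttention.forward_memory_efficient_mem_eff calls it later in this file; the tensor shapes are illustrative assumptions.

import torch

# q, k, v laid out as (batch, heads, sequence length, head dim); shapes are assumed for illustration
q = torch.randn(2, 8, 1024, 40)
k = torch.randn(2, 8, 1024, 40)
v = torch.randn(2, 8, 1024, 40)

# mask=None, causal=False, q_bucket_size=512, k_bucket_size=1024 match the values
# used by forward_memory_efficient_mem_eff below
out = FlashAttentionFunction.apply(q, k, v, None, False, 512, 1024)
print(out.shape)  # torch.Size([2, 8, 1024, 40])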
+
318
+
319
+ def get_parameter_dtype(parameter: torch.nn.Module):
320
+ return next(parameter.parameters()).dtype
321
+
322
+
323
+ def get_parameter_device(parameter: torch.nn.Module):
324
+ return next(parameter.parameters()).device
325
+
326
+
327
+ def get_timestep_embedding(
328
+ timesteps: torch.Tensor,
329
+ embedding_dim: int,
330
+ flip_sin_to_cos: bool = False,
331
+ downscale_freq_shift: float = 1,
332
+ scale: float = 1,
333
+ max_period: int = 10000,
334
+ ):
335
+ """
336
+ This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
337
+
338
+ :param timesteps: a 1-D Tensor of N indices, one per batch element.
339
+ These may be fractional.
340
+ :param embedding_dim: the dimension of the output.
+ :param max_period: controls the minimum frequency of the embeddings.
+ :return: an [N x dim] Tensor of positional embeddings.
342
+ """
343
+ assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
344
+
345
+ half_dim = embedding_dim // 2
346
+ exponent = -math.log(max_period) * torch.arange(start=0, end=half_dim, dtype=torch.float32, device=timesteps.device)
347
+ exponent = exponent / (half_dim - downscale_freq_shift)
348
+
349
+ emb = torch.exp(exponent)
350
+ emb = timesteps[:, None].float() * emb[None, :]
351
+
352
+ # scale embeddings
353
+ emb = scale * emb
354
+
355
+ # concat sine and cosine embeddings
356
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
357
+
358
+ # flip sine and cosine embeddings
359
+ if flip_sin_to_cos:
360
+ emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)
361
+
362
+ # zero pad
363
+ if embedding_dim % 2 == 1:
364
+ emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
365
+ return emb
366
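As a quick shape check of the helper above (a sketch; the embedding width 320 is an arbitrary assumption):

import torch

timesteps = torch.tensor([0, 250, 999])       # 1-D tensor with one timestep per batch element
emb = get_timestep_embedding(timesteps, 320)  # -> shape [3, 320], sine half followed by cosine half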
+
367
+
368
+ # Deep Shrink: this helper is deliberately not shared with other modules, to minimize dependencies.
369
+ def resize_like(x, target, mode="bicubic", align_corners=False):
370
+ org_dtype = x.dtype
371
+ if org_dtype == torch.bfloat16:
372
+ x = x.to(torch.float32)
373
+
374
+ if x.shape[-2:] != target.shape[-2:]:
375
+ if mode == "nearest":
376
+ x = F.interpolate(x, size=target.shape[-2:], mode=mode)
377
+ else:
378
+ x = F.interpolate(x, size=target.shape[-2:], mode=mode, align_corners=align_corners)
379
+
380
+ if org_dtype == torch.bfloat16:
381
+ x = x.to(org_dtype)
382
+ return x
383
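A short sketch of resize_like on 4-D feature maps; the channel count is illustrative:

import torch

x = torch.randn(1, 320, 32, 32)
target = torch.randn(1, 320, 64, 64)
y = resize_like(x, target)  # bicubic by default -> (1, 320, 64, 64); bfloat16 inputs are upcast internally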
+
384
+
385
+ class SampleOutput:
386
+ def __init__(self, sample):
387
+ self.sample = sample
388
+
389
+
390
+ class TimestepEmbedding(nn.Module):
391
+ def __init__(self, in_channels: int, time_embed_dim: int, act_fn: str = "silu", out_dim: int = None):
392
+ super().__init__()
393
+
394
+ self.linear_1 = nn.Linear(in_channels, time_embed_dim)
395
+ self.act = None
396
+ if act_fn == "silu":
397
+ self.act = nn.SiLU()
398
+ elif act_fn == "mish":
399
+ self.act = nn.Mish()
400
+
401
+ if out_dim is not None:
402
+ time_embed_dim_out = out_dim
403
+ else:
404
+ time_embed_dim_out = time_embed_dim
405
+ self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out)
406
+
407
+ def forward(self, sample):
408
+ sample = self.linear_1(sample)
409
+
410
+ if self.act is not None:
411
+ sample = self.act(sample)
412
+
413
+ sample = self.linear_2(sample)
414
+ return sample
415
+
416
+
417
+ class Timesteps(nn.Module):
418
+ def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float):
419
+ super().__init__()
420
+ self.num_channels = num_channels
421
+ self.flip_sin_to_cos = flip_sin_to_cos
422
+ self.downscale_freq_shift = downscale_freq_shift
423
+
424
+ def forward(self, timesteps):
425
+ t_emb = get_timestep_embedding(
426
+ timesteps,
427
+ self.num_channels,
428
+ flip_sin_to_cos=self.flip_sin_to_cos,
429
+ downscale_freq_shift=self.downscale_freq_shift,
430
+ )
431
+ return t_emb
432
+
433
+
434
+ class ResnetBlock2D(nn.Module):
435
+ def __init__(
436
+ self,
437
+ in_channels,
438
+ out_channels,
439
+ ):
440
+ super().__init__()
441
+ self.in_channels = in_channels
442
+ self.out_channels = out_channels
443
+
444
+ self.norm1 = torch.nn.GroupNorm(num_groups=NORM_GROUPS, num_channels=in_channels, eps=NORM_EPS, affine=True)
445
+
446
+ self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
447
+
448
+ self.time_emb_proj = torch.nn.Linear(TIME_EMBED_DIM, out_channels)
449
+
450
+ self.norm2 = torch.nn.GroupNorm(num_groups=NORM_GROUPS, num_channels=out_channels, eps=NORM_EPS, affine=True)
451
+ self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
452
+
453
+ # if non_linearity == "swish":
454
+ self.nonlinearity = lambda x: F.silu(x)
455
+
456
+ self.use_in_shortcut = self.in_channels != self.out_channels
457
+
458
+ self.conv_shortcut = None
459
+ if self.use_in_shortcut:
460
+ self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
461
+
462
+ def forward(self, input_tensor, temb):
463
+ hidden_states = input_tensor
464
+
465
+ hidden_states = self.norm1(hidden_states)
466
+ hidden_states = self.nonlinearity(hidden_states)
467
+
468
+ hidden_states = self.conv1(hidden_states)
469
+
470
+ temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None]
471
+ hidden_states = hidden_states + temb
472
+
473
+ hidden_states = self.norm2(hidden_states)
474
+ hidden_states = self.nonlinearity(hidden_states)
475
+
476
+ hidden_states = self.conv2(hidden_states)
477
+
478
+ if self.conv_shortcut is not None:
479
+ input_tensor = self.conv_shortcut(input_tensor)
480
+
481
+ output_tensor = input_tensor + hidden_states
482
+
483
+ return output_tensor
484
+
485
+
486
+ class DownBlock2D(nn.Module):
487
+ def __init__(
488
+ self,
489
+ in_channels: int,
490
+ out_channels: int,
491
+ add_downsample=True,
492
+ ):
493
+ super().__init__()
494
+
495
+ self.has_cross_attention = False
496
+ resnets = []
497
+
498
+ for i in range(LAYERS_PER_BLOCK):
499
+ in_channels = in_channels if i == 0 else out_channels
500
+ resnets.append(
501
+ ResnetBlock2D(
502
+ in_channels=in_channels,
503
+ out_channels=out_channels,
504
+ )
505
+ )
506
+ self.resnets = nn.ModuleList(resnets)
507
+
508
+ if add_downsample:
509
+ self.downsamplers = [Downsample2D(out_channels, out_channels=out_channels)]
510
+ else:
511
+ self.downsamplers = None
512
+
513
+ self.gradient_checkpointing = False
514
+
515
+ def set_use_memory_efficient_attention(self, xformers, mem_eff):
516
+ pass
517
+
518
+ def set_use_sdpa(self, sdpa):
519
+ pass
520
+
521
+ def forward(self, hidden_states, temb=None):
522
+ output_states = ()
523
+
524
+ for resnet in self.resnets:
525
+ if self.training and self.gradient_checkpointing:
526
+
527
+ def create_custom_forward(module):
528
+ def custom_forward(*inputs):
529
+ return module(*inputs)
530
+
531
+ return custom_forward
532
+
533
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
534
+ else:
535
+ hidden_states = resnet(hidden_states, temb)
536
+
537
+ output_states += (hidden_states,)
538
+
539
+ if self.downsamplers is not None:
540
+ for downsampler in self.downsamplers:
541
+ hidden_states = downsampler(hidden_states)
542
+
543
+ output_states += (hidden_states,)
544
+
545
+ return hidden_states, output_states
546
+
547
+
548
+ class Downsample2D(nn.Module):
549
+ def __init__(self, channels, out_channels):
550
+ super().__init__()
551
+
552
+ self.channels = channels
553
+ self.out_channels = out_channels
554
+
555
+ self.conv = nn.Conv2d(self.channels, self.out_channels, 3, stride=2, padding=1)
556
+
557
+ def forward(self, hidden_states):
558
+ assert hidden_states.shape[1] == self.channels
559
+ hidden_states = self.conv(hidden_states)
560
+
561
+ return hidden_states
562
+
563
+
564
+ class CrossAttention(nn.Module):
565
+ def __init__(
566
+ self,
567
+ query_dim: int,
568
+ cross_attention_dim: Optional[int] = None,
569
+ heads: int = 8,
570
+ dim_head: int = 64,
571
+ upcast_attention: bool = False,
572
+ ):
573
+ super().__init__()
574
+ inner_dim = dim_head * heads
575
+ cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
576
+ self.upcast_attention = upcast_attention
577
+
578
+ self.scale = dim_head**-0.5
579
+ self.heads = heads
580
+
581
+ self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
582
+ self.to_k = nn.Linear(cross_attention_dim, inner_dim, bias=False)
583
+ self.to_v = nn.Linear(cross_attention_dim, inner_dim, bias=False)
584
+
585
+ self.to_out = nn.ModuleList([])
586
+ self.to_out.append(nn.Linear(inner_dim, query_dim))
587
+ # no dropout here
588
+
589
+ self.use_memory_efficient_attention_xformers = False
590
+ self.use_memory_efficient_attention_mem_eff = False
591
+ self.use_sdpa = False
592
+
593
+ # Attention processor
594
+ self.processor = None
595
+
596
+ def set_use_memory_efficient_attention(self, xformers, mem_eff):
597
+ self.use_memory_efficient_attention_xformers = xformers
598
+ self.use_memory_efficient_attention_mem_eff = mem_eff
599
+
600
+ def set_use_sdpa(self, sdpa):
601
+ self.use_sdpa = sdpa
602
+
603
+ def reshape_heads_to_batch_dim(self, tensor):
604
+ batch_size, seq_len, dim = tensor.shape
605
+ head_size = self.heads
606
+ tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
607
+ tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size * head_size, seq_len, dim // head_size)
608
+ return tensor
609
+
610
+ def reshape_batch_dim_to_heads(self, tensor):
611
+ batch_size, seq_len, dim = tensor.shape
612
+ head_size = self.heads
613
+ tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
614
+ tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size)
615
+ return tensor
616
+
617
+ def set_processor(self):
618
+ return self.processor
619
+
620
+ def get_processor(self):
621
+ return self.processor
622
+
623
+ def forward(self, hidden_states, context=None, mask=None, **kwargs):
624
+ if self.processor is not None:
625
+ (
626
+ hidden_states,
627
+ encoder_hidden_states,
628
+ attention_mask,
629
+ ) = translate_attention_names_from_diffusers(
630
+ hidden_states=hidden_states, context=context, mask=mask, **kwargs
631
+ )
632
+ return self.processor(
633
+ attn=self,
634
+ hidden_states=hidden_states,
635
+ encoder_hidden_states=context,
636
+ attention_mask=mask,
637
+ **kwargs
638
+ )
639
+ if self.use_memory_efficient_attention_xformers:
640
+ return self.forward_memory_efficient_xformers(hidden_states, context, mask)
641
+ if self.use_memory_efficient_attention_mem_eff:
642
+ return self.forward_memory_efficient_mem_eff(hidden_states, context, mask)
643
+ if self.use_sdpa:
644
+ return self.forward_sdpa(hidden_states, context, mask)
645
+
646
+ query = self.to_q(hidden_states)
647
+ context = context if context is not None else hidden_states
648
+ key = self.to_k(context)
649
+ value = self.to_v(context)
650
+
651
+ query = self.reshape_heads_to_batch_dim(query)
652
+ key = self.reshape_heads_to_batch_dim(key)
653
+ value = self.reshape_heads_to_batch_dim(value)
654
+
655
+ hidden_states = self._attention(query, key, value)
656
+
657
+ # linear proj
658
+ hidden_states = self.to_out[0](hidden_states)
659
+ # hidden_states = self.to_out[1](hidden_states) # no dropout
660
+ return hidden_states
661
+
662
+ def _attention(self, query, key, value):
663
+ if self.upcast_attention:
664
+ query = query.float()
665
+ key = key.float()
666
+
667
+ attention_scores = torch.baddbmm(
668
+ torch.empty(query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device),
669
+ query,
670
+ key.transpose(-1, -2),
671
+ beta=0,
672
+ alpha=self.scale,
673
+ )
674
+ attention_probs = attention_scores.softmax(dim=-1)
675
+
676
+ # cast back to the original dtype
677
+ attention_probs = attention_probs.to(value.dtype)
678
+
679
+ # compute attention output
680
+ hidden_states = torch.bmm(attention_probs, value)
681
+
682
+ # reshape hidden_states
683
+ hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
684
+ return hidden_states
685
+
686
+ # TODO support Hypernetworks
687
+ def forward_memory_efficient_xformers(self, x, context=None, mask=None):
688
+ import xformers.ops
689
+
690
+ h = self.heads
691
+ q_in = self.to_q(x)
692
+ context = context if context is not None else x
693
+ context = context.to(x.dtype)
694
+ k_in = self.to_k(context)
695
+ v_in = self.to_v(context)
696
+
697
+ q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b n h d", h=h), (q_in, k_in, v_in))
698
+ del q_in, k_in, v_in
699
+
700
+ q = q.contiguous()
701
+ k = k.contiguous()
702
+ v = v.contiguous()
703
+ out = xformers.ops.memory_efficient_attention(q, k, v, attn_bias=None)  # xformers picks the best kernel automatically
704
+
705
+ out = rearrange(out, "b n h d -> b n (h d)", h=h)
706
+
707
+ out = self.to_out[0](out)
708
+ return out
709
+
710
+ def forward_memory_efficient_mem_eff(self, x, context=None, mask=None):
711
+ flash_func = FlashAttentionFunction
712
+
713
+ q_bucket_size = 512
714
+ k_bucket_size = 1024
715
+
716
+ h = self.heads
717
+ q = self.to_q(x)
718
+ context = context if context is not None else x
719
+ context = context.to(x.dtype)
720
+ k = self.to_k(context)
721
+ v = self.to_v(context)
722
+ del context, x
723
+
724
+ q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v))
725
+
726
+ out = flash_func.apply(q, k, v, mask, False, q_bucket_size, k_bucket_size)
727
+
728
+ out = rearrange(out, "b h n d -> b n (h d)")
729
+
730
+ out = self.to_out[0](out)
731
+ return out
732
+
733
+ def forward_sdpa(self, x, context=None, mask=None):
734
+ h = self.heads
735
+ q_in = self.to_q(x)
736
+ context = context if context is not None else x
737
+ context = context.to(x.dtype)
738
+ k_in = self.to_k(context)
739
+ v_in = self.to_v(context)
740
+
741
+ q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q_in, k_in, v_in))
742
+ del q_in, k_in, v_in
743
+
744
+ out = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False)
745
+
746
+ out = rearrange(out, "b h n d -> b n (h d)", h=h)
747
+
748
+ out = self.to_out[0](out)
749
+ return out
750
+
751
+ def translate_attention_names_from_diffusers(
752
+ hidden_states: torch.FloatTensor,
753
+ context: Optional[torch.FloatTensor] = None,
754
+ mask: Optional[torch.FloatTensor] = None,
755
+ # HF naming
756
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
757
+ attention_mask: Optional[torch.FloatTensor] = None
758
+ ):
759
+ # translate from hugging face diffusers
760
+ context = context if context is not None else encoder_hidden_states
761
+
762
+ # translate from hugging face diffusers
763
+ mask = mask if mask is not None else attention_mask
764
+
765
+ return hidden_states, context, mask
766
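The optimized attention paths above are selected through flags rather than call arguments. A minimal sketch of switching a CrossAttention module to the PyTorch SDPA backend; the dimensions are illustrative assumptions:

import torch

attn = CrossAttention(query_dim=320, cross_attention_dim=768, heads=8, dim_head=40)
attn.set_use_sdpa(True)                                # route forward() through forward_sdpa
attn.set_use_memory_efficient_attention(False, False)  # keep the xformers / FlashAttention paths off

x = torch.randn(1, 4096, 320)  # (batch, query tokens, query_dim)
ctx = torch.randn(1, 77, 768)  # (batch, context tokens, cross_attention_dim)
out = attn(x, context=ctx)     # -> (1, 4096, 320)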
+
767
+ # feedforward
768
+ class GEGLU(nn.Module):
769
+ r"""
770
+ A variant of the gated linear unit activation function from https://arxiv.org/abs/2002.05202.
771
+
772
+ Parameters:
773
+ dim_in (`int`): The number of channels in the input.
774
+ dim_out (`int`): The number of channels in the output.
775
+ """
776
+
777
+ def __init__(self, dim_in: int, dim_out: int):
778
+ super().__init__()
779
+ self.proj = nn.Linear(dim_in, dim_out * 2)
780
+
781
+ def gelu(self, gate):
782
+ if gate.device.type != "mps":
783
+ return F.gelu(gate)
784
+ # mps: gelu is not implemented for float16
785
+ return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype)
786
+
787
+ def forward(self, hidden_states):
788
+ hidden_states, gate = self.proj(hidden_states).chunk(2, dim=-1)
789
+ return hidden_states * self.gelu(gate)
790
+
791
+
792
+ class FeedForward(nn.Module):
793
+ def __init__(
794
+ self,
795
+ dim: int,
796
+ ):
797
+ super().__init__()
798
+ inner_dim = int(dim * 4) # mult is always 4
799
+
800
+ self.net = nn.ModuleList([])
801
+ # project in
802
+ self.net.append(GEGLU(dim, inner_dim))
803
+ # project dropout
804
+ self.net.append(nn.Identity()) # nn.Dropout(0)) # dummy for dropout with 0
805
+ # project out
806
+ self.net.append(nn.Linear(inner_dim, dim))
807
+
808
+ def forward(self, hidden_states):
809
+ for module in self.net:
810
+ hidden_states = module(hidden_states)
811
+ return hidden_states
812
+
813
+
814
+ class BasicTransformerBlock(nn.Module):
815
+ def __init__(
816
+ self, dim: int, num_attention_heads: int, attention_head_dim: int, cross_attention_dim: int, upcast_attention: bool = False
817
+ ):
818
+ super().__init__()
819
+
820
+ # 1. Self-Attn
821
+ self.attn1 = CrossAttention(
822
+ query_dim=dim,
823
+ cross_attention_dim=None,
824
+ heads=num_attention_heads,
825
+ dim_head=attention_head_dim,
826
+ upcast_attention=upcast_attention,
827
+ )
828
+ self.ff = FeedForward(dim)
829
+
830
+ # 2. Cross-Attn
831
+ self.attn2 = CrossAttention(
832
+ query_dim=dim,
833
+ cross_attention_dim=cross_attention_dim,
834
+ heads=num_attention_heads,
835
+ dim_head=attention_head_dim,
836
+ upcast_attention=upcast_attention,
837
+ )
838
+
839
+ self.norm1 = nn.LayerNorm(dim)
840
+ self.norm2 = nn.LayerNorm(dim)
841
+
842
+ # 3. Feed-forward
843
+ self.norm3 = nn.LayerNorm(dim)
844
+
845
+ def set_use_memory_efficient_attention(self, xformers: bool, mem_eff: bool):
846
+ self.attn1.set_use_memory_efficient_attention(xformers, mem_eff)
847
+ self.attn2.set_use_memory_efficient_attention(xformers, mem_eff)
848
+
849
+ def set_use_sdpa(self, sdpa: bool):
850
+ self.attn1.set_use_sdpa(sdpa)
851
+ self.attn2.set_use_sdpa(sdpa)
852
+
853
+ def forward(self, hidden_states, context=None, timestep=None):
854
+ # 1. Self-Attention
855
+ norm_hidden_states = self.norm1(hidden_states)
856
+
857
+ hidden_states = self.attn1(norm_hidden_states) + hidden_states
858
+
859
+ # 2. Cross-Attention
860
+ norm_hidden_states = self.norm2(hidden_states)
861
+ hidden_states = self.attn2(norm_hidden_states, context=context) + hidden_states
862
+
863
+ # 3. Feed-forward
864
+ hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states
865
+
866
+ return hidden_states
867
+
868
+
869
+ class Transformer2DModel(nn.Module):
870
+ def __init__(
871
+ self,
872
+ num_attention_heads: int = 16,
873
+ attention_head_dim: int = 88,
874
+ in_channels: Optional[int] = None,
875
+ cross_attention_dim: Optional[int] = None,
876
+ use_linear_projection: bool = False,
877
+ upcast_attention: bool = False,
878
+ ):
879
+ super().__init__()
880
+ self.in_channels = in_channels
881
+ self.num_attention_heads = num_attention_heads
882
+ self.attention_head_dim = attention_head_dim
883
+ inner_dim = num_attention_heads * attention_head_dim
884
+ self.use_linear_projection = use_linear_projection
885
+
886
+ self.norm = torch.nn.GroupNorm(num_groups=TRANSFORMER_NORM_NUM_GROUPS, num_channels=in_channels, eps=1e-6, affine=True)
887
+
888
+ if use_linear_projection:
889
+ self.proj_in = nn.Linear(in_channels, inner_dim)
890
+ else:
891
+ self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
892
+
893
+ self.transformer_blocks = nn.ModuleList(
894
+ [
895
+ BasicTransformerBlock(
896
+ inner_dim,
897
+ num_attention_heads,
898
+ attention_head_dim,
899
+ cross_attention_dim=cross_attention_dim,
900
+ upcast_attention=upcast_attention,
901
+ )
902
+ ]
903
+ )
904
+
905
+ if use_linear_projection:
906
+ self.proj_out = nn.Linear(in_channels, inner_dim)
907
+ else:
908
+ self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
909
+
910
+ def set_use_memory_efficient_attention(self, xformers, mem_eff):
911
+ for transformer in self.transformer_blocks:
912
+ transformer.set_use_memory_efficient_attention(xformers, mem_eff)
913
+
914
+ def set_use_sdpa(self, sdpa):
915
+ for transformer in self.transformer_blocks:
916
+ transformer.set_use_sdpa(sdpa)
917
+
918
+ def forward(self, hidden_states, encoder_hidden_states=None, timestep=None, return_dict: bool = True):
919
+ # 1. Input
920
+ batch, _, height, weight = hidden_states.shape
921
+ residual = hidden_states
922
+
923
+ hidden_states = self.norm(hidden_states)
924
+ if not self.use_linear_projection:
925
+ hidden_states = self.proj_in(hidden_states)
926
+ inner_dim = hidden_states.shape[1]
927
+ hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
928
+ else:
929
+ inner_dim = hidden_states.shape[1]
930
+ hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
931
+ hidden_states = self.proj_in(hidden_states)
932
+
933
+ # 2. Blocks
934
+ for block in self.transformer_blocks:
935
+ hidden_states = block(hidden_states, context=encoder_hidden_states, timestep=timestep)
936
+
937
+ # 3. Output
938
+ if not self.use_linear_projection:
939
+ hidden_states = hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2).contiguous()
940
+ hidden_states = self.proj_out(hidden_states)
941
+ else:
942
+ hidden_states = self.proj_out(hidden_states)
943
+ hidden_states = hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2).contiguous()
944
+
945
+ output = hidden_states + residual
946
+
947
+ if not return_dict:
948
+ return (output,)
949
+
950
+ return SampleOutput(sample=output)
951
+
952
+
953
+ class CrossAttnDownBlock2D(nn.Module):
954
+ def __init__(
955
+ self,
956
+ in_channels: int,
957
+ out_channels: int,
958
+ add_downsample=True,
959
+ cross_attention_dim=1280,
960
+ attn_num_head_channels=1,
961
+ use_linear_projection=False,
962
+ upcast_attention=False,
963
+ ):
964
+ super().__init__()
965
+ self.has_cross_attention = True
966
+ resnets = []
967
+ attentions = []
968
+
969
+ self.attn_num_head_channels = attn_num_head_channels
970
+
971
+ for i in range(LAYERS_PER_BLOCK):
972
+ in_channels = in_channels if i == 0 else out_channels
973
+
974
+ resnets.append(ResnetBlock2D(in_channels=in_channels, out_channels=out_channels))
975
+ attentions.append(
976
+ Transformer2DModel(
977
+ attn_num_head_channels,
978
+ out_channels // attn_num_head_channels,
979
+ in_channels=out_channels,
980
+ cross_attention_dim=cross_attention_dim,
981
+ use_linear_projection=use_linear_projection,
982
+ upcast_attention=upcast_attention,
983
+ )
984
+ )
985
+ self.attentions = nn.ModuleList(attentions)
986
+ self.resnets = nn.ModuleList(resnets)
987
+
988
+ if add_downsample:
989
+ self.downsamplers = nn.ModuleList([Downsample2D(out_channels, out_channels)])
990
+ else:
991
+ self.downsamplers = None
992
+
993
+ self.gradient_checkpointing = False
994
+
995
+ def set_use_memory_efficient_attention(self, xformers, mem_eff):
996
+ for attn in self.attentions:
997
+ attn.set_use_memory_efficient_attention(xformers, mem_eff)
998
+
999
+ def set_use_sdpa(self, sdpa):
1000
+ for attn in self.attentions:
1001
+ attn.set_use_sdpa(sdpa)
1002
+
1003
+ def forward(self, hidden_states, temb=None, encoder_hidden_states=None):
1004
+ output_states = ()
1005
+
1006
+ for resnet, attn in zip(self.resnets, self.attentions):
1007
+ if self.training and self.gradient_checkpointing:
1008
+
1009
+ def create_custom_forward(module, return_dict=None):
1010
+ def custom_forward(*inputs):
1011
+ if return_dict is not None:
1012
+ return module(*inputs, return_dict=return_dict)
1013
+ else:
1014
+ return module(*inputs)
1015
+
1016
+ return custom_forward
1017
+
1018
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
1019
+ hidden_states = torch.utils.checkpoint.checkpoint(
1020
+ create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states
1021
+ )[0]
1022
+ else:
1023
+ hidden_states = resnet(hidden_states, temb)
1024
+ hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample
1025
+
1026
+ output_states += (hidden_states,)
1027
+
1028
+ if self.downsamplers is not None:
1029
+ for downsampler in self.downsamplers:
1030
+ hidden_states = downsampler(hidden_states)
1031
+
1032
+ output_states += (hidden_states,)
1033
+
1034
+ return hidden_states, output_states
1035
+
1036
+
1037
+ class UNetMidBlock2DCrossAttn(nn.Module):
1038
+ def __init__(
1039
+ self,
1040
+ in_channels: int,
1041
+ attn_num_head_channels=1,
1042
+ cross_attention_dim=1280,
1043
+ use_linear_projection=False,
1044
+ ):
1045
+ super().__init__()
1046
+
1047
+ self.has_cross_attention = True
1048
+ self.attn_num_head_channels = attn_num_head_channels
1049
+
1050
+ # Middle block has two resnets and one attention
1051
+ resnets = [
1052
+ ResnetBlock2D(
1053
+ in_channels=in_channels,
1054
+ out_channels=in_channels,
1055
+ ),
1056
+ ResnetBlock2D(
1057
+ in_channels=in_channels,
1058
+ out_channels=in_channels,
1059
+ ),
1060
+ ]
1061
+ attentions = [
1062
+ Transformer2DModel(
1063
+ attn_num_head_channels,
1064
+ in_channels // attn_num_head_channels,
1065
+ in_channels=in_channels,
1066
+ cross_attention_dim=cross_attention_dim,
1067
+ use_linear_projection=use_linear_projection,
1068
+ )
1069
+ ]
1070
+
1071
+ self.attentions = nn.ModuleList(attentions)
1072
+ self.resnets = nn.ModuleList(resnets)
1073
+
1074
+ self.gradient_checkpointing = False
1075
+
1076
+ def set_use_memory_efficient_attention(self, xformers, mem_eff):
1077
+ for attn in self.attentions:
1078
+ attn.set_use_memory_efficient_attention(xformers, mem_eff)
1079
+
1080
+ def set_use_sdpa(self, sdpa):
1081
+ for attn in self.attentions:
1082
+ attn.set_use_sdpa(sdpa)
1083
+
1084
+ def forward(self, hidden_states, temb=None, encoder_hidden_states=None):
1085
+ for i, resnet in enumerate(self.resnets):
1086
+ attn = None if i == 0 else self.attentions[i - 1]
1087
+
1088
+ if self.training and self.gradient_checkpointing:
1089
+
1090
+ def create_custom_forward(module, return_dict=None):
1091
+ def custom_forward(*inputs):
1092
+ if return_dict is not None:
1093
+ return module(*inputs, return_dict=return_dict)
1094
+ else:
1095
+ return module(*inputs)
1096
+
1097
+ return custom_forward
1098
+
1099
+ if attn is not None:
1100
+ hidden_states = torch.utils.checkpoint.checkpoint(
1101
+ create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states
1102
+ )[0]
1103
+
1104
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
1105
+ else:
1106
+ if attn is not None:
1107
+ hidden_states = attn(hidden_states, encoder_hidden_states).sample
1108
+ hidden_states = resnet(hidden_states, temb)
1109
+
1110
+ return hidden_states
1111
+
1112
+
1113
+ class Upsample2D(nn.Module):
1114
+ def __init__(self, channels, out_channels):
1115
+ super().__init__()
1116
+ self.channels = channels
1117
+ self.out_channels = out_channels
1118
+ self.conv = nn.Conv2d(self.channels, self.out_channels, 3, padding=1)
1119
+
1120
+ def forward(self, hidden_states, output_size):
1121
+ assert hidden_states.shape[1] == self.channels
1122
+
1123
+ # Cast to float32 as the 'upsample_nearest2d_out_frame' op does not support bfloat16
1124
+ # TODO(Suraj): Remove this cast once the issue is fixed in PyTorch
1125
+ # https://github.com/pytorch/pytorch/issues/86679
1126
+ dtype = hidden_states.dtype
1127
+ if dtype == torch.bfloat16:
1128
+ hidden_states = hidden_states.to(torch.float32)
1129
+
1130
+ # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
1131
+ if hidden_states.shape[0] >= 64:
1132
+ hidden_states = hidden_states.contiguous()
1133
+
1134
+ # if `output_size` is passed we force the interpolation output size and do not make use of `scale_factor=2`
1135
+ if output_size is None:
1136
+ hidden_states = F.interpolate(hidden_states, scale_factor=2.0, mode="nearest")
1137
+ else:
1138
+ hidden_states = F.interpolate(hidden_states, size=output_size, mode="nearest")
1139
+
1140
+ # If the input is bfloat16, we cast back to bfloat16
1141
+ if dtype == torch.bfloat16:
1142
+ hidden_states = hidden_states.to(dtype)
1143
+
1144
+ hidden_states = self.conv(hidden_states)
1145
+
1146
+ return hidden_states
1147
+
1148
+
1149
+ class UpBlock2D(nn.Module):
1150
+ def __init__(
1151
+ self,
1152
+ in_channels: int,
1153
+ prev_output_channel: int,
1154
+ out_channels: int,
1155
+ add_upsample=True,
1156
+ ):
1157
+ super().__init__()
1158
+
1159
+ self.has_cross_attention = False
1160
+ resnets = []
1161
+
1162
+ for i in range(LAYERS_PER_BLOCK_UP):
1163
+ res_skip_channels = in_channels if (i == LAYERS_PER_BLOCK_UP - 1) else out_channels
1164
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
1165
+
1166
+ resnets.append(
1167
+ ResnetBlock2D(
1168
+ in_channels=resnet_in_channels + res_skip_channels,
1169
+ out_channels=out_channels,
1170
+ )
1171
+ )
1172
+
1173
+ self.resnets = nn.ModuleList(resnets)
1174
+
1175
+ if add_upsample:
1176
+ self.upsamplers = nn.ModuleList([Upsample2D(out_channels, out_channels)])
1177
+ else:
1178
+ self.upsamplers = None
1179
+
1180
+ self.gradient_checkpointing = False
1181
+
1182
+ def set_use_memory_efficient_attention(self, xformers, mem_eff):
1183
+ pass
1184
+
1185
+ def set_use_sdpa(self, sdpa):
1186
+ pass
1187
+
1188
+ def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None):
1189
+ for resnet in self.resnets:
1190
+ # pop res hidden states
1191
+ res_hidden_states = res_hidden_states_tuple[-1]
1192
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
1193
+
1194
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
1195
+
1196
+ if self.training and self.gradient_checkpointing:
1197
+
1198
+ def create_custom_forward(module):
1199
+ def custom_forward(*inputs):
1200
+ return module(*inputs)
1201
+
1202
+ return custom_forward
1203
+
1204
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
1205
+ else:
1206
+ hidden_states = resnet(hidden_states, temb)
1207
+
1208
+ if self.upsamplers is not None:
1209
+ for upsampler in self.upsamplers:
1210
+ hidden_states = upsampler(hidden_states, upsample_size)
1211
+
1212
+ return hidden_states
1213
+
1214
+
1215
+ class CrossAttnUpBlock2D(nn.Module):
1216
+ def __init__(
1217
+ self,
1218
+ in_channels: int,
1219
+ out_channels: int,
1220
+ prev_output_channel: int,
1221
+ attn_num_head_channels=1,
1222
+ cross_attention_dim=1280,
1223
+ add_upsample=True,
1224
+ use_linear_projection=False,
1225
+ upcast_attention=False,
1226
+ ):
1227
+ super().__init__()
1228
+ resnets = []
1229
+ attentions = []
1230
+
1231
+ self.has_cross_attention = True
1232
+ self.attn_num_head_channels = attn_num_head_channels
1233
+
1234
+ for i in range(LAYERS_PER_BLOCK_UP):
1235
+ res_skip_channels = in_channels if (i == LAYERS_PER_BLOCK_UP - 1) else out_channels
1236
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
1237
+
1238
+ resnets.append(
1239
+ ResnetBlock2D(
1240
+ in_channels=resnet_in_channels + res_skip_channels,
1241
+ out_channels=out_channels,
1242
+ )
1243
+ )
1244
+ attentions.append(
1245
+ Transformer2DModel(
1246
+ attn_num_head_channels,
1247
+ out_channels // attn_num_head_channels,
1248
+ in_channels=out_channels,
1249
+ cross_attention_dim=cross_attention_dim,
1250
+ use_linear_projection=use_linear_projection,
1251
+ upcast_attention=upcast_attention,
1252
+ )
1253
+ )
1254
+
1255
+ self.attentions = nn.ModuleList(attentions)
1256
+ self.resnets = nn.ModuleList(resnets)
1257
+
1258
+ if add_upsample:
1259
+ self.upsamplers = nn.ModuleList([Upsample2D(out_channels, out_channels)])
1260
+ else:
1261
+ self.upsamplers = None
1262
+
1263
+ self.gradient_checkpointing = False
1264
+
1265
+ def set_use_memory_efficient_attention(self, xformers, mem_eff):
1266
+ for attn in self.attentions:
1267
+ attn.set_use_memory_efficient_attention(xformers, mem_eff)
1268
+
1269
+ def set_use_sdpa(self, sdpa):
1270
+ for attn in self.attentions:
1271
+ attn.set_use_sdpa(sdpa)
1272
+
1273
+ def forward(
1274
+ self,
1275
+ hidden_states,
1276
+ res_hidden_states_tuple,
1277
+ temb=None,
1278
+ encoder_hidden_states=None,
1279
+ upsample_size=None,
1280
+ ):
1281
+ for resnet, attn in zip(self.resnets, self.attentions):
1282
+ # pop res hidden states
1283
+ res_hidden_states = res_hidden_states_tuple[-1]
1284
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
1285
+
1286
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
1287
+
1288
+ if self.training and self.gradient_checkpointing:
1289
+
1290
+ def create_custom_forward(module, return_dict=None):
1291
+ def custom_forward(*inputs):
1292
+ if return_dict is not None:
1293
+ return module(*inputs, return_dict=return_dict)
1294
+ else:
1295
+ return module(*inputs)
1296
+
1297
+ return custom_forward
1298
+
1299
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
1300
+ hidden_states = torch.utils.checkpoint.checkpoint(
1301
+ create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states
1302
+ )[0]
1303
+ else:
1304
+ hidden_states = resnet(hidden_states, temb)
1305
+ hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample
1306
+
1307
+ if self.upsamplers is not None:
1308
+ for upsampler in self.upsamplers:
1309
+ hidden_states = upsampler(hidden_states, upsample_size)
1310
+
1311
+ return hidden_states
1312
+
1313
+
1314
+ def get_down_block(
1315
+ down_block_type,
1316
+ in_channels,
1317
+ out_channels,
1318
+ add_downsample,
1319
+ attn_num_head_channels,
1320
+ cross_attention_dim,
1321
+ use_linear_projection,
1322
+ upcast_attention,
1323
+ ):
1324
+ if down_block_type == "DownBlock2D":
1325
+ return DownBlock2D(
1326
+ in_channels=in_channels,
1327
+ out_channels=out_channels,
1328
+ add_downsample=add_downsample,
1329
+ )
1330
+ elif down_block_type == "CrossAttnDownBlock2D":
1331
+ return CrossAttnDownBlock2D(
1332
+ in_channels=in_channels,
1333
+ out_channels=out_channels,
1334
+ add_downsample=add_downsample,
1335
+ cross_attention_dim=cross_attention_dim,
1336
+ attn_num_head_channels=attn_num_head_channels,
1337
+ use_linear_projection=use_linear_projection,
1338
+ upcast_attention=upcast_attention,
1339
+ )
1340
+
1341
+
1342
+ def get_up_block(
1343
+ up_block_type,
1344
+ in_channels,
1345
+ out_channels,
1346
+ prev_output_channel,
1347
+ add_upsample,
1348
+ attn_num_head_channels,
1349
+ cross_attention_dim=None,
1350
+ use_linear_projection=False,
1351
+ upcast_attention=False,
1352
+ ):
1353
+ if up_block_type == "UpBlock2D":
1354
+ return UpBlock2D(
1355
+ in_channels=in_channels,
1356
+ prev_output_channel=prev_output_channel,
1357
+ out_channels=out_channels,
1358
+ add_upsample=add_upsample,
1359
+ )
1360
+ elif up_block_type == "CrossAttnUpBlock2D":
1361
+ return CrossAttnUpBlock2D(
1362
+ in_channels=in_channels,
1363
+ out_channels=out_channels,
1364
+ prev_output_channel=prev_output_channel,
1365
+ attn_num_head_channels=attn_num_head_channels,
1366
+ cross_attention_dim=cross_attention_dim,
1367
+ add_upsample=add_upsample,
1368
+ use_linear_projection=use_linear_projection,
1369
+ upcast_attention=upcast_attention,
1370
+ )
1371
+
1372
+
1373
+ class UNet2DConditionModel(nn.Module):
1374
+ _supports_gradient_checkpointing = True
1375
+
1376
+ def __init__(
1377
+ self,
1378
+ sample_size: Optional[int] = None,
1379
+ attention_head_dim: Union[int, Tuple[int]] = 8,
1380
+ cross_attention_dim: int = 1280,
1381
+ use_linear_projection: bool = False,
1382
+ upcast_attention: bool = False,
1383
+ **kwargs,
1384
+ ):
1385
+ super().__init__()
1386
+ assert sample_size is not None, "sample_size must be specified"
1387
+ logger.info(
1388
+ f"UNet2DConditionModel: {sample_size}, {attention_head_dim}, {cross_attention_dim}, {use_linear_projection}, {upcast_attention}"
1389
+ )
1390
+
1391
+ # defined here so they can be referenced from outside
1392
+ self.in_channels = IN_CHANNELS
1393
+ self.out_channels = OUT_CHANNELS
1394
+
1395
+ self.sample_size = sample_size
1396
+ self.prepare_config(sample_size=sample_size)
1397
+
1398
+ # the way modules are held cannot be changed, or the state_dict layout would change
1399
+
1400
+ # input
1401
+ self.conv_in = nn.Conv2d(IN_CHANNELS, BLOCK_OUT_CHANNELS[0], kernel_size=3, padding=(1, 1))
1402
+
1403
+ # time
1404
+ self.time_proj = Timesteps(BLOCK_OUT_CHANNELS[0], TIME_EMBED_FLIP_SIN_TO_COS, TIME_EMBED_FREQ_SHIFT)
1405
+
1406
+ self.time_embedding = TimestepEmbedding(TIMESTEP_INPUT_DIM, TIME_EMBED_DIM)
1407
+
1408
+ self.down_blocks = nn.ModuleList([])
1409
+ self.mid_block = None
1410
+ self.up_blocks = nn.ModuleList([])
1411
+
1412
+ if isinstance(attention_head_dim, int):
1413
+ attention_head_dim = (attention_head_dim,) * 4
1414
+
1415
+ # down
1416
+ output_channel = BLOCK_OUT_CHANNELS[0]
1417
+ for i, down_block_type in enumerate(DOWN_BLOCK_TYPES):
1418
+ input_channel = output_channel
1419
+ output_channel = BLOCK_OUT_CHANNELS[i]
1420
+ is_final_block = i == len(BLOCK_OUT_CHANNELS) - 1
1421
+
1422
+ down_block = get_down_block(
1423
+ down_block_type,
1424
+ in_channels=input_channel,
1425
+ out_channels=output_channel,
1426
+ add_downsample=not is_final_block,
1427
+ attn_num_head_channels=attention_head_dim[i],
1428
+ cross_attention_dim=cross_attention_dim,
1429
+ use_linear_projection=use_linear_projection,
1430
+ upcast_attention=upcast_attention,
1431
+ )
1432
+ self.down_blocks.append(down_block)
1433
+
1434
+ # mid
1435
+ self.mid_block = UNetMidBlock2DCrossAttn(
1436
+ in_channels=BLOCK_OUT_CHANNELS[-1],
1437
+ attn_num_head_channels=attention_head_dim[-1],
1438
+ cross_attention_dim=cross_attention_dim,
1439
+ use_linear_projection=use_linear_projection,
1440
+ )
1441
+
1442
+ # count how many layers upsample the images
1443
+ self.num_upsamplers = 0
1444
+
1445
+ # up
1446
+ reversed_block_out_channels = list(reversed(BLOCK_OUT_CHANNELS))
1447
+ reversed_attention_head_dim = list(reversed(attention_head_dim))
1448
+ output_channel = reversed_block_out_channels[0]
1449
+ for i, up_block_type in enumerate(UP_BLOCK_TYPES):
1450
+ is_final_block = i == len(BLOCK_OUT_CHANNELS) - 1
1451
+
1452
+ prev_output_channel = output_channel
1453
+ output_channel = reversed_block_out_channels[i]
1454
+ input_channel = reversed_block_out_channels[min(i + 1, len(BLOCK_OUT_CHANNELS) - 1)]
1455
+
1456
+ # add upsample block for all BUT final layer
1457
+ if not is_final_block:
1458
+ add_upsample = True
1459
+ self.num_upsamplers += 1
1460
+ else:
1461
+ add_upsample = False
1462
+
1463
+ up_block = get_up_block(
1464
+ up_block_type,
1465
+ in_channels=input_channel,
1466
+ out_channels=output_channel,
1467
+ prev_output_channel=prev_output_channel,
1468
+ add_upsample=add_upsample,
1469
+ attn_num_head_channels=reversed_attention_head_dim[i],
1470
+ cross_attention_dim=cross_attention_dim,
1471
+ use_linear_projection=use_linear_projection,
1472
+ upcast_attention=upcast_attention,
1473
+ )
1474
+ self.up_blocks.append(up_block)
1475
+ prev_output_channel = output_channel
1476
+
1477
+ # out
1478
+ self.conv_norm_out = nn.GroupNorm(num_channels=BLOCK_OUT_CHANNELS[0], num_groups=NORM_GROUPS, eps=NORM_EPS)
1479
+ self.conv_act = nn.SiLU()
1480
+ self.conv_out = nn.Conv2d(BLOCK_OUT_CHANNELS[0], OUT_CHANNELS, kernel_size=3, padding=1)
1481
+
1482
+ # region diffusers compatibility
1483
+ def prepare_config(self, *args, **kwargs):
1484
+ self.config = SimpleNamespace(**kwargs)
1485
+
1486
+ @property
1487
+ def dtype(self) -> torch.dtype:
1488
+ # `torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype).
1489
+ return get_parameter_dtype(self)
1490
+
1491
+ @property
1492
+ def device(self) -> torch.device:
1493
+ # `torch.device`: The device on which the module is (assuming that all the module parameters are on the same device).
1494
+ return get_parameter_device(self)
1495
+
1496
+ def set_attention_slice(self, slice_size):
1497
+ raise NotImplementedError("Attention slicing is not supported for this model.")
1498
+
1499
+ def is_gradient_checkpointing(self) -> bool:
1500
+ return any(hasattr(m, "gradient_checkpointing") and m.gradient_checkpointing for m in self.modules())
1501
+
1502
+ def enable_gradient_checkpointing(self):
1503
+ self.set_gradient_checkpointing(value=True)
1504
+
1505
+ def disable_gradient_checkpointing(self):
1506
+ self.set_gradient_checkpointing(value=False)
1507
+
1508
+ def set_use_memory_efficient_attention(self, xformers: bool, mem_eff: bool) -> None:
1509
+ modules = self.down_blocks + [self.mid_block] + self.up_blocks
1510
+ for module in modules:
1511
+ module.set_use_memory_efficient_attention(xformers, mem_eff)
1512
+
1513
+ def set_use_sdpa(self, sdpa: bool) -> None:
1514
+ modules = self.down_blocks + [self.mid_block] + self.up_blocks
1515
+ for module in modules:
1516
+ module.set_use_sdpa(sdpa)
1517
+
1518
+ def set_gradient_checkpointing(self, value=False):
1519
+ modules = self.down_blocks + [self.mid_block] + self.up_blocks
1520
+ for module in modules:
1521
+ logger.info(f"{module.__class__.__name__} {module.gradient_checkpointing} -> {value}")
1522
+ module.gradient_checkpointing = value
1523
+
1524
+ # endregion
1525
+
1526
+ def forward(
1527
+ self,
1528
+ sample: torch.FloatTensor,
1529
+ timestep: Union[torch.Tensor, float, int],
1530
+ encoder_hidden_states: torch.Tensor,
1531
+ class_labels: Optional[torch.Tensor] = None,
1532
+ return_dict: bool = True,
1533
+ down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
1534
+ mid_block_additional_residual: Optional[torch.Tensor] = None,
1535
+ ) -> Union[Dict, Tuple]:
1536
+ r"""
1537
+ Args:
1538
+ sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
1539
+ timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps
1540
+ encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states
1541
+ return_dict (`bool`, *optional*, defaults to `True`):
1542
+ Whether or not to return a dict instead of a plain tuple.
1543
+
1544
+ Returns:
1545
+ `SampleOutput` or `tuple`:
1546
+ `SampleOutput` if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor.
1547
+ """
1548
+ # By default, samples have to be at least a multiple of the overall upsampling factor,
+ # which is 2 ** (number of upsampling layers), i.e. 64. However, the upsampling
+ # interpolation output size can be forced to fit any size on the fly if necessary.
+ # Image quality will probably degrade for such sizes, so keeping them divisible by 64 is recommended.
1555
+ default_overall_up_factor = 2**self.num_upsamplers
1556
+
1557
+ # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
1559
+ forward_upsample_size = False
1560
+ upsample_size = None
1561
+
1562
+ if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
1563
+ # logger.info("Forward upsample size to force interpolation output size.")
1564
+ forward_upsample_size = True
1565
+
1566
+ # 1. time
1567
+ timesteps = timestep
1568
+ timesteps = self.handle_unusual_timesteps(sample, timesteps)  # only handles unusual inputs
1569
+
1570
+ t_emb = self.time_proj(timesteps)
1571
+
1572
+ # timesteps does not contain any weights and will always return float32 tensors,
+ # but time_embedding might actually be running in fp16, so we need to cast here.
+ # There might be better ways to encapsulate this (e.g. casting inside time_proj).
1578
+ t_emb = t_emb.to(dtype=self.dtype)
1579
+ emb = self.time_embedding(t_emb)
1580
+
1581
+ # 2. pre-process
1582
+ sample = self.conv_in(sample)
1583
+
1584
+ down_block_res_samples = (sample,)
1585
+ for downsample_block in self.down_blocks:
1586
+ # the down blocks could be made to always take encoder_hidden_states in forward,
+ # but branching here is arguably easier to follow
1588
+ if downsample_block.has_cross_attention:
1589
+ sample, res_samples = downsample_block(
1590
+ hidden_states=sample,
1591
+ temb=emb,
1592
+ encoder_hidden_states=encoder_hidden_states,
1593
+ )
1594
+ else:
1595
+ sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
1596
+
1597
+ down_block_res_samples += res_samples
1598
+
1599
+ # add the ControlNet outputs to the skip connections
1600
+ if down_block_additional_residuals is not None:
1601
+ down_block_res_samples = list(down_block_res_samples)
1602
+ for i in range(len(down_block_res_samples)):
1603
+ down_block_res_samples[i] += down_block_additional_residuals[i]
1604
+ down_block_res_samples = tuple(down_block_res_samples)
1605
+
1606
+ # 4. mid
1607
+ sample = self.mid_block(sample, emb, encoder_hidden_states=encoder_hidden_states)
1608
+
1609
+ # add the ControlNet output
1610
+ if mid_block_additional_residual is not None:
1611
+ sample += mid_block_additional_residual
1612
+
1613
+ # 5. up
1614
+ for i, upsample_block in enumerate(self.up_blocks):
1615
+ is_final_block = i == len(self.up_blocks) - 1
1616
+
1617
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
1618
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] # skip connection
1619
+
1620
+ # if we have not reached the final block and need to forward the upsample size, we do it here
1622
+ if not is_final_block and forward_upsample_size:
1623
+ upsample_size = down_block_res_samples[-1].shape[2:]
1624
+
1625
+ if upsample_block.has_cross_attention:
1626
+ sample = upsample_block(
1627
+ hidden_states=sample,
1628
+ temb=emb,
1629
+ res_hidden_states_tuple=res_samples,
1630
+ encoder_hidden_states=encoder_hidden_states,
1631
+ upsample_size=upsample_size,
1632
+ )
1633
+ else:
1634
+ sample = upsample_block(
1635
+ hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size
1636
+ )
1637
+
1638
+ # 6. post-process
1639
+ sample = self.conv_norm_out(sample)
1640
+ sample = self.conv_act(sample)
1641
+ sample = self.conv_out(sample)
1642
+
1643
+ if not return_dict:
1644
+ return (sample,)
1645
+
1646
+ return SampleOutput(sample=sample)
1647
+
1648
+ def handle_unusual_timesteps(self, sample, timesteps):
1649
+ r"""
1650
+ If timesteps is not a Tensor, convert it to one, and broadcast it to the batch size for ONNX/Core ML compatibility.
1651
+ """
1652
+ if not torch.is_tensor(timesteps):
1653
+ # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
1654
+ # This would be a good case for the `match` statement (Python 3.10+)
1655
+ is_mps = sample.device.type == "mps"
1656
+ if isinstance(timesteps, float):
1657
+ dtype = torch.float32 if is_mps else torch.float64
1658
+ else:
1659
+ dtype = torch.int32 if is_mps else torch.int64
1660
+ timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
1661
+ elif len(timesteps.shape) == 0:
1662
+ timesteps = timesteps[None].to(sample.device)
1663
+
1664
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
1665
+ timesteps = timesteps.expand(sample.shape[0])
1666
+
1667
+ return timesteps
1668
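A minimal end-to-end sketch of the UNet defined above. The constructor arguments and tensor shapes assume a Stable Diffusion 1.x style configuration; the actual channel counts come from module-level constants (IN_CHANNELS, BLOCK_OUT_CHANNELS, ...) defined earlier in this file.

import torch

unet = UNet2DConditionModel(sample_size=64, attention_head_dim=8, cross_attention_dim=768)

latents = torch.randn(1, unet.in_channels, 64, 64)  # noisy latents
timestep = torch.tensor([981])                      # one timestep per batch element
text_emb = torch.randn(1, 77, 768)                  # encoder hidden states (e.g. a CLIP text encoder output)

noise_pred = unet(latents, timestep, text_emb).sample  # same spatial size as the input latents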
+
1669
+
1670
+ class InferUNet2DConditionModel:
1671
+ def __init__(self, original_unet: UNet2DConditionModel):
1672
+ self.delegate = original_unet
1673
+
1674
+ # override original model's forward method: because forward is not called by `__call__`
1675
+ # overriding `__call__` alone is not enough, because nn.Module.forward has special handling
1676
+ self.delegate.forward = self.forward
1677
+
1678
+ # override original model's up blocks' forward method
1679
+ for up_block in self.delegate.up_blocks:
1680
+ if up_block.__class__.__name__ == "UpBlock2D":
1681
+
1682
+ def resnet_wrapper(func, block):
1683
+ def forward(*args, **kwargs):
1684
+ return func(block, *args, **kwargs)
1685
+
1686
+ return forward
1687
+
1688
+ up_block.forward = resnet_wrapper(self.up_block_forward, up_block)
1689
+
1690
+ elif up_block.__class__.__name__ == "CrossAttnUpBlock2D":
1691
+
1692
+ def cross_attn_up_wrapper(func, block):
1693
+ def forward(*args, **kwargs):
1694
+ return func(block, *args, **kwargs)
1695
+
1696
+ return forward
1697
+
1698
+ up_block.forward = cross_attn_up_wrapper(self.cross_attn_up_block_forward, up_block)
1699
+
1700
+ # Deep Shrink
1701
+ self.ds_depth_1 = None
1702
+ self.ds_depth_2 = None
1703
+ self.ds_timesteps_1 = None
1704
+ self.ds_timesteps_2 = None
1705
+ self.ds_ratio = None
1706
+
1707
+ # call original model's methods
1708
+ def __getattr__(self, name):
1709
+ return getattr(self.delegate, name)
1710
+
1711
+ def __call__(self, *args, **kwargs):
1712
+ return self.delegate(*args, **kwargs)
1713
+
1714
+ def set_deep_shrink(self, ds_depth_1, ds_timesteps_1=650, ds_depth_2=None, ds_timesteps_2=None, ds_ratio=0.5):
1715
+ if ds_depth_1 is None:
1716
+ logger.info("Deep Shrink is disabled.")
1717
+ self.ds_depth_1 = None
1718
+ self.ds_timesteps_1 = None
1719
+ self.ds_depth_2 = None
1720
+ self.ds_timesteps_2 = None
1721
+ self.ds_ratio = None
1722
+ else:
1723
+ logger.info(
1724
+ f"Deep Shrink is enabled: [depth={ds_depth_1}/{ds_depth_2}, timesteps={ds_timesteps_1}/{ds_timesteps_2}, ratio={ds_ratio}]"
1725
+ )
1726
+ self.ds_depth_1 = ds_depth_1
1727
+ self.ds_timesteps_1 = ds_timesteps_1
1728
+ self.ds_depth_2 = ds_depth_2 if ds_depth_2 is not None else -1
1729
+ self.ds_timesteps_2 = ds_timesteps_2 if ds_timesteps_2 is not None else 1000
1730
+ self.ds_ratio = ds_ratio
1731
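A hedged usage sketch: wrap an existing UNet2DConditionModel for inference and enable Deep Shrink. The depth, timestep threshold and ratio below are illustrative choices, and `unet`, `latents`, `timestep`, `text_emb` are assumed to be set up as in the earlier sketch.

infer_unet = InferUNet2DConditionModel(unet)
infer_unet.set_deep_shrink(ds_depth_1=3, ds_timesteps_1=650, ds_ratio=0.5)

# the wrapper replaces the delegate's forward, so existing sampling code can call it unchanged
noise_pred = infer_unet(latents, timestep, text_emb).sample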
+
1732
+ def up_block_forward(self, _self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None):
1733
+ for resnet in _self.resnets:
1734
+ # pop res hidden states
1735
+ res_hidden_states = res_hidden_states_tuple[-1]
1736
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
1737
+
1738
+ # Deep Shrink
1739
+ if res_hidden_states.shape[-2:] != hidden_states.shape[-2:]:
1740
+ hidden_states = resize_like(hidden_states, res_hidden_states)
1741
+
1742
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
1743
+ hidden_states = resnet(hidden_states, temb)
1744
+
1745
+ if _self.upsamplers is not None:
1746
+ for upsampler in _self.upsamplers:
1747
+ hidden_states = upsampler(hidden_states, upsample_size)
1748
+
1749
+ return hidden_states
1750
+
1751
+ def cross_attn_up_block_forward(
1752
+ self,
1753
+ _self,
1754
+ hidden_states,
1755
+ res_hidden_states_tuple,
1756
+ temb=None,
1757
+ encoder_hidden_states=None,
1758
+ upsample_size=None,
1759
+ ):
1760
+ for resnet, attn in zip(_self.resnets, _self.attentions):
1761
+ # pop res hidden states
1762
+ res_hidden_states = res_hidden_states_tuple[-1]
1763
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
1764
+
1765
+ # Deep Shrink
1766
+ if res_hidden_states.shape[-2:] != hidden_states.shape[-2:]:
1767
+ hidden_states = resize_like(hidden_states, res_hidden_states)
1768
+
1769
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
1770
+ hidden_states = resnet(hidden_states, temb)
1771
+ hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample
1772
+
1773
+ if _self.upsamplers is not None:
1774
+ for upsampler in _self.upsamplers:
1775
+ hidden_states = upsampler(hidden_states, upsample_size)
1776
+
1777
+ return hidden_states
1778
+
1779
+ def forward(
1780
+ self,
1781
+ sample: torch.FloatTensor,
1782
+ timestep: Union[torch.Tensor, float, int],
1783
+ encoder_hidden_states: torch.Tensor,
1784
+ class_labels: Optional[torch.Tensor] = None,
1785
+ return_dict: bool = True,
1786
+ down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
1787
+ mid_block_additional_residual: Optional[torch.Tensor] = None,
1788
+ ) -> Union[Dict, Tuple]:
1789
+ r"""
1790
+ The current implementation is a copy of `UNet2DConditionModel.forward()` with Deep Shrink added.
1791
+ """
1792
+
1793
+ r"""
1794
+ Args:
1795
+ sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
1796
+ timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps
1797
+ encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states
1798
+ return_dict (`bool`, *optional*, defaults to `True`):
1799
+ Whether or not to return a dict instead of a plain tuple.
1800
+
1801
+ Returns:
1802
+ `SampleOutput` or `tuple`:
1803
+ `SampleOutput` if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor.
1804
+ """
1805
+
1806
+ _self = self.delegate
1807
+
1808
+ # By default, samples have to be at least a multiple of the overall upsampling factor,
+ # which is 2 ** (number of upsampling layers), i.e. 64. However, the upsampling
+ # interpolation output size can be forced to fit any size on the fly if necessary.
+ # Image quality will probably degrade for such sizes, so keeping them divisible by 64 is recommended.
1815
+ default_overall_up_factor = 2**_self.num_upsamplers
1816
+
1817
+ # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
1819
+ forward_upsample_size = False
1820
+ upsample_size = None
1821
+
1822
+ if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
1823
+ # logger.info("Forward upsample size to force interpolation output size.")
1824
+ forward_upsample_size = True
1825
+
1826
+ # 1. time
1827
+ timesteps = timestep
1828
+ timesteps = _self.handle_unusual_timesteps(sample, timesteps)  # only handles unusual inputs
1829
+
1830
+ t_emb = _self.time_proj(timesteps)
1831
+
1832
+ # timesteps does not contain any weights and will always return float32 tensors,
+ # but time_embedding might actually be running in fp16, so we need to cast here.
+ # There might be better ways to encapsulate this (e.g. casting inside time_proj).
1838
+ t_emb = t_emb.to(dtype=_self.dtype)
1839
+ emb = _self.time_embedding(t_emb)
1840
+
1841
+ # 2. pre-process
1842
+ sample = _self.conv_in(sample)
1843
+
1844
+ down_block_res_samples = (sample,)
1845
+ for depth, downsample_block in enumerate(_self.down_blocks):
1846
+ # Deep Shrink
1847
+ if self.ds_depth_1 is not None:
1848
+ if (depth == self.ds_depth_1 and timesteps[0] >= self.ds_timesteps_1) or (
1849
+ self.ds_depth_2 is not None
1850
+ and depth == self.ds_depth_2
1851
+ and timesteps[0] < self.ds_timesteps_1
1852
+ and timesteps[0] >= self.ds_timesteps_2
1853
+ ):
1854
+ org_dtype = sample.dtype
1855
+ if org_dtype == torch.bfloat16:
1856
+ sample = sample.to(torch.float32)
1857
+ sample = F.interpolate(sample, scale_factor=self.ds_ratio, mode="bicubic", align_corners=False).to(org_dtype)
1858
+
1859
+ # the down blocks could be changed to always take encoder_hidden_states in forward,
1860
+ # but this branching is arguably easier to follow
1861
+ if downsample_block.has_cross_attention:
1862
+ sample, res_samples = downsample_block(
1863
+ hidden_states=sample,
1864
+ temb=emb,
1865
+ encoder_hidden_states=encoder_hidden_states,
1866
+ )
1867
+ else:
1868
+ sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
1869
+
1870
+ down_block_res_samples += res_samples
1871
+
1872
+ # add the ControlNet outputs to the skip connections
1873
+ if down_block_additional_residuals is not None:
1874
+ down_block_res_samples = list(down_block_res_samples)
1875
+ for i in range(len(down_block_res_samples)):
1876
+ down_block_res_samples[i] += down_block_additional_residuals[i]
1877
+ down_block_res_samples = tuple(down_block_res_samples)
1878
+
1879
+ # 4. mid
1880
+ sample = _self.mid_block(sample, emb, encoder_hidden_states=encoder_hidden_states)
1881
+
1882
+ # add the ControlNet output
1883
+ if mid_block_additional_residual is not None:
1884
+ sample += mid_block_additional_residual
1885
+
1886
+ # 5. up
1887
+ for i, upsample_block in enumerate(_self.up_blocks):
1888
+ is_final_block = i == len(_self.up_blocks) - 1
1889
+
1890
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
1891
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] # skip connection
1892
+
1893
+ # if we have not reached the final block and need to forward the upsample size, we do it here
1894
+ # as noted above, pass upsample_size to every block except the final one
1895
+ if not is_final_block and forward_upsample_size:
1896
+ upsample_size = down_block_res_samples[-1].shape[2:]
1897
+
1898
+ if upsample_block.has_cross_attention:
1899
+ sample = upsample_block(
1900
+ hidden_states=sample,
1901
+ temb=emb,
1902
+ res_hidden_states_tuple=res_samples,
1903
+ encoder_hidden_states=encoder_hidden_states,
1904
+ upsample_size=upsample_size,
1905
+ )
1906
+ else:
1907
+ sample = upsample_block(
1908
+ hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size
1909
+ )
1910
+
1911
+ # 6. post-process
1912
+ sample = _self.conv_norm_out(sample)
1913
+ sample = _self.conv_act(sample)
1914
+ sample = _self.conv_out(sample)
1915
+
1916
+ if not return_dict:
1917
+ return (sample,)
1918
+
1919
+ return SampleOutput(sample=sample)
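For reference, the Deep Shrink branch above only downsamples the latent with bicubic interpolation (routing through float32 for bfloat16 inputs) while the timestep is still above the configured threshold. A minimal standalone sketch of that single step, assuming a hypothetical ds_ratio of 0.5:

    import torch
    import torch.nn.functional as F

    def deep_shrink_downsample(sample: torch.Tensor, ds_ratio: float = 0.5) -> torch.Tensor:
        # round-trip through float32 for bfloat16 inputs, as the forward() above does
        org_dtype = sample.dtype
        if org_dtype == torch.bfloat16:
            sample = sample.to(torch.float32)
        sample = F.interpolate(sample, scale_factor=ds_ratio, mode="bicubic", align_corners=False)
        return sample.to(org_dtype)

    latent = torch.randn(1, 4, 64, 64, dtype=torch.bfloat16)
    print(deep_shrink_downsample(latent).shape)  # torch.Size([1, 4, 32, 32])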
library/sai_model_spec.py ADDED
@@ -0,0 +1,334 @@
1
+ # based on https://github.com/Stability-AI/ModelSpec
2
+ import datetime
3
+ import hashlib
4
+ from io import BytesIO
5
+ import os
6
+ from typing import List, Optional, Tuple, Union
7
+ import safetensors
8
+ from library.utils import setup_logging
9
+
10
+ setup_logging()
11
+ import logging
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ r"""
16
+ # Metadata Example
17
+ metadata = {
18
+ # === Must ===
19
+ "modelspec.sai_model_spec": "1.0.0", # Required version ID for the spec
20
+ "modelspec.architecture": "stable-diffusion-xl-v1-base", # Architecture, reference the ID of the original model of the arch to match the ID
21
+ "modelspec.implementation": "sgm",
22
+ "modelspec.title": "Example Model Version 1.0", # Clean, human-readable title. May use your own phrasing/language/etc
23
+ # === Should ===
24
+ "modelspec.author": "Example Corp", # Your name or company name
25
+ "modelspec.description": "This is my example model to show you how to do it!", # Describe the model in your own words/language/etc. Focus on what users need to know
26
+ "modelspec.date": "2023-07-20", # ISO-8601 compliant date of when the model was created
27
+ # === Can ===
28
+ "modelspec.license": "ExampleLicense-1.0", # eg CreativeML Open RAIL, etc.
29
+ "modelspec.usage_hint": "Use keyword 'example'" # In your own language, very short hints about how the user should use the model
30
+ }
31
+ """
32
+
33
+ BASE_METADATA = {
34
+ # === Must ===
35
+ "modelspec.sai_model_spec": "1.0.0", # Required version ID for the spec
36
+ "modelspec.architecture": None,
37
+ "modelspec.implementation": None,
38
+ "modelspec.title": None,
39
+ "modelspec.resolution": None,
40
+ # === Should ===
41
+ "modelspec.description": None,
42
+ "modelspec.author": None,
43
+ "modelspec.date": None,
44
+ # === Can ===
45
+ "modelspec.license": None,
46
+ "modelspec.tags": None,
47
+ "modelspec.merged_from": None,
48
+ "modelspec.prediction_type": None,
49
+ "modelspec.timestep_range": None,
50
+ "modelspec.encoder_layer": None,
51
+ }
52
+
53
+ # define names only for the keys that are referenced separately
54
+ MODELSPEC_TITLE = "modelspec.title"
55
+
56
+ ARCH_SD_V1 = "stable-diffusion-v1"
57
+ ARCH_SD_V2_512 = "stable-diffusion-v2-512"
58
+ ARCH_SD_V2_768_V = "stable-diffusion-v2-768-v"
59
+ ARCH_SD_XL_V1_BASE = "stable-diffusion-xl-v1-base"
60
+ ARCH_SD3_M = "stable-diffusion-3" # may be followed by "-m" or "-5-large" etc.
61
+ # ARCH_SD3_UNKNOWN = "stable-diffusion-3"
62
+ ARCH_FLUX_1_DEV = "flux-1-dev"
63
+ ARCH_FLUX_1_UNKNOWN = "flux-1"
64
+
65
+ ADAPTER_LORA = "lora"
66
+ ADAPTER_TEXTUAL_INVERSION = "textual-inversion"
67
+
68
+ IMPL_STABILITY_AI = "https://github.com/Stability-AI/generative-models"
69
+ IMPL_COMFY_UI = "https://github.com/comfyanonymous/ComfyUI"
70
+ IMPL_DIFFUSERS = "diffusers"
71
+ IMPL_FLUX = "https://github.com/black-forest-labs/flux"
72
+
73
+ PRED_TYPE_EPSILON = "epsilon"
74
+ PRED_TYPE_V = "v"
75
+
76
+
77
+ def load_bytes_in_safetensors(tensors):
78
+ bytes = safetensors.torch.save(tensors)
79
+ b = BytesIO(bytes)
80
+
81
+ b.seek(0)
82
+ header = b.read(8)
83
+ n = int.from_bytes(header, "little")
84
+
85
+ offset = n + 8
86
+ b.seek(offset)
87
+
88
+ return b.read()
89
+
90
+
91
+ def precalculate_safetensors_hashes(state_dict):
92
+ # calculate each tensor one by one to reduce memory usage
93
+ hash_sha256 = hashlib.sha256()
94
+ for tensor in state_dict.values():
95
+ single_tensor_sd = {"tensor": tensor}
96
+ bytes_for_tensor = load_bytes_in_safetensors(single_tensor_sd)
97
+ hash_sha256.update(bytes_for_tensor)
98
+
99
+ return f"0x{hash_sha256.hexdigest()}"
100
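precalculate_safetensors_hashes serializes each tensor to the safetensors format, strips the header, and feeds only the raw tensor bytes into a running sha256, keeping memory usage bounded. A hedged usage sketch with a toy state dict (the import path is an assumption):

    import torch
    import safetensors.torch  # the helpers above rely on safetensors.torch.save
    from library.sai_model_spec import precalculate_safetensors_hashes  # assumed import path

    state_dict = {"linear.weight": torch.ones(4, 4), "linear.bias": torch.zeros(4)}
    digest = precalculate_safetensors_hashes(state_dict)
    print(digest)  # a 0x-prefixed 64-character sha256 hex digest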
+
101
+
102
+ def update_hash_sha256(metadata: dict, state_dict: dict):
103
+ raise NotImplementedError
104
+
105
+
106
+ def build_metadata(
107
+ state_dict: Optional[dict],
108
+ v2: bool,
109
+ v_parameterization: bool,
110
+ sdxl: bool,
111
+ lora: bool,
112
+ textual_inversion: bool,
113
+ timestamp: float,
114
+ title: Optional[str] = None,
115
+ reso: Optional[Union[int, Tuple[int, int]]] = None,
116
+ is_stable_diffusion_ckpt: Optional[bool] = None,
117
+ author: Optional[str] = None,
118
+ description: Optional[str] = None,
119
+ license: Optional[str] = None,
120
+ tags: Optional[str] = None,
121
+ merged_from: Optional[str] = None,
122
+ timesteps: Optional[Tuple[int, int]] = None,
123
+ clip_skip: Optional[int] = None,
124
+ sd3: Optional[str] = None,
125
+ flux: Optional[str] = None,
126
+ ):
127
+ """
128
+ sd3: only supports "m", flux: only supports "dev"
129
+ """
130
+ # if state_dict is None, hash is not calculated
131
+
132
+ metadata = {}
133
+ metadata.update(BASE_METADATA)
134
+
135
+ # TODO implement this once a correct hash calculation method that does not consume extra memory is found
136
+ # if state_dict is not None:
137
+ # hash = precalculate_safetensors_hashes(state_dict)
138
+ # metadata["modelspec.hash_sha256"] = hash
139
+
140
+ if sdxl:
141
+ arch = ARCH_SD_XL_V1_BASE
142
+ elif sd3 is not None:
143
+ arch = ARCH_SD3_M + "-" + sd3
144
+ elif flux is not None:
145
+ if flux == "dev":
146
+ arch = ARCH_FLUX_1_DEV
147
+ else:
148
+ arch = ARCH_FLUX_1_UNKNOWN
149
+ elif v2:
150
+ if v_parameterization:
151
+ arch = ARCH_SD_V2_768_V
152
+ else:
153
+ arch = ARCH_SD_V2_512
154
+ else:
155
+ arch = ARCH_SD_V1
156
+
157
+ if lora:
158
+ arch += f"/{ADAPTER_LORA}"
159
+ elif textual_inversion:
160
+ arch += f"/{ADAPTER_TEXTUAL_INVERSION}"
161
+
162
+ metadata["modelspec.architecture"] = arch
163
+
164
+ if not lora and not textual_inversion and is_stable_diffusion_ckpt is None:
165
+ is_stable_diffusion_ckpt = True # default is stable diffusion ckpt if not lora and not textual_inversion
166
+
167
+ if flux is not None:
168
+ # Flux
169
+ impl = IMPL_FLUX
170
+ elif (lora and sdxl) or textual_inversion or is_stable_diffusion_ckpt:
171
+ # Stable Diffusion ckpt, TI, SDXL LoRA
172
+ impl = IMPL_STABILITY_AI
173
+ else:
174
+ # v1/v2 LoRA or Diffusers
175
+ impl = IMPL_DIFFUSERS
176
+ metadata["modelspec.implementation"] = impl
177
+
178
+ if title is None:
179
+ if lora:
180
+ title = "LoRA"
181
+ elif textual_inversion:
182
+ title = "TextualInversion"
183
+ else:
184
+ title = "Checkpoint"
185
+ title += f"@{timestamp}"
186
+ metadata[MODELSPEC_TITLE] = title
187
+
188
+ if author is not None:
189
+ metadata["modelspec.author"] = author
190
+ else:
191
+ del metadata["modelspec.author"]
192
+
193
+ if description is not None:
194
+ metadata["modelspec.description"] = description
195
+ else:
196
+ del metadata["modelspec.description"]
197
+
198
+ if merged_from is not None:
199
+ metadata["modelspec.merged_from"] = merged_from
200
+ else:
201
+ del metadata["modelspec.merged_from"]
202
+
203
+ if license is not None:
204
+ metadata["modelspec.license"] = license
205
+ else:
206
+ del metadata["modelspec.license"]
207
+
208
+ if tags is not None:
209
+ metadata["modelspec.tags"] = tags
210
+ else:
211
+ del metadata["modelspec.tags"]
212
+
213
+ # remove microsecond from time
214
+ int_ts = int(timestamp)
215
+
216
+ # time to iso-8601 compliant date
217
+ date = datetime.datetime.fromtimestamp(int_ts).isoformat()
218
+ metadata["modelspec.date"] = date
219
+
220
+ if reso is not None:
221
+ # comma separated to tuple
222
+ if isinstance(reso, str):
223
+ reso = tuple(map(int, reso.split(",")))
224
+ if len(reso) == 1:
225
+ reso = (reso[0], reso[0])
226
+ else:
227
+ # resolution is defined in dataset, so use default
228
+ if sdxl or sd3 is not None or flux is not None:
229
+ reso = 1024
230
+ elif v2 and v_parameterization:
231
+ reso = 768
232
+ else:
233
+ reso = 512
234
+ if isinstance(reso, int):
235
+ reso = (reso, reso)
236
+
237
+ metadata["modelspec.resolution"] = f"{reso[0]}x{reso[1]}"
238
+
239
+ if flux is not None:
240
+ del metadata["modelspec.prediction_type"]
241
+ elif v_parameterization:
242
+ metadata["modelspec.prediction_type"] = PRED_TYPE_V
243
+ else:
244
+ metadata["modelspec.prediction_type"] = PRED_TYPE_EPSILON
245
+
246
+ if timesteps is not None:
247
+ if isinstance(timesteps, str) or isinstance(timesteps, int):
248
+ timesteps = (timesteps, timesteps)
249
+ if len(timesteps) == 1:
250
+ timesteps = (timesteps[0], timesteps[0])
251
+ metadata["modelspec.timestep_range"] = f"{timesteps[0]},{timesteps[1]}"
252
+ else:
253
+ del metadata["modelspec.timestep_range"]
254
+
255
+ if clip_skip is not None:
256
+ metadata["modelspec.encoder_layer"] = f"{clip_skip}"
257
+ else:
258
+ del metadata["modelspec.encoder_layer"]
259
+
260
+ # # assert all values are filled
261
+ # assert all([v is not None for v in metadata.values()]), metadata
262
+ if not all([v is not None for v in metadata.values()]):
263
+ logger.error(f"Internal error: some metadata values are None: {metadata}")
264
+
265
+ return metadata
266
+
267
+
268
+ # region utils
269
+
270
+
271
+ def get_title(metadata: dict) -> Optional[str]:
272
+ return metadata.get(MODELSPEC_TITLE, None)
273
+
274
+
275
+ def load_metadata_from_safetensors(model: str) -> dict:
276
+ if not model.endswith(".safetensors"):
277
+ return {}
278
+
279
+ with safetensors.safe_open(model, framework="pt") as f:
280
+ metadata = f.metadata()
281
+ if metadata is None:
282
+ metadata = {}
283
+ return metadata
284
+
285
+
286
+ def build_merged_from(models: List[str]) -> str:
287
+ def get_title(model: str):
288
+ metadata = load_metadata_from_safetensors(model)
289
+ title = metadata.get(MODELSPEC_TITLE, None)
290
+ if title is None:
291
+ title = os.path.splitext(os.path.basename(model))[0] # use filename
292
+ return title
293
+
294
+ titles = [get_title(model) for model in models]
295
+ return ", ".join(titles)
296
+
297
+
298
+ # endregion
299
+
300
+
301
+ r"""
302
+ if __name__ == "__main__":
303
+ import argparse
304
+ import torch
305
+ from safetensors.torch import load_file
306
+ from library import train_util
307
+
308
+ parser = argparse.ArgumentParser()
309
+ parser.add_argument("--ckpt", type=str, required=True)
310
+ args = parser.parse_args()
311
+
312
+ print(f"Loading {args.ckpt}")
313
+ state_dict = load_file(args.ckpt)
314
+
315
+ print(f"Calculating metadata")
316
+ metadata = get(state_dict, False, False, False, False, "sgm", False, False, "title", "date", 256, 1000, 0)
317
+ print(metadata)
318
+ del state_dict
319
+
320
+ # by reference implementation
321
+ with open(args.ckpt, mode="rb") as file_data:
322
+ file_hash = hashlib.sha256()
323
+ head_len = struct.unpack("Q", file_data.read(8)) # int64 header length prefix
324
+ header = json.loads(file_data.read(head_len[0])) # header itself, json string
325
+ content = (
326
+ file_data.read()
327
+ ) # All other content is tightly packed tensors. Copy to RAM for simplicity, but you can avoid this read with a more careful FS-dependent impl.
328
+ file_hash.update(content)
329
+ # ===== Update the hash for modelspec =====
330
+ by_ref = f"0x{file_hash.hexdigest()}"
331
+ print(by_ref)
332
+ print("is same?", by_ref == metadata["modelspec.hash_sha256"])
333
+
334
+ """
library/sd3_models.py ADDED
@@ -0,0 +1,1413 @@
1
+ # some modules/classes are copied and modified from https://github.com/mcmonkey4eva/sd3-ref
2
+ # the original code is licensed under the MIT License
3
+
4
+ # and some module/classes are contributed from KohakuBlueleaf. Thanks for the contribution!
5
+
6
+ from typing import Tuple
7
+ from concurrent.futures import ThreadPoolExecutor
8
+ from dataclasses import dataclass
9
+ from functools import partial
10
+ import math
11
+ from types import SimpleNamespace
12
+ from typing import Dict, List, Optional, Union
13
+ import einops
14
+ import numpy as np
15
+ import torch
16
+ import torch.nn as nn
17
+ import torch.nn.functional as F
18
+ from torch.utils.checkpoint import checkpoint
19
+ from transformers import CLIPTokenizer, T5TokenizerFast
20
+
21
+ from library import custom_offloading_utils
22
+ from library.device_utils import clean_memory_on_device
23
+
24
+ from .utils import setup_logging
25
+
26
+ setup_logging()
27
+ import logging
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
+ memory_efficient_attention = None
33
+ try:
34
+ import xformers
35
+ except:
36
+ pass
37
+
38
+ try:
39
+ from xformers.ops import memory_efficient_attention
40
+ except:
41
+ memory_efficient_attention = None
42
+
43
+
44
+ # region mmdit
45
+
46
+
47
+ @dataclass
48
+ class SD3Params:
49
+ patch_size: int
50
+ depth: int
51
+ num_patches: int
52
+ pos_embed_max_size: int
53
+ adm_in_channels: int
54
+ qk_norm: Optional[str]
55
+ x_block_self_attn_layers: list[int]
56
+ context_embedder_in_features: int
57
+ context_embedder_out_features: int
58
+ model_type: str
59
+
60
+
61
+ def get_2d_sincos_pos_embed(
62
+ embed_dim,
63
+ grid_size,
64
+ scaling_factor=None,
65
+ offset=None,
66
+ ):
67
+ grid_h = np.arange(grid_size, dtype=np.float32)
68
+ grid_w = np.arange(grid_size, dtype=np.float32)
69
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
70
+ grid = np.stack(grid, axis=0)
71
+ if scaling_factor is not None:
72
+ grid = grid / scaling_factor
73
+ if offset is not None:
74
+ grid = grid - offset
75
+
76
+ grid = grid.reshape([2, 1, grid_size, grid_size])
77
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
78
+ return pos_embed
79
+
80
+
81
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
82
+ assert embed_dim % 2 == 0
83
+
84
+ # use half of dimensions to encode grid_h
85
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
86
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
87
+
88
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
89
+ return emb
90
+
91
+
92
+ def get_scaled_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0, sample_size=64, base_size=16):
93
+ """
94
+ This function is contributed by KohakuBlueleaf. Thanks for the contribution!
95
+
96
+ Creates scaled 2D sinusoidal positional embeddings that maintain consistent relative positions
97
+ when the resolution differs from the training resolution.
98
+
99
+ Args:
100
+ embed_dim (int): Dimension of the positional embedding.
101
+ grid_size (int or tuple): Size of the position grid (H, W). If int, assumes square grid.
102
+ cls_token (bool): Whether to include class token. Defaults to False.
103
+ extra_tokens (int): Number of extra tokens (e.g., cls_token). Defaults to 0.
104
+ sample_size (int): Reference resolution (typically training resolution). Defaults to 64.
105
+ base_size (int): Base grid size used during training. Defaults to 16.
106
+
107
+ Returns:
108
+ numpy.ndarray: Positional embeddings of shape (H*W, embed_dim) or
109
+ (H*W + extra_tokens, embed_dim) if cls_token is True.
110
+ """
111
+ # Convert grid_size to tuple if it's an integer
112
+ if isinstance(grid_size, int):
113
+ grid_size = (grid_size, grid_size)
114
+
115
+ # Create normalized grid coordinates (0 to 1)
116
+ grid_h = np.arange(grid_size[0], dtype=np.float32) / grid_size[0]
117
+ grid_w = np.arange(grid_size[1], dtype=np.float32) / grid_size[1]
118
+
119
+ # Calculate scaling factors for height and width
120
+ # This ensures that the central region matches the original resolution's embeddings
121
+ scale_h = base_size * grid_size[0] / (sample_size)
122
+ scale_w = base_size * grid_size[1] / (sample_size)
123
+
124
+ # Calculate shift values to center the original resolution's embedding region
125
+ # This ensures that the central sample_size x sample_size region has similar
126
+ # positional embeddings to the original resolution
127
+ shift_h = 1 * scale_h * (grid_size[0] - sample_size) / (2 * grid_size[0])
128
+ shift_w = 1 * scale_w * (grid_size[1] - sample_size) / (2 * grid_size[1])
129
+
130
+ # Apply scaling and shifting to create the final grid coordinates
131
+ grid_h = grid_h * scale_h - shift_h
132
+ grid_w = grid_w * scale_w - shift_w
133
+
134
+ # Create 2D grid using meshgrid (note: w goes first)
135
+ grid = np.meshgrid(grid_w, grid_h)
136
+ grid = np.stack(grid, axis=0)
137
+
138
+ # # Calculate the starting indices for the central region
139
+ # # This is used for debugging/visualization of the central region
140
+ # st_h = (grid_size[0] - sample_size) // 2
141
+ # st_w = (grid_size[1] - sample_size) // 2
142
+ # print(grid[:, st_h : st_h + sample_size, st_w : st_w + sample_size])
143
+
144
+ # Reshape grid for positional embedding calculation
145
+ grid = grid.reshape([2, 1, grid_size[1], grid_size[0]])
146
+
147
+ # Generate the sinusoidal positional embeddings
148
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
149
+
150
+ # Add zeros for extra tokens (e.g., [CLS] token) if required
151
+ if cls_token and extra_tokens > 0:
152
+ pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
153
+
154
+ return pos_embed
155
+
156
+
157
+ # if __name__ == "__main__":
158
+ # # This is what you get when you load SD3.5 state dict
159
+ # pos_emb = torch.from_numpy(get_scaled_2d_sincos_pos_embed(
160
+ # 1536, [384, 384], sample_size=64, base_size=16
161
+ # )).float().unsqueeze(0)
162
+
163
+
164
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
165
+ """
166
+ embed_dim: output dimension for each position
167
+ pos: a list of positions to be encoded: size (M,)
168
+ out: (M, D)
169
+ """
170
+ assert embed_dim % 2 == 0
171
+ omega = np.arange(embed_dim // 2, dtype=np.float64)
172
+ omega /= embed_dim / 2.0
173
+ omega = 1.0 / 10000**omega # (D/2,)
174
+
175
+ pos = pos.reshape(-1) # (M,)
176
+ out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product
177
+
178
+ emb_sin = np.sin(out) # (M, D/2)
179
+ emb_cos = np.cos(out) # (M, D/2)
180
+
181
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
182
+ return emb
183
+
184
+
185
+ def get_1d_sincos_pos_embed_from_grid_torch(
186
+ embed_dim,
187
+ pos,
188
+ device=None,
189
+ dtype=torch.float32,
190
+ ):
191
+ omega = torch.arange(embed_dim // 2, device=device, dtype=dtype)
192
+ omega *= 2.0 / embed_dim
193
+ omega = 1.0 / 10000**omega
194
+ out = torch.outer(pos.reshape(-1), omega)
195
+ emb = torch.cat([out.sin(), out.cos()], dim=1)
196
+ return emb
197
+
198
+
199
+ def get_2d_sincos_pos_embed_torch(
200
+ embed_dim,
201
+ w,
202
+ h,
203
+ val_center=7.5,
204
+ val_magnitude=7.5,
205
+ device=None,
206
+ dtype=torch.float32,
207
+ ):
208
+ small = min(h, w)
209
+ val_h = (h / small) * val_magnitude
210
+ val_w = (w / small) * val_magnitude
211
+ grid_h, grid_w = torch.meshgrid(
212
+ torch.linspace(-val_h + val_center, val_h + val_center, h, device=device, dtype=dtype),
213
+ torch.linspace(-val_w + val_center, val_w + val_center, w, device=device, dtype=dtype),
214
+ indexing="ij",
215
+ )
216
+ emb_h = get_1d_sincos_pos_embed_from_grid_torch(embed_dim // 2, grid_h, device=device, dtype=dtype)
217
+ emb_w = get_1d_sincos_pos_embed_from_grid_torch(embed_dim // 2, grid_w, device=device, dtype=dtype)
218
+ emb = torch.cat([emb_w, emb_h], dim=1) # (H*W, D)
219
+ return emb
220
+
221
+
222
+ def modulate(x, shift, scale):
223
+ if shift is None:
224
+ shift = torch.zeros_like(scale)
225
+ return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
226
+
227
+
228
+ def default(x, default_value):
229
+ if x is None:
230
+ return default_value
231
+ return x
232
+
233
+
234
+ def timestep_embedding(t, dim, max_period=10000):
235
+ half = dim // 2
236
+ # freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(
237
+ # device=t.device, dtype=t.dtype
238
+ # )
239
+ freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(device=t.device)
240
+ args = t[:, None].float() * freqs[None]
241
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
242
+ if dim % 2:
243
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
244
+ if torch.is_floating_point(t):
245
+ embedding = embedding.to(dtype=t.dtype)
246
+ return embedding
247
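timestep_embedding is the standard sinusoidal scheme: frequencies 10000 ** (-i / (dim // 2)) for i in [0, dim // 2), with cosines in the first half of the vector and sines in the second. A quick shape check (the import path is an assumption):

    import torch
    from library.sd3_models import timestep_embedding  # assumed import path

    t = torch.tensor([0, 500, 999])
    emb = timestep_embedding(t, dim=256)
    print(emb.shape)  # torch.Size([3, 256])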
+
248
+
249
+ class PatchEmbed(nn.Module):
250
+ def __init__(
251
+ self,
252
+ img_size=256,
253
+ patch_size=4,
254
+ in_channels=3,
255
+ embed_dim=512,
256
+ norm_layer=None,
257
+ flatten=True,
258
+ bias=True,
259
+ strict_img_size=True,
260
+ dynamic_img_pad=False,
261
+ ):
262
+ # dynamic_img_pad and norm are omitted in SD3.5
263
+ super().__init__()
264
+ self.patch_size = patch_size
265
+ self.flatten = flatten
266
+ self.strict_img_size = strict_img_size
267
+ self.dynamic_img_pad = dynamic_img_pad
268
+ if img_size is not None:
269
+ self.img_size = img_size
270
+ self.grid_size = img_size // patch_size
271
+ self.num_patches = self.grid_size**2
272
+ else:
273
+ self.img_size = None
274
+ self.grid_size = None
275
+ self.num_patches = None
276
+
277
+ self.proj = nn.Conv2d(in_channels, embed_dim, patch_size, patch_size, bias=bias)
278
+ self.norm = nn.Identity() if norm_layer is None else norm_layer(embed_dim)
279
+
280
+ def forward(self, x):
281
+ B, C, H, W = x.shape
282
+
283
+ if self.dynamic_img_pad:
284
+ # Pad input so we won't have partial patch
285
+ pad_h = (self.patch_size - H % self.patch_size) % self.patch_size
286
+ pad_w = (self.patch_size - W % self.patch_size) % self.patch_size
287
+ x = nn.functional.pad(x, (0, pad_w, 0, pad_h), mode="reflect")
288
+ x = self.proj(x)
289
+ if self.flatten:
290
+ x = x.flatten(2).transpose(1, 2)
291
+ x = self.norm(x)
292
+ return x
293
+
294
+
295
+ # FinalLayer in mmdit.py
296
+ class UnPatch(nn.Module):
297
+ def __init__(self, hidden_size=512, patch_size=4, out_channels=3):
298
+ super().__init__()
299
+ self.patch_size = patch_size
300
+ self.c = out_channels
301
+
302
+ # eps is default in mmdit.py
303
+ self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
304
+ self.linear = nn.Linear(hidden_size, patch_size**2 * out_channels)
305
+ self.adaLN_modulation = nn.Sequential(
306
+ nn.SiLU(),
307
+ nn.Linear(hidden_size, 2 * hidden_size),
308
+ )
309
+
310
+ def forward(self, x: torch.Tensor, cmod, H=None, W=None):
311
+ b, n, _ = x.shape
312
+ p = self.patch_size
313
+ c = self.c
314
+ if H is None and W is None:
315
+ w = h = int(n**0.5)
316
+ assert h * w == n
317
+ else:
318
+ h = H // p if H else n // (W // p)
319
+ w = W // p if W else n // h
320
+ assert h * w == n
321
+
322
+ shift, scale = self.adaLN_modulation(cmod).chunk(2, dim=-1)
323
+ x = modulate(self.norm_final(x), shift, scale)
324
+ x = self.linear(x)
325
+
326
+ x = x.view(b, h, w, p, p, c)
327
+ x = x.permute(0, 5, 1, 3, 2, 4).contiguous()
328
+ x = x.view(b, c, h * p, w * p)
329
+ return x
330
+
331
+
332
+ class MLP(nn.Module):
333
+ def __init__(
334
+ self,
335
+ in_features,
336
+ hidden_features=None,
337
+ out_features=None,
338
+ act_layer=lambda: nn.GELU(),
339
+ norm_layer=None,
340
+ bias=True,
341
+ use_conv=False,
342
+ ):
343
+ super().__init__()
344
+ out_features = out_features or in_features
345
+ hidden_features = hidden_features or in_features
346
+ self.use_conv = use_conv
347
+
348
+ layer = partial(nn.Conv1d, kernel_size=1) if use_conv else nn.Linear
349
+
350
+ self.fc1 = layer(in_features, hidden_features, bias=bias)
351
+ self.fc2 = layer(hidden_features, out_features, bias=bias)
352
+ self.act = act_layer()
353
+ self.norm = norm_layer(hidden_features) if norm_layer else nn.Identity()
354
+
355
+ def forward(self, x):
356
+ x = self.fc1(x)
357
+ x = self.act(x)
358
+ x = self.norm(x)
359
+ x = self.fc2(x)
360
+ return x
361
+
362
+
363
+ class TimestepEmbedding(nn.Module):
364
+ def __init__(self, hidden_size, freq_embed_size=256):
365
+ super().__init__()
366
+ self.mlp = nn.Sequential(
367
+ nn.Linear(freq_embed_size, hidden_size),
368
+ nn.SiLU(),
369
+ nn.Linear(hidden_size, hidden_size),
370
+ )
371
+ self.freq_embed_size = freq_embed_size
372
+
373
+ def forward(self, t, dtype=None, **kwargs):
374
+ t_freq = timestep_embedding(t, self.freq_embed_size).to(dtype)
375
+ t_emb = self.mlp(t_freq)
376
+ return t_emb
377
+
378
+
379
+ class Embedder(nn.Module):
380
+ def __init__(self, input_dim, hidden_size):
381
+ super().__init__()
382
+ self.mlp = nn.Sequential(
383
+ nn.Linear(input_dim, hidden_size),
384
+ nn.SiLU(),
385
+ nn.Linear(hidden_size, hidden_size),
386
+ )
387
+
388
+ def forward(self, x):
389
+ return self.mlp(x)
390
+
391
+
392
+ def rmsnorm(x, eps=1e-6):
393
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
394
+
395
+
396
+ class RMSNorm(torch.nn.Module):
397
+ def __init__(
398
+ self,
399
+ dim: int,
400
+ elementwise_affine: bool = False,
401
+ eps: float = 1e-6,
402
+ device=None,
403
+ dtype=None,
404
+ ):
405
+ """
406
+ Initialize the RMSNorm normalization layer.
407
+ Args:
408
+ dim (int): The dimension of the input tensor.
409
+ eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.
410
+ Attributes:
411
+ eps (float): A small value added to the denominator for numerical stability.
412
+ weight (nn.Parameter): Learnable scaling parameter.
413
+ """
414
+ super().__init__()
415
+ self.eps = eps
416
+ self.learnable_scale = elementwise_affine
417
+ if self.learnable_scale:
418
+ self.weight = nn.Parameter(torch.empty(dim, device=device, dtype=dtype))
419
+ else:
420
+ self.register_parameter("weight", None)
421
+
422
+ def forward(self, x):
423
+ """
424
+ Forward pass through the RMSNorm layer.
425
+ Args:
426
+ x (torch.Tensor): The input tensor.
427
+ Returns:
428
+ torch.Tensor: The output tensor after applying RMSNorm.
429
+ """
430
+ x = rmsnorm(x, eps=self.eps)
431
+ if self.learnable_scale:
432
+ return x * self.weight.to(device=x.device, dtype=x.dtype)
433
+ else:
434
+ return x
435
+
436
+
437
+ class SwiGLUFeedForward(nn.Module):
438
+ def __init__(
439
+ self,
440
+ dim: int,
441
+ hidden_dim: int,
442
+ multiple_of: int,
443
+ ffn_dim_multiplier: float = None,
444
+ ):
445
+ super().__init__()
446
+ hidden_dim = int(2 * hidden_dim / 3)
447
+ # custom dim factor multiplier
448
+ if ffn_dim_multiplier is not None:
449
+ hidden_dim = int(ffn_dim_multiplier * hidden_dim)
450
+ hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
451
+
452
+ self.w1 = nn.Linear(dim, hidden_dim, bias=False)
453
+ self.w2 = nn.Linear(hidden_dim, dim, bias=False)
454
+ self.w3 = nn.Linear(dim, hidden_dim, bias=False)
455
+
456
+ def forward(self, x):
457
+ return self.w2(nn.functional.silu(self.w1(x)) * self.w3(x))
458
+
459
+
460
+ # Linears for SelfAttention in mmdit.py
461
+ class AttentionLinears(nn.Module):
462
+ def __init__(
463
+ self,
464
+ dim: int,
465
+ num_heads: int = 8,
466
+ qkv_bias: bool = False,
467
+ pre_only: bool = False,
468
+ qk_norm: Optional[str] = None,
469
+ ):
470
+ super().__init__()
471
+ self.num_heads = num_heads
472
+ self.head_dim = dim // num_heads
473
+
474
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
475
+ if not pre_only:
476
+ self.proj = nn.Linear(dim, dim)
477
+ self.pre_only = pre_only
478
+
479
+ if qk_norm == "rms":
480
+ self.ln_q = RMSNorm(self.head_dim, elementwise_affine=True, eps=1.0e-6)
481
+ self.ln_k = RMSNorm(self.head_dim, elementwise_affine=True, eps=1.0e-6)
482
+ elif qk_norm == "ln":
483
+ self.ln_q = nn.LayerNorm(self.head_dim, elementwise_affine=True, eps=1.0e-6)
484
+ self.ln_k = nn.LayerNorm(self.head_dim, elementwise_affine=True, eps=1.0e-6)
485
+ elif qk_norm is None:
486
+ self.ln_q = nn.Identity()
487
+ self.ln_k = nn.Identity()
488
+ else:
489
+ raise ValueError(qk_norm)
490
+
491
+ def pre_attention(self, x: torch.Tensor) -> torch.Tensor:
492
+ """
493
+ output:
494
+ q, k, v: [B, L, D]
495
+ """
496
+ B, L, C = x.shape
497
+ qkv: torch.Tensor = self.qkv(x)
498
+ q, k, v = qkv.reshape(B, L, -1, self.head_dim).chunk(3, dim=2)
499
+ q = self.ln_q(q).reshape(q.shape[0], q.shape[1], -1)
500
+ k = self.ln_k(k).reshape(q.shape[0], q.shape[1], -1)
501
+ return (q, k, v)
502
+
503
+ def post_attention(self, x: torch.Tensor) -> torch.Tensor:
504
+ assert not self.pre_only
505
+ x = self.proj(x)
506
+ return x
507
+
508
+
509
+ MEMORY_LAYOUTS = {
510
+ "torch": (
511
+ lambda x, head_dim: x.reshape(x.shape[0], x.shape[1], -1, head_dim).transpose(1, 2),
512
+ lambda x: x.transpose(1, 2).reshape(x.shape[0], x.shape[2], -1),
513
+ lambda x: (1, x, 1, 1),
514
+ ),
515
+ "xformers": (
516
+ lambda x, head_dim: x.reshape(x.shape[0], x.shape[1], -1, head_dim),
517
+ lambda x: x.reshape(x.shape[0], x.shape[1], -1),
518
+ lambda x: (1, 1, x, 1),
519
+ ),
520
+ "math": (
521
+ lambda x, head_dim: x.reshape(x.shape[0], x.shape[1], -1, head_dim).transpose(1, 2),
522
+ lambda x: x.transpose(1, 2).reshape(x.shape[0], x.shape[2], -1),
523
+ lambda x: (1, x, 1, 1),
524
+ ),
525
+ }
526
+ # ATTN_FUNCTION = {
527
+ # "torch": F.scaled_dot_product_attention,
528
+ # "xformers": memory_efficient_attention,
529
+ # }
530
+
531
+
532
+ def vanilla_attention(q, k, v, mask, scale=None):
533
+ if scale is None:
534
+ scale = math.sqrt(q.size(-1))
535
+ scores = torch.bmm(q, k.transpose(-1, -2)) / scale
536
+ if mask is not None:
537
+ mask = einops.rearrange(mask, "b ... -> b (...)")
538
+ max_neg_value = -torch.finfo(scores.dtype).max
539
+ mask = einops.repeat(mask, "b j -> (b h) j", h=q.size(-3))
540
+ scores = scores.masked_fill(~mask, max_neg_value)
541
+ p_attn = F.softmax(scores, dim=-1)
542
+ return torch.bmm(p_attn, v)
543
+
544
+
545
+ def attention(q, k, v, head_dim, mask=None, scale=None, mode="xformers"):
546
+ """
547
+ q, k, v: [B, L, D]
548
+ """
549
+ pre_attn_layout = MEMORY_LAYOUTS[mode][0]
550
+ post_attn_layout = MEMORY_LAYOUTS[mode][1]
551
+ q = pre_attn_layout(q, head_dim)
552
+ k = pre_attn_layout(k, head_dim)
553
+ v = pre_attn_layout(v, head_dim)
554
+
555
+ # scores = ATTN_FUNCTION[mode](q, k.to(q), v.to(q), mask, scale=scale)
556
+ if mode == "torch":
557
+ assert scale is None
558
+ scores = F.scaled_dot_product_attention(q, k.to(q), v.to(q), mask) # , scale=scale)
559
+ elif mode == "xformers":
560
+ scores = memory_efficient_attention(q, k.to(q), v.to(q), mask, scale=scale)
561
+ else:
562
+ scores = vanilla_attention(q, k.to(q), v.to(q), mask, scale=scale)
563
+
564
+ scores = post_attn_layout(scores)
565
+ return scores
566
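attention() keeps q, k and v in a flat [B, L, D] layout; MEMORY_LAYOUTS reshapes them to the backend-specific head layout and back. A small sketch with the PyTorch SDPA backend (mode="torch"), assuming head_dim divides D and the module import path:

    import torch
    from library.sd3_models import attention  # assumed import path

    B, L, D, head_dim = 2, 16, 64, 16  # i.e. 4 heads of dimension 16
    q, k, v = (torch.randn(B, L, D) for _ in range(3))
    out = attention(q, k, v, head_dim=head_dim, mode="torch")
    print(out.shape)  # torch.Size([2, 16, 64])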
+
567
+
568
+ # DismantledBlock in mmdit.py
569
+ class SingleDiTBlock(nn.Module):
570
+ """
571
+ A DiT block with gated adaptive layer norm (adaLN) conditioning.
572
+ """
573
+
574
+ def __init__(
575
+ self,
576
+ hidden_size: int,
577
+ num_heads: int,
578
+ mlp_ratio: float = 4.0,
579
+ attn_mode: str = "xformers",
580
+ qkv_bias: bool = False,
581
+ pre_only: bool = False,
582
+ rmsnorm: bool = False,
583
+ scale_mod_only: bool = False,
584
+ swiglu: bool = False,
585
+ qk_norm: Optional[str] = None,
586
+ x_block_self_attn: bool = False,
587
+ **block_kwargs,
588
+ ):
589
+ super().__init__()
590
+ assert attn_mode in MEMORY_LAYOUTS
591
+ self.attn_mode = attn_mode
592
+ if not rmsnorm:
593
+ self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
594
+ else:
595
+ self.norm1 = RMSNorm(hidden_size, elementwise_affine=False, eps=1e-6)
596
+ self.attn = AttentionLinears(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, pre_only=pre_only, qk_norm=qk_norm)
597
+
598
+ self.x_block_self_attn = x_block_self_attn
599
+ if self.x_block_self_attn:
600
+ assert not pre_only
601
+ assert not scale_mod_only
602
+ self.attn2 = AttentionLinears(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, pre_only=False, qk_norm=qk_norm)
603
+
604
+ if not pre_only:
605
+ if not rmsnorm:
606
+ self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
607
+ else:
608
+ self.norm2 = RMSNorm(hidden_size, elementwise_affine=False, eps=1e-6)
609
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
610
+ if not pre_only:
611
+ if not swiglu:
612
+ self.mlp = MLP(
613
+ in_features=hidden_size,
614
+ hidden_features=mlp_hidden_dim,
615
+ act_layer=lambda: nn.GELU(approximate="tanh"),
616
+ )
617
+ else:
618
+ self.mlp = SwiGLUFeedForward(
619
+ dim=hidden_size,
620
+ hidden_dim=mlp_hidden_dim,
621
+ multiple_of=256,
622
+ )
623
+ self.scale_mod_only = scale_mod_only
624
+ if self.x_block_self_attn:
625
+ n_mods = 9
626
+ elif not scale_mod_only:
627
+ n_mods = 6 if not pre_only else 2
628
+ else:
629
+ n_mods = 4 if not pre_only else 1
630
+ self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, n_mods * hidden_size))
631
+ self.pre_only = pre_only
632
+
633
+ def pre_attention(self, x: torch.Tensor, c: torch.Tensor) -> torch.Tensor:
634
+ if not self.pre_only:
635
+ if not self.scale_mod_only:
636
+ (shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp) = self.adaLN_modulation(c).chunk(6, dim=-1)
637
+ else:
638
+ shift_msa = None
639
+ shift_mlp = None
640
+ (scale_msa, gate_msa, scale_mlp, gate_mlp) = self.adaLN_modulation(c).chunk(4, dim=-1)
641
+ qkv = self.attn.pre_attention(modulate(self.norm1(x), shift_msa, scale_msa))
642
+ return qkv, (x, gate_msa, shift_mlp, scale_mlp, gate_mlp)
643
+ else:
644
+ if not self.scale_mod_only:
645
+ (shift_msa, scale_msa) = self.adaLN_modulation(c).chunk(2, dim=-1)
646
+ else:
647
+ shift_msa = None
648
+ scale_msa = self.adaLN_modulation(c)
649
+ qkv = self.attn.pre_attention(modulate(self.norm1(x), shift_msa, scale_msa))
650
+ return qkv, None
651
+
652
+ def pre_attention_x(self, x: torch.Tensor, c: torch.Tensor) -> torch.Tensor:
653
+ assert self.x_block_self_attn
654
+ (shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp, shift_msa2, scale_msa2, gate_msa2) = self.adaLN_modulation(
655
+ c
656
+ ).chunk(9, dim=1)
657
+ x_norm = self.norm1(x)
658
+ qkv = self.attn.pre_attention(modulate(x_norm, shift_msa, scale_msa))
659
+ qkv2 = self.attn2.pre_attention(modulate(x_norm, shift_msa2, scale_msa2))
660
+ return qkv, qkv2, (x, gate_msa, shift_mlp, scale_mlp, gate_mlp, gate_msa2)
661
+
662
+ def post_attention(self, attn, x, gate_msa, shift_mlp, scale_mlp, gate_mlp):
663
+ assert not self.pre_only
664
+ x = x + gate_msa.unsqueeze(1) * self.attn.post_attention(attn)
665
+ x = x + gate_mlp.unsqueeze(1) * self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp))
666
+ return x
667
+
668
+ def post_attention_x(self, attn, attn2, x, gate_msa, shift_mlp, scale_mlp, gate_mlp, gate_msa2, attn1_dropout: float = 0.0):
669
+ assert not self.pre_only
670
+ if attn1_dropout > 0.0:
671
+ # Use torch.bernoulli to implement dropout, only dropout the batch dimension
672
+ attn1_dropout = torch.bernoulli(torch.full((attn.size(0), 1, 1), 1 - attn1_dropout, device=attn.device))
673
+ attn_ = gate_msa.unsqueeze(1) * self.attn.post_attention(attn) * attn1_dropout
674
+ else:
675
+ attn_ = gate_msa.unsqueeze(1) * self.attn.post_attention(attn)
676
+ x = x + attn_
677
+ attn2_ = gate_msa2.unsqueeze(1) * self.attn2.post_attention(attn2)
678
+ x = x + attn2_
679
+ mlp_ = gate_mlp.unsqueeze(1) * self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp))
680
+ x = x + mlp_
681
+ return x
682
+
683
+
684
+ # JointBlock + block_mixing in mmdit.py
685
+ class MMDiTBlock(nn.Module):
686
+ def __init__(self, *args, **kwargs):
687
+ super().__init__()
688
+ pre_only = kwargs.pop("pre_only")
689
+ x_block_self_attn = kwargs.pop("x_block_self_attn")
690
+
691
+ self.context_block = SingleDiTBlock(*args, pre_only=pre_only, **kwargs)
692
+ self.x_block = SingleDiTBlock(*args, pre_only=False, x_block_self_attn=x_block_self_attn, **kwargs)
693
+
694
+ self.head_dim = self.x_block.attn.head_dim
695
+ self.mode = self.x_block.attn_mode
696
+ self.gradient_checkpointing = False
697
+
698
+ def enable_gradient_checkpointing(self):
699
+ self.gradient_checkpointing = True
700
+
701
+ def _forward(self, context, x, c):
702
+ ctx_qkv, ctx_intermediate = self.context_block.pre_attention(context, c)
703
+
704
+ if self.x_block.x_block_self_attn:
705
+ x_qkv, x_qkv2, x_intermediates = self.x_block.pre_attention_x(x, c)
706
+ else:
707
+ x_qkv, x_intermediates = self.x_block.pre_attention(x, c)
708
+
709
+ ctx_len = ctx_qkv[0].size(1)
710
+
711
+ q = torch.concat((ctx_qkv[0], x_qkv[0]), dim=1)
712
+ k = torch.concat((ctx_qkv[1], x_qkv[1]), dim=1)
713
+ v = torch.concat((ctx_qkv[2], x_qkv[2]), dim=1)
714
+
715
+ attn = attention(q, k, v, head_dim=self.head_dim, mode=self.mode)
716
+ ctx_attn_out = attn[:, :ctx_len]
717
+ x_attn_out = attn[:, ctx_len:]
718
+
719
+ if self.x_block.x_block_self_attn:
720
+ x_q2, x_k2, x_v2 = x_qkv2
721
+ attn2 = attention(x_q2, x_k2, x_v2, self.x_block.attn2.num_heads, mode=self.mode)
722
+ x = self.x_block.post_attention_x(x_attn_out, attn2, *x_intermediates)
723
+ else:
724
+ x = self.x_block.post_attention(x_attn_out, *x_intermediates)
725
+
726
+ if not self.context_block.pre_only:
727
+ context = self.context_block.post_attention(ctx_attn_out, *ctx_intermediate)
728
+ else:
729
+ context = None
730
+
731
+ return context, x
732
+
733
+ def forward(self, *args, **kwargs):
734
+ if self.training and self.gradient_checkpointing:
735
+ return checkpoint(self._forward, *args, use_reentrant=False, **kwargs)
736
+ else:
737
+ return self._forward(*args, **kwargs)
738
+
739
+
740
+ class MMDiT(nn.Module):
741
+ """
742
+ Diffusion model with a Transformer backbone.
743
+ """
744
+
745
+ # prepare pos_embed for latent size * 2
746
+ POS_EMBED_MAX_RATIO = 1.5
747
+
748
+ def __init__(
749
+ self,
750
+ input_size: int = 32,
751
+ patch_size: int = 2,
752
+ in_channels: int = 4,
753
+ depth: int = 28,
754
+ # hidden_size: Optional[int] = None,
755
+ # num_heads: Optional[int] = None,
756
+ mlp_ratio: float = 4.0,
757
+ learn_sigma: bool = False,
758
+ adm_in_channels: Optional[int] = None,
759
+ context_embedder_in_features: Optional[int] = None,
760
+ context_embedder_out_features: Optional[int] = None,
761
+ use_checkpoint: bool = False,
762
+ register_length: int = 0,
763
+ attn_mode: str = "torch",
764
+ rmsnorm: bool = False,
765
+ scale_mod_only: bool = False,
766
+ swiglu: bool = False,
767
+ out_channels: Optional[int] = None,
768
+ pos_embed_scaling_factor: Optional[float] = None,
769
+ pos_embed_offset: Optional[float] = None,
770
+ pos_embed_max_size: Optional[int] = None,
771
+ num_patches=None,
772
+ qk_norm: Optional[str] = None,
773
+ x_block_self_attn_layers: Optional[list[int]] = [],
774
+ qkv_bias: bool = True,
775
+ pos_emb_random_crop_rate: float = 0.0,
776
+ use_scaled_pos_embed: bool = False,
777
+ pos_embed_latent_sizes: Optional[list[int]] = None,
778
+ model_type: str = "sd3m",
779
+ ):
780
+ super().__init__()
781
+ self._model_type = model_type
782
+ self.learn_sigma = learn_sigma
783
+ self.in_channels = in_channels
784
+ default_out_channels = in_channels * 2 if learn_sigma else in_channels
785
+ self.out_channels = default(out_channels, default_out_channels)
786
+ self.patch_size = patch_size
787
+ self.pos_embed_scaling_factor = pos_embed_scaling_factor
788
+ self.pos_embed_offset = pos_embed_offset
789
+ self.pos_embed_max_size = pos_embed_max_size
790
+ self.x_block_self_attn_layers = x_block_self_attn_layers
791
+ self.pos_emb_random_crop_rate = pos_emb_random_crop_rate
792
+ self.gradient_checkpointing = use_checkpoint
793
+
794
+ # hidden_size = default(hidden_size, 64 * depth)
795
+ # num_heads = default(num_heads, hidden_size // 64)
796
+
797
+ # apply magic --> this defines a head_size of 64
798
+ self.hidden_size = 64 * depth
799
+ num_heads = depth
800
+
801
+ self.num_heads = num_heads
802
+
803
+ self.enable_scaled_pos_embed(use_scaled_pos_embed, pos_embed_latent_sizes)
804
+
805
+ self.x_embedder = PatchEmbed(
806
+ input_size,
807
+ patch_size,
808
+ in_channels,
809
+ self.hidden_size,
810
+ bias=True,
811
+ strict_img_size=self.pos_embed_max_size is None,
812
+ )
813
+ self.t_embedder = TimestepEmbedding(self.hidden_size)
814
+
815
+ self.y_embedder = None
816
+ if adm_in_channels is not None:
817
+ assert isinstance(adm_in_channels, int)
818
+ self.y_embedder = Embedder(adm_in_channels, self.hidden_size)
819
+
820
+ if context_embedder_in_features is not None:
821
+ self.context_embedder = nn.Linear(context_embedder_in_features, context_embedder_out_features)
822
+ else:
823
+ self.context_embedder = nn.Identity()
824
+
825
+ self.register_length = register_length
826
+ if self.register_length > 0:
827
+ self.register = nn.Parameter(torch.randn(1, register_length, self.hidden_size))
828
+
829
+ # num_patches = self.x_embedder.num_patches
830
+ # Will use fixed sin-cos embedding:
831
+ # just use a buffer already
832
+ if num_patches is not None:
833
+ self.register_buffer(
834
+ "pos_embed",
835
+ torch.empty(1, num_patches, self.hidden_size),
836
+ )
837
+ else:
838
+ self.pos_embed = None
839
+
840
+ self.use_checkpoint = use_checkpoint
841
+ self.joint_blocks = nn.ModuleList(
842
+ [
843
+ MMDiTBlock(
844
+ self.hidden_size,
845
+ num_heads,
846
+ mlp_ratio=mlp_ratio,
847
+ attn_mode=attn_mode,
848
+ qkv_bias=qkv_bias,
849
+ pre_only=i == depth - 1,
850
+ rmsnorm=rmsnorm,
851
+ scale_mod_only=scale_mod_only,
852
+ swiglu=swiglu,
853
+ qk_norm=qk_norm,
854
+ x_block_self_attn=(i in self.x_block_self_attn_layers),
855
+ )
856
+ for i in range(depth)
857
+ ]
858
+ )
859
+ for block in self.joint_blocks:
860
+ block.gradient_checkpointing = use_checkpoint
861
+
862
+ self.final_layer = UnPatch(self.hidden_size, patch_size, self.out_channels)
863
+ # self.initialize_weights()
864
+
865
+ self.blocks_to_swap = None
866
+ self.offloader = None
867
+ self.num_blocks = len(self.joint_blocks)
868
+
869
+ def enable_scaled_pos_embed(self, use_scaled_pos_embed: bool, latent_sizes: Optional[list[int]]):
870
+ self.use_scaled_pos_embed = use_scaled_pos_embed
871
+
872
+ if self.use_scaled_pos_embed:
873
+ # remove pos_embed to free up memory up to 0.4 GB
874
+ self.pos_embed = None
875
+
876
+ # remove duplicates and sort latent sizes in ascending order
877
+ latent_sizes = list(set(latent_sizes))
878
+ latent_sizes = sorted(latent_sizes)
879
+
880
+ patched_sizes = [latent_size // self.patch_size for latent_size in latent_sizes]
881
+
882
+ # calculate value range for each latent area: this is used to determine the pos_emb size from the latent shape
883
+ max_areas = []
884
+ for i in range(1, len(patched_sizes)):
885
+ prev_area = patched_sizes[i - 1] ** 2
886
+ area = patched_sizes[i] ** 2
887
+ max_areas.append((prev_area + area) // 2)
888
+
889
+ # area of the last latent size, if the latent size exceeds this, error will be raised
890
+ max_areas.append(int((patched_sizes[-1] * MMDiT.POS_EMBED_MAX_RATIO) ** 2))
891
+ # print("max_areas", max_areas)
892
+
893
+ self.resolution_area_to_latent_size = [(area, latent_size) for area, latent_size in zip(max_areas, patched_sizes)]
894
+
895
+ self.resolution_pos_embeds = {}
896
+ for patched_size in patched_sizes:
897
+ grid_size = int(patched_size * MMDiT.POS_EMBED_MAX_RATIO)
898
+ pos_embed = get_scaled_2d_sincos_pos_embed(self.hidden_size, grid_size, sample_size=patched_size)
899
+ pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0)
900
+ self.resolution_pos_embeds[patched_size] = pos_embed
901
+ # print(f"pos_embed for {patched_size}x{patched_size} latent size: {pos_embed.shape}")
902
+
903
+ else:
904
+ self.resolution_area_to_latent_size = None
905
+ self.resolution_pos_embeds = None
906
+
907
+ @property
908
+ def model_type(self):
909
+ return self._model_type
910
+
911
+ @property
912
+ def device(self):
913
+ return next(self.parameters()).device
914
+
915
+ @property
916
+ def dtype(self):
917
+ return next(self.parameters()).dtype
918
+
919
+ def enable_gradient_checkpointing(self):
920
+ self.gradient_checkpointing = True
921
+ for block in self.joint_blocks:
922
+ block.enable_gradient_checkpointing()
923
+
924
+ def disable_gradient_checkpointing(self):
925
+ self.gradient_checkpointing = False
926
+ for block in self.joint_blocks:
927
+ block.disable_gradient_checkpointing()
928
+
929
+ def initialize_weights(self):
930
+ # TODO: Init context_embedder?
931
+ # Initialize transformer layers:
932
+ def _basic_init(module):
933
+ if isinstance(module, nn.Linear):
934
+ torch.nn.init.xavier_uniform_(module.weight)
935
+ if module.bias is not None:
936
+ nn.init.constant_(module.bias, 0)
937
+
938
+ self.apply(_basic_init)
939
+
940
+ # Initialize (and freeze) pos_embed by sin-cos embedding
941
+ if self.pos_embed is not None:
942
+ pos_embed = get_2d_sincos_pos_embed(
943
+ self.pos_embed.shape[-1],
944
+ int(self.pos_embed.shape[-2] ** 0.5),
945
+ scaling_factor=self.pos_embed_scaling_factor,
946
+ )
947
+ self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))
948
+
949
+ # Initialize patch_embed like nn.Linear (instead of nn.Conv2d)
950
+ w = self.x_embedder.proj.weight.data
951
+ nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
952
+ nn.init.constant_(self.x_embedder.proj.bias, 0)
953
+
954
+ if getattr(self, "y_embedder", None) is not None:
955
+ nn.init.normal_(self.y_embedder.mlp[0].weight, std=0.02)
956
+ nn.init.normal_(self.y_embedder.mlp[2].weight, std=0.02)
957
+
958
+ # Initialize timestep embedding MLP:
959
+ nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
960
+ nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
961
+
962
+ # Zero-out adaLN modulation layers in DiT blocks:
963
+ for block in self.joint_blocks:
964
+ nn.init.constant_(block.x_block.adaLN_modulation[-1].weight, 0)
965
+ nn.init.constant_(block.x_block.adaLN_modulation[-1].bias, 0)
966
+ nn.init.constant_(block.context_block.adaLN_modulation[-1].weight, 0)
967
+ nn.init.constant_(block.context_block.adaLN_modulation[-1].bias, 0)
968
+
969
+ # Zero-out output layers:
970
+ nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
971
+ nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
972
+ nn.init.constant_(self.final_layer.linear.weight, 0)
973
+ nn.init.constant_(self.final_layer.linear.bias, 0)
974
+
975
+ def set_pos_emb_random_crop_rate(self, rate: float):
976
+ self.pos_emb_random_crop_rate = rate
977
+
978
+ def cropped_pos_embed(self, h, w, device=None, random_crop: bool = False):
979
+ p = self.x_embedder.patch_size
980
+ # patched size
981
+ h = (h + 1) // p
982
+ w = (w + 1) // p
983
+ if self.pos_embed is None: # should not happen
984
+ return get_2d_sincos_pos_embed_torch(self.hidden_size, w, h, device=device)
985
+ assert self.pos_embed_max_size is not None
986
+ assert h <= self.pos_embed_max_size, (h, self.pos_embed_max_size)
987
+ assert w <= self.pos_embed_max_size, (w, self.pos_embed_max_size)
988
+
989
+ if not random_crop:
990
+ top = (self.pos_embed_max_size - h) // 2
991
+ left = (self.pos_embed_max_size - w) // 2
992
+ else:
993
+ top = torch.randint(0, self.pos_embed_max_size - h + 1, (1,)).item()
994
+ left = torch.randint(0, self.pos_embed_max_size - w + 1, (1,)).item()
995
+
996
+ spatial_pos_embed = self.pos_embed.reshape(
997
+ 1,
998
+ self.pos_embed_max_size,
999
+ self.pos_embed_max_size,
1000
+ self.pos_embed.shape[-1],
1001
+ )
1002
+ spatial_pos_embed = spatial_pos_embed[:, top : top + h, left : left + w, :]
1003
+ spatial_pos_embed = spatial_pos_embed.reshape(1, -1, spatial_pos_embed.shape[-1])
1004
+ return spatial_pos_embed
1005
+
1006
+ def cropped_scaled_pos_embed(self, h, w, device=None, dtype=None, random_crop: bool = False):
1007
+ p = self.x_embedder.patch_size
1008
+ # patched size
1009
+ h = (h + 1) // p
1010
+ w = (w + 1) // p
1011
+
1012
+ # select pos_embed size based on area
1013
+ area = h * w
1014
+ patched_size = None
1015
+ for area_, patched_size_ in self.resolution_area_to_latent_size:
1016
+ if area <= area_:
1017
+ patched_size = patched_size_
1018
+ break
1019
+ if patched_size is None:
1020
+ raise ValueError(f"Area {area} is too large for the given latent sizes {self.resolution_area_to_latent_size}.")
1021
+
1022
+ pos_embed = self.resolution_pos_embeds[patched_size]
1023
+ pos_embed_size = round(math.sqrt(pos_embed.shape[1]))
1024
+ if h > pos_embed_size or w > pos_embed_size:
1025
+ # # fallback to normal pos_embed
1026
+ # return self.cropped_pos_embed(h * p, w * p, device=device, random_crop=random_crop)
1027
+ # extend pos_embed size
1028
+ logger.warning(
1029
+ f"Using normal pos_embed for size {h}x{w} as it exceeds the scaled pos_embed size {pos_embed_size}. Image is too tall or wide."
1030
+ )
1031
+ pos_embed_size = max(h, w)
1032
+ pos_embed = get_scaled_2d_sincos_pos_embed(self.hidden_size, pos_embed_size, sample_size=patched_size)
1033
+ pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0)
1034
+ self.resolution_pos_embeds[patched_size] = pos_embed
1035
+ logger.info(f"Updated pos_embed for size {pos_embed_size}x{pos_embed_size}")
1036
+
1037
+ if not random_crop:
1038
+ top = (pos_embed_size - h) // 2
1039
+ left = (pos_embed_size - w) // 2
1040
+ else:
1041
+ top = torch.randint(0, pos_embed_size - h + 1, (1,)).item()
1042
+ left = torch.randint(0, pos_embed_size - w + 1, (1,)).item()
1043
+
1044
+ if pos_embed.device != device:
1045
+ pos_embed = pos_embed.to(device)
1046
+ # cache on the device or transfer on every call? -> a 64x64 latent's emb is 96*96*1536*4 bytes ≈ 56MB, so caching it on the device is fine
1047
+ self.resolution_pos_embeds[patched_size] = pos_embed # update device
1048
+ if pos_embed.dtype != dtype:
1049
+ pos_embed = pos_embed.to(dtype)
1050
+ self.resolution_pos_embeds[patched_size] = pos_embed # update dtype
1051
+
1052
+ spatial_pos_embed = pos_embed.reshape(1, pos_embed_size, pos_embed_size, pos_embed.shape[-1])
1053
+ spatial_pos_embed = spatial_pos_embed[:, top : top + h, left : left + w, :]
1054
+ spatial_pos_embed = spatial_pos_embed.reshape(1, -1, spatial_pos_embed.shape[-1])
1055
+ # print(
1056
+ # f"patched size: {h}x{w}, pos_embed size: {pos_embed_size}, pos_embed shape: {pos_embed.shape}, top: {top}, left: {left}"
1057
+ # )
1058
+ return spatial_pos_embed
1059
+
+    def enable_block_swap(self, num_blocks: int, device: torch.device):
+        self.blocks_to_swap = num_blocks
+
+        assert (
+            self.blocks_to_swap <= self.num_blocks - 2
+        ), f"Cannot swap more than {self.num_blocks - 2} blocks. Requested: {self.blocks_to_swap} blocks."
+
+        self.offloader = custom_offloading_utils.ModelOffloader(
+            self.joint_blocks, self.num_blocks, self.blocks_to_swap, device  # , debug=True
+        )
+        print(f"SD3: Block swap enabled. Swapping {num_blocks} blocks, total blocks: {self.num_blocks}, device: {device}.")
+
+    def move_to_device_except_swap_blocks(self, device: torch.device):
+        # assume model is on cpu. do not move blocks to device to reduce temporary memory usage
+        if self.blocks_to_swap:
+            save_blocks = self.joint_blocks
+            self.joint_blocks = None
+
+        self.to(device)
+
+        if self.blocks_to_swap:
+            self.joint_blocks = save_blocks
+
+    def prepare_block_swap_before_forward(self):
+        if self.blocks_to_swap is None or self.blocks_to_swap == 0:
+            return
+        self.offloader.prepare_block_devices_before_forward(self.joint_blocks)
+
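The three methods above form the block-swap API. A rough usage sketch follows; the training scripts' exact call sequence may differ, and `num_blocks=8` is an arbitrary example.

# Rough usage sketch (assumption: mmdit is an MMDiT instance, e.g. from create_sd3_mmdit below, still on CPU).
import torch

device = torch.device("cuda")
mmdit.enable_block_swap(num_blocks=8, device=device)  # keep 8 joint blocks eligible for CPU offload
mmdit.move_to_device_except_swap_blocks(device)       # move the rest of the model to the GPU
mmdit.prepare_block_swap_before_forward()             # stage block devices before the next forward pass
# ... run a forward pass, then call prepare_block_swap_before_forward() again before the next one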
+    def forward(
+        self,
+        x: torch.Tensor,
+        t: torch.Tensor,
+        y: Optional[torch.Tensor] = None,
+        context: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """
+        Forward pass of DiT.
+        x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
+        t: (N,) tensor of diffusion timesteps
+        y: (N, D) tensor of class labels
+        context: (N, L, D) tensor of text conditioning (optional)
+        """
+        pos_emb_random_crop = (
+            False if self.pos_emb_random_crop_rate == 0.0 else torch.rand(1).item() < self.pos_emb_random_crop_rate
+        )
+
+        B, C, H, W = x.shape
+
+        # x = self.x_embedder(x) + self.cropped_pos_embed(H, W, device=x.device, random_crop=pos_emb_random_crop).to(dtype=x.dtype)
+        if not self.use_scaled_pos_embed:
+            pos_embed = self.cropped_pos_embed(H, W, device=x.device, random_crop=pos_emb_random_crop).to(dtype=x.dtype)
+        else:
+            # print(f"Using scaled pos_embed for size {H}x{W}")
+            pos_embed = self.cropped_scaled_pos_embed(H, W, device=x.device, dtype=x.dtype, random_crop=pos_emb_random_crop)
+        x = self.x_embedder(x) + pos_embed
+        del pos_embed
+
+        c = self.t_embedder(t, dtype=x.dtype)  # (N, D)
+        if y is not None and self.y_embedder is not None:
+            y = self.y_embedder(y)  # (N, D)
+            c = c + y  # (N, D)
+
+        if context is not None:
+            context = self.context_embedder(context)
+
+        if self.register_length > 0:
+            context = torch.cat(
+                (einops.repeat(self.register, "1 ... -> b ...", b=x.shape[0]), default(context, torch.Tensor([]).type_as(x))), 1
+            )
+
+        if not self.blocks_to_swap:
+            for block in self.joint_blocks:
+                context, x = block(context, x, c)
+        else:
+            for block_idx, block in enumerate(self.joint_blocks):
+                self.offloader.wait_for_block(block_idx)
+
+                context, x = block(context, x, c)
+
+                self.offloader.submit_move_blocks(self.joint_blocks, block_idx)
+
+        x = self.final_layer(x, c, H, W)  # our final layer also performs the unpatchify step
+        return x[:, :, :H, :W]
+
+
+def create_sd3_mmdit(params: SD3Params, attn_mode: str = "torch") -> MMDiT:
+    mmdit = MMDiT(
+        input_size=None,
+        pos_embed_max_size=params.pos_embed_max_size,
+        patch_size=params.patch_size,
+        in_channels=16,
+        adm_in_channels=params.adm_in_channels,
+        context_embedder_in_features=params.context_embedder_in_features,
+        context_embedder_out_features=params.context_embedder_out_features,
+        depth=params.depth,
+        mlp_ratio=4,
+        qk_norm=params.qk_norm,
+        x_block_self_attn_layers=params.x_block_self_attn_layers,
+        num_patches=params.num_patches,
+        attn_mode=attn_mode,
+        model_type=params.model_type,
+    )
+    return mmdit
+
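A hedged end-to-end sketch of building and calling the model. It assumes `params` is a populated SD3Params instance (typically derived from the checkpoint elsewhere in this library), and every shape below is illustrative rather than canonical.

# Hedged sketch: all shapes are illustrative; `params` must be a populated SD3Params instance.
import torch

model = create_sd3_mmdit(params, attn_mode="torch")
model.eval()

latents = torch.randn(1, 16, 128, 128)                              # (N, 16, H, W) VAE latents
timesteps = torch.randint(0, 1000, (1,))                            # (N,) timesteps; the scaling convention depends on the sampler
pooled = torch.randn(1, params.adm_in_channels)                     # (N, D) pooled text embedding, passed as y
context = torch.randn(1, 154, params.context_embedder_in_features)  # (N, L, D_ctx) token-level text embedding; L is arbitrary here

with torch.no_grad():
    pred = model(latents, timesteps, y=pooled, context=context)     # -> (N, 16, H, W)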
+
+# endregion
+
+# region VAE
+
+VAE_SCALE_FACTOR = 1.5305
+VAE_SHIFT_FACTOR = 0.0609
+
+
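These two constants match SDVAE.process_in / process_out at the end of this region: after encoding, latents are mapped to the model's working range as (latent - VAE_SHIFT_FACTOR) * VAE_SCALE_FACTOR, and the inverse, latent / VAE_SCALE_FACTOR + VAE_SHIFT_FACTOR, is applied before decoding.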
+def Normalize(in_channels, num_groups=32, dtype=torch.float32, device=None):
+    return torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True, dtype=dtype, device=device)
+
+
+class ResnetBlock(torch.nn.Module):
+    def __init__(self, *, in_channels, out_channels=None, dtype=torch.float32, device=None):
+        super().__init__()
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+
+        self.norm1 = Normalize(in_channels, dtype=dtype, device=device)
+        self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1, dtype=dtype, device=device)
+        self.norm2 = Normalize(out_channels, dtype=dtype, device=device)
+        self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, dtype=dtype, device=device)
+        if self.in_channels != self.out_channels:
+            self.nin_shortcut = torch.nn.Conv2d(
+                in_channels, out_channels, kernel_size=1, stride=1, padding=0, dtype=dtype, device=device
+            )
+        else:
+            self.nin_shortcut = None
+        self.swish = torch.nn.SiLU(inplace=True)
+
+    def forward(self, x):
+        hidden = x
+        hidden = self.norm1(hidden)
+        hidden = self.swish(hidden)
+        hidden = self.conv1(hidden)
+        hidden = self.norm2(hidden)
+        hidden = self.swish(hidden)
+        hidden = self.conv2(hidden)
+        if self.in_channels != self.out_channels:
+            x = self.nin_shortcut(x)
+        return x + hidden
+
+
+class AttnBlock(torch.nn.Module):
+    def __init__(self, in_channels, dtype=torch.float32, device=None):
+        super().__init__()
+        self.norm = Normalize(in_channels, dtype=dtype, device=device)
+        self.q = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0, dtype=dtype, device=device)
+        self.k = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0, dtype=dtype, device=device)
+        self.v = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0, dtype=dtype, device=device)
+        self.proj_out = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0, dtype=dtype, device=device)
+
+    def forward(self, x):
+        hidden = self.norm(x)
+        q = self.q(hidden)
+        k = self.k(hidden)
+        v = self.v(hidden)
+        b, c, h, w = q.shape
+        q, k, v = map(lambda x: einops.rearrange(x, "b c h w -> b 1 (h w) c").contiguous(), (q, k, v))
+        hidden = torch.nn.functional.scaled_dot_product_attention(q, k, v)  # the default scale is dim ** -0.5
+        hidden = einops.rearrange(hidden, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b)
+        hidden = self.proj_out(hidden)
+        return x + hidden
+
+
+class Downsample(torch.nn.Module):
+    def __init__(self, in_channels, dtype=torch.float32, device=None):
+        super().__init__()
+        self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0, dtype=dtype, device=device)
+
+    def forward(self, x):
+        pad = (0, 1, 0, 1)
+        x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
+        x = self.conv(x)
+        return x
+
+
+class Upsample(torch.nn.Module):
+    def __init__(self, in_channels, dtype=torch.float32, device=None):
+        super().__init__()
+        self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1, dtype=dtype, device=device)
+
+    def forward(self, x):
+        org_dtype = x.dtype
+        if x.dtype == torch.bfloat16:
+            # nearest-neighbor interpolate does not support bfloat16 in some PyTorch builds, so upcast temporarily
+            x = x.to(torch.float32)
+        x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
+        if x.dtype != org_dtype:
+            x = x.to(org_dtype)
+        x = self.conv(x)
+        return x
+
+
+class VAEEncoder(torch.nn.Module):
+    def __init__(
+        self, ch=128, ch_mult=(1, 2, 4, 4), num_res_blocks=2, in_channels=3, z_channels=16, dtype=torch.float32, device=None
+    ):
+        super().__init__()
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        # downsampling
+        self.conv_in = torch.nn.Conv2d(in_channels, ch, kernel_size=3, stride=1, padding=1, dtype=dtype, device=device)
+        in_ch_mult = (1,) + tuple(ch_mult)
+        self.in_ch_mult = in_ch_mult
+        self.down = torch.nn.ModuleList()
+        for i_level in range(self.num_resolutions):
+            block = torch.nn.ModuleList()
+            attn = torch.nn.ModuleList()
+            block_in = ch * in_ch_mult[i_level]
+            block_out = ch * ch_mult[i_level]
+            for i_block in range(num_res_blocks):
+                block.append(ResnetBlock(in_channels=block_in, out_channels=block_out, dtype=dtype, device=device))
+                block_in = block_out
+            down = torch.nn.Module()
+            down.block = block
+            down.attn = attn
+            if i_level != self.num_resolutions - 1:
+                down.downsample = Downsample(block_in, dtype=dtype, device=device)
+            self.down.append(down)
+        # middle
+        self.mid = torch.nn.Module()
+        self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in, dtype=dtype, device=device)
+        self.mid.attn_1 = AttnBlock(block_in, dtype=dtype, device=device)
+        self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in, dtype=dtype, device=device)
+        # end
+        self.norm_out = Normalize(block_in, dtype=dtype, device=device)
+        self.conv_out = torch.nn.Conv2d(block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1, dtype=dtype, device=device)
+        self.swish = torch.nn.SiLU(inplace=True)
+
+    def forward(self, x):
+        # downsampling
+        hs = [self.conv_in(x)]
+        for i_level in range(self.num_resolutions):
+            for i_block in range(self.num_res_blocks):
+                h = self.down[i_level].block[i_block](hs[-1])
+                hs.append(h)
+            if i_level != self.num_resolutions - 1:
+                hs.append(self.down[i_level].downsample(hs[-1]))
+        # middle
+        h = hs[-1]
+        h = self.mid.block_1(h)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h)
+        # end
+        h = self.norm_out(h)
+        h = self.swish(h)
+        h = self.conv_out(h)
+        return h
+
+
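Under the default arguments, the encoder halves the spatial resolution three times (8x overall) and `conv_out` emits `2 * z_channels = 32` channels, which `SDVAE.encode` below splits into mean and logvar. A shape sketch, with an illustrative 256x256 input:

# Shape sketch (illustrative input size).
import torch

enc = VAEEncoder()
x = torch.randn(1, 3, 256, 256)
print(enc(x).shape)  # torch.Size([1, 32, 32, 32]) -> mean/logvar, each (1, 16, 32, 32)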
+class VAEDecoder(torch.nn.Module):
+    def __init__(
+        self,
+        ch=128,
+        out_ch=3,
+        ch_mult=(1, 2, 4, 4),
+        num_res_blocks=2,
+        resolution=256,
+        z_channels=16,
+        dtype=torch.float32,
+        device=None,
+    ):
+        super().__init__()
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        block_in = ch * ch_mult[self.num_resolutions - 1]
+        curr_res = resolution // 2 ** (self.num_resolutions - 1)
+        # z to block_in
+        self.conv_in = torch.nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1, dtype=dtype, device=device)
+        # middle
+        self.mid = torch.nn.Module()
+        self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in, dtype=dtype, device=device)
+        self.mid.attn_1 = AttnBlock(block_in, dtype=dtype, device=device)
+        self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in, dtype=dtype, device=device)
+        # upsampling
+        self.up = torch.nn.ModuleList()
+        for i_level in reversed(range(self.num_resolutions)):
+            block = torch.nn.ModuleList()
+            block_out = ch * ch_mult[i_level]
+            for i_block in range(self.num_res_blocks + 1):
+                block.append(ResnetBlock(in_channels=block_in, out_channels=block_out, dtype=dtype, device=device))
+                block_in = block_out
+            up = torch.nn.Module()
+            up.block = block
+            if i_level != 0:
+                up.upsample = Upsample(block_in, dtype=dtype, device=device)
+                curr_res = curr_res * 2
+            self.up.insert(0, up)  # prepend to get consistent order
+        # end
+        self.norm_out = Normalize(block_in, dtype=dtype, device=device)
+        self.conv_out = torch.nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1, dtype=dtype, device=device)
+        self.swish = torch.nn.SiLU(inplace=True)
+
+    def forward(self, z):
+        # z to block_in
+        hidden = self.conv_in(z)
+        # middle
+        hidden = self.mid.block_1(hidden)
+        hidden = self.mid.attn_1(hidden)
+        hidden = self.mid.block_2(hidden)
+        # upsampling
+        for i_level in reversed(range(self.num_resolutions)):
+            for i_block in range(self.num_res_blocks + 1):
+                hidden = self.up[i_level].block[i_block](hidden)
+            if i_level != 0:
+                hidden = self.up[i_level].upsample(hidden)
+        # end
+        hidden = self.norm_out(hidden)
+        hidden = self.swish(hidden)
+        hidden = self.conv_out(hidden)
+        return hidden
+
+
+class SDVAE(torch.nn.Module):
+    def __init__(self, dtype=torch.float32, device=None):
+        super().__init__()
+        self.encoder = VAEEncoder(dtype=dtype, device=device)
+        self.decoder = VAEDecoder(dtype=dtype, device=device)
+
+    @property
+    def device(self):
+        return next(self.parameters()).device
+
+    @property
+    def dtype(self):
+        return next(self.parameters()).dtype
+
+    # @torch.autocast("cuda", dtype=torch.float16)
+    def decode(self, latent):
+        return self.decoder(latent)
+
+    # @torch.autocast("cuda", dtype=torch.float16)
+    def encode(self, image):
+        hidden = self.encoder(image)
+        mean, logvar = torch.chunk(hidden, 2, dim=1)
+        logvar = torch.clamp(logvar, -30.0, 20.0)
+        std = torch.exp(0.5 * logvar)
+        return mean + std * torch.randn_like(mean)
+
+    @staticmethod
+    def process_in(latent):
+        return (latent - VAE_SHIFT_FACTOR) * VAE_SCALE_FACTOR
+
+    @staticmethod
+    def process_out(latent):
+        return (latent / VAE_SCALE_FACTOR) + VAE_SHIFT_FACTOR
+
+
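A hedged encode/decode round trip using the helpers above; the input range, size, and dtype handling are assumptions, not this library's canonical preprocessing path.

# Hedged round-trip sketch (input range and size are assumptions).
import torch

vae = SDVAE()
vae.eval()

image = torch.rand(1, 3, 256, 256) * 2.0 - 1.0    # assume images normalized to [-1, 1]

with torch.no_grad():
    latent = SDVAE.process_in(vae.encode(image))  # (1, 16, 32, 32), scaled for the diffusion model
    # ... the MMDiT above operates on `latent` ...
    recon = vae.decode(SDVAE.process_out(latent)) # back to (1, 3, 256, 256), roughly in [-1, 1]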
+ # endregion