Commit · d8e2f70
Parent(s): aadb993
Upload 5 files

- image_processor.py +196 -0
- prompt_parser.py +373 -0
- requirements.txt +12 -0
- stable_diffusion_custom_v4_1.py +795 -0
- tokenizer_util.py +354 -0
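
For orientation, here is a minimal sketch of how the uploaded pipeline might be instantiated and called. The checkpoint id, dtype, and generation settings below are illustrative assumptions, not taken from this commit:

import torch
from stable_diffusion_custom_v4_1 import CustomStableDiffusionPipeline4_1

# "runwayml/stable-diffusion-v1-5" is a placeholder model id for illustration only
pipe = CustomStableDiffusionPipeline4_1.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# inferV4 expects list-style prompts; the schedule/attention syntax from prompt_parser.py is allowed
out = pipe.inferV4(
    prompt=["a fantasy landscape with a [mountain:lake:0.25], (highly detailed)"],
    negative_prompt=["low quality"],
    num_inference_steps=30,
    seed=42,
)
out.images[0].save("sample.png")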
image_processor.py
ADDED
@@ -0,0 +1,196 @@
import warnings
from typing import List, Optional, Union

import numpy as np
import PIL
import torch
from PIL import Image

from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers.utils import CONFIG_NAME, PIL_INTERPOLATION, deprecate


class VaeImageProcessor(ConfigMixin):
    """
    Image Processor for VAE

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`.
        vae_scale_factor (`int`, *optional*, defaults to `8`):
            VAE scale factor. If `do_resize` is True, the image will be automatically resized to multiples of this
            factor.
        resample (`str`, *optional*, defaults to `lanczos`):
            Resampling filter to use when resizing the image.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image to [-1,1]
    """

    config_name = CONFIG_NAME

    @register_to_config
    def __init__(
        self,
        do_resize: bool = True,
        vae_scale_factor: int = 8,
        resample: str = "lanczos",
        do_normalize: bool = True,
    ):
        super().__init__()

    @staticmethod
    def numpy_to_pil(images):
        """
        Convert a numpy image or a batch of images to a PIL image.
        """
        if images.ndim == 3:
            images = images[None, ...]
        images = (images * 255).round().astype("uint8")
        if images.shape[-1] == 1:
            # special case for grayscale (single channel) images
            pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
        else:
            pil_images = [Image.fromarray(image) for image in images]

        return pil_images

    @staticmethod
    def numpy_to_pt(images):
        """
        Convert a numpy image to a pytorch tensor
        """
        if images.ndim == 3:
            images = images[..., None]

        images = torch.from_numpy(images.transpose(0, 3, 1, 2))
        return images

    @staticmethod
    def pt_to_numpy(images):
        """
        Convert a pytorch tensor to a numpy image
        """
        images = images.cpu().permute(0, 2, 3, 1).float().numpy()
        return images

    @staticmethod
    def normalize(images):
        """
        Normalize an image array to [-1,1]
        """
        return 2.0 * images - 1.0

    @staticmethod
    def denormalize(images):
        """
        Denormalize an image array to [0,1]
        """
        return (images / 2 + 0.5).clamp(0, 1)

    def resize(self, images: PIL.Image.Image) -> PIL.Image.Image:
        """
        Resize a PIL image. Both height and width will be downscaled to the next integer multiple of `vae_scale_factor`
        """
        w, h = images.size
        w, h = (x - x % self.config.vae_scale_factor for x in (w, h))  # resize to integer multiple of vae_scale_factor
        images = images.resize((w, h), resample=PIL_INTERPOLATION[self.config.resample])
        return images

    def preprocess(
        self,
        image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray],
    ) -> torch.Tensor:
        """
        Preprocess the image input, accepted formats are PIL images, numpy arrays or pytorch tensors
        """
        supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor)
        if isinstance(image, supported_formats):
            image = [image]
        elif not (isinstance(image, list) and all(isinstance(i, supported_formats) for i in image)):
            raise ValueError(
                f"Input is in incorrect format: {[type(i) for i in image]}. Currently, we only support {', '.join(supported_formats)}"
            )

        if isinstance(image[0], PIL.Image.Image):
            if self.config.do_resize:
                image = [self.resize(i) for i in image]
            image = [np.array(i).astype(np.float32) / 255.0 for i in image]
            image = np.stack(image, axis=0)  # to np
            image = self.numpy_to_pt(image)  # to pt

        elif isinstance(image[0], np.ndarray):
            image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0)
            image = self.numpy_to_pt(image)
            _, _, height, width = image.shape
            if self.config.do_resize and (
                height % self.config.vae_scale_factor != 0 or width % self.config.vae_scale_factor != 0
            ):
                raise ValueError(
                    f"Currently we only support resizing for PIL image - please resize your numpy array to be divisible by {self.config.vae_scale_factor}"
                    f"currently the sizes are {height} and {width}. You can also pass a PIL image instead to use resize option in VAEImageProcessor"
                )

        elif isinstance(image[0], torch.Tensor):
            image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0)
            _, _, height, width = image.shape
            if self.config.do_resize and (
                height % self.config.vae_scale_factor != 0 or width % self.config.vae_scale_factor != 0
            ):
                raise ValueError(
                    f"Currently we only support resizing for PIL image - please resize your pytorch tensor to be divisible by {self.config.vae_scale_factor}"
                    f"currently the sizes are {height} and {width}. You can also pass a PIL image instead to use resize option in VAEImageProcessor"
                )

        # expected range [0,1], normalize to [-1,1]
        do_normalize = self.config.do_normalize
        if image.min() < 0:
            warnings.warn(
                "Passing `image` as torch tensor with value range in [-1,1] is deprecated. The expected value range for image tensor is [0,1] "
                f"when passing as pytorch tensor or numpy Array. You passed `image` with value range [{image.min()},{image.max()}]",
                FutureWarning,
            )
            do_normalize = False

        if do_normalize:
            image = self.normalize(image)

        return image

    def postprocess(
        self,
        image: torch.FloatTensor,
        output_type: str = "pil",
        do_denormalize: Optional[List[bool]] = None,
    ):
        if not isinstance(image, torch.Tensor):
            raise ValueError(
                f"Input for postprocessing is in incorrect format: {type(image)}. We only support pytorch tensor"
            )
        if output_type not in ["latent", "pt", "np", "pil"]:
            deprecation_message = (
                f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: "
                "`pil`, `np`, `pt`, `latent`"
            )
            deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False)
            output_type = "np"

        if output_type == "latent":
            return image

        if do_denormalize is None:
            do_denormalize = [self.config.do_normalize] * image.shape[0]

        image = torch.stack(
            [self.denormalize(image[i]) if do_denormalize[i] else image[i] for i in range(image.shape[0])]
        )

        if output_type == "pt":
            return image

        image = self.pt_to_numpy(image)

        if output_type == "np":
            return image

        if output_type == "pil":
            return self.numpy_to_pil(image)
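
As a quick illustration of the round trip this processor performs, a hypothetical example (not part of the commit):

import torch
from PIL import Image
from image_processor import VaeImageProcessor

proc = VaeImageProcessor(vae_scale_factor=8)

# PIL in: 515x389 is snapped down to 512x384 (multiples of 8), scaled to [0,1],
# converted to an NCHW torch tensor, then normalized to [-1,1]
img = Image.new("RGB", (515, 389), color=(128, 64, 32))
tensor = proc.preprocess(img)            # shape (1, 3, 384, 512), values in [-1, 1]

# torch out: denormalized back to [0,1] and converted to a list of PIL images
pil_again = proc.postprocess(tensor, output_type="pil")[0]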
prompt_parser.py
ADDED
@@ -0,0 +1,373 @@
import re
from collections import namedtuple
from typing import List
import lark

# a prompt like this: "fantasy landscape with a [mountain:lake:0.25] and [an oak:a christmas tree:0.75][ in foreground::0.6][ in background:0.25] [shoddy:masterful:0.5]"
# will be represented with prompt_schedule like this (assuming steps=100):
# [25, 'fantasy landscape with a mountain and an oak in foreground shoddy']
# [50, 'fantasy landscape with a lake and an oak in foreground in background shoddy']
# [60, 'fantasy landscape with a lake and an oak in foreground in background masterful']
# [75, 'fantasy landscape with a lake and an oak in background masterful']
# [100, 'fantasy landscape with a lake and a christmas tree in background masterful']

schedule_parser = lark.Lark(r"""
!start: (prompt | /[][():]/+)*
prompt: (emphasized | scheduled | alternate | plain | WHITESPACE)*
!emphasized: "(" prompt ")"
        | "(" prompt ":" prompt ")"
        | "[" prompt "]"
scheduled: "[" [prompt ":"] prompt ":" [WHITESPACE] NUMBER "]"
alternate: "[" prompt ("|" prompt)+ "]"
WHITESPACE: /\s+/
plain: /([^\\\[\]():|]|\\.)+/
%import common.SIGNED_NUMBER -> NUMBER
""")

def get_learned_conditioning_prompt_schedules(prompts, steps):
    """
    >>> g = lambda p: get_learned_conditioning_prompt_schedules([p], 10)[0]
    >>> g("test")
    [[10, 'test']]
    >>> g("a [b:3]")
    [[3, 'a '], [10, 'a b']]
    >>> g("a [b: 3]")
    [[3, 'a '], [10, 'a b']]
    >>> g("a [[[b]]:2]")
    [[2, 'a '], [10, 'a [[b]]']]
    >>> g("[(a:2):3]")
    [[3, ''], [10, '(a:2)']]
    >>> g("a [b : c : 1] d")
    [[1, 'a b d'], [10, 'a c d']]
    >>> g("a[b:[c:d:2]:1]e")
    [[1, 'abe'], [2, 'ace'], [10, 'ade']]
    >>> g("a [unbalanced")
    [[10, 'a [unbalanced']]
    >>> g("a [b:.5] c")
    [[5, 'a c'], [10, 'a b c']]
    >>> g("a [{b|d{:.5] c")  # not handling this right now
    [[5, 'a c'], [10, 'a {b|d{ c']]
    >>> g("((a][:b:c [d:3]")
    [[3, '((a][:b:c '], [10, '((a][:b:c d']]
    >>> g("[a|(b:1.1)]")
    [[1, 'a'], [2, '(b:1.1)'], [3, 'a'], [4, '(b:1.1)'], [5, 'a'], [6, '(b:1.1)'], [7, 'a'], [8, '(b:1.1)'], [9, 'a'], [10, '(b:1.1)']]
    """

    def collect_steps(steps, tree):
        l = [steps]
        class CollectSteps(lark.Visitor):
            def scheduled(self, tree):
                tree.children[-1] = float(tree.children[-1])
                if tree.children[-1] < 1:
                    tree.children[-1] *= steps
                tree.children[-1] = min(steps, int(tree.children[-1]))
                l.append(tree.children[-1])
            def alternate(self, tree):
                l.extend(range(1, steps+1))
        CollectSteps().visit(tree)
        return sorted(set(l))

    def at_step(step, tree):
        class AtStep(lark.Transformer):
            def scheduled(self, args):
                before, after, _, when = args
                yield before or () if step <= when else after
            def alternate(self, args):
                yield next(args[(step - 1) % len(args)])
            def start(self, args):
                def flatten(x):
                    if type(x) == str:
                        yield x
                    else:
                        for gen in x:
                            yield from flatten(gen)
                return ''.join(flatten(args))
            def plain(self, args):
                yield args[0].value
            def __default__(self, data, children, meta):
                for child in children:
                    yield child
        return AtStep().transform(tree)

    def get_schedule(prompt):
        try:
            tree = schedule_parser.parse(prompt)
        except lark.exceptions.LarkError as e:
            if 0:
                import traceback
                traceback.print_exc()
            return [[steps, prompt]]
        return [[t, at_step(t, tree)] for t in collect_steps(steps, tree)]

    promptdict = {prompt: get_schedule(prompt) for prompt in set(prompts)}
    return [promptdict[prompt] for prompt in prompts]


ScheduledPromptConditioning = namedtuple("ScheduledPromptConditioning", ["end_at_step", "cond"])


def get_learned_conditioning(model, prompts, steps):
    """converts a list of prompts into a list of prompt schedules - each schedule is a list of ScheduledPromptConditioning, specifying the condition (cond),
    and the sampling step at which this condition is to be replaced by the next one.

    Input:
    (model, ['a red crown', 'a [blue:green:5] jeweled crown'], 20)

    Output:
    [
        [
            ScheduledPromptConditioning(end_at_step=20, cond=tensor([[-0.3886, 0.0229, -0.0523, ..., -0.4901, -0.3066, 0.0674], ..., [ 0.3317, -0.5102, -0.4066, ..., 0.4119, -0.7647, -1.0160]], device='cuda:0'))
        ],
        [
            ScheduledPromptConditioning(end_at_step=5, cond=tensor([[-0.3886, 0.0229, -0.0522, ..., -0.4901, -0.3067, 0.0673], ..., [-0.0192, 0.3867, -0.4644, ..., 0.1135, -0.3696, -0.4625]], device='cuda:0')),
            ScheduledPromptConditioning(end_at_step=20, cond=tensor([[-0.3886, 0.0229, -0.0522, ..., -0.4901, -0.3067, 0.0673], ..., [-0.7352, -0.4356, -0.7888, ..., 0.6994, -0.4312, -1.2593]], device='cuda:0'))
        ]
    ]
    """
    res = []

    prompt_schedules = get_learned_conditioning_prompt_schedules(prompts, steps)
    cache = {}

    for prompt, prompt_schedule in zip(prompts, prompt_schedules):

        cached = cache.get(prompt, None)
        if cached is not None:
            res.append(cached)
            continue

        texts = [x[1] for x in prompt_schedule]
        conds = [model.build_conditioning_tensor(text) for text in texts]

        cond_schedule = []
        for i, (end_at_step, text) in enumerate(prompt_schedule):
            cond_schedule.append(ScheduledPromptConditioning(end_at_step, conds[i]))

        cache[prompt] = cond_schedule
        res.append(cond_schedule)

    return res


re_AND = re.compile(r"\bAND\b")
re_weight = re.compile(r"^(.*?)(?:\s*:\s*([-+]?(?:\d+\.?|\d*\.\d+)))?\s*$")

def get_multicond_prompt_list(prompts):
    res_indexes = []

    prompt_flat_list = []
    prompt_indexes = {}

    for prompt in prompts:
        subprompts = re_AND.split(prompt)

        indexes = []
        for subprompt in subprompts:
            match = re_weight.search(subprompt)

            text, weight = match.groups() if match is not None else (subprompt, 1.0)

            weight = float(weight) if weight is not None else 1.0

            index = prompt_indexes.get(text, None)
            if index is None:
                index = len(prompt_flat_list)
                prompt_flat_list.append(text)
                prompt_indexes[text] = index

            indexes.append((index, weight))

        res_indexes.append(indexes)

    return res_indexes, prompt_flat_list, prompt_indexes


class ComposableScheduledPromptConditioning:
    def __init__(self, schedules, weight=1.0):
        self.schedules: List[ScheduledPromptConditioning] = schedules
        self.weight: float = weight


class MulticondLearnedConditioning:
    def __init__(self, shape, batch):
        self.shape: tuple = shape  # the shape field is needed to send this object to DDIM/PLMS
        self.batch: List[List[ComposableScheduledPromptConditioning]] = batch

def get_multicond_learned_conditioning(model, prompts, steps) -> MulticondLearnedConditioning:
    """same as get_learned_conditioning, but returns a list of ScheduledPromptConditioning along with the weight objects for each prompt.
    For each prompt, the list is obtained by splitting the prompt using the AND separator.

    https://energy-based-model.github.io/Compositional-Visual-Generation-with-Composable-Diffusion-Models/
    """

    res_indexes, prompt_flat_list, prompt_indexes = get_multicond_prompt_list(prompts)

    learned_conditioning = get_learned_conditioning(model, prompt_flat_list, steps)

    res = []
    for indexes in res_indexes:
        res.append([ComposableScheduledPromptConditioning(learned_conditioning[i], weight) for i, weight in indexes])

    return MulticondLearnedConditioning(shape=(len(prompts),), batch=res)


def reconstruct_cond_batch(c: List[List[ScheduledPromptConditioning]], current_step):
    param = c[0][0].cond
    res = torch.zeros((len(c),) + param.shape, device=param.device, dtype=param.dtype)
    for i, cond_schedule in enumerate(c):
        target_index = 0
        for current, (end_at, cond) in enumerate(cond_schedule):
            if current_step <= end_at:
                target_index = current
                break
        res[i] = cond_schedule[target_index].cond

    return res


def reconstruct_multicond_batch(c: MulticondLearnedConditioning, current_step):
    param = c.batch[0][0].schedules[0].cond

    tensors = []
    conds_list = []

    for batch_no, composable_prompts in enumerate(c.batch):
        conds_for_batch = []

        for cond_index, composable_prompt in enumerate(composable_prompts):
            target_index = 0
            for current, (end_at, cond) in enumerate(composable_prompt.schedules):
                if current_step <= end_at:
                    target_index = current
                    break

            conds_for_batch.append((len(tensors), composable_prompt.weight))
            tensors.append(composable_prompt.schedules[target_index].cond)

        conds_list.append(conds_for_batch)

    # if prompts have wildly different lengths above the limit we'll get tensors of different shapes
    # and won't be able to torch.stack them. So this fixes that.
    token_count = max([x.shape[0] for x in tensors])
    for i in range(len(tensors)):
        if tensors[i].shape[0] != token_count:
            last_vector = tensors[i][-1:]
            last_vector_repeated = last_vector.repeat([token_count - tensors[i].shape[0], 1])
            tensors[i] = torch.vstack([tensors[i], last_vector_repeated])

    return conds_list, torch.stack(tensors).to(device=param.device, dtype=param.dtype)


re_attention = re.compile(r"""
\\\(|
\\\)|
\\\[|
\\]|
\\\\|
\\|
\(|
\[|
:([+-]?[.\d]+)\)|
\)|
]|
[^\\()\[\]:]+|
:
""", re.X)

re_break = re.compile(r"\s*\bBREAK\b\s*", re.S)

def parse_prompt_attention(text):
    """
    Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
    Accepted tokens are:
      (abc) - increases attention to abc by a multiplier of 1.1
      (abc:3.12) - increases attention to abc by a multiplier of 3.12
      [abc] - decreases attention to abc by a multiplier of 1.1
      \( - literal character '('
      \[ - literal character '['
      \) - literal character ')'
      \] - literal character ']'
      \\ - literal character '\'
      anything else - just text

    >>> parse_prompt_attention('normal text')
    [['normal text', 1.0]]
    >>> parse_prompt_attention('an (important) word')
    [['an ', 1.0], ['important', 1.1], [' word', 1.0]]
    >>> parse_prompt_attention('(unbalanced')
    [['unbalanced', 1.1]]
    >>> parse_prompt_attention('\(literal\]')
    [['(literal]', 1.0]]
    >>> parse_prompt_attention('(unnecessary)(parens)')
    [['unnecessaryparens', 1.1]]
    >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
    [['a ', 1.0],
     ['house', 1.5730000000000004],
     [' ', 1.1],
     ['on', 1.0],
     [' a ', 1.1],
     ['hill', 0.55],
     [', sun, ', 1.1],
     ['sky', 1.4641000000000006],
     ['.', 1.1]]
    """

    res = []
    round_brackets = []
    square_brackets = []

    round_bracket_multiplier = 1.1
    square_bracket_multiplier = 1 / 1.1

    def multiply_range(start_position, multiplier):
        for p in range(start_position, len(res)):
            res[p][1] *= multiplier

    for m in re_attention.finditer(text):
        text = m.group(0)
        weight = m.group(1)

        if text.startswith('\\'):
            res.append([text[1:], 1.0])
        elif text == '(':
            round_brackets.append(len(res))
        elif text == '[':
            square_brackets.append(len(res))
        elif weight is not None and len(round_brackets) > 0:
            multiply_range(round_brackets.pop(), float(weight))
        elif text == ')' and len(round_brackets) > 0:
            multiply_range(round_brackets.pop(), round_bracket_multiplier)
        elif text == ']' and len(square_brackets) > 0:
            multiply_range(square_brackets.pop(), square_bracket_multiplier)
        else:
            parts = re.split(re_break, text)
            for i, part in enumerate(parts):
                if i > 0:
                    res.append(["BREAK", -1])
                res.append([part, 1.0])

    for pos in round_brackets:
        multiply_range(pos, round_bracket_multiplier)

    for pos in square_brackets:
        multiply_range(pos, square_bracket_multiplier)

    if len(res) == 0:
        res = [["", 1.0]]

    # merge runs of identical weights
    i = 0
    while i + 1 < len(res):
        if res[i][1] == res[i + 1][1]:
            res[i][0] += res[i + 1][0]
            res.pop(i + 1)
        else:
            i += 1

    return res

if __name__ == "__main__":
    import doctest
    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
else:
    import torch  # doctest faster
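
A small usage sketch tying the two parsers together; the prompt strings are made up for illustration:

from prompt_parser import get_learned_conditioning_prompt_schedules, parse_prompt_attention

# prompt editing: switch from "mountain" to "lake" a quarter of the way through 20 steps
schedule = get_learned_conditioning_prompt_schedules(
    ["fantasy landscape with a [mountain:lake:0.25]"], 20)[0]
# -> [[5, 'fantasy landscape with a mountain'], [20, 'fantasy landscape with a lake']]

# attention weighting: (word) multiplies its weight by 1.1, [word] divides it by 1.1
print(parse_prompt_attention("an (important) word"))
# -> [['an ', 1.0], ['important', 1.1], [' word', 1.0]]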
requirements.txt
ADDED
@@ -0,0 +1,12 @@
compel==1.2.1
tokenizers==0.13.3
typing_extensions<4.6.0,>=3.6.6
diffusers==0.20.2
torch
tqdm==4.65.0
transformers==4.27.1
cachetools==5.3.1
dynamicprompts==0.27.0
numpy==1.24
lark==1.1.5
accelerate==0.21.0
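
These pins target the diffusers 0.20.x API used by the pipeline and can be installed with `pip install -r requirements.txt`; torch is left unpinned, presumably so the appropriate CUDA build can be chosen per machine.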
stable_diffusion_custom_v4_1.py
ADDED
@@ -0,0 +1,795 @@
import random
from diffusers import StableDiffusionPipeline
# from diffusers.schedulers.scheduling_euler_ancestral_discrete import EulerAncestralDiscreteScheduler
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput, AutoencoderKL, CLIPTextModel, CLIPTokenizer, UNet2DConditionModel, KarrasDiffusionSchedulers, StableDiffusionSafetyChecker, CLIPImageProcessor
from compel import Compel
from tokenizer_util import TextualInversionLoaderMixin, MultiTokenCLIPTokenizer
import torch
from typing import Any, Callable, Dict, List, Optional, Union
from dynamicprompts.generators import RandomPromptGenerator
import time
from prompt_parser import ScheduledPromptConditioning
from prompt_parser import get_learned_conditioning_prompt_schedules
import tqdm
from cachetools import LRUCache
from image_processor import VaeImageProcessor
from diffusers.utils import logging

logger = logging.get_logger(__name__)  # used by the truncation warning in _encode_prompt


class CustomStableDiffusionPipeline4_1(TextualInversionLoaderMixin, StableDiffusionPipeline):
    def __init__(
        self,
        vae: AutoencoderKL,
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        unet: UNet2DConditionModel,
        scheduler: KarrasDiffusionSchedulers,
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPImageProcessor,
        requires_safety_checker: bool = True,
        prompt_cache_size: int = 1024,
        prompt_cache_ttl: int = 60 * 2,
    ) -> None:
        super().__init__(vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, unet=unet, scheduler=scheduler,
                         safety_checker=safety_checker, feature_extractor=feature_extractor, requires_safety_checker=requires_safety_checker)

        self.vae_scale_factor = 2 ** (
            len(self.vae.config.block_out_channels) - 1)
        self.image_processor = VaeImageProcessor(
            vae_scale_factor=self.vae_scale_factor)
        self.register_to_config(
            requires_safety_checker=requires_safety_checker)

        self.compel = Compel(tokenizer=self.tokenizer,
                             text_encoder=self.text_encoder, truncate_long_prompts=False)
        self.cache = LRUCache(maxsize=prompt_cache_size)

        self.cached_uc = [None, None]
        self.cached_c = [None, None]

        self.prompt_handler = None

    def build_scheduled_cond(self, prompt, steps, key):
        prompt_schedule = get_learned_conditioning_prompt_schedules([prompt], steps)[
            0]

        cached = self.cache.get(key, None)
        if cached is not None:
            return cached

        texts = [x[1] for x in prompt_schedule]
        conds = [self.compel.build_conditioning_tensor(
            text).to('cpu') for text in texts]

        cond_schedule = []
        for i, s in enumerate(prompt_schedule):
            cond_schedule.append(ScheduledPromptConditioning(s[0], conds[i]))

        self.cache[key] = cond_schedule
        return cond_schedule

    def initialize_magic_prompt_cache(self, pos_prompt_template: str, plain_prompt_template: str, neg_prompt_template: str, num_to_generate: int, steps: int):
        r"""
        Initializes the magic prompt cache for the forward pass.
        Must be called immediately after Compel is loaded and embeds are initialized.
        """
        rpg = RandomPromptGenerator(ignore_whitespace=True, seed=555)
        positive_prompts = rpg.generate(
            template=pos_prompt_template, num_images=num_to_generate)
        scheduled_conds = []
        with torch.no_grad():
            cache = {}
            for i in tqdm.tqdm(range(len(positive_prompts))):
                scheduled_conds.append(self.build_scheduled_cond(
                    positive_prompts[i], steps, cache))

            plain_scheduled_cond = self.build_scheduled_cond(
                plain_prompt_template, steps, cache)

            scheduled_uncond = self.build_scheduled_cond(
                neg_prompt_template, steps, cache)

        self.scheduled_conds = scheduled_conds
        self.plain_scheduled_cond = plain_scheduled_cond
        self.scheduled_uncond = scheduled_uncond

    def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `list(int)`):
                prompt to be encoded
            device: (`torch.device`):
                torch device
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `List[str]`):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
        """
        batch_size = len(prompt) if isinstance(prompt, list) else 1

        text_inputs = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="np",
        )
        text_input_ids = text_inputs.input_ids
        text_input_ids = torch.from_numpy(text_input_ids)
        untruncated_ids = self.tokenizer(
            prompt, padding="max_length", return_tensors="np").input_ids
        untruncated_ids = torch.from_numpy(untruncated_ids)

        if (
            text_input_ids.shape == untruncated_ids.shape
            and text_input_ids.numel() == untruncated_ids.numel()
            and not torch.equal(text_input_ids, untruncated_ids)
        ):
            removed_text = self.tokenizer.batch_decode(
                untruncated_ids[:, self.tokenizer.model_max_length - 1: -1])
            logger.warning(
                "The following part of your input was truncated because CLIP can only handle sequences up to"
                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
            )

        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
            attention_mask = text_inputs.attention_mask.to(device)
        else:
            attention_mask = None

        text_embeddings = self.text_encoder(
            text_input_ids.to(device), attention_mask=attention_mask)
        text_embeddings = text_embeddings[0]

        # duplicate text embeddings for each generation per prompt, using mps friendly method
        bs_embed, seq_len, _ = text_embeddings.shape
        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
        text_embeddings = text_embeddings.view(
            bs_embed * num_images_per_prompt, seq_len, -1)

        # get unconditional embeddings for classifier free guidance
        if do_classifier_free_guidance:
            uncond_tokens: List[str]
            if negative_prompt is None:
                uncond_tokens = [""] * batch_size
            elif type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
                )
            elif isinstance(negative_prompt, str):
                uncond_tokens = [negative_prompt]
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`."
                )
            else:
                uncond_tokens = negative_prompt

            max_length = text_input_ids.shape[-1]
            uncond_input = self.tokenizer(
                uncond_tokens, padding="max_length", max_length=max_length, truncation=True, return_tensors="np",
            )

            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
                attention_mask = torch.from_numpy(
                    uncond_input.attention_mask).to(device)
            else:
                attention_mask = None

            uncond_embeddings = self.text_encoder(
                torch.from_numpy(uncond_input.input_ids).to(device), attention_mask=attention_mask,
            )
            uncond_embeddings = uncond_embeddings[0]

            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
            seq_len = uncond_embeddings.shape[1]
            uncond_embeddings = uncond_embeddings.repeat(
                1, num_images_per_prompt, 1)
            uncond_embeddings = uncond_embeddings.view(
                batch_size * num_images_per_prompt, seq_len, -1)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

        return text_embeddings

    def _encode_promptv2(
        self,
        prompt,
        device,
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_prompt=None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
    ):

        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        if prompt_embeds is None:
            text_inputs = self.tokenizer(
                prompt,
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            text_input_ids = text_inputs.input_ids
            untruncated_ids = self.tokenizer(
                prompt, padding="longest", return_tensors="pt").input_ids

            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
                text_input_ids, untruncated_ids
            ):
                removed_text = self.tokenizer.batch_decode(
                    untruncated_ids[:, self.tokenizer.model_max_length - 1: -1]
                )

            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
                attention_mask = text_inputs.attention_mask.to(device)
            else:
                attention_mask = None

            prompt_embeds = self.text_encoder(
                text_input_ids.to(device),
                attention_mask=attention_mask,
            )
            prompt_embeds = prompt_embeds[0]

        prompt_embeds = prompt_embeds.to(
            dtype=self.text_encoder.dtype, device=device)

        bs_embed, seq_len, _ = prompt_embeds.shape
        # duplicate text embeddings for each generation per prompt, using mps friendly method
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(
            bs_embed * num_images_per_prompt, seq_len, -1)

        # get unconditional embeddings for classifier free guidance
        if do_classifier_free_guidance and negative_prompt_embeds is None:
            uncond_tokens: List[str]
            if negative_prompt is None:
                uncond_tokens = [""] * batch_size
            elif type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
                )
            elif isinstance(negative_prompt, str):
                uncond_tokens = [negative_prompt]
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`."
                )
            else:
                uncond_tokens = negative_prompt

            max_length = prompt_embeds.shape[1]
            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_tensors="pt",
            )

            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
                attention_mask = uncond_input.attention_mask.to(device)
            else:
                attention_mask = None

            negative_prompt_embeds = self.text_encoder(
                uncond_input.input_ids.to(device),
                attention_mask=attention_mask,
            )
            negative_prompt_embeds = negative_prompt_embeds[0]

        if do_classifier_free_guidance:
            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
            seq_len = negative_prompt_embeds.shape[1]

            negative_prompt_embeds = negative_prompt_embeds.to(
                dtype=self.text_encoder.dtype, device=device)

            negative_prompt_embeds = negative_prompt_embeds.repeat(
                1, num_images_per_prompt, 1)
            negative_prompt_embeds = negative_prompt_embeds.view(
                batch_size * num_images_per_prompt, seq_len, -1)

            negative_prompt_embeds, prompt_embeds = self.compel.pad_conditioning_tensors_to_same_length(
                [negative_prompt_embeds, prompt_embeds])
            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

        return prompt_embeds

    def _pyramid_noise_like(self, noise, device, seed, iterations=6, discount=0.4):
        gen = torch.manual_seed(seed)
        # EDIT: w and h get over-written, rename for a different variant!
        b, c, w, h = noise.shape
        u = torch.nn.Upsample(size=(w, h), mode="bilinear").to(device)
        for i in range(iterations):
            r = random.random() * 2 + 2  # Rather than always going 2x,
            wn, hn = max(1, int(w / (r**i))), max(1, int(h / (r**i)))
            noise += u(torch.randn(b, c, wn, hn,
                       generator=gen).to(device)) * discount**i
            if wn == 1 or hn == 1:
                break  # Lowest resolution is 1x1
        return noise / noise.std()  # Scaled back to roughly unit variance

    @torch.no_grad()
    def inferV4(
        self,
        prompt: Union[str, List[str]],
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[torch.Generator] = None,
        latents: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[
            int, int, torch.FloatTensor], None]] = None,
        callback_steps: Optional[int] = 1,
        compile_unet: bool = True,
        compile_vae: bool = True,
        compile_tenc: bool = True,
        max_tokens=0,
        seed=-1,
        flags=[],
        og_prompt=None,
        og_neg_prompt=None,
        disc=0.4,
        iter=6,
        pyramid=0,  # disabled by default unless specified
    ):
        r"""
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The width in pixels of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages generating images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                [`schedulers.DDIMScheduler`], will be ignored for others.
            generator (`torch.Generator`, *optional*):
                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
                deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.

        Returns:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
            When returning a tuple, the first element is a list with the generated images, and the second element is a
            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
            (nsfw) content, according to the `safety_checker`.
        """
        # 0. Default height and width to unet

        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor

        self.check_inputs(prompt, height, width, callback_steps)
        if negative_prompt is None:
            negative_prompt = ['']
        # 2. Define call parameters
        batch_size = 1 if isinstance(prompt, str) else len(prompt)
        device = self._execution_device
        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode input prompt

        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps

        # Cache key for flags
        plain = "plain" in flags
        flair = None
        for flag in flags:
            if "flair" in flag:
                flair = flag
                break

        with torch.no_grad():
            c_time = time.time()
            user_cond = self.build_scheduled_cond(
                prompt[0], num_inference_steps, ('pos', og_prompt, seed, plain, flair))
            c_time = time.time()
            user_uncond = self.build_scheduled_cond(
                negative_prompt[0], num_inference_steps, ('neg', negative_prompt[0], 0))

            c = []
            c.extend(user_cond)
            uc = []
            uc.extend(user_uncond)
            max_token_count = 0

            for cond in uc:
                if cond.cond.shape[1] > max_token_count:
                    max_token_count = cond.cond.shape[1]
            for cond in c:
                if cond.cond.shape[1] > max_token_count:
                    max_token_count = cond.cond.shape[1]

            def pad_tensor(conditionings: List[ScheduledPromptConditioning], max_token_count: int) -> List[ScheduledPromptConditioning]:

                c0_shape = conditionings[0].cond.shape
                if not all([len(c.cond.shape) == len(c0_shape) for c in conditionings]):
                    raise ValueError(
                        "Conditioning tensors must all have either 2 dimensions (unbatched) or 3 dimensions (batched)")

                if len(c0_shape) == 2:
                    # need to be unsqueezed
                    for c in conditionings:
                        c.cond = c.cond.unsqueeze(0)
                    c0_shape = conditionings[0].cond.shape
                if len(c0_shape) != 3:
                    raise ValueError(
                        f"All conditioning tensors must have the same number of dimensions (2 or 3)")

                if not all([c.cond.shape[0] == c0_shape[0] and c.cond.shape[2] == c0_shape[2] for c in conditionings]):
                    raise ValueError(
                        f"All conditioning tensors must have the same batch size ({c0_shape[0]}) and number of embeddings per token ({c0_shape[1]})")

                # if necessary, pad shorter tensors out with an empty-string tensor
                empty_z = torch.cat(
                    [self.compel.build_conditioning_tensor("")] * c0_shape[0])
                for i, c in enumerate(conditionings):
                    cond = c.cond.to(self.device)
                    while cond.shape[1] < max_token_count:
                        cond = torch.cat([cond, empty_z], dim=1)
                    conditionings[i] = ScheduledPromptConditioning(
                        c.end_at_step, cond)
                return conditionings

            uc = pad_tensor(uc, max_token_count)
            c = pad_tensor(c, max_token_count)

            next_uc = uc.pop(0)
            next_c = c.pop(0)
            prompt_embeds = None
            new_embeds = True
            embed_per_step = []
            for i in range(len(timesteps)):
                if i > next_uc.end_at_step:
                    next_uc = uc.pop(0)
                    new_embeds = True
                if i > next_c.end_at_step:
                    next_c = c.pop(0)
                    new_embeds = True

                if new_embeds:
                    negative_prompt_embeds, prompt_embeds = self.compel.pad_conditioning_tensors_to_same_length([
                        next_uc.cond, next_c.cond])
                    prompt_embeds = torch.cat(
                        [negative_prompt_embeds, prompt_embeds])
                    new_embeds = False

                embed_per_step.append(prompt_embeds)

        # 5. Prepare latent variables
        num_channels_latents = self.unet.in_channels
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
        )

        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 7. Denoising loop
        num_warmup_steps = len(timesteps) - \
            num_inference_steps * self.scheduler.order
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat(
                    [latents] * 2) if do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(
                    latent_model_input, t)

                prompt_embeds = embed_per_step[i]
                # predict the noise residual

                noise_pred = self.unet(
                    latent_model_input, t, encoder_hidden_states=prompt_embeds).sample

                # perform guidance
                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + guidance_scale * \
                        (noise_pred_text - noise_pred_uncond)

                if (i < pyramid * num_inference_steps):
                    noise_pred = self._pyramid_noise_like(
                        noise_pred, device, seed, iterations=iter, discount=disc)

                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(
                    noise_pred, t, latents, **extra_step_kwargs).prev_sample

                # call the callback, if provided
                if (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0:
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        callback(i, t, latents)

        if not output_type == "latent":
            image = self.vae.decode(
                latents / self.vae.config.scaling_factor, return_dict=False)[0]
            image, has_nsfw_concept = self.run_safety_checker(
                image, device, prompt_embeds.dtype)
        else:
            image = latents
            has_nsfw_concept = None

        if has_nsfw_concept is None:
            do_denormalize = [True] * image.shape[0]
        else:
            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]

        image = self.image_processor.postprocess(
            image, output_type=output_type, do_denormalize=do_denormalize)

        # Offload last model to CPU
        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
            self.final_offload_hook.offload()

        if not return_dict:
            return (image, has_nsfw_concept)

        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)

    @torch.no_grad()
    def inferPipe(
        self,
        prompt: Union[str, List[str]] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator,
                                  List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[
            int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    ):
        r"""
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
                instead.
            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The width in pixels of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages generating images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                [`schedulers.DDIMScheduler`], will be ignored for others.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
|
673 |
+
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
|
674 |
+
argument.
|
675 |
+
output_type (`str`, *optional*, defaults to `"pil"`):
|
676 |
+
The output format of the generate image. Choose between
|
677 |
+
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
678 |
+
return_dict (`bool`, *optional*, defaults to `True`):
|
679 |
+
Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
|
680 |
+
plain tuple.
|
681 |
+
callback (`Callable`, *optional*):
|
682 |
+
A function that will be called every `callback_steps` steps during inference. The function will be
|
683 |
+
called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
|
684 |
+
callback_steps (`int`, *optional*, defaults to 1):
|
685 |
+
The frequency at which the `callback` function will be called. If not specified, the callback will be
|
686 |
+
called at every step.
|
687 |
+
cross_attention_kwargs (`dict`, *optional*):
|
688 |
+
A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
|
689 |
+
`self.processor` in
|
690 |
+
[diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
|
691 |
+
|
692 |
+
Examples:
|
693 |
+
|
694 |
+
Returns:
|
695 |
+
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
|
696 |
+
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
|
697 |
+
When returning a tuple, the first element is a list with the generated images, and the second element is a
|
698 |
+
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
|
699 |
+
(nsfw) content, according to the `safety_checker`.
|
700 |
+
"""
|
701 |
+
# 0. Default height and width to unet
|
702 |
+
height = height or self.unet.config.sample_size * self.vae_scale_factor
|
703 |
+
width = width or self.unet.config.sample_size * self.vae_scale_factor
|
704 |
+
|
705 |
+
# 1. Check inputs. Raise error if not correct
|
706 |
+
self.check_inputs(prompt, height, width, callback_steps)
|
707 |
+
|
708 |
+
# 2. Define call parameters
|
709 |
+
batch_size = 1 if isinstance(prompt, str) else len(prompt)
|
710 |
+
device = self._execution_device
|
711 |
+
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
|
712 |
+
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
|
713 |
+
# corresponds to doing no classifier free guidance.
|
714 |
+
do_classifier_free_guidance = guidance_scale > 1.0
|
715 |
+
|
716 |
+
# 3. Encode input prompt
|
717 |
+
text_embeddings = self._encode_prompt(
|
718 |
+
prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
|
719 |
+
)
|
720 |
+
|
721 |
+
# 4. Prepare timesteps
|
722 |
+
self.scheduler.set_timesteps(num_inference_steps)
|
723 |
+
timesteps = self.scheduler.timesteps
|
724 |
+
|
725 |
+
# 5. Prepare latent variables
|
726 |
+
num_channels_latents = self.unet.in_channels
|
727 |
+
latents = self.prepare_latents(
|
728 |
+
batch_size * num_images_per_prompt,
|
729 |
+
num_channels_latents,
|
730 |
+
height,
|
731 |
+
width,
|
732 |
+
text_embeddings.dtype,
|
733 |
+
device,
|
734 |
+
generator,
|
735 |
+
latents,
|
736 |
+
)
|
737 |
+
|
738 |
+
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
|
739 |
+
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
|
740 |
+
|
741 |
+
# 7. Denoising loop
|
742 |
+
num_warmup_steps = len(timesteps) - \
|
743 |
+
num_inference_steps * self.scheduler.order
|
744 |
+
with self.progress_bar(total=num_inference_steps) as progress_bar:
|
745 |
+
for i, t in enumerate(timesteps):
|
746 |
+
# expand the latents if we are doing classifier free guidance
|
747 |
+
latent_model_input = torch.cat(
|
748 |
+
[latents] * 2) if do_classifier_free_guidance else latents
|
749 |
+
latent_model_input = self.scheduler.scale_model_input(
|
750 |
+
latent_model_input, t)
|
751 |
+
|
752 |
+
noise_pred = self.unet(
|
753 |
+
latent_model_input, t, encoder_hidden_states=text_embeddings).sample
|
754 |
+
|
755 |
+
# perform guidance
|
756 |
+
if do_classifier_free_guidance:
|
757 |
+
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
|
758 |
+
noise_pred = noise_pred_uncond + guidance_scale * \
|
759 |
+
(noise_pred_text - noise_pred_uncond)
|
760 |
+
|
761 |
+
# compute the previous noisy sample x_t -> x_t-1
|
762 |
+
latents = self.scheduler.step(
|
763 |
+
noise_pred, t, latents, **extra_step_kwargs).prev_sample
|
764 |
+
|
765 |
+
# call the callback, if provided
|
766 |
+
if (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0:
|
767 |
+
progress_bar.update()
|
768 |
+
if callback is not None and i % callback_steps == 0:
|
769 |
+
callback(i, t, latents)
|
770 |
+
|
771 |
+
if not output_type == "latent":
|
772 |
+
image = self.vae.decode(
|
773 |
+
latents / self.vae.config.scaling_factor, return_dict=False)[0]
|
774 |
+
image, has_nsfw_concept = self.run_safety_checker(
|
775 |
+
image, device, text_embeddings.dtype)
|
776 |
+
else:
|
777 |
+
image = latents
|
778 |
+
has_nsfw_concept = None
|
779 |
+
|
780 |
+
if has_nsfw_concept is None:
|
781 |
+
do_denormalize = [True] * image.shape[0]
|
782 |
+
else:
|
783 |
+
do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
|
784 |
+
|
785 |
+
image = self.image_processor.postprocess(
|
786 |
+
image, output_type=output_type, do_denormalize=do_denormalize)
|
787 |
+
|
788 |
+
# Offload last model to CPU
|
789 |
+
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
|
790 |
+
self.final_offload_hook.offload()
|
791 |
+
|
792 |
+
if not return_dict:
|
793 |
+
return (image, has_nsfw_concept)
|
794 |
+
|
795 |
+
return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
|
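A minimal usage sketch for the custom pipeline above, assuming it is loaded through `DiffusionPipeline.from_pretrained` with `custom_pipeline` pointing at this file; the checkpoint id, prompt, and output path are illustrative placeholders, not something this commit documents.

import torch
from diffusers import DiffusionPipeline

# Load base weights and attach the custom pipeline code (the custom_pipeline path is an assumption).
pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    custom_pipeline="./stable_diffusion_custom_v4_1.py",
    torch_dtype=torch.float16,
).to("cuda")

# inferPipe mirrors the standard call signature: prompt, steps, guidance scale, generator, ...
generator = torch.Generator("cuda").manual_seed(0)
result = pipe.inferPipe(
    prompt="a photo of an astronaut riding a horse",
    num_inference_steps=30,
    guidance_scale=7.5,
    generator=generator,
)
result.images[0].save("astronaut.png")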
tokenizer_util.py
ADDED
@@ -0,0 +1,354 @@
import torch
from typing import Callable, Dict, Optional, Union, List
from urllib.parse import urlparse
from transformers import PreTrainedModel, PreTrainedTokenizer, CLIPTokenizer
import copy
import random
from io import BytesIO
from compel.embeddings_provider import BaseTextualInversionManager


class TextualInversionLoaderMixin(BaseTextualInversionManager):
    r"""
    Mixin class for adding textual inversion tokens and embeddings to the tokenizer and text encoder with methods:
    - [`~TextualInversionLoaderMixin.load_textual_inversion_embeddings`]
    - [`~TextualInversionLoaderMixin.add_textual_inversion_embedding`]
    """

    def load_textual_inversion_embeddings(
        self,
        embedding_path_dict_or_list: Union[Dict[str, str], List[Dict[str, str]]],
        allow_replacement: bool = False,
    ):
        r"""
        Loads textual inversion embeddings and adds them to the tokenizer's vocabulary and the text encoder's embeddings.
        Arguments:
            embedding_path_dict_or_list (`Dict[str, str]` or `List[str]`):
                Dictionary mapping a token to its embedding path, or a list of paths to embedding dictionaries.
                The dictionary must have the following keys:
                - `token`: name of the token to be added to the tokenizer's vocabulary
                - `embedding`: path to the embedding of the token to be added to the text encoder's embedding matrix
                The list must contain paths to embedding dictionaries where the keys are the tokens and the
                values are the embeddings (same as the dictionary definition above).
            allow_replacement (`bool`, *optional*, defaults to `False`):
                Whether to allow replacement of existing tokens in the tokenizer's vocabulary. If `False`
                and a token is already in the vocabulary, an error will be raised.
        Returns:
            None
        """
        # Validate that inheriting class instance contains required attributes
        self._validate_method_call(self.load_textual_inversion_embeddings)

        if isinstance(embedding_path_dict_or_list, dict):
            for token, embedding_path in embedding_path_dict_or_list.items():
                embedding_dict = torch.load(embedding_path, map_location=self.text_encoder.device)
                embedding, is_multi_vec_token = self._extract_embedding_from_dict(embedding_dict)

                self._validate_token_update(token, allow_replacement, is_multi_vec_token)
                self.add_textual_inversion_embedding(token, embedding)
        elif isinstance(embedding_path_dict_or_list, list):
            for embedding_path in embedding_path_dict_or_list:
                embedding_dict = torch.load(embedding_path, map_location=self.text_encoder.device)
                token = self._extract_token_from_dict(embedding_dict)
                embedding, is_multi_vec_token = self._extract_embedding_from_dict(embedding_dict)

                self._validate_token_update(token, allow_replacement, is_multi_vec_token)
                self.add_textual_inversion_embedding(token, embedding)
        else:
            raise ValueError(
                f"Type {type(embedding_path_dict_or_list)} is invalid. The value passed to `embedding_path_dict_or_list` "
                "must be a dictionary that maps a token to its embedding file path "
                "or a list of paths to embedding files containing embedding dictionaries."
            )

    def add_textual_inversion_embedding(self, token: str, embedding: torch.Tensor):
        r"""
        Adds a token to the tokenizer's vocabulary and an embedding to the text encoder's embedding matrix.
        Arguments:
            token (`str`):
                The token to be added to the tokenizer's vocabulary
            embedding (`torch.Tensor`):
                The embedding of the token to be added to the text encoder's embedding matrix
        Returns:
            None
        """
        # NOTE: Not clear to me that we intend for this to be a public/exposed method.
        # Validate that inheriting class instance contains required attributes
        self._validate_method_call(self.load_textual_inversion_embeddings)

        embedding = embedding.to(self.text_encoder.dtype)

        if not isinstance(self.tokenizer, MultiTokenCLIPTokenizer):
            if token in self.tokenizer.get_vocab():
                # If user has allowed replacement and the token exists, we only need to
                # extract the existing id and update the embedding
                token_id = self.tokenizer.convert_tokens_to_ids(token)
                self.text_encoder.get_input_embeddings().weight.data[token_id] = embedding
            else:
                # If the token does not exist, we add it to the tokenizer, then resize and update the
                # text encoder accordingly
                self.tokenizer.add_tokens([token])

                token_id = self.tokenizer.convert_tokens_to_ids(token)
                self.text_encoder.resize_token_embeddings(len(self.tokenizer))
                self.text_encoder.get_input_embeddings().weight.data[token_id] = embedding
        else:
            if token in self.tokenizer.token_map:
                # If user has allowed replacement and the token exists, we need to
                # remove all existing tokens associated with the old embedding and
                # update with the new ones
                indices_to_remove = []
                for token_to_remove in self.tokenizer.token_map[token]:
                    indices_to_remove.append(self.tokenizer.get_added_vocab()[token_to_remove])

                    # Remove old tokens from tokenizer
                    self.tokenizer.added_tokens_encoder.pop(token_to_remove)

                # Convert indices to remove to tensor
                indices_to_remove = torch.LongTensor(indices_to_remove)

                # Remove old tokens from text encoder
                token_embeds = self.text_encoder.get_input_embeddings().weight.data
                indices_to_keep = torch.arange(0, token_embeds.shape[0])
                indices_to_keep = indices_to_keep[indices_to_keep != indices_to_remove].squeeze()
                token_embeds = token_embeds[indices_to_keep]

                # Downsize text encoder
                self.text_encoder.resize_token_embeddings(len(self.tokenizer))

                # Remove token from map so MultiTokenCLIPTokenizer doesn't complain
                # on update
                self.tokenizer.token_map.pop(token)

            # Update token with new embedding
            embedding_dims = len(embedding.shape)
            num_vec_per_token = 1 if embedding_dims == 1 else embedding.shape[0]

            self.tokenizer.add_placeholder_tokens(token, num_vec_per_token=num_vec_per_token)
            self.text_encoder.resize_token_embeddings(len(self.tokenizer))
            token_ids = self.tokenizer.encode(token, add_special_tokens=False)

            if embedding_dims > 1:
                for i, token_id in enumerate(token_ids):
                    self.text_encoder.get_input_embeddings().weight.data[token_id] = embedding[i]
            else:
                self.text_encoder.get_input_embeddings().weight.data[token_ids] = embedding

    def _extract_embedding_from_dict(self, embedding_dict: Dict[str, str]) -> torch.Tensor:
        r"""
        Extracts the embedding from the embedding dictionary.
        Arguments:
            embedding_dict (`Dict[str, str]`):
                The embedding dictionary loaded from the embedding path
        Returns:
            embedding (`torch.Tensor`):
                The embedding to be added to the text encoder's embedding matrix
            is_multi_vec_token (`bool`):
                Whether the embedding is a multi-vector token or not
        """
        is_multi_vec_token = False
        # auto1111 embedding case
        if "string_to_param" in embedding_dict:
            embedding_dict = embedding_dict["string_to_param"]
            embedding = embedding_dict["*"]
        else:
            embedding = list(embedding_dict.values())[0]

        if len(embedding.shape) > 1:
            # If the embedding has more than one dimension,
            # we need to ensure the tokenizer is a MultiTokenCLIPTokenizer
            # because there is branching logic that depends on that class
            if not isinstance(self.tokenizer, MultiTokenCLIPTokenizer):
                raise ValueError(
                    f"{self.__class__.__name__} requires `self.tokenizer` of type `MultiTokenCLIPTokenizer` for loading embeddings with more than one dimension."
                )
            is_multi_vec_token = True

        return embedding, is_multi_vec_token

    def _extract_token_from_dict(self, embedding_dict: Dict[str, str]) -> str:
        r"""
        Extracts the token from the embedding dictionary.
        Arguments:
            embedding_dict (`Dict[str, str]`):
                The embedding dictionary loaded from the embedding path
        Returns:
            token (`str`):
                The token to be added to the tokenizer's vocabulary
        """
        # auto1111 embedding case
        if "string_to_param" in embedding_dict:
            token = embedding_dict["name"]
            return token

        return list(embedding_dict.keys())[0]

    def _validate_method_call(self, method: Callable):
        r"""
        Validates that the method is being called from a class instance that has the required attributes.
        Arguments:
            method (`function`):
                The class's method being called
        Raises:
            ValueError:
                If the method is being called from a class instance that does not have
                the required attributes, the method will not be callable.
        Returns:
            None
        """
        if not hasattr(self, "tokenizer") or not isinstance(self.tokenizer, PreTrainedTokenizer):
            raise ValueError(
                f"{self.__class__.__name__} requires `self.tokenizer` of type `PreTrainedTokenizer` for calling `{method.__name__}`"
            )

        if not hasattr(self, "text_encoder") or not isinstance(self.text_encoder, PreTrainedModel):
            raise ValueError(
                f"{self.__class__.__name__} requires `self.text_encoder` of type `PreTrainedModel` for calling `{method.__name__}`"
            )

    def _validate_token_update(self, token, allow_replacement=False, is_multi_vec_token=False):
        r"""Validates that the token is not already in the tokenizer's vocabulary.
        Arguments:
            token (`str`):
                The token to be added to the tokenizer's vocabulary
            allow_replacement (`bool`):
                Whether to allow replacement of the token if it already exists in the tokenizer's vocabulary
            is_multi_vec_token (`bool`):
                Whether the embedding is a multi-vector token or not
        Raises:
            ValueError:
                If the token is already in the tokenizer's vocabulary and `allow_replacement` is False.
        Returns:
            None
        """
        if (not is_multi_vec_token and token in self.tokenizer.get_vocab()) or (
            is_multi_vec_token and token in self.tokenizer.token_map
        ):
            if allow_replacement:
                print(
                    f"Token {token} already in tokenizer vocabulary. Overwriting existing token and embedding with the new one."
                )
            else:
                raise ValueError(
                    f"Token {token} already in tokenizer vocabulary. Please choose a different token name."
                )

    def expand_textual_inversion_token_ids_if_necessary(self, token_ids: List[int]) -> List[int]:
        pass

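# Note: the embedding-file layouts handled by `_extract_token_from_dict` /
# `_extract_embedding_from_dict` above look roughly like the following
# (names and shapes are illustrative, not exhaustive):
#
#   auto1111 / webui style .pt file:
#       {"name": "<my-style>", "string_to_param": {"*": tensor of shape (num_vectors, 768)}, ...}
#   diffusers style learned_embeds.bin:
#       {"<my-style>": tensor of shape (768,)}
#
# An embedding tensor with more than one dimension is treated as a multi-vector token and
# requires the MultiTokenCLIPTokenizer defined below.
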
class MultiTokenCLIPTokenizer(CLIPTokenizer):
    """Tokenizer for CLIP models that have multi-vector tokens."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.token_map = {}

    def add_placeholder_tokens(self, placeholder_token, *args, num_vec_per_token=1, **kwargs):
        r"""Adds placeholder tokens to the tokenizer's vocabulary.
        Arguments:
            placeholder_token (`str`):
                The placeholder token to be added to the tokenizer's vocabulary and token map.
            num_vec_per_token (`int`):
                The number of vectors per token. Defaults to 1.
            *args:
                The arguments to be passed to the tokenizer's `add_tokens` method.
            **kwargs:
                The keyword arguments to be passed to the tokenizer's `add_tokens` method.
        Returns:
            None
        """
        output = []
        if num_vec_per_token == 1:
            self.add_tokens(placeholder_token, *args, **kwargs)
            output.append(placeholder_token)
        else:
            output = []
            for i in range(num_vec_per_token):
                ith_token = placeholder_token + f"_{i}"
                self.add_tokens(ith_token, *args, **kwargs)
                output.append(ith_token)
        # handle cases where there is a new placeholder token that contains the current placeholder token but is larger
        for token in self.token_map:
            if token in placeholder_token:
                raise ValueError(
                    f"The tokenizer already has placeholder token {token} that can get confused with"
                    f" {placeholder_token}; keep placeholder tokens independent"
                )
        self.token_map[placeholder_token] = output

    def replace_placeholder_tokens_in_text(self, text, vector_shuffle=False, prop_tokens_to_load=1.0):
        r"""Replaces placeholder tokens in text with the tokens in the token map.
        Optionally, implements:
        a) vector shuffling (https://github.com/rinongal/textual_inversion/pull/119), where
        shuffling tokens was found to force the model to learn the concepts more descriptively;
        b) proportional token loading, so that not every token in the token map is loaded on each call;
        used as part of progressive token loading during training, which can improve generalization
        during inference.
        Arguments:
            text (`str`):
                The text to be processed.
            vector_shuffle (`bool`):
                Whether to shuffle the vectors in the token map. Defaults to False.
            prop_tokens_to_load (`float`):
                The proportion of tokens to load from the token map. Defaults to 1.0.
        Returns:
            `str`: The processed text.
        """
        if isinstance(text, list):
            output = []
            for i in range(len(text)):
                output.append(self.replace_placeholder_tokens_in_text(text[i], vector_shuffle=vector_shuffle))
            return output
        for placeholder_token in self.token_map:
            if placeholder_token in text:
                tokens = self.token_map[placeholder_token]
                tokens = tokens[: 1 + int(len(tokens) * prop_tokens_to_load)]
                if vector_shuffle:
                    tokens = copy.copy(tokens)
                    random.shuffle(tokens)
                text = text.replace(placeholder_token, " ".join(tokens))
        return text

    def __call__(self, text, *args, vector_shuffle=False, prop_tokens_to_load=1.0, **kwargs):
        """Wrapper around the [`~transformers.tokenization_utils.PreTrainedTokenizerBase.__call__`] method
        that first replaces placeholder tokens in text with the tokens in the token map.
        Returns:
            [`~transformers.tokenization_utils_base.BatchEncoding`]
        """
        return super().__call__(
            self.replace_placeholder_tokens_in_text(
                text,
                vector_shuffle=vector_shuffle,
                prop_tokens_to_load=prop_tokens_to_load,
            ),
            *args,
            **kwargs,
        )

    def encode(self, text, *args, vector_shuffle=False, prop_tokens_to_load=1.0, **kwargs):
        """Wrapper around the tokenizer's [`transformers.tokenization_utils.PreTrainedTokenizerBase.encode`] method
        that first replaces placeholder tokens in text with the tokens in the token map.
        Arguments:
            text (`str`):
                The text to be encoded.
            *args:
                The arguments to be passed to the tokenizer's `encode` method.
            vector_shuffle (`bool`):
                Whether to shuffle the vectors in the token map. Defaults to False.
            prop_tokens_to_load (`float`):
                The proportion of tokens to load from the token map. Defaults to 1.0.
            **kwargs:
                The keyword arguments to be passed to the tokenizer's `encode` method.
        Returns:
            List[`int`]: sequence of ids (integer)
        """
        return super().encode(
            self.replace_placeholder_tokens_in_text(
                text,
                vector_shuffle=vector_shuffle,
                prop_tokens_to_load=prop_tokens_to_load,
            ),
            *args,
            **kwargs,
        )
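A short sketch of how the mixin and tokenizer above might be exercised from a pipeline that inherits `TextualInversionLoaderMixin` and uses a `MultiTokenCLIPTokenizer`; the token name, embedding path, and prompt are illustrative placeholders.

# Assumes `pipe` is such a pipeline instance (e.g. the custom pipeline from
# stable_diffusion_custom_v4_1.py); the token name and file path are examples only.
pipe.load_textual_inversion_embeddings(
    {"<my-style>": "./embeddings/my_style.pt"},  # token -> path to embedding file
    allow_replacement=True,                      # overwrite the token if it already exists
)

# Multi-vector embeddings are expanded to "<my-style>_0 <my-style>_1 ..." by the tokenizer,
# so prompts can keep referring to the single placeholder token:
image = pipe.inferPipe(prompt="a portrait in the style of <my-style>").images[0]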