cbensimon (HF Staff) committed
Commit 69667cb · Parent: da628cb

fa3 + big refresh

Files changed (5):
  1. app.py +4 -55
  2. fa3.py +115 -0
  3. optimization.py +43 -0
  4. requirements.txt +1 -1
  5. zerogpu.py +0 -62
app.py CHANGED
@@ -1,75 +1,24 @@
-"""
-"""
-# Upgrade PyTorch
-import os
-os.system('pip install --upgrade --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu126 "torch<2.9" spaces')
-
-# Actual app.py
-import os
 from datetime import datetime
 
 import gradio as gr
 import spaces
 import torch
 from diffusers import FluxPipeline
-from torchao.quantization import quantize_
-from torchao.quantization import Float8DynamicActivationFloat8WeightConfig
 
-from zerogpu import aoti_compile
+from optimization import optimize_pipeline_
 
 
 pipeline = FluxPipeline.from_pretrained('black-forest-labs/FLUX.1-schnell', torch_dtype=torch.bfloat16).to('cuda')
-
-
-@spaces.GPU(duration=1500)
-def compile_transformer():
-
-    pipeline.transformer.fuse_qkv_projections()
-    quantize_(pipeline.transformer, Float8DynamicActivationFloat8WeightConfig())
-
-    def _example_tensor(*shape):
-        return torch.randn(*shape, device='cuda', dtype=torch.bfloat16)
-
-    is_timestep_distilled = not pipeline.transformer.config.guidance_embeds
-    seq_length = 256 if is_timestep_distilled else 512
-
-    transformer_kwargs = {
-        'hidden_states': _example_tensor(1, 4096, 64),
-        'timestep': torch.tensor([1.], device='cuda', dtype=torch.bfloat16),
-        'guidance': None if is_timestep_distilled else torch.tensor([1.], device='cuda', dtype=torch.bfloat16),
-        'pooled_projections': _example_tensor(1, 768),
-        'encoder_hidden_states': _example_tensor(1, seq_length, 4096),
-        'txt_ids': _example_tensor(seq_length, 3),
-        'img_ids': _example_tensor(4096, 3),
-        'joint_attention_kwargs': {},
-        'return_dict': False,
-    }
-
-    inductor_configs = {
-        'conv_1x1_as_mm': True,
-        'epilogue_fusion': False,
-        'coordinate_descent_tuning': True,
-        'coordinate_descent_check_all_directions': True,
-        'max_autotune': True,
-        'triton.cudagraphs': True,
-    }
-
-    exported = torch.export.export(pipeline.transformer, args=(), kwargs=transformer_kwargs)
-
-    return aoti_compile(exported, inductor_configs)
-
-
-transformer_config = pipeline.transformer.config
-pipeline.transformer = compile_transformer()
-pipeline.transformer.config = transformer_config
+optimize_pipeline_(pipeline, "prompt")
 
 
 @spaces.GPU
 def generate_image(prompt: str):
+    generator = torch.Generator(device='cuda').manual_seed(42)
     t0 = datetime.now()
     images = []
     for _ in range(9):
-        image = pipeline(prompt, num_inference_steps=4).images[0]
+        image = pipeline(prompt, num_inference_steps=4, generator=generator).images[0]
         elapsed = -(t0 - (t0 := datetime.now()))
         images += [(image, f'{elapsed.total_seconds():.2f}s')]
         yield images
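
The timing expression in `generate_image` uses a walrus assignment: `elapsed = -(t0 - (t0 := datetime.now()))` measures the wall-clock time of the pipeline call that just finished while simultaneously resetting `t0` for the next iteration. A more explicit equivalent (illustrative sketch only, not part of the commit):

    from datetime import datetime

    t0 = datetime.now()
    for _ in range(9):
        # ... run one pipeline call here ...
        now = datetime.now()
        elapsed = now - t0   # wall-clock time of this iteration
        t0 = now             # reset the reference point for the next one
        print(f'{elapsed.total_seconds():.2f}s')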
fa3.py ADDED
@@ -0,0 +1,115 @@
+"""
+"""
+
+import torch
+from kernels import get_kernel
+
+
+_flash_attn_func = get_kernel("kernels-community/vllm-flash-attn3").flash_attn_func
+
+
+@torch.library.custom_op("flash::flash_attn_func", mutates_args=())
+def flash_attn_func(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
+    outputs, lse = _flash_attn_func(q, k, v)
+    return outputs
+
+@flash_attn_func.register_fake
+def _(q, k, v, **kwargs):
+    # two outputs:
+    # 1. output: (batch, seq_len, num_heads, head_dim)
+    # 2. softmax_lse: (batch, num_heads, seq_len) with dtype=torch.float32
+    meta_q = torch.empty_like(q).contiguous()
+    return meta_q #, q.new_empty((q.size(0), q.size(2), q.size(1)), dtype=torch.float32)
+
+# Copied FusedFluxAttnProcessor2_0 but using flash v3 instead of SDPA
+class FlashFusedFluxAttnProcessor3_0:
+    """Attention processor used typically in processing the SD3-like self-attention projections."""
+
+    def __call__(
+        self,
+        attn,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: torch.FloatTensor | None = None,
+        attention_mask: torch.FloatTensor | None = None,
+        image_rotary_emb: torch.Tensor | None = None,
+    ) -> torch.FloatTensor:
+        batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+
+        # `sample` projections.
+        qkv = attn.to_qkv(hidden_states)
+        split_size = qkv.shape[-1] // 3
+        query, key, value = torch.split(qkv, split_size, dim=-1)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        # the attention in FluxSingleTransformerBlock does not use `encoder_hidden_states`
+        # `context` projections.
+        if encoder_hidden_states is not None:
+            encoder_qkv = attn.to_added_qkv(encoder_hidden_states)
+            split_size = encoder_qkv.shape[-1] // 3
+            (
+                encoder_hidden_states_query_proj,
+                encoder_hidden_states_key_proj,
+                encoder_hidden_states_value_proj,
+            ) = torch.split(encoder_qkv, split_size, dim=-1)
+
+            encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
+                batch_size, -1, attn.heads, head_dim
+            ).transpose(1, 2)
+            encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
+                batch_size, -1, attn.heads, head_dim
+            ).transpose(1, 2)
+            encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
+                batch_size, -1, attn.heads, head_dim
+            ).transpose(1, 2)
+
+            if attn.norm_added_q is not None:
+                encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
+            if attn.norm_added_k is not None:
+                encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj)
+
+            # attention
+            query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
+            key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
+            value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)
+
+        if image_rotary_emb is not None:
+            from diffusers.models.embeddings import apply_rotary_emb
+
+            query = apply_rotary_emb(query, image_rotary_emb)
+            key = apply_rotary_emb(key, image_rotary_emb)
+
+        # NB: transposes are necessary to match expected SDPA input shape
+        hidden_states = flash_attn_func(
+            query.transpose(1, 2),
+            key.transpose(1, 2),
+            value.transpose(1, 2))[0].transpose(1, 2)
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        if encoder_hidden_states is not None:
+            encoder_hidden_states, hidden_states = (
+                hidden_states[:, : encoder_hidden_states.shape[1]],
+                hidden_states[:, encoder_hidden_states.shape[1] :],
+            )
+
+            # linear proj
+            hidden_states = attn.to_out[0](hidden_states)
+            # dropout
+            hidden_states = attn.to_out[1](hidden_states)
+            encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
+
+            return hidden_states, encoder_hidden_states
+        else:
+            return hidden_states
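
A minimal standalone sketch of wiring this processor into a FLUX pipeline (assumptions: a CUDA device, the FA3 kernel downloadable via `kernels`, and fused QKV projections, since the processor reads `attn.to_qkv` / `attn.to_added_qkv`; in this Space the hookup happens in `optimization.py`):

    import torch
    from diffusers import FluxPipeline

    from fa3 import FlashFusedFluxAttnProcessor3_0

    pipe = FluxPipeline.from_pretrained(
        'black-forest-labs/FLUX.1-schnell', torch_dtype=torch.bfloat16
    ).to('cuda')

    # Fuse projections first so attn.to_qkv / attn.to_added_qkv exist,
    # then swap SDPA for the Flash Attention 3 processor.
    pipe.transformer.fuse_qkv_projections()
    pipe.transformer.set_attn_processor(FlashFusedFluxAttnProcessor3_0())

    image = pipe("a tiny astronaut", num_inference_steps=4).images[0]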
optimization.py ADDED
@@ -0,0 +1,43 @@
+"""
+"""
+
+from typing import Any
+from typing import Callable
+from typing import ParamSpec
+import spaces
+import torch
+
+from fa3 import FlashFusedFluxAttnProcessor3_0
+
+
+P = ParamSpec('P')
+
+
+INDUCTOR_CONFIGS = {
+    'conv_1x1_as_mm': True,
+    'epilogue_fusion': False,
+    'coordinate_descent_tuning': True,
+    'coordinate_descent_check_all_directions': True,
+    'max_autotune': True,
+    'triton.cudagraphs': True,
+}
+
+
+def optimize_pipeline_(pipeline: Callable[P, Any], *args: P.args, **kwargs: P.kwargs):
+
+    @spaces.GPU(duration=1500)
+    def compile_transformer():
+
+        with spaces.aoti_capture(pipeline.transformer) as call:
+            pipeline(*args, **kwargs)
+
+        exported = torch.export.export(
+            mod=pipeline.transformer,
+            args=call.args,
+            kwargs=call.kwargs,
+        )
+
+        return spaces.aoti_compile(exported, INDUCTOR_CONFIGS)
+
+    pipeline.transformer.set_attn_processor(FlashFusedFluxAttnProcessor3_0())
+    spaces.aoti_apply(compile_transformer(), pipeline.transformer)
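
In short, `optimize_pipeline_` swaps in the FA3 attention processor, runs the pipeline once under `spaces.aoti_capture` to record the exact arguments the transformer receives, exports that call with `torch.export.export`, compiles it ahead of time on a ZeroGPU worker, and grafts the compiled artifact back onto the transformer with `spaces.aoti_apply`. Usage mirrors the new `app.py` (illustrative sketch):

    import torch
    from diffusers import FluxPipeline

    from optimization import optimize_pipeline_

    pipeline = FluxPipeline.from_pretrained(
        'black-forest-labs/FLUX.1-schnell', torch_dtype=torch.bfloat16
    ).to('cuda')

    # One warm-up call with a representative prompt drives aoti_capture, so the
    # exported graph is specialized to the shapes used at inference time.
    optimize_pipeline_(pipeline, "prompt")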
requirements.txt CHANGED
@@ -3,4 +3,4 @@ diffusers
 transformers
 sentencepiece
 protobuf
-torchao
+kernels
zerogpu.py DELETED
@@ -1,62 +0,0 @@
-"""
-"""
-from contextvars import ContextVar
-from io import BytesIO
-from typing import Any
-from typing import cast
-
-import torch
-from torch._inductor.package.package import package_aoti
-from torch.export.pt2_archive._package import AOTICompiledModel
-from torch.export.pt2_archive._package_weights import TensorProperties
-from torch.export.pt2_archive._package_weights import Weights
-
-
-INDUCTOR_CONFIGS_OVERRIDES = {
-    'aot_inductor.package_constants_in_so': False,
-    'aot_inductor.package_constants_on_disk': True,
-    'aot_inductor.package': True,
-}
-
-
-class ZeroGPUCompiledModel:
-    def __init__(self, archive_file: torch.types.FileLike, weights: Weights, cuda: bool = False):
-        self.archive_file = archive_file
-        self.weights = weights
-        if cuda:
-            self.weights_to_cuda_()
-        self.compiled_model: ContextVar[AOTICompiledModel | None] = ContextVar('compiled_model', default=None)
-    def weights_to_cuda_(self):
-        for name in self.weights:
-            tensor, properties = self.weights.get_weight(name)
-            self.weights[name] = (tensor.to('cuda'), properties)
-    def __call__(self, *args, **kwargs):
-        if (compiled_model := self.compiled_model.get()) is None:
-            constants_map = {name: value[0] for name, value in self.weights.items()}
-            compiled_model = cast(AOTICompiledModel, torch._inductor.aoti_load_package(self.archive_file))
-            compiled_model.load_constants(constants_map, check_full_update=True, user_managed=True)
-            self.compiled_model.set(compiled_model)
-        return compiled_model(*args, **kwargs)
-    def __reduce__(self):
-        weight_dict: dict[str, tuple[torch.Tensor, TensorProperties]] = {}
-        for name in self.weights:
-            tensor, properties = self.weights.get_weight(name)
-            tensor_ = torch.empty_like(tensor, device='cpu').pin_memory()
-            weight_dict[name] = (tensor_.copy_(tensor).detach().share_memory_(), properties)
-        return ZeroGPUCompiledModel, (self.archive_file, Weights(weight_dict), True)
-
-
-def aoti_compile(
-    exported_program: torch.export.ExportedProgram,
-    inductor_configs: dict[str, Any] | None = None,
-):
-    inductor_configs = (inductor_configs or {}) | INDUCTOR_CONFIGS_OVERRIDES
-    gm = exported_program.module()
-    assert exported_program.example_inputs is not None
-    args, kwargs = exported_program.example_inputs
-    artifacts = torch._inductor.aot_compile(gm, args, kwargs, options=inductor_configs)
-    archive_file = BytesIO()
-    files = [file for file in artifacts if isinstance(file, str)]
-    package_aoti(archive_file, files)
-    weights, = (artifact for artifact in artifacts if isinstance(artifact, Weights))
-    return ZeroGPUCompiledModel(archive_file, weights)
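
This hand-rolled helper is superseded by the `spaces` AoT utilities used in `optimization.py`: where the old `app.py` called this module's `aoti_compile(exported, inductor_configs)` and then reassigned `pipeline.transformer` by hand, the new code calls `spaces.aoti_compile(exported, INDUCTOR_CONFIGS)` and attaches the compiled transformer with `spaces.aoti_apply(compile_transformer(), pipeline.transformer)`.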