jimmycarter
/

LibreFLUX

@@ -825,105 +825,6 @@ class FluxTransformer2DModelWithMasking(
         return Transformer2DModelOutput(sample=output)
-if __name__ == "__main__":
-    dtype = torch.bfloat16
-    bsz = 2
-    img = torch.rand((bsz, 16, 64, 64)).to("cuda", dtype=dtype)
-    timestep = torch.tensor([0.5, 0.5]).to("cuda", dtype=torch.float32)
-    pooled = torch.rand(bsz, 768).to("cuda", dtype=dtype)
-    text = torch.rand((bsz, 512, 4096)).to("cuda", dtype=dtype)
-    attn_mask = torch.tensor([[1.0] * 384 + [0.0] * 128] * bsz).to(
-        "cuda", dtype=dtype
-    )  # Last 128 positions are masked
-    def _pack_latents(latents, batch_size, num_channels_latents, height, width):
-        latents = latents.view(
-            batch_size, num_channels_latents, height // 2, 2, width // 2, 2
-        )
-        latents = latents.permute(0, 2, 4, 1, 3, 5)
-        latents = latents.reshape(
-            batch_size, (height // 2) * (width // 2), num_channels_latents * 4
-        )
-        return latents
-    def _prepare_latent_image_ids(
-        batch_size, height, width, device="cuda", dtype=dtype
-    ):
-        latent_image_ids = torch.zeros(height // 2, width // 2, 3)
-        latent_image_ids[..., 1] = (
-            latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
-        )
-        latent_image_ids[..., 2] = (
-            latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]
-        )
-        latent_image_id_height, latent_image_id_width, latent_image_id_channels = (
-            latent_image_ids.shape
-        )
-        latent_image_ids = latent_image_ids[None, :].repeat(batch_size, 1, 1, 1)
-        latent_image_ids = latent_image_ids.reshape(
-            batch_size,
-            latent_image_id_height * latent_image_id_width,
-            latent_image_id_channels,
-        )
-        return latent_image_ids.to(device=device, dtype=dtype)
-    txt_ids = torch.zeros(bsz, text.shape[1], 3).to(device="cuda", dtype=dtype)
-    vae_scale_factor = 16
-    height = 2 * (int(512) // vae_scale_factor)
-    width = 2 * (int(512) // vae_scale_factor)
-    img_ids = _prepare_latent_image_ids(bsz, height, width)
-    img = _pack_latents(img, img.shape[0], 16, height, width)
-    # Gotta go fast
-    transformer = FluxTransformer2DModelWithMasking.from_config(
-        {
-            "attention_head_dim": 128,
-            "guidance_embeds": True,
-            "in_channels": 64,
-            "joint_attention_dim": 4096,
-            "num_attention_heads": 24,
-            "num_layers": 4,
-            "num_single_layers": 8,
-            "patch_size": 1,
-            "pooled_projection_dim": 768,
-        }
-    ).to("cuda", dtype=dtype)
-    guidance = torch.tensor([2.0], device="cuda")
-    guidance = guidance.expand(bsz)
-    with torch.no_grad():
-        no_mask = transformer(
-            img,
-            encoder_hidden_states=text,
-            pooled_projections=pooled,
-            timestep=timestep,
-            img_ids=img_ids,
-            txt_ids=txt_ids,
-            guidance=guidance,
-        )
-        mask = transformer(
-            img,
-            encoder_hidden_states=text,
-            pooled_projections=pooled,
-            timestep=timestep,
-            img_ids=img_ids,
-            txt_ids=txt_ids,
-            guidance=guidance,
-            attention_mask=attn_mask,
-        )
-    assert torch.allclose(no_mask.sample, mask.sample) is False
-    print("Attention masking test ran OK. Differences in output were detected.")
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 EXAMPLE_DOC_STRING = """
     Examples:
         ```py

         return Transformer2DModelOutput(sample=output)
 EXAMPLE_DOC_STRING = """
     Examples:
         ```py