multimodalart (HF Staff) committed
Commit e1b4118 · verified · Parent(s): 85cc5c2

Update app.py

Files changed (1): app.py (+12 -58)
app.py CHANGED
@@ -51,11 +51,6 @@ def infer(prompt, seed=42, randomize_seed=False, width=1024, height=1024,
     torch.manual_seed(seed)
     torch.cuda.manual_seed_all(seed)
 
-    # Calculate latent dimensions based on image size
-    # Sana uses 32x downsampling factor
-    latent_height = height // 32
-    latent_width = width // 32
-
     with torch.inference_mode():
         # Encode the prompt
         prompt_embeds, prompt_attention_mask = pipe.encode_prompt(
@@ -63,14 +58,14 @@ def infer(prompt, seed=42, randomize_seed=False, width=1024, height=1024,
             device=device
         )
 
-        # Generate initial random latents with correct dimensions
+        # Generate initial random latents
         init_latents = torch.randn(
-            [1, 32, latent_height, latent_width],
+            [1, 32, 32, 32],
             device=device,
             dtype=dtype
         )
 
-        # Apply HyperNoise modulation with adapter enabled (single forward pass)
+        # Apply HyperNoise modulation with adapter enabled
         pipe.transformer.enable_adapter_layers()
         modulated_latents = pipe.transformer(
             hidden_states=init_latents,
@@ -82,56 +77,15 @@ def infer(prompt, seed=42, randomize_seed=False, width=1024, height=1024,
 
         # Generate final image with adapter disabled
         pipe.transformer.disable_adapter_layers()
-
-        # For SCM scheduler, we need to handle the timesteps carefully
-        # The pipeline expects intermediate_timesteps only when num_inference_steps=2
-        # For other values, we use the workaround from the original code
-        if num_inference_steps == 2:
-            # Use the default pipeline behavior for 2 steps
-            image = pipe(
-                latents=modulated_latents,
-                prompt_embeds=prompt_embeds,
-                prompt_attention_mask=prompt_attention_mask,
-                num_inference_steps=num_inference_steps,
-            ).images[0]
-        else:
-            # For num_inference_steps != 2, we need to work around the restriction
-            # by directly calling the denoising loop
-            pipe.scheduler.set_timesteps(
-                num_inference_steps,
-                device=device,
-                timesteps=torch.linspace(1.57080, 0, num_inference_steps + 1, device=device)
-            )
-
-            # Run the denoising loop manually
-            latents = modulated_latents
-            for i, t in enumerate(pipe.scheduler.timesteps[:-1]):
-                # Expand timestep to match batch dimension
-                timestep = t.expand(latents.shape[0])
-
-                # Predict noise
-                noise_pred = pipe.transformer(
-                    hidden_states=latents,
-                    encoder_hidden_states=prompt_embeds,
-                    encoder_attention_mask=prompt_attention_mask,
-                    timestep=timestep,
-                    guidance=torch.tensor([0.0], device=device, dtype=dtype),  # No guidance for denoising
-                    return_dict=False,
-                )[0]
-
-                # Compute previous noisy sample
-                latents = pipe.scheduler.step(
-                    noise_pred,
-                    t,
-                    latents,
-                    return_dict=False
-                )[0]
-
-            # Decode latents to image
-            latents = pipe._unpack_latents(latents, height, width, pipe.vae_scale_factor)
-            latents = (latents / pipe.vae.scaling_factor) + pipe.vae.shift_factor
-            image = pipe.vae.decode(latents, return_dict=False)[0]
-            image = pipe.image_processor.postprocess(image, output_type="pil")[0]
+        image = pipe(
+            latents=modulated_latents,
+            prompt_embeds=prompt_embeds,
+            prompt_attention_mask=prompt_attention_mask,
+            intermediate_steps=None,
+            num_inference_steps=num_inference_steps,
+            height=height,
+            width=width,
+        ).images[0]
 
     return image, seed
 
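For reference, below is a sketch of the simplified infer path as it reads after this commit, reassembled from the hunks above. It is not the verbatim file: the module-level pipe, device, and dtype come from parts of app.py outside this diff; the tail of the infer signature is elided by the hunk header; and the remaining keyword arguments of the HyperNoise forward pass are cut off at a hunk boundary, so the lines marked "assumed" are guesses modeled on the transformer call in the removed denoising loop.

import torch

# Sketch only. Assumed to be defined elsewhere in app.py (outside this diff):
#   pipe   - the Sana pipeline, with the HyperNoise adapter loaded on pipe.transformer
#   device - e.g. "cuda"
#   dtype  - e.g. torch.bfloat16

def infer(prompt, seed=42, randomize_seed=False, width=1024, height=1024,
          num_inference_steps=2):  # trailing parameters elided in the diff; step count assumed
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    with torch.inference_mode():
        # Encode the prompt
        prompt_embeds, prompt_attention_mask = pipe.encode_prompt(
            prompt,  # assumed: the hunk begins one line below this
            device=device
        )

        # Generate initial random latents: after this commit the latent grid is
        # fixed at 32x32 rather than derived from height/width
        init_latents = torch.randn([1, 32, 32, 32], device=device, dtype=dtype)

        # Apply HyperNoise modulation with adapter enabled
        pipe.transformer.enable_adapter_layers()
        modulated_latents = pipe.transformer(
            hidden_states=init_latents,
            encoder_hidden_states=prompt_embeds,           # assumed, cut off by the hunk
            encoder_attention_mask=prompt_attention_mask,  # assumed, cut off by the hunk
            return_dict=False,                             # assumed, cut off by the hunk
        )[0]

        # Generate final image with adapter disabled; the pipeline call now owns
        # the scheduling, latent unpacking, and VAE decoding that the removed
        # branch performed by hand
        pipe.transformer.disable_adapter_layers()
        image = pipe(
            latents=modulated_latents,
            prompt_embeds=prompt_embeds,
            prompt_attention_mask=prompt_attention_mask,
            intermediate_steps=None,
            num_inference_steps=num_inference_steps,
            height=height,
            width=width,
        ).images[0]

    return image, seed

Net effect of the commit: the num_inference_steps == 2 special case and the manual SCM fallback loop collapse into a single pipe(...) call, with height and width forwarded to the pipeline instead of being folded into the latent shape up front.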