Spaces:

multimodalart
/

hypernoise-sana-sprint

Running on Zero

App Files Files Community

multimodalart HF Staff committed on Aug 15

Commit

85cc5c2

verified ·

1 Parent(s): 606b9af

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -15

app.py CHANGED Viewed

@@ -83,27 +83,55 @@ def infer(prompt, seed=42, randomize_seed=False, width=1024, height=1024,
         # Generate final image with adapter disabled
         pipe.transformer.disable_adapter_layers()
-        # Manually set timesteps to avoid intermediate_timesteps issue
-        # SCM scheduler only supports intermediate_timesteps when num_inference_steps=2
         if num_inference_steps == 2:
-            pipe.scheduler.set_timesteps(num_inference_steps, device=device)
         else:
-            # For num_inference_steps != 2, we need to avoid intermediate_timesteps
-            max_timesteps = 1.57080  # Default from SCM paper
             pipe.scheduler.set_timesteps(
                 num_inference_steps,
                 device=device,
-                max_timesteps=max_timesteps,
-                intermediate_timesteps=None
             )
-        # Now generate the image with pre-set timesteps
-        image = pipe(
-            latents=modulated_latents,
-            prompt_embeds=prompt_embeds,
-            prompt_attention_mask=prompt_attention_mask,
-            num_inference_steps=num_inference_steps,
-        ).images[0]
     return image, seed

         # Generate final image with adapter disabled
         pipe.transformer.disable_adapter_layers()
+        # For SCM scheduler, we need to handle the timesteps carefully
+        # The pipeline expects intermediate_timesteps only when num_inference_steps=2
+        # For other values, we use the workaround from the original code
         if num_inference_steps == 2:
+            # Use the default pipeline behavior for 2 steps
+            image = pipe(
+                latents=modulated_latents,
+                prompt_embeds=prompt_embeds,
+                prompt_attention_mask=prompt_attention_mask,
+                num_inference_steps=num_inference_steps,
+            ).images[0]
         else:
+            # For num_inference_steps != 2, we need to work around the restriction
+            # by directly calling the denoising loop
             pipe.scheduler.set_timesteps(
                 num_inference_steps,
                 device=device,
+                timesteps=torch.linspace(1.57080, 0, num_inference_steps + 1, device=device)
             )
+            # Run the denoising loop manually
+            latents = modulated_latents
+            for i, t in enumerate(pipe.scheduler.timesteps[:-1]):
+                # Expand timestep to match batch dimension
+                timestep = t.expand(latents.shape[0])
+                # Predict noise
+                noise_pred = pipe.transformer(
+                    hidden_states=latents,
+                    encoder_hidden_states=prompt_embeds,
+                    encoder_attention_mask=prompt_attention_mask,
+                    timestep=timestep,
+                    guidance=torch.tensor([0.0], device=device, dtype=dtype),  # No guidance for denoising
+                    return_dict=False,
+                )[0]
+                # Compute previous noisy sample
+                latents = pipe.scheduler.step(
+                    noise_pred,
+                    t,
+                    latents,
+                    return_dict=False
+                )[0]
+            # Decode latents to image
+            latents = pipe._unpack_latents(latents, height, width, pipe.vae_scale_factor)
+            latents = (latents / pipe.vae.scaling_factor) + pipe.vae.shift_factor
+            image = pipe.vae.decode(latents, return_dict=False)[0]
+            image = pipe.image_processor.postprocess(image, output_type="pil")[0]
     return image, seed