Update app.py
app.py CHANGED
@@ -35,6 +35,11 @@ from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
 from diffusers.utils import export_to_ply
 
+# NEW IMPORTS FOR TEXT-TO-VIDEO FEATURE
+import torch  # already imported above; included here for clarity
+from diffsynth import ModelManager, WanVideoPipeline, save_video, VideoData
+from modelscope import snapshot_download
+
 # Global constants and helper functions
 
 MAX_SEED = np.iinfo(np.int32).max
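Note: the diffsynth module is provided by the DiffSynth-Studio project and modelscope supplies snapshot_download, so both packages must be available in the Space for these imports to resolve. A minimal sanity check, assuming both are installed (this snippet is an illustration, not part of the commit):

# Sanity check for the new text-to-video dependencies; illustration only.
# Assumes DiffSynth-Studio (providing `diffsynth`) and `modelscope` are installed.
from diffsynth import ModelManager, WanVideoPipeline, save_video, VideoData  # noqa: F401
from modelscope import snapshot_download  # noqa: F401
print("diffsynth and modelscope are importable")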
@@ -337,53 +342,49 @@ def save_image(img: Image.Image) -> str:
     img.save(unique_name)
     return unique_name
 
-
-
+# NEW: Global setup for Wan Video Pipeline
+wan_pipe = None
+def get_wan_pipe():
+    global wan_pipe
+    if wan_pipe is None:
+        snapshot_download("Wan-AI/Wan2.1-T2V-1.3B", local_dir="models/Wan-AI/Wan2.1-T2V-1.3B")
+        model_manager = ModelManager(device="cpu")
+        model_manager.load_models(
+            [
+                "models/Wan-AI/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors",
+                "models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth",
+                "models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth",
+            ],
+            torch_dtype=torch.bfloat16,
+        )
+        wan_pipe = WanVideoPipeline.from_model_manager(model_manager, torch_dtype=torch.bfloat16, device="cuda")
+        wan_pipe.enable_vram_management(num_persistent_param_in_dit=None)
+    return wan_pipe
+
+@spaces.GPU(duration=120, enable_queue=True)
+def generate_video_fn(
     prompt: str,
     negative_prompt: str = "",
-    use_negative_prompt: bool = False,
     seed: int = 1,
-
-    height: int = 1024,
-    guidance_scale: float = 3,
-    num_inference_steps: int = 25,
+    num_inference_steps: int = 50,
     randomize_seed: bool = False,
-    use_resolution_binning: bool = True,
-    num_images: int = 1,
-    progress=gr.Progress(track_tqdm=True),
 ):
-    """
+    """
+    Generate a video from text using the Wan pipeline.
+    Returns a tuple of (video_file_path, used_seed).
+    """
     seed = int(randomize_seed_fn(seed, randomize_seed))
-
-
-
-
-
-
-
-
-
-
-
-    }
-    if use_resolution_binning:
-        options["use_resolution_binning"] = True
-
-    images = []
-    # Process in batches
-    for i in range(0, num_images, BATCH_SIZE):
-        batch_options = options.copy()
-        batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
-        if "negative_prompt" in batch_options and batch_options["negative_prompt"] is not None:
-            batch_options["negative_prompt"] = options["negative_prompt"][i:i+BATCH_SIZE]
-        if device.type == "cuda":
-            with torch.autocast("cuda", dtype=torch.float16):
-                outputs = sd_pipe(**batch_options)
-        else:
-            outputs = sd_pipe(**batch_options)
-        images.extend(outputs.images)
-    image_paths = [save_image(img) for img in images]
-    return image_paths, seed
+    pipe = get_wan_pipe()
+    video = pipe(
+        prompt=prompt,
+        negative_prompt=negative_prompt,
+        num_inference_steps=num_inference_steps,
+        seed=seed,
+        tiled=True
+    )
+    video_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
+    save_video(video, video_path, fps=15, quality=5)
+    return video_path, seed
 
 # Text-to-3D Generation using the ShapE Pipeline
 
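get_wan_pipe() lazily downloads and loads the Wan2.1-T2V-1.3B weights on first use, so the cost is only paid when a video request actually arrives. A minimal sketch of calling the new generate_video_fn directly, assuming this file is importable as app and a CUDA GPU is available (the snippet is an illustration, not part of the commit):

# Minimal usage sketch; assumes app.py is importable as `app` and a CUDA GPU is present.
from app import generate_video_fn

video_path, used_seed = generate_video_fn(
    prompt="A documentary-style shot of a lively puppy running",
    negative_prompt="",
    seed=1,
    num_inference_steps=50,
    randomize_seed=True,
)
print(f"Saved video to {video_path} (seed {used_seed})")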
@@ -424,7 +425,7 @@ def detect_objects(image: np.ndarray):
 
     return Image.fromarray(annotated_image)
 
-# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, and @yolo commands
+# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, @yolo, and now @wan commands
 
 @spaces.GPU
 def generate(
@@ -444,6 +445,7 @@ def generate(
     - "@web": triggers a web search or webpage visit.
     - "@rAgent": initiates a reasoning chain using Llama mode OpenAI.
     - "@yolo": triggers object detection using YOLO.
+    - "@wan": triggers video generation using the Wan pipeline.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
@@ -539,6 +541,42 @@ def generate(
         yield gr.Image(result_img)
         return
 
+    # --- Wan Video Generation branch ---
+    if text.strip().lower().startswith("@wan"):
+        prompt = text[len("@wan"):].strip()
+        yield "🎞️ Generating video..."
+        # If a video file is attached, perform video-to-video generation.
+        if files and len(files) > 0:
+            try:
+                input_video_path = files[0]
+                video_data = VideoData(input_video_path, height=480, width=832)
+            except Exception as e:
+                yield f"Error loading video: {str(e)}"
+                return
+            pipe = get_wan_pipe()
+            video = pipe(
+                prompt=prompt,
+                negative_prompt="",
+                input_video=video_data,
+                denoising_strength=0.7,
+                num_inference_steps=50,
+                seed=randomize_seed_fn(1, True),
+                tiled=True
+            )
+            video_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
+            save_video(video, video_path, fps=24, quality=5)
+            yield gr.Video(video_path)
+        else:
+            video_path, used_seed = generate_video_fn(
+                prompt=prompt,
+                negative_prompt="",
+                seed=1,
+                num_inference_steps=50,
+                randomize_seed=True,
+            )
+            yield gr.Video(video_path)
+        return
+
     # --- Text and TTS branch ---
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
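This branch keys off the "@wan" prefix of the chat message and switches between video-to-video (when a file is attached) and plain text-to-video. A small illustration of the input_dict shape it expects; the file path is hypothetical and only for illustration:

# Illustration of the message shape the @wan branch handles; file path is hypothetical.
input_dict = {"text": "@wan Turn this clip into a watercolor animation", "files": ["examples/clip.mp4"]}
text = input_dict["text"]
files = input_dict.get("files", [])
prompt = text[len("@wan"):].strip()
print(prompt)   # prompt passed to the Wan pipeline
print(files)    # a non-empty list routes to the video-to-video path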
@@ -629,19 +667,20 @@ demo = gr.ChatInterface(
     examples=[
         ["@tts2 What causes rainbows to form?"],
         ["@image Chocolate dripping from a donut"],
+        ["@wan A documentary-style shot of a lively puppy running"],
         ["@3d A birthday cupcake with cherry"],
         [{"text": "Summarize the letter", "files": ["examples/1.png"]}],
         [{"text": "@yolo", "files": ["examples/yolo.jpeg"]}],
         ["@rAgent Explain how a binary search algorithm works."],
         ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
-        ["@tts1 Explain Tower of Hanoi"]
+        ["@tts1 Explain Tower of Hanoi"]
     ],
     cache_examples=False,
     type="messages",
     description=DESCRIPTION,
     css=css,
     fill_height=True,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="@tts1-♀, @tts2-♂, @image-image gen, @3d-3d mesh gen, @rAgent-coding, @web-websearch, @yolo-object detection, default-{text gen}{image-text-text}"),
+    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple", placeholder="@tts1-♀, @tts2-♂, @image-image gen, @3d-3d mesh gen, @rAgent-coding, @web-websearch, @yolo-object detection, @wan-video gen, default-{text gen}{image-text-text}"),
     stop_btn="Stop Generation",
     multimodal=True,
 )