prithivMLmods committed
Commit e685e73 · verified · 1 Parent(s): 95e0bed

Update app.py

Files changed (1): app.py (+90 -7)
app.py CHANGED
@@ -5,10 +5,11 @@ import json
 import time
 import asyncio
 import tempfile
-from threading import Thread
 import base64
 import shutil
 import re
+import gc
+from threading import Thread
 
 import gradio as gr
 import spaces
@@ -33,7 +34,10 @@ from transformers.image_utils import load_image
 
 from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
-from diffusers.utils import export_to_ply
+from diffusers.utils import export_to_ply, export_to_video
+
+# NEW IMPORTS FOR TEXT-TO-VIDEO
+from diffusers import LTXPipeline, LTXImageToVideoPipeline
 
 # Global constants and helper functions
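A note on the new imports: LTXImageToVideoPipeline is pulled in alongside LTXPipeline, but only the text-to-video pipeline is instantiated later in this commit. For reference, the stock diffusers text-to-video pattern these imports support looks like the following minimal sketch, shown here with the Lightricks/LTX-Video checkpoint from the diffusers docs purely for illustration (the commit itself points LTXPipeline at a SkyReels checkpoint):

    import torch
    from diffusers import LTXPipeline
    from diffusers.utils import export_to_video

    # Load the pipeline in bf16 and move it to the GPU
    pipe = LTXPipeline.from_pretrained("Lightricks/LTX-Video", torch_dtype=torch.bfloat16).to("cuda")

    # Generate a short clip and write it to disk
    frames = pipe(prompt="a waterfall in a lush forest", num_frames=65, num_inference_steps=30).frames[0]
    export_to_video(frames, "sample.mp4", fps=24)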
@@ -88,7 +92,7 @@ class Model:
         return mesh_path.name
 
     def run_text(self, prompt: str, seed: int = 0, guidance_scale: float = 15.0, num_steps: int = 64) -> str:
-        generator = torch.Generator(device=self.device).manual_seed(seed)
+        generator = torch.Generator(device=self.pipe.device).manual_seed(seed)
         images = self.pipe(
             prompt,
             generator=generator,
@@ -101,7 +105,7 @@ class Model:
         return self.to_glb(ply_path.name)
 
     def run_image(self, image: Image.Image, seed: int = 0, guidance_scale: float = 3.0, num_steps: int = 64) -> str:
-        generator = torch.Generator(device=self.device).manual_seed(seed)
+        generator = torch.Generator(device=self.pipe.device).manual_seed(seed)
         images = self.pipe_img(
             image,
             generator=generator,
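The two one-line changes above are small but meaningful: a torch.Generator is bound to a device, so deriving it from self.pipe.device (instead of a separately tracked self.device) keeps seeding valid wherever the pipeline actually lives. A minimal sketch of the reproducibility this provides, on CPU so it runs anywhere:

    import torch

    def make_noise(seed: int) -> torch.Tensor:
        # The generator must live on the same device as the tensors it seeds
        gen = torch.Generator(device="cpu").manual_seed(seed)
        return torch.randn(2, 2, generator=gen, device="cpu")

    # Same seed, same device -> bit-identical noise
    assert torch.equal(make_noise(0), make_noise(0))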
@@ -235,7 +239,9 @@ def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, t
 # Gradio UI configuration
 
 DESCRIPTION = """
-# Agent Dino 🌠 """
+# Agent Dino 🌠
+Your multimodal chatbot supporting text, image, 3D, web search, object detection, reasoning, and now text-to-video generation.
+"""
 
 css = '''
 h1 {
@@ -404,6 +410,64 @@ def generate_3d_fn(
     glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
     return glb_path, seed
 
+# ---------------------------
+# NEW: Text-to-Video Generation
+# ---------------------------
+
+# Initialize text-to-video pipeline
+t2v_pipe = LTXPipeline.from_pretrained("Skywork/SkyReels-V1-Hunyuan-T2V", torch_dtype=torch.bfloat16)
+t2v_pipe.to(device)
+
+def get_time_cost(run_task_time, time_cost_str):
+    now_time = int(time.time() * 1000)
+    if run_task_time == 0:
+        time_cost_str = 'start'
+    else:
+        if time_cost_str != '':
+            time_cost_str += '-->'
+        time_cost_str += f'{now_time - run_task_time}'
+    run_task_time = now_time
+    return run_task_time, time_cost_str
+
+@spaces.GPU(duration=60)
+def text_to_video(
+    prompt: str,
+    negative_prompt: str,
+    width: int = 768,
+    height: int = 512,
+    num_frames: int = 121,
+    frame_rate: int = 25,
+    num_inference_steps: int = 30,
+    seed: int = 8,
+    progress: gr.Progress = gr.Progress(),
+):
+    generator = torch.Generator(device=device).manual_seed(seed)
+    run_task_time = 0
+    time_cost_str = ''
+    run_task_time, time_cost_str = get_time_cost(run_task_time, time_cost_str)
+    try:
+        with torch.no_grad():
+            video = t2v_pipe(
+                prompt=prompt,
+                negative_prompt=negative_prompt,
+                generator=generator,
+                width=width,
+                height=height,
+                num_frames=num_frames,
+                num_inference_steps=num_inference_steps,
+            ).frames[0]
+    finally:
+        torch.cuda.empty_cache()
+        gc.collect()
+    run_task_time, time_cost_str = get_time_cost(run_task_time, time_cost_str)
+
+    output_path = tempfile.mktemp(suffix=".mp4")
+    export_to_video(video, output_path, fps=frame_rate)
+
+    del video
+    torch.cuda.empty_cache()
+    return output_path, time_cost_str
+
 # YOLO Object Detection Setup
 YOLO_MODEL_REPO = "strangerzonehf/Flux-Ultimate-LoRA-Collection"
 YOLO_CHECKPOINT_NAME = "images/demo.pt"
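One caveat in the new block above: tempfile.mktemp has been deprecated since Python 2.3 because the name it returns can be claimed by another process before the file is actually created. A sketch of a safer drop-in for the save step, assuming the same export_to_video helper from diffusers:

    import tempfile
    from diffusers.utils import export_to_video

    def save_video(frames, fps: int = 25) -> str:
        # NamedTemporaryFile creates the file atomically; delete=False keeps it
        # on disk afterwards so Gradio can serve it
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f:
            output_path = f.name
        export_to_video(frames, output_path, fps=fps)
        return output_path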
@@ -424,7 +488,7 @@ def detect_objects(image: np.ndarray):
 
     return Image.fromarray(annotated_image)
 
-# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, and @yolo commands
+# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, @yolo, and now @text2video commands
 
 @spaces.GPU
 def generate(
@@ -444,6 +508,7 @@ def generate(
     - "@web": triggers a web search or webpage visit.
     - "@rAgent": initiates a reasoning chain using a Llama model via an OpenAI-compatible API.
     - "@yolo": triggers object detection using YOLO.
+    - "@text2video": triggers text-to-video generation.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
@@ -539,6 +604,23 @@ def generate(
         yield gr.Image(result_img)
         return
 
+    # --- Text-to-Video Generation branch ---
+    if text.strip().lower().startswith("@text2video"):
+        # Expect the command to be: "@text2video <prompt> [|| <negative prompt>]"
+        command_body = text[len("@text2video"):].strip()
+        if "||" in command_body:
+            prompt_text, negative_prompt = command_body.split("||", 1)
+            prompt_text = prompt_text.strip()
+            negative_prompt = negative_prompt.strip()
+        else:
+            prompt_text = command_body
+            negative_prompt = "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly"
+        yield "🎞️ Generating video..."
+        video_path, time_cost_str = text_to_video(prompt_text, negative_prompt)
+        yield gr.Video(video_path)
+        yield f"Time cost by step (ms): {time_cost_str}"
+        return
+
     # --- Text and TTS branch ---
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
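The branch above splits on "||" at most once, so a command parses as in this worked example (hypothetical input, mirroring the committed logic):

    text = "@text2video a red fox in snow || blurry, low quality"
    command_body = text[len("@text2video"):].strip()
    prompt_text, negative_prompt = (part.strip() for part in command_body.split("||", 1))
    assert prompt_text == "a red fox in snow"
    assert negative_prompt == "blurry, low quality"

One subtlety worth noting: the prefix check runs on text.strip().lower() while the slice runs on the unstripped text, so leading whitespace before "@text2video" would misalign the slice.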
@@ -635,13 +717,14 @@ demo = gr.ChatInterface(
         ["@rAgent Explain how a binary search algorithm works."],
         ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
         ["@tts1 Explain Tower of Hanoi"],
+        ["@text2video A futuristic cityscape at dusk"],
     ],
     cache_examples=False,
     type="messages",
     description=DESCRIPTION,
     css=css,
     fill_height=True,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="@tts1-♀, @tts2-♂, @image-image gen, @3d-3d mesh gen, @rAgent-coding, @web-websearch, @yolo-object detection, default-{text gen}{image-text-text}"),
+    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="@tts1-♀, @tts2-♂, @image-image gen, @3d-3d mesh gen, @rAgent-coding, @web-websearch, @yolo-object detection, @text2video-video gen, default-{text gen}{image-text-text}"),
     stop_btn="Stop Generation",
     multimodal=True,
 )
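With these changes the Space accepts commands such as "@text2video A futuristic cityscape at dusk", or, with an explicit negative prompt, "@text2video A calm ocean at sunrise || blurry, low quality"; input without a recognized @-prefix still falls through to the default text/vision branch.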
 