Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -5,10 +5,11 @@ import json
|
|
5 |
import time
|
6 |
import asyncio
|
7 |
import tempfile
|
8 |
-
from threading import Thread
|
9 |
import base64
|
10 |
import shutil
|
11 |
import re
|
|
|
|
|
12 |
|
13 |
import gradio as gr
|
14 |
import spaces
|
@@ -33,7 +34,10 @@ from transformers.image_utils import load_image
|
|
33 |
|
34 |
from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
|
35 |
from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
|
36 |
-
from diffusers.utils import export_to_ply
|
|
|
|
|
|
|
37 |
|
38 |
# Global constants and helper functions
|
39 |
|
@@ -88,7 +92,7 @@ class Model:
|
|
88 |
return mesh_path.name
|
89 |
|
90 |
def run_text(self, prompt: str, seed: int = 0, guidance_scale: float = 15.0, num_steps: int = 64) -> str:
|
91 |
-
generator = torch.Generator(device=self.device).manual_seed(seed)
|
92 |
images = self.pipe(
|
93 |
prompt,
|
94 |
generator=generator,
|
@@ -101,7 +105,7 @@ class Model:
|
|
101 |
return self.to_glb(ply_path.name)
|
102 |
|
103 |
def run_image(self, image: Image.Image, seed: int = 0, guidance_scale: float = 3.0, num_steps: int = 64) -> str:
|
104 |
-
generator = torch.Generator(device=self.device).manual_seed(seed)
|
105 |
images = self.pipe_img(
|
106 |
image,
|
107 |
generator=generator,
|
@@ -235,7 +239,9 @@ def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, t
|
|
235 |
# Gradio UI configuration
|
236 |
|
237 |
DESCRIPTION = """
|
238 |
-
# Agent Dino π
|
|
|
|
|
239 |
|
240 |
css = '''
|
241 |
h1 {
|
@@ -404,6 +410,64 @@ def generate_3d_fn(
|
|
404 |
glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
|
405 |
return glb_path, seed
|
406 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
407 |
# YOLO Object Detection Setup
|
408 |
YOLO_MODEL_REPO = "strangerzonehf/Flux-Ultimate-LoRA-Collection"
|
409 |
YOLO_CHECKPOINT_NAME = "images/demo.pt"
|
@@ -424,7 +488,7 @@ def detect_objects(image: np.ndarray):
|
|
424 |
|
425 |
return Image.fromarray(annotated_image)
|
426 |
|
427 |
-
# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, and @
|
428 |
|
429 |
@spaces.GPU
|
430 |
def generate(
|
@@ -444,6 +508,7 @@ def generate(
|
|
444 |
- "@web": triggers a web search or webpage visit.
|
445 |
- "@rAgent": initiates a reasoning chain using Llama mode OpenAI.
|
446 |
- "@yolo": triggers object detection using YOLO.
|
|
|
447 |
"""
|
448 |
text = input_dict["text"]
|
449 |
files = input_dict.get("files", [])
|
@@ -539,6 +604,23 @@ def generate(
|
|
539 |
yield gr.Image(result_img)
|
540 |
return
|
541 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
542 |
# --- Text and TTS branch ---
|
543 |
tts_prefix = "@tts"
|
544 |
is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
|
@@ -635,13 +717,14 @@ demo = gr.ChatInterface(
|
|
635 |
["@rAgent Explain how a binary search algorithm works."],
|
636 |
["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
|
637 |
["@tts1 Explain Tower of Hanoi"],
|
|
|
638 |
],
|
639 |
cache_examples=False,
|
640 |
type="messages",
|
641 |
description=DESCRIPTION,
|
642 |
css=css,
|
643 |
fill_height=True,
|
644 |
-
textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="@tts1-β, @tts2-β, @image-image gen, @3d-3d mesh gen, @rAgent-coding, @web-websearch, @yolo-object detection, default-{text gen}{image-text-text}"),
|
645 |
stop_btn="Stop Generation",
|
646 |
multimodal=True,
|
647 |
)
|
|
|
5 |
import time
|
6 |
import asyncio
|
7 |
import tempfile
|
|
|
8 |
import base64
|
9 |
import shutil
|
10 |
import re
|
11 |
+
import gc
|
12 |
+
from threading import Thread
|
13 |
|
14 |
import gradio as gr
|
15 |
import spaces
|
|
|
34 |
|
35 |
from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
|
36 |
from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
|
37 |
+
from diffusers.utils import export_to_ply, export_to_video
|
38 |
+
|
39 |
+
# NEW IMPORTS FOR TEXT-TO-VIDEO
|
40 |
+
from diffusers import LTXPipeline, LTXImageToVideoPipeline
|
41 |
|
42 |
# Global constants and helper functions
|
43 |
|
|
|
92 |
return mesh_path.name
|
93 |
|
94 |
def run_text(self, prompt: str, seed: int = 0, guidance_scale: float = 15.0, num_steps: int = 64) -> str:
|
95 |
+
generator = torch.Generator(device=self.pipe.device).manual_seed(seed)
|
96 |
images = self.pipe(
|
97 |
prompt,
|
98 |
generator=generator,
|
|
|
105 |
return self.to_glb(ply_path.name)
|
106 |
|
107 |
def run_image(self, image: Image.Image, seed: int = 0, guidance_scale: float = 3.0, num_steps: int = 64) -> str:
|
108 |
+
generator = torch.Generator(device=self.pipe.device).manual_seed(seed)
|
109 |
images = self.pipe_img(
|
110 |
image,
|
111 |
generator=generator,
|
|
|
239 |
# Gradio UI configuration
|
240 |
|
241 |
DESCRIPTION = """
|
242 |
+
# Agent Dino π
|
243 |
+
Your multimodal chatbot supporting text, image, 3D, web search, object detection, reasoning, and now text-to-video generation.
|
244 |
+
"""
|
245 |
|
246 |
css = '''
|
247 |
h1 {
|
|
|
410 |
glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
|
411 |
return glb_path, seed
|
412 |
|
413 |
+
# ---------------------------
|
414 |
+
# NEW: Text-to-Video Generation
|
415 |
+
# ---------------------------
|
416 |
+
|
417 |
+
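# Initialize the text-to-video pipeline once, at import time (module-level side effect).
# NOTE(review): the loader class is LTXPipeline but the checkpoint name says "Hunyuan" - confirm this repo is actually loadable by LTXPipeline.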
|
418 |
+
t2v_pipe = LTXPipeline.from_pretrained("Skywork/SkyReels-V1-Hunyuan-T2V", torch_dtype=torch.bfloat16)
|
419 |
+
t2v_pipe.to(device)
|
420 |
+
|
421 |
+
def get_time_cost(run_task_time, time_cost_str):
    """Record a timing checkpoint and append the elapsed time to a trace string.

    Args:
        run_task_time: Timestamp of the previous checkpoint in milliseconds,
            or 0 to start a new trace.
        time_cost_str: Trace accumulated so far. Ignored (reset to 'start')
            when ``run_task_time`` is 0.

    Returns:
        A ``(now_ms, trace)`` tuple: the current wall-clock time in
        milliseconds, and the updated trace. Elapsed segments are joined
        with '-->', e.g. ``'start-->1234'``.
    """
    now_time = int(time.time() * 1000)
    if run_task_time == 0:
        # First checkpoint: nothing to measure against yet.
        time_cost_str = 'start'
    elif time_cost_str != '':
        time_cost_str += f'-->{now_time - run_task_time}'
    else:
        # Empty trace with a non-zero start time: no separator is emitted.
        time_cost_str = f'{now_time - run_task_time}'
    return now_time, time_cost_str
|
431 |
+
|
432 |
+
@spaces.GPU(duration=60)
def text_to_video(
    prompt: str,
    negative_prompt: str,
    width: int = 768,
    height: int = 512,
    num_frames: int = 121,
    frame_rate: int = 25,
    num_inference_steps: int = 30,
    seed: int = 8,
    progress: gr.Progress = gr.Progress(),
):
    """Generate a video from a text prompt and write it to a temporary MP4.

    Args:
        prompt: Text description passed to the pipeline.
        negative_prompt: Negative prompt passed to the pipeline.
        width: Frame width passed to the pipeline.
        height: Frame height passed to the pipeline.
        num_frames: Number of frames requested from the pipeline.
        frame_rate: Frames per second used when exporting the video file.
        num_inference_steps: Number of inference steps passed to the pipeline.
        seed: Seed for the torch generator.
        progress: Not referenced in the body; presumably kept so Gradio can
            attach progress tracking - TODO confirm.

    Returns:
        A ``(output_path, time_cost_str)`` tuple: the path of the exported
        ``.mp4`` file and the timing trace produced by ``get_time_cost``.
    """
    generator = torch.Generator(device=device).manual_seed(seed)
    run_task_time, time_cost_str = get_time_cost(0, '')
    try:
        with torch.no_grad():
            video = t2v_pipe(
                prompt=prompt,
                negative_prompt=negative_prompt,
                generator=generator,
                width=width,
                height=height,
                num_frames=num_frames,
                num_inference_steps=num_inference_steps,
            ).frames[0]
    finally:
        # Collect garbage first so tensors freed by the collector can be
        # released from the CUDA cache in the same pass. Runs on failure too.
        gc.collect()
        torch.cuda.empty_cache()
    run_task_time, time_cost_str = get_time_cost(run_task_time, time_cost_str)

    # tempfile.mktemp() only returns a name and is race-prone/deprecated;
    # create the file atomically and keep it (delete=False) for the exporter
    # to overwrite.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_file:
        output_path = tmp_file.name
    export_to_video(video, output_path, fps=frame_rate)

    # Drop the frames before returning so the memory can be reclaimed.
    del video
    torch.cuda.empty_cache()
    return output_path, time_cost_str
|
470 |
+
|
471 |
# YOLO Object Detection Setup
|
472 |
YOLO_MODEL_REPO = "strangerzonehf/Flux-Ultimate-LoRA-Collection"
|
473 |
YOLO_CHECKPOINT_NAME = "images/demo.pt"
|
|
|
488 |
|
489 |
return Image.fromarray(annotated_image)
|
490 |
|
491 |
+
# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, @yolo, and now @text2video commands
|
492 |
|
493 |
@spaces.GPU
|
494 |
def generate(
|
|
|
508 |
- "@web": triggers a web search or webpage visit.
|
509 |
- "@rAgent": initiates a reasoning chain using Llama mode OpenAI.
|
510 |
- "@yolo": triggers object detection using YOLO.
|
511 |
+
- "@text2video": triggers text-to-video generation.
|
512 |
"""
|
513 |
text = input_dict["text"]
|
514 |
files = input_dict.get("files", [])
|
|
|
604 |
yield gr.Image(result_img)
|
605 |
return
|
606 |
|
607 |
+
# --- Text-to-Video Generation branch ---
|
608 |
+
if text.strip().lower().startswith("@text2video"):
|
609 |
+
# Expect the command to be: "@text2video <prompt> [|| <negative prompt>]"
|
610 |
+
command_body = text[len("@text2video"):].strip()
|
611 |
+
if "||" in command_body:
|
612 |
+
prompt_text, negative_prompt = command_body.split("||", 1)
|
613 |
+
prompt_text = prompt_text.strip()
|
614 |
+
negative_prompt = negative_prompt.strip()
|
615 |
+
else:
|
616 |
+
prompt_text = command_body
|
617 |
+
negative_prompt = "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly"
|
618 |
+
yield "ποΈ Generating video..."
|
619 |
+
video_path, time_cost_str = text_to_video(prompt_text, negative_prompt)
|
620 |
+
yield gr.Video(video_path)
|
621 |
+
yield f"Time cost by step (ms): {time_cost_str}"
|
622 |
+
return
|
623 |
+
|
624 |
# --- Text and TTS branch ---
|
625 |
tts_prefix = "@tts"
|
626 |
is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
|
|
|
717 |
["@rAgent Explain how a binary search algorithm works."],
|
718 |
["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
|
719 |
["@tts1 Explain Tower of Hanoi"],
|
720 |
+
["@text2video A futuristic cityscape at dusk"],
|
721 |
],
|
722 |
cache_examples=False,
|
723 |
type="messages",
|
724 |
description=DESCRIPTION,
|
725 |
css=css,
|
726 |
fill_height=True,
|
727 |
+
textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="@tts1-β, @tts2-β, @image-image gen, @3d-3d mesh gen, @rAgent-coding, @web-websearch, @yolo-object detection, @text2video-video gen, default-{text gen}{image-text-text}"),
|
728 |
stop_btn="Stop Generation",
|
729 |
multimodal=True,
|
730 |
)
|