prithivMLmods committed
Commit e685e73 · verified · 1 Parent(s): 95e0bed

Update app.py

Files changed (1): app.py (+90 -7)
app.py CHANGED
@@ -5,10 +5,11 @@ import json
 import time
 import asyncio
 import tempfile
-from threading import Thread
 import base64
 import shutil
 import re
+import gc
+from threading import Thread
 
 import gradio as gr
 import spaces
@@ -33,7 +34,10 @@ from transformers.image_utils import load_image
 
 from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
-from diffusers.utils import export_to_ply
+from diffusers.utils import export_to_ply, export_to_video
+
+# NEW IMPORTS FOR TEXT-TO-VIDEO
+from diffusers import LTXPipeline, LTXImageToVideoPipeline
 
 # Global constants and helper functions
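A note on the new imports: LTXImageToVideoPipeline is pulled in alongside LTXPipeline, but only the text-to-video pipeline is instantiated later in this commit. For reference, the stock diffusers text-to-video pattern these imports support looks like the following minimal sketch, shown here with the Lightricks/LTX-Video checkpoint from the diffusers docs purely for illustration (the commit itself points LTXPipeline at a SkyReels checkpoint):

    import torch
    from diffusers import LTXPipeline
    from diffusers.utils import export_to_video

    # Load the pipeline in bf16 and move it to the GPU
    pipe = LTXPipeline.from_pretrained("Lightricks/LTX-Video", torch_dtype=torch.bfloat16).to("cuda")

    # Generate a short clip and write it to disk
    frames = pipe(prompt="a waterfall in a lush forest", num_frames=65, num_inference_steps=30).frames[0]
    export_to_video(frames, "sample.mp4", fps=24)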
@@ -88,7 +92,7 @@ class Model:
         return mesh_path.name
 
     def run_text(self, prompt: str, seed: int = 0, guidance_scale: float = 15.0, num_steps: int = 64) -> str:
-        generator = torch.Generator(device=self.device).manual_seed(seed)
+        generator = torch.Generator(device=self.pipe.device).manual_seed(seed)
         images = self.pipe(
             prompt,
             generator=generator,
@@ -101,7 +105,7 @@ class Model:
         return self.to_glb(ply_path.name)
 
     def run_image(self, image: Image.Image, seed: int = 0, guidance_scale: float = 3.0, num_steps: int = 64) -> str:
-        generator = torch.Generator(device=self.device).manual_seed(seed)
+        generator = torch.Generator(device=self.pipe.device).manual_seed(seed)
         images = self.pipe_img(
             image,
             generator=generator,
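The two one-line changes above are small but meaningful: a torch.Generator is bound to a device, so deriving it from self.pipe.device (instead of a separately tracked self.device) keeps seeding valid wherever the pipeline actually lives. A minimal sketch of the reproducibility this provides, on CPU so it runs anywhere:

    import torch

    def make_noise(seed: int) -> torch.Tensor:
        # The generator must live on the same device as the tensors it seeds
        gen = torch.Generator(device="cpu").manual_seed(seed)
        return torch.randn(2, 2, generator=gen, device="cpu")

    # Same seed, same device -> bit-identical noise
    assert torch.equal(make_noise(0), make_noise(0))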
@@ -235,7 +239,9 @@ def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, t
 # Gradio UI configuration
 
 DESCRIPTION = """
-# Agent Dino 🌠 """
+# Agent Dino 🌠
+Your multimodal chatbot supporting text, image, 3D, web search, object detection, reasoning, and now text-to-video generation.
+"""
 
 css = '''
 h1 {
@@ -404,6 +410,64 @@ def generate_3d_fn(
     glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
     return glb_path, seed
 
+# ---------------------------
+# NEW: Text-to-Video Generation
+# ---------------------------
+
+# Initialize text-to-video pipeline
+t2v_pipe = LTXPipeline.from_pretrained("Skywork/SkyReels-V1-Hunyuan-T2V", torch_dtype=torch.bfloat16)
+t2v_pipe.to(device)
+
+def get_time_cost(run_task_time, time_cost_str):
+    now_time = int(time.time() * 1000)
+    if run_task_time == 0:
+        time_cost_str = 'start'
+    else:
+        if time_cost_str != '':
+            time_cost_str += '-->'
+        time_cost_str += f'{now_time - run_task_time}'
+    run_task_time = now_time
+    return run_task_time, time_cost_str
+
+@spaces.GPU(duration=60)
+def text_to_video(
+    prompt: str,
+    negative_prompt: str,
+    width: int = 768,
+    height: int = 512,
+    num_frames: int = 121,
+    frame_rate: int = 25,
+    num_inference_steps: int = 30,
+    seed: int = 8,
+    progress: gr.Progress = gr.Progress(),
+):
+    generator = torch.Generator(device=device).manual_seed(seed)
+    run_task_time = 0
+    time_cost_str = ''
+    run_task_time, time_cost_str = get_time_cost(run_task_time, time_cost_str)
+    try:
+        with torch.no_grad():
+            video = t2v_pipe(
+                prompt=prompt,
+                negative_prompt=negative_prompt,
+                generator=generator,
+                width=width,
+                height=height,
+                num_frames=num_frames,
+                num_inference_steps=num_inference_steps,
+            ).frames[0]
+    finally:
+        torch.cuda.empty_cache()
+        gc.collect()
+    run_task_time, time_cost_str = get_time_cost(run_task_time, time_cost_str)
+
+    output_path = tempfile.mktemp(suffix=".mp4")
+    export_to_video(video, output_path, fps=frame_rate)
+
+    del video
+    torch.cuda.empty_cache()
+    return output_path, time_cost_str
+
 # YOLO Object Detection Setup
 YOLO_MODEL_REPO = "strangerzonehf/Flux-Ultimate-LoRA-Collection"
 YOLO_CHECKPOINT_NAME = "images/demo.pt"
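One caveat in the new block above: tempfile.mktemp has been deprecated since Python 2.3 because the name it returns can be claimed by another process before the file is actually created. A sketch of a safer drop-in for the save step, assuming the same export_to_video helper from diffusers:

    import tempfile
    from diffusers.utils import export_to_video

    def save_video(frames, fps: int = 25) -> str:
        # NamedTemporaryFile creates the file atomically; delete=False keeps it
        # on disk afterwards so Gradio can serve it
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f:
            output_path = f.name
        export_to_video(frames, output_path, fps=fps)
        return output_path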
@@ -424,7 +488,7 @@ def detect_objects(image: np.ndarray):
 
     return Image.fromarray(annotated_image)
 
-# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, and @yolo commands
+# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, @yolo, and now @text2video commands
 
 @spaces.GPU
 def generate(
@@ -444,6 +508,7 @@ def generate(
     - "@web": triggers a web search or webpage visit.
     - "@rAgent": initiates a reasoning chain using a Llama model via an OpenAI-compatible API.
     - "@yolo": triggers object detection using YOLO.
+    - "@text2video": triggers text-to-video generation.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
@@ -539,6 +604,23 @@ def generate(
         yield gr.Image(result_img)
         return
 
+    # --- Text-to-Video Generation branch ---
+    if text.strip().lower().startswith("@text2video"):
+        # Expect the command to be: "@text2video <prompt> [|| <negative prompt>]"
+        command_body = text[len("@text2video"):].strip()
+        if "||" in command_body:
+            prompt_text, negative_prompt = command_body.split("||", 1)
+            prompt_text = prompt_text.strip()
+            negative_prompt = negative_prompt.strip()
+        else:
+            prompt_text = command_body
+            negative_prompt = "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly"
+        yield "🎞️ Generating video..."
+        video_path, time_cost_str = text_to_video(prompt_text, negative_prompt)
+        yield gr.Video(video_path)
+        yield f"Time cost by step (ms): {time_cost_str}"
+        return
+
     # --- Text and TTS branch ---
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
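The branch above splits on "||" at most once, so a command parses as in this worked example (hypothetical input, mirroring the committed logic):

    text = "@text2video a red fox in snow || blurry, low quality"
    command_body = text[len("@text2video"):].strip()
    prompt_text, negative_prompt = (part.strip() for part in command_body.split("||", 1))
    assert prompt_text == "a red fox in snow"
    assert negative_prompt == "blurry, low quality"

One subtlety worth noting: the prefix check runs on text.strip().lower() while the slice runs on the unstripped text, so leading whitespace before "@text2video" would misalign the slice.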
@@ -635,13 +717,14 @@ demo = gr.ChatInterface(
         ["@rAgent Explain how a binary search algorithm works."],
         ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
         ["@tts1 Explain Tower of Hanoi"],
+        ["@text2video A futuristic cityscape at dusk"],
     ],
     cache_examples=False,
     type="messages",
     description=DESCRIPTION,
     css=css,
     fill_height=True,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="@tts1-♀, @tts2-♂, @image-image gen, @3d-3d mesh gen, @rAgent-coding, @web-websearch, @yolo-object detection, default-{text gen}{image-text-text}"),
+    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="@tts1-♀, @tts2-♂, @image-image gen, @3d-3d mesh gen, @rAgent-coding, @web-websearch, @yolo-object detection, @text2video-video gen, default-{text gen}{image-text-text}"),
     stop_btn="Stop Generation",
     multimodal=True,
 )
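With these changes the Space accepts commands such as "@text2video A futuristic cityscape at dusk", or, with an explicit negative prompt, "@text2video A calm ocean at sunrise || blurry, low quality"; input without a recognized @-prefix still falls through to the default text/vision branch.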
 