Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -18,6 +18,10 @@ from PIL import Image
|
|
| 18 |
import edge_tts
|
| 19 |
import trimesh
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
from transformers import (
|
| 22 |
AutoModelForCausalLM,
|
| 23 |
AutoTokenizer,
|
|
@@ -400,7 +404,27 @@ def generate_3d_fn(
|
|
| 400 |
glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
|
| 401 |
return glb_path, seed
|
| 402 |
|
| 403 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 404 |
|
| 405 |
@spaces.GPU
|
| 406 |
def generate(
|
|
@@ -419,6 +443,7 @@ def generate(
|
|
| 419 |
- "@3d": triggers 3D model generation using the ShapE pipeline.
|
| 420 |
- "@web": triggers a web search or webpage visit.
|
| 421 |
- "@rAgent": initiates a reasoning chain using Llama mode OpenAI.
|
|
|
|
| 422 |
"""
|
| 423 |
text = input_dict["text"]
|
| 424 |
files = input_dict.get("files", [])
|
|
@@ -493,6 +518,27 @@ def generate(
|
|
| 493 |
yield partial
|
| 494 |
return
|
| 495 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 496 |
# --- Text and TTS branch ---
|
| 497 |
tts_prefix = "@tts"
|
| 498 |
is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
|
|
@@ -583,18 +629,17 @@ demo = gr.ChatInterface(
|
|
| 583 |
examples=[
|
| 584 |
["@tts2 What causes rainbows to form?"],
|
| 585 |
["@3d A birthday cupcake with cherry"],
|
| 586 |
-
[{"text": "
|
| 587 |
-
["@image Chocolate dripping from a donut
|
| 588 |
["@rAgent Explain how a binary search algorithm works."],
|
| 589 |
["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
|
| 590 |
-
|
| 591 |
],
|
| 592 |
cache_examples=False,
|
| 593 |
type="messages",
|
| 594 |
description=DESCRIPTION,
|
| 595 |
css=css,
|
| 596 |
fill_height=True,
|
| 597 |
-
textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="@tts1-♀, @tts2-♂, @image-image gen, @3d-3d mesh gen, @rAgent-coding, @web-websearch, default-{text gen}{image-text-text}"),
|
| 598 |
stop_btn="Stop Generation",
|
| 599 |
multimodal=True,
|
| 600 |
)
|
|
|
|
| 18 |
import edge_tts
|
| 19 |
import trimesh
|
| 20 |
|
| 21 |
+
import supervision as sv
|
| 22 |
+
from ultralytics import YOLO as YOLODetector
|
| 23 |
+
from huggingface_hub import hf_hub_download
|
| 24 |
+
|
| 25 |
from transformers import (
|
| 26 |
AutoModelForCausalLM,
|
| 27 |
AutoTokenizer,
|
|
|
|
| 404 |
glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
|
| 405 |
return glb_path, seed
|
| 406 |
|
| 407 |
+
# YOLO Object Detection Setup
|
| 408 |
+
YOLO_MODEL_REPO = "strangerzonehf/Flux-Ultimate-LoRA-Collection"
|
| 409 |
+
YOLO_CHECKPOINT_NAME = "images/demo.pt"
|
| 410 |
+
yolo_model_path = hf_hub_download(repo_id=YOLO_MODEL_REPO, filename=YOLO_CHECKPOINT_NAME)
|
| 411 |
+
yolo_detector = YOLODetector(yolo_model_path)
|
| 412 |
+
|
| 413 |
+
def detect_objects(image: np.ndarray) -> "Image.Image":
    """Run YOLO object detection on an image and return an annotated copy.

    Args:
        image: Pixel data in PIL channel order, e.g. the result of
            ``np.array(pil_image)``. Grayscale, palette and RGBA inputs are
            accepted; they are normalised to 3-channel RGB first.

    Returns:
        A PIL image (RGB) with bounding boxes and labels drawn on it.
        The input array is never modified.
    """
    # Uploaded files may be RGBA, grayscale or palette images; collapse them
    # to 3-channel RGB so the detector always receives an HxWx3 array.
    # np.array(...) makes a private, writable copy that is annotated below.
    rgb_image = np.array(Image.fromarray(image).convert("RGB"))

    # Ultralytics documents NumPy sources as HWC arrays in BGR channel order,
    # so flip the channels for inference only. Box coordinates are unaffected.
    bgr_image = np.ascontiguousarray(rgb_image[..., ::-1])
    results = yolo_detector(bgr_image, verbose=False)[0]
    detections = sv.Detections.from_ultralytics(results).with_nms()

    box_annotator = sv.BoxAnnotator()
    label_annotator = sv.LabelAnnotator()

    # Draw on the RGB pixels so Image.fromarray below yields correct colours.
    annotated_image = box_annotator.annotate(scene=rgb_image, detections=detections)
    annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections)

    return Image.fromarray(annotated_image)
|
| 426 |
+
|
| 427 |
+
# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, and @yolo commands
|
| 428 |
|
| 429 |
@spaces.GPU
|
| 430 |
def generate(
|
|
|
|
| 443 |
- "@3d": triggers 3D model generation using the ShapE pipeline.
|
| 444 |
- "@web": triggers a web search or webpage visit.
|
| 445 |
- "@rAgent": initiates a reasoning chain using Llama mode OpenAI.
|
| 446 |
+
- "@yolo": triggers object detection using YOLO.
|
| 447 |
"""
|
| 448 |
text = input_dict["text"]
|
| 449 |
files = input_dict.get("files", [])
|
|
|
|
| 518 |
yield partial
|
| 519 |
return
|
| 520 |
|
| 521 |
+
# --- YOLO Object Detection branch ---
|
| 522 |
+
if text.strip().lower().startswith("@yolo"):
|
| 523 |
+
yield "🔍 Running object detection with YOLO..."
|
| 524 |
+
if not files or len(files) == 0:
|
| 525 |
+
yield "Error: Please attach an image for YOLO object detection."
|
| 526 |
+
return
|
| 527 |
+
# Use the first attached image
|
| 528 |
+
input_file = files[0]
|
| 529 |
+
try:
|
| 530 |
+
if isinstance(input_file, str):
|
| 531 |
+
pil_image = Image.open(input_file)
|
| 532 |
+
else:
|
| 533 |
+
pil_image = input_file
|
| 534 |
+
except Exception as e:
|
| 535 |
+
yield f"Error loading image: {str(e)}"
|
| 536 |
+
return
|
| 537 |
+
np_image = np.array(pil_image)
|
| 538 |
+
result_img = detect_objects(np_image)
|
| 539 |
+
yield gr.Image(result_img)
|
| 540 |
+
return
|
| 541 |
+
|
| 542 |
# --- Text and TTS branch ---
|
| 543 |
tts_prefix = "@tts"
|
| 544 |
is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
|
|
|
|
| 629 |
examples=[
|
| 630 |
["@tts2 What causes rainbows to form?"],
|
| 631 |
["@3d A birthday cupcake with cherry"],
|
| 632 |
+
[{"text": "Summarize the letter", "files": ["examples/1.png"]}],
|
| 633 |
+
["@image Chocolate dripping from a donut"],
|
| 634 |
["@rAgent Explain how a binary search algorithm works."],
|
| 635 |
["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
|
|
|
|
| 636 |
],
|
| 637 |
cache_examples=False,
|
| 638 |
type="messages",
|
| 639 |
description=DESCRIPTION,
|
| 640 |
css=css,
|
| 641 |
fill_height=True,
|
| 642 |
+
textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="@tts1-♀, @tts2-♂, @image-image gen, @3d-3d mesh gen, @rAgent-coding, @web-websearch, @yolo-object detection, default-{text gen}{image-text-text}"),
|
| 643 |
stop_btn="Stop Generation",
|
| 644 |
multimodal=True,
|
| 645 |
)
|