Spaces: Running on Zero
Update app.py
Browse files
app.py CHANGED
@@ -35,6 +35,9 @@ from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
 from diffusers.utils import export_to_ply
 
+# Additional import for Phi-4 multimodality (audio support)
+import soundfile as sf
+
 # Global constants and helper functions
 
 MAX_SEED = np.iinfo(np.int32).max
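
The new soundfile import is what later decodes an uploaded file into a sample array plus sample rate for the Phi-4 audio branch. A minimal sketch of the call pattern (the path is illustrative):

import soundfile as sf

# sf.read returns a NumPy array of samples and the sample rate in Hz
audio, samplerate = sf.read("examples/harvard.wav")
print(audio.shape, samplerate)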

@@ -424,7 +427,60 @@ def detect_objects(image: np.ndarray):
 
     return Image.fromarray(annotated_image)
 
-#
+# ---------------------------
+# Phi-4 Multimodal Model Setup with Text Streaming
+# ---------------------------
+phi4_model_path = "microsoft/Phi-4-multimodal-instruct"
+
+phi4_processor = AutoProcessor.from_pretrained(phi4_model_path, trust_remote_code=True)
+phi4_model = AutoModelForCausalLM.from_pretrained(
+    phi4_model_path,
+    device_map="auto",
+    torch_dtype="auto",
+    trust_remote_code=True,
+    _attn_implementation="eager",
+)
+
+def process_phi4(input_type: str, file, question: str, max_new_tokens: int = 200):
+    """
+    Process an image or audio input with the Phi-4 multimodal model.
+    Uses a text streamer to yield incremental outputs.
+    Expects input_type to be either 'image' or 'audio'.
+    """
+    user_prompt = '<|user|>'
+    assistant_prompt = '<|assistant|>'
+    prompt_suffix = '<|end|>'
+
+    if not file or not question:
+        yield "Please upload a file and provide a question."
+        return
+
+    if input_type.lower() == "image":
+        prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
+        image = Image.open(file)
+        inputs = phi4_processor(text=prompt, images=image, return_tensors='pt').to(phi4_model.device)
+    elif input_type.lower() == "audio":
+        prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
+        audio, samplerate = sf.read(file)
+        inputs = phi4_processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(phi4_model.device)
+    else:
+        yield "Invalid input type selected."
+        return
+
+    # Setup text streamer using TextIteratorStreamer for incremental generation
+    streamer = TextIteratorStreamer(phi4_processor, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
+    thread = Thread(target=phi4_model.generate, kwargs=generation_kwargs)
+    thread.start()
+    buffer = ""
+    yield "🤔 Thinking..."
+    for new_text in streamer:
+        buffer += new_text
+        buffer = buffer.replace("<|im_end|>", "")
+        time.sleep(0.01)
+        yield buffer
+
+# Chat Generation Function with support for @tts, @image, @3d, @web, @ragent, @yolo, and now @phi4 commands
 
 @spaces.GPU
 def generate(
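
Since phi4_model.generate runs on a background Thread while TextIteratorStreamer yields decoded text, process_phi4 behaves as an ordinary Python generator. A minimal consumption sketch (the file and question are illustrative):

# Each yielded value is the full buffer so far, not a delta, which suits
# gr.ChatInterface streaming: every yield replaces the displayed message.
for partial in process_phi4("image", "examples/math.webp", "Solve the problem"):
    print(partial)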

@@ -442,8 +498,9 @@ def generate(
     - "@image": triggers image generation using the SDXL pipeline.
     - "@3d": triggers 3D model generation using the ShapE pipeline.
     - "@web": triggers a web search or webpage visit.
-    - "@
+    - "@ragent": initiates a reasoning chain using Llama mode.
     - "@yolo": triggers object detection using YOLO.
+    - **New:** "@phi4": processes image or audio inputs with the Phi-4 multimodal model and streams text output.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])

@@ -539,6 +596,24 @@ def generate(
         yield gr.Image(result_img)
         return
 
+    # --- Phi-4 Multimodal branch with text streaming ---
+    if text.strip().lower().startswith("@phi4"):
+        # Expected format: "@phi4 [image|audio] <your question>"
+        parts = text.strip().split(maxsplit=2)
+        if len(parts) < 3:
+            yield "Error: Please provide input type and a question. Format: '@phi4 [image|audio] <your question>'"
+            return
+        input_type = parts[1]
+        question = parts[2]
+        if not files or len(files) == 0:
+            yield "Error: Please attach an image or audio file for Phi-4 processing."
+            return
+        file_input = files[0]
+        yield "🔄 Processing multimodal input with Phi-4..."
+        for partial in process_phi4(input_type, file_input, question):
+            yield partial
+        return
+
     # --- Text and TTS branch ---
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
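
The @phi4 branch leans on str.split(maxsplit=2), so everything after the input-type token survives as a single question string. A small illustration of the expected format:

text = "@phi4 audio What is being said in this clip?"
parts = text.strip().split(maxsplit=2)
# parts == ['@phi4', 'audio', 'What is being said in this clip?']
input_type, question = parts[1], parts[2]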

@@ -627,12 +702,14 @@ demo = gr.ChatInterface(
         gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
     ],
     examples=[
+        [{"text": "@phi4 Solve the problem", "files": ["examples/math.webp"]}],
+        [{"text": "@phi4 Transcribe the audio to text.", "files": ["examples/harvard.wav"]}],
         ["@tts2 What causes rainbows to form?"],
         ["@image Chocolate dripping from a donut"],
         ["@3d A birthday cupcake with cherry"],
         [{"text": "Summarize the letter", "files": ["examples/1.png"]}],
         [{"text": "@yolo", "files": ["examples/yolo.jpeg"]}],
-        ["@
+        ["@ragent Explain how a binary search algorithm works."],
         ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
         ["@tts1 Explain Tower of Hanoi"],
     ],

@@ -641,7 +718,7 @@ demo = gr.ChatInterface(
     description=DESCRIPTION,
     css=css,
     fill_height=True,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="@tts1
+    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "audio"], file_count="multiple", placeholder="@tts1, @tts2, @image, @3d, @ragent, @web, @yolo, @phi4, or plain text"),
     stop_btn="Stop Generation",
     multimodal=True,
 )
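
With multimodal=True, the chat function receives a dict of the form {"text": str, "files": [paths]}, which is exactly what the @phi4 branch unpacks. A minimal standalone sketch of this wiring, assuming Gradio 4+ (the echo handler is hypothetical):

import gradio as gr

def echo(message, history):
    # message["text"] holds the typed query; message["files"] the attachments
    return f"Received {message['text']!r} with {len(message['files'])} file(s)"

demo = gr.ChatInterface(
    fn=echo,
    multimodal=True,
    textbox=gr.MultimodalTextbox(file_types=["image", "audio"], file_count="multiple"),
)
demo.launch()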