Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -37,7 +37,6 @@ from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
|
|
37 |
from diffusers.utils import export_to_ply
|
38 |
|
39 |
os.system('pip install backoff')
|
40 |
-
|
41 |
# Global constants and helper functions
|
42 |
|
43 |
MAX_SEED = np.iinfo(np.int32).max
|
@@ -259,16 +258,7 @@ phi4_model = AutoModelForCausalLM.from_pretrained(
|
|
259 |
# ------------------------------------------------------------------------------
|
260 |
|
261 |
DESCRIPTION = """
|
262 |
-
# Agent Dino 🌠
|
263 |
-
This chatbot supports various commands:
|
264 |
-
- **@tts1 / @tts2:** text-to-speech
|
265 |
-
- **@image:** image generation
|
266 |
-
- **@3d:** 3D mesh generation
|
267 |
-
- **@web:** web search/visit
|
268 |
-
- **@rAgent:** reasoning chain
|
269 |
-
- **@yolo:** object detection
|
270 |
-
- **@phi4:** multimodal (image/audio) question answering
|
271 |
-
"""
|
272 |
|
273 |
css = '''
|
274 |
h1 {
|
@@ -582,14 +572,15 @@ def generate(
|
|
582 |
if not question:
|
583 |
yield "Error: Please provide a question after @phi4."
|
584 |
return
|
585 |
-
|
586 |
# Determine input type (Image or Audio) from the first file
|
587 |
input_file = files[0]
|
588 |
try:
|
|
|
589 |
if isinstance(input_file, Image.Image):
|
590 |
input_type = "Image"
|
591 |
file_for_phi4 = input_file
|
592 |
else:
|
|
|
593 |
try:
|
594 |
file_for_phi4 = Image.open(input_file)
|
595 |
input_type = "Image"
|
@@ -599,7 +590,7 @@ def generate(
|
|
599 |
except Exception:
|
600 |
input_type = "Audio"
|
601 |
file_for_phi4 = input_file
|
602 |
-
|
603 |
if input_type == "Image":
|
604 |
phi4_prompt = f'{phi4_user_prompt}<|image_1|>{question}{phi4_prompt_suffix}{phi4_assistant_prompt}'
|
605 |
inputs = phi4_processor(text=phi4_prompt, images=file_for_phi4, return_tensors='pt').to(phi4_model.device)
|
@@ -610,22 +601,20 @@ def generate(
|
|
610 |
else:
|
611 |
yield "Invalid file type for @phi4 multimodal processing."
|
612 |
return
|
613 |
-
|
614 |
with torch.no_grad():
|
615 |
generate_ids = phi4_model.generate(
|
616 |
**inputs,
|
617 |
max_new_tokens=200,
|
618 |
num_logits_to_keep=0,
|
619 |
-
streamer=streamer # Adding text streamer
|
620 |
)
|
621 |
-
|
622 |
-
|
623 |
-
|
624 |
-
|
625 |
-
|
626 |
-
|
627 |
-
|
628 |
-
|
629 |
|
630 |
# --- Text and TTS branch ---
|
631 |
tts_prefix = "@tts"
|
|
|
37 |
from diffusers.utils import export_to_ply
|
38 |
|
39 |
os.system('pip install backoff')
|
|
|
40 |
# Global constants and helper functions
|
41 |
|
42 |
MAX_SEED = np.iinfo(np.int32).max
|
|
|
258 |
# ------------------------------------------------------------------------------
|
259 |
|
260 |
DESCRIPTION = """
|
261 |
+
# Agent Dino 🌠"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
262 |
|
263 |
css = '''
|
264 |
h1 {
|
|
|
572 |
if not question:
|
573 |
yield "Error: Please provide a question after @phi4."
|
574 |
return
|
|
|
575 |
# Determine input type (Image or Audio) from the first file
|
576 |
input_file = files[0]
|
577 |
try:
|
578 |
+
# If file is already a PIL Image, treat as image
|
579 |
if isinstance(input_file, Image.Image):
|
580 |
input_type = "Image"
|
581 |
file_for_phi4 = input_file
|
582 |
else:
|
583 |
+
# Try opening as image; if it fails, assume audio
|
584 |
try:
|
585 |
file_for_phi4 = Image.open(input_file)
|
586 |
input_type = "Image"
|
|
|
590 |
except Exception:
|
591 |
input_type = "Audio"
|
592 |
file_for_phi4 = input_file
|
593 |
+
|
594 |
if input_type == "Image":
|
595 |
phi4_prompt = f'{phi4_user_prompt}<|image_1|>{question}{phi4_prompt_suffix}{phi4_assistant_prompt}'
|
596 |
inputs = phi4_processor(text=phi4_prompt, images=file_for_phi4, return_tensors='pt').to(phi4_model.device)
|
|
|
601 |
else:
|
602 |
yield "Invalid file type for @phi4 multimodal processing."
|
603 |
return
|
604 |
+
|
605 |
with torch.no_grad():
|
606 |
generate_ids = phi4_model.generate(
|
607 |
**inputs,
|
608 |
max_new_tokens=200,
|
609 |
num_logits_to_keep=0,
|
|
|
610 |
)
|
611 |
+
input_length = inputs['input_ids'].shape[1]
|
612 |
+
generate_ids = generate_ids[:, input_length:]
|
613 |
+
response = phi4_processor.batch_decode(
|
614 |
+
generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
|
615 |
+
)[0]
|
616 |
+
yield response
|
617 |
+
return
|
|
|
618 |
|
619 |
# --- Text and TTS branch ---
|
620 |
tts_prefix = "@tts"
|