prithivMLmods committed on
Commit
fcf45c6
·
verified ·
1 Parent(s): e680658

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -23
app.py CHANGED
@@ -37,7 +37,6 @@ from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
37
  from diffusers.utils import export_to_ply
38
 
39
  os.system('pip install backoff')
40
-
41
  # Global constants and helper functions
42
 
43
  MAX_SEED = np.iinfo(np.int32).max
@@ -259,16 +258,7 @@ phi4_model = AutoModelForCausalLM.from_pretrained(
259
  # ------------------------------------------------------------------------------
260
 
261
  DESCRIPTION = """
262
- # Agent Dino 🌠
263
- This chatbot supports various commands:
264
- - **@tts1 / @tts2:** text-to-speech
265
- - **@image:** image generation
266
- - **@3d:** 3D mesh generation
267
- - **@web:** web search/visit
268
- - **@rAgent:** reasoning chain
269
- - **@yolo:** object detection
270
- - **@phi4:** multimodal (image/audio) question answering
271
- """
272
 
273
  css = '''
274
  h1 {
@@ -582,14 +572,15 @@ def generate(
582
  if not question:
583
  yield "Error: Please provide a question after @phi4."
584
  return
585
-
586
  # Determine input type (Image or Audio) from the first file
587
  input_file = files[0]
588
  try:
 
589
  if isinstance(input_file, Image.Image):
590
  input_type = "Image"
591
  file_for_phi4 = input_file
592
  else:
 
593
  try:
594
  file_for_phi4 = Image.open(input_file)
595
  input_type = "Image"
@@ -599,7 +590,7 @@ def generate(
599
  except Exception:
600
  input_type = "Audio"
601
  file_for_phi4 = input_file
602
-
603
  if input_type == "Image":
604
  phi4_prompt = f'{phi4_user_prompt}<|image_1|>{question}{phi4_prompt_suffix}{phi4_assistant_prompt}'
605
  inputs = phi4_processor(text=phi4_prompt, images=file_for_phi4, return_tensors='pt').to(phi4_model.device)
@@ -610,22 +601,20 @@ def generate(
610
  else:
611
  yield "Invalid file type for @phi4 multimodal processing."
612
  return
613
-
614
  with torch.no_grad():
615
  generate_ids = phi4_model.generate(
616
  **inputs,
617
  max_new_tokens=200,
618
  num_logits_to_keep=0,
619
- streamer=streamer # Adding text streamer
620
  )
621
-
622
- buffer = "⚛️ phi4 multimodal is initiated, hold tight"
623
- for new_text in streamer:
624
- buffer += new_text
625
- buffer = buffer.replace("<|im_end|>", "")
626
- time.sleep(0.01)
627
- yield buffer
628
-
629
 
630
  # --- Text and TTS branch ---
631
  tts_prefix = "@tts"
 
37
  from diffusers.utils import export_to_ply
38
 
39
  os.system('pip install backoff')
 
40
  # Global constants and helper functions
41
 
42
  MAX_SEED = np.iinfo(np.int32).max
 
258
  # ------------------------------------------------------------------------------
259
 
260
  DESCRIPTION = """
261
+ # Agent Dino 🌠"""
 
 
 
 
 
 
 
 
 
262
 
263
  css = '''
264
  h1 {
 
572
  if not question:
573
  yield "Error: Please provide a question after @phi4."
574
  return
 
575
  # Determine input type (Image or Audio) from the first file
576
  input_file = files[0]
577
  try:
578
+ # If file is already a PIL Image, treat as image
579
  if isinstance(input_file, Image.Image):
580
  input_type = "Image"
581
  file_for_phi4 = input_file
582
  else:
583
+ # Try opening as image; if it fails, assume audio
584
  try:
585
  file_for_phi4 = Image.open(input_file)
586
  input_type = "Image"
 
590
  except Exception:
591
  input_type = "Audio"
592
  file_for_phi4 = input_file
593
+
594
  if input_type == "Image":
595
  phi4_prompt = f'{phi4_user_prompt}<|image_1|>{question}{phi4_prompt_suffix}{phi4_assistant_prompt}'
596
  inputs = phi4_processor(text=phi4_prompt, images=file_for_phi4, return_tensors='pt').to(phi4_model.device)
 
601
  else:
602
  yield "Invalid file type for @phi4 multimodal processing."
603
  return
604
+
605
  with torch.no_grad():
606
  generate_ids = phi4_model.generate(
607
  **inputs,
608
  max_new_tokens=200,
609
  num_logits_to_keep=0,
 
610
  )
611
+ input_length = inputs['input_ids'].shape[1]
612
+ generate_ids = generate_ids[:, input_length:]
613
+ response = phi4_processor.batch_decode(
614
+ generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
615
+ )[0]
616
+ yield response
617
+ return
 
618
 
619
  # --- Text and TTS branch ---
620
  tts_prefix = "@tts"