ariG23498 (HF staff) committed on
Commit d2c4dcd · 1 Parent(s): 39d7a6f
Files changed (1)
  1. app.py +65 -76
app.py CHANGED
@@ -3,6 +3,7 @@ from PIL import Image
 import torch
 import soundfile as sf
 from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
+from urllib.request import urlopen
 import spaces
 
 # Define model path
@@ -23,37 +24,27 @@ user_prompt = '<|user|>'
 assistant_prompt = '<|assistant|>'
 prompt_suffix = '<|end|>'
 
-# Define inference functions for each input type
+# Define inference function
 @spaces.GPU
-def process_image(image, question):
-    if not image or not question:
-        return "Please upload an image and provide a question."
-
-    prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
-    inputs = processor(text=prompt, images=image, return_tensors='pt').to(model.device)
-
-    with torch.no_grad():
-        generate_ids = model.generate(
-            **inputs,
-            max_new_tokens=200,
-            num_logits_to_keep=0,
-        )
-    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
-    response = processor.batch_decode(
-        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
-    )[0]
-
-    return response
+def process_input(input_type, file, question):
+    if not file or not question:
+        return "Please upload a file and provide a question."
 
-@spaces.GPU
-def process_audio(audio, question):
-    if not audio or not question:
-        return "Please upload an audio file and provide a question."
-
-    prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
-    samplerate, audio_data = audio  # Gradio Audio returns (samplerate, data)
-    inputs = processor(text=prompt, audios=[(audio_data, samplerate)], return_tensors='pt').to(model.device)
-
+    # Prepare the prompt
+    if input_type == "Image":
+        prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
+        # Open image from uploaded file
+        image = Image.open(file)
+        inputs = processor(text=prompt, images=image, return_tensors='pt').to(model.device)
+    elif input_type == "Audio":
+        prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
+        # Read audio from uploaded file
+        audio, samplerate = sf.read(file)
+        inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(model.device)
+    else:
+        return "Invalid input type selected."
+
+    # Generate response
     with torch.no_grad():
         generate_ids = model.generate(
             **inputs,
@@ -64,7 +55,7 @@ def process_audio(audio, question):
     response = processor.batch_decode(
         generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
-
+
     return response
 
 # Gradio interface
@@ -79,59 +70,57 @@ with gr.Blocks(
     gr.Markdown(
         """
         # Phi-4 Multimodal Demo
-        Select a tab below to upload an **image** or **audio** file, ask a question, and get a response from the model!
+        Upload an **image** or **audio** file, ask a question, and get a response from the model!
         Built with the `microsoft/Phi-4-multimodal-instruct` model by xAI.
         """
     )
 
-    with gr.Tabs():
-        # Image Tab
-        with gr.TabItem("Image"):
-            with gr.Row():
-                with gr.Column(scale=1):
-                    image_input = gr.Image(label="Upload Your Image", type="pil")
-                    image_question = gr.Textbox(
-                        label="Your Question",
-                        placeholder="e.g., 'What is shown in this image?'",
-                        lines=2,
-                    )
-                    image_submit = gr.Button("Submit", variant="primary")
-                with gr.Column(scale=2):
-                    image_output = gr.Textbox(
-                        label="Model Response",
-                        placeholder="Response will appear here...",
-                        lines=10,
-                        interactive=False,
-                    )
-            image_submit.click(
-                fn=process_image,
-                inputs=[image_input, image_question],
-                outputs=image_output,
+    with gr.Row():
+        with gr.Column(scale=1):
+            input_type = gr.Radio(
+                choices=["Image", "Audio"],
+                label="Select Input Type",
+                value="Image",
             )
+            file_input = gr.File(
+                label="Upload Your File",
+                file_types=["image", "audio"],
+            )
+            question_input = gr.Textbox(
+                label="Your Question",
+                placeholder="e.g., 'What is shown in this image?' or 'Transcribe this audio.'",
+                lines=2,
+            )
+            submit_btn = gr.Button("Submit", variant="primary")
 
-        # Audio Tab
-        with gr.TabItem("Audio"):
-            with gr.Row():
-                with gr.Column(scale=1):
-                    audio_input = gr.Audio(label="Upload Your Audio", type="numpy")
-                    audio_question = gr.Textbox(
-                        label="Your Question",
-                        placeholder="e.g., 'Transcribe this audio.'",
-                        lines=2,
-                    )
-                    audio_submit = gr.Button("Submit", variant="primary")
-                with gr.Column(scale=2):
-                    audio_output = gr.Textbox(
-                        label="Model Response",
-                        placeholder="Response will appear here...",
-                        lines=10,
-                        interactive=False,
-                    )
-            audio_submit.click(
-                fn=process_audio,
-                inputs=[audio_input, audio_question],
-                outputs=audio_output,
+        with gr.Column(scale=2):
+            output_text = gr.Textbox(
+                label="Model Response",
+                placeholder="Response will appear here...",
+                lines=10,
+                interactive=False,
             )
 
+    # Example section
+    with gr.Accordion("Examples", open=False):
+        gr.Markdown("Try these examples:")
+        gr.Examples(
+            examples=[
+                ["Image", "https://www.ilankelman.org/stopsigns/australia.jpg", "What is shown in this image?"],
+                ["Audio", "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac", "Transcribe the audio to text."],
+            ],
+            inputs=[input_type, file_input, question_input],
+            outputs=output_text,
+            fn=process_input,
+            cache_examples=False,
+        )
+
+    # Connect the submit button
+    submit_btn.click(
+        fn=process_input,
+        inputs=[input_type, file_input, question_input],
+        outputs=output_text,
+    )
+
 # Launch the demo
 demo.launch()
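
For reference, a minimal sketch of how the refactored process_input function could be exercised outside the Gradio UI, assuming the model, processor, and prompt constants are loaded as in app.py; the local file paths are illustrative placeholders and are not part of the commit.

# Illustrative sketch (not part of the commit): call process_input() directly,
# bypassing the Gradio interface. Assumes model, processor, and the prompt
# constants are already defined as in app.py; the file paths are placeholders.
if __name__ == "__main__":
    print(process_input("Image", "example.jpg", "What is shown in this image?"))
    print(process_input("Audio", "example.flac", "Transcribe the audio to text."))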