ariG23498 (HF staff) committed
Commit 97e7627 · 1 Parent(s): 22dc319
Files changed (1):
  1. app.py +97 -40
app.py CHANGED
@@ -1,8 +1,10 @@
 import gradio as gr
-import soundfile as sf
 from PIL import Image
-import spaces
+import torch
+import soundfile as sf
 from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
+from urllib.request import urlopen
+import spaces
 
 # Define model path
 model_path = "microsoft/Phi-4-multimodal-instruct"
@@ -10,12 +12,14 @@ model_path = "microsoft/Phi-4-multimodal-instruct"
 # Load model and processor
 processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
 model = AutoModelForCausalLM.from_pretrained(
-    model_path,
-    device_map="cuda",
-    torch_dtype="auto",
-    trust_remote_code=True,
+    model_path,
+    device_map="cuda",
+    torch_dtype="auto",
+    trust_remote_code=True,
+    attn_implementation="eager",  # Changed from 'flash_attention_2' to 'eager'
 ).cuda()
 
+# Load generation config
 generation_config = GenerationConfig.from_pretrained(model_path)
 
 # Define prompt structure
@@ -23,51 +27,104 @@ user_prompt = '<|user|>'
 assistant_prompt = '<|assistant|>'
 prompt_suffix = '<|end|>'
 
+# Define inference function
 @spaces.GPU
-def process_multimodal(input_file, query):
-    if input_file is None:
-        return "Please upload an image or an audio file."
-
-    file_type = input_file.type
-    prompt = f"{user_prompt}<|media_1|>{query}{prompt_suffix}{assistant_prompt}"
-
-    if "image" in file_type:
-        image = Image.open(input_file)
+def process_input(input_type, file, question):
+    if not file or not question:
+        return "Please upload a file and provide a question."
+
+    # Prepare the prompt
+    if input_type == "Image":
+        prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
+        # Open image from uploaded file
+        image = Image.open(file)
         inputs = processor(text=prompt, images=image, return_tensors='pt').to('cuda:0')
-    elif "audio" in file_type:
-        audio, samplerate = sf.read(input_file.name)
+    elif input_type == "Audio":
+        prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
+        # Read audio from uploaded file
+        audio, samplerate = sf.read(file)
         inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to('cuda:0')
     else:
-        return "Unsupported file format. Please upload an image or audio file."
-
-    generate_ids = model.generate(
-        **inputs,
-        max_new_tokens=1000,
-        generation_config=generation_config,
-        num_logits_to_keep=0,
-    )
+        return "Invalid input type selected."
+
+    # Generate response
+    with torch.no_grad():
+        generate_ids = model.generate(
+            **inputs,
+            max_new_tokens=1000,
+            generation_config=generation_config,
+            num_logits_to_keep=0,
+        )
     generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
     response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
     )[0]
-
+
     return response
 
-with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("""
-    # Phi-4 Multimodal Chat
-    Upload an image or an audio file and ask questions related to it!
-    """)
+# Gradio interface
+with gr.Blocks(
+    title="Phi-4 Multimodal Demo",
+    theme=gr.themes.Soft(
+        primary_hue="blue",
+        secondary_hue="gray",
+        radius_size="lg",
+    ),
+) as demo:
+    gr.Markdown(
+        """
+        # Phi-4 Multimodal Demo
+        Upload an **image** or **audio** file, ask a question, and get a response from the model!
+        Built with the `microsoft/Phi-4-multimodal-instruct` model by Microsoft.
+        """
+    )
 
     with gr.Row():
-        with gr.Column():
-            input_file = gr.File(label="Upload Image or Audio")
-            query = gr.Textbox(label="Ask a question")
-            submit_btn = gr.Button("Submit")
+        with gr.Column(scale=1):
+            input_type = gr.Radio(
+                choices=["Image", "Audio"],
+                label="Select Input Type",
+                value="Image",
+            )
+            file_input = gr.File(
+                label="Upload Your File",
+                file_types=["image", "audio"],
+            )
+            question_input = gr.Textbox(
+                label="Your Question",
+                placeholder="e.g., 'What is shown in this image?' or 'Transcribe this audio.'",
+                lines=2,
+            )
+            submit_btn = gr.Button("Submit", variant="primary")
 
-        with gr.Column():
-            output = gr.Textbox(label="Response", interactive=False)
-
-    submit_btn.click(process_multimodal, inputs=[input_file, query], outputs=output)
+        with gr.Column(scale=2):
+            output_text = gr.Textbox(
+                label="Model Response",
+                placeholder="Response will appear here...",
+                lines=10,
+                interactive=False,
+            )
+
+    # Example section
+    with gr.Accordion("Examples", open=False):
+        gr.Markdown("Try these examples:")
+        gr.Examples(
+            examples=[
+                ["Image", "https://www.ilankelman.org/stopsigns/australia.jpg", "What is shown in this image?"],
+                ["Audio", "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac", "Transcribe the audio to text."],
+            ],
+            inputs=[input_type, file_input, question_input],
+            outputs=output_text,
+            fn=process_input,
+            cache_examples=False,
+        )
 
-demo.launch()
+    # Connect the submit button
+    submit_btn.click(
+        fn=process_input,
+        inputs=[input_type, file_input, question_input],
+        outputs=output_text,
+    )
+
+# Launch the demo
+demo.launch()
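
For reference, the prompt template that the new process_input builds can be written out literally; a sketch of the expanded string for the image branch (using the question from the example row above) looks like this:

# Expanded image prompt assembled by process_input from user_prompt,
# the <|image_1|> placeholder, the question, prompt_suffix, and assistant_prompt.
prompt = "<|user|><|image_1|>What is shown in this image?<|end|><|assistant|>"

The audio branch is identical except that <|image_1|> becomes <|audio_1|>.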
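
Note that the updated file imports urlopen but never calls it; the gr.Examples rows pass the asset URLs straight to the file input. If those assets needed to be fetched manually for local testing, a minimal sketch along these lines would work (the variable names are illustrative and not part of the commit):

# Fetch the example assets referenced in gr.Examples and decode them with the
# same libraries app.py already imports. Sketch only; not part of app.py.
from io import BytesIO
from urllib.request import urlopen

import soundfile as sf
from PIL import Image

image_url = "https://www.ilankelman.org/stopsigns/australia.jpg"
image = Image.open(BytesIO(urlopen(image_url).read()))  # decoded PIL image

audio_url = (
    "https://upload.wikimedia.org/wikipedia/commons/b/b0/"
    "Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac"
)
audio, samplerate = sf.read(BytesIO(urlopen(audio_url).read()))  # samples + sample rate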
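
A quick way to exercise the new process_input signature outside the Space is a minimal sketch like the one below. It assumes app.py is importable (the module loads the model at import time, so a CUDA GPU and the downloaded weights are required), that the spaces decorator is a no-op outside a ZeroGPU Space, and that example.jpg and example.flac are placeholder local files:

# Local smoke test for the updated inference function (sketch, not shipped code).
from app import process_input

print(process_input("Image", "example.jpg", "What is shown in this image?"))
print(process_input("Audio", "example.flac", "Transcribe the audio to text."))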