ariG23498 (HF staff) committed
Commit f9923d4
1 Parent(s): dfe1f0b
Files changed (2)
  1. app.py +74 -0
  2. requirements.txt +11 -0
app.py ADDED
@@ -0,0 +1,74 @@
+ import gradio as gr
+ import mimetypes
+ import soundfile as sf
+ from PIL import Image
+ import spaces
+ from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
+
+ # Define model path
+ model_path = "microsoft/Phi-4-multimodal-instruct"
+
+ # Load model and processor
+ processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+ model = AutoModelForCausalLM.from_pretrained(
+     model_path,
+     device_map="auto",
+     torch_dtype="auto",
+     trust_remote_code=True,
+     attn_implementation='eager',
+ )
+
+ generation_config = GenerationConfig.from_pretrained(model_path)
+
+ # Define prompt structure
+ user_prompt = '<|user|>'
+ assistant_prompt = '<|assistant|>'
+ prompt_suffix = '<|end|>'
+
+ @spaces.GPU
+ def process_multimodal(input_file, query):
+     if input_file is None:
+         return "Please upload an image or an audio file."
+
+     # gr.File passes a file path by default; infer the media type from the filename
+     file_type = mimetypes.guess_type(input_file)[0] or ""
+
+     if "image" in file_type:
+         # Phi-4-multimodal expects an <|image_1|> placeholder for image inputs
+         prompt = f"{user_prompt}<|image_1|>{query}{prompt_suffix}{assistant_prompt}"
+         image = Image.open(input_file)
+         inputs = processor(text=prompt, images=image, return_tensors='pt').to('cuda:0')
+     elif "audio" in file_type:
+         # and an <|audio_1|> placeholder for audio inputs
+         prompt = f"{user_prompt}<|audio_1|>{query}{prompt_suffix}{assistant_prompt}"
+         audio, samplerate = sf.read(input_file)
+         inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to('cuda:0')
+     else:
+         return "Unsupported file format. Please upload an image or audio file."
+
+     generate_ids = model.generate(
+         **inputs,
+         max_new_tokens=1000,
+         generation_config=generation_config,
+         num_logits_to_keep=0,
+     )
+     # Drop the prompt tokens and decode only the newly generated part
+     generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+     response = processor.batch_decode(
+         generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+     )[0]
+
+     return response
+
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
+     gr.Markdown("""
+     # Phi-4 Multimodal Chat
+     Upload an image or an audio file and ask questions related to it!
+     """)
+
+     with gr.Row():
+         with gr.Column():
+             input_file = gr.File(label="Upload Image or Audio")
+             query = gr.Textbox(label="Ask a question")
+             submit_btn = gr.Button("Submit")
+
+         with gr.Column():
+             output = gr.Textbox(label="Response", interactive=False)
+
+     submit_btn.click(process_multimodal, inputs=[input_file, query], outputs=output)
+
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ gradio
+ spaces
+ torch
+ peft
+ torchvision
+ scipy
+ soundfile
+ pillow
+ accelerate
+ transformers
+ backoff