Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -12,15 +12,17 @@ model = AutoModelForImageTextToText.from_pretrained(
|
|
12 |
).eval().to("cuda")
|
13 |
|
14 |
@spaces.GPU
|
15 |
-
def process_inputs(image, audio):
|
16 |
messages = [
|
17 |
{
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
|
|
|
|
24 |
|
25 |
input_ids = processor.apply_chat_template(
|
26 |
messages,
|
@@ -38,25 +40,28 @@ def process_inputs(image, audio):
|
|
38 |
max_new_tokens=256,
|
39 |
disable_compile=True
|
40 |
)
|
41 |
-
|
42 |
outputs[:, input_len:],
|
43 |
skip_special_tokens=True,
|
44 |
clean_up_tokenization_spaces=True
|
45 |
)
|
46 |
-
return
|
47 |
|
48 |
# Gradio interface
|
49 |
iface = gr.Interface(
|
50 |
fn=process_inputs,
|
51 |
inputs=[
|
52 |
gr.Image(label="Upload Image", type="pil"),
|
53 |
-
gr.Audio(label="
|
|
|
54 |
],
|
55 |
outputs=gr.Textbox(label="Answer"),
|
56 |
-
title="Visual
|
57 |
-
description="Upload an image
|
58 |
-
examples=[
|
|
|
|
|
59 |
)
|
60 |
|
61 |
if __name__ == "__main__":
|
62 |
-
iface.launch(share=True)
|
|
|
12 |
).eval().to("cuda")
|
13 |
|
14 |
@spaces.GPU
|
15 |
+
def process_inputs(image, audio, text):
|
16 |
messages = [
|
17 |
{
|
18 |
+
"role": "user",
|
19 |
+
"content": [
|
20 |
+
{"type": "image", "image": image},
|
21 |
+
{"type": "audio", "audio": audio},
|
22 |
+
{"type": "text", "text": text},
|
23 |
+
]
|
24 |
+
},
|
25 |
+
]
|
26 |
|
27 |
input_ids = processor.apply_chat_template(
|
28 |
messages,
|
|
|
40 |
max_new_tokens=256,
|
41 |
disable_compile=True
|
42 |
)
|
43 |
+
text_output = processor.batch_decode(
|
44 |
outputs[:, input_len:],
|
45 |
skip_special_tokens=True,
|
46 |
clean_up_tokenization_spaces=True
|
47 |
)
|
48 |
+
return text_output[0]
|
49 |
|
50 |
# Gradio interface
|
51 |
iface = gr.Interface(
|
52 |
fn=process_inputs,
|
53 |
inputs=[
|
54 |
gr.Image(label="Upload Image", type="pil"),
|
55 |
+
gr.Audio(label="Upload Audio", type="filepath"),
|
56 |
+
gr.Textbox(label="Enter Your Question", type="text")
|
57 |
],
|
58 |
outputs=gr.Textbox(label="Answer"),
|
59 |
+
title="Visual + Audio + Text Question Answering",
|
60 |
+
description="Upload an image, an audio file, and enter a text question. The model will generate a text response based on all inputs.",
|
61 |
+
examples=[
|
62 |
+
["cat.jpg", "cats.wav", "What do you see in the image?"],
|
63 |
+
]
|
64 |
)
|
65 |
|
66 |
if __name__ == "__main__":
|
67 |
+
iface.launch(share=True)
|