suanan committed on
Commit
fc548d4
·
verified ·
1 Parent(s): 64a5675

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -14
app.py CHANGED
@@ -12,15 +12,17 @@ model = AutoModelForImageTextToText.from_pretrained(
12
  ).eval().to("cuda")
13
 
14
  @spaces.GPU
15
- def process_inputs(image, audio):
16
  messages = [
17
  {
18
- "role": "user",
19
- "content": [
20
- {"type": "image", "image": image,},
21
- {"type": "audio", "audio": audio,},
22
- ]
23
- },]
 
 
24
 
25
  input_ids = processor.apply_chat_template(
26
  messages,
@@ -38,25 +40,28 @@ def process_inputs(image, audio):
38
  max_new_tokens=256,
39
  disable_compile=True
40
  )
41
- text = processor.batch_decode(
42
  outputs[:, input_len:],
43
  skip_special_tokens=True,
44
  clean_up_tokenization_spaces=True
45
  )
46
- return text[0]
47
 
48
  # Gradio interface
49
  iface = gr.Interface(
50
  fn=process_inputs,
51
  inputs=[
52
  gr.Image(label="Upload Image", type="pil"),
53
- gr.Audio(label="Ask Question about the Image", type="filepath")
 
54
  ],
55
  outputs=gr.Textbox(label="Answer"),
56
- title="Visual (Audio) Question Answering",
57
- description="Upload an image as context and ask a quesiton about the image. The model will generate a text response.",
58
- examples=[["cat.jpg", "cats.wav"]]
 
 
59
  )
60
 
61
  if __name__ == "__main__":
62
- iface.launch(share=True)
 
12
  ).eval().to("cuda")
13
 
14
  @spaces.GPU
15
+ def process_inputs(image, audio, text):
16
  messages = [
17
  {
18
+ "role": "user",
19
+ "content": [
20
+ {"type": "image", "image": image},
21
+ {"type": "audio", "audio": audio},
22
+ {"type": "text", "text": text},
23
+ ]
24
+ },
25
+ ]
26
 
27
  input_ids = processor.apply_chat_template(
28
  messages,
 
40
  max_new_tokens=256,
41
  disable_compile=True
42
  )
43
+ text_output = processor.batch_decode(
44
  outputs[:, input_len:],
45
  skip_special_tokens=True,
46
  clean_up_tokenization_spaces=True
47
  )
48
+ return text_output[0]
49
 
50
  # Gradio interface
51
  iface = gr.Interface(
52
  fn=process_inputs,
53
  inputs=[
54
  gr.Image(label="Upload Image", type="pil"),
55
+ gr.Audio(label="Upload Audio", type="filepath"),
56
+ gr.Textbox(label="Enter Your Question", type="text")
57
  ],
58
  outputs=gr.Textbox(label="Answer"),
59
+ title="Visual + Audio + Text Question Answering",
60
+ description="Upload an image, an audio file, and enter a text question. The model will generate a text response based on all inputs.",
61
+ examples=[
62
+ ["cat.jpg", "cats.wav", "What do you see in the image?"],
63
+ ]
64
  )
65
 
66
  if __name__ == "__main__":
67
+ iface.launch(share=True)