suanan committed on
Commit
fc548d4
·
verified ·
1 Parent(s): 64a5675

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -14
app.py CHANGED
@@ -12,15 +12,17 @@ model = AutoModelForImageTextToText.from_pretrained(
12
  ).eval().to("cuda")
13
 
14
  @spaces.GPU
15
- def process_inputs(image, audio):
16
  messages = [
17
  {
18
- "role": "user",
19
- "content": [
20
- {"type": "image", "image": image,},
21
- {"type": "audio", "audio": audio,},
22
- ]
23
- },]
 
 
24
 
25
  input_ids = processor.apply_chat_template(
26
  messages,
@@ -38,25 +40,28 @@ def process_inputs(image, audio):
38
  max_new_tokens=256,
39
  disable_compile=True
40
  )
41
- text = processor.batch_decode(
42
  outputs[:, input_len:],
43
  skip_special_tokens=True,
44
  clean_up_tokenization_spaces=True
45
  )
46
- return text[0]
47
 
48
  # Gradio interface
49
  iface = gr.Interface(
50
  fn=process_inputs,
51
  inputs=[
52
  gr.Image(label="Upload Image", type="pil"),
53
- gr.Audio(label="Ask Question about the Image", type="filepath")
 
54
  ],
55
  outputs=gr.Textbox(label="Answer"),
56
- title="Visual (Audio) Question Answering",
57
- description="Upload an image as context and ask a quesiton about the image. The model will generate a text response.",
58
- examples=[["cat.jpg", "cats.wav"]]
 
 
59
  )
60
 
61
  if __name__ == "__main__":
62
- iface.launch(share=True)
 
12
  ).eval().to("cuda")
13
 
14
  @spaces.GPU
15
+ def process_inputs(image, audio, text):
16
  messages = [
17
  {
18
+ "role": "user",
19
+ "content": [
20
+ {"type": "image", "image": image},
21
+ {"type": "audio", "audio": audio},
22
+ {"type": "text", "text": text},
23
+ ]
24
+ },
25
+ ]
26
 
27
  input_ids = processor.apply_chat_template(
28
  messages,
 
40
  max_new_tokens=256,
41
  disable_compile=True
42
  )
43
+ text_output = processor.batch_decode(
44
  outputs[:, input_len:],
45
  skip_special_tokens=True,
46
  clean_up_tokenization_spaces=True
47
  )
48
+ return text_output[0]
49
 
50
  # Gradio interface
51
  iface = gr.Interface(
52
  fn=process_inputs,
53
  inputs=[
54
  gr.Image(label="Upload Image", type="pil"),
55
+ gr.Audio(label="Upload Audio", type="filepath"),
56
+ gr.Textbox(label="Enter Your Question", type="text")
57
  ],
58
  outputs=gr.Textbox(label="Answer"),
59
+ title="Visual + Audio + Text Question Answering",
60
+ description="Upload an image, an audio file, and enter a text question. The model will generate a text response based on all inputs.",
61
+ examples=[
62
+ ["cat.jpg", "cats.wav", "What do you see in the image?"],
63
+ ]
64
  )
65
 
66
  if __name__ == "__main__":
67
+ iface.launch(share=True)