Update app.py
app.py CHANGED
@@ -5,7 +5,6 @@ from threading import Thread
 import time
 import torch
 import spaces
-from qwen_vl_utils import process_vision_info
 
 # Fine-tuned for OCR-based tasks from Qwen's [ Qwen/Qwen2-VL-2B-Instruct ]
 MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
@@ -56,18 +55,14 @@ def model_inference(input_dict, history):
         }
     ]
 
-    # Process vision info (images and videos)
-    image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
-
     # Apply chat template and process inputs
     prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
         text=[prompt],
-        images=image_inputs,
-        videos=video_inputs,
-        padding=True,
+        images=images if images else None,
+        videos=videos if videos else None,
         return_tensors="pt",
-        **video_kwargs,
+        padding=True,
     ).to("cuda")
 
     # Set up streamer for real-time output
@@ -90,7 +85,6 @@ def model_inference(input_dict, history):
 
 # Example inputs
 examples = [
-    [{"text": "Describe the video.", "files": ["examples/demo.mp4"]}],
     [{"text": "Extract JSON from the image", "files": ["example_images/document.jpg"]}],
     [{"text": "summarize the letter", "files": ["examples/1.png"]}],
     [{"text": "Describe the photo", "files": ["examples/3.png"]}],
@@ -101,7 +95,7 @@ examples = [
     [{"text": "Can you describe this image?", "files": ["example_images/newyork.jpg"]}],
     [{"text": "Can you describe this image?", "files": ["example_images/dogs.jpg"]}],
     [{"text": "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
-
+    [{"text": "Describe the video.", "files": ["example_videos/sample.mp4"]}],
 ]
 
 demo = gr.ChatInterface(
@@ -114,4 +108,4 @@ demo = gr.ChatInterface(
     cache_examples=False,
 )
 
-demo.launch(debug=True)
+demo.launch(debug=True, share=True)
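The new revision drops `qwen_vl_utils.process_vision_info` and instead feeds the processor `images` and `videos` collected directly from the user's upload, falling back to `None` when a modality is absent (which the Transformers processors accept). How those two lists are built falls outside this diff's context lines; below is a minimal sketch, assuming the Gradio multimodal `input_dict` exposes uploaded file paths under `"files"` (the helper name and extension check are hypothetical, not necessarily what app.py does):

    from PIL import Image

    # Hypothetical helper: split uploaded files into image and video inputs.
    # The extension list and loading images eagerly as PIL objects are
    # assumptions; app.py may gather these differently.
    def split_media(input_dict):
        images, videos = [], []
        for path in input_dict.get("files", []):
            if path.lower().endswith((".mp4", ".avi", ".mov")):
                videos.append(path)  # video file paths are passed through as-is
            else:
                images.append(Image.open(path))  # images are loaded as PIL images
        return images, videos

With `images, videos = split_media(input_dict)`, the processor call shown in the diff then behaves the same for image-only, video-only, and text-only turns.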