Update app.py
app.py CHANGED
@@ -5,7 +5,6 @@ from threading import Thread
 import time
 import torch
 import spaces
-from qwen_vl_utils import process_vision_info
 
 # Fine-tuned for OCR-based tasks from Qwen's [ Qwen/Qwen2-VL-2B-Instruct ]
 MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
@@ -56,18 +55,14 @@ def model_inference(input_dict, history):
         }
     ]
 
-    # Process vision info (images and videos)
-    image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
-
     # Apply chat template and process inputs
     prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
         text=[prompt],
-        images=image_inputs,
-        videos=video_inputs,
-        padding=True,
+        images=images if images else None,
+        videos=videos if videos else None,
         return_tensors="pt",
-        **video_kwargs,
+        padding=True,
     ).to("cuda")
 
     # Set up streamer for real-time output
@@ -90,7 +85,6 @@ def model_inference(input_dict, history):
 
 # Example inputs
 examples = [
-    [{"text": "Describe the video.", "files": ["examples/demo.mp4"]}],
     [{"text": "Extract JSON from the image", "files": ["example_images/document.jpg"]}],
     [{"text": "summarize the letter", "files": ["examples/1.png"]}],
     [{"text": "Describe the photo", "files": ["examples/3.png"]}],
@@ -101,7 +95,7 @@ examples = [
     [{"text": "Can you describe this image?", "files": ["example_images/newyork.jpg"]}],
     [{"text": "Can you describe this image?", "files": ["example_images/dogs.jpg"]}],
     [{"text": "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
-
+    [{"text": "Describe the video.", "files": ["example_videos/sample.mp4"]}],
 ]
 
 demo = gr.ChatInterface(
@@ -114,4 +108,4 @@ demo = gr.ChatInterface(
     cache_examples=False,
 )
 
-demo.launch(debug=True)
+demo.launch(debug=True, share=True)
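The new revision drops `qwen_vl_utils.process_vision_info` and instead feeds the processor `images` and `videos` collected directly from the user's upload, falling back to `None` when a modality is absent (which the Transformers processors accept). How those two lists are built falls outside this diff's context lines; below is a minimal sketch, assuming the Gradio multimodal `input_dict` exposes uploaded file paths under `"files"` (the helper name and extension check are hypothetical, not necessarily what app.py does):

    from PIL import Image

    # Hypothetical helper: split uploaded files into image and video inputs.
    # The extension list and loading images eagerly as PIL objects are
    # assumptions; app.py may gather these differently.
    def split_media(input_dict):
        images, videos = [], []
        for path in input_dict.get("files", []):
            if path.lower().endswith((".mp4", ".avi", ".mov")):
                videos.append(path)  # video file paths are passed through as-is
            else:
                images.append(Image.open(path))  # images are loaded as PIL images
        return images, videos

With `images, videos = split_media(input_dict)`, the processor call shown in the diff then behaves the same for image-only, video-only, and text-only turns.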