Spaces:

prithivMLmods
/

Multimodal-OCR

Running on Zero

App Files Files Community

prithivMLmods committed on Apr 16

Commit

92e002a

verified ·

1 Parent(s): 080d79a

Update app.py

Browse files

Files changed (1) hide show

app.py +144 -72

app.py CHANGED Viewed

@@ -4,25 +4,30 @@ from threading import Thread
 import time
 import torch
 import spaces
-from PIL import Image
-import requests
-from io import BytesIO
 import cv2
 import numpy as np
 from transformers import (
     Qwen2VLForConditionalGeneration,
     AutoProcessor,
     TextIteratorStreamer,
     AutoModelForImageTextToText,
 )
-# Helper function to return a progress bar HTML snippet.
-def progress_bar_html(label: str) -> str:
     return f'''
 <div style="display: flex; align-items: center;">
     <span style="margin-right: 10px; font-size: 14px;">{label}</span>
-    <div style="width: 110px; height: 5px; background-color: #FFB6C1; border-radius: 2px; overflow: hidden;">
-        <div style="width: 100%; height: 100%; background-color: #FF69B4; animation: loading 1.5s linear infinite;"></div>
     </div>
 </div>
 <style>
@@ -33,13 +38,19 @@ def progress_bar_html(label: str) -> str:
 </style>
     '''
-# Helper function to downsample a video into 10 evenly spaced frames.
 def downsample_video(video_path):
     vidcap = cv2.VideoCapture(video_path)
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     frames = []
-    # Sample 10 evenly spaced frames.
     frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
     for i in frame_indices:
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
@@ -52,10 +63,9 @@ def downsample_video(video_path):
     vidcap.release()
     return frames
-# Model and processor setups
-# Setup for Qwen2VL OCR branch (default).
-QV_MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"  # or use "prithivMLmods/Qwen2-VL-OCR2-2B-Instruct"
 qwen_processor = AutoProcessor.from_pretrained(QV_MODEL_ID, trust_remote_code=True)
 qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
     QV_MODEL_ID,
@@ -63,22 +73,31 @@ qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
-# Setup for Aya-Vision branch.
 AYA_MODEL_ID = "CohereForAI/aya-vision-8b"
 aya_processor = AutoProcessor.from_pretrained(AYA_MODEL_ID)
 aya_model = AutoModelForImageTextToText.from_pretrained(
     AYA_MODEL_ID, device_map="auto", torch_dtype=torch.float16
 )
-# ---------------------------
 # Main Inference Function
-# ---------------------------
 @spaces.GPU
 def model_inference(input_dict, history):
     text = input_dict["text"].strip()
     files = input_dict.get("files", [])
-    # Branch for video inference with Aya-Vision using @video-infer.
     if text.lower().startswith("@video-infer"):
         prompt = text[len("@video-infer"):].strip()
         if not files:
@@ -89,16 +108,12 @@ def model_inference(input_dict, history):
         if not frames:
             yield "Error: Could not extract frames from the video."
             return
-        # Build messages: start with the prompt then add each frame with its timestamp.
-        content_list = []
-        content_list.append({"type": "text", "text": prompt})
         for frame, timestamp in frames:
             content_list.append({"type": "text", "text": f"Frame {timestamp}:"})
             content_list.append({"type": "image", "image": frame})
-        messages = [{
-            "role": "user",
-            "content": content_list,
-        }]
         inputs = aya_processor.apply_chat_template(
             messages,
             padding=True,
@@ -126,50 +141,114 @@ def model_inference(input_dict, history):
             yield buffer
         return
-    # Branch for single image inference with Aya-Vision using @aya-vision.
     if text.lower().startswith("@aya-vision"):
         text_prompt = text[len("@aya-vision"):].strip()
         if not files:
             yield "Error: Please provide an image for the @aya-vision feature."
             return
         else:
-            # Use the first provided image.
-            image = load_image(files[0])
-            yield progress_bar_html("Processing with Aya-Vision-8b")
             messages = [{
                 "role": "user",
                 "content": [
-                    {"type": "image", "image": image},
                     {"type": "text", "text": text_prompt},
                 ],
             }]
-            inputs = aya_processor.apply_chat_template(
-                messages,
                 padding=True,
-                add_generation_prompt=True,
-                tokenize=True,
-                return_dict=True,
-                return_tensors="pt"
-            ).to(aya_model.device)
-            streamer = TextIteratorStreamer(aya_processor, skip_prompt=True, skip_special_tokens=True)
-            generation_kwargs = dict(
-                inputs,
-                streamer=streamer,
-                max_new_tokens=1024,
-                do_sample=True,
-                temperature=0.3
-            )
-            thread = Thread(target=aya_model.generate, kwargs=generation_kwargs)
-            thread.start()
-            buffer = ""
-            for new_text in streamer:
-                buffer += new_text
-                buffer = buffer.replace("<|im_end|>", "")
-                time.sleep(0.01)
-                yield buffer
-            return
-    # Default branch: Use Qwen2VL OCR for text (with optional images).
     if len(files) > 1:
         images = [load_image(image) for image in files]
     elif len(files) == 1:
@@ -178,7 +257,7 @@ def model_inference(input_dict, history):
         images = []
     if text == "" and not images:
-        yield "Error: Please input a query and optionally image(s)."
         return
     if text == "" and images:
         yield "Error: Please input a text query along with the image(s)."
@@ -191,23 +270,17 @@ def model_inference(input_dict, history):
             {"type": "text", "text": text},
         ],
     }]
-    prompt = qwen_processor.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
     inputs = qwen_processor(
-        text=[prompt],
         images=images if images else None,
         return_tensors="pt",
         padding=True,
     ).to("cuda")
     streamer = TextIteratorStreamer(qwen_processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
     thread = Thread(target=qwen_model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     yield progress_bar_html("Processing with Qwen2VL OCR")
     for new_text in streamer:
@@ -216,32 +289,31 @@ def model_inference(input_dict, history):
         time.sleep(0.01)
         yield buffer
-# Gradio Interface Setup
 examples = [
     [{"text": "Can you describe this image?", "files": ["example_images/dogs.jpg"]}],
     [{"text": "@aya-vision Summarize the letter", "files": ["examples/1.png"]}],
     [{"text": "@aya-vision Extract JSON from the image", "files": ["example_images/document.jpg"]}],
-    [{"text": "@video-infer Explain what is happening in this video ?", "files": ["examples/oreo.mp4"]}],
     [{"text": "Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
     [{"text": "@aya-vision Describe the photo", "files": ["examples/3.png"]}],
     [{"text": "@aya-vision Summarize the full image in detail", "files": ["examples/2.jpg"]}],
     [{"text": "@aya-vision Describe this image.", "files": ["example_images/campeones.jpg"]}],
     [{"text": "@aya-vision What is this UI about?", "files": ["example_images/s2w_example.png"]}],
     [{"text": "Can you describe this image?", "files": ["example_images/newyork.jpg"]}],
-    [{"text": "@aya-vision Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
 ]
 demo = gr.ChatInterface(
     fn=model_inference,
-    description="# **Multimodal OCR `@aya-vision for image, @video-infer for video`**",
     examples=examples,
     textbox=gr.MultimodalTextbox(
         label="Query Input",
         file_types=["image", "video"],
         file_count="multiple",
-        placeholder="Tag @aya-vision for Aya-Vision image infer, @video-infer for Aya-Vision video infer, default runs Qwen2VL OCR"
     ),
     stop_btn="Stop Generation",
     multimodal=True,

 import time
 import torch
 import spaces
 import cv2
 import numpy as np
+from PIL import Image
 from transformers import (
     Qwen2VLForConditionalGeneration,
     AutoProcessor,
     TextIteratorStreamer,
     AutoModelForImageTextToText,
 )
+from transformers import Qwen2_5_VLForConditionalGeneration
+# ---------------------------
+# Helper Functions
+# ---------------------------
+def progress_bar_html(label: str, primary_color: str = "#FF69B4", secondary_color: str = "#FFB6C1") -> str:
+    """
+    Returns an HTML snippet for a thin animated progress bar with a label.
+    Colors can be customized; default colors are used for Qwen2VL/Aya‑Vision.
+    """
     return f'''
 <div style="display: flex; align-items: center;">
     <span style="margin-right: 10px; font-size: 14px;">{label}</span>
+    <div style="width: 110px; height: 5px; background-color: {secondary_color}; border-radius: 2px; overflow: hidden;">
+        <div style="width: 100%; height: 100%; background-color: {primary_color}; animation: loading 1.5s linear infinite;"></div>
     </div>
 </div>
 <style>
 </style>
     '''
 def downsample_video(video_path):
+    """
+    Downsamples a video file by extracting 10 evenly spaced frames.
+    Returns a list of tuples (PIL.Image, timestamp).
+    """
     vidcap = cv2.VideoCapture(video_path)
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     frames = []
+    if total_frames <= 0 or fps <= 0:
+        vidcap.release()
+        return frames
+    # Determine 10 evenly spaced frame indices.
     frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
     for i in frame_indices:
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
     vidcap.release()
     return frames
+# Model and Processor Setup
+# Qwen2VL OCR (default branch)
+QV_MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 qwen_processor = AutoProcessor.from_pretrained(QV_MODEL_ID, trust_remote_code=True)
 qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
     QV_MODEL_ID,
     torch_dtype=torch.float16
 ).to("cuda").eval()
+# Aya-Vision branch (for @aya-vision and @video-infer)
 AYA_MODEL_ID = "CohereForAI/aya-vision-8b"
 aya_processor = AutoProcessor.from_pretrained(AYA_MODEL_ID)
 aya_model = AutoModelForImageTextToText.from_pretrained(
     AYA_MODEL_ID, device_map="auto", torch_dtype=torch.float16
 )
+# RolmOCR branch (@RolmOCR)
+ROLMOCR_MODEL_ID = "reducto/RolmOCR"
+rolmocr_processor = AutoProcessor.from_pretrained(ROLMOCR_MODEL_ID, trust_remote_code=True)
+rolmocr_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    ROLMOCR_MODEL_ID,
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16
+).to("cuda").eval()
 # Main Inference Function
 @spaces.GPU
 def model_inference(input_dict, history):
     text = input_dict["text"].strip()
     files = input_dict.get("files", [])
+    # ---------------------------
+    # Aya-Vision Video Inference (@video-infer)
+    # ---------------------------
     if text.lower().startswith("@video-infer"):
         prompt = text[len("@video-infer"):].strip()
         if not files:
         if not frames:
             yield "Error: Could not extract frames from the video."
             return
+        # Build the message with the text prompt followed by each frame (with timestamp label).
+        content_list = [{"type": "text", "text": prompt}]
         for frame, timestamp in frames:
             content_list.append({"type": "text", "text": f"Frame {timestamp}:"})
             content_list.append({"type": "image", "image": frame})
+        messages = [{"role": "user", "content": content_list}]
         inputs = aya_processor.apply_chat_template(
             messages,
             padding=True,
             yield buffer
         return
+    # Aya-Vision Image Inference (@aya-vision)
     if text.lower().startswith("@aya-vision"):
         text_prompt = text[len("@aya-vision"):].strip()
         if not files:
             yield "Error: Please provide an image for the @aya-vision feature."
             return
+        image = load_image(files[0])
+        yield progress_bar_html("Processing with Aya-Vision-8b")
+        messages = [{
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": text_prompt},
+            ],
+        }]
+        inputs = aya_processor.apply_chat_template(
+            messages,
+            padding=True,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt"
+        ).to(aya_model.device)
+        streamer = TextIteratorStreamer(aya_processor, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = dict(
+            inputs,
+            streamer=streamer,
+            max_new_tokens=1024,
+            do_sample=True,
+            temperature=0.3
+        )
+        thread = Thread(target=aya_model.generate, kwargs=generation_kwargs)
+        thread.start()
+        buffer = ""
+        for new_text in streamer:
+            buffer += new_text
+            buffer = buffer.replace("<|im_end|>", "")
+            time.sleep(0.01)
+            yield buffer
+        return
+    # RolmOCR Inference (@RolmOCR)
+    if text.lower().startswith("@rolmocr"):
+        # Remove the tag from the query.
+        text_prompt = text[len("@rolmocr"):].strip()
+        # Check if a video is provided for inference.
+        if files and isinstance(files[0], str) and files[0].lower().endswith((".mp4", ".avi", ".mov")):
+            video_path = files[0]
+            frames = downsample_video(video_path)
+            if not frames:
+                yield "Error: Could not extract frames from the video."
+                return
+            # Build the message: prompt followed by each frame with its timestamp.
+            content_list = [{"type": "text", "text": text_prompt}]
+            for image, timestamp in frames:
+                content_list.append({"type": "text", "text": f"Frame {timestamp}:"})
+                content_list.append({"type": "image", "image": image})
+            messages = [{"role": "user", "content": content_list}]
+            # For video, extract images only.
+            video_images = [image for image, _ in frames]
+            prompt_full = rolmocr_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            inputs = rolmocr_processor(
+                text=[prompt_full],
+                images=video_images,
+                return_tensors="pt",
+                padding=True,
+            ).to("cuda")
         else:
+            # Assume image(s) or text query.
+            if len(files) > 1:
+                images = [load_image(image) for image in files]
+            elif len(files) == 1:
+                images = [load_image(files[0])]
+            else:
+                images = []
+            if text_prompt == "" and not images:
+                yield "Error: Please input a text query and/or provide an image for the @RolmOCR feature."
+                return
             messages = [{
                 "role": "user",
                 "content": [
+                    *[{"type": "image", "image": image} for image in images],
                     {"type": "text", "text": text_prompt},
                 ],
             }]
+            prompt_full = rolmocr_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            inputs = rolmocr_processor(
+                text=[prompt_full],
+                images=images if images else None,
+                return_tensors="pt",
                 padding=True,
+            ).to("cuda")
+        streamer = TextIteratorStreamer(rolmocr_processor, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
+        thread = Thread(target=rolmocr_model.generate, kwargs=generation_kwargs)
+        thread.start()
+        buffer = ""
+        # Use a different color scheme for RolmOCR (purple-themed).
+        yield progress_bar_html("Processing with Qwen2.5VL (RolmOCR)", primary_color="#4B0082", secondary_color="#9370DB")
+        for new_text in streamer:
+            buffer += new_text
+            buffer = buffer.replace("<|im_end|>", "")
+            time.sleep(0.01)
+            yield buffer
+        return
+    # Default Inference: Qwen2VL OCR
+    # Process files: support multiple images.
     if len(files) > 1:
         images = [load_image(image) for image in files]
     elif len(files) == 1:
         images = []
     if text == "" and not images:
+        yield "Error: Please input a text query and optionally image(s)."
         return
     if text == "" and images:
         yield "Error: Please input a text query along with the image(s)."
             {"type": "text", "text": text},
         ],
     }]
+    prompt_full = qwen_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = qwen_processor(
+        text=[prompt_full],
         images=images if images else None,
         return_tensors="pt",
         padding=True,
     ).to("cuda")
     streamer = TextIteratorStreamer(qwen_processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
     thread = Thread(target=qwen_model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     yield progress_bar_html("Processing with Qwen2VL OCR")
     for new_text in streamer:
         time.sleep(0.01)
         yield buffer
+# Gradio Interface
 examples = [
+    [{"text": "@RolmOCR OCR the Text in the Image", "files": ["rolm/1.jpeg"]}],
+    [{"text": "@RolmOCR OCR the Image", "files": ["rolm/2.jpeg"]}],
     [{"text": "Can you describe this image?", "files": ["example_images/dogs.jpg"]}],
     [{"text": "@aya-vision Summarize the letter", "files": ["examples/1.png"]}],
     [{"text": "@aya-vision Extract JSON from the image", "files": ["example_images/document.jpg"]}],
+    [{"text": "@video-infer Explain what is happening in this video?", "files": ["examples/oreo.mp4"]}],
     [{"text": "Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
     [{"text": "@aya-vision Describe the photo", "files": ["examples/3.png"]}],
     [{"text": "@aya-vision Summarize the full image in detail", "files": ["examples/2.jpg"]}],
     [{"text": "@aya-vision Describe this image.", "files": ["example_images/campeones.jpg"]}],
     [{"text": "@aya-vision What is this UI about?", "files": ["example_images/s2w_example.png"]}],
     [{"text": "Can you describe this image?", "files": ["example_images/newyork.jpg"]}],
 ]
 demo = gr.ChatInterface(
     fn=model_inference,
+    description="# **Multimodal OCR `@RolmOCR, @aya-vision for image, @video-infer for video`**",
     examples=examples,
     textbox=gr.MultimodalTextbox(
         label="Query Input",
         file_types=["image", "video"],
         file_count="multiple",
+        placeholder="Tag @aya-vision for Aya‑Vision, @video-infer for video,  for RolmOCR, or leave blank for default Qwen2VL OCR"
     ),
     stop_btn="Stop Generation",
     multimodal=True,