ocr-time-machine

Running on Zero

App Files Community

davanstrien HF Staff Claude committed on Aug 22

Commit

64747fe

1 Parent(s): 4386729

Add Ovis2.5-9B model support

Browse files

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <[email protected]>

Files changed (1) hide show

app.py +87 -6

app.py CHANGED Viewed

@@ -4,7 +4,7 @@ import xml.etree.ElementTree as ET
 import os
 import torch
 import json
-from transformers import AutoProcessor, AutoModelForImageTextToText, pipeline, Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration
 import spaces
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"  # turn on HF_TRANSFER
@@ -15,7 +15,7 @@ PIPELINES = {}
 MODEL_LOAD_ERROR_MSG = {}
 # Available models
-AVAILABLE_MODELS = ["RolmOCR", "Nanonets-OCR-s", "olmOCR", "OCRFlux-3B"]
 # Load RolmOCR
 try:
@@ -75,6 +75,21 @@ except Exception as e:
     MODEL_LOAD_ERROR_MSG["OCRFlux-3B"] = f"Failed to load OCRFlux-3B: {str(e)}"
     print(f"Error loading OCRFlux-3B: {e}")
 # --- Helper Functions ---
@@ -197,8 +212,68 @@ def parse_xml_for_text(xml_file_path):
 @spaces.GPU
 def predict(pil_image, model_name="RolmOCR"):
     """Performs OCR prediction using the selected Hugging Face model."""
-    global PIPELINES, MODEL_LOAD_ERROR_MSG
     if model_name not in PIPELINES:
         error_to_report = MODEL_LOAD_ERROR_MSG.get(
             model_name,
@@ -444,7 +519,8 @@ with gr.Blocks() as demo:
         "• [RolmOCR](https://huggingface.co/reducto/RolmOCR) - Fast & general-purpose\n"
         "• [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s) - Advanced with table/math support\n"
         "• [olmOCR](https://huggingface.co/allenai/olmOCR-7B-0225-preview) - Allen AI's pioneering 7B document specialist\n"
-        "• [OCRFlux-3B](https://huggingface.co/ChatDOC/OCRFlux-3B) - Document specialist with table parsing & cross-page merging"
     )
     gr.Markdown("---")
@@ -484,7 +560,7 @@ with gr.Blocks() as demo:
                     choices=AVAILABLE_MODELS,
                     value="RolmOCR",
                     label="Choose Model",
-                    info="RolmOCR: Fast & general-purpose | Nanonets: Advanced with table/math support | olmOCR: 7B specialized for documents | OCRFlux-3B: Document specialist with cross-page merging",
                 )
             submit_button = gr.Button(
@@ -561,6 +637,11 @@ with gr.Blocks() as demo:
                     "examples/one/74442232.34.xml",
                     "OCRFlux-3B",
                 ],
             ],
             inputs=[image_input, xml_input, model_selector],
             outputs=[

 import os
 import torch
 import json
+from transformers import AutoProcessor, AutoModelForImageTextToText, AutoModelForCausalLM, pipeline, Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration
 import spaces
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"  # turn on HF_TRANSFER
 MODEL_LOAD_ERROR_MSG = {}
 # Available models
+AVAILABLE_MODELS = ["RolmOCR", "Nanonets-OCR-s", "olmOCR", "OCRFlux-3B", "Ovis2.5-9B"]
 # Load RolmOCR
 try:
     MODEL_LOAD_ERROR_MSG["OCRFlux-3B"] = f"Failed to load OCRFlux-3B: {str(e)}"
     print(f"Error loading OCRFlux-3B: {e}")
+# Load Ovis2.5-9B
+try:
+    # For ZeroGPU compatibility, load the model on CPU first, then move it to CUDA inside the predict function
+    MODELS["Ovis2.5-9B"] = AutoModelForCausalLM.from_pretrained(
+        "AIDC-AI/Ovis2.5-9B",
+        torch_dtype=torch.bfloat16,
+        trust_remote_code=True
+    )
+    # Ovis uses its own preprocessing, so we handle it differently
+    PROCESSORS["Ovis2.5-9B"] = None  # Ovis has built-in preprocessing
+    PIPELINES["Ovis2.5-9B"] = None  # We'll use the model directly
+except Exception as e:
+    MODEL_LOAD_ERROR_MSG["Ovis2.5-9B"] = f"Failed to load Ovis2.5-9B: {str(e)}"
+    print(f"Error loading Ovis2.5-9B: {e}")
 # --- Helper Functions ---
 @spaces.GPU
 def predict(pil_image, model_name="RolmOCR"):
     """Performs OCR prediction using the selected Hugging Face model."""
+    global MODELS, PIPELINES, MODEL_LOAD_ERROR_MSG
+    # Special handling for Ovis2.5-9B
+    if model_name == "Ovis2.5-9B":
+        if model_name not in MODELS:
+            error_to_report = MODEL_LOAD_ERROR_MSG.get(
+                model_name,
+                f"Model {model_name} could not be initialized or is not available.",
+            )
+            raise RuntimeError(error_to_report)
+        model = MODELS[model_name]
+        # Move model to CUDA within the GPU-decorated function
+        model = model.cuda()
+        # Format messages in Ovis format
+        messages = [{
+            "role": "user",
+            "content": [
+                {"type": "image", "image": pil_image},
+                {"type": "text", "text": "Extract and return all text from this document image. Preserve the reading order and layout structure. Return the complete text content."}
+            ],
+        }]
+        # Preprocess inputs using Ovis's built-in method
+        input_ids, pixel_values, grid_thws = model.preprocess_inputs(
+            messages=messages,
+            add_generation_prompt=True
+        )
+        # Move inputs to CUDA
+        input_ids = input_ids.cuda()
+        pixel_values = pixel_values.cuda() if pixel_values is not None else None
+        grid_thws = grid_thws.cuda() if grid_thws is not None else None
+        # Generate output
+        with torch.inference_mode():
+            outputs = model.generate(
+                inputs=input_ids,
+                pixel_values=pixel_values,
+                grid_thws=grid_thws,
+                max_new_tokens=8096,
+                do_sample=False,
+                temperature=0.0
+            )
+        # Decode the output using text_tokenizer
+        response = model.text_tokenizer.decode(outputs[0], skip_special_tokens=True)
+        # Extract only the assistant's response (after the user message)
+        # The response includes the conversation, so we need to extract just the generated part
+        if "assistant\n" in response:
+            response = response.split("assistant\n")[-1].strip()
+        elif len(response.split("\n\n")) > 1:
+            # Fallback: take the last part after double newline
+            response = response.split("\n\n")[-1].strip()
+        # Return in a format similar to pipeline output for consistency
+        return [{"generated_text": response}]
+    # Standard pipeline handling for other models
     if model_name not in PIPELINES:
         error_to_report = MODEL_LOAD_ERROR_MSG.get(
             model_name,
         "• [RolmOCR](https://huggingface.co/reducto/RolmOCR) - Fast & general-purpose\n"
         "• [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s) - Advanced with table/math support\n"
         "• [olmOCR](https://huggingface.co/allenai/olmOCR-7B-0225-preview) - Allen AI's pioneering 7B document specialist\n"
+        "• [OCRFlux-3B](https://huggingface.co/ChatDOC/OCRFlux-3B) - Document specialist with table parsing & cross-page merging\n"
+        "• [Ovis2.5-9B](https://huggingface.co/AIDC-AI/Ovis2.5-9B) - Native-resolution multimodal model with advanced reasoning"
     )
     gr.Markdown("---")
                     choices=AVAILABLE_MODELS,
                     value="RolmOCR",
                     label="Choose Model",
+                    info="RolmOCR: Fast & general-purpose | Nanonets: Advanced with table/math support | olmOCR: 7B specialized for documents | OCRFlux-3B: Document specialist with cross-page merging | Ovis2.5-9B: Native-resolution with advanced reasoning",
                 )
             submit_button = gr.Button(
                     "examples/one/74442232.34.xml",
                     "OCRFlux-3B",
                 ],
+                [
+                    "examples/one/74442232.3.jpg",
+                    "examples/one/74442232.34.xml",
+                    "Ovis2.5-9B",
+                ],
             ],
             inputs=[image_input, xml_input, model_selector],
             outputs=[