davanstrien (HF Staff) and Claude committed
Commit 64747fe · 1 Parent(s): 4386729

Add Ovis2.5-9B model support


🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <[email protected]>

Files changed (1): app.py (+87 -6)
app.py CHANGED
@@ -4,7 +4,7 @@ import xml.etree.ElementTree as ET
 import os
 import torch
 import json
-from transformers import AutoProcessor, AutoModelForImageTextToText, pipeline, Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration
+from transformers import AutoProcessor, AutoModelForImageTextToText, AutoModelForCausalLM, pipeline, Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration
 import spaces
 
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"  # turn on HF_TRANSFER
 
@@ -15,7 +15,7 @@ PIPELINES = {}
 MODEL_LOAD_ERROR_MSG = {}
 
 # Available models
-AVAILABLE_MODELS = ["RolmOCR", "Nanonets-OCR-s", "olmOCR", "OCRFlux-3B"]
+AVAILABLE_MODELS = ["RolmOCR", "Nanonets-OCR-s", "olmOCR", "OCRFlux-3B", "Ovis2.5-9B"]
 
 # Load RolmOCR
 try:
 
@@ -75,6 +75,21 @@ except Exception as e:
     MODEL_LOAD_ERROR_MSG["OCRFlux-3B"] = f"Failed to load OCRFlux-3B: {str(e)}"
     print(f"Error loading OCRFlux-3B: {e}")
 
+# Load Ovis2.5-9B
+try:
+    # For Zero GPU compatibility, load to CPU first then move in predict function
+    MODELS["Ovis2.5-9B"] = AutoModelForCausalLM.from_pretrained(
+        "AIDC-AI/Ovis2.5-9B",
+        torch_dtype=torch.bfloat16,
+        trust_remote_code=True
+    )
+    # Ovis uses its own preprocessing, so we handle it differently
+    PROCESSORS["Ovis2.5-9B"] = None  # Ovis has built-in preprocessing
+    PIPELINES["Ovis2.5-9B"] = None  # We'll use the model directly
+except Exception as e:
+    MODEL_LOAD_ERROR_MSG["Ovis2.5-9B"] = f"Failed to load Ovis2.5-9B: {str(e)}"
+    print(f"Error loading Ovis2.5-9B: {e}")
+
 
 # --- Helper Functions ---
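The in-line comment explains the split: on Zero GPU hardware the Space must boot without a GPU attached, so the bf16 weights stay on CPU at load time and only move to CUDA inside the `@spaces.GPU`-decorated `predict`. A minimal self-contained sketch of that pattern, with a toy module standing in for the 9B checkpoint (all names here are illustrative, not from the commit):

```python
import torch
import spaces  # HF Spaces SDK; the decorator is a no-op outside a Space

# Startup: weights live on CPU so the app can initialize with no GPU attached.
toy_model = torch.nn.Linear(4, 4, dtype=torch.bfloat16)

@spaces.GPU
def run(x: torch.Tensor) -> torch.Tensor:
    # A GPU exists only for the duration of this call, so the .cuda()
    # moves happen here rather than at module import time.
    model = toy_model.cuda()
    with torch.inference_mode():
        return model(x.cuda().to(torch.bfloat16)).cpu()
```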
 
@@ -197,8 +212,68 @@ def parse_xml_for_text(xml_file_path):
 @spaces.GPU
 def predict(pil_image, model_name="RolmOCR"):
     """Performs OCR prediction using the selected Hugging Face model."""
-    global PIPELINES, MODEL_LOAD_ERROR_MSG
-
+    global MODELS, PIPELINES, MODEL_LOAD_ERROR_MSG
+
+    # Special handling for Ovis2.5-9B
+    if model_name == "Ovis2.5-9B":
+        if model_name not in MODELS:
+            error_to_report = MODEL_LOAD_ERROR_MSG.get(
+                model_name,
+                f"Model {model_name} could not be initialized or is not available.",
+            )
+            raise RuntimeError(error_to_report)
+
+        model = MODELS[model_name]
+
+        # Move model to CUDA within the GPU-decorated function
+        model = model.cuda()
+
+        # Format messages in Ovis format
+        messages = [{
+            "role": "user",
+            "content": [
+                {"type": "image", "image": pil_image},
+                {"type": "text", "text": "Extract and return all text from this document image. Preserve the reading order and layout structure. Return the complete text content."}
+            ],
+        }]
+
+        # Preprocess inputs using Ovis's built-in method
+        input_ids, pixel_values, grid_thws = model.preprocess_inputs(
+            messages=messages,
+            add_generation_prompt=True
+        )
+
+        # Move inputs to CUDA
+        input_ids = input_ids.cuda()
+        pixel_values = pixel_values.cuda() if pixel_values is not None else None
+        grid_thws = grid_thws.cuda() if grid_thws is not None else None
+
+        # Generate output
+        with torch.inference_mode():
+            outputs = model.generate(
+                inputs=input_ids,
+                pixel_values=pixel_values,
+                grid_thws=grid_thws,
+                max_new_tokens=8096,
+                do_sample=False,
+                temperature=0.0
+            )
+
+        # Decode the output using text_tokenizer
+        response = model.text_tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+        # Extract only the assistant's response (after the user message)
+        # The response includes the conversation, so we need to extract just the generated part
+        if "assistant\n" in response:
+            response = response.split("assistant\n")[-1].strip()
+        elif len(response.split("\n\n")) > 1:
+            # Fallback: take the last part after double newline
+            response = response.split("\n\n")[-1].strip()
+
+        # Return in a format similar to pipeline output for consistency
+        return [{"generated_text": response}]
+
+    # Standard pipeline handling for other models
     if model_name not in PIPELINES:
         error_to_report = MODEL_LOAD_ERROR_MSG.get(
             model_name,
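Because `model.text_tokenizer.decode` returns the full transcript (prompt plus completion), the hunk strips the prompt with a split heuristic. The same logic, pulled out as a standalone, testable sketch; the sample transcript below is made up for illustration:

```python
def extract_assistant_reply(decoded: str) -> str:
    """Mirror of the heuristic in predict(): prefer the 'assistant' role
    marker, fall back to the last double-newline block."""
    if "assistant\n" in decoded:
        return decoded.split("assistant\n")[-1].strip()
    if len(decoded.split("\n\n")) > 1:
        # Fallback: take the last part after a double newline
        return decoded.split("\n\n")[-1].strip()
    return decoded  # unchanged when neither marker is present

# Hypothetical decoded transcript, for illustration only
sample = "user\nExtract text from this image.\nassistant\nPage 1: Hello"
assert extract_assistant_reply(sample) == "Page 1: Hello"
```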
 
@@ -444,7 +519,8 @@ with gr.Blocks() as demo:
         "• [RolmOCR](https://huggingface.co/reducto/RolmOCR) - Fast & general-purpose\n"
         "• [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s) - Advanced with table/math support\n"
         "• [olmOCR](https://huggingface.co/allenai/olmOCR-7B-0225-preview) - Allen AI's pioneering 7B document specialist\n"
-        "• [OCRFlux-3B](https://huggingface.co/ChatDOC/OCRFlux-3B) - Document specialist with table parsing & cross-page merging"
+        "• [OCRFlux-3B](https://huggingface.co/ChatDOC/OCRFlux-3B) - Document specialist with table parsing & cross-page merging\n"
+        "• [Ovis2.5-9B](https://huggingface.co/AIDC-AI/Ovis2.5-9B) - Native-resolution multimodal model with advanced reasoning"
     )
 
     gr.Markdown("---")
 
@@ -484,7 +560,7 @@ with gr.Blocks() as demo:
             choices=AVAILABLE_MODELS,
             value="RolmOCR",
             label="Choose Model",
-            info="RolmOCR: Fast & general-purpose | Nanonets: Advanced with table/math support | olmOCR: 7B specialized for documents | OCRFlux-3B: Document specialist with cross-page merging",
+            info="RolmOCR: Fast & general-purpose | Nanonets: Advanced with table/math support | olmOCR: 7B specialized for documents | OCRFlux-3B: Document specialist with cross-page merging | Ovis2.5-9B: Native-resolution with advanced reasoning",
         )
 
         submit_button = gr.Button(
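For reference, `gr.Radio`'s `info` string renders as helper text under the radio group. A minimal sketch of the selector in isolation, assuming only that `gradio` is installed; the echo handler is a stand-in for the real `predict` wiring:

```python
import gradio as gr

AVAILABLE_MODELS = ["RolmOCR", "Nanonets-OCR-s", "olmOCR", "OCRFlux-3B", "Ovis2.5-9B"]

with gr.Blocks() as demo:
    model_selector = gr.Radio(
        choices=AVAILABLE_MODELS,
        value="RolmOCR",
        label="Choose Model",
        info="One-line capability summary per model, pipe-separated",
    )
    echo = gr.Textbox(label="Selected model")
    # Stand-in handler; the real app routes this choice into predict()
    model_selector.change(lambda m: m, inputs=model_selector, outputs=echo)

demo.launch()
```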
 
@@ -561,6 +637,11 @@ with gr.Blocks() as demo:
                 "examples/one/74442232.34.xml",
                 "OCRFlux-3B",
             ],
+            [
+                "examples/one/74442232.3.jpg",
+                "examples/one/74442232.34.xml",
+                "Ovis2.5-9B",
+            ],
         ],
         inputs=[image_input, xml_input, model_selector],
         outputs=[
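Each `gr.Examples` row supplies one value per component in `inputs`, so the new row reuses the OCRFlux sample files while preselecting Ovis2.5-9B. A self-contained sketch of that wiring with dummy components; the file paths assume the Space's `examples/` directory is present:

```python
import gradio as gr

with gr.Blocks() as demo:
    image_input = gr.Image(type="pil", label="Document image")
    xml_input = gr.File(label="Ground-truth XML")
    model_selector = gr.Radio(
        choices=["OCRFlux-3B", "Ovis2.5-9B"], value="OCRFlux-3B", label="Model"
    )
    # One value per input component, in the same order as `inputs`
    gr.Examples(
        examples=[
            ["examples/one/74442232.3.jpg", "examples/one/74442232.34.xml", "OCRFlux-3B"],
            ["examples/one/74442232.3.jpg", "examples/one/74442232.34.xml", "Ovis2.5-9B"],
        ],
        inputs=[image_input, xml_input, model_selector],
    )

demo.launch()
```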