Spaces: Running on Zero

Commit 58b56ea · Parent(s): 864e5c4

Refactor OCR model loading to use lazy initialization and enhance error handling in predict function
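In short: the previous version built the processor, model, and pipeline at import time; this commit initializes them as module-level None globals and loads them on the first call to predict, which runs under @spaces.GPU so a ZeroGPU device is actually attached while the weights load. A load failure is recorded in MODEL_LOAD_ERROR_MSG and re-raised as a RuntimeError, which run_hf_ocr turns into a user-visible error string. A minimal sketch of the pattern, condensed from the diff below (status prints and some error wording trimmed):

import spaces
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText, pipeline

HF_PIPE = None                # built lazily on the first OCR request
MODEL_LOAD_ERROR_MSG = None   # remembers a failed load so later calls fail fast

@spaces.GPU  # ZeroGPU attaches a GPU only while this function runs
def predict(pil_image):
    global HF_PIPE, MODEL_LOAD_ERROR_MSG
    if HF_PIPE is None and MODEL_LOAD_ERROR_MSG is None:
        try:
            # Heavy work happens here, on the first call, inside the GPU context.
            processor = AutoProcessor.from_pretrained("reducto/RolmOCR")
            model = AutoModelForImageTextToText.from_pretrained(
                "reducto/RolmOCR", torch_dtype=torch.bfloat16, device_map="auto"
            )
            HF_PIPE = pipeline("image-text-to-text", model=model, processor=processor)
        except Exception as e:
            MODEL_LOAD_ERROR_MSG = f"Error loading Hugging Face model: {e}"
    if HF_PIPE is None:
        # Either loading just failed or it already failed on an earlier call.
        raise RuntimeError(MODEL_LOAD_ERROR_MSG or "OCR model could not be initialized.")
    return HF_PIPE(pil_image, prompt="Return the plain text representation of this document as if you were reading it naturally.\n")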
app.py CHANGED

@@ -5,21 +5,12 @@ import os
 import torch
 from transformers import AutoProcessor, AutoModelForImageTextToText, pipeline
 import spaces
-
-# …
-…
-        torch_dtype=torch.bfloat16,
-        # attn_implementation="flash_attention_2", # User had this commented out
-        device_map="auto"
-    )
-    HF_PIPE = pipeline("image-text-to-text", model=HF_MODEL, processor=HF_PROCESSOR)
-    print("Hugging Face OCR model loaded successfully.")
-except Exception as e:
-    print(f"Error loading Hugging Face model: {e}")
-    HF_PIPE = None
+
+# --- Global Model and Processor (initialize as None for lazy loading) ---
+HF_PROCESSOR = None
+HF_MODEL = None
+HF_PIPE = None
+MODEL_LOAD_ERROR_MSG = None  # To store any error message from loading
 
 # --- Helper Functions ---

@@ -68,72 +59,87 @@ def parse_alto_xml_for_text(xml_file_path):
     except Exception as e:
         return f"An unexpected error occurred during XML parsing: {e}"
 
+@spaces.GPU  # Ensures GPU is available for model loading (on first call) and inference
+def predict(pil_image):
+    """Performs OCR prediction using the Hugging Face model, with lazy loading."""
+    global HF_PROCESSOR, HF_MODEL, HF_PIPE, MODEL_LOAD_ERROR_MSG
+
+    if HF_PIPE is None and MODEL_LOAD_ERROR_MSG is None:
+        try:
+            print("Attempting to load Hugging Face model and processor within @spaces.GPU context...")
+            HF_PROCESSOR = AutoProcessor.from_pretrained("reducto/RolmOCR")
+            HF_MODEL = AutoModelForImageTextToText.from_pretrained(
+                "reducto/RolmOCR",
+                torch_dtype=torch.bfloat16,
+                device_map="auto"  # Should utilize ZeroGPU correctly here
+            )
+            HF_PIPE = pipeline("image-text-to-text", model=HF_MODEL, processor=HF_PROCESSOR)
+            print("Hugging Face OCR model loaded successfully.")
+        except Exception as e:
+            MODEL_LOAD_ERROR_MSG = f"Error loading Hugging Face model: {str(e)}"
+            print(MODEL_LOAD_ERROR_MSG)
+            # HF_PIPE remains None, error message is stored
+
+    if HF_PIPE is None:
+        error_to_report = MODEL_LOAD_ERROR_MSG if MODEL_LOAD_ERROR_MSG else "OCR model could not be initialized."
+        raise RuntimeError(error_to_report)
+
+    # Proceed with inference if pipe is available
+    return HF_PIPE(
+        pil_image,
+        prompt="Return the plain text representation of this document as if you were reading it naturally.\n",
+    )
+
 def run_hf_ocr(image_path):
     """
-    Runs OCR on the provided image using the …
+    Runs OCR on the provided image using the Hugging Face model (via predict function).
     """
-    if HF_PIPE is None:
-        return "Hugging Face OCR model not available."
     if image_path is None:
         return "No image provided for OCR."
 
     try:
-        # Load the image using PIL, as the pipeline expects an image object or path
         pil_image = Image.open(image_path).convert("RGB")
-
-        # The user's example output for the pipeline call was:
-        # [{'generated_text': [{'role': 'user', ...}, {'role': 'assistant', 'content': "TEXT..."}]}]
-        # This suggests the pipeline is returning a conversational style output.
-        # We will try to call the pipeline with the image and prompt directly.
-        ocr_results = predict(pil_image)
+        ocr_results = predict(pil_image)  # predict handles model loading and inference
 
         # Parse the output based on the user's example structure
         if isinstance(ocr_results, list) and ocr_results and 'generated_text' in ocr_results[0]:
             generated_content = ocr_results[0]['generated_text']
 
-            # Check if generated_content itself is the direct text (some pipelines do this)
             if isinstance(generated_content, str):
                 return generated_content
 
-            # Check for the conversational structure
-            # [{'role': 'user', ...}, {'role': 'assistant', 'content': "TEXT..."}]
             if isinstance(generated_content, list) and generated_content:
-                …
+                if assistant_message := next(
+                    (
+                        msg['content']
+                        for msg in reversed(generated_content)
+                        if isinstance(msg, dict)
+                        and msg.get('role') == 'assistant'
+                        and 'content' in msg
+                    ),
+                    None,
+                ):
                     return assistant_message
-                …
+
+                # Fallback if the specific assistant message structure isn't found but there's content
+                if isinstance(generated_content[0], dict) and 'content' in generated_content[0]:
+                    if len(generated_content) > 1 and isinstance(generated_content[1], dict) and 'content' in generated_content[1]:
+                        return generated_content[1]['content']  # Assuming second part is assistant
+                    elif 'content' in generated_content[0]:  # Or if first part is already the content
+                        return generated_content[0]['content']
 
             print(f"Unexpected OCR output structure from HF model: {ocr_results}")
-            return "Error: Could not parse OCR model output."
+            return "Error: Could not parse OCR model output. Check console."
 
         else:
             print(f"Unexpected OCR output structure from HF model: {ocr_results}")
-            return "Error: OCR model did not return expected output."
+            return "Error: OCR model did not return expected output. Check console."
 
+    except RuntimeError as e:  # Catch model loading/initialization errors from predict
+        return str(e)
     except Exception as e:
-        print(f"Error during Hugging Face OCR: {e}")
+        print(f"Error during Hugging Face OCR processing: {e}")
         return f"Error during Hugging Face OCR: {str(e)}"
-@spaces.GPU
-def predict(pil_image):
-    ocr_results = HF_PIPE(
-        pil_image,
-        prompt="Return the plain text representation of this document as if you were reading it naturally.\n"
-        # The pipeline should handle formatting this into messages if needed by the model.
-    )
-
-    return ocr_results
 
 # --- Gradio Interface Function ---

@@ -241,5 +247,5 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
 if __name__ == "__main__":
     # Removed dummy file creation as it's less relevant for single file focus
     print("Attempting to launch Gradio demo...")
-    print("If the Hugging Face model is large, initial startup might take some time due to model download/loading.")
+    print("If the Hugging Face model is large, initial startup might take some time due to model download/loading (on first OCR attempt).")
     demo.launch()
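
For reference on the parsing logic: the comments removed above described the pipeline's conversational output as roughly [{'generated_text': [{'role': 'user', ...}, {'role': 'assistant', 'content': "TEXT..."}]}]. The sketch below applies the same extraction idea as the new run_hf_ocr code to made-up sample data; the real pipeline output may differ in detail.

# Made-up sample mimicking the conversational output shape described in the app's comments.
sample_output = [{
    "generated_text": [
        {"role": "user", "content": "Return the plain text representation of this document..."},
        {"role": "assistant", "content": "Recognized document text goes here."},
    ]
}]

generated_content = sample_output[0]["generated_text"]

# Same idea as the new code in run_hf_ocr: walk the messages from the end and
# return the content of the most recent assistant turn, or None if there is none.
assistant_message = next(
    (
        msg["content"]
        for msg in reversed(generated_content)
        if isinstance(msg, dict) and msg.get("role") == "assistant" and "content" in msg
    ),
    None,
)

print(assistant_message)  # -> Recognized document text goes here.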
|