ocr-time-machine

Running on Zero

App Files Community

davanstrien HF Staff committed on Jun 24

Commit

f55c2c2

1 Parent(s): 83e370e

better description

Browse files

Files changed (1) hide show

app.py +45 -23

app.py CHANGED Viewed

@@ -21,18 +21,26 @@ try:
     MODELS["RolmOCR"] = AutoModelForImageTextToText.from_pretrained(
         "reducto/RolmOCR", torch_dtype=torch.bfloat16, device_map="auto"
     )
-    PIPELINES["RolmOCR"] = pipeline("image-text-to-text", model=MODELS["RolmOCR"], processor=PROCESSORS["RolmOCR"])
 except Exception as e:
     MODEL_LOAD_ERROR_MSG["RolmOCR"] = f"Failed to load RolmOCR: {str(e)}"
     print(f"Error loading RolmOCR: {e}")
 # Load Nanonets-OCR-s
 try:
-    PROCESSORS["Nanonets-OCR-s"] = AutoProcessor.from_pretrained("nanonets/Nanonets-OCR-s")
     MODELS["Nanonets-OCR-s"] = AutoModelForImageTextToText.from_pretrained(
         "nanonets/Nanonets-OCR-s", torch_dtype=torch.bfloat16, device_map="auto"
     )
-    PIPELINES["Nanonets-OCR-s"] = pipeline("image-text-to-text", model=MODELS["Nanonets-OCR-s"], processor=PROCESSORS["Nanonets-OCR-s"])
 except Exception as e:
     MODEL_LOAD_ERROR_MSG["Nanonets-OCR-s"] = f"Failed to load Nanonets-OCR-s: {str(e)}"
     print(f"Error loading Nanonets-OCR-s: {e}")
@@ -165,7 +173,7 @@ def predict(pil_image, model_name="RolmOCR"):
     if model_name not in PIPELINES:
         error_to_report = MODEL_LOAD_ERROR_MSG.get(
             model_name,
-            f"Model {model_name} could not be initialized or is not available."
         )
         raise RuntimeError(error_to_report)
@@ -214,7 +222,9 @@ def run_hf_ocr(image_path, model_name="RolmOCR"):
     try:
         pil_image = Image.open(image_path).convert("RGB")
-        ocr_results = predict(pil_image, model_name)  # predict handles model loading and inference
         # Parse the output based on the user's example structure
         if (
@@ -289,16 +299,14 @@ def process_files(image_path, xml_path, model_name):
         try:
             img_to_display = Image.open(image_path).convert("RGB")
             hf_ocr_text_output = run_hf_ocr(image_path, model_name)
             # Create download file for OCR output
             if hf_ocr_text_output and not hf_ocr_text_output.startswith("Error"):
                 ocr_filename = f"vlm_ocr_output_{model_name}.txt"
                 with open(ocr_filename, "w", encoding="utf-8") as f:
                     f.write(hf_ocr_text_output)
                 ocr_download = gr.DownloadButton(
-                    label="Download VLM OCR",
-                    value=ocr_filename,
-                    visible=True
                 )
         except Exception as e:
             img_to_display = None  # Clear image if it failed to load
@@ -308,16 +316,14 @@ def process_files(image_path, xml_path, model_name):
     if xml_path:
         xml_text_output = parse_xml_for_text(xml_path)
         # Create download file for XML text
         if xml_text_output and not xml_text_output.startswith("Error"):
             xml_filename = "traditional_ocr_output.txt"
             with open(xml_filename, "w", encoding="utf-8") as f:
                 f.write(xml_text_output)
             xml_download = gr.DownloadButton(
-                label="Download XML Text",
-                value=xml_filename,
-                visible=True
             )
     else:
         xml_text_output = "No XML file uploaded."
@@ -327,16 +333,28 @@ def process_files(image_path, xml_path, model_name):
         img_to_display = None  # No image to display
         hf_ocr_text_output = "Upload an image to perform OCR."
-    return img_to_display, xml_text_output, hf_ocr_text_output, ocr_download, xml_download
 # --- Create Gradio App ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# OCR Comparison Tool: Traditional vs VLM-based")
     gr.Markdown(
-        "Compare traditional OCR outputs (ALTO/PAGE XML) with modern Vision-Language Model OCR that produces clean Markdown. "
-        "Upload an image and its XML file to see how VLMs simplify document text extraction."
     )
     with gr.Row():
@@ -345,7 +363,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                 choices=AVAILABLE_MODELS,
                 value="RolmOCR",
                 label="Select OCR Model",
-                info="RolmOCR: Fast extraction, clean readable output | Nanonets-OCR-s: Detailed extraction with tables/math support, outputs structured Markdown"
             )
             image_input = gr.File(
                 label="Upload Image (PNG, JPG, etc.)", type="filepath"
@@ -366,8 +384,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                 show_copy_button=True,
             )
             ocr_download_btn = gr.DownloadButton(
-                label="Download VLM OCR",
-                visible=False
             )
             xml_output_textbox = gr.Textbox(
                 label="Traditional OCR (XML Reading Order)",
@@ -376,14 +393,19 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                 show_copy_button=True,
             )
             xml_download_btn = gr.DownloadButton(
-                label="Download XML Text",
-                visible=False
             )
     submit_button.click(
         fn=process_files,
         inputs=[image_input, xml_input, model_selector],
-        outputs=[output_image_display, xml_output_textbox, hf_ocr_output_textbox, ocr_download_btn, xml_download_btn],
     )
     gr.Markdown("---")

     MODELS["RolmOCR"] = AutoModelForImageTextToText.from_pretrained(
         "reducto/RolmOCR", torch_dtype=torch.bfloat16, device_map="auto"
     )
+    PIPELINES["RolmOCR"] = pipeline(
+        "image-text-to-text", model=MODELS["RolmOCR"], processor=PROCESSORS["RolmOCR"]
+    )
 except Exception as e:
     MODEL_LOAD_ERROR_MSG["RolmOCR"] = f"Failed to load RolmOCR: {str(e)}"
     print(f"Error loading RolmOCR: {e}")
 # Load Nanonets-OCR-s
 try:
+    PROCESSORS["Nanonets-OCR-s"] = AutoProcessor.from_pretrained(
+        "nanonets/Nanonets-OCR-s"
+    )
     MODELS["Nanonets-OCR-s"] = AutoModelForImageTextToText.from_pretrained(
         "nanonets/Nanonets-OCR-s", torch_dtype=torch.bfloat16, device_map="auto"
     )
+    PIPELINES["Nanonets-OCR-s"] = pipeline(
+        "image-text-to-text",
+        model=MODELS["Nanonets-OCR-s"],
+        processor=PROCESSORS["Nanonets-OCR-s"],
+    )
 except Exception as e:
     MODEL_LOAD_ERROR_MSG["Nanonets-OCR-s"] = f"Failed to load Nanonets-OCR-s: {str(e)}"
     print(f"Error loading Nanonets-OCR-s: {e}")
     if model_name not in PIPELINES:
         error_to_report = MODEL_LOAD_ERROR_MSG.get(
             model_name,
+            f"Model {model_name} could not be initialized or is not available.",
         )
         raise RuntimeError(error_to_report)
     try:
         pil_image = Image.open(image_path).convert("RGB")
+        ocr_results = predict(
+            pil_image, model_name
+        )  # predict handles model loading and inference
         # Parse the output based on the user's example structure
         if (
         try:
             img_to_display = Image.open(image_path).convert("RGB")
             hf_ocr_text_output = run_hf_ocr(image_path, model_name)
             # Create download file for OCR output
             if hf_ocr_text_output and not hf_ocr_text_output.startswith("Error"):
                 ocr_filename = f"vlm_ocr_output_{model_name}.txt"
                 with open(ocr_filename, "w", encoding="utf-8") as f:
                     f.write(hf_ocr_text_output)
                 ocr_download = gr.DownloadButton(
+                    label="Download VLM OCR", value=ocr_filename, visible=True
                 )
         except Exception as e:
             img_to_display = None  # Clear image if it failed to load
     if xml_path:
         xml_text_output = parse_xml_for_text(xml_path)
         # Create download file for XML text
         if xml_text_output and not xml_text_output.startswith("Error"):
             xml_filename = "traditional_ocr_output.txt"
             with open(xml_filename, "w", encoding="utf-8") as f:
                 f.write(xml_text_output)
             xml_download = gr.DownloadButton(
+                label="Download XML Text", value=xml_filename, visible=True
             )
     else:
         xml_text_output = "No XML file uploaded."
         img_to_display = None  # No image to display
         hf_ocr_text_output = "Upload an image to perform OCR."
+    return (
+        img_to_display,
+        xml_text_output,
+        hf_ocr_text_output,
+        ocr_download,
+        xml_download,
+    )
 # --- Create Gradio App ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🕰️ OCR Time Machine")
     gr.Markdown(
+        "Travel through time to see how OCR technology has evolved! "
+        "For decades, galleries, libraries, archives, and museums (GLAMs) have used Optical Character Recognition "
+        "to transform digitized books, newspapers, and manuscripts into machine-readable text. Traditional OCR "
+        "produces complex XML formats like ALTO, packed with layout details but difficult to use. "
+        "Now, cutting-edge Vision-Language Models (VLMs) are revolutionizing OCR with simpler, cleaner Markdown output. "
+        "This Space makes it easy to compare these two approaches and see which works best for your historical documents. "
+        "Upload a historical document image and its XML file to compare these approaches side-by-side. "
+        "We'll extract the reading order from your XML for an apples-to-apples comparison of the actual text content."
     )
     with gr.Row():
                 choices=AVAILABLE_MODELS,
                 value="RolmOCR",
                 label="Select OCR Model",
+                info="RolmOCR: Fast extraction, clean readable output | Nanonets-OCR-s: Detailed extraction with tables/math support, outputs structured Markdown",
             )
             image_input = gr.File(
                 label="Upload Image (PNG, JPG, etc.)", type="filepath"
                 show_copy_button=True,
             )
             ocr_download_btn = gr.DownloadButton(
+                label="Download VLM OCR", visible=False
             )
             xml_output_textbox = gr.Textbox(
                 label="Traditional OCR (XML Reading Order)",
                 show_copy_button=True,
             )
             xml_download_btn = gr.DownloadButton(
+                label="Download XML Text", visible=False
             )
     submit_button.click(
         fn=process_files,
         inputs=[image_input, xml_input, model_selector],
+        outputs=[
+            output_image_display,
+            xml_output_textbox,
+            hf_ocr_output_textbox,
+            ocr_download_btn,
+            xml_download_btn,
+        ],
     )
     gr.Markdown("---")