Spaces:
Running
on
Zero
Running
on
Zero
Commit
·
f55c2c2
1
Parent(s):
83e370e
better description
Browse files
app.py
CHANGED
|
@@ -21,18 +21,26 @@ try:
|
|
| 21 |
MODELS["RolmOCR"] = AutoModelForImageTextToText.from_pretrained(
|
| 22 |
"reducto/RolmOCR", torch_dtype=torch.bfloat16, device_map="auto"
|
| 23 |
)
|
| 24 |
-
PIPELINES["RolmOCR"] = pipeline(
|
|
|
|
|
|
|
| 25 |
except Exception as e:
|
| 26 |
MODEL_LOAD_ERROR_MSG["RolmOCR"] = f"Failed to load RolmOCR: {str(e)}"
|
| 27 |
print(f"Error loading RolmOCR: {e}")
|
| 28 |
|
| 29 |
# Load Nanonets-OCR-s
|
| 30 |
try:
|
| 31 |
-
PROCESSORS["Nanonets-OCR-s"] = AutoProcessor.from_pretrained(
|
|
|
|
|
|
|
| 32 |
MODELS["Nanonets-OCR-s"] = AutoModelForImageTextToText.from_pretrained(
|
| 33 |
"nanonets/Nanonets-OCR-s", torch_dtype=torch.bfloat16, device_map="auto"
|
| 34 |
)
|
| 35 |
-
PIPELINES["Nanonets-OCR-s"] = pipeline(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
except Exception as e:
|
| 37 |
MODEL_LOAD_ERROR_MSG["Nanonets-OCR-s"] = f"Failed to load Nanonets-OCR-s: {str(e)}"
|
| 38 |
print(f"Error loading Nanonets-OCR-s: {e}")
|
|
@@ -165,7 +173,7 @@ def predict(pil_image, model_name="RolmOCR"):
|
|
| 165 |
if model_name not in PIPELINES:
|
| 166 |
error_to_report = MODEL_LOAD_ERROR_MSG.get(
|
| 167 |
model_name,
|
| 168 |
-
f"Model {model_name} could not be initialized or is not available."
|
| 169 |
)
|
| 170 |
raise RuntimeError(error_to_report)
|
| 171 |
|
|
@@ -214,7 +222,9 @@ def run_hf_ocr(image_path, model_name="RolmOCR"):
|
|
| 214 |
|
| 215 |
try:
|
| 216 |
pil_image = Image.open(image_path).convert("RGB")
|
| 217 |
-
ocr_results = predict(
|
|
|
|
|
|
|
| 218 |
|
| 219 |
# Parse the output based on the user's example structure
|
| 220 |
if (
|
|
@@ -289,16 +299,14 @@ def process_files(image_path, xml_path, model_name):
|
|
| 289 |
try:
|
| 290 |
img_to_display = Image.open(image_path).convert("RGB")
|
| 291 |
hf_ocr_text_output = run_hf_ocr(image_path, model_name)
|
| 292 |
-
|
| 293 |
# Create download file for OCR output
|
| 294 |
if hf_ocr_text_output and not hf_ocr_text_output.startswith("Error"):
|
| 295 |
ocr_filename = f"vlm_ocr_output_{model_name}.txt"
|
| 296 |
with open(ocr_filename, "w", encoding="utf-8") as f:
|
| 297 |
f.write(hf_ocr_text_output)
|
| 298 |
ocr_download = gr.DownloadButton(
|
| 299 |
-
label="Download VLM OCR",
|
| 300 |
-
value=ocr_filename,
|
| 301 |
-
visible=True
|
| 302 |
)
|
| 303 |
except Exception as e:
|
| 304 |
img_to_display = None # Clear image if it failed to load
|
|
@@ -308,16 +316,14 @@ def process_files(image_path, xml_path, model_name):
|
|
| 308 |
|
| 309 |
if xml_path:
|
| 310 |
xml_text_output = parse_xml_for_text(xml_path)
|
| 311 |
-
|
| 312 |
# Create download file for XML text
|
| 313 |
if xml_text_output and not xml_text_output.startswith("Error"):
|
| 314 |
xml_filename = "traditional_ocr_output.txt"
|
| 315 |
with open(xml_filename, "w", encoding="utf-8") as f:
|
| 316 |
f.write(xml_text_output)
|
| 317 |
xml_download = gr.DownloadButton(
|
| 318 |
-
label="Download XML Text",
|
| 319 |
-
value=xml_filename,
|
| 320 |
-
visible=True
|
| 321 |
)
|
| 322 |
else:
|
| 323 |
xml_text_output = "No XML file uploaded."
|
|
@@ -327,16 +333,28 @@ def process_files(image_path, xml_path, model_name):
|
|
| 327 |
img_to_display = None # No image to display
|
| 328 |
hf_ocr_text_output = "Upload an image to perform OCR."
|
| 329 |
|
| 330 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 331 |
|
| 332 |
|
| 333 |
# --- Create Gradio App ---
|
| 334 |
|
| 335 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 336 |
-
gr.Markdown("# OCR
|
| 337 |
gr.Markdown(
|
| 338 |
-
"
|
| 339 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 340 |
)
|
| 341 |
|
| 342 |
with gr.Row():
|
|
@@ -345,7 +363,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 345 |
choices=AVAILABLE_MODELS,
|
| 346 |
value="RolmOCR",
|
| 347 |
label="Select OCR Model",
|
| 348 |
-
info="RolmOCR: Fast extraction, clean readable output | Nanonets-OCR-s: Detailed extraction with tables/math support, outputs structured Markdown"
|
| 349 |
)
|
| 350 |
image_input = gr.File(
|
| 351 |
label="Upload Image (PNG, JPG, etc.)", type="filepath"
|
|
@@ -366,8 +384,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 366 |
show_copy_button=True,
|
| 367 |
)
|
| 368 |
ocr_download_btn = gr.DownloadButton(
|
| 369 |
-
label="Download VLM OCR",
|
| 370 |
-
visible=False
|
| 371 |
)
|
| 372 |
xml_output_textbox = gr.Textbox(
|
| 373 |
label="Traditional OCR (XML Reading Order)",
|
|
@@ -376,14 +393,19 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 376 |
show_copy_button=True,
|
| 377 |
)
|
| 378 |
xml_download_btn = gr.DownloadButton(
|
| 379 |
-
label="Download XML Text",
|
| 380 |
-
visible=False
|
| 381 |
)
|
| 382 |
|
| 383 |
submit_button.click(
|
| 384 |
fn=process_files,
|
| 385 |
inputs=[image_input, xml_input, model_selector],
|
| 386 |
-
outputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 387 |
)
|
| 388 |
|
| 389 |
gr.Markdown("---")
|
|
|
|
| 21 |
MODELS["RolmOCR"] = AutoModelForImageTextToText.from_pretrained(
|
| 22 |
"reducto/RolmOCR", torch_dtype=torch.bfloat16, device_map="auto"
|
| 23 |
)
|
| 24 |
+
PIPELINES["RolmOCR"] = pipeline(
|
| 25 |
+
"image-text-to-text", model=MODELS["RolmOCR"], processor=PROCESSORS["RolmOCR"]
|
| 26 |
+
)
|
| 27 |
except Exception as e:
|
| 28 |
MODEL_LOAD_ERROR_MSG["RolmOCR"] = f"Failed to load RolmOCR: {str(e)}"
|
| 29 |
print(f"Error loading RolmOCR: {e}")
|
| 30 |
|
| 31 |
# Load Nanonets-OCR-s
|
| 32 |
try:
|
| 33 |
+
PROCESSORS["Nanonets-OCR-s"] = AutoProcessor.from_pretrained(
|
| 34 |
+
"nanonets/Nanonets-OCR-s"
|
| 35 |
+
)
|
| 36 |
MODELS["Nanonets-OCR-s"] = AutoModelForImageTextToText.from_pretrained(
|
| 37 |
"nanonets/Nanonets-OCR-s", torch_dtype=torch.bfloat16, device_map="auto"
|
| 38 |
)
|
| 39 |
+
PIPELINES["Nanonets-OCR-s"] = pipeline(
|
| 40 |
+
"image-text-to-text",
|
| 41 |
+
model=MODELS["Nanonets-OCR-s"],
|
| 42 |
+
processor=PROCESSORS["Nanonets-OCR-s"],
|
| 43 |
+
)
|
| 44 |
except Exception as e:
|
| 45 |
MODEL_LOAD_ERROR_MSG["Nanonets-OCR-s"] = f"Failed to load Nanonets-OCR-s: {str(e)}"
|
| 46 |
print(f"Error loading Nanonets-OCR-s: {e}")
|
|
|
|
| 173 |
if model_name not in PIPELINES:
|
| 174 |
error_to_report = MODEL_LOAD_ERROR_MSG.get(
|
| 175 |
model_name,
|
| 176 |
+
f"Model {model_name} could not be initialized or is not available.",
|
| 177 |
)
|
| 178 |
raise RuntimeError(error_to_report)
|
| 179 |
|
|
|
|
| 222 |
|
| 223 |
try:
|
| 224 |
pil_image = Image.open(image_path).convert("RGB")
|
| 225 |
+
ocr_results = predict(
|
| 226 |
+
pil_image, model_name
|
| 227 |
+
) # predict handles model loading and inference
|
| 228 |
|
| 229 |
# Parse the output based on the user's example structure
|
| 230 |
if (
|
|
|
|
| 299 |
try:
|
| 300 |
img_to_display = Image.open(image_path).convert("RGB")
|
| 301 |
hf_ocr_text_output = run_hf_ocr(image_path, model_name)
|
| 302 |
+
|
| 303 |
# Create download file for OCR output
|
| 304 |
if hf_ocr_text_output and not hf_ocr_text_output.startswith("Error"):
|
| 305 |
ocr_filename = f"vlm_ocr_output_{model_name}.txt"
|
| 306 |
with open(ocr_filename, "w", encoding="utf-8") as f:
|
| 307 |
f.write(hf_ocr_text_output)
|
| 308 |
ocr_download = gr.DownloadButton(
|
| 309 |
+
label="Download VLM OCR", value=ocr_filename, visible=True
|
|
|
|
|
|
|
| 310 |
)
|
| 311 |
except Exception as e:
|
| 312 |
img_to_display = None # Clear image if it failed to load
|
|
|
|
| 316 |
|
| 317 |
if xml_path:
|
| 318 |
xml_text_output = parse_xml_for_text(xml_path)
|
| 319 |
+
|
| 320 |
# Create download file for XML text
|
| 321 |
if xml_text_output and not xml_text_output.startswith("Error"):
|
| 322 |
xml_filename = "traditional_ocr_output.txt"
|
| 323 |
with open(xml_filename, "w", encoding="utf-8") as f:
|
| 324 |
f.write(xml_text_output)
|
| 325 |
xml_download = gr.DownloadButton(
|
| 326 |
+
label="Download XML Text", value=xml_filename, visible=True
|
|
|
|
|
|
|
| 327 |
)
|
| 328 |
else:
|
| 329 |
xml_text_output = "No XML file uploaded."
|
|
|
|
| 333 |
img_to_display = None # No image to display
|
| 334 |
hf_ocr_text_output = "Upload an image to perform OCR."
|
| 335 |
|
| 336 |
+
return (
|
| 337 |
+
img_to_display,
|
| 338 |
+
xml_text_output,
|
| 339 |
+
hf_ocr_text_output,
|
| 340 |
+
ocr_download,
|
| 341 |
+
xml_download,
|
| 342 |
+
)
|
| 343 |
|
| 344 |
|
| 345 |
# --- Create Gradio App ---
|
| 346 |
|
| 347 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 348 |
+
gr.Markdown("# 🕰️ OCR Time Machine")
|
| 349 |
gr.Markdown(
|
| 350 |
+
"Travel through time to see how OCR technology has evolved! "
|
| 351 |
+
"For decades, galleries, libraries, archives, and museums (GLAMs) have used Optical Character Recognition "
|
| 352 |
+
"to transform digitized books, newspapers, and manuscripts into machine-readable text. Traditional OCR "
|
| 353 |
+
"produces complex XML formats like ALTO, packed with layout details but difficult to use. "
|
| 354 |
+
"Now, cutting-edge Vision-Language Models (VLMs) are revolutionizing OCR with simpler, cleaner Markdown output. "
|
| 355 |
+
"This Space makes it easy to compare these two approaches and see which works best for your historical documents. "
|
| 356 |
+
"Upload a historical document image and its XML file to compare these approaches side-by-side. "
|
| 357 |
+
"We'll extract the reading order from your XML for an apples-to-apples comparison of the actual text content."
|
| 358 |
)
|
| 359 |
|
| 360 |
with gr.Row():
|
|
|
|
| 363 |
choices=AVAILABLE_MODELS,
|
| 364 |
value="RolmOCR",
|
| 365 |
label="Select OCR Model",
|
| 366 |
+
info="RolmOCR: Fast extraction, clean readable output | Nanonets-OCR-s: Detailed extraction with tables/math support, outputs structured Markdown",
|
| 367 |
)
|
| 368 |
image_input = gr.File(
|
| 369 |
label="Upload Image (PNG, JPG, etc.)", type="filepath"
|
|
|
|
| 384 |
show_copy_button=True,
|
| 385 |
)
|
| 386 |
ocr_download_btn = gr.DownloadButton(
|
| 387 |
+
label="Download VLM OCR", visible=False
|
|
|
|
| 388 |
)
|
| 389 |
xml_output_textbox = gr.Textbox(
|
| 390 |
label="Traditional OCR (XML Reading Order)",
|
|
|
|
| 393 |
show_copy_button=True,
|
| 394 |
)
|
| 395 |
xml_download_btn = gr.DownloadButton(
|
| 396 |
+
label="Download XML Text", visible=False
|
|
|
|
| 397 |
)
|
| 398 |
|
| 399 |
submit_button.click(
|
| 400 |
fn=process_files,
|
| 401 |
inputs=[image_input, xml_input, model_selector],
|
| 402 |
+
outputs=[
|
| 403 |
+
output_image_display,
|
| 404 |
+
xml_output_textbox,
|
| 405 |
+
hf_ocr_output_textbox,
|
| 406 |
+
ocr_download_btn,
|
| 407 |
+
xml_download_btn,
|
| 408 |
+
],
|
| 409 |
)
|
| 410 |
|
| 411 |
gr.Markdown("---")
|