Spaces:

Chamin09
/

BrailleMenuGen

Runtime error

App Files Files Community

Chamin09 committed on May 8

Commit

843da6b

verified ·

1 Parent(s): 7dd9188

Create document_api.py

Browse files

Files changed (1) hide show

models/document_api.py +74 -0

models/document_api.py ADDED Viewed

	@@ -0,0 +1,74 @@

+import torch
+from transformers import LayoutLMv2Processor, LayoutLMv2ForSequenceClassification
+from PIL import Image
+import numpy as np
+import pytesseract
+# Module-level cache for the LayoutLMv2 processor and model. Both start as
+# None and are populated on first use by get_document_ai_models().
+processor = None
+model = None
+def get_document_ai_models():
+    """Get or initialize document AI models with proper caching."""
+    global processor, model
+    if processor is None:
+        processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
+    if model is None:
+        model = LayoutLMv2ForSequenceClassification.from_pretrained("microsoft/layoutlmv2-base-uncased")
+    return processor, model
+def extract_text_with_tesseract(image):
+    """Extract text using Tesseract OCR."""
+    if isinstance(image, np.ndarray):
+        pil_image = Image.fromarray(image).convert("RGB")
+    else:
+        pil_image = image.convert("RGB")
+    # Use pytesseract for OCR
+    text = pytesseract.image_to_string(pil_image)
+    # Get word boxes for structure
+    boxes = pytesseract.image_to_data(pil_image, output_type=pytesseract.Output.DICT)
+    # Extract words and their positions
+    words = []
+    word_boxes = []
+    for i in range(len(boxes['text'])):
+        if boxes['text'][i].strip() != '':
+            words.append(boxes['text'][i])
+            x, y, w, h = boxes['left'][i], boxes['top'][i], boxes['width'][i], boxes['height'][i]
+            word_boxes.append([x, y, x + w, y + h])
+    return words, word_boxes
+def extract_text_and_layout(image):
+    """
+    Extract text and layout information using OCR and LayoutLMv2.
+    Args:
+        image: PIL Image object
+    Returns:
+        Dictionary with extracted text and layout information
+    """
+    # Convert numpy array to PIL Image if needed
+    if isinstance(image, np.ndarray):
+        image = Image.fromarray(image).convert("RGB")
+    # Extract text using Tesseract
+    words, boxes = extract_text_with_tesseract(image)
+    # If no words were found, return empty result
+    if not words:
+        return {
+            'words': [],
+            'boxes': [],
+            'success': False
+        }
+    return {
+        'words': words,
+        'boxes': boxes,
+        'success': True
+    }