Chamin09 committed on
Commit
843da6b
·
verified ·
1 Parent(s): 7dd9188

Create document_api.py

Browse files
Files changed (1) hide show
  1. models/document_api.py +74 -0
models/document_api.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import LayoutLMv2Processor, LayoutLMv2ForSequenceClassification
3
+ from PIL import Image
4
+ import numpy as np
5
+ import pytesseract
6
+
7
+ # Cached processor/model; both stay None until get_document_ai_models() loads them
8
+ processor = None
9
+ model = None
10
+
11
def get_document_ai_models():
    """Return the shared LayoutLMv2 processor and model, loading them on demand.

    The module-level ``processor`` and ``model`` names act as a cache: each is
    loaded with ``from_pretrained`` only while it is still ``None``, so later
    calls hand back the objects that were already created.

    Returns:
        A ``(processor, model)`` tuple.
    """
    global processor, model

    # Both objects are loaded from the same pretrained checkpoint.
    checkpoint = "microsoft/layoutlmv2-base-uncased"

    if processor is None:
        processor = LayoutLMv2Processor.from_pretrained(checkpoint)
    if model is None:
        model = LayoutLMv2ForSequenceClassification.from_pretrained(checkpoint)

    return processor, model
19
+
20
def extract_text_with_tesseract(image):
    """Run Tesseract OCR on an image and return its words with bounding boxes.

    Args:
        image: A numpy array or a PIL image. Either form is converted to an
            RGB PIL image before OCR.

    Returns:
        A ``(words, word_boxes)`` tuple of parallel lists. ``words`` holds
        every non-blank text entry reported by Tesseract, and ``word_boxes``
        holds the matching ``[left, top, right, bottom]`` box computed from
        the reported left/top/width/height values.
    """
    if isinstance(image, np.ndarray):
        pil_image = Image.fromarray(image).convert("RGB")
    else:
        pil_image = image.convert("RGB")

    # One image_to_data pass supplies both the text and its geometry. The
    # earlier extra image_to_string call repeated the OCR work and its result
    # was never used, so it has been dropped.
    data = pytesseract.image_to_data(pil_image, output_type=pytesseract.Output.DICT)

    words = []
    word_boxes = []
    for token, left, top, width, height in zip(
        data['text'], data['left'], data['top'], data['width'], data['height']
    ):
        # Skip entries whose text is empty or whitespace only.
        if not token.strip():
            continue
        words.append(token)
        # Convert (left, top, width, height) into corner coordinates.
        word_boxes.append([left, top, left + width, top + height])

    return words, word_boxes
44
+
45
def extract_text_and_layout(image):
    """
    Extract words and their bounding boxes from a document image via OCR.

    The work is delegated to ``extract_text_with_tesseract``; no LayoutLMv2
    model is invoked by this function.

    Args:
        image: PIL Image object, or a numpy array (converted to an RGB PIL
            image first)

    Returns:
        Dictionary with keys 'words' (list of recognised words), 'boxes'
        (list of matching boxes) and 'success' (False when no words were
        found, in which case both lists are empty)
    """
    # Normalise numpy input to a PIL image up front
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image).convert("RGB")

    found_words, found_boxes = extract_text_with_tesseract(image)

    # Start from the "nothing recognised" result and fill it in on success
    result = {
        'words': [],
        'boxes': [],
        'success': False,
    }
    if found_words:
        result['words'] = found_words
        result['boxes'] = found_boxes
        result['success'] = True

    return result