# CadExtractor / src / table_ocr.py
# Martin Krockert
# Demo with tesseract / paddle and finetuned yolo 12
# fa54254
import pandas as pd
import re # regex
import numpy as np
import PIL.Image as Image
from paddleocr import PPStructure
import html_to_json
class TableEx:
    """Extract table contents from image regions with PaddleOCR's PPStructure."""

    def __init__(self):
        """Build the PPStructure engine once so every extraction call reuses it."""
        self.table_engine = PPStructure(lang='en', layout=False, show_log=True, use_gpu=False, download_models=True, rec=True)

    def extract_table_information(self, pil_image: np.ndarray):
        """Run the table engine on an image and return the recognised content.

        Args:
            pil_image: Image passed straight to the table engine.

        Returns:
            The tables parsed from the engine's HTML output, with empty
            strings and empty sub-lists removed.  If the HTML cannot be
            obtained or parsed, a single string of the recognised text
            fragments joined by spaces (possibly empty) is returned instead.
        """
        result = self.table_engine(pil_image)
        try:
            extracted_tables = html_to_json.convert_tables(result[0]['res']['html'])
            extracted_tables = self.remove_empty_elements(extracted_tables)
        except Exception:
            # Deliberate best effort: any failure on the structured path
            # degrades to plain text rather than aborting the extraction.
            print('Structure extraction Failed, using fallback plain text.')
            extracted_tables = self._plain_text_from_result(result)
        return extracted_tables

    @staticmethod
    def _plain_text_from_result(result):
        """Join the 'text' fields of the first result's 'res' entries with spaces.

        Returns an empty string when the result does not have the expected
        list-of-dicts layout, so the fallback itself can never raise.
        """
        if not result or not isinstance(result[0], dict):
            return ''
        entries = result[0].get('res')
        if not isinstance(entries, (list, tuple)):
            return ''
        return ' '.join(
            str(entry['text'])
            for entry in entries
            if isinstance(entry, dict) and 'text' in entry
        )

    def remove_empty_elements(self, nested_list):
        """Return a copy of ``nested_list`` without empty strings or empty sub-lists.

        Sub-lists are cleaned recursively; a sub-list that ends up empty is
        dropped.  Every other item (including non-string values) is kept.
        """
        cleaned_list = []
        for item in nested_list:
            if isinstance(item, list):
                cleaned_sublist = self.remove_empty_elements(item)
                if cleaned_sublist:
                    cleaned_list.append(cleaned_sublist)
            elif item != '':
                cleaned_list.append(item)
        return cleaned_list

    def extract_table_data(self, img_array, x1, y1, x2, y2):
        """Crop a table region out of ``img_array`` and extract its content.

        Args:
            img_array: Image array indexed as ``[row, column, ...]``.
            x1, y1, x2, y2: Corner coordinates of the region; they are
                clamped to the image bounds before cropping.

        Returns:
            A ``(table_image, table_data)`` tuple.  ``table_image`` is the
            cropped region as a PIL image, or ``None`` if it could not be
            created.  ``table_data`` is the first extracted table, the plain
            text fallback string, or a dict with ``region``, ``error`` and
            ``data`` keys when the region is empty or extraction failed.
        """
        region_label = f"({x1}, {y1}) to ({x2}, {y2})"

        # Crop the detected table region, clamped to the image bounds.
        table_region = img_array[max(0, y1):min(img_array.shape[0], y2),
                                 max(0, x1):min(img_array.shape[1], x2)]

        # An empty crop previously fell through to the return statement with
        # both result names unbound; report it as an error result instead.
        if table_region.size == 0:
            return None, {
                "region": region_label,
                "error": "Empty table region",
                "data": None
            }

        # Pre-bind so the return below is valid even if image creation fails.
        table_images = None
        try:
            table_images = Image.fromarray(table_region)
            extracted_info = self.extract_table_information(table_region)
            if isinstance(extracted_info, str):
                # Plain-text fallback: keep the whole string, not its first
                # character.
                table_data = extracted_info
            else:
                table_data = extracted_info[0]
        except Exception as e:
            print(f"Error extracting table data: {e}")
            table_data = {
                "region": region_label,
                "error": str(e),
                "data": None
            }
        return table_images, table_data