Spaces:
Running
Running
File size: 2,587 Bytes
fa54254 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
import pandas as pd
import re # regex
import numpy as np
import PIL.Image as Image
from paddleocr import PPStructure
import html_to_json
class TableEx:
    """Extract table contents from image regions using PaddleOCR's PPStructure.

    The engine is built once in ``__init__`` and reused by
    ``extract_table_information`` for every call.
    """

    def __init__(self):
        self.table_engine = PPStructure(
            lang='en',
            layout=False,
            show_log=True,
            use_gpu=False,
            download_models=True,
            rec=True,
        )

    def extract_table_information(self, pil_image: np.ndarray):
        """Run the table engine on an image and return what it recognised.

        Args:
            pil_image: Image handed unchanged to ``self.table_engine``.
                Despite the name, it is annotated as (and called by
                ``extract_table_data`` with) a NumPy array.

        Returns:
            When the first engine result carries table HTML, the output of
            ``html_to_json.convert_tables`` with empty strings and empty
            lists removed. Otherwise a single string made of the recognised
            ``'text'`` fragments joined by spaces (``''`` when none exist).
        """
        result = self.table_engine(pil_image)
        try:
            # Look the HTML up before converting so an unexpected result
            # shape is detected without involving the converter.
            table_html = result[0]['res']['html']
            tables = html_to_json.convert_tables(table_html)
            return self.remove_empty_elements(tables)
        except Exception:
            # Deliberate best-effort: any failure degrades to plain text
            # instead of propagating to the caller.
            print('Structure extraction Failed, using fallback plain text.')
            return self._plain_text_fallback(result)

    @staticmethod
    def _plain_text_fallback(result):
        """Join the ``'text'`` fields found in the first result's ``'res'``.

        Returns ``''`` when the result is empty or ``'res'`` is not a
        sequence of dicts, so the fallback itself can never raise on an
        unexpected result shape.
        """
        try:
            entries = result[0]['res']
        except (IndexError, KeyError, TypeError):
            return ''
        if not isinstance(entries, (list, tuple)):
            return ''
        return ' '.join(
            entry['text']
            for entry in entries
            if isinstance(entry, dict) and 'text' in entry
        )

    def remove_empty_elements(self, nested_list):
        """Recursively drop empty strings and empty sublists from a nested list.

        Sublists that become empty after cleaning are dropped as well.
        Non-list items other than ``''`` (dicts, numbers, ``None``, ...) are
        kept unchanged.

        Args:
            nested_list: Iterable whose items may themselves be lists.

        Returns:
            A new list; the input is not modified.
        """
        cleaned = []
        for item in nested_list:
            if isinstance(item, list):
                item = self.remove_empty_elements(item)
                if not item:
                    continue
            elif item == '':
                continue
            cleaned.append(item)
        return cleaned

    @staticmethod
    def _error_record(x1, y1, x2, y2, message):
        """Build the dict returned in place of table data when extraction fails."""
        return {
            "region": f"({x1}, {y1}) to ({x2}, {y2})",
            "error": message,
            "data": None
        }

    def extract_table_data(self, img_array, x1, y1, x2, y2):
        """Crop a region out of an image array and extract its table data.

        Args:
            img_array: Array indexed as ``[row, column, ...]``; only its
                first two dimensions are used for cropping.
            x1, y1: Left / top corner of the region (column, row).
            x2, y2: Right / bottom corner of the region (exclusive).

        Returns:
            A ``(table_image, table_data)`` tuple. ``table_image`` is the
            cropped region as a PIL image, or ``None`` when the region is
            empty or the image could not be created. ``table_data`` is the
            first extracted table, the fallback plain text, or a dict with
            ``"region"``, ``"error"`` and ``"data"`` keys describing the
            failure.
        """
        height, width = img_array.shape[0], img_array.shape[1]
        # Clamp both ends into the image. Without the lower clamp on the end
        # coordinates a negative x2/y2 would be read as a from-the-end index
        # and silently crop the wrong area.
        top, bottom = max(0, y1), max(0, min(height, y2))
        left, right = max(0, x1), max(0, min(width, x2))
        table_region = img_array[top:bottom, left:right]

        if table_region.size == 0:
            return None, self._error_record(x1, y1, x2, y2, "Empty table region")

        # Bound before the try so the return below is always well-defined,
        # even when creating the image itself fails.
        table_images = None
        try:
            table_images = Image.fromarray(table_region)
            extracted_info = self.extract_table_information(table_region)
            if not extracted_info:
                table_data = self._error_record(
                    x1, y1, x2, y2, "No table content was extracted"
                )
            elif isinstance(extracted_info, str):
                # Plain-text fallback: keep the whole text; indexing it
                # would return only its first character.
                table_data = extracted_info
            else:
                table_data = extracted_info[0]
        except Exception as e:
            print(f"Error extracting table data: {e}")
            table_data = self._error_record(x1, y1, x2, y2, str(e))
        return table_images, table_data