Spaces:
Sleeping
Sleeping
import pandas as pd | |
import re # regex | |
import numpy as np | |
import PIL.Image as Image | |
from paddleocr import PPStructure | |
import html_to_json | |
class TableEx:
    """Extract table content from image arrays using the PPStructure engine."""

    def __init__(self):
        # The engine is built once here and reused by every extraction call.
        self.table_engine = PPStructure(lang='en', layout=False, show_log=True, use_gpu=False, download_models=True, rec=True)

    def extract_table_information(self, pil_image: np.ndarray):
        """Run the table engine on an image and return the parsed tables.

        Args:
            pil_image: Image array handed directly to the table engine.

        Returns:
            The output of ``html_to_json.convert_tables`` applied to the HTML
            of the first engine result, with empty elements removed. If that
            step fails, a single string made of the ``'text'`` fields of the
            first result's ``'res'`` entries joined by spaces.
        """
        result = self.table_engine(pil_image)
        try:
            extracted_tables = html_to_json.convert_tables(result[0]['res']['html'])
            extracted_tables = self.remove_empty_elements(extracted_tables)
        except Exception:
            # Best-effort fallback: structure parsing failed, so return the
            # recognised text instead of raising.
            print('Structure extraction Failed, using fallback plain text.')
            extracted_tables = ' '.join(entry['text'] for entry in result[0]['res'])
        return extracted_tables

    def remove_empty_elements(self, nested_list):
        """Recursively remove empty elements from a nested list.

        Sub-lists are cleaned recursively and dropped when they end up empty.
        Any other item is kept unless it equals the empty string.

        Args:
            nested_list: List whose items may themselves be lists.

        Returns:
            A new list with the empty elements removed.
        """
        cleaned_list = []
        for item in nested_list:
            if isinstance(item, list):
                cleaned_sublist = self.remove_empty_elements(item)
                if cleaned_sublist:
                    cleaned_list.append(cleaned_sublist)
            elif item != '':
                cleaned_list.append(item)
        return cleaned_list

    def extract_table_data(self, img_array, x1, y1, x2, y2):
        """Crop a bounding box out of an image and extract its table data.

        Args:
            img_array: Image array indexed as ``[row, column, ...]``.
            x1, y1, x2, y2: Corners of the box; they are clamped to the
                image bounds before cropping.

        Returns:
            A ``(table_image, table_data)`` tuple. ``table_image`` is the
            cropped region as a PIL image, or ``None`` when the crop is empty
            or the conversion failed. ``table_data`` is the first extracted
            table, the fallback plain-text string, or a dict with ``region``,
            ``error`` and ``data`` keys describing the failure.
        """
        table_region = img_array[max(0, y1):min(img_array.shape[0], y2),
                                 max(0, x1):min(img_array.shape[1], x2)]
        region_label = f"({x1}, {y1}) to ({x2}, {y2})"

        # Pre-bind the image so every exit path returns a defined value
        # instead of raising UnboundLocalError.
        table_image = None

        if table_region.size == 0:
            # Nothing to crop: report it in the same shape as other failures.
            return table_image, {
                "region": region_label,
                "error": "Empty table region",
                "data": None
            }

        try:
            table_image = Image.fromarray(table_region)
            extracted_info = self.extract_table_information(table_region)
            if isinstance(extracted_info, str):
                # Plain-text fallback: indexing would keep only one character.
                table_data = extracted_info
            else:
                table_data = extracted_info[0]
        except Exception as e:
            print(f"Error extracting table data: {e}")
            table_data = {
                "region": region_label,
                "error": str(e),
                "data": None
            }

        return table_image, table_data