File size: 2,587 Bytes
fa54254
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import pandas as pd
import re # regex
import numpy as np
import PIL.Image as Image
from paddleocr import PPStructure
import html_to_json


class TableEx:
    """Extract table contents from image arrays using a PPStructure engine."""

    def __init__(self):
        # Build the engine once; every extraction call reuses it.
        self.table_engine = PPStructure(lang='en', layout=False, show_log=True, use_gpu=False, download_models=True, rec=True)

    def extract_table_information(self, pil_image: np.ndarray):
        """Run the table engine on an image and return its table contents.

        The HTML found at ``result[0]['res']['html']`` is converted with
        ``html_to_json.convert_tables`` and stripped of empty cells. If that
        path fails for any reason, the ``'text'`` entries of
        ``result[0]['res']`` are joined into one plain-text string instead.

        Args:
            pil_image: Image passed straight to ``self.table_engine``.

        Returns:
            A nested list of tables on success, or a (possibly empty)
            plain-text string when structure extraction failed.
        """
        result = self.table_engine(pil_image)
        try:
            extracted_tables = html_to_json.convert_tables(result[0]['res']['html'])
            extracted_tables = self.remove_empty_elements(extracted_tables)
        except Exception:
            # Deliberate best-effort: any failure in the structured path
            # degrades to plain text rather than aborting the extraction.
            print('Structure extraction Failed, using fallback plain text.')
            extracted_tables = self._plain_text_fallback(result)
        return extracted_tables

    @staticmethod
    def _plain_text_fallback(result):
        """Join the ``'text'`` entries of ``result[0]['res']`` with spaces.

        Returns an empty string when the engine produced no result or when
        ``'res'`` is not a list of entries, so the fallback itself can never
        raise out of the caller's ``except`` handler.
        """
        if not result or not isinstance(result[0], dict):
            return ''
        entries = result[0].get('res')
        if not isinstance(entries, (list, tuple)):
            return ''
        return ' '.join(
            entry['text']
            for entry in entries
            if isinstance(entry, dict) and 'text' in entry
        )

    def remove_empty_elements(self, nested_list):
        """Recursively remove empty strings and empty sublists.

        Args:
            nested_list: A list whose items are either nested lists or
                leaf values.

        Returns:
            A new list with every ``''`` leaf dropped and every sublist that
            became empty after cleaning dropped as well. The input is not
            modified.
        """
        cleaned_list = []
        for item in nested_list:
            if isinstance(item, list):
                cleaned_sublist = self.remove_empty_elements(item)
                # A sublist that is empty after cleaning carries no data.
                if cleaned_sublist:
                    cleaned_list.append(cleaned_sublist)
            elif item != '':
                cleaned_list.append(item)
        return cleaned_list

    def extract_table_data(self, img_array, x1, y1, x2, y2):
        """Crop a table region out of an image and extract its contents.

        Args:
            img_array: Image array indexed as ``[row, column, ...]``.
            x1, y1: Top-left corner of the region; clamped to the image.
            x2, y2: Bottom-right corner of the region; clamped to the image.

        Returns:
            A ``(table_image, table_data)`` tuple. ``table_image`` is the
            cropped region as a PIL image, or ``None`` if it could not be
            created. ``table_data`` is the first extracted table, the full
            fallback text when structure extraction failed, or a dict with
            ``"region"``, ``"error"`` and ``"data"`` keys when the region is
            empty or extraction raised.
        """
        # Crop the detected table region, clamped to the image bounds.
        table_region = img_array[max(0, y1):min(img_array.shape[0], y2),
                                 max(0, x1):min(img_array.shape[1], x2)]

        # Bind both return values up front so every path below returns
        # something well-defined instead of hitting an unbound local.
        table_images = None
        table_data = {
            "region": f"({x1}, {y1}) to ({x2}, {y2})",
            "error": "empty table region",
            "data": None
        }
        if table_region.size == 0:
            return table_images, table_data

        try:
            # Keep the cropped image so the caller can display it.
            table_images = Image.fromarray(table_region)
            extracted_info = self.extract_table_information(table_region)
            if isinstance(extracted_info, str):
                # Plain-text fallback: keep the whole string. Indexing it
                # would return only its first character.
                table_data = extracted_info
            else:
                table_data = extracted_info[0]
        except Exception as e:
            print(f"Error extracting table data: {e}")
            table_data = {
                "region": f"({x1}, {y1}) to ({x2}, {y2})",
                "error": str(e),
                "data": None
            }
        return table_images, table_data