Spaces:
Runtime error
Runtime error
| # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| """ | |
| conver table label to html | |
| """ | |
| import json | |
| import argparse | |
| from tqdm import tqdm | |
| def save_pred_txt(key, val, tmp_file_path): | |
| with open(tmp_file_path, 'a+', encoding='utf-8') as f: | |
| f.write('{}\t{}\n'.format(key, val)) | |
| def skip_char(text, sp_char_list): | |
| """ | |
| skip empty cell | |
| @param text: text in cell | |
| @param sp_char_list: style char and special code | |
| @return: | |
| """ | |
| for sp_char in sp_char_list: | |
| text = text.replace(sp_char, '') | |
| return text | |
| def gen_html(img): | |
| ''' | |
| Formats HTML code from tokenized annotation of img | |
| ''' | |
| html_code = img['html']['structure']['tokens'].copy() | |
| to_insert = [i for i, tag in enumerate(html_code) if tag in ('<td>', '>')] | |
| for i, cell in zip(to_insert[::-1], img['html']['cells'][::-1]): | |
| if cell['tokens']: | |
| text = ''.join(cell['tokens']) | |
| # skip empty text | |
| sp_char_list = ['<b>', '</b>', '\u2028', ' ', '<i>', '</i>'] | |
| text_remove_style = skip_char(text, sp_char_list) | |
| if len(text_remove_style) == 0: | |
| continue | |
| html_code.insert(i + 1, text) | |
| html_code = ''.join(html_code) | |
| html_code = '<html><body><table>{}</table></body></html>'.format(html_code) | |
| return html_code | |
| def load_gt_data(gt_path): | |
| """ | |
| load gt | |
| @param gt_path: | |
| @return: | |
| """ | |
| data_list = {} | |
| with open(gt_path, 'rb') as f: | |
| lines = f.readlines() | |
| for line in tqdm(lines): | |
| data_line = line.decode('utf-8').strip("\n") | |
| info = json.loads(data_line) | |
| data_list[info['filename']] = info | |
| return data_list | |
| def convert(origin_gt_path, save_path): | |
| """ | |
| gen html from label file | |
| @param origin_gt_path: | |
| @param save_path: | |
| @return: | |
| """ | |
| data_dict = load_gt_data(origin_gt_path) | |
| for img_name, gt in tqdm(data_dict.items()): | |
| html = gen_html(gt) | |
| save_pred_txt(img_name, html, save_path) | |
| print('conver finish') | |
| def parse_args(): | |
| parser = argparse.ArgumentParser(description="args for paddleserving") | |
| parser.add_argument( | |
| "--ori_gt_path", type=str, required=True, help="label gt path") | |
| parser.add_argument( | |
| "--save_path", type=str, required=True, help="path to save file") | |
| args = parser.parse_args() | |
| return args | |
| if __name__ == '__main__': | |
| args = parse_args() | |
| convert(args.ori_gt_path, args.save_path) | |