alps / unitable /src /utils /engine.py
yumikimi381's picture
Upload folder using huggingface_hub
daf0288 verified
import os
import json
import argparse
from pathlib import Path
import glob
from ..utils import build_table_from_html_and_cell, html_table_template
def combine_all_json(file_dir: str) -> dict:
total_result = dict()
files = os.listdir(file_dir)
try:
files.remove("final.json")
except ValueError:
pass
for file in files:
with open(os.path.join(file_dir, file), "r") as f:
result = json.load(f)
total_result.update(result)
print(f"Combined to a json with {len(total_result)} entries.")
return total_result
def json_to_final(file_dir: str, type: str):
if type == "html" or type == "bbox":
result = combine_all_json(file_dir)
elif type == "html+cell":
result_cell = combine_all_json(file_dir)
result_html_file = os.path.join(
Path(file_dir).parent,
Path(file_dir).name.split("-")[0].replace("cell", "html") + "-html",
)
assert Path(result_html_file).is_dir(), f"{result_html_file} does not exist."
result = combine_all_json(result_html_file)
assert len(result) == len(result_cell)
else:
# assert html and cell json files have the same length
raise NotImplementedError
out = dict()
if type == "bbox":
out = result
else:
for filename, obj in result.items():
if type == "html":
pred_html = "".join(obj["pred"])
gt_html = "".join(obj["gt"])
out[filename] = dict(
pred=html_table_template(pred_html), gt=html_table_template(gt_html)
)
elif type == "html+cell":
pred_html_cell = build_table_from_html_and_cell(
obj["pred"], result_cell[filename]["pred"]
)
gt_html_cell = build_table_from_html_and_cell(
obj["gt"], result_cell[filename]["gt"]
)
out[filename] = dict(
pred=html_table_template(pred_html_cell),
gt=html_table_template(gt_html_cell),
)
else:
raise NotImplementedError
# write to file
with open(os.path.join(file_dir, f"final.json"), "w", encoding="utf-8") as f:
json.dump(out, f, indent=4)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="postprecess")
parser.add_argument(
"-f", "--file", help="path to all json files from difference devices"
)
parser.add_argument("-t", "--type", help="html, html+cell")
args = parser.parse_args()
json_to_final(args.file, args.type)