Spaces:

gyrojeff
/

YuzuMarker.FontDetection

Running

App Files Files Community

gyrojeff commited on Mar 16, 2023

Commit

3daa9d7

1 Parent(s): eb77f3e

feat: add broken detection script

Browse files

Files changed (1) hide show

font_ds_detect_broken.py +96 -0

font_ds_detect_broken.py ADDED Viewed

	@@ -0,0 +1,96 @@

+import sys
+import traceback
+import pickle
+import os
+import concurrent.futures
+from tqdm import tqdm
+import time
+from font_dataset.font import load_fonts
+import cv2
+cjk_ratio = 3
+train_cnt = 100
+val_cnt = 5
+test_cnt = 30
+train_cnt_cjk = int(train_cnt * cjk_ratio)
+val_cnt_cjk = int(val_cnt * cjk_ratio)
+test_cnt_cjk = int(test_cnt * cjk_ratio)
+dataset_path = "./dataset/font_img"
+os.makedirs(dataset_path, exist_ok=True)
+unqualified_log_file_name = f"unqualified_font_{time.time()}.txt"
+runtime_exclusion_list = []
+fonts, exclusion_rule = load_fonts()
+def generate_dataset(dataset_type: str, cnt: int):
+    dataset_bath_dir = os.path.join(dataset_path, dataset_type)
+    os.makedirs(dataset_bath_dir, exist_ok=True)
+    def _generate_single(args):
+        i, j, font = args
+        print(
+            f"Checking {dataset_type} font: {font.path} {i} / {len(fonts)}, image {j}",
+            end="\r",
+        )
+        if exclusion_rule(font):
+            print(f"Excluded font: {font.path}")
+            return
+        if font.path in runtime_exclusion_list:
+            print(f"Excluded font: {font.path}")
+            return
+        image_file_name = f"font_{i}_img_{j}.jpg"
+        label_file_name = f"font_{i}_img_{j}.bin"
+        image_file_path = os.path.join(dataset_bath_dir, image_file_name)
+        label_file_path = os.path.join(dataset_bath_dir, label_file_name)
+        # detect cache
+        if (not os.path.exists(image_file_path)) or (
+            not os.path.exists(label_file_path)
+        ):
+            print(
+                f"Missing {dataset_type} font: {font.path} {i} / {len(fonts)}, image {j}"
+            )
+        # detect broken
+        try:
+            # check image
+            cv2.imread(image_file_path)
+            # check label
+            with open(label_file_path, "rb") as f:
+                pickle.load(f)
+        except Exception as e:
+            print(
+                f"Broken {dataset_type} font: {font.path} {i} / {len(fonts)}, image {j}"
+            )
+            os.remove(image_file_path)
+            os.remove(label_file_path)
+        return
+    work_list = []
+    # divide len(fonts) into 64 parts and choose the third part for this script
+    for i in range(len(fonts)):
+        font = fonts[i]
+        if font.language == "CJK":
+            true_cnt = cnt * cjk_ratio
+        else:
+            true_cnt = cnt
+        for j in range(true_cnt):
+            work_list.append((i, j, font))
+    for i in tqdm(range(len(work_list))):
+        _generate_single(work_list[i])
+generate_dataset("train", train_cnt)
+generate_dataset("val", val_cnt)
+generate_dataset("test", test_cnt)