| """ | |
| python script/mapping.py --gray /Users/jimmyzhengyz/Documents/Research/ui2code_demo/public/assets/debug/bboxes.json --uied /Users/jimmyzhengyz/Documents/Research/ui2code_demo/public/assets/demo1_output/ip/demo1_filtered.json --debug overlay.png --debug-src public/assets/demo1.png | |
| """ | |
import argparse
import json
import sys
from pathlib import Path

import cv2
import numpy as np
from scipy.optimize import linear_sum_assignment
from scipy.spatial.distance import cdist

CIOU_STRICT = -0.9   # Min CIoU score for a valid one-to-one mapping
FILTER_MIN_WH = 10   # UIED filter: ignore boxes smaller than this


# --- Helpers ---

def ciou(a, b):
    """
    Calculate Complete IoU (CIoU) between two bounding boxes.
    `a`, `b`: bounding boxes in format (x, y, w, h).
    Returns a value between -1 and 1. Higher is better.
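
    Doctest-style sanity checks (illustrative boxes, not pipeline data):

    >>> float(round(ciou((0, 0, 10, 10), (0, 0, 10, 10)), 3))      # identical boxes
    1.0
    >>> float(round(ciou((0, 0, 10, 10), (100, 100, 10, 10)), 2))  # far apart
    -0.83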
| """ | |
| # Epsilon to prevent division by zero | |
| epsilon = 1e-7 | |
| # Standard IoU | |
| xa, ya, wa, ha = a | |
| xb, yb, wb, hb = b | |
| x1, y1 = max(xa, xb), max(ya, yb) | |
| x2, y2 = min(xa + wa, xb + wb), min(ya + ha, yb + hb) | |
| intersection_area = max(0, x2 - x1) * max(0, y2 - y1) | |
| union_area = (wa * ha) + (wb * hb) - intersection_area | |
| iou_val = intersection_area / (union_area + epsilon) | |
| # Center points distance | |
| center_a = center(a) | |
| center_b = center(b) | |
| center_distance_sq = np.sum((center_a - center_b) ** 2) | |
| # Enclosing box diagonal | |
| enclose_x1 = min(xa, xb) | |
| enclose_y1 = min(ya, yb) | |
| enclose_x2 = max(xa + wa, xb + wb) | |
| enclose_y2 = max(ya + ha, yb + hb) | |
| enclose_diag_sq = ((enclose_x2 - enclose_x1) ** 2) + ((enclose_y2 - enclose_y1) ** 2) | |
| distance_penalty = center_distance_sq / (enclose_diag_sq + epsilon) | |
| # Aspect ratio consistency | |
| arctan_a = np.arctan(wa / (ha + epsilon)) | |
| arctan_b = np.arctan(wb / (hb + epsilon)) | |
| v = (4 / (np.pi ** 2)) * ((arctan_a - arctan_b) ** 2) | |
| # Trade-off parameter alpha | |
| with np.errstate(divide='ignore', invalid='ignore'): | |
| alpha = v / (1 - iou_val + v + epsilon) | |
| alpha = 0 if np.isnan(alpha) else alpha # if iou=1 and v=0, alpha is nan. | |
| aspect_ratio_penalty = alpha * v | |
| # CIOU | |
| ciou_val = iou_val - distance_penalty - aspect_ratio_penalty | |
| return ciou_val | |
| def center(box): | |
| x, y, w, h = box | |
| return np.array([x + w / 2, y + h / 2]) | |


def load_regions_and_placeholders(p: Path, W_img, H_img):
    """
    Loads region and placeholder data from the specified JSON file.
    The file is expected to have 'regions' and 'placeholders' keys with
    proportional bbox values, which are converted to absolute pixel values.
    """
    data = json.loads(p.read_text())

    def to_pixels(b):
        return (b['x'] * W_img, b['y'] * H_img, b['w'] * W_img, b['h'] * H_img)

    regions = [{**d, "bbox": to_pixels(d)} for d in data.get("regions", [])]
    placeholders = [{**d, "bbox": to_pixels(d)} for d in data.get("placeholders", [])]
    if not regions or not placeholders:
        print(f"Warning: {p} has a missing or empty 'regions' or 'placeholders' key.")
    return regions, placeholders


def load_uied_boxes(p: Path):
    """
    Loads UIED component detection data.
    The JSON file is expected to contain the shape of the image that was
    processed, which is crucial for calculating scaling factors later.
    """
    data = json.loads(p.read_text())
    compos = data.get("compos", [])
    shape = data.get("img_shape")  # e.g., [800, 571, 3]
    items = []
    for d in compos:
        w, h = d.get("width", 0), d.get("height", 0)
        if w < FILTER_MIN_WH or h < FILTER_MIN_WH:
            continue
        items.append({"id": d["id"],
                      "bbox": (d["column_min"], d["row_min"], w, h)})
    return items, shape


def estimate_global_transform(pixel_placeholders, uied_boxes, uied_shape, W_orig, H_orig):
    """
    Estimates a global affine transform from the UIED coordinate space to the
    original screenshot's coordinate space. This is used for rough alignment.
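
    The model is x' = x * scale_x + dx, y' = y * scale_y + dy; widths and
    heights are scaled but not translated (no rotation or shear).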
| """ | |
| # 1. Calculate base scaling from image dimension ratios | |
| H_proc, W_proc, _ = uied_shape | |
| scale_x = W_orig / W_proc | |
| scale_y = H_orig / H_proc | |
| # 2. Apply this scaling to all UIED boxes | |
| uied_scaled = [{**u, "bbox": (u["bbox"][0]*scale_x, u["bbox"][1]*scale_y, u["bbox"][2]*scale_x, u["bbox"][3]*scale_y)} for u in uied_boxes] | |
| # 3. Estimate residual translation (dx, dy) by matching centers | |
| if not pixel_placeholders or not uied_scaled: | |
| return scale_x, scale_y, 0, 0 | |
| ph_centers = np.array([center(p["bbox"]) for p in pixel_placeholders]) | |
| uied_scaled_centers = np.array([center(u["bbox"]) for u in uied_scaled]) | |
| indices = cdist(ph_centers, uied_scaled_centers).argmin(axis=1) | |
| translations = ph_centers - uied_scaled_centers[indices] | |
| dx, dy = np.median(translations, axis=0) | |
| return scale_x, scale_y, dx, dy | |


def apply_affine_transform(box, scale_x, scale_y, dx, dy):
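    """
    Scale a box's position and size, then translate the position only.

    >>> apply_affine_transform((10, 20, 30, 40), 2.0, 0.5, 5, -3)
    (25.0, 7.0, 60.0, 20.0)
    """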
    x, y, w, h = box
    return (x * scale_x + dx, y * scale_y + dy, w * scale_x, h * scale_y)


# --- Mapping ---

def find_local_mapping_and_transform(placeholders, uied_boxes, uied_shape, W_orig, H_orig):
    """
    Finds the optimal one-to-one mapping and the local affine transform for a
    given subset of placeholders and UIED boxes.
    """
    if not placeholders or not uied_boxes:
        return {}, (1, 1, 0, 0)
    # 1. Estimate the local affine transform
    # 1a. Calculate base scaling from image dimension ratios
    H_proc, W_proc, _ = uied_shape
    scale_x = W_orig / W_proc
    scale_y = H_orig / H_proc
    # 1b. Apply this scaling to UIED boxes
    uied_scaled = [{**u, "bbox": (u["bbox"][0] * scale_x, u["bbox"][1] * scale_y,
                                  u["bbox"][2] * scale_x, u["bbox"][3] * scale_y)}
                   for u in uied_boxes]
    # 1c. Estimate residual translation (dx, dy) by matching centers
    ph_centers = np.array([center(p["bbox"]) for p in placeholders])
    uied_scaled_centers = np.array([center(u["bbox"]) for u in uied_scaled])
    indices = cdist(ph_centers, uied_scaled_centers).argmin(axis=1)
    translations = ph_centers - uied_scaled_centers[indices]
    dx, dy = np.median(translations, axis=0)
    transform = (scale_x, scale_y, dx, dy)
    # 2. Apply the final, full transformation to all UIED boxes in this subset
    uied_tf = [{**u, "bbox_tf": apply_affine_transform(u["bbox"], scale_x, scale_y, dx, dy)}
               for u in uied_boxes]
    # 3. Build a cost matrix (negated CIoU) and find the optimal assignment
    num_gray = len(placeholders)
    num_uied = len(uied_tf)
    cost_matrix = np.zeros((num_gray, num_uied))
    for i in range(num_gray):
        for j in range(num_uied):
            cost_matrix[i, j] = -ciou(placeholders[i]["bbox"], uied_tf[j]["bbox_tf"])
    row_ind, col_ind = linear_sum_assignment(cost_matrix)
    # 4. Create the one-to-one mapping, keeping only sufficiently good pairs
    mapping = {}
    for r, c in zip(row_ind, col_ind):
        score = -cost_matrix[r, c]
        if score >= CIOU_STRICT:
            g_id = placeholders[r]["id"]
            u_id = uied_tf[c]["id"]
            mapping[g_id] = u_id
    return mapping, transform
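
# How the assignment step behaves on toy numbers (illustrative, not pipeline
# data): linear_sum_assignment minimizes total cost, so negating CIoU turns
# "maximize total CIoU" into a minimization, e.g.:
#   cost = np.array([[-0.9, -0.1],
#                    [-0.2, -0.8]])
#   linear_sum_assignment(cost)  # -> (array([0, 1]), array([0, 1]))
# i.e. placeholder 0 pairs with UIED box 0 and placeholder 1 with UIED box 1.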


def generate_debug_overlay(img_path, all_uied_boxes, region_results, uied_shape, out_png):
    """
    Generates a debug image by drawing the mapped UIED boxes on the original
    screenshot. This version uses a simple scaling based on image dimensions,
    without any translation.
    """
    canvas = cv2.imread(str(img_path))
    if canvas is None:
        print(f"Error: Could not read debug source image at {img_path}.")
        return
    # Use a fixed red color for all bounding boxes for consistency
    color = (0, 0, 255)  # Red in BGR
    # 1. Calculate simple scaling factors from the provided image shapes.
    H_proc, W_proc, _ = uied_shape
    H_orig, W_orig, _ = canvas.shape
    scale_x = W_orig / W_proc
    scale_y = H_orig / H_proc
    # Index boxes by id to avoid a linear scan per mapping entry
    boxes_by_id = {box["id"]: box for box in all_uied_boxes}
    # 2. Draw all mapped UIED boxes using only this simple scaling.
    for result in region_results.values():
        for uid in result.get("mapping", {}).values():
            u_box = boxes_by_id.get(uid)
            if u_box is None:
                continue
            # Apply simple scaling directly, without any translation offset.
            x_proc, y_proc, w_proc, h_proc = u_box["bbox"]
            x, y = x_proc * scale_x, y_proc * scale_y
            w, h = w_proc * scale_x, h_proc * scale_y
            cv2.rectangle(canvas, (int(x), int(y)), (int(x + w), int(y + h)), color, 2)
            cv2.putText(canvas, f"uied_{uid}", (int(x), int(y) - 5),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
    cv2.imwrite(str(out_png), canvas)


def main():
    args = get_args()
    run_id = args.run_id
    # --- Dynamic Path Construction ---
    base_dir = Path(__file__).parent.resolve()
    tmp_dir = base_dir / 'data' / 'tmp' / run_id
    gray_json_path = tmp_dir / f"{run_id}_bboxes.json"
    uied_json_path = tmp_dir / "ip" / f"{run_id}.json"
    mapping_output_path = tmp_dir / f"mapping_full_{run_id}.json"
    debug_src_path = tmp_dir / f"{run_id}.png"
    debug_overlay_path = tmp_dir / f"overlay_test_{run_id}.png"
    # --- Input Validation ---
    if not gray_json_path.exists():
        sys.exit(f"Error: Placeholder JSON not found at {gray_json_path}")
    if not uied_json_path.exists():
        sys.exit(f"Error: UIED JSON not found at {uied_json_path}")
    if not debug_src_path.exists():
        sys.exit(f"Error: Source image for coordinate conversion not found at {debug_src_path}")
    print(f"--- Starting Mapping for run_id: {run_id} ---")
    # 1. Load the original screenshot to get its absolute dimensions
    orig_img = cv2.imread(str(debug_src_path))
    if orig_img is None:
        sys.exit(f"Error: Could not read debug source image at {debug_src_path}.")
    H_orig, W_orig, _ = orig_img.shape
    # 2. Load proportional data and convert to absolute pixel coordinates
    pixel_regions, pixel_placeholders = load_regions_and_placeholders(gray_json_path, W_orig, H_orig)
    # 3. Load UIED data
    all_uied_boxes, uied_shape = load_uied_boxes(uied_json_path)
    if not pixel_placeholders or not all_uied_boxes:
        print("Error: Cannot proceed without both placeholder and UIED data.")
        return
    # 4. Estimate a GLOBAL transform for rough, initial alignment of all UIED boxes
    g_scale_x, g_scale_y, g_dx, g_dy = estimate_global_transform(
        pixel_placeholders, all_uied_boxes, uied_shape, W_orig, H_orig
    )
    print(f"Estimated Global Transform: scale_x={g_scale_x:.3f}, scale_y={g_scale_y:.3f}, "
          f"dx={g_dx:.1f}, dy={g_dy:.1f}")
    # Apply the global transform to all UIED boxes to bring them into the main coordinate space
    uied_tf_global = [{**u, "bbox_tf": apply_affine_transform(u["bbox"], g_scale_x, g_scale_y, g_dx, g_dy)}
                      for u in all_uied_boxes]
    # 5. Loop through regions and perform LOCALIZED matching and transform estimation
    final_results = {}
    total_placeholders_count = len(pixel_placeholders)
    total_mappings_count = 0
    for region in pixel_regions:
        # Filter placeholders belonging to the current region
        region_placeholders = [p for p in pixel_placeholders if p.get("region_id") == region["id"]]
        if not region_placeholders:
            continue
        # Filter UIED boxes whose globally transformed centers fall inside the region
        rx, ry, rw, rh = region["bbox"]
        region_uied_ids = {
            u['id'] for u in uied_tf_global
            if rx <= center(u["bbox_tf"])[0] <= rx + rw and ry <= center(u["bbox_tf"])[1] <= ry + rh
        }
        # Get the original UIED boxes that correspond to this region
        region_uied_boxes = [u for u in all_uied_boxes if u['id'] in region_uied_ids]
        if not region_uied_boxes:
            print(f"Warning: No UIED boxes found in region {region['id']} after global alignment.")
            continue
        # Find the precise LOCAL mapping and transform for this region
        region_mapping, region_transform = find_local_mapping_and_transform(
            region_placeholders, region_uied_boxes, uied_shape, W_orig, H_orig
        )
        if region_mapping:
            total_mappings_count += len(region_mapping)
            l_scale_x, l_scale_y, l_dx, l_dy = region_transform
            final_results[region["id"]] = {
                # float() casts np.float64 (from np.median) so json.dumps can serialize it
                "transform": {"scale_x": float(l_scale_x), "scale_y": float(l_scale_y),
                              "dx": float(l_dx), "dy": float(l_dy)},
                "mapping": region_mapping
            }
    # 6. Report and save results
    print(f"Successfully created {total_mappings_count} one-to-one mappings "
          f"out of {total_placeholders_count} placeholders.")
    mapping_output_path.write_text(json.dumps(final_results, indent=2, ensure_ascii=False))
    print(f"Mapping data written to {mapping_output_path}")
    # Always generate the debug image if the source exists
    generate_debug_overlay(debug_src_path, all_uied_boxes, final_results, uied_shape, debug_overlay_path)
    print(f"--- Mapping Complete for run_id: {run_id} ---")


def get_args():
    ap = argparse.ArgumentParser(description="Map UIED components to placeholder boxes.")
    ap.add_argument('--run_id', required=True, type=str, help="A unique identifier for the processing run.")
    return ap.parse_args()


if __name__ == "__main__":
    main()