File size: 13,895 Bytes
dc6b3d4
 
 
 
 
 
 
 
 
 
 
 
c068575
dc6b3d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
import torch
from ultralytics import YOLO
from PIL import Image
import io
import base64
# Compute-device name. Nothing in the visible part of this file reads it;
# presumably consumed by model-loading code elsewhere (TODO confirm).
device = 'cuda'

from PIL import Image, ImageDraw, ImageFont
import numpy as np
import networkx as nx
# import cv2

# Relative path of the TrueType font that MarkHelper loads for the index labels.
font_path = "./util/arial.ttf"
class MarkHelper:
    """Lazily builds and caches the label font and label sizes per image size.

    For every distinct (image_height, image_width) pair the helper picks a
    font size and records how large a 1-, 2- and 3-digit label rendered with
    that font is. Both caches are keyed by the string "<height>_<width>".
    """

    def __init__(self):
        # key -> {digit_count: (height, width)} of a rendered label
        self.markSize_dict = {}
        # key -> ImageFont instance chosen for that image size
        self.font_dict = {}
        self.min_font_size = 20 # 1 in v1
        self.max_font_size = 30
        self.max_font_proportion = 0.04 # 0.032 in v1

    def __get_markSize(self, text, image_height, image_width, font):
        """Return (height, width) of `text` rendered with `font`.

        The values are the bottom and right coordinates of the text bounding
        box anchored at (0, 0). `image_height` / `image_width` are accepted
        for signature compatibility only: the bounding box does not depend on
        the canvas size, so a 1x1 scratch canvas is used instead of
        allocating a full image-sized buffer for every measurement.
        """
        scratch = ImageDraw.Draw(Image.new('RGB', (1, 1)))
        _, _, width, height = scratch.textbbox((0, 0), text=text, font=font)
        return height, width

    def _setup_new_font(self, image_height, image_width):
        """Choose the font for this image size and fill both caches."""
        key = f"{image_height}_{image_width}"

        # The shorter side of a "555" label must reach this many pixels: a
        # fixed proportion of the shorter image side, capped at max_font_size.
        target = min(self.max_font_size, self.max_font_proportion * min(image_height, image_width))

        # Start from the minimum font size and grow until the label is no
        # longer smaller than the target.
        fontsize = self.min_font_size
        font = ImageFont.truetype(font_path, fontsize)
        while min(self.__get_markSize("555", image_height, image_width, font)) < target:
            fontsize += 1
            font = ImageFont.truetype(font_path, fontsize)
        self.font_dict[key] = font

        # Record the rendered size of 1-, 2- and 3-digit labels.
        self.markSize_dict[key] = {
            digits: self.__get_markSize('5' * digits, image_height, image_width, font)
            for digits in (1, 2, 3)
        }

    def get_font(self, image_height, image_width):
        """Return the cached font for this image size, creating it on first use."""
        key = f"{image_height}_{image_width}"
        if key not in self.font_dict:
            self._setup_new_font(image_height, image_width)
        return self.font_dict[key]

    def get_mark_size(self, text_str, image_height, image_width):
        """Return the (height, width) of the label box for `text_str`.

        The size is looked up by the number of characters in `text_str`; any
        length other than 1, 2 or 3 falls back to the 3-digit size.
        """
        key = f"{image_height}_{image_width}"
        if key not in self.markSize_dict:
            self._setup_new_font(image_height, image_width)

        sizes = self.markSize_dict[key]
        largest_size = sizes.get(3, None)
        text_h, text_w = sizes.get(len(text_str), largest_size)
        return text_h, text_w

def __calculate_iou(box1, box2, return_area=False):
    """
    Measure how strongly two boxes overlap.

    Despite the name this is not a true Intersection over Union: the
    intersection area is divided by the area of the SMALLER box (plus a
    small epsilon that avoids division by zero).
    :param box1: Tuple of (y, x, h, w) for the first bounding box
    :param box2: Tuple of (y, x, h, w) for the second bounding box
    :param return_area: When True, also return the raw intersection area
    :return: The overlap ratio, or (ratio, intersection_area) if return_area
    """
    top_a, left_a, height_a, width_a = box1
    top_b, left_b, height_b, width_b = box2

    # Extent of the shared region along each axis, clamped at zero when the
    # boxes do not overlap on that axis.
    overlap_h = max(0, min(top_a + height_a, top_b + height_b) - max(top_a, top_b))
    overlap_w = max(0, min(left_a + width_a, left_b + width_b) - max(left_a, left_b))
    intersection_area = overlap_h * overlap_w

    smaller_area = min(height_a * width_a, height_b * width_b)
    ratio = intersection_area / (smaller_area + 0.0001)

    return (ratio, intersection_area) if return_area else ratio

def __calculate_nearest_corner_distance(box1, box2):
    """Return the smallest Euclidean distance between a corner of box1 and a corner of box2.

    Both boxes are (y, x, h, w). Only the four corners of each box are
    compared; edges are not considered.
    """
    def _corner_points(box):
        top, left, height, width = box
        return np.array([
            [top + dy, left + dx]
            for dy in (0, height)
            for dx in (0, width)
        ])

    first = _corner_points(box1)
    second = _corner_points(box2)

    # Broadcasting yields a 4x4 table of corner-to-corner distances.
    pairwise = np.linalg.norm(first[:, np.newaxis] - second, axis=2)
    return np.min(pairwise)

def _find_least_overlapping_corner(bbox, bboxes, drawn_boxes, text_size, image_size):
    """Find the label position around `bbox` that overlaps other boxes the least.

    Eight candidate positions are tried (two per side of the box). Candidates
    that would not fit entirely inside the image are skipped. Each remaining
    candidate is scored by its worst case against all other boxes:
    intersection area minus a small bonus for being far away. The candidate
    with the lowest worst-case score wins; ties keep the earlier candidate.
    If every candidate falls outside the image, the first candidate
    (above the box, left-aligned) is returned even though it is out of bounds.

    Args:
        bbox: (y, x, h, w) The bounding box to place the text on.
        bboxes: [(y, x, h, w)] The bounding boxes to compare against. May be a
            list or a 2D numpy array; entries equal to `bbox` are ignored.
        drawn_boxes: [(y, x, h, w)] Label boxes that have already been drawn.
        text_size: (height, width) The size of the text to be drawn.
        image_size: (height, width) The size of the image.

    Returns:
        (y, x) of the top-left corner of the chosen label position.
    """
    y, x, h, w = bbox
    h_text, w_text = text_size
    image_height, image_width = image_size
    corners = [
        # top-left
        (y - h_text, x),
        # top-right
        (y - h_text, x + w - w_text),
        # right-top
        (y, x + w),
        # right-bottom
        (y + h - h_text, x + w),
        # bottom-right
        (y + h, x + w - w_text),
        # bottom-left
        (y + h, x),
        # left-bottom
        (y + h - h_text, x - w_text),
        # left-top
        (y, x - w_text),
    ]

    # Explicit list() calls: with a numpy array, `bboxes + drawn_boxes` would
    # broadcast (raising, or silently adding coordinates) instead of
    # concatenating. The set of obstacles is the same for every candidate, so
    # it is built once here.
    obstacles = [
        other for other in list(bboxes) + list(drawn_boxes)
        if not np.array_equal(bbox, other)
    ]

    best_corner = corners[0]
    best_score = float('inf')

    for corner in corners:
        top, left = corner
        # Skip candidates that do not fit entirely inside the image.
        if top < 0 or left < 0 or top + h_text > image_height or left + w_text > image_width:
            continue
        corner_bbox = (top, left, h_text, w_text)

        # Worst case for this candidate: the largest penalty against any
        # other box. The start value is below anything a real box produces.
        worst = - (image_width + image_height)
        for other_bbox in obstacles:
            overlap_area = __calculate_iou(corner_bbox, other_bbox, return_area=True)[1]
            worst = max(worst, overlap_area - 0.0001 * __calculate_nearest_corner_distance(corner_bbox, other_bbox))

        # Keep the candidate whose worst case is the mildest.
        if worst < best_score:
            best_score = worst
            best_corner = corner

    return best_corner

def plot_boxes_with_marks(
    image: Image.Image,
    bboxes, # (y, x, h, w)
    mark_helper: MarkHelper,
    linewidth=2,
    alpha=0,
    edgecolor=None,
    fn_save=None,
    normalized_to_pixel=True,
    add_mark=True
) -> Image.Image:
    """Draw bounding boxes on an image and label each with its index.

    Every box outline is drawn first. Then, for each box, a label position
    next to the box is chosen that overlaps the other boxes and the labels
    placed so far as little as possible, and (if `add_mark`) a red filled
    rectangle with the white index number is drawn there. The image is
    modified in place.

    Args:
        image: The PIL image to draw on (mutated).
        bboxes: Sequence or 2D array of shape (num_boxes, 4); each row is
            (y_top_left, x_top_left, box_height, box_width). If
            normalized_to_pixel is True the values are fractions of the image
            size and are converted to integer pixels; otherwise they are used
            as pixel values unchanged.
        mark_helper: Provides the label font and label box size for this image size.
        linewidth: Outline width of the boxes.
        alpha: Unused by this function.
        edgecolor: Passed to ImageDraw.rectangle as the outline colour.
        fn_save: If not None, the image is saved to this path.
        normalized_to_pixel: Whether `bboxes` must be scaled to pixels first.
        add_mark: When False only the outlines are drawn (label positions are
            still computed, nothing is rendered for them).

    Returns:
        The same PIL image object that was passed in.
    """
    draw = ImageDraw.Draw(image)

    # PIL reports size as (width, height)
    image_width, image_height = image.size

    if normalized_to_pixel:
        bboxes = [(int(y * image_height), int(x * image_width), int(h * image_height), int(w * image_width)) for y, x, h, w in bboxes]
    else:
        # Work on a plain list so array input behaves like list input
        # when it is later combined with the list of drawn labels.
        bboxes = list(bboxes)

    for box in bboxes:
        y, x, h, w = box
        draw.rectangle([x, y, x + w, y + h], outline=edgecolor, width=linewidth)

    # The font only depends on the image size: fetch it once, and only when
    # there is actually something to label.
    font = mark_helper.get_font(image_height, image_width) if (add_mark and bboxes) else None

    # Place the index label of every box at its least overlapping corner
    drawn_boxes = []
    for idx, bbox in enumerate(bboxes):
        text = str(idx)
        text_h, text_w = mark_helper.get_mark_size(text, image_height, image_width)
        corner_y, corner_x = _find_least_overlapping_corner(
            bbox, bboxes, drawn_boxes, (text_h, text_w), (image_height, image_width))

        # Label box in (y, x, h, w) form
        text_box = (corner_y, corner_x, text_h, text_w)

        if add_mark:
            # Filled background first, then the index number on top
            draw.rectangle([corner_x, corner_y, corner_x + text_w, corner_y + text_h], # (x, y, x + w, y + h)
                        fill="red")
            draw.text((corner_x, corner_y), text, fill='white', font=font)

        # Later labels must avoid this one as well
        drawn_boxes.append(np.array(text_box))

    if fn_save is not None: # PIL image
        image.save(fn_save)
    return image

def plot_circles_with_marks(
    image: Image.Image,
    points, # (x, y)
    mark_helper: MarkHelper,
    linewidth=2,
    edgecolor=None,
    fn_save=None,
    normalized_to_pixel=True,
    add_mark=True
) -> Image.Image:
    """Draw a small circle (radius 5) around each point on the image.

    The image is modified in place.

    Args:
        image: The PIL image to draw on (mutated).
        points: Iterable of (x, y) points. If normalized_to_pixel is True the
            values are fractions of the image width / height and are
            converted to integer pixels; otherwise they are used as pixel
            values unchanged.
        mark_helper: Unused by this function.
        linewidth: Outline width of the circles.
        edgecolor: Passed to ImageDraw.circle as the outline colour.
        fn_save: If not None, the image is saved to this path.
        normalized_to_pixel: Whether `points` must be scaled to pixels first.
        add_mark: Unused by this function.

    Returns:
        The same PIL image object that was passed in.
    """
    # PIL reports size as (width, height)
    image_width, image_height = image.size

    if normalized_to_pixel:
        # Scale the points themselves. The previous code tried to scale an
        # undefined `bboxes` local here and raised UnboundLocalError.
        points = [(int(x * image_width), int(y * image_height)) for x, y in points]

    draw = ImageDraw.Draw(image)
    for x, y in points:
        draw.circle((x, y), radius=5, outline=edgecolor, width=linewidth)

    if fn_save is not None: # PIL image
        image.save(fn_save)
    return image

# Module-level MarkHelper instance. Construction only sets up empty caches;
# fonts are loaded on the first get_font / get_mark_size call per image size.
markhelper = MarkHelper()

# Not referenced in the visible part of this file; presumably an overlap
# threshold for de-duplicating boxes elsewhere (TODO confirm).
BBOX_DEDUPLICATION_IOU_PROPORTION = 0.5
# Default vertical / horizontal slack used by merge_connected_bboxes when
# deciding whether two boxes are close enough to merge (same units as the
# box coordinates).
BBOX_GROUPING_VERTICAL_THRESHOLD = 20
BBOX_GROUPING_HORIZONTAL_THRESHOLD = 20
# Not referenced in the visible part of this file (meaning not shown here).
BBOX_AUG_TARGET = 2.0

def _is_boxes_same_line_or_near(bbox1, bbox2, vertical_threshold, horizontal_threshold):
    """Tell whether two (y, x, h, w) boxes are close enough on BOTH axes.

    On each axis the box that starts first must reach the start of the other
    box once its extent is enlarged by the threshold for that axis.
    """
    top_a, left_a, height_a, width_a = bbox1
    top_b, left_b, height_b, width_b = bbox2

    def _within_reach(start_a, extent_a, start_b, extent_b, slack):
        # Whichever box starts first, the other one must begin no further
        # away than that box's extent plus the allowed slack.
        a_leads = start_a <= start_b and start_b - start_a <= extent_a + slack
        b_leads = start_b <= start_a and start_a - start_b <= extent_b + slack
        return a_leads or b_leads

    near_on_x = _within_reach(left_a, width_a, left_b, width_b, horizontal_threshold)
    near_on_y = _within_reach(top_a, height_a, top_b, height_b, vertical_threshold)

    return near_on_y and near_on_x

def _build_adjacency_matrix(bboxes, vertical_threshold, horizontal_threshold):
    """Return a symmetric 0/1 integer matrix marking which box pairs should merge.

    Entry [i, j] is 1 when boxes i and j satisfy _is_boxes_same_line_or_near
    with the given thresholds; the diagonal stays 0.
    """
    count = len(bboxes)
    adjacency = np.zeros((count, count), dtype=int)

    # Test every unordered pair once and mirror the result.
    for row in range(count):
        for col in range(row + 1, count):
            linked = _is_boxes_same_line_or_near(
                bboxes[row], bboxes[col], vertical_threshold, horizontal_threshold)
            if linked:
                adjacency[row, col] = adjacency[col, row] = 1

    return adjacency

def merge_connected_bboxes(bboxes, text_details, 
    vertical_threshold=BBOX_GROUPING_VERTICAL_THRESHOLD, 
    horizontal_threshold=BBOX_GROUPING_HORIZONTAL_THRESHOLD
):
    """Merge boxes that lie close to each other, together with their texts.

    Boxes are nodes of a graph; two boxes are linked when
    _is_boxes_same_line_or_near holds for them. Every connected component is
    replaced by the smallest box enclosing all of its members, and the
    members' texts are joined with single spaces in ascending index order.

    Args:
        bboxes: Sequence or 2D array of shape (num_boxes, 4); each row is
            (x1, y1, x2, y2).
        text_details: A list with one text string per bounding box.
        vertical_threshold: Extra vertical slack allowed when linking boxes.
        horizontal_threshold: Extra horizontal slack allowed when linking boxes.

    Returns:
        (merged_bboxes, merged_text_details): a list of (x1, y1, x2, y2)
        tuples and the list of joined texts. With zero or one input box the
        two arguments are returned unchanged.
    """
    # Nothing to merge
    if len(bboxes) <= 1:
        return bboxes, text_details

    # (x1, y1, x2, y2) -> (y, x, height, width), the layout the grouping helpers use
    xyxy = np.array(bboxes)
    yxhw = np.array([
        xyxy[:, 1],
        xyxy[:, 0],
        xyxy[:, 3] - xyxy[:, 1],
        xyxy[:, 2] - xyxy[:, 0],
    ]).T

    # Link nearby boxes and split the graph into connected groups
    adjacency = _build_adjacency_matrix(yxhw, vertical_threshold, horizontal_threshold)
    groups = list(nx.connected_components(nx.from_numpy_array(adjacency)))

    # Per-box edges
    tops, lefts = yxhw[:, 0], yxhw[:, 1]
    bottoms = tops + yxhw[:, 2]
    rights = lefts + yxhw[:, 3]

    merged_bboxes = []
    merged_text_details = []
    for group in groups:
        members = sorted(group) # e.g., [30, 31, 32, 33, 34]

        # Texts are concatenated in index order
        merged_text_details.append(' '.join(text_details[m] for m in members))

        # Enclosing box of all members
        y_min = min(tops[m] for m in members)
        x_min = min(lefts[m] for m in members)
        height = max(bottoms[m] for m in members) - y_min
        width = max(rights[m] for m in members) - x_min

        # Emit as (x1, y1, x2, y2)
        merged_bboxes.append((x_min, y_min, x_min + width, y_min + height))

    return merged_bboxes, merged_text_details