# drlon's picture
# update som.py
# c068575
import torch
from ultralytics import YOLO
from PIL import Image
import io
import base64
# NOTE(review): not referenced in the visible part of this file — presumably
# the torch device used by model code elsewhere; confirm before relying on it.
device = 'cuda'
from PIL import Image, ImageDraw, ImageFont
import numpy as np
import networkx as nx
# import cv2
# Relative path of the TrueType font that MarkHelper loads for index labels.
font_path = "./util/arial.ttf"
class MarkHelper:
    """Cache of label fonts and label sizes, keyed by image dimensions.

    For each distinct (image_height, image_width) pair a TrueType font is
    chosen once, and the rendered size of 1-, 2- and 3-character labels is
    measured once. Both are stored under the key
    "{image_height}_{image_width}".
    """

    def __init__(self):
        # key -> {1: (h, w), 2: (h, w), 3: (h, w)}: measured label sizes
        self.markSize_dict = {}
        # key -> font object chosen for that image size
        self.font_dict = {}
        self.min_font_size = 20  # 1 in v1
        self.max_font_size = 30
        self.max_font_proportion = 0.04  # 0.032 in v1

    def __get_markSize(self, text, image_height, image_width, font):
        """Return (height, width) of `text` when rendered with `font`.

        The values are the bottom and right coordinates of the text bounding
        box anchored at (0, 0). `image_height` and `image_width` are accepted
        for signature compatibility only.
        """
        # textbbox() derives the box from the font and the draw object's font
        # mode; the canvas dimensions do not enter the computation, so a 1x1
        # scratch image avoids allocating a full-size buffer per measurement.
        draw = ImageDraw.Draw(Image.new('RGB', (1, 1)))
        _, _, width, height = draw.textbbox((0, 0), text=text, font=font)
        return height, width

    def _setup_new_font(self, image_height, image_width):
        """Choose the font for this image size and cache its label sizes.

        Starting at `min_font_size`, the font size grows by one until the
        smaller side of the rendered label "555" is no longer below
        min(max_font_size, max_font_proportion * min(image_height, image_width)).
        The font and the sizes of "5", "55" and "555" are then cached.
        """
        key = f"{image_height}_{image_width}"
        # Loop-invariant stopping threshold for the smaller side of "555".
        target = min(self.max_font_size,
                     self.max_font_proportion * min(image_height, image_width))
        fontsize = self.min_font_size
        font = ImageFont.truetype(font_path, fontsize)
        while min(self.__get_markSize("555", image_height, image_width, font)) < target:
            # iterate until the text size is just larger than the criteria
            fontsize += 1
            font = ImageFont.truetype(font_path, fontsize)
        self.font_dict[key] = font
        # Label sizes indexed by number of characters.
        self.markSize_dict[key] = {
            digits: self.__get_markSize('5' * digits, image_height, image_width, font)
            for digits in (1, 2, 3)
        }

    def get_font(self, image_height, image_width):
        """Return the cached font for this image size, creating it on first use."""
        key = f"{image_height}_{image_width}"
        if key not in self.font_dict:
            self._setup_new_font(image_height, image_width)
        return self.font_dict[key]

    def get_mark_size(self, text_str, image_height, image_width):
        """Return (height, width) of the label box for `text_str`.

        The size is looked up by `len(text_str)`; any length other than 1, 2
        or 3 falls back to the 3-character size.
        """
        key = f"{image_height}_{image_width}"
        if key not in self.markSize_dict:
            self._setup_new_font(image_height, image_width)
        sizes = self.markSize_dict[key]
        # default to the largest size if the text is too long
        text_h, text_w = sizes.get(len(text_str), sizes.get(3, None))
        return text_h, text_w
def __calculate_iou(box1, box2, return_area=False):
    """
    Score how strongly two bounding boxes overlap.

    Despite the name this is not a true intersection-over-union: the
    intersection area is divided by the area of the SMALLER box (plus 0.0001
    so that a zero-area box does not divide by zero).

    :param box1: Tuple of (y, x, h, w) for the first bounding box
    :param box2: Tuple of (y, x, h, w) for the second bounding box
    :param return_area: If true, also return the raw intersection area
    :return: The overlap score, or (score, intersection_area) when
        return_area is true
    """
    top1, left1, height1, width1 = box1
    top2, left2, height2, width2 = box2

    # Overlap extent along each axis, clamped to 0 for disjoint boxes.
    overlap_h = max(0, min(top1 + height1, top2 + height2) - max(top1, top2))
    overlap_w = max(0, min(left1 + width1, left2 + width2) - max(left1, left2))
    intersection_area = overlap_h * overlap_w

    smaller_area = min(height1 * width1, height2 * width2)
    score = intersection_area / (smaller_area + 0.0001)

    return (score, intersection_area) if return_area else score
def __calculate_nearest_corner_distance(box1, box2):
    """Return the smallest corner-to-corner distance between two boxes.

    Only the four corners of each (y, x, h, w) box are compared (16 pairs);
    points along the edges are not sampled.
    """
    def corner_points(box):
        top, left, height, width = box
        bottom, right = top + height, left + width
        return np.array([
            [top, left],
            [top, right],
            [bottom, left],
            [bottom, right],
        ])

    first = corner_points(box1)
    second = corner_points(box2)
    # (4, 1, 2) - (1, 4, 2) broadcasts to the offsets of all 16 corner pairs.
    offsets = first[:, None, :] - second[None, :, :]
    return np.linalg.norm(offsets, axis=-1).min()
def _find_least_overlapping_corner(bbox, bboxes, drawn_boxes, text_size, image_size):
    """Find the label position around `bbox` that collides least with other boxes.

    Eight candidate positions just outside the box (two per side) are tried.
    Candidates that do not fit entirely inside the image are skipped. Each
    remaining candidate is scored by its worst case over all other boxes:
    intersection area minus 0.0001 times the nearest corner distance. The
    candidate with the lowest worst-case score wins; earlier candidates win
    ties. If every candidate is skipped, the first one (top-left) is returned
    even though it lies outside the image.

    Args:
        bbox: (y, x, h, w) The bounding box to place the text on.
        bboxes: [(y, x, h, w)] The bounding boxes to compare against. May be
            any iterable of boxes, including a 2D NumPy array.
        drawn_boxes: [(y, x, h, w)] The label boxes that have already been drawn.
        text_size: (height, width) The size of the text to be drawn.
        image_size: (height, width) The size of the image.

    Returns:
        (y, x) of the top-left corner of the chosen label box.
    """
    y, x, h, w = bbox
    h_text, w_text = text_size
    image_height, image_width = image_size
    corners = [
        # top-left
        (y - h_text, x),
        # top-right
        (y - h_text, x + w - w_text),
        # right-top
        (y, x + w),
        # right-bottom
        (y + h - h_text, x + w),
        # bottom-right
        (y + h, x + w - w_text),
        # bottom-left
        (y + h, x),
        # left-bottom
        (y + h - h_text, x - w_text),
        # left-top
        (y, x - w_text),
    ]
    # Build the obstacle list once. `bboxes + drawn_boxes` would add
    # element-wise (or fail to broadcast) for a NumPy array and raise for a
    # tuple; unpacking concatenates any iterable of boxes.
    obstacles = [*bboxes, *drawn_boxes]

    best_corner = corners[0]
    best_score = float('inf')
    for corner in corners:
        corner_y, corner_x = corner
        # if the corner is out of the image, skip
        if (corner_y < 0 or corner_x < 0
                or corner_y + h_text > image_height
                or corner_x + w_text > image_width):
            continue
        corner_bbox = (corner_y, corner_x, h_text, w_text)
        # Lower bound used when there is nothing to compare against.
        worst_score = -(image_width + image_height)
        # Find the worst case for this corner: the largest overlap with any
        # other box, with a small distance term to break ties.
        for other_bbox in obstacles:
            if np.array_equal(bbox, other_bbox):
                continue
            overlap_area = __calculate_iou(corner_bbox, other_bbox, return_area=True)[1]
            distance = __calculate_nearest_corner_distance(corner_bbox, other_bbox)
            worst_score = max(worst_score, overlap_area - 0.0001 * distance)
        # Keep the corner whose worst case is the best (smallest).
        if worst_score < best_score:
            best_score = worst_score
            best_corner = corner
    return best_corner
def plot_boxes_with_marks(
    image: Image.Image,
    bboxes,  # (y, x, h, w)
    mark_helper: MarkHelper,
    linewidth=2,
    alpha=0,
    edgecolor=None,
    fn_save=None,
    normalized_to_pixel=True,
    add_mark=True
) -> Image.Image:
    """Draw bounding boxes on an image and label each one with its index.

    Each label is placed at the position around its box that overlaps least
    with the other boxes and with labels already placed. The image is
    modified in place.

    Args:
        image: The image to plot the bounding boxes on.
        bboxes: Boxes as (y_top_left, x_top_left, box_height, box_width) rows,
            given as a list or a 2D array of shape (num_boxes, 4). If
            normalized_to_pixel is True, the values are floats normalized by
            the image size. If it is False, the values are pixel coordinates.
        mark_helper: Supplies the label font and label size for this image size.
        linewidth: Outline width of the boxes.
        alpha: Unused.
        edgecolor: Outline colour passed to the rectangle drawing call.
        fn_save: If not None, the image is saved to this path.
        normalized_to_pixel: Whether to scale `bboxes` from normalized to
            pixel coordinates.
        add_mark: Whether to draw the index labels. Label positions are still
            computed when False.

    Returns:
        The same `image` object, after drawing.
    """
    draw = ImageDraw.Draw(image)
    image_width, image_height = image.size
    if normalized_to_pixel:
        bboxes = [
            (int(y * image_height), int(x * image_width),
             int(h * image_height), int(w * image_width))
            for y, x, h, w in bboxes
        ]
    else:
        # Materialise as a list: a NumPy array passed on as-is would be
        # combined arithmetically with the drawn-label list downstream
        # instead of being concatenated with it.
        bboxes = list(bboxes)

    # draw boxes on the image
    for y, x, h, w in bboxes:
        draw.rectangle([x, y, x + w, y + h], outline=edgecolor, width=linewidth)

    # Draw the bounding boxes with index at the least overlapping corner
    drawn_boxes = []
    for idx, bbox in enumerate(bboxes):
        text = str(idx)
        text_h, text_w = mark_helper.get_mark_size(text, image_height, image_width)
        corner_y, corner_x = _find_least_overlapping_corner(
            bbox, bboxes, drawn_boxes, (text_h, text_w), (image_height, image_width))
        if add_mark:
            # Draw the filled index box and text
            draw.rectangle(
                [corner_x, corner_y, corner_x + text_w, corner_y + text_h],  # (x, y, x + w, y + h)
                fill="red")
            font = mark_helper.get_font(image_height, image_width)
            draw.text((corner_x, corner_y), text, fill='white', font=font)
        # Record the label box (y, x, h, w) so later labels avoid it
        drawn_boxes.append(np.array((corner_y, corner_x, text_h, text_w)))

    if fn_save is not None:  # PIL image
        image.save(fn_save)
    return image
def plot_circles_with_marks(
    image: Image.Image,
    points,  # (x, y)
    mark_helper: MarkHelper,
    linewidth=2,
    edgecolor=None,
    fn_save=None,
    normalized_to_pixel=True,
    add_mark=True
) -> Image.Image:
    """Draw a radius-5 circle on the image at each point.

    The image is modified in place. No index labels are drawn by this
    function: `mark_helper` and `add_mark` are accepted but unused.

    Args:
        image: The image to draw on.
        points: Iterable of (x, y) circle centres. If normalized_to_pixel is
            True, the values are floats normalized by the image width and
            height. If it is False, the values are pixel coordinates.
        mark_helper: Unused.
        linewidth: Outline width of the circles.
        edgecolor: Outline colour passed to the circle drawing call.
        fn_save: If not None, the image is saved to this path.
        normalized_to_pixel: Whether to scale `points` from normalized to
            pixel coordinates.
        add_mark: Unused.

    Returns:
        The same `image` object, after drawing.
    """
    image_width, image_height = image.size
    if normalized_to_pixel:
        # Scale the points themselves. The previous code scaled an undefined
        # `bboxes` local here, so the default call path always raised.
        points = [(int(x * image_width), int(y * image_height)) for x, y in points]

    draw = ImageDraw.Draw(image)
    for x, y in points:
        draw.circle((x, y), radius=5, outline=edgecolor, width=linewidth)

    if fn_save is not None:  # PIL image
        image.save(fn_save)
    return image
# Module-level MarkHelper instance, created at import time.
markhelper = MarkHelper()
# NOTE(review): not referenced in the visible part of this file; by its name,
# an overlap proportion used when de-duplicating boxes — confirm at the caller.
BBOX_DEDUPLICATION_IOU_PROPORTION = 0.5
# Default `vertical_threshold` of merge_connected_bboxes.
BBOX_GROUPING_VERTICAL_THRESHOLD = 20
# Default `horizontal_threshold` of merge_connected_bboxes.
BBOX_GROUPING_HORIZONTAL_THRESHOLD = 20
# NOTE(review): not referenced in the visible part of this file; meaning
# cannot be determined from here — TODO document at the point of use.
BBOX_AUG_TARGET = 2.0
def _is_boxes_same_line_or_near(bbox1, bbox2, vertical_threshold, horizontal_threshold):
    """Tell whether two (y, x, h, w) boxes are close enough to be grouped.

    Along each axis, the boxes count as close when the gap between their
    starting coordinates is at most the extent of the box that starts first
    plus that axis's threshold (touching boxes therefore qualify). The boxes
    must be close along both axes.
    """
    def close_along_axis(start_a, extent_a, start_b, extent_b, slack):
        return (start_a <= start_b and start_b - start_a <= extent_a + slack) or (
            start_b <= start_a and start_a - start_b <= extent_b + slack)

    top_a, left_a, height_a, width_a = bbox1
    top_b, left_b, height_b, width_b = bbox2

    near_in_x = close_along_axis(left_a, width_a, left_b, width_b, horizontal_threshold)
    near_in_y = close_along_axis(top_a, height_a, top_b, height_b, vertical_threshold)

    return near_in_y and near_in_x
def _build_adjacency_matrix(bboxes, vertical_threshold, horizontal_threshold):
    """Return the symmetric 0/1 matrix marking which box pairs should merge.

    Entry [i, j] is 1 when boxes i and j satisfy
    `_is_boxes_same_line_or_near` with the given thresholds. The diagonal
    stays 0.
    """
    count = len(bboxes)
    adjacency = np.zeros((count, count), dtype=int)
    for row, first_box in enumerate(bboxes):
        # Only the upper triangle is tested; each hit is mirrored below.
        for col in range(row + 1, count):
            if _is_boxes_same_line_or_near(first_box, bboxes[col], vertical_threshold, horizontal_threshold):
                adjacency[row, col] = adjacency[col, row] = 1
    return adjacency
def merge_connected_bboxes(bboxes, text_details,
                           vertical_threshold=BBOX_GROUPING_VERTICAL_THRESHOLD,
                           horizontal_threshold=BBOX_GROUPING_HORIZONTAL_THRESHOLD
                           ):
    """Merge boxes that are near each other into their common bounding box.

    Boxes are linked pairwise by the adjacency criteria; every connected
    group of linked boxes is replaced by one enclosing box, and the group's
    texts are joined with single spaces in ascending index order.

    Args:
        bboxes: Sequence of boxes in (x1, y1, x2, y2) corner format.
        text_details: A list of text strings, one per bounding box.
        vertical_threshold: The maximum vertical distance between two boxes to be considered in the same line.
        horizontal_threshold: The maximum horizontal distance between two boxes to be considered close.

    Returns:
        (merged_bboxes, merged_text_details): merged boxes as
        (x1, y1, x2, y2) tuples and the joined text of each. With zero or one
        input box the inputs are returned unchanged.
    """
    # Nothing to merge.
    if len(bboxes) <= 1:
        return bboxes, text_details

    # Re-express the (x1, y1, x2, y2) input as (y, x, height, width) rows.
    xyxy = np.array(bboxes)
    left, top, right, bottom = xyxy[:, 0], xyxy[:, 1], xyxy[:, 2], xyxy[:, 3]
    yxhw = np.array([top, left, bottom - top, right - left]).T

    # Link nearby boxes and read the groups off as connected components.
    adjacency = _build_adjacency_matrix(yxhw, vertical_threshold, horizontal_threshold)
    graph = nx.from_numpy_array(adjacency)

    # Per-box extremes, recomputed from the (y, x, height, width) rows.
    tops, lefts = yxhw[:, 0], yxhw[:, 1]
    bottoms = tops + yxhw[:, 2]
    rights = lefts + yxhw[:, 3]

    merged_bboxes = []
    merged_text_details = []
    for component in nx.connected_components(graph):
        members = sorted(component)  # e.g., [30, 31, 32, 33, 34]
        merged_text_details.append(' '.join(text_details[i] for i in members))

        y_min = min(tops[i] for i in members)
        x_min = min(lefts[i] for i in members)
        y_max = max(bottoms[i] for i in members)
        x_max = max(rights[i] for i in members)

        # Go through height/width and back to corners so the arithmetic
        # matches the (y, x, height, width) round trip exactly.
        height = y_max - y_min
        width = x_max - x_min
        merged_bboxes.append((x_min, y_min, x_min + width, y_min + height))

    return merged_bboxes, merged_text_details