Spaces:
Running
on
Zero
Running
on
Zero
import torch | |
from ultralytics import YOLO | |
from PIL import Image | |
import io | |
import base64 | |
device = 'cuda' | |
from PIL import Image, ImageDraw, ImageFont | |
import numpy as np | |
import networkx as nx | |
# import cv2 | |
font_path = "./util/arial.ttf" | |
class MarkHelper: | |
def __init__(self): | |
self.markSize_dict = {} | |
self.font_dict = {} | |
self.min_font_size = 20 # 1 in v1 | |
self.max_font_size = 30 | |
self.max_font_proportion = 0.04 # 0.032 in v1 | |
def __get_markSize(self, text, image_height, image_width, font): | |
im = Image.new('RGB', (image_width, image_height)) | |
draw = ImageDraw.Draw(im) | |
_, _, width, height = draw.textbbox((0, 0), text=text, font=font) | |
return height, width | |
def _setup_new_font(self, image_height, image_width): | |
key = f"{image_height}_{image_width}" | |
# print(f"Setting up new font for image size: {key}") | |
# setup the font | |
fontsize = self.min_font_size | |
font = ImageFont.truetype(font_path, fontsize) | |
# font = ImageFont.load_default(size=fontsize) | |
while min(self.__get_markSize("555", image_height, image_width, font)) < min(self.max_font_size, self.max_font_proportion * min(image_height, image_width)): | |
# iterate until the text size is just larger than the criteria | |
fontsize += 1 | |
font = ImageFont.truetype(font_path, fontsize) | |
# font = ImageFont.load_default(size=fontsize) | |
self.font_dict[key] = font | |
# setup the markSize dict | |
markSize_3digits = self.__get_markSize('555', image_height, image_width, font) | |
markSize_2digits = self.__get_markSize('55', image_height, image_width, font) | |
markSize_1digit = self.__get_markSize('5', image_height, image_width, font) | |
self.markSize_dict[key] = { | |
1: markSize_1digit, | |
2: markSize_2digits, | |
3: markSize_3digits | |
} | |
def get_font(self, image_height, image_width): | |
key = f"{image_height}_{image_width}" | |
if key not in self.font_dict: | |
self._setup_new_font(image_height, image_width) | |
return self.font_dict[key] | |
def get_mark_size(self, text_str, image_height, image_width): | |
"""Get the font size for the given image dimensions.""" | |
key = f"{image_height}_{image_width}" | |
if key not in self.markSize_dict: | |
self._setup_new_font(image_height, image_width) | |
largest_size = self.markSize_dict[key].get(3, None) | |
text_h, text_w = self.markSize_dict[key].get(len(text_str), largest_size) # default to the largest size if the text is too long | |
return text_h, text_w | |
def __calculate_iou(box1, box2, return_area=False): | |
""" | |
Calculate the Intersection over Union (IoU) of two bounding boxes. | |
:param box1: Tuple of (y, x, h, w) for the first bounding box | |
:param box2: Tuple of (y, x, h, w) for the second bounding box | |
:return: IoU value | |
""" | |
y1, x1, h1, w1 = box1 | |
y2, x2, h2, w2 = box2 | |
# Calculate the intersection area | |
y_min = max(y1, y2) | |
x_min = max(x1, x2) | |
y_max = min(y1 + h1, y2 + h2) | |
x_max = min(x1 + w1, x2 + w2) | |
intersection_area = max(0, y_max - y_min) * max(0, x_max - x_min) | |
# Compute the area of both bounding boxes | |
box1_area = h1 * w1 | |
box2_area = h2 * w2 | |
# Calculate the IoU | |
# iou = intersection_area / box1_area + box2_area - intersection_area | |
iou = intersection_area / (min(box1_area, box2_area) + 0.0001) | |
if return_area: | |
return iou, intersection_area | |
return iou | |
def __calculate_nearest_corner_distance(box1, box2): | |
"""Calculate the distance between the nearest edge or corner of two bounding boxes.""" | |
y1, x1, h1, w1 = box1 | |
y2, x2, h2, w2 = box2 | |
corners1 = np.array([ | |
[y1, x1], | |
[y1, x1 + w1], | |
[y1 + h1, x1], | |
[y1 + h1, x1 + w1] | |
]) | |
corners2 = np.array([ | |
[y2, x2], | |
[y2, x2 + w2], | |
[y2 + h2, x2], | |
[y2 + h2, x2 + w2] | |
]) | |
# Calculate pairwise distances between corners | |
distances = np.linalg.norm(corners1[:, np.newaxis] - corners2, axis=2) | |
# Find the minimum distance | |
min_distance = np.min(distances) | |
return min_distance | |
def _find_least_overlapping_corner(bbox, bboxes, drawn_boxes, text_size, image_size): | |
"""Find the corner with the least overlap with other bboxes. | |
Args: | |
bbox: (y, x, h, w) The bounding box to place the text on. | |
bboxes: [(y, x, h, w)] The list of bounding boxes to compare against. | |
drawn_boxes: [(y, x, h, w)] The list of bounding boxes that have already been drawn on. | |
text_size: (height, width) The size of the text to be drawn. | |
image_size: (height, width) The size of the image. | |
""" | |
y, x, h, w = bbox | |
h_text, w_text = text_size | |
image_height, image_width = image_size | |
corners = [ | |
# top-left | |
(y - h_text, x), | |
# top-right | |
(y - h_text, x + w - w_text), | |
# right-top | |
(y, x + w), | |
# right-bottom | |
(y + h - h_text, x + w), | |
# bottom-right | |
(y + h, x + w - w_text), | |
# bottom-left | |
(y + h, x), | |
# left-bottom | |
(y + h - h_text, x - w_text), | |
# left-top | |
(y, x - w_text), | |
] | |
best_corner = corners[0] | |
max_flag = float('inf') | |
for corner in corners: | |
corner_bbox = (corner[0], corner[1], h_text, w_text) | |
# if the corner is out of the image, skip | |
if corner[0] < 0 or corner[1] < 0 or corner[0] + h_text > image_height or corner[1] + w_text > image_width: | |
continue | |
max_iou = - (image_width + image_height) | |
# 找到关于这个角最差的 case | |
# given the current corner, find the larget iou with other bboxes. | |
for other_bbox in bboxes + drawn_boxes: | |
if np.array_equal(bbox, other_bbox): | |
continue | |
iou = __calculate_iou(corner_bbox, other_bbox, return_area=True)[1] | |
max_iou = max(max_iou, iou - 0.0001 * __calculate_nearest_corner_distance(corner_bbox, other_bbox)) | |
# the smaller the max_IOU, the better the corner | |
# 取最差的值 相对最好的那个角 | |
if max_iou < max_flag: | |
max_flag = max_iou | |
best_corner = corner | |
return best_corner | |
def plot_boxes_with_marks( | |
image: Image.Image, | |
bboxes, # (y, x, h, w) | |
mark_helper: MarkHelper, | |
linewidth=2, | |
alpha=0, | |
edgecolor=None, | |
fn_save=None, | |
normalized_to_pixel=True, | |
add_mark=True | |
) -> np.ndarray: | |
"""Plots bounding boxes on an image with marks attached to the edges of the boxes where no overlap with other boxes occurs. | |
Args: | |
image: The image to plot the bounding boxes on. | |
bboxes: A 2D int array of shape (num_boxes, 4), where each row represents a bounding box: (y_top_left, x_top_left, box_height, box_width). If normalized_to_pixel is True, the values are float and are normalized with the image size. If normalized_to_pixel is False, the values are int and are in pixel. | |
""" | |
# Then modify the drawing code | |
draw = ImageDraw.Draw(image) | |
# draw boxes on the image | |
image_width, image_height = image.size | |
if normalized_to_pixel: | |
bboxes = [(int(y * image_height), int(x * image_width), int(h * image_height), int(w * image_width)) for y, x, h, w in bboxes] | |
for box in bboxes: | |
y, x, h, w = box | |
draw.rectangle([x, y, x + w, y + h], outline=edgecolor, width=linewidth) | |
# Draw the bounding boxes with index at the least overlapping corner | |
drawn_boxes = [] | |
for idx, bbox in enumerate(bboxes): | |
text = str(idx) | |
text_h, text_w = mark_helper.get_mark_size(text, image_height, image_width) | |
corner_y, corner_x = _find_least_overlapping_corner( | |
bbox, bboxes, drawn_boxes, (text_h, text_w), (image_height, image_width)) | |
# Define the index box (y, x, y + h, x + w) | |
text_box = (corner_y, corner_x, text_h, text_w) | |
if add_mark: | |
# Draw the filled index box and text | |
draw.rectangle([corner_x, corner_y, corner_x + text_w, corner_y + text_h], # (x, y, x + w, y + h) | |
fill="red") | |
font = mark_helper.get_font(image_height, image_width) | |
draw.text((corner_x, corner_y), text, fill='white', font=font) | |
# Update the list of drawn boxes | |
drawn_boxes.append(np.array(text_box)) | |
if fn_save is not None: # PIL image | |
image.save(fn_save) | |
return image | |
def plot_circles_with_marks( | |
image: Image.Image, | |
points, # (x, y) | |
mark_helper: MarkHelper, | |
linewidth=2, | |
edgecolor=None, | |
fn_save=None, | |
normalized_to_pixel=True, | |
add_mark=True | |
) -> np.ndarray: | |
"""Plots bounding boxes on an image with marks attached to the edges of the boxes where no overlap with other boxes occurs. | |
Args: | |
image: The image to plot the bounding boxes on. | |
bboxes: A 2D int array of shape (num_boxes, 4), where each row represents a bounding box: (y_top_left, x_top_left, box_height, box_width). If normalized_to_pixel is True, the values are float and are normalized with the image size. If normalized_to_pixel is False, the values are int and are in pixel. | |
""" | |
# draw boxes on the image | |
image_width, image_height = image.size | |
if normalized_to_pixel: | |
bboxes = [(int(y * image_height), int(x * image_width), int(h * image_height), int(w * image_width)) for y, x, h, w in bboxes] | |
draw = ImageDraw.Draw(image) | |
for point in points: | |
x, y = point | |
draw.circle((x, y), radius=5, outline=edgecolor, width=linewidth) | |
if fn_save is not None: # PIL image | |
image.save(fn_save) | |
return image | |
markhelper = MarkHelper() | |
BBOX_DEDUPLICATION_IOU_PROPORTION = 0.5 | |
BBOX_GROUPING_VERTICAL_THRESHOLD = 20 | |
BBOX_GROUPING_HORIZONTAL_THRESHOLD = 20 | |
BBOX_AUG_TARGET = 2.0 | |
def _is_boxes_same_line_or_near(bbox1, bbox2, vertical_threshold, horizontal_threshold): | |
"""check if two boxes are in the same line or close enough to be considered together""" | |
y1, x1, h1, w1 = bbox1 | |
y2, x2, h2, w2 = bbox2 | |
# Check if the boxes are close horizontally (consider the edge case where the boxes are touching) | |
horizontally_close = (x1 <= x2 and x2 - x1 <= w1 + horizontal_threshold) or (x2 <= x1 and x1 - x2 <= w2 + horizontal_threshold) | |
# Check if the boxes are close vertically (consider the edge case where the boxes are touching) | |
vertically_close = (y1 <= y2 and y2 - y1 <= h1 + vertical_threshold) or (y2 <= y1 and y1 - y2 <= h2 + vertical_threshold) | |
# Consider the boxes to be in the same line if they are vertically close and either overlap or are close horizontally | |
return vertically_close and horizontally_close | |
def _build_adjacency_matrix(bboxes, vertical_threshold, horizontal_threshold): | |
"""Build the adjacency matrix based on the merging criteria.""" | |
num_boxes = len(bboxes) | |
A = np.zeros((num_boxes, num_boxes), dtype=int) | |
for i in range(num_boxes): | |
for j in range(i + 1, num_boxes): | |
if _is_boxes_same_line_or_near(bboxes[i], bboxes[j], vertical_threshold, horizontal_threshold): | |
A[i, j] = 1 | |
A[j, i] = 1 # Symmetric matrix | |
return A | |
def merge_connected_bboxes(bboxes, text_details, | |
vertical_threshold=BBOX_GROUPING_VERTICAL_THRESHOLD, | |
horizontal_threshold=BBOX_GROUPING_HORIZONTAL_THRESHOLD | |
): | |
"""Merge bboxes based on the adjacency matrix and return merged bboxes. | |
Args: | |
bboxes: A 2D array of shape (num_boxes, 4), where each row represents a bounding box: (y, x, height, width). | |
text_details: A list of text details for each bounding box. | |
vertical_threshold: The maximum vertical distance between two boxes to be considered in the same line. | |
horizontal_threshold: The maximum horizontal distance between two boxes to be considered close. | |
""" | |
# return if there are no bboxes | |
if len(bboxes) <= 1: | |
return bboxes, text_details | |
# Convert bboxes (x1, y1, x2, y2) to (y, x, height, width) format | |
bboxes = np.array(bboxes) | |
bboxes = np.array([bboxes[:, 1], bboxes[:, 0], bboxes[:, 3] - bboxes[:, 1], bboxes[:, 2] - bboxes[:, 0]]).T | |
# Build adjacency matrix | |
A = _build_adjacency_matrix(bboxes, vertical_threshold, horizontal_threshold) | |
# Create graph from adjacency matrix | |
G = nx.from_numpy_array(A) | |
# Find connected components | |
components = list(nx.connected_components(G)) | |
# Convert bboxes to (y_min, x_min, y_max, x_max) format | |
corners = np.copy(bboxes) | |
corners_y, corners_x, corners_h, corners_w = corners[:, 0], corners[:, 1], corners[:, 2], corners[:, 3] | |
corners_y_max = corners_y + corners_h | |
corners_x_max = corners_x + corners_w | |
# Merge bboxes for each connected component | |
merged_bboxes = [] | |
merged_text_details = [] | |
for component in components: | |
indices = list(component) # e.g., [32, 33, 34, 30, 31] | |
indices = sorted(indices) | |
# merge the text details | |
merged_text_details.append(' '.join([text_details[i] for i in indices])) | |
# merge the bboxes | |
y_min = min(corners_y[i] for i in indices) | |
x_min = min(corners_x[i] for i in indices) | |
y_max = max(corners_y_max[i] for i in indices) | |
x_max = max(corners_x_max[i] for i in indices) | |
merged_bboxes.append((y_min, x_min, y_max - y_min, x_max - x_min)) # Convert merged_bbox back to (y, x, height, width) format | |
# convert (y, x, height, width) to (x1, y1, x2, y2) format without np.array | |
merged_bboxes = [(bbox[1], bbox[0], bbox[1] + bbox[3], bbox[0] + bbox[2]) for bbox in merged_bboxes] | |
return merged_bboxes, merged_text_details |