Spaces:

drlon
/

magma-ui-agent

Running on Zero

File size: 13,895 Bytes

import torch
from ultralytics import YOLO
from PIL import Image
import io
import base64
# Name of the compute device, hard-coded to CUDA.
# NOTE(review): `device` is not referenced in the visible part of this file —
# presumably consumed by model-loading code elsewhere; confirm.
device = 'cuda'

from PIL import Image, ImageDraw, ImageFont
import numpy as np
import networkx as nx
# import cv2

# Relative path of the TrueType font that MarkHelper loads to render index marks.
font_path = "./util/arial.ttf"
class MarkHelper:
    """Cache the mark font and the rendered size of index marks per image size.

    For every distinct (image_height, image_width) pair a font size is chosen
    once and stored together with the pixel size of 1-, 2- and 3-digit marks,
    so later lookups are plain dictionary reads.
    """

    def __init__(self):
        # key "<height>_<width>" -> {digit_count: (text_height, text_width)}
        self.markSize_dict = {}
        # key "<height>_<width>" -> loaded font object
        self.font_dict = {}
        self.min_font_size = 20 # 1 in v1
        self.max_font_size = 30
        self.max_font_proportion = 0.04 # 0.032 in v1

    @staticmethod
    def _cache_key(image_height, image_width):
        """Build the dictionary key shared by ``font_dict`` and ``markSize_dict``."""
        return f"{image_height}_{image_width}"

    def __get_markSize(self, text, image_height, image_width, font):
        """Return (height, width) of ``text`` rendered with ``font`` at the origin.

        ``image_height`` / ``image_width`` are kept for signature compatibility
        only: the text bounding box does not depend on the canvas size, so a
        1x1 scratch image is used instead of allocating a full-size one for
        every measurement.
        """
        scratch = Image.new('RGB', (1, 1))
        draw = ImageDraw.Draw(scratch)
        # textbbox returns (left, top, right, bottom); right/bottom measured
        # from the (0, 0) anchor give the extent the drawn text occupies.
        _, _, width, height = draw.textbbox((0, 0), text=text, font=font)
        return height, width

    def _setup_new_font(self, image_height, image_width):
        """Choose a font for this image size and cache it with its mark sizes.

        Starting at ``min_font_size``, the font size is increased by one until
        the smaller dimension of the rendered text "555" reaches
        ``min(max_font_size, max_font_proportion * min(image_height, image_width))``.
        """
        key = self._cache_key(image_height, image_width)

        # The target is loop-invariant, so compute it once.
        target = min(self.max_font_size,
                     self.max_font_proportion * min(image_height, image_width))

        fontsize = self.min_font_size
        font = ImageFont.truetype(font_path, fontsize)
        while min(self.__get_markSize("555", image_height, image_width, font)) < target:
            # iterate until the text size is just larger than the criteria
            fontsize += 1
            font = ImageFont.truetype(font_path, fontsize)
        self.font_dict[key] = font

        # Pre-measure marks with 1, 2 and 3 digits ('5', '55', '555').
        self.markSize_dict[key] = {
            digits: self.__get_markSize('5' * digits, image_height, image_width, font)
            for digits in (1, 2, 3)
        }

    def get_font(self, image_height, image_width):
        """Return the cached font for the given image size, creating it on first use."""
        key = self._cache_key(image_height, image_width)
        if key not in self.font_dict:
            self._setup_new_font(image_height, image_width)
        return self.font_dict[key]

    def get_mark_size(self, text_str, image_height, image_width):
        """Return (text_height, text_width) of a mark for the given image size.

        The size is looked up by ``len(text_str)``; any length other than
        1, 2 or 3 falls back to the 3-digit size.
        """
        key = self._cache_key(image_height, image_width)
        if key not in self.markSize_dict:
            self._setup_new_font(image_height, image_width)

        sizes = self.markSize_dict[key]
        largest_size = sizes.get(3, None)
        text_h, text_w = sizes.get(len(text_str), largest_size)
        return text_h, text_w

def __calculate_iou(box1, box2, return_area=False):
    """
    Score how strongly two boxes overlap, relative to the smaller box.

    Despite the name this is not the textbook IoU: the intersection area is
    divided by the area of the smaller of the two boxes (plus 0.0001 so the
    division never hits zero). A box fully contained in the other therefore
    scores close to 1.
    :param box1: Tuple of (y, x, h, w) for the first bounding box
    :param box2: Tuple of (y, x, h, w) for the second bounding box
    :param return_area: When True, also return the raw intersection area
    :return: The overlap ratio, or (ratio, intersection_area) if return_area is True
    """
    top_a, left_a, height_a, width_a = box1
    top_b, left_b, height_b, width_b = box2

    # Extent of the shared region along each axis (clamped at 0 when disjoint).
    overlap_h = max(0, min(top_a + height_a, top_b + height_b) - max(top_a, top_b))
    overlap_w = max(0, min(left_a + width_a, left_b + width_b) - max(left_a, left_b))
    overlap_area = overlap_h * overlap_w

    smaller_area = min(height_a * width_a, height_b * width_b)
    ratio = overlap_area / (smaller_area + 0.0001)

    return (ratio, overlap_area) if return_area else ratio

def __calculate_nearest_corner_distance(box1, box2):
    """Return the smallest Euclidean distance between a corner of box1 and a corner of box2.

    Both boxes are (y, x, h, w). Only the four corner points of each box are
    compared (16 pairs); edges are not taken into account.
    """
    def _corner_points(box):
        # Corners in the order: top-left, top-right, bottom-left, bottom-right.
        top, left, height, width = box
        bottom, right = top + height, left + width
        return np.array([
            [top, left],
            [top, right],
            [bottom, left],
            [bottom, right],
        ])

    points_a = _corner_points(box1)
    points_b = _corner_points(box2)

    # Broadcast (4, 1, 2) against (4, 2) to get all 4x4 corner-to-corner offsets.
    pairwise = np.linalg.norm(points_a[:, np.newaxis] - points_b, axis=2)
    return pairwise.min()

def _find_least_overlapping_corner(bbox, bboxes, drawn_boxes, text_size, image_size):
    """Find the corner with the least overlap with other bboxes.

    Eight candidate positions around ``bbox`` are tried. Candidates that do
    not fit inside the image are skipped. Each remaining candidate is scored
    by its worst collision: the largest value of
    ``intersection_area - 0.0001 * nearest_corner_distance`` over all other
    boxes. The candidate with the lowest worst-case score wins; on ties the
    earlier candidate in the list is kept.

    Args:
        bbox: (y, x, h, w) The bounding box to place the text on.
        bboxes: [(y, x, h, w)] The bounding boxes to compare against
            (a list or a 2D array; entries equal to ``bbox`` are ignored).
        drawn_boxes: [(y, x, h, w)] The list of bounding boxes that have already been drawn on.
        text_size: (height, width) The size of the text to be drawn.
        image_size: (height, width) The size of the image.

    Returns:
        (y, x) of the top-left of the chosen text position. If no candidate
        fits inside the image, the first candidate (above the box, left
        aligned) is returned even though it is out of bounds.
    """
    y, x, h, w = bbox
    h_text, w_text = text_size
    image_height, image_width = image_size
    corners = [
        # top-left
        (y - h_text, x),
        # top-right
        (y - h_text, x + w - w_text),
        # right-top
        (y, x + w),
        # right-bottom
        (y + h - h_text, x + w),
        # bottom-right
        (y + h, x + w - w_text),
        # bottom-left
        (y + h, x),
        # left-bottom
        (y + h - h_text, x - w_text),
        # left-top
        (y, x - w_text),
        ]

    # Build the obstacle list once instead of once per corner. list() makes
    # sure an ndarray of boxes is concatenated row-wise rather than being
    # added element-wise to drawn_boxes (which raises on shape mismatch).
    obstacles = [
        other_bbox
        for other_bbox in list(bboxes) + list(drawn_boxes)
        if not np.array_equal(bbox, other_bbox)
    ]

    best_corner = corners[0]
    max_flag = float('inf')

    for corner in corners:
        corner_y, corner_x = corner
        # if the corner is out of the image, skip
        if (corner_y < 0 or corner_x < 0
                or corner_y + h_text > image_height
                or corner_x + w_text > image_width):
            continue
        corner_bbox = (corner_y, corner_x, h_text, w_text)

        # Find the worst case for this corner: the largest penalised overlap
        # with any other box. The starting value is meant to lie below any
        # score a real obstacle can produce.
        max_iou = - (image_width + image_height)
        for other_bbox in obstacles:
            overlap_area = __calculate_iou(corner_bbox, other_bbox, return_area=True)[1]
            max_iou = max(
                max_iou,
                overlap_area - 0.0001 * __calculate_nearest_corner_distance(corner_bbox, other_bbox),
            )

        # The smaller the worst-case value, the better the corner: keep the
        # corner whose worst case is the best among all candidates.
        if max_iou < max_flag:
            max_flag = max_iou
            best_corner = corner

    return best_corner

def plot_boxes_with_marks(
    image: Image.Image,
    bboxes, # (y, x, h, w)
    mark_helper: MarkHelper,
    linewidth=2,
    alpha=0,
    edgecolor=None,
    fn_save=None,
    normalized_to_pixel=True,
    add_mark=True
) -> Image.Image:
    """Plots bounding boxes on an image with marks attached to the edges of the boxes where no overlap with other boxes occurs.

    The image is drawn on in place and the same object is returned.

    Args:
        image: The image to plot the bounding boxes on.
        bboxes: A 2D array or list of shape (num_boxes, 4), where each row represents a bounding box: (y_top_left, x_top_left, box_height, box_width). If normalized_to_pixel is True, the values are float and are normalized with the image size. If normalized_to_pixel is False, the values are int and are in pixel.
        mark_helper: Provides the font and the pixel size of each index mark.
        linewidth: Outline width of the box rectangles.
        alpha: Currently unused; kept so existing callers keep working.
        edgecolor: Outline colour passed to the rectangle drawing call.
        fn_save: If not None, the annotated image is saved to this path.
        normalized_to_pixel: Whether ``bboxes`` must be scaled by the image size first.
        add_mark: If False, only the rectangles are drawn; mark positions are
            still computed but nothing is rendered for them.

    Returns:
        The annotated PIL image (the ``image`` argument itself).
    """
    draw = ImageDraw.Draw(image)

    # PIL reports size as (width, height).
    image_width, image_height = image.size

    if normalized_to_pixel:
        bboxes = [(int(y * image_height), int(x * image_width), int(h * image_height), int(w * image_width)) for y, x, h, w in bboxes]
    else:
        # Materialise as a list of tuples so an ndarray input behaves like a
        # list downstream, where the boxes are concatenated with the list of
        # already drawn marks.
        bboxes = [tuple(box) for box in bboxes]

    # draw boxes on the image
    for box in bboxes:
        y, x, h, w = box
        draw.rectangle([x, y, x + w, y + h], outline=edgecolor, width=linewidth)

    # Draw the bounding boxes with index at the least overlapping corner
    drawn_boxes = []
    for idx, bbox in enumerate(bboxes):
        text = str(idx)
        text_h, text_w = mark_helper.get_mark_size(text, image_height, image_width)
        corner_y, corner_x = _find_least_overlapping_corner(
            bbox, bboxes, drawn_boxes, (text_h, text_w), (image_height, image_width))

        # Mark box in the same (y, x, h, w) layout as the bounding boxes
        text_box = (corner_y, corner_x, text_h, text_w)

        if add_mark:
            # Draw the filled index box and text
            draw.rectangle([corner_x, corner_y, corner_x + text_w, corner_y + text_h], # (x, y, x + w, y + h)
                        fill="red")
            font = mark_helper.get_font(image_height, image_width)
            draw.text((corner_x, corner_y), text, fill='white', font=font)

        # Remember the mark so later marks avoid it (also when add_mark is False)
        drawn_boxes.append(np.array(text_box))

    if fn_save is not None: # PIL image
        image.save(fn_save)
    return image

def plot_circles_with_marks(
    image: Image.Image,
    points, # (x, y)
    mark_helper: MarkHelper,
    linewidth=2,
    edgecolor=None,
    fn_save=None,
    normalized_to_pixel=True,
    add_mark=True
) -> Image.Image:
    """Draws a small circle (radius 5) on the image at every given point.

    The image is drawn on in place and the same object is returned.

    Args:
        image: The image to draw on.
        points: An iterable of (x, y) points. If normalized_to_pixel is True, the values are float and are normalized with the image size. If normalized_to_pixel is False, the values are in pixel.
        mark_helper: Currently unused; kept for signature parity with plot_boxes_with_marks.
        linewidth: Outline width of the circles.
        edgecolor: Outline colour of the circles.
        fn_save: If not None, the annotated image is saved to this path.
        normalized_to_pixel: Whether ``points`` must be scaled by the image size first.
        add_mark: Currently unused; no index marks are drawn for points.

    Returns:
        The annotated PIL image (the ``image`` argument itself).
    """
    # PIL reports size as (width, height).
    image_width, image_height = image.size

    if normalized_to_pixel:
        # Scale the normalized (x, y) points to pixel coordinates. The
        # previous code tried to convert an undefined `bboxes` variable here,
        # which raised UnboundLocalError on every default call.
        points = [(int(x * image_width), int(y * image_height)) for x, y in points]

    draw = ImageDraw.Draw(image)
    for x, y in points:
        draw.circle((x, y), radius=5, outline=edgecolor, width=linewidth)

    if fn_save is not None: # PIL image
        image.save(fn_save)
    return image

# Module-level MarkHelper instance, created when this module is imported.
markhelper = MarkHelper()

# Tuning constants for bounding-box post-processing.
# The two BBOX_GROUPING_* values are the default thresholds of
# merge_connected_bboxes.
# NOTE(review): BBOX_DEDUPLICATION_IOU_PROPORTION and BBOX_AUG_TARGET are not
# referenced in the visible part of this file — presumably used elsewhere; confirm.
BBOX_DEDUPLICATION_IOU_PROPORTION = 0.5
BBOX_GROUPING_VERTICAL_THRESHOLD = 20
BBOX_GROUPING_HORIZONTAL_THRESHOLD = 20
BBOX_AUG_TARGET = 2.0

def _is_boxes_same_line_or_near(bbox1, bbox2, vertical_threshold, horizontal_threshold):
    """Tell whether two (y, x, h, w) boxes are close enough on both axes to be grouped.

    Along each axis the boxes count as close when the distance between their
    start coordinates is at most the extent of the box that starts first plus
    the threshold for that axis. Touching or overlapping boxes therefore
    always qualify on that axis.
    """
    top_a, left_a, height_a, width_a = bbox1
    top_b, left_b, height_b, width_b = bbox2

    def _close_on_axis(start_a, extent_a, start_b, extent_b, slack):
        # Either box may be the one that starts first; test both orders.
        a_first = start_a <= start_b and start_b - start_a <= extent_a + slack
        b_first = start_b <= start_a and start_a - start_b <= extent_b + slack
        return a_first or b_first

    near_vertically = _close_on_axis(top_a, height_a, top_b, height_b, vertical_threshold)
    # Both axes have to agree for the boxes to be considered together.
    return near_vertically and _close_on_axis(
        left_a, width_a, left_b, width_b, horizontal_threshold)

def _build_adjacency_matrix(bboxes, vertical_threshold, horizontal_threshold):
    """Return a symmetric 0/1 integer matrix marking which pairs of boxes should be merged.

    Entry [i, j] is 1 when boxes i and j satisfy _is_boxes_same_line_or_near
    with the given thresholds; the diagonal stays 0.
    """
    count = len(bboxes)
    adjacency = np.zeros((count, count), dtype=int)

    # Test each unordered pair once and mirror the result across the diagonal.
    for row, anchor in enumerate(bboxes):
        for col in range(row + 1, count):
            if _is_boxes_same_line_or_near(anchor, bboxes[col], vertical_threshold, horizontal_threshold):
                adjacency[row, col] = adjacency[col, row] = 1

    return adjacency

def merge_connected_bboxes(bboxes, text_details, 
    vertical_threshold=BBOX_GROUPING_VERTICAL_THRESHOLD, 
    horizontal_threshold=BBOX_GROUPING_HORIZONTAL_THRESHOLD
):
    """Merge groups of nearby bboxes into one enclosing bbox per group.

    Boxes are linked pairwise via _build_adjacency_matrix; every connected
    component of the resulting graph becomes one merged box, and the texts of
    its members are joined with single spaces in ascending index order.

    Args:
        bboxes: A sequence of boxes in (x1, y1, x2, y2) corner format.
        text_details: A list of text details for each bounding box.
        vertical_threshold: The maximum vertical distance between two boxes to be considered in the same line.
        horizontal_threshold: The maximum horizontal distance between two boxes to be considered close.

    Returns:
        (merged_bboxes, merged_text_details), with the boxes as
        (x1, y1, x2, y2) tuples. With zero or one input box the inputs are
        returned unchanged.
    """
    # nothing to merge
    if len(bboxes) <= 1:
        return bboxes, text_details

    # Re-express the (x1, y1, x2, y2) input as (y, x, height, width) rows.
    xyxy = np.array(bboxes)
    x1, y1 = xyxy[:, 0], xyxy[:, 1]
    yxhw = np.array([y1, x1, xyxy[:, 3] - y1, xyxy[:, 2] - x1]).T

    # Link nearby boxes and let networkx find the groups.
    adjacency = _build_adjacency_matrix(yxhw, vertical_threshold, horizontal_threshold)
    graph = nx.from_numpy_array(adjacency)

    # Per-box extremes, derived from the (y, x, height, width) rows.
    tops, lefts = yxhw[:, 0], yxhw[:, 1]
    bottoms = tops + yxhw[:, 2]
    rights = lefts + yxhw[:, 3]

    merged_bboxes = []
    merged_text_details = []
    for group in nx.connected_components(graph):
        members = sorted(group) # e.g., [30, 31, 32, 33, 34]

        merged_text_details.append(' '.join(text_details[i] for i in members))

        y_lo = min(tops[members])
        x_lo = min(lefts[members])
        y_hi = max(bottoms[members])
        x_hi = max(rights[members])

        # Far corner is rebuilt as origin + size (the same arithmetic as a
        # round trip through (y, x, height, width)) before emitting
        # (x1, y1, x2, y2).
        merged_bboxes.append((x_lo, y_lo, x_lo + (x_hi - x_lo), y_lo + (y_hi - y_lo)))

    return merged_bboxes, merged_text_details