Spaces:

drlon
/

magma-ui-agent

Running on Zero

App Files Files Community

magma-ui-agent / util /som.py

drlon

update som.py

c068575 20 days ago

raw

history blame contribute delete

13.9 kB

	import torch
	from ultralytics import YOLO
	from PIL import Image
	import io
	import base64
	# Device identifier string; not referenced in the visible part of this file —
	# presumably consumed by torch/YOLO code elsewhere (TODO confirm).
	device = 'cuda'

	from PIL import Image, ImageDraw, ImageFont
	import numpy as np
	import networkx as nx
	# import cv2

	# Relative path of the TrueType font that MarkHelper loads with ImageFont.truetype.
	font_path = "./util/arial.ttf"
	class MarkHelper:
	    """Choose and cache the label font and the rendered label sizes per image size.

	    For every distinct ``(image_height, image_width)`` the helper picks the
	    smallest font size, starting at ``min_font_size``, for which the shorter
	    side of the rendered text "555" reaches
	    ``min(max_font_size, max_font_proportion * min(image_height, image_width))``.
	    The chosen font and the rendered sizes of 1-, 2- and 3-digit labels are
	    cached, so the search runs once per image size.
	    """

	    def __init__(self):
	        # image-size key -> {digit_count: (height, width)} of the rendered label
	        self.markSize_dict = {}
	        # image-size key -> font object chosen for that image size
	        self.font_dict = {}
	        self.min_font_size = 20  # 1 in v1
	        self.max_font_size = 30
	        self.max_font_proportion = 0.04  # 0.032 in v1

	    def __get_markSize(self, text, image_height, image_width, font):
	        """Return ``(height, width)`` of ``text`` rendered with ``font`` at the origin.

	        ``image_height`` and ``image_width`` are kept for signature
	        compatibility only: the text bounding box depends on the text, the
	        font and the canvas mode, not on the canvas size, so a 1x1 scratch
	        canvas is used instead of allocating a full-resolution image for
	        every measurement.
	        """
	        scratch = Image.new('RGB', (1, 1))
	        draw = ImageDraw.Draw(scratch)
	        # textbbox returns (left, top, right, bottom); right/bottom are kept
	        # as the label extent measured from the drawing origin.
	        _, _, width, height = draw.textbbox((0, 0), text=text, font=font)
	        return height, width

	    def _setup_new_font(self, image_height, image_width):
	        """Select the font for this image size and fill both caches.

	        Args:
	            image_height: Height of the image in pixels.
	            image_width: Width of the image in pixels.
	        """
	        key = f"{image_height}_{image_width}"

	        # The size criterion does not change while searching, so compute it once.
	        target = min(self.max_font_size,
	                     self.max_font_proportion * min(image_height, image_width))

	        # Grow the font until the rendered "555" just meets the criterion.
	        fontsize = self.min_font_size
	        font = ImageFont.truetype(font_path, fontsize)
	        size_3digits = self.__get_markSize("555", image_height, image_width, font)
	        while min(size_3digits) < target:
	            fontsize += 1
	            font = ImageFont.truetype(font_path, fontsize)
	            size_3digits = self.__get_markSize("555", image_height, image_width, font)
	        self.font_dict[key] = font

	        # Cache the label size per digit count; the 3-digit size is already
	        # known from the last step of the search above.
	        self.markSize_dict[key] = {
	            1: self.__get_markSize('5', image_height, image_width, font),
	            2: self.__get_markSize('55', image_height, image_width, font),
	            3: size_3digits,
	        }

	    def get_font(self, image_height, image_width):
	        """Return the cached font for this image size, selecting it on first use."""
	        key = f"{image_height}_{image_width}"
	        if key not in self.font_dict:
	            self._setup_new_font(image_height, image_width)
	        return self.font_dict[key]

	    def get_mark_size(self, text_str, image_height, image_width):
	        """Return the ``(height, width)`` in pixels reserved for the label ``text_str``.

	        The size is looked up by the number of characters in ``text_str``;
	        any length other than 1, 2 or 3 falls back to the 3-digit size.
	        """
	        key = f"{image_height}_{image_width}"
	        if key not in self.markSize_dict:
	            self._setup_new_font(image_height, image_width)

	        sizes = self.markSize_dict[key]
	        largest_size = sizes.get(3, None)
	        text_h, text_w = sizes.get(len(text_str), largest_size)
	        return text_h, text_w

	def __calculate_iou(box1, box2, return_area=False):
	    """Measure how much two boxes overlap, relative to the smaller box.

	    Despite the name this is not a textbook IoU: the intersection area is
	    divided by the area of the smaller box (plus 0.0001 to avoid division by
	    zero), not by the union area.

	    Args:
	        box1: (y, x, h, w) of the first box.
	        box2: (y, x, h, w) of the second box.
	        return_area: When true, also return the raw intersection area.

	    Returns:
	        The overlap ratio, or ``(ratio, intersection_area)`` when
	        ``return_area`` is true.
	    """
	    top_a, left_a, height_a, width_a = box1
	    top_b, left_b, height_b, width_b = box2

	    # Extent of the shared region along each axis; negative when the boxes are apart.
	    overlap_h = min(top_a + height_a, top_b + height_b) - max(top_a, top_b)
	    overlap_w = min(left_a + width_a, left_b + width_b) - max(left_a, left_b)
	    overlap_area = max(0, overlap_h) * max(0, overlap_w)

	    smaller_area = min(height_a * width_a, height_b * width_b)
	    ratio = overlap_area / (smaller_area + 0.0001)

	    return (ratio, overlap_area) if return_area else ratio

	def __calculate_nearest_corner_distance(box1, box2):
	    """Return the smallest Euclidean distance between any corner of ``box1`` and any corner of ``box2``.

	    Only the 4 x 4 corner pairs are compared; edges are not considered.
	    Both boxes are given as (y, x, h, w).
	    """
	    def corner_points(box):
	        # Rows are (y, x) for top-left, top-right, bottom-left, bottom-right.
	        top, left, height, width = box
	        return np.array([
	            [top, left],
	            [top, left + width],
	            [top + height, left],
	            [top + height, left + width],
	        ])

	    points_a = corner_points(box1)
	    points_b = corner_points(box2)

	    # Broadcasting (4, 1, 2) against (4, 2) yields every pairwise difference.
	    deltas = points_a[:, np.newaxis] - points_b
	    return np.min(np.linalg.norm(deltas, axis=2))

	def _find_least_overlapping_corner(bbox, bboxes, drawn_boxes, text_size, image_size):
	    """Choose where to attach a label to ``bbox`` so it collides least with other boxes.

	    Eight candidate positions around the box are tried (two per side, just
	    outside the box and flush with a corner). Candidates that do not fit
	    entirely inside the image are skipped. Each remaining candidate is scored
	    by its worst case over all other boxes: the intersection area with that
	    box minus a small bonus (0.0001 per pixel) for being far from it. The
	    candidate with the lowest worst-case score wins; ties keep the earliest
	    candidate.

	    Args:
	        bbox: (y, x, h, w) The bounding box to place the text on.
	        bboxes: [(y, x, h, w)] The bounding boxes to compare against; a list
	            or a 2D array. Entries equal to ``bbox`` are ignored.
	        drawn_boxes: [(y, x, h, w)] Label boxes that have already been placed.
	        text_size: (height, width) The size of the text to be drawn.
	        image_size: (height, width) The size of the image.

	    Returns:
	        (y, x) of the top-left corner for the label. If no candidate fits
	        inside the image, the first candidate (above the box, left-aligned)
	        is returned even though it lies partly outside the image.
	    """
	    y, x, h, w = bbox
	    h_text, w_text = text_size
	    image_height, image_width = image_size
	    corners = [
	        # top-left
	        (y - h_text, x),
	        # top-right
	        (y - h_text, x + w - w_text),
	        # right-top
	        (y, x + w),
	        # right-bottom
	        (y + h - h_text, x + w),
	        # bottom-right
	        (y + h, x + w - w_text),
	        # bottom-left
	        (y + h, x),
	        # left-bottom
	        (y + h - h_text, x - w_text),
	        # left-top
	        (y, x - w_text),
	    ]

	    # Build the comparison list once instead of once per candidate. list()
	    # also keeps this a concatenation when `bboxes` is a NumPy array, where
	    # `array + list` would be an element-wise (broadcast) addition.
	    other_boxes = list(bboxes) + list(drawn_boxes)

	    best_corner = corners[0]
	    best_score = float('inf')

	    for corner in corners:
	        corner_y, corner_x = corner
	        # Skip candidates that would not fit entirely inside the image.
	        if (corner_y < 0 or corner_x < 0
	                or corner_y + h_text > image_height
	                or corner_x + w_text > image_width):
	            continue

	        corner_bbox = (corner_y, corner_x, h_text, w_text)
	        # Worst case for this candidate; starts below any reachable score.
	        worst_score = -(image_width + image_height)
	        for other_bbox in other_boxes:
	            if np.array_equal(bbox, other_bbox):
	                continue
	            overlap_area = __calculate_iou(corner_bbox, other_bbox, return_area=True)[1]
	            distance = __calculate_nearest_corner_distance(corner_bbox, other_bbox)
	            worst_score = max(worst_score, overlap_area - 0.0001 * distance)

	        # Keep the candidate whose worst case is the least bad.
	        if worst_score < best_score:
	            best_score = worst_score
	            best_corner = corner

	    return best_corner

	def plot_boxes_with_marks(
	    image: Image.Image,
	    bboxes,  # (y, x, h, w)
	    mark_helper: MarkHelper,
	    linewidth=2,
	    alpha=0,
	    edgecolor=None,
	    fn_save=None,
	    normalized_to_pixel=True,
	    add_mark=True
	) -> Image.Image:
	    """Draw bounding boxes on ``image`` and label each with its index.

	    The image is modified in place. Each label is the box's 0-based index,
	    drawn in white on a red rectangle at the position chosen by
	    ``_find_least_overlapping_corner``.

	    Args:
	        image: The image to plot the bounding boxes on.
	        bboxes: Boxes as (y_top_left, x_top_left, box_height, box_width)
	            rows; a list or a 2D array. If normalized_to_pixel is True the
	            values are fractions of the image size and are scaled to pixels;
	            otherwise they are used as pixel values.
	        mark_helper: Supplies the label font and label sizes for this image size.
	        linewidth: Outline width of the boxes.
	        alpha: Currently unused.
	        edgecolor: Outline colour of the boxes.
	        fn_save: If not None, the annotated image is saved to this path.
	        normalized_to_pixel: Whether ``bboxes`` must be scaled to pixels.
	        add_mark: Whether to draw the index labels.

	    Returns:
	        The same ``image`` object, annotated.
	    """
	    draw = ImageDraw.Draw(image)
	    image_width, image_height = image.size

	    if normalized_to_pixel:
	        bboxes = [
	            (int(y * image_height), int(x * image_width),
	             int(h * image_height), int(w * image_width))
	            for y, x, h, w in bboxes
	        ]
	    else:
	        # Materialize as a list so the placement helper can concatenate it
	        # with the list of drawn labels even when an array was passed in.
	        bboxes = list(bboxes)

	    for y, x, h, w in bboxes:
	        draw.rectangle([x, y, x + w, y + h], outline=edgecolor, width=linewidth)

	    if add_mark:
	        # The font depends only on the image size, so fetch it once.
	        font = mark_helper.get_font(image_height, image_width)

	        # Labels already placed; later labels try to avoid them too.
	        drawn_boxes = []
	        for idx, bbox in enumerate(bboxes):
	            text = str(idx)
	            text_h, text_w = mark_helper.get_mark_size(text, image_height, image_width)
	            corner_y, corner_x = _find_least_overlapping_corner(
	                bbox, bboxes, drawn_boxes, (text_h, text_w), (image_height, image_width))

	            # Filled background first, then the index on top of it.
	            draw.rectangle(
	                [corner_x, corner_y, corner_x + text_w, corner_y + text_h],  # (x, y, x + w, y + h)
	                fill="red")
	            draw.text((corner_x, corner_y), text, fill='white', font=font)

	            # Record the label box as (y, x, h, w).
	            drawn_boxes.append(np.array((corner_y, corner_x, text_h, text_w)))

	    if fn_save is not None:  # PIL image
	        image.save(fn_save)
	    return image

	def plot_circles_with_marks(
	    image: Image.Image,
	    points,  # (x, y)
	    mark_helper: MarkHelper,
	    linewidth=2,
	    edgecolor=None,
	    fn_save=None,
	    normalized_to_pixel=True,
	    add_mark=True
	) -> Image.Image:
	    """Draw a radius-5 circle on ``image`` at each of the given points.

	    The image is modified in place. No index labels are drawn.

	    Args:
	        image: The image to draw on.
	        points: Iterable of (x, y) points. If normalized_to_pixel is True the
	            values are fractions of the image width/height and are scaled to
	            pixels; otherwise they are used as pixel coordinates.
	        mark_helper: Currently unused; kept for signature parity with
	            plot_boxes_with_marks.
	        linewidth: Outline width of the circles.
	        edgecolor: Outline colour of the circles.
	        fn_save: If not None, the annotated image is saved to this path.
	        normalized_to_pixel: Whether ``points`` must be scaled to pixels.
	        add_mark: Currently unused.

	    Returns:
	        The same ``image`` object, annotated.
	    """
	    image_width, image_height = image.size

	    if normalized_to_pixel:
	        # Scale the points themselves; the previous code read an undefined
	        # local `bboxes` here and raised before anything was drawn.
	        points = [(int(x * image_width), int(y * image_height)) for x, y in points]

	    draw = ImageDraw.Draw(image)
	    for x, y in points:
	        draw.circle((x, y), radius=5, outline=edgecolor, width=linewidth)

	    if fn_save is not None:  # PIL image
	        image.save(fn_save)
	    return image

	# Module-level MarkHelper instance; it caches the font and label sizes per image size.
	markhelper = MarkHelper()

	# Not referenced in the visible part of this file — presumably an overlap-ratio
	# cutoff for dropping duplicate boxes; TODO confirm against the callers.
	BBOX_DEDUPLICATION_IOU_PROPORTION = 0.5
	# Default vertical / horizontal thresholds of merge_connected_bboxes,
	# in the same units as the box coordinates.
	BBOX_GROUPING_VERTICAL_THRESHOLD = 20
	BBOX_GROUPING_HORIZONTAL_THRESHOLD = 20
	# Not referenced in the visible part of this file; meaning unclear from here —
	# TODO document at the use site.
	BBOX_AUG_TARGET = 2.0

	def _is_boxes_same_line_or_near(bbox1, bbox2, vertical_threshold, horizontal_threshold):
	    """Tell whether two (y, x, h, w) boxes overlap or lie within the thresholds of each other.

	    Along each axis the box that starts later must start no further than
	    "extent of the earlier box + threshold" after the earlier box starts,
	    i.e. the gap between them is at most the threshold (touching and
	    overlapping boxes qualify). Both axes must satisfy this.
	    """
	    top_a, left_a, height_a, width_a = bbox1
	    top_b, left_b, height_b, width_b = bbox2

	    def starts_within_reach(start_a, extent_a, start_b, extent_b, slack):
	        # One clause per possible ordering of the two starts.
	        a_leads = start_a <= start_b and start_b - start_a <= extent_a + slack
	        b_leads = start_b <= start_a and start_a - start_b <= extent_b + slack
	        return a_leads or b_leads

	    near_in_x = starts_within_reach(left_a, width_a, left_b, width_b, horizontal_threshold)
	    near_in_y = starts_within_reach(top_a, height_a, top_b, height_b, vertical_threshold)

	    return near_in_y and near_in_x

	def _build_adjacency_matrix(bboxes, vertical_threshold, horizontal_threshold):
	    """Return a symmetric 0/1 integer matrix marking which pairs of boxes should be merged.

	    Entry [i, j] is 1 when boxes i and j pass _is_boxes_same_line_or_near with
	    the given thresholds; the diagonal stays 0.
	    """
	    count = len(bboxes)
	    adjacency = np.zeros((count, count), dtype=int)

	    # Test each unordered pair once and mirror the result.
	    for row, first in enumerate(bboxes):
	        for col in range(row + 1, count):
	            if _is_boxes_same_line_or_near(first, bboxes[col], vertical_threshold, horizontal_threshold):
	                adjacency[row, col] = adjacency[col, row] = 1

	    return adjacency

	def merge_connected_bboxes(bboxes, text_details,
	                           vertical_threshold=BBOX_GROUPING_VERTICAL_THRESHOLD,
	                           horizontal_threshold=BBOX_GROUPING_HORIZONTAL_THRESHOLD
	                           ):
	    """Merge boxes that are near each other, directly or through a chain of neighbours.

	    Boxes are linked when _build_adjacency_matrix marks them as adjacent; every
	    connected group is replaced by its enclosing box, and the texts of the
	    group are joined with single spaces in ascending index order.

	    Args:
	        bboxes: Boxes as (x1, y1, x2, y2) rows.
	        text_details: One text string per box, in the same order as ``bboxes``.
	        vertical_threshold: The maximum vertical distance between two boxes to be linked.
	        horizontal_threshold: The maximum horizontal distance between two boxes to be linked.

	    Returns:
	        ``(merged_bboxes, merged_text_details)``: a list of (x1, y1, x2, y2)
	        tuples and the list of joined texts. With zero or one input box the
	        inputs are returned unchanged.
	    """
	    # Nothing to merge.
	    if len(bboxes) <= 1:
	        return bboxes, text_details

	    # Split the (x1, y1, x2, y2) rows into position and extent columns.
	    xyxy = np.array(bboxes)
	    tops, lefts = xyxy[:, 1], xyxy[:, 0]
	    heights = xyxy[:, 3] - xyxy[:, 1]
	    widths = xyxy[:, 2] - xyxy[:, 0]
	    yxhw = np.array([tops, lefts, heights, widths]).T

	    bottoms = tops + heights
	    rights = lefts + widths

	    # Groups of boxes to merge are the connected components of the adjacency graph.
	    graph = nx.from_numpy_array(
	        _build_adjacency_matrix(yxhw, vertical_threshold, horizontal_threshold))

	    merged_bboxes = []
	    merged_text_details = []
	    for component in nx.connected_components(graph):
	        members = sorted(component)  # e.g., [30, 31, 32, 33, 34]

	        merged_text_details.append(' '.join(text_details[i] for i in members))

	        # Enclosing box of the group.
	        top = min(tops[i] for i in members)
	        left = min(lefts[i] for i in members)
	        bottom = max(bottoms[i] for i in members)
	        right = max(rights[i] for i in members)

	        # Go through height/width and back so the arithmetic on the
	        # coordinates is exactly the same as a (y, x, h, w) round trip.
	        merged_h = bottom - top
	        merged_w = right - left
	        merged_bboxes.append((left, top, left + merged_w, top + merged_h))

	    return merged_bboxes, merged_text_details