# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
"""
Ultralytics Results, Boxes and Masks classes for handling inference results.

Usage: See https://docs.ultralytics.com/modes/predict/
"""
from copy import deepcopy | |
from functools import lru_cache | |
from pathlib import Path | |
import numpy as np | |
import torch | |
from ultralytics.data.augment import LetterBox | |
from ultralytics.utils import LOGGER, SimpleClass, ops | |
from ultralytics.utils.checks import check_requirements | |
from ultralytics.utils.plotting import Annotator, colors, save_one_box | |
from ultralytics.utils.torch_utils import smart_inference_mode | |
class BaseTensor(SimpleClass):
    """
    Base tensor class with additional methods for easy manipulation and device handling.

    Wraps a `torch.Tensor` or `np.ndarray` together with the original image shape. Every conversion or indexing
    call returns a wrapper built with `self.__class__`, so subclasses get instances of their own type back.

    Attributes:
        data (torch.Tensor | np.ndarray): Prediction data such as bounding boxes, masks, or keypoints.
        orig_shape (Tuple[int, int]): Original shape of the image, typically in the format (height, width).

    Methods:
        cpu: Return a wrapper whose data is stored in CPU memory.
        numpy: Return a wrapper whose data is a numpy array.
        cuda: Return a wrapper whose data is stored in GPU memory.
        to: Return a wrapper with the data moved to the specified device and/or dtype.

    Examples:
        >>> import torch
        >>> data = torch.tensor([[1, 2, 3], [4, 5, 6]])
        >>> orig_shape = (720, 1280)
        >>> base_tensor = BaseTensor(data, orig_shape)
        >>> cpu_tensor = base_tensor.cpu()
        >>> numpy_tensor = base_tensor.numpy()
        >>> gpu_tensor = base_tensor.cuda()
    """

    def __init__(self, data, orig_shape) -> None:
        """
        Initialize BaseTensor with prediction data and the original shape of the image.

        Args:
            data (torch.Tensor | np.ndarray): Prediction data such as bounding boxes, masks, or keypoints.
            orig_shape (Tuple[int, int]): Original shape of the image in (height, width) format.

        Raises:
            AssertionError: If `data` is neither a torch.Tensor nor a np.ndarray.

        Examples:
            >>> import torch
            >>> data = torch.tensor([[1, 2, 3], [4, 5, 6]])
            >>> orig_shape = (720, 1280)
            >>> base_tensor = BaseTensor(data, orig_shape)
        """
        assert isinstance(data, (torch.Tensor, np.ndarray)), "data must be torch.Tensor or np.ndarray"
        self.data = data
        self.orig_shape = orig_shape

    @property
    def shape(self):
        """
        Return the shape of the underlying data.

        Exposed as a property so it can be read and subscripted like `tensor.shape[1:]`.

        Returns:
            (Tuple[int, ...]): The shape of the data tensor or array.

        Examples:
            >>> data = torch.rand(100, 4)
            >>> base_tensor = BaseTensor(data, orig_shape=(720, 1280))
            >>> print(tuple(base_tensor.shape))
            (100, 4)
        """
        return self.data.shape

    def cpu(self):
        """
        Return a wrapper whose data is stored in CPU memory.

        Returns:
            (BaseTensor): `self` when the data is already a numpy array, otherwise a new instance wrapping
                `self.data.cpu()`.

        Examples:
            >>> data = torch.tensor([[1, 2, 3], [4, 5, 6]]).cuda()
            >>> base_tensor = BaseTensor(data, orig_shape=(720, 1280))
            >>> cpu_tensor = base_tensor.cpu()
            >>> isinstance(cpu_tensor, BaseTensor)
            True
            >>> cpu_tensor.data.device
            device(type='cpu')
        """
        return self if isinstance(self.data, np.ndarray) else self.__class__(self.data.cpu(), self.orig_shape)

    def numpy(self):
        """
        Return a wrapper whose data is a numpy array.

        `torch.Tensor.numpy()` is called directly on the stored tensor, so GPU data should be moved with
        `cpu()` first.

        Returns:
            (BaseTensor): `self` when the data is already a numpy array, otherwise a new instance wrapping
                `self.data.numpy()`.

        Examples:
            >>> data = torch.tensor([[1, 2, 3], [4, 5, 6]])
            >>> orig_shape = (720, 1280)
            >>> base_tensor = BaseTensor(data, orig_shape)
            >>> numpy_tensor = base_tensor.numpy()
            >>> print(type(numpy_tensor.data))
            <class 'numpy.ndarray'>
        """
        return self if isinstance(self.data, np.ndarray) else self.__class__(self.data.numpy(), self.orig_shape)

    def cuda(self):
        """
        Return a wrapper whose data is stored in GPU memory.

        Numpy data is first converted with `torch.as_tensor` and then moved, so the result always wraps a
        torch.Tensor.

        Returns:
            (BaseTensor): A new instance with the data moved to GPU memory.

        Examples:
            >>> import torch
            >>> from ultralytics.engine.results import BaseTensor
            >>> data = torch.tensor([[1, 2, 3], [4, 5, 6]])
            >>> base_tensor = BaseTensor(data, orig_shape=(720, 1280))
            >>> gpu_tensor = base_tensor.cuda()
            >>> print(gpu_tensor.data.device)
            cuda:0
        """
        return self.__class__(torch.as_tensor(self.data).cuda(), self.orig_shape)

    def to(self, *args, **kwargs):
        """
        Return a wrapper with the data moved to the specified device and/or dtype.

        Numpy data is first converted with `torch.as_tensor`.

        Args:
            *args (Any): Variable length argument list to be passed to torch.Tensor.to().
            **kwargs (Any): Arbitrary keyword arguments to be passed to torch.Tensor.to().

        Returns:
            (BaseTensor): A new instance wrapping the result of `torch.Tensor.to(*args, **kwargs)`.

        Examples:
            >>> base_tensor = BaseTensor(torch.randn(3, 4), orig_shape=(480, 640))
            >>> cuda_tensor = base_tensor.to("cuda")
            >>> float16_tensor = base_tensor.to(dtype=torch.float16)
        """
        return self.__class__(torch.as_tensor(self.data).to(*args, **kwargs), self.orig_shape)

    def __len__(self):  # override len(results)
        """
        Return the length of the underlying data.

        Returns:
            (int): The number of elements in the first dimension of the data.

        Examples:
            >>> data = torch.tensor([[1, 2, 3], [4, 5, 6]])
            >>> base_tensor = BaseTensor(data, orig_shape=(720, 1280))
            >>> len(base_tensor)
            2
        """
        return len(self.data)

    def __getitem__(self, idx):
        """
        Return a new instance containing the indexed elements of the data.

        Args:
            idx (int | List[int] | torch.Tensor): Index or indices to select from the data.

        Returns:
            (BaseTensor): A new instance wrapping `self.data[idx]` with the same `orig_shape`.

        Examples:
            >>> data = torch.tensor([[1, 2, 3], [4, 5, 6]])
            >>> base_tensor = BaseTensor(data, orig_shape=(720, 1280))
            >>> result = base_tensor[0]  # Select the first row
            >>> print(result.data)
            tensor([1, 2, 3])
        """
        return self.__class__(self.data[idx], self.orig_shape)
class Results(SimpleClass): | |
""" | |
A class for storing and manipulating inference results. | |
This class encapsulates the functionality for handling detection, segmentation, pose estimation, | |
and classification results from YOLO models. | |
Attributes: | |
orig_img (numpy.ndarray): Original image as a numpy array. | |
orig_shape (Tuple[int, int]): Original image shape in (height, width) format. | |
boxes (Boxes | None): Object containing detection bounding boxes. | |
masks (Masks | None): Object containing detection masks. | |
probs (Probs | None): Object containing class probabilities for classification tasks. | |
keypoints (Keypoints | None): Object containing detected keypoints for each object. | |
obb (OBB | None): Object containing oriented bounding boxes. | |
speed (Dict[str, float | None]): Dictionary of preprocess, inference, and postprocess speeds. | |
names (Dict[int, str]): Dictionary mapping class IDs to class names. | |
path (str): Path to the image file. | |
_keys (Tuple[str, ...]): Tuple of attribute names for internal use. | |
Methods: | |
update: Updates object attributes with new detection results. | |
cpu: Returns a copy of the Results object with all tensors on CPU memory. | |
numpy: Returns a copy of the Results object with all tensors as numpy arrays. | |
cuda: Returns a copy of the Results object with all tensors on GPU memory. | |
to: Returns a copy of the Results object with tensors on a specified device and dtype. | |
new: Returns a new Results object with the same image, path, and names. | |
plot: Plots detection results on an input image, returning an annotated image. | |
show: Shows annotated results on screen. | |
save: Saves annotated results to file. | |
verbose: Returns a log string for each task, detailing detections and classifications. | |
save_txt: Saves detection results to a text file. | |
save_crop: Saves cropped detection images. | |
tojson: Converts detection results to JSON format. | |
Examples: | |
>>> results = model("path/to/image.jpg") | |
>>> for result in results: | |
... print(result.boxes) # Print detection boxes | |
... result.show() # Display the annotated image | |
... result.save(filename="result.jpg") # Save annotated image | |
""" | |
def __init__(
    self, orig_img, path, names, boxes=None, masks=None, probs=None, keypoints=None, obb=None, speed=None
) -> None:
    """
    Initialize the Results class for storing and manipulating inference results.

    Each raw prediction that is not None is wrapped in its container class; `orig_shape` is taken from the
    first two entries of `orig_img.shape`.

    Args:
        orig_img (numpy.ndarray): The original image as a numpy array.
        path (str): The path to the image file.
        names (Dict): A dictionary of class names.
        boxes (torch.Tensor | None): A 2D tensor of bounding box coordinates for each detection.
        masks (torch.Tensor | None): A 3D tensor of detection masks, where each mask is a binary image.
        probs (torch.Tensor | None): A 1D tensor of probabilities of each class for classification task.
        keypoints (torch.Tensor | None): A 2D tensor of keypoint coordinates for each detection.
        obb (torch.Tensor | None): A 2D tensor of oriented bounding box coordinates for each detection.
        speed (Dict | None): A dictionary containing preprocess, inference, and postprocess speeds (ms/image).
            When None, a dictionary with those three keys set to None is used.

    Examples:
        >>> results = model("path/to/image.jpg")
        >>> result = results[0]  # Get the first result
        >>> boxes = result.boxes  # Get the boxes for the first result
        >>> masks = result.masks  # Get the masks for the first result

    Notes:
        For the default pose model, keypoint indices for human body pose estimation are:
        0: Nose, 1: Left Eye, 2: Right Eye, 3: Left Ear, 4: Right Ear
        5: Left Shoulder, 6: Right Shoulder, 7: Left Elbow, 8: Right Elbow
        9: Left Wrist, 10: Right Wrist, 11: Left Hip, 12: Right Hip
        13: Left Knee, 14: Right Knee, 15: Left Ankle, 16: Right Ankle
    """
    self.orig_img = orig_img
    self.orig_shape = orig_img.shape[:2]
    shape = self.orig_shape

    # Wrap only the predictions that were supplied; absent ones stay None.
    self.boxes = None if boxes is None else Boxes(boxes, shape)  # native size boxes
    self.masks = None if masks is None else Masks(masks, shape)  # native size or imgsz masks
    self.probs = None if probs is None else Probs(probs)
    self.keypoints = None if keypoints is None else Keypoints(keypoints, shape)
    self.obb = None if obb is None else OBB(obb, shape)

    if speed is None:
        speed = {"preprocess": None, "inference": None, "postprocess": None}
    self.speed = speed

    self.names = names
    self.path = path
    self.save_dir = None
    # Attribute names that __len__ and _apply iterate over, in this order.
    self._keys = ("boxes", "masks", "probs", "keypoints", "obb")
def __getitem__(self, idx): | |
""" | |
Return a Results object for a specific index of inference results. | |
Args: | |
idx (int | slice): Index or slice to retrieve from the Results object. | |
Returns: | |
(Results): A new Results object containing the specified subset of inference results. | |
Examples: | |
>>> results = model("path/to/image.jpg") # Perform inference | |
>>> single_result = results[0] # Get the first result | |
>>> subset_results = results[1:4] # Get a slice of results | |
""" | |
return self._apply("__getitem__", idx) | |
def __len__(self): | |
""" | |
Return the number of detections in the Results object. | |
Returns: | |
(int): The number of detections, determined by the length of the first non-empty attribute | |
(boxes, masks, probs, keypoints, or obb). | |
Examples: | |
>>> results = Results(orig_img, path, names, boxes=torch.rand(5, 4)) | |
>>> len(results) | |
5 | |
""" | |
for k in self._keys: | |
v = getattr(self, k) | |
if v is not None: | |
return len(v) | |
def update(self, boxes=None, masks=None, probs=None, obb=None, keypoints=None):
    """
    Update the Results object with new detection data.

    Every argument that is not None replaces the corresponding attribute with a freshly wrapped container;
    arguments left as None keep the current attribute. Boxes are passed through `ops.clip_boxes` with the
    original image shape before being wrapped.

    Args:
        boxes (torch.Tensor | None): A tensor of shape (N, 6) containing bounding box coordinates and
            confidence scores. The format is (x1, y1, x2, y2, conf, class).
        masks (torch.Tensor | None): A tensor of shape (N, H, W) containing segmentation masks.
        probs (torch.Tensor | Probs | None): A tensor of shape (num_classes,) containing class probabilities,
            or an already constructed Probs object, which is stored as given.
        obb (torch.Tensor | None): A tensor of shape (N, 5) containing oriented bounding box coordinates.
        keypoints (torch.Tensor | None): Keypoint data for each detection, wrapped exactly as in `__init__`.

    Examples:
        >>> results = model("image.jpg")
        >>> new_boxes = torch.tensor([[100, 100, 200, 200, 0.9, 0]])
        >>> results[0].update(boxes=new_boxes)
    """
    if boxes is not None:
        self.boxes = Boxes(ops.clip_boxes(boxes, self.orig_shape), self.orig_shape)
    if masks is not None:
        self.masks = Masks(masks, self.orig_shape)
    if probs is not None:
        # Keep self.probs a Probs instance, as __init__ does: plot/verbose/save_txt/summary read .top5/.data.
        self.probs = probs if isinstance(probs, Probs) else Probs(probs)
    if obb is not None:
        self.obb = OBB(obb, self.orig_shape)
    if keypoints is not None:
        self.keypoints = Keypoints(keypoints, self.orig_shape)
def _apply(self, fn, *args, **kwargs): | |
""" | |
Applies a function to all non-empty attributes and returns a new Results object with modified attributes. | |
This method is internally called by methods like .to(), .cuda(), .cpu(), etc. | |
Args: | |
fn (str): The name of the function to apply. | |
*args (Any): Variable length argument list to pass to the function. | |
**kwargs (Any): Arbitrary keyword arguments to pass to the function. | |
Returns: | |
(Results): A new Results object with attributes modified by the applied function. | |
Examples: | |
>>> results = model("path/to/image.jpg") | |
>>> for result in results: | |
... result_cuda = result.cuda() | |
... result_cpu = result.cpu() | |
""" | |
r = self.new() | |
for k in self._keys: | |
v = getattr(self, k) | |
if v is not None: | |
setattr(r, k, getattr(v, fn)(*args, **kwargs)) | |
return r | |
def cpu(self):
    """
    Return a copy of the Results object with all its prediction containers moved to CPU memory.

    The `cpu` method of every container that is not None (boxes, masks, probs, keypoints, obb) is called
    through `_apply`.

    Returns:
        (Results): A new Results object with all tensor attributes on CPU memory.

    Examples:
        >>> results = model("path/to/image.jpg")  # Perform inference
        >>> cpu_result = results[0].cpu()  # Move the first result to CPU
        >>> print(cpu_result.boxes.data.device)  # Output: cpu
    """
    on_cpu = self._apply("cpu")
    return on_cpu
def numpy(self):
    """
    Return a copy of the Results object with all its prediction containers converted to numpy arrays.

    The `numpy` method of every container that is not None is called through `_apply`.

    Returns:
        (Results): A new Results object with all tensors converted to numpy arrays.

    Examples:
        >>> results = model("path/to/image.jpg")
        >>> numpy_result = results[0].numpy()
        >>> type(numpy_result.boxes.data)
        <class 'numpy.ndarray'>

    Notes:
        This method creates a new Results object, leaving the original unchanged. It's useful for
        interoperability with numpy-based libraries or when CPU-based operations are required.
    """
    as_numpy = self._apply("numpy")
    return as_numpy
def cuda(self):
    """
    Return a copy of the Results object with all its prediction containers moved to GPU memory.

    The `cuda` method of every container that is not None is called through `_apply`.

    Returns:
        (Results): A new Results object with all tensors moved to CUDA device.

    Examples:
        >>> results = model("path/to/image.jpg")
        >>> cuda_results = results[0].cuda()  # Move first result to GPU
        >>> for result in results:
        ...     result_cuda = result.cuda()  # Move each result to GPU
    """
    on_gpu = self._apply("cuda")
    return on_gpu
def to(self, *args, **kwargs):
    """
    Return a copy of the Results object with all its prediction containers moved to a device and/or dtype.

    All arguments are forwarded unchanged, through `_apply`, to the `to` method of every container that is
    not None.

    Args:
        *args (Any): Variable length argument list to be passed to torch.Tensor.to().
        **kwargs (Any): Arbitrary keyword arguments to be passed to torch.Tensor.to().

    Returns:
        (Results): A new Results object with all tensors moved to the specified device and dtype.

    Examples:
        >>> results = model("path/to/image.jpg")
        >>> result_cuda = results[0].to("cuda")  # Move first result to GPU
        >>> result_cpu = results[0].to("cpu")  # Move first result to CPU
        >>> result_half = results[0].to(dtype=torch.float16)  # Convert first result to half precision
    """
    moved = self._apply("to", *args, **kwargs)
    return moved
def new(self): | |
""" | |
Creates a new Results object with the same image, path, names, and speed attributes. | |
Returns: | |
(Results): A new Results object with copied attributes from the original instance. | |
Examples: | |
>>> results = model("path/to/image.jpg") | |
>>> new_result = results[0].new() | |
""" | |
return Results(orig_img=self.orig_img, path=self.path, names=self.names, speed=self.speed) | |
def plot(
    self,
    conf=True,
    line_width=None,
    font_size=None,
    font="Arial.ttf",
    pil=False,
    img=None,
    im_gpu=None,
    kpt_radius=5,
    kpt_line=True,
    labels=True,
    boxes=True,
    masks=True,
    probs=True,
    show=False,
    save=False,
    filename=None,
    color_mode="class",
):
    """
    Plot detection results on an input image.

    Masks, boxes (or oriented boxes), classification probabilities and keypoints are drawn in that order on a
    deep copy of the image, so neither `orig_img` nor `img` is modified.

    Args:
        conf (bool): Whether to plot detection confidence scores.
        line_width (float | None): Line width of bounding boxes. If None, scaled to image size.
        font_size (float | None): Font size for text. If None, scaled to image size.
        font (str): Font to use for text.
        pil (bool): PIL flag forwarded to the Annotator. It is forced on when classification probabilities
            are drawn.
        img (np.ndarray | None): Image to plot on. If None, uses original image.
        im_gpu (torch.Tensor | None): Normalized image on GPU for faster mask plotting. If None, it is built
            from the annotated image.
        kpt_radius (int): Radius of drawn keypoints.
        kpt_line (bool): Whether to draw lines connecting keypoints.
        labels (bool): Whether to plot labels of bounding boxes.
        boxes (bool): Whether to plot bounding boxes.
        masks (bool): Whether to plot masks.
        probs (bool): Whether to plot classification probabilities.
        show (bool): Whether to display the annotated image.
        save (bool): Whether to save the annotated image.
        filename (str | None): Filename to save image if save is True.
        color_mode (str): Specify the color mode, either 'instance' or 'class'. Default to 'class'.

    Returns:
        (np.ndarray): Annotated image as returned by `Annotator.result()`.

    Raises:
        AssertionError: If `color_mode` is neither 'instance' nor 'class'.

    Examples:
        >>> results = model("image.jpg")
        >>> for result in results:
        ...     im = result.plot()  # annotated image array
        ...     result.plot(show=True)  # draw and display
    """
    assert color_mode in {"instance", "class"}, f"Expected color_mode='instance' or 'class', not {color_mode}."
    if img is None and isinstance(self.orig_img, torch.Tensor):
        # Tensor input: take the first item, move channels last and rescale to uint8 for drawing.
        img = (self.orig_img[0].detach().permute(1, 2, 0).contiguous() * 255).to(torch.uint8).cpu().numpy()

    names = self.names
    is_obb = self.obb is not None
    pred_boxes, show_boxes = self.obb if is_obb else self.boxes, boxes
    pred_masks, show_masks = self.masks, masks
    pred_probs, show_probs = self.probs, probs
    annotator = Annotator(
        deepcopy(self.orig_img if img is None else img),
        line_width,
        font_size,
        font,
        pil or (pred_probs is not None and show_probs),  # Classify tasks default to pil=True
        example=names,
    )

    # Plot Segment results
    if pred_masks and show_masks:
        if im_gpu is None:
            img = LetterBox(pred_masks.shape[1:])(image=annotator.result())
            im_gpu = (
                torch.as_tensor(img, dtype=torch.float16, device=pred_masks.data.device)
                .permute(2, 0, 1)
                .flip(0)
                .contiguous()
                / 255
            )
        # Pick one colour index per mask. pred_boxes may be None for mask-only results, so it is checked
        # before .id is read; otherwise the positional fallback below could never be reached.
        if pred_boxes is not None and pred_boxes.id is not None and color_mode == "instance":
            idx = pred_boxes.id
        elif pred_boxes and color_mode == "class":
            idx = pred_boxes.cls
        else:
            idx = reversed(range(len(pred_masks)))
        annotator.masks(pred_masks.data, colors=[colors(x, True) for x in idx], im_gpu=im_gpu)

    # Plot Detect results
    if pred_boxes is not None and show_boxes:
        for i, d in enumerate(reversed(pred_boxes)):
            c = int(d.cls)
            d_conf = float(d.conf) if conf else None
            track_id = None if d.id is None else int(d.id.item())
            name = ("" if track_id is None else f"id:{track_id} ") + names[c]
            label = (f"{name} {d_conf:.2f}" if conf else name) if labels else None
            box = d.xyxyxyxy.reshape(-1, 4, 2).squeeze() if is_obb else d.xyxy.squeeze()
            # Colour by class, else by track id when present, else by position in the reversed iteration.
            color_key = (
                c
                if color_mode == "class"
                else track_id
                if track_id is not None
                else i
                if color_mode == "instance"
                else None
            )
            annotator.box_label(box, label, color=colors(color_key, True), rotated=is_obb)

    # Plot Classify results
    if pred_probs is not None and show_probs:
        text = ",\n".join(f"{names[j] if names else j} {pred_probs.data[j]:.2f}" for j in pred_probs.top5)
        x = round(self.orig_shape[0] * 0.03)
        annotator.text([x, x], text, txt_color=(255, 255, 255))  # TODO: allow setting colors

    # Plot Pose results
    if self.keypoints is not None:
        for i, k in enumerate(reversed(self.keypoints.data)):
            annotator.kpts(
                k,
                self.orig_shape,
                radius=kpt_radius,
                kpt_line=kpt_line,
                kpt_color=colors(i, True) if color_mode == "instance" else None,
            )

    # Show results
    if show:
        annotator.show(self.path)

    # Save results
    if save:
        annotator.save(filename)

    return annotator.result()
def show(self, *args, **kwargs):
    """
    Display the image with annotated inference results.

    Delegates to `plot()` with `show=True`; the annotated image returned by `plot()` is discarded.

    Args:
        *args (Any): Variable length argument list to be passed to the `plot()` method.
        **kwargs (Any): Arbitrary keyword arguments to be passed to the `plot()` method.

    Examples:
        >>> results = model("path/to/image.jpg")
        >>> results[0].show()  # Display the first result
        >>> for result in results:
        ...     result.show()  # Display all results
    """
    self.plot(*args, show=True, **kwargs)
def save(self, filename=None, *args, **kwargs):
    """
    Save the annotated inference results image to file.

    Delegates to `plot()` with `save=True` and the resolved filename.

    Args:
        filename (str | Path | None): The filename to save the annotated image. If empty or None, the name
            "results_" followed by the file name of `self.path` is used.
        *args (Any): Variable length argument list to be passed to the `plot` method.
        **kwargs (Any): Arbitrary keyword arguments to be passed to the `plot` method.

    Returns:
        (str | Path): The filename that was handed to `plot()`.

    Examples:
        >>> results = model("path/to/image.jpg")
        >>> for result in results:
        ...     result.save("annotated_image.jpg")
        >>> # Or with custom plot arguments
        >>> for result in results:
        ...     result.save("annotated_image.jpg", conf=False, line_width=2)
    """
    target = filename or f"results_{Path(self.path).name}"
    self.plot(*args, save=True, filename=target, **kwargs)
    return target
def verbose(self):
    """
    Return a log string for each task in the results, detailing detection and classification outcomes.

    The string lists the top-5 classification probabilities when `probs` is set, followed by the number of
    detections per class taken from `obb` when present, otherwise from `boxes`.

    Returns:
        (str): A formatted string containing a summary of the results.

    Examples:
        >>> results = model("path/to/image.jpg")
        >>> for result in results:
        ...     print(result.verbose())
        2 persons, 1 car, 3 traffic lights,

    Notes:
        - If `len(self)` is 0, the method returns "(no detections), " for detection tasks and an empty
          string when `probs` is set.
        - Every entry ends with a comma and a space.
    """
    probs = self.probs
    if len(self) == 0:
        return "" if probs is not None else "(no detections), "

    log_string = ""
    if probs is not None:
        log_string += f"{', '.join(f'{self.names[j]} {probs.data[j]:.2f}' for j in probs.top5)}, "

    # Oriented boxes live in self.obb; use them when present, as plot/save_txt/summary do, so OBB results
    # are counted too.
    boxes = self.obb if self.obb is not None else self.boxes
    if boxes:
        for c in boxes.cls.unique():
            n = int((boxes.cls == c).sum())  # detections per class
            log_string += f"{n} {self.names[int(c)]}{'s' * (n > 1)}, "
    return log_string
def save_txt(self, txt_file, save_conf=False):
    """
    Save detection results to a text file.

    One line is written per classification entry or per detection. Nothing is written, and no file or
    directory is created, when there is nothing to save.

    Args:
        txt_file (str | Path): Path to the output text file.
        save_conf (bool): Whether to append the confidence score to each detection line.

    Returns:
        (None): The results are written to `txt_file`; nothing is returned.

    Examples:
        >>> from ultralytics import YOLO
        >>> model = YOLO("yolo11n.pt")
        >>> results = model("path/to/image.jpg")
        >>> for result in results:
        ...     result.save_txt("output.txt")

    Notes:
        - Classification lines have the form `confidence class_name` for each of the top-5 classes.
        - Detection lines start with the class index followed by the normalized box values (`xywhn`, or
          `xyxyxyxyn` for oriented boxes). When masks are present the normalized segment points replace the
          box values. Keypoint values, the confidence (if `save_conf`) and the track id (if any) follow.
        - The parent directory is created if it does not exist.
        - The file is opened in append mode with UTF-8 encoding, so existing contents are kept.
    """
    is_obb = self.obb is not None
    boxes = self.obb if is_obb else self.boxes
    masks = self.masks
    probs = self.probs
    kpts = self.keypoints
    texts = []
    if probs is not None:
        # Classify
        texts.extend(f"{probs.data[j]:.2f} {self.names[j]}" for j in probs.top5)
    elif boxes:
        # Detect/segment/pose
        for j, d in enumerate(boxes):
            c, conf = int(d.cls), float(d.conf)
            track_id = None if d.id is None else int(d.id.item())
            line = (c, *(d.xyxyxyxyn.view(-1) if is_obb else d.xywhn.view(-1)))
            if masks:
                seg = masks[j].xyn[0].copy().reshape(-1)  # reversed mask.xyn, (n,2) to (n*2)
                line = (c, *seg)
            if kpts is not None:
                kpt = torch.cat((kpts[j].xyn, kpts[j].conf[..., None]), 2) if kpts[j].has_visible else kpts[j].xyn
                line += (*kpt.reshape(-1).tolist(),)
            # (conf,) * save_conf yields an empty tuple when save_conf is False.
            line += (conf,) * save_conf + (() if track_id is None else (track_id,))
            texts.append(("%g " * len(line)).rstrip() % line)

    if texts:
        Path(txt_file).parent.mkdir(parents=True, exist_ok=True)  # make directory
        # Explicit encoding: class names may be non-ASCII and the platform default is locale dependent.
        with open(txt_file, "a", encoding="utf-8") as f:
            f.writelines(text + "\n" for text in texts)
def save_crop(self, save_dir, file_name=Path("im.jpg")):
    """
    Save cropped detection images to the specified directory.

    For every box, `save_one_box` is called with a fresh copy of the original image and a target path of
    `save_dir/<class name>/<file_name with a .jpg suffix>`.

    Args:
        save_dir (str | Path): Directory path where cropped images will be saved.
        file_name (str | Path): Base filename for the saved cropped images. Default is Path("im.jpg").

    Notes:
        - This method does not support Classify or Oriented Bounding Box (OBB) tasks; a warning is logged
          and nothing is saved for them.
        - Nothing is saved when the result has no boxes.
        - Original image is copied before cropping to avoid modifying the original.

    Examples:
        >>> results = model("path/to/image.jpg")
        >>> for result in results:
        ...     result.save_crop(save_dir="path/to/crops", file_name="detection")
    """
    if self.probs is not None:
        LOGGER.warning("WARNING ⚠️ Classify task do not support `save_crop`.")
        return
    if self.obb is not None:
        LOGGER.warning("WARNING ⚠️ OBB task do not support `save_crop`.")
        return
    if self.boxes is None:
        # Nothing to crop; iterating None would raise TypeError.
        return
    for d in self.boxes:
        save_one_box(
            d.xyxy,
            self.orig_img.copy(),
            file=Path(save_dir) / self.names[int(d.cls)] / Path(file_name).with_suffix(".jpg"),
            BGR=True,
        )
def summary(self, normalize=False, decimals=5):
    """
    Convert inference results to a summarized list of dictionaries.

    For classification results a single dictionary with the top-1 class is returned. Otherwise one
    dictionary is built per detection from `obb` when present, else from `boxes`, optionally extended with
    the track id, mask segments and keypoints.

    Args:
        normalize (bool): Whether to divide x values by the image width and y values by the image height
            (taken from `orig_shape`). Defaults to False.
        decimals (int): Number of decimal places to round the output values to. Defaults to 5.

    Returns:
        (List[Dict]): A list of dictionaries, each containing summarized information for a single
            detection or classification result. The list is empty when there are no detections or when
            neither boxes nor oriented boxes are set.

    Examples:
        >>> results = model("image.jpg")
        >>> summary = results[0].summary()
        >>> print(summary)
    """
    # Create list of detection dictionaries
    results = []
    if self.probs is not None:
        class_id = self.probs.top1
        results.append(
            {
                "name": self.names[class_id],
                "class": class_id,
                "confidence": round(self.probs.top1conf.item(), decimals),
            }
        )
        return results

    is_obb = self.obb is not None
    data = self.obb if is_obb else self.boxes
    if data is None:
        # No detection container at all: report an empty summary instead of failing on enumerate(None).
        return results

    h, w = self.orig_shape if normalize else (1, 1)  # dividing by 1 leaves pixel coordinates untouched
    for i, row in enumerate(data):  # xyxy, track_id if tracking, conf, class_id
        class_id, conf = int(row.cls), round(row.conf.item(), decimals)
        box = (row.xyxyxyxy if is_obb else row.xyxy).squeeze().reshape(-1, 2).tolist()
        xy = {}
        for j, b in enumerate(box):  # one (x, y) pair per corner: 2 for xyxy, 4 for xyxyxyxy
            xy[f"x{j + 1}"] = round(b[0] / w, decimals)
            xy[f"y{j + 1}"] = round(b[1] / h, decimals)
        result = {"name": self.names[class_id], "class": class_id, "confidence": conf, "box": xy}
        if data.is_track:
            result["track_id"] = int(row.id.item())  # track ID
        if self.masks:
            result["segments"] = {
                "x": (self.masks.xy[i][:, 0] / w).round(decimals).tolist(),
                "y": (self.masks.xy[i][:, 1] / h).round(decimals).tolist(),
            }
        if self.keypoints is not None:
            x, y, visible = self.keypoints[i].data[0].cpu().unbind(dim=1)  # torch Tensor
            result["keypoints"] = {
                "x": (x / w).numpy().round(decimals).tolist(),  # converted to numpy before rounding
                "y": (y / h).numpy().round(decimals).tolist(),
                "visible": visible.numpy().round(decimals).tolist(),
            }
        results.append(result)

    return results
def to_df(self, normalize=False, decimals=5):
    """
    Convert detection results to a Pandas Dataframe.

    The rows are the dictionaries produced by `summary()`, called with the same `normalize` and `decimals`
    values.

    Args:
        normalize (bool): Whether to normalize the bounding box coordinates by the image dimensions.
            If True, coordinates will be returned as float values between 0 and 1. Defaults to False.
        decimals (int): Number of decimal places to round the output values to. Defaults to 5.

    Returns:
        (DataFrame): A Pandas Dataframe containing all the information in results in an organized way.

    Examples:
        >>> results = model("path/to/image.jpg")
        >>> df_result = results[0].to_df()
        >>> print(df_result)
    """
    import pandas as pd  # scope for faster 'import ultralytics'

    records = self.summary(normalize=normalize, decimals=decimals)
    return pd.DataFrame(records)
def to_csv(self, normalize=False, decimals=5, *args, **kwargs):
    """
    Convert detection results to CSV format.

    The results are first turned into a DataFrame with `self.to_df()` and then serialized with
    `pandas.DataFrame.to_csv()`. Any extra positional or keyword arguments are forwarded unchanged to pandas.

    Args:
        normalize (bool): Whether to normalize the bounding box coordinates by the image dimensions.
            If True, coordinates will be returned as float values between 0 and 1. Defaults to False.
        decimals (int): Number of decimal places to round the output values to. Defaults to 5.
        *args (Any): Variable length argument list to be passed to pandas.DataFrame.to_csv().
        **kwargs (Any): Arbitrary keyword arguments to be passed to pandas.DataFrame.to_csv().

    Returns:
        (str): CSV text containing all the information in results. Whatever `pandas.DataFrame.to_csv()` returns
            is passed through, so supplying an output path/buffer via `*args`/`**kwargs` follows pandas' own
            return convention.

    Examples:
        >>> results = model("path/to/image.jpg")
        >>> csv_result = results[0].to_csv()
        >>> print(csv_result)
    """
    return self.to_df(normalize=normalize, decimals=decimals).to_csv(*args, **kwargs)
def to_xml(self, normalize=False, decimals=5, *args, **kwargs):
    """
    Convert detection results to XML format.

    The results are first turned into a DataFrame with `self.to_df()` and then serialized with
    `pandas.DataFrame.to_xml()`. When there are no results, a minimal document with an empty `<root>` element
    is returned instead of calling pandas.

    Args:
        normalize (bool): Whether to normalize the bounding box coordinates by the image dimensions.
            If True, coordinates will be returned as float values between 0 and 1. Defaults to False.
        decimals (int): Number of decimal places to round the output values to. Defaults to 5.
        *args (Any): Variable length argument list to be passed to pandas.DataFrame.to_xml().
        **kwargs (Any): Arbitrary keyword arguments to be passed to pandas.DataFrame.to_xml().

    Returns:
        (str): An XML string containing all the information in results in an organized way.

    Examples:
        >>> results = model("path/to/image.jpg")
        >>> xml_result = results[0].to_xml()
        >>> print(xml_result)
    """
    check_requirements("lxml")  # make sure the XML backend is installed before serializing
    df = self.to_df(normalize=normalize, decimals=decimals)
    # An empty frame is short-circuited to a fixed placeholder document; extra args are ignored in that case.
    return '<?xml version="1.0" encoding="utf-8"?>\n<root></root>' if df.empty else df.to_xml(*args, **kwargs)
def tojson(self, normalize=False, decimals=5):
    """
    Deprecated version of to_json().

    Logs a deprecation warning and then forwards to `to_json()` with the same arguments.

    Args:
        normalize (bool): Forwarded to `to_json()`. Defaults to False.
        decimals (int): Forwarded to `to_json()`. Defaults to 5.

    Returns:
        (str): The value returned by `to_json()`.
    """
    LOGGER.warning("WARNING ⚠️ 'result.tojson()' is deprecated, replace with 'result.to_json()'.")
    return self.to_json(normalize, decimals)
def to_json(self, normalize=False, decimals=5):
    """
    Convert detection results to JSON format.

    This method serializes the list returned by `self.summary()` with `json.dumps(..., indent=2)`.

    Args:
        normalize (bool): Whether to normalize the bounding box coordinates by the image dimensions.
            If True, coordinates will be returned as float values between 0 and 1. Defaults to False.
        decimals (int): Number of decimal places to round the output values to. Defaults to 5.

    Returns:
        (str): A JSON string containing the serialized detection results.

    Examples:
        >>> results = model("path/to/image.jpg")
        >>> json_result = results[0].to_json()
        >>> print(json_result)

    Notes:
        - For classification tasks, the JSON will contain class probabilities instead of bounding boxes.
        - For object detection tasks, the JSON will include bounding box coordinates, class names, and
          confidence scores.
        - If available, segmentation masks and keypoints will also be included in the JSON output.
        - The method uses the `summary` method internally to generate the data structure before
          converting it to JSON.
    """
    import json

    return json.dumps(self.summary(normalize=normalize, decimals=decimals), indent=2)
class Boxes(BaseTensor):
    """
    A class for managing and manipulating detection boxes.

    This class provides functionality for handling detection boxes, including their coordinates, confidence scores,
    class labels, and optional tracking IDs. It supports various box formats and offers methods for easy manipulation
    and conversion between different coordinate systems.

    Attributes:
        data (torch.Tensor | numpy.ndarray): The raw tensor containing detection boxes and associated data.
        orig_shape (Tuple[int, int]): The original image dimensions (height, width).
        is_track (bool): Indicates whether tracking IDs are included in the box data.
        xyxy (torch.Tensor | numpy.ndarray): Boxes in [x1, y1, x2, y2] format.
        conf (torch.Tensor | numpy.ndarray): Confidence scores for each box.
        cls (torch.Tensor | numpy.ndarray): Class labels for each box.
        id (torch.Tensor | numpy.ndarray): Tracking IDs for each box (if available).
        xywh (torch.Tensor | numpy.ndarray): Boxes in [x, y, width, height] format.
        xyxyn (torch.Tensor | numpy.ndarray): Normalized [x1, y1, x2, y2] boxes relative to orig_shape.
        xywhn (torch.Tensor | numpy.ndarray): Normalized [x, y, width, height] boxes relative to orig_shape.

    Methods:
        cpu(): Returns a copy of the object with all tensors on CPU memory.
        numpy(): Returns a copy of the object with all tensors as numpy arrays.
        cuda(): Returns a copy of the object with all tensors on GPU memory.
        to(*args, **kwargs): Returns a copy of the object with tensors on specified device and dtype.

    Examples:
        >>> import torch
        >>> boxes_data = torch.tensor([[100, 50, 150, 100, 0.9, 0], [200, 150, 300, 250, 0.8, 1]])
        >>> orig_shape = (480, 640)  # height, width
        >>> boxes = Boxes(boxes_data, orig_shape)
        >>> print(boxes.xyxy)
        >>> print(boxes.conf)
        >>> print(boxes.cls)
        >>> print(boxes.xywhn)
    """

    def __init__(self, boxes, orig_shape) -> None:
        """
        Initialize the Boxes class with detection box data and the original image shape.

        This class manages detection boxes, providing easy access and manipulation of box coordinates,
        confidence scores, class identifiers, and optional tracking IDs. It supports multiple formats
        for box coordinates, including both absolute and normalized forms.

        Args:
            boxes (torch.Tensor | np.ndarray): A tensor or numpy array with detection boxes of shape
                (num_boxes, 6) or (num_boxes, 7). Columns should contain
                [x1, y1, x2, y2, (optional) track_id, confidence, class]. A 1D input is treated as a single box.
            orig_shape (Tuple[int, int]): The original image shape as (height, width). Used for normalization.

        Attributes:
            data (torch.Tensor): The raw tensor containing detection boxes and their associated data.
            orig_shape (Tuple[int, int]): The original image size, used for normalization.
            is_track (bool): Indicates whether tracking IDs are included in the box data.

        Raises:
            AssertionError: If the number of values per box is not 6 or 7.

        Examples:
            >>> import torch
            >>> boxes = torch.tensor([[100, 50, 150, 100, 0.9, 0]])
            >>> orig_shape = (480, 640)
            >>> detection_boxes = Boxes(boxes, orig_shape)
            >>> print(detection_boxes.xyxy)
            tensor([[100.,  50., 150., 100.]])
        """
        if boxes.ndim == 1:
            boxes = boxes[None, :]  # promote a single box to a batch of one
        n = boxes.shape[-1]
        assert n in {6, 7}, f"expected 6 or 7 values but got {n}"  # xyxy, track_id, conf, cls
        super().__init__(boxes, orig_shape)
        self.is_track = n == 7  # a 7th column means a track ID sits before conf and cls
        self.orig_shape = orig_shape

    @property
    def xyxy(self):
        """
        Returns bounding boxes in [x1, y1, x2, y2] format.

        Returns:
            (torch.Tensor | numpy.ndarray): A tensor or numpy array of shape (n, 4) containing bounding box
                coordinates in [x1, y1, x2, y2] format, where n is the number of boxes.

        Examples:
            >>> results = model("image.jpg")
            >>> boxes = results[0].boxes
            >>> xyxy = boxes.xyxy
            >>> print(xyxy)
        """
        return self.data[:, :4]

    @property
    def conf(self):
        """
        Returns the confidence scores for each detection box.

        Returns:
            (torch.Tensor | numpy.ndarray): A 1D tensor or array containing confidence scores for each detection,
                with shape (N,) where N is the number of detections.

        Examples:
            >>> boxes = Boxes(torch.tensor([[10, 20, 30, 40, 0.9, 0]]), orig_shape=(100, 100))
            >>> conf_scores = boxes.conf
            >>> print(conf_scores)
            tensor([0.9000])
        """
        return self.data[:, -2]

    @property
    def cls(self):
        """
        Returns the class ID tensor representing category predictions for each bounding box.

        Returns:
            (torch.Tensor | numpy.ndarray): A tensor or numpy array containing the class IDs for each detection box.
                The shape is (N,), where N is the number of boxes.

        Examples:
            >>> results = model("image.jpg")
            >>> boxes = results[0].boxes
            >>> class_ids = boxes.cls
            >>> print(class_ids)  # tensor([0., 2., 1.])
        """
        return self.data[:, -1]

    @property
    def id(self):
        """
        Returns the tracking IDs for each detection box if available.

        Returns:
            (torch.Tensor | None): A tensor containing tracking IDs for each box if tracking is enabled,
                otherwise None. Shape is (N,) where N is the number of boxes.

        Examples:
            >>> results = model.track("path/to/video.mp4")
            >>> for result in results:
            ...     boxes = result.boxes
            ...     if boxes.is_track:
            ...         track_ids = boxes.id
            ...         print(f"Tracking IDs: {track_ids}")
            ...     else:
            ...         print("Tracking is not enabled for these boxes.")

        Notes:
            - Tracking IDs are only present when `is_track` is True; otherwise this property is None.
            - The tracking IDs are typically used to associate detections across multiple frames in video analysis.
        """
        return self.data[:, -3] if self.is_track else None

    @property
    @lru_cache(maxsize=2)  # maxsize 1 should suffice
    def xywh(self):
        """
        Convert bounding boxes from [x1, y1, x2, y2] format to [x, y, width, height] format.

        Returns:
            (torch.Tensor | numpy.ndarray): Boxes in [x_center, y_center, width, height] format, where x_center,
                y_center are the coordinates of the center point of the bounding box, width, height are the
                dimensions of the bounding box and the shape of the returned tensor is (N, 4), where N is the
                number of boxes.

        Examples:
            >>> boxes = Boxes(
            ...     torch.tensor([[100, 50, 150, 100, 0.9, 0], [200, 150, 300, 250, 0.8, 1]]), orig_shape=(480, 640)
            ... )
            >>> xywh = boxes.xywh
            >>> print(xywh)
            tensor([[125.,  75.,  50.,  50.],
                    [250., 200., 100., 100.]])
        """
        return ops.xyxy2xywh(self.xyxy)

    @property
    @lru_cache(maxsize=2)
    def xyxyn(self):
        """
        Returns normalized bounding box coordinates relative to the original image size.

        This property calculates and returns the bounding box coordinates in [x1, y1, x2, y2] format,
        normalized to the range [0, 1] based on the original image dimensions.

        Returns:
            (torch.Tensor | numpy.ndarray): Normalized bounding box coordinates with shape (N, 4), where N is
                the number of boxes. Each row contains [x1, y1, x2, y2] values normalized to [0, 1].

        Examples:
            >>> boxes = Boxes(torch.tensor([[100, 50, 300, 400, 0.9, 0]]), orig_shape=(480, 640))
            >>> normalized = boxes.xyxyn
            >>> print(normalized)
            tensor([[0.1562, 0.1042, 0.4688, 0.8333]])
        """
        boxes = self.xyxy
        # Work on a copy so the in-place division never writes into self.data.
        xyxy = boxes.clone() if isinstance(boxes, torch.Tensor) else np.copy(boxes)
        xyxy[..., [0, 2]] /= self.orig_shape[1]  # x coordinates scale by image width
        xyxy[..., [1, 3]] /= self.orig_shape[0]  # y coordinates scale by image height
        return xyxy

    @property
    @lru_cache(maxsize=2)
    def xywhn(self):
        """
        Returns normalized bounding boxes in [x, y, width, height] format.

        This property calculates and returns the normalized bounding box coordinates in the format
        [x_center, y_center, width, height], where all values are relative to the original image dimensions.

        Returns:
            (torch.Tensor | numpy.ndarray): Normalized bounding boxes with shape (N, 4), where N is the
                number of boxes. Each row contains [x_center, y_center, width, height] values normalized
                to [0, 1] based on the original image dimensions.

        Examples:
            >>> boxes = Boxes(torch.tensor([[100, 50, 150, 100, 0.9, 0]]), orig_shape=(480, 640))
            >>> normalized = boxes.xywhn
            >>> print(normalized)
            tensor([[0.1953, 0.1562, 0.0781, 0.1042]])
        """
        # Convert from xyxy directly instead of reusing self.xywh, so the in-place division below
        # cannot alter that property's cached result.
        xywh = ops.xyxy2xywh(self.xyxy)
        xywh[..., [0, 2]] /= self.orig_shape[1]  # x_center and width scale by image width
        xywh[..., [1, 3]] /= self.orig_shape[0]  # y_center and height scale by image height
        return xywh
class Masks(BaseTensor):
    """
    A class for storing and manipulating detection masks.

    This class extends BaseTensor and provides functionality for handling segmentation masks,
    including methods for converting between pixel and normalized coordinates.

    Attributes:
        data (torch.Tensor | numpy.ndarray): The raw tensor or array containing mask data.
        orig_shape (tuple): Original image shape in (height, width) format.
        xy (List[numpy.ndarray]): A list of segments in pixel coordinates.
        xyn (List[numpy.ndarray]): A list of normalized segments.

    Methods:
        cpu(): Returns a copy of the Masks object with the mask tensor on CPU memory.
        numpy(): Returns a copy of the Masks object with the mask tensor as a numpy array.
        cuda(): Returns a copy of the Masks object with the mask tensor on GPU memory.
        to(*args, **kwargs): Returns a copy of the Masks object with the mask tensor on specified device and dtype.

    Examples:
        >>> masks_data = torch.rand(1, 160, 160)
        >>> orig_shape = (720, 1280)
        >>> masks = Masks(masks_data, orig_shape)
        >>> pixel_coords = masks.xy
        >>> normalized_coords = masks.xyn
    """

    def __init__(self, masks, orig_shape) -> None:
        """
        Initialize the Masks class with detection mask data and the original image shape.

        Args:
            masks (torch.Tensor | np.ndarray): Detection masks with shape (num_masks, height, width). A 2D input
                is treated as a single mask.
            orig_shape (tuple): The original image shape as (height, width). Used for normalization.

        Examples:
            >>> import torch
            >>> from ultralytics.engine.results import Masks
            >>> masks = torch.rand(10, 160, 160)  # 10 masks of 160x160 resolution
            >>> orig_shape = (720, 1280)  # Original image shape
            >>> mask_obj = Masks(masks, orig_shape)
        """
        if masks.ndim == 2:
            masks = masks[None, :]  # promote a single (H, W) mask to a batch of one
        super().__init__(masks, orig_shape)

    @property
    @lru_cache(maxsize=1)
    def xyn(self):
        """
        Returns normalized xy-coordinates of the segmentation masks.

        This property calculates and caches the normalized xy-coordinates of the segmentation masks. The coordinates
        are normalized relative to the original image shape.

        Returns:
            (List[numpy.ndarray]): A list of numpy arrays, where each array contains the normalized xy-coordinates
                of a single segmentation mask. Each array has shape (N, 2), where N is the number of points in the
                mask contour.

        Examples:
            >>> results = model("image.jpg")
            >>> masks = results[0].masks
            >>> normalized_coords = masks.xyn
            >>> print(normalized_coords[0])  # Normalized coordinates of the first mask
        """
        # Segments are extracted at mask resolution (self.data.shape[1:]) and rescaled to orig_shape.
        return [
            ops.scale_coords(self.data.shape[1:], x, self.orig_shape, normalize=True)
            for x in ops.masks2segments(self.data)
        ]

    @property
    @lru_cache(maxsize=1)
    def xy(self):
        """
        Returns the [x, y] pixel coordinates for each segment in the mask tensor.

        This property calculates and returns a list of pixel coordinates for each segmentation mask in the
        Masks object. The coordinates are scaled to match the original image dimensions.

        Returns:
            (List[numpy.ndarray]): A list of numpy arrays, where each array contains the [x, y] pixel
                coordinates for a single segmentation mask. Each array has shape (N, 2), where N is the
                number of points in the segment.

        Examples:
            >>> results = model("image.jpg")
            >>> masks = results[0].masks
            >>> xy_coords = masks.xy
            >>> print(len(xy_coords))  # Number of masks
            >>> print(xy_coords[0].shape)  # Shape of first mask's coordinates
        """
        # Same pipeline as `xyn`, but the rescaled coordinates are kept in pixels.
        return [
            ops.scale_coords(self.data.shape[1:], x, self.orig_shape, normalize=False)
            for x in ops.masks2segments(self.data)
        ]
class Keypoints(BaseTensor):
    """
    A class for storing and manipulating detection keypoints.

    This class encapsulates functionality for handling keypoint data, including coordinate manipulation,
    normalization, and confidence values.

    Attributes:
        data (torch.Tensor): The raw tensor containing keypoint data.
        orig_shape (Tuple[int, int]): The original image dimensions (height, width).
        has_visible (bool): Indicates whether visibility information is available for keypoints.
        xy (torch.Tensor): Keypoint coordinates in [x, y] format.
        xyn (torch.Tensor): Normalized keypoint coordinates in [x, y] format, relative to orig_shape.
        conf (torch.Tensor): Confidence values for each keypoint, if available.

    Methods:
        cpu(): Returns a copy of the keypoints tensor on CPU memory.
        numpy(): Returns a copy of the keypoints tensor as a numpy array.
        cuda(): Returns a copy of the keypoints tensor on GPU memory.
        to(*args, **kwargs): Returns a copy of the keypoints tensor with specified device and dtype.

    Examples:
        >>> import torch
        >>> from ultralytics.engine.results import Keypoints
        >>> keypoints_data = torch.rand(1, 17, 3)  # 1 detection, 17 keypoints, (x, y, conf)
        >>> orig_shape = (480, 640)  # Original image shape (height, width)
        >>> keypoints = Keypoints(keypoints_data, orig_shape)
        >>> print(keypoints.xy.shape)  # Access xy coordinates
        >>> print(keypoints.conf)  # Access confidence values
        >>> keypoints_cpu = keypoints.cpu()  # Move keypoints to CPU
    """

    @smart_inference_mode()  # avoid keypoints < conf in-place error
    def __init__(self, keypoints, orig_shape) -> None:
        """
        Initializes the Keypoints object with detection keypoints and original image dimensions.

        This method processes the input keypoints tensor, handling both 2D and 3D formats. When the last
        dimension holds (x, y, confidence), it masks out low-confidence keypoints by setting their coordinates
        to zero. The masking is applied in place, so the tensor passed in is modified.

        Args:
            keypoints (torch.Tensor): A tensor containing keypoint data. Shape can be either:
                - (num_objects, num_keypoints, 2) for x, y coordinates only
                - (num_objects, num_keypoints, 3) for x, y coordinates and confidence scores
                A 2D input is treated as the keypoints of a single object.
            orig_shape (Tuple[int, int]): The original image dimensions (height, width).

        Examples:
            >>> kpts = torch.rand(1, 17, 3)  # 1 object, 17 keypoints (COCO format), x,y,conf
            >>> orig_shape = (720, 1280)  # Original image height, width
            >>> keypoints = Keypoints(kpts, orig_shape)
        """
        if keypoints.ndim == 2:
            keypoints = keypoints[None, :]  # promote a single object to a batch of one
        if keypoints.shape[2] == 3:  # x, y, conf
            mask = keypoints[..., 2] < 0.5  # points with conf < 0.5 (not visible)
            keypoints[..., :2][mask] = 0  # zero only x, y; the confidence column is left untouched
        super().__init__(keypoints, orig_shape)
        self.has_visible = self.data.shape[-1] == 3

    @property
    @lru_cache(maxsize=1)
    def xy(self):
        """
        Returns x, y coordinates of keypoints.

        Returns:
            (torch.Tensor): A tensor containing the x, y coordinates of keypoints with shape (N, K, 2), where N is
                the number of detections and K is the number of keypoints per detection.

        Examples:
            >>> results = model("image.jpg")
            >>> keypoints = results[0].keypoints
            >>> xy = keypoints.xy
            >>> print(xy.shape)  # (N, K, 2)
            >>> print(xy[0])  # x, y coordinates of keypoints for first detection

        Notes:
            - The returned coordinates are in pixel units relative to the original image dimensions.
            - If keypoints were initialized with confidence values, keypoints with confidence < 0.5 are still
              present but have their x, y coordinates set to zero.
            - This property uses LRU caching to improve performance on repeated access.
        """
        return self.data[..., :2]

    @property
    @lru_cache(maxsize=1)
    def xyn(self):
        """
        Returns normalized coordinates (x, y) of keypoints relative to the original image size.

        Returns:
            (torch.Tensor | numpy.ndarray): A tensor or array of shape (N, K, 2) containing normalized keypoint
                coordinates, where N is the number of instances, K is the number of keypoints, and the last
                dimension contains [x, y] values in the range [0, 1].

        Examples:
            >>> keypoints = Keypoints(torch.rand(1, 17, 2), orig_shape=(480, 640))
            >>> normalized_kpts = keypoints.xyn
            >>> print(normalized_kpts.shape)
            torch.Size([1, 17, 2])
        """
        points = self.xy
        # Work on a copy so the in-place division never writes into self.data.
        xy = points.clone() if isinstance(points, torch.Tensor) else np.copy(points)
        xy[..., 0] /= self.orig_shape[1]  # x scales by image width
        xy[..., 1] /= self.orig_shape[0]  # y scales by image height
        return xy

    @property
    @lru_cache(maxsize=1)
    def conf(self):
        """
        Returns confidence values for each keypoint.

        Returns:
            (torch.Tensor | None): A tensor containing confidence scores for each keypoint if available,
                otherwise None. Shape is (num_detections, num_keypoints) for batched data or (num_keypoints,)
                for single detection.

        Examples:
            >>> keypoints = Keypoints(torch.rand(1, 17, 3), orig_shape=(640, 640))  # 1 detection, 17 keypoints
            >>> conf = keypoints.conf
            >>> print(conf.shape)  # torch.Size([1, 17])
        """
        return self.data[..., 2] if self.has_visible else None
class Probs(BaseTensor):
    """
    A class for storing and manipulating classification probabilities.

    This class extends BaseTensor and provides methods for accessing and manipulating
    classification probabilities, including top-1 and top-5 predictions.

    Attributes:
        data (torch.Tensor | numpy.ndarray): The raw tensor or array containing classification probabilities.
        orig_shape (tuple | None): The original image shape as (height, width). Not used in this class.
        top1 (int): Index of the class with the highest probability.
        top5 (List[int]): Indices of the top 5 classes by probability.
        top1conf (torch.Tensor | numpy.ndarray): Confidence score of the top 1 class.
        top5conf (torch.Tensor | numpy.ndarray): Confidence scores of the top 5 classes.

    Methods:
        cpu(): Returns a copy of the probabilities tensor on CPU memory.
        numpy(): Returns a copy of the probabilities tensor as a numpy array.
        cuda(): Returns a copy of the probabilities tensor on GPU memory.
        to(*args, **kwargs): Returns a copy of the probabilities tensor with specified device and dtype.

    Examples:
        >>> probs = torch.tensor([0.1, 0.3, 0.6])
        >>> p = Probs(probs)
        >>> print(p.top1)
        2
        >>> print(p.top5)
        [2, 1, 0]
        >>> print(p.top1conf)
        tensor(0.6000)
        >>> print(p.top5conf)
        tensor([0.6000, 0.3000, 0.1000])
    """

    def __init__(self, probs, orig_shape=None) -> None:
        """
        Initialize the Probs class with classification probabilities.

        This class stores and manages classification probabilities, providing easy access to top predictions and their
        confidences.

        Args:
            probs (torch.Tensor | np.ndarray): A 1D tensor or array of classification probabilities.
            orig_shape (tuple | None): The original image shape as (height, width). Not used in this class but kept for
                consistency with other result classes.

        Attributes:
            data (torch.Tensor | np.ndarray): The raw tensor or array containing classification probabilities.
            top1 (int): Index of the top 1 class.
            top5 (List[int]): Indices of the top 5 classes.
            top1conf (torch.Tensor | np.ndarray): Confidence of the top 1 class.
            top5conf (torch.Tensor | np.ndarray): Confidences of the top 5 classes.

        Examples:
            >>> import torch
            >>> probs = torch.tensor([0.1, 0.3, 0.2, 0.4])
            >>> p = Probs(probs)
            >>> print(p.top1)
            3
            >>> print(p.top1conf)
            tensor(0.4000)
            >>> print(p.top5)
            [3, 1, 2, 0]
        """
        super().__init__(probs, orig_shape)

    @property
    @lru_cache(maxsize=1)
    def top1(self):
        """
        Returns the index of the class with the highest probability.

        Returns:
            (int): Index of the class with the highest probability.

        Examples:
            >>> probs = Probs(torch.tensor([0.1, 0.3, 0.6]))
            >>> probs.top1
            2
        """
        return int(self.data.argmax())

    @property
    @lru_cache(maxsize=1)
    def top5(self):
        """
        Returns the indices of the top 5 class probabilities.

        Returns:
            (List[int]): A list containing the indices of the top 5 class probabilities, sorted in descending order.
                Fewer than 5 indices are returned when there are fewer than 5 classes.

        Examples:
            >>> probs = Probs(torch.tensor([0.1, 0.2, 0.3, 0.4, 0.5]))
            >>> print(probs.top5)
            [4, 3, 2, 1, 0]
        """
        # Sorting the negated values ascending yields descending order of probability.
        return (-self.data).argsort(0)[:5].tolist()  # this way works with both torch and numpy.

    @property
    @lru_cache(maxsize=1)
    def top1conf(self):
        """
        Returns the confidence score of the highest probability class.

        This property retrieves the confidence score (probability) of the class with the highest predicted probability
        from the classification results.

        Returns:
            (torch.Tensor | numpy.ndarray): A tensor containing the confidence score of the top 1 class.

        Examples:
            >>> results = model("image.jpg")  # classify an image
            >>> probs = results[0].probs  # get classification probabilities
            >>> top1_confidence = probs.top1conf  # get confidence of top 1 class
            >>> print(f"Top 1 class confidence: {top1_confidence.item():.4f}")
        """
        return self.data[self.top1]

    @property
    @lru_cache(maxsize=1)
    def top5conf(self):
        """
        Returns confidence scores for the top 5 classification predictions.

        This property retrieves the confidence scores corresponding to the top 5 class probabilities
        predicted by the model. It provides a quick way to access the most likely class predictions
        along with their associated confidence levels.

        Returns:
            (torch.Tensor | numpy.ndarray): A tensor or array containing the confidence scores for the
                top 5 predicted classes, sorted in descending order of probability.

        Examples:
            >>> results = model("image.jpg")
            >>> probs = results[0].probs
            >>> top5_conf = probs.top5conf
            >>> print(top5_conf)  # Prints confidence scores for top 5 classes
        """
        return self.data[self.top5]
class OBB(BaseTensor): | |
""" | |
A class for storing and manipulating Oriented Bounding Boxes (OBB). | |
This class provides functionality to handle oriented bounding boxes, including conversion between | |
different formats, normalization, and access to various properties of the boxes. | |
Attributes: | |
data (torch.Tensor): The raw OBB tensor containing box coordinates and associated data. | |
orig_shape (tuple): Original image size as (height, width). | |
is_track (bool): Indicates whether tracking IDs are included in the box data. | |
xywhr (torch.Tensor | numpy.ndarray): Boxes in [x_center, y_center, width, height, rotation] format. | |
conf (torch.Tensor | numpy.ndarray): Confidence scores for each box. | |
cls (torch.Tensor | numpy.ndarray): Class labels for each box. | |
id (torch.Tensor | numpy.ndarray): Tracking IDs for each box, if available. | |
xyxyxyxy (torch.Tensor | numpy.ndarray): Boxes in 8-point [x1, y1, x2, y2, x3, y3, x4, y4] format. | |
xyxyxyxyn (torch.Tensor | numpy.ndarray): Normalized 8-point coordinates relative to orig_shape. | |
xyxy (torch.Tensor | numpy.ndarray): Axis-aligned bounding boxes in [x1, y1, x2, y2] format. | |
Methods: | |
cpu(): Returns a copy of the OBB object with all tensors on CPU memory. | |
numpy(): Returns a copy of the OBB object with all tensors as numpy arrays. | |
cuda(): Returns a copy of the OBB object with all tensors on GPU memory. | |
to(*args, **kwargs): Returns a copy of the OBB object with tensors on specified device and dtype. | |
Examples: | |
>>> boxes = torch.tensor([[100, 50, 150, 100, 30, 0.9, 0]]) # xywhr, conf, cls | |
>>> obb = OBB(boxes, orig_shape=(480, 640)) | |
>>> print(obb.xyxyxyxy) | |
>>> print(obb.conf) | |
>>> print(obb.cls) | |
""" | |
def __init__(self, boxes, orig_shape) -> None:
    """
    Initialize an OBB (Oriented Bounding Box) instance with oriented bounding box data and original image shape.

    This class stores and manipulates Oriented Bounding Boxes (OBB) for object detection tasks. It provides
    various properties and methods to access and transform the OBB data.

    Args:
        boxes (torch.Tensor | numpy.ndarray): A tensor or numpy array containing the detection boxes,
            with shape (num_boxes, 7) or (num_boxes, 8). The last two columns contain confidence and class values.
            If present, the third last column contains track IDs, and the fifth column contains rotation.
            A 1D input is treated as a single box.
        orig_shape (Tuple[int, int]): Original image size, in the format (height, width).

    Attributes:
        data (torch.Tensor | numpy.ndarray): The raw OBB tensor.
        orig_shape (Tuple[int, int]): The original image shape.
        is_track (bool): Whether the boxes include tracking IDs.

    Raises:
        AssertionError: If the number of values per box is not 7 or 8.

    Examples:
        >>> import torch
        >>> boxes = torch.rand(3, 7)  # 3 boxes with 7 values each
        >>> orig_shape = (640, 480)
        >>> obb = OBB(boxes, orig_shape)
        >>> print(obb.xywhr)  # Access the boxes in xywhr format
    """
    if boxes.ndim == 1:
        boxes = boxes[None, :]  # promote a single box to a batch of one
    n = boxes.shape[-1]
    assert n in {7, 8}, f"expected 7 or 8 values but got {n}"  # xywh, rotation, track_id, conf, cls
    super().__init__(boxes, orig_shape)
    self.is_track = n == 8  # an 8th column means a track ID sits before conf and cls
    self.orig_shape = orig_shape
@property
def xywhr(self):
    """
    Returns boxes in [x_center, y_center, width, height, rotation] format.

    Returns:
        (torch.Tensor | numpy.ndarray): A tensor or numpy array containing the oriented bounding boxes with format
            [x_center, y_center, width, height, rotation]. The shape is (N, 5) where N is the number of boxes.

    Examples:
        >>> results = model("image.jpg")
        >>> obb = results[0].obb
        >>> xywhr = obb.xywhr
        >>> print(xywhr.shape)
        torch.Size([3, 5])
    """
    return self.data[:, :5]
@property
def conf(self):
    """
    Returns the confidence scores for Oriented Bounding Boxes (OBBs).

    This property retrieves the confidence values associated with each OBB detection, which are stored in the
    second-to-last column of `self.data`.

    Returns:
        (torch.Tensor | numpy.ndarray): A tensor or numpy array of shape (N,) containing confidence scores
            for N detections.

    Examples:
        >>> results = model("image.jpg")
        >>> obb_result = results[0].obb
        >>> confidence_scores = obb_result.conf
        >>> print(confidence_scores)
    """
    return self.data[:, -2]
@property
def cls(self):
    """
    Returns the class values of the oriented bounding boxes.

    The class values are stored in the last column of `self.data`.

    Returns:
        (torch.Tensor | numpy.ndarray): A tensor or numpy array containing the class values for each oriented
            bounding box. The shape is (N,), where N is the number of boxes.

    Examples:
        >>> results = model("image.jpg")
        >>> result = results[0]
        >>> obb = result.obb
        >>> class_values = obb.cls
        >>> print(class_values)
    """
    return self.data[:, -1]
@property
def id(self):
    """
    Returns the tracking IDs of the oriented bounding boxes (if available).

    Tracking IDs are read from the third-to-last column of `self.data`, which only exists when `is_track`
    is True.

    Returns:
        (torch.Tensor | numpy.ndarray | None): A tensor or numpy array containing the tracking IDs for each
            oriented bounding box. Returns None if tracking IDs are not available.

    Examples:
        >>> results = model.track("path/to/video.mp4")  # Run inference with tracking
        >>> for result in results:
        ...     if result.obb is not None:
        ...         track_ids = result.obb.id
        ...         if track_ids is not None:
        ...             print(f"Tracking IDs: {track_ids}")
    """
    return self.data[:, -3] if self.is_track else None
@property
@lru_cache(maxsize=2)
def xyxyxyxy(self):
    """
    Converts OBB format to 8-point (xyxyxyxy) coordinate format for rotated bounding boxes.

    The conversion is delegated to `ops.xywhr2xyxyxyxy`, applied to `self.xywhr`.

    Returns:
        (torch.Tensor | numpy.ndarray): Rotated bounding boxes in xyxyxyxy format with shape (N, 4, 2), where N is
            the number of boxes. Each box is represented by 4 corner points (x, y); the corner ordering is
            whatever `ops.xywhr2xyxyxyxy` produces.

    Examples:
        >>> obb = OBB(torch.tensor([[100, 100, 50, 30, 0.5, 0.9, 0]]), orig_shape=(640, 640))
        >>> xyxyxyxy = obb.xyxyxyxy
        >>> print(xyxyxyxy.shape)
        torch.Size([1, 4, 2])
    """
    return ops.xywhr2xyxyxyxy(self.xywhr)
@property
def xyxyxyxyn(self):
    """
    Return the 4-corner (xyxyxyxy) boxes normalized by the original image size.

    The corner points are copied, then x coordinates are divided by the image width (`orig_shape[1]`) and
    y coordinates by the image height (`orig_shape[0]`). The pixel-space corners are left untouched.

    Returns:
        (torch.Tensor | numpy.ndarray): Normalized rotated bounding boxes with shape (N, 4, 2), where N is
            the number of boxes. Each box is represented by 4 (x, y) points expressed relative to the
            original image dimensions.

    Examples:
        >>> obb = OBB(torch.rand(10, 7), orig_shape=(640, 480))  # 10 random OBBs
        >>> normalized_boxes = obb.xyxyxyxyn
        >>> print(normalized_boxes.shape)
        torch.Size([10, 4, 2])
    """
    corners = self.xyxyxyxy  # read once; reused for the type check and the copy
    # Copy before dividing so the in-place operations below never modify the pixel-space corners.
    normalized = corners.clone() if isinstance(corners, torch.Tensor) else np.copy(corners)
    normalized[..., 0] /= self.orig_shape[1]  # x / width
    normalized[..., 1] /= self.orig_shape[0]  # y / height
    return normalized
@property
def xyxy(self):
    """
    Convert oriented bounding boxes (OBB) to axis-aligned bounding boxes in xyxy format.

    For every box, the minimum and maximum x and y over its four corner points are taken, giving the
    smallest axis-aligned rectangle (x1, y1, x2, y2) that encloses the rotated box. This is useful for
    operations that require axis-aligned boxes, such as IoU calculation with non-rotated boxes.

    Returns:
        (torch.Tensor | numpy.ndarray): Axis-aligned bounding boxes in xyxy format with shape (N, 4), where N
            is the number of boxes. Each row contains [x1, y1, x2, y2] coordinates. The return type matches
            the type of the corner points (tensor in, tensor out; array in, array out).

    Examples:
        >>> import torch
        >>> from ultralytics import YOLO
        >>> model = YOLO("yolov8n-obb.pt")
        >>> results = model("path/to/image.jpg")
        >>> for result in results:
        ...     obb = result.obb
        ...     if obb is not None:
        ...         xyxy_boxes = obb.xyxy
        ...         print(xyxy_boxes.shape)  # (N, 4)

    Notes:
        - The result is an enclosing rectangle, so it generally covers more area than the rotated box itself.
    """
    corners = self.xyxyxyxy  # read once instead of converting separately for x and y
    x = corners[..., 0]
    y = corners[..., 1]
    # Reduce over axis 1 (the four corners of each box); torch and numpy spell the reduction differently.
    if isinstance(x, torch.Tensor):
        return torch.stack([x.amin(1), y.amin(1), x.amax(1), y.amax(1)], -1)
    return np.stack([x.min(1), y.min(1), x.max(1), y.max(1)], -1)