init commit

2025-11-08 19:15:39 +01:00
parent ecffcb08e8
commit c7adacf53b
470 changed files with 73751 additions and 0 deletions

ultralytics/models/yolo/detect/__init__.py

@@ -0,0 +1,7 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
from .predict import DetectionPredictor
from .train import DetectionTrainer
from .val import DetectionValidator
__all__ = "DetectionPredictor", "DetectionTrainer", "DetectionValidator"
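
Taken together, the three exports cover the full detect workflow. A minimal usage sketch (not part of the committed files; it assumes the yolo11n.pt weights and coco8.yaml dataset referenced in the docstrings below are available):

from ultralytics.models.yolo.detect import DetectionPredictor, DetectionTrainer, DetectionValidator
from ultralytics.utils import ASSETS

# Train briefly, validate, then predict with the same weights
trainer = DetectionTrainer(overrides=dict(model="yolo11n.pt", data="coco8.yaml", epochs=3))
trainer.train()
validator = DetectionValidator(args=dict(model="yolo11n.pt", data="coco8.yaml"))
validator()
predictor = DetectionPredictor(overrides=dict(model="yolo11n.pt", source=ASSETS))
predictor.predict_cli()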

ultralytics/models/yolo/detect/predict.py

@@ -0,0 +1,125 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
from ultralytics.engine.predictor import BasePredictor
from ultralytics.engine.results import Results
from ultralytics.utils import nms, ops
class DetectionPredictor(BasePredictor):
"""
A class extending the BasePredictor class for prediction based on a detection model.
This predictor specializes in object detection tasks, processing model outputs into meaningful detection results
with bounding boxes and class predictions.
Attributes:
args (namespace): Configuration arguments for the predictor.
model (nn.Module): The detection model used for inference.
batch (list): Batch of images and metadata for processing.
Methods:
postprocess: Process raw model predictions into detection results.
construct_results: Build Results objects from processed predictions.
construct_result: Create a single Result object from a prediction.
get_obj_feats: Extract object features from the feature maps.
Examples:
>>> from ultralytics.utils import ASSETS
>>> from ultralytics.models.yolo.detect import DetectionPredictor
>>> args = dict(model="yolo11n.pt", source=ASSETS)
>>> predictor = DetectionPredictor(overrides=args)
>>> predictor.predict_cli()
"""
def postprocess(self, preds, img, orig_imgs, **kwargs):
"""
Post-process predictions and return a list of Results objects.
This method applies non-maximum suppression to raw model predictions and prepares them for visualization and
further analysis.
Args:
preds (torch.Tensor): Raw predictions from the model.
img (torch.Tensor): Processed input image tensor in model input format.
orig_imgs (torch.Tensor | list): Original input images before preprocessing.
**kwargs (Any): Additional keyword arguments.
Returns:
(list): List of Results objects containing the post-processed predictions.
Examples:
>>> predictor = DetectionPredictor(overrides=dict(model="yolo11n.pt"))
>>> results = predictor.predict("path/to/image.jpg")
>>> processed_results = predictor.postprocess(preds, img, orig_imgs)
"""
save_feats = getattr(self, "_feats", None) is not None
preds = nms.non_max_suppression(
preds,
self.args.conf,
self.args.iou,
self.args.classes,
self.args.agnostic_nms,
max_det=self.args.max_det,
nc=0 if self.args.task == "detect" else len(self.model.names),
end2end=getattr(self.model, "end2end", False),
rotated=self.args.task == "obb",
return_idxs=save_feats,
)
if not isinstance(orig_imgs, list): # input images are a torch.Tensor, not a list
orig_imgs = ops.convert_torch2numpy_batch(orig_imgs)
if save_feats:
obj_feats = self.get_obj_feats(self._feats, preds[1])
preds = preds[0]
results = self.construct_results(preds, img, orig_imgs, **kwargs)
if save_feats:
for r, f in zip(results, obj_feats):
r.feats = f # add object features to results
return results
def get_obj_feats(self, feat_maps, idxs):
"""Extract object features from the feature maps."""
import torch  # scoped import to keep module import lightweight
s = min(x.shape[1] for x in feat_maps) # find shortest vector length
obj_feats = torch.cat(
[x.permute(0, 2, 3, 1).reshape(x.shape[0], -1, s, x.shape[1] // s).mean(dim=-1) for x in feat_maps], dim=1
) # mean reduce all vectors to same length
return [feats[idx] if idx.shape[0] else [] for feats, idx in zip(obj_feats, idxs)] # for each img in batch
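# Shape walkthrough for the reduction above (an illustrative sketch; the channel counts are assumed):
# given P3/P4/P5 maps with 64/128/256 channels, s = 64, so each map is permuted to channels-last and
# reshaped to (B, H*W, 64, C // 64), then averaged over the last dim to equal-width vectors.
# >>> feat_maps = [torch.rand(1, c, n, n) for c, n in ((64, 8), (128, 4), (256, 2))]
# >>> s = min(x.shape[1] for x in feat_maps)  # 64
# >>> torch.cat([x.permute(0, 2, 3, 1).reshape(1, -1, s, x.shape[1] // s).mean(-1) for x in feat_maps], 1).shape
# torch.Size([1, 84, 64])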
def construct_results(self, preds, img, orig_imgs):
"""
Construct a list of Results objects from model predictions.
Args:
preds (list[torch.Tensor]): List of predicted bounding boxes and scores for each image.
img (torch.Tensor): Batch of preprocessed images used for inference.
orig_imgs (list[np.ndarray]): List of original images before preprocessing.
Returns:
(list[Results]): List of Results objects containing detection information for each image.
"""
return [
self.construct_result(pred, img, orig_img, img_path)
for pred, orig_img, img_path in zip(preds, orig_imgs, self.batch[0])
]
def construct_result(self, pred, img, orig_img, img_path):
"""
Construct a single Results object from one image prediction.
Args:
pred (torch.Tensor): Predicted boxes and scores with shape (N, 6) where N is the number of detections.
img (torch.Tensor): Preprocessed image tensor used for inference.
orig_img (np.ndarray): Original image before preprocessing.
img_path (str): Path to the original image file.
Returns:
(Results): Results object containing the original image, image path, class names, and scaled bounding boxes.
"""
pred[:, :4] = ops.scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
return Results(orig_img, path=img_path, names=self.model.names, boxes=pred[:, :6])
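
construct_result is a natural override point for specializing per-image outputs. A minimal sketch (HighConfPredictor and its 0.5 cutoff are illustrative, not upstream behavior):

from ultralytics.models.yolo.detect import DetectionPredictor

class HighConfPredictor(DetectionPredictor):
    def construct_result(self, pred, img, orig_img, img_path):
        # pred columns are (x1, y1, x2, y2, conf, cls); keep only confident boxes
        pred = pred[pred[:, 4] > 0.5]
        return super().construct_result(pred, img, orig_img, img_path)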

ultralytics/models/yolo/detect/train.py

@@ -0,0 +1,236 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
from __future__ import annotations
import math
import random
from copy import copy
from typing import Any
import numpy as np
import torch
import torch.nn as nn
from ultralytics.data import build_dataloader, build_yolo_dataset
from ultralytics.engine.trainer import BaseTrainer
from ultralytics.models import yolo
from ultralytics.nn.tasks import DetectionModel
from ultralytics.utils import DEFAULT_CFG, LOGGER, RANK
from ultralytics.utils.patches import override_configs
from ultralytics.utils.plotting import plot_images, plot_labels
from ultralytics.utils.torch_utils import torch_distributed_zero_first, unwrap_model
class DetectionTrainer(BaseTrainer):
"""
A class extending the BaseTrainer class for training based on a detection model.
This trainer specializes in object detection tasks, handling the specific requirements for training YOLO models
for object detection including dataset building, data loading, preprocessing, and model configuration.
Attributes:
model (DetectionModel): The YOLO detection model being trained.
data (dict): Dictionary containing dataset information including class names and number of classes.
loss_names (tuple): Names of the loss components used in training (box_loss, cls_loss, dfl_loss).
Methods:
build_dataset: Build YOLO dataset for training or validation.
get_dataloader: Construct and return dataloader for the specified mode.
preprocess_batch: Preprocess a batch of images by scaling and converting to float.
set_model_attributes: Set model attributes based on dataset information.
get_model: Return a YOLO detection model.
get_validator: Return a validator for model evaluation.
label_loss_items: Return a loss dictionary with labeled training loss items.
progress_string: Return a formatted string of training progress.
plot_training_samples: Plot training samples with their annotations.
plot_training_labels: Create a labeled training plot of the YOLO model.
auto_batch: Calculate optimal batch size based on model memory requirements.
Examples:
>>> from ultralytics.models.yolo.detect import DetectionTrainer
>>> args = dict(model="yolo11n.pt", data="coco8.yaml", epochs=3)
>>> trainer = DetectionTrainer(overrides=args)
>>> trainer.train()
"""
def __init__(self, cfg=DEFAULT_CFG, overrides: dict[str, Any] | None = None, _callbacks=None):
"""
Initialize a DetectionTrainer object for training a YOLO object detection model.
Args:
cfg (dict, optional): Default configuration dictionary containing training parameters.
overrides (dict, optional): Dictionary of parameter overrides for the default configuration.
_callbacks (list, optional): List of callback functions to be executed during training.
"""
super().__init__(cfg, overrides, _callbacks)
def build_dataset(self, img_path: str, mode: str = "train", batch: int | None = None):
"""
Build YOLO Dataset for training or validation.
Args:
img_path (str): Path to the folder containing images.
mode (str): 'train' mode or 'val' mode; users can customize different augmentations for each mode.
batch (int, optional): Size of batches; this is for 'rect' mode.
Returns:
(Dataset): YOLO dataset object configured for the specified mode.
"""
gs = max(int(unwrap_model(self.model).stride.max() if self.model else 0), 32)
return build_yolo_dataset(self.args, img_path, batch, self.data, mode=mode, rect=mode == "val", stride=gs)
def get_dataloader(self, dataset_path: str, batch_size: int = 16, rank: int = 0, mode: str = "train"):
"""
Construct and return dataloader for the specified mode.
Args:
dataset_path (str): Path to the dataset.
batch_size (int): Number of images per batch.
rank (int): Process rank for distributed training.
mode (str): 'train' for training dataloader, 'val' for validation dataloader.
Returns:
(DataLoader): PyTorch dataloader object.
"""
assert mode in {"train", "val"}, f"Mode must be 'train' or 'val', not {mode}."
with torch_distributed_zero_first(rank): # init dataset *.cache only once if DDP
dataset = self.build_dataset(dataset_path, mode, batch_size)
shuffle = mode == "train"
if getattr(dataset, "rect", False) and shuffle:
LOGGER.warning("'rect=True' is incompatible with DataLoader shuffle, setting shuffle=False")
shuffle = False
return build_dataloader(
dataset,
batch=batch_size,
workers=self.args.workers if mode == "train" else self.args.workers * 2,
shuffle=shuffle,
rank=rank,
drop_last=self.args.compile and mode == "train",
)
def preprocess_batch(self, batch: dict) -> dict:
"""
Preprocess a batch of images by scaling and converting to float.
Args:
batch (dict): Dictionary containing batch data with 'img' tensor.
Returns:
(dict): Preprocessed batch with normalized images.
"""
for k, v in batch.items():
if isinstance(v, torch.Tensor):
batch[k] = v.to(self.device, non_blocking=self.device.type == "cuda")
batch["img"] = batch["img"].float() / 255
if self.args.multi_scale:
imgs = batch["img"]
sz = (
random.randrange(int(self.args.imgsz * 0.5), int(self.args.imgsz * 1.5 + self.stride))
// self.stride
* self.stride
) # size
sf = sz / max(imgs.shape[2:]) # scale factor
if sf != 1:
ns = [
math.ceil(x * sf / self.stride) * self.stride for x in imgs.shape[2:]
] # new shape (stretched to gs-multiple)
imgs = nn.functional.interpolate(imgs, size=ns, mode="bilinear", align_corners=False)
batch["img"] = imgs
return batch
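# Worked example of the multi-scale arithmetic above (values are illustrative): with imgsz=640 and
# stride=32, randrange(320, 992) spans 0.5x-1.5x; a draw of 700 floors to sz = 700 // 32 * 32 = 672,
# so sf = 672 / 640 = 1.05 for a 640x640 batch, and each side stretches to
# math.ceil(640 * 1.05 / 32) * 32 = 672, keeping the shape a stride multiple.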
def set_model_attributes(self):
"""Set model attributes based on dataset information."""
# nl = de_parallel(self.model).model[-1].nl  # number of detection layers (to scale hyps)
# self.args.box *= 3 / nl # scale to layers
# self.args.cls *= self.data["nc"] / 80 * 3 / nl # scale to classes and layers
# self.args.cls *= (self.args.imgsz / 640) ** 2 * 3 / nl # scale to image size and layers
self.model.nc = self.data["nc"] # attach number of classes to model
self.model.names = self.data["names"] # attach class names to model
self.model.args = self.args # attach hyperparameters to model
# TODO: self.model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) * nc
def get_model(self, cfg: str | None = None, weights: str | None = None, verbose: bool = True):
"""
Return a YOLO detection model.
Args:
cfg (str, optional): Path to model configuration file.
weights (str, optional): Path to model weights.
verbose (bool): Whether to display model information.
Returns:
(DetectionModel): YOLO detection model.
"""
model = DetectionModel(cfg, nc=self.data["nc"], ch=self.data["channels"], verbose=verbose and RANK == -1)
if weights:
model.load(weights)
return model
def get_validator(self):
"""Return a DetectionValidator for YOLO model validation."""
self.loss_names = "box_loss", "cls_loss", "dfl_loss"
return yolo.detect.DetectionValidator(
self.test_loader, save_dir=self.save_dir, args=copy(self.args), _callbacks=self.callbacks
)
def label_loss_items(self, loss_items: list[float] | None = None, prefix: str = "train"):
"""
Return a loss dict with labeled training loss items tensor.
Args:
loss_items (list[float], optional): List of loss values.
prefix (str): Prefix for keys in the returned dictionary.
Returns:
(dict | list): Dictionary of labeled loss items if loss_items is provided, otherwise list of keys.
"""
keys = [f"{prefix}/{x}" for x in self.loss_names]
if loss_items is not None:
loss_items = [round(float(x), 5) for x in loss_items] # convert tensors to 5 decimal place floats
return dict(zip(keys, loss_items))
else:
return keys
def progress_string(self):
"""Return a formatted string of training progress with epoch, GPU memory, loss, instances and size."""
return ("\n" + "%11s" * (4 + len(self.loss_names))) % (
"Epoch",
"GPU_mem",
*self.loss_names,
"Instances",
"Size",
)
def plot_training_samples(self, batch: dict[str, Any], ni: int) -> None:
"""
Plot training samples with their annotations.
Args:
batch (dict[str, Any]): Dictionary containing batch data.
ni (int): Number of iterations.
"""
plot_images(
labels=batch,
paths=batch["im_file"],
fname=self.save_dir / f"train_batch{ni}.jpg",
on_plot=self.on_plot,
)
def plot_training_labels(self):
"""Create a labeled training plot of the YOLO model."""
boxes = np.concatenate([lb["bboxes"] for lb in self.train_loader.dataset.labels], 0)
cls = np.concatenate([lb["cls"] for lb in self.train_loader.dataset.labels], 0)
plot_labels(boxes, cls.squeeze(), names=self.data["names"], save_dir=self.save_dir, on_plot=self.on_plot)
def auto_batch(self):
"""
Get the optimal batch size by calculating the memory occupation of the model.
Returns:
(int): Optimal batch size.
"""
with override_configs(self.args, overrides={"cache": False}) as self.args:
train_dataset = self.build_dataset(self.data["train"], mode="train", batch=16)
max_num_obj = max(len(label["cls"]) for label in train_dataset.labels) * 4 # 4 for mosaic augmentation
del train_dataset # free memory
return super().auto_batch(max_num_obj)
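
For reference, the label_loss_items round trip behaves like this (a self-contained sketch; the loss values are made up):

# Assuming loss_names = ("box_loss", "cls_loss", "dfl_loss") as set in get_validator()
keys = [f"train/{x}" for x in ("box_loss", "cls_loss", "dfl_loss")]
vals = [round(float(x), 5) for x in (0.512, 0.301, 0.998)]
print(dict(zip(keys, vals)))  # {'train/box_loss': 0.512, 'train/cls_loss': 0.301, 'train/dfl_loss': 0.998}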

ultralytics/models/yolo/detect/val.py

@@ -0,0 +1,495 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
from __future__ import annotations
import os
from pathlib import Path
from typing import Any
import numpy as np
import torch
from ultralytics.data import build_dataloader, build_yolo_dataset, converter
from ultralytics.engine.validator import BaseValidator
from ultralytics.utils import LOGGER, nms, ops
from ultralytics.utils.checks import check_requirements
from ultralytics.utils.metrics import ConfusionMatrix, DetMetrics, box_iou
from ultralytics.utils.plotting import plot_images
class DetectionValidator(BaseValidator):
"""
A class extending the BaseValidator class for validation based on a detection model.
This class implements validation functionality specific to object detection tasks, including metrics calculation,
prediction processing, and visualization of results.
Attributes:
is_coco (bool): Whether the dataset is COCO.
is_lvis (bool): Whether the dataset is LVIS.
class_map (list[int]): Mapping from model class indices to dataset class indices.
metrics (DetMetrics): Object detection metrics calculator.
iouv (torch.Tensor): IoU thresholds for mAP calculation.
niou (int): Number of IoU thresholds.
lb (list[Any]): List for storing ground truth labels for hybrid saving.
jdict (list[dict[str, Any]]): List for storing JSON detection results.
stats (dict[str, list[torch.Tensor]]): Dictionary for storing statistics during validation.
Examples:
>>> from ultralytics.models.yolo.detect import DetectionValidator
>>> args = dict(model="yolo11n.pt", data="coco8.yaml")
>>> validator = DetectionValidator(args=args)
>>> validator()
"""
def __init__(self, dataloader=None, save_dir=None, args=None, _callbacks=None) -> None:
"""
Initialize detection validator with necessary variables and settings.
Args:
dataloader (torch.utils.data.DataLoader, optional): Dataloader to use for validation.
save_dir (Path, optional): Directory to save results.
args (dict[str, Any], optional): Arguments for the validator.
_callbacks (list[Any], optional): List of callback functions.
"""
super().__init__(dataloader, save_dir, args, _callbacks)
self.is_coco = False
self.is_lvis = False
self.class_map = None
self.args.task = "detect"
self.iouv = torch.linspace(0.5, 0.95, 10) # IoU vector for mAP@0.5:0.95
self.niou = self.iouv.numel()
self.metrics = DetMetrics()
def preprocess(self, batch: dict[str, Any]) -> dict[str, Any]:
"""
Preprocess batch of images for YOLO validation.
Args:
batch (dict[str, Any]): Batch containing images and annotations.
Returns:
(dict[str, Any]): Preprocessed batch.
"""
for k, v in batch.items():
if isinstance(v, torch.Tensor):
batch[k] = v.to(self.device, non_blocking=self.device.type == "cuda")
batch["img"] = (batch["img"].half() if self.args.half else batch["img"].float()) / 255
return batch
def init_metrics(self, model: torch.nn.Module) -> None:
"""
Initialize evaluation metrics for YOLO detection validation.
Args:
model (torch.nn.Module): Model to validate.
"""
val = self.data.get(self.args.split, "") # validation path
self.is_coco = (
isinstance(val, str)
and "coco" in val
and (val.endswith(f"{os.sep}val2017.txt") or val.endswith(f"{os.sep}test-dev2017.txt"))
) # is COCO
self.is_lvis = isinstance(val, str) and "lvis" in val and not self.is_coco # is LVIS
self.class_map = converter.coco80_to_coco91_class() if self.is_coco else list(range(1, len(model.names) + 1))
self.args.save_json |= self.args.val and (self.is_coco or self.is_lvis) and not self.training # run final val
self.names = model.names
self.nc = len(model.names)
self.end2end = getattr(model, "end2end", False)
self.seen = 0
self.jdict = []
self.metrics.names = model.names
self.confusion_matrix = ConfusionMatrix(names=model.names, save_matches=self.args.plots and self.args.visualize)
def get_desc(self) -> str:
"""Return a formatted string summarizing class metrics of YOLO model."""
return ("%22s" + "%11s" * 6) % ("Class", "Images", "Instances", "Box(P", "R", "mAP50", "mAP50-95)")
def postprocess(self, preds: torch.Tensor) -> list[dict[str, torch.Tensor]]:
"""
Apply non-maximum suppression to prediction outputs.
Args:
preds (torch.Tensor): Raw predictions from the model.
Returns:
(list[dict[str, torch.Tensor]]): Processed predictions after NMS, where each dict contains
'bboxes', 'conf', 'cls', and 'extra' tensors.
"""
outputs = nms.non_max_suppression(
preds,
self.args.conf,
self.args.iou,
nc=0 if self.args.task == "detect" else self.nc,
multi_label=True,
agnostic=self.args.single_cls or self.args.agnostic_nms,
max_det=self.args.max_det,
end2end=self.end2end,
rotated=self.args.task == "obb",
)
return [{"bboxes": x[:, :4], "conf": x[:, 4], "cls": x[:, 5], "extra": x[:, 6:]} for x in outputs]
def _prepare_batch(self, si: int, batch: dict[str, Any]) -> dict[str, Any]:
"""
Prepare a batch of images and annotations for validation.
Args:
si (int): Batch index.
batch (dict[str, Any]): Batch data containing images and annotations.
Returns:
(dict[str, Any]): Prepared batch with processed annotations.
"""
idx = batch["batch_idx"] == si
cls = batch["cls"][idx].squeeze(-1)
bbox = batch["bboxes"][idx]
ori_shape = batch["ori_shape"][si]
imgsz = batch["img"].shape[2:]
ratio_pad = batch["ratio_pad"][si]
if cls.shape[0]:
bbox = ops.xywh2xyxy(bbox) * torch.tensor(imgsz, device=self.device)[[1, 0, 1, 0]] # target boxes
return {
"cls": cls,
"bboxes": bbox,
"ori_shape": ori_shape,
"imgsz": imgsz,
"ratio_pad": ratio_pad,
"im_file": batch["im_file"][si],
}
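# Note on the scaling in _prepare_batch above (example values are illustrative): imgsz is
# (height, width), so indexing with [1, 0, 1, 0] builds a (w, h, w, h) multiplier; e.g.
# imgsz = (384, 640) maps a normalized xyxy box (0.25, 0.5, 0.75, 1.0) to pixels (160, 192, 480, 384).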
def _prepare_pred(self, pred: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
"""
Prepare predictions for evaluation against ground truth.
Args:
pred (dict[str, torch.Tensor]): Post-processed predictions from the model.
Returns:
(dict[str, torch.Tensor]): Prepared predictions in native space.
"""
if self.args.single_cls:
pred["cls"] *= 0
return pred
def update_metrics(self, preds: list[dict[str, torch.Tensor]], batch: dict[str, Any]) -> None:
"""
Update metrics with new predictions and ground truth.
Args:
preds (list[dict[str, torch.Tensor]]): List of predictions from the model.
batch (dict[str, Any]): Batch data containing ground truth.
"""
for si, pred in enumerate(preds):
self.seen += 1
pbatch = self._prepare_batch(si, batch)
predn = self._prepare_pred(pred)
cls = pbatch["cls"].cpu().numpy()
no_pred = predn["cls"].shape[0] == 0
self.metrics.update_stats(
{
**self._process_batch(predn, pbatch),
"target_cls": cls,
"target_img": np.unique(cls),
"conf": np.zeros(0) if no_pred else predn["conf"].cpu().numpy(),
"pred_cls": np.zeros(0) if no_pred else predn["cls"].cpu().numpy(),
}
)
# Evaluate
if self.args.plots:
self.confusion_matrix.process_batch(predn, pbatch, conf=self.args.conf)
if self.args.visualize:
self.confusion_matrix.plot_matches(batch["img"][si], pbatch["im_file"], self.save_dir)
if no_pred:
continue
# Save
if self.args.save_json or self.args.save_txt:
predn_scaled = self.scale_preds(predn, pbatch)
if self.args.save_json:
self.pred_to_json(predn_scaled, pbatch)
if self.args.save_txt:
self.save_one_txt(
predn_scaled,
self.args.save_conf,
pbatch["ori_shape"],
self.save_dir / "labels" / f"{Path(pbatch['im_file']).stem}.txt",
)
def finalize_metrics(self) -> None:
"""Set final values for metrics speed and confusion matrix."""
if self.args.plots:
for normalize in True, False:
self.confusion_matrix.plot(save_dir=self.save_dir, normalize=normalize, on_plot=self.on_plot)
self.metrics.speed = self.speed
self.metrics.confusion_matrix = self.confusion_matrix
self.metrics.save_dir = self.save_dir
def get_stats(self) -> dict[str, Any]:
"""
Calculate and return metrics statistics.
Returns:
(dict[str, Any]): Dictionary containing metrics results.
"""
self.metrics.process(save_dir=self.save_dir, plot=self.args.plots, on_plot=self.on_plot)
self.metrics.clear_stats()
return self.metrics.results_dict
def print_results(self) -> None:
"""Print training/validation set metrics per class."""
pf = "%22s" + "%11i" * 2 + "%11.3g" * len(self.metrics.keys) # print format
LOGGER.info(pf % ("all", self.seen, self.metrics.nt_per_class.sum(), *self.metrics.mean_results()))
if self.metrics.nt_per_class.sum() == 0:
LOGGER.warning(f"no labels found in {self.args.task} set, can not compute metrics without labels")
# Print results per class
if self.args.verbose and not self.training and self.nc > 1 and len(self.metrics.stats):
for i, c in enumerate(self.metrics.ap_class_index):
LOGGER.info(
pf
% (
self.names[c],
self.metrics.nt_per_image[c],
self.metrics.nt_per_class[c],
*self.metrics.class_result(i),
)
)
def _process_batch(self, preds: dict[str, torch.Tensor], batch: dict[str, Any]) -> dict[str, np.ndarray]:
"""
Return correct prediction matrix.
Args:
preds (dict[str, torch.Tensor]): Dictionary containing prediction data with 'bboxes' and 'cls' keys.
batch (dict[str, Any]): Batch dictionary containing ground truth data with 'bboxes' and 'cls' keys.
Returns:
(dict[str, np.ndarray]): Dictionary containing 'tp' key with correct prediction matrix of shape (N, 10) for 10 IoU levels.
"""
if batch["cls"].shape[0] == 0 or preds["cls"].shape[0] == 0:
return {"tp": np.zeros((preds["cls"].shape[0], self.niou), dtype=bool)}
iou = box_iou(batch["bboxes"], preds["bboxes"])
return {"tp": self.match_predictions(preds["cls"], batch["cls"], iou).cpu().numpy()}
def build_dataset(self, img_path: str, mode: str = "val", batch: int | None = None) -> torch.utils.data.Dataset:
"""
Build YOLO Dataset.
Args:
img_path (str): Path to the folder containing images.
mode (str): `train` mode or `val` mode; users can customize different augmentations for each mode.
batch (int, optional): Size of batches; this is for `rect`.
Returns:
(Dataset): YOLO dataset.
"""
return build_yolo_dataset(self.args, img_path, batch, self.data, mode=mode, stride=self.stride)
def get_dataloader(self, dataset_path: str, batch_size: int) -> torch.utils.data.DataLoader:
"""
Construct and return dataloader.
Args:
dataset_path (str): Path to the dataset.
batch_size (int): Size of each batch.
Returns:
(torch.utils.data.DataLoader): Dataloader for validation.
"""
dataset = self.build_dataset(dataset_path, batch=batch_size, mode="val")
return build_dataloader(
dataset, batch_size, self.args.workers, shuffle=False, rank=-1, drop_last=self.args.compile
)
def plot_val_samples(self, batch: dict[str, Any], ni: int) -> None:
"""
Plot validation image samples.
Args:
batch (dict[str, Any]): Batch containing images and annotations.
ni (int): Batch index.
"""
plot_images(
labels=batch,
paths=batch["im_file"],
fname=self.save_dir / f"val_batch{ni}_labels.jpg",
names=self.names,
on_plot=self.on_plot,
)
def plot_predictions(
self, batch: dict[str, Any], preds: list[dict[str, torch.Tensor]], ni: int, max_det: int | None = None
) -> None:
"""
Plot predicted bounding boxes on input images and save the result.
Args:
batch (dict[str, Any]): Batch containing images and annotations.
preds (list[dict[str, torch.Tensor]]): List of predictions from the model.
ni (int): Batch index.
max_det (int, optional): Maximum number of detections to plot.
"""
# TODO: optimize this
for i, pred in enumerate(preds):
pred["batch_idx"] = torch.ones_like(pred["conf"]) * i # add batch index to predictions
keys = preds[0].keys()
max_det = max_det or self.args.max_det
batched_preds = {k: torch.cat([x[k][:max_det] for x in preds], dim=0) for k in keys}
# TODO: fix this
batched_preds["bboxes"][:, :4] = ops.xyxy2xywh(batched_preds["bboxes"][:, :4]) # convert to xywh format
plot_images(
images=batch["img"],
labels=batched_preds,
paths=batch["im_file"],
fname=self.save_dir / f"val_batch{ni}_pred.jpg",
names=self.names,
on_plot=self.on_plot,
) # pred
def save_one_txt(self, predn: dict[str, torch.Tensor], save_conf: bool, shape: tuple[int, int], file: Path) -> None:
"""
Save YOLO detections to a txt file in normalized coordinates in a specific format.
Args:
predn (dict[str, torch.Tensor]): Dictionary containing predictions with keys 'bboxes', 'conf', and 'cls'.
save_conf (bool): Whether to save confidence scores.
shape (tuple[int, int]): Shape of the original image (height, width).
file (Path): File path to save the detections.
"""
from ultralytics.engine.results import Results
Results(
np.zeros((shape[0], shape[1]), dtype=np.uint8),
path=None,
names=self.names,
boxes=torch.cat([predn["bboxes"], predn["conf"].unsqueeze(-1), predn["cls"].unsqueeze(-1)], dim=1),
).save_txt(file, save_conf=save_conf)
def pred_to_json(self, predn: dict[str, torch.Tensor], pbatch: dict[str, Any]) -> None:
"""
Serialize YOLO predictions to COCO json format.
Args:
predn (dict[str, torch.Tensor]): Predictions dictionary containing 'bboxes', 'conf', and 'cls' keys
with bounding box coordinates, confidence scores, and class predictions.
pbatch (dict[str, Any]): Batch dictionary containing 'imgsz', 'ori_shape', 'ratio_pad', and 'im_file'.
Examples:
>>> result = {
... "image_id": 42,
... "file_name": "42.jpg",
... "category_id": 18,
... "bbox": [258.15, 41.29, 348.26, 243.78],
... "score": 0.236,
... }
"""
path = Path(pbatch["im_file"])
stem = path.stem
image_id = int(stem) if stem.isnumeric() else stem
box = ops.xyxy2xywh(predn["bboxes"]) # xywh
box[:, :2] -= box[:, 2:] / 2 # xy center to top-left corner
for b, s, c in zip(box.tolist(), predn["conf"].tolist(), predn["cls"].tolist()):
self.jdict.append(
{
"image_id": image_id,
"file_name": path.name,
"category_id": self.class_map[int(c)],
"bbox": [round(x, 3) for x in b],
"score": round(s, 5),
}
)
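# Worked example of the conversion above (box values are illustrative): an xyxy box
# [100, 40, 348, 284] becomes xywh-center [224, 162, 248, 244] via ops.xyxy2xywh, and
# subtracting half of w/h shifts it to COCO's top-left form [100, 40, 248, 244].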
def scale_preds(self, predn: dict[str, torch.Tensor], pbatch: dict[str, Any]) -> dict[str, torch.Tensor]:
"""Scales predictions to the original image size."""
return {
**predn,
"bboxes": ops.scale_boxes(
pbatch["imgsz"],
predn["bboxes"].clone(),
pbatch["ori_shape"],
ratio_pad=pbatch["ratio_pad"],
),
}
def eval_json(self, stats: dict[str, Any]) -> dict[str, Any]:
"""
Evaluate YOLO output in JSON format and return performance statistics.
Args:
stats (dict[str, Any]): Current statistics dictionary.
Returns:
(dict[str, Any]): Updated statistics dictionary with COCO/LVIS evaluation results.
"""
pred_json = self.save_dir / "predictions.json" # predictions
anno_json = (
self.data["path"]
/ "annotations"
/ ("instances_val2017.json" if self.is_coco else f"lvis_v1_{self.args.split}.json")
) # annotations
return self.coco_evaluate(stats, pred_json, anno_json)
def coco_evaluate(
self,
stats: dict[str, Any],
pred_json: str,
anno_json: str,
iou_types: str | list[str] = "bbox",
suffix: str | list[str] = "Box",
) -> dict[str, Any]:
"""
Evaluate COCO/LVIS metrics using faster-coco-eval library.
Performs evaluation using the faster-coco-eval library to compute mAP metrics
for object detection. Updates the provided stats dictionary with computed metrics
including mAP50, mAP50-95, and LVIS-specific metrics if applicable.
Args:
stats (dict[str, Any]): Dictionary to store computed metrics and statistics.
pred_json (str | Path): Path to JSON file containing predictions in COCO format.
anno_json (str | Path): Path to JSON file containing ground truth annotations in COCO format.
iou_types (str | list[str]): IoU type(s) for evaluation. Can be a single string or a list of strings.
Common values include "bbox", "segm", "keypoints". Defaults to "bbox".
suffix (str | list[str]): Suffix to append to metric names in stats dictionary. Should correspond
to iou_types if multiple types are provided. Defaults to "Box".
Returns:
(dict[str, Any]): Updated stats dictionary containing the computed COCO/LVIS evaluation metrics.
"""
if self.args.save_json and (self.is_coco or self.is_lvis) and len(self.jdict):
LOGGER.info(f"\nEvaluating faster-coco-eval mAP using {pred_json} and {anno_json}...")
try:
for x in pred_json, anno_json:
assert x.is_file(), f"{x} file not found"
iou_types = [iou_types] if isinstance(iou_types, str) else iou_types
suffix = [suffix] if isinstance(suffix, str) else suffix
check_requirements("faster-coco-eval>=1.6.7")
from faster_coco_eval import COCO, COCOeval_faster
anno = COCO(anno_json)
pred = anno.loadRes(pred_json)
for i, iou_type in enumerate(iou_types):
val = COCOeval_faster(
anno, pred, iouType=iou_type, lvis_style=self.is_lvis, print_function=LOGGER.info
)
val.params.imgIds = [int(Path(x).stem) for x in self.dataloader.dataset.im_files] # images to eval
val.evaluate()
val.accumulate()
val.summarize()
# update mAP50-95 and mAP50
stats[f"metrics/mAP50({suffix[i][0]})"] = val.stats_as_dict["AP_50"]
stats[f"metrics/mAP50-95({suffix[i][0]})"] = val.stats_as_dict["AP_all"]
if self.is_lvis:
stats[f"metrics/APr({suffix[i][0]})"] = val.stats_as_dict["APr"]
stats[f"metrics/APc({suffix[i][0]})"] = val.stats_as_dict["APc"]
stats[f"metrics/APf({suffix[i][0]})"] = val.stats_as_dict["APf"]
if self.is_lvis:
stats["fitness"] = stats["metrics/mAP50-95(B)"] # always use box mAP50-95 for fitness
except Exception as e:
LOGGER.warning(f"faster-coco-eval unable to run: {e}")
return stats
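
End to end, this JSON path is driven by save_json: validation writes predictions.json, and on a recognized COCO split (a val path ending in val2017.txt, per init_metrics) eval_json scores it against instances_val2017.json with faster-coco-eval. A minimal sketch, assuming a local coco.yaml dataset configured that way:

from ultralytics.models.yolo.detect import DetectionValidator

validator = DetectionValidator(args=dict(model="yolo11n.pt", data="coco.yaml", save_json=True))
stats = validator()  # COCO mAP50/mAP50-95 are merged into the returned stats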