init commit

2025-11-08 19:15:39 +01:00
parent ecffcb08e8
commit c7adacf53b
470 changed files with 73751 additions and 0 deletions
--- a/ultralytics/models/rtdetr/init.py
+++ b/ultralytics/models/rtdetr/init.py
@@ -0,0 +1,7 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+from .model import RTDETR
+from .predict import RTDETRPredictor
+from .val import RTDETRValidator
+
+__all__ = "RTDETRPredictor", "RTDETRValidator", "RTDETR"  # public names exported by a star-import of this package
--- a/ultralytics/models/rtdetr/pycache/init.cpython-310.pyc
+++ b/ultralytics/models/rtdetr/pycache/init.cpython-310.pyc
--- a/ultralytics/models/rtdetr/pycache/model.cpython-310.pyc
+++ b/ultralytics/models/rtdetr/pycache/model.cpython-310.pyc
--- a/ultralytics/models/rtdetr/pycache/predict.cpython-310.pyc
+++ b/ultralytics/models/rtdetr/pycache/predict.cpython-310.pyc
--- a/ultralytics/models/rtdetr/pycache/train.cpython-310.pyc
+++ b/ultralytics/models/rtdetr/pycache/train.cpython-310.pyc
--- a/ultralytics/models/rtdetr/pycache/val.cpython-310.pyc
+++ b/ultralytics/models/rtdetr/pycache/val.cpython-310.pyc
--- a/ultralytics/models/rtdetr/model.py
+++ b/ultralytics/models/rtdetr/model.py
@@ -0,0 +1,66 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+"""
+Interface for Baidu's RT-DETR, a Vision Transformer-based real-time object detector.
+
+RT-DETR offers real-time performance and high accuracy, excelling in accelerated backends like CUDA with TensorRT.
+It features an efficient hybrid encoder and IoU-aware query selection for enhanced detection accuracy.
+
+References:
+    https://arxiv.org/pdf/2304.08069.pdf
+"""
+
+from ultralytics.engine.model import Model
+from ultralytics.nn.tasks import RTDETRDetectionModel
+from ultralytics.utils.torch_utils import TORCH_1_11
+
+from .predict import RTDETRPredictor
+from .train import RTDETRTrainer
+from .val import RTDETRValidator
+
+
+class RTDETR(Model):
+    """
+    Ultralytics entry point for Baidu's RT-DETR, a Vision Transformer-based real-time object detector.
+
+    The class fixes the task to "detect" and exposes the RT-DETR predictor, validator, trainer and network classes
+    through `task_map`.
+
+    Attributes:
+        model (str): Path to the pre-trained model.
+
+    Methods:
+        task_map: Return the mapping from task name to the RT-DETR component classes.
+
+    Examples:
+        Initialize RT-DETR with a pre-trained model
+        >>> from ultralytics import RTDETR
+        >>> model = RTDETR("rtdetr-l.pt")
+        >>> results = model("image.jpg")
+    """
+
+    def __init__(self, model: str = "rtdetr-l.pt") -> None:
+        """
+        Build the RT-DETR interface from a model file, always with task="detect".
+
+        Args:
+            model (str): Path to the pre-trained model. Supports .pt, .yaml, and .yml formats.
+        """
+        assert TORCH_1_11, "RTDETR requires torch>=1.11"
+        super().__init__(model=model, task="detect")
+
+    @property
+    def task_map(self) -> dict:
+        """
+        Return the mapping from task name to the RT-DETR component classes.
+
+        Returns:
+            (dict): A dictionary whose single "detect" entry names the predictor, validator, trainer and model classes.
+        """
+        # "detect" is the only task wired up for RT-DETR, so the map has exactly one entry.
+        detect_components = dict(
+            predictor=RTDETRPredictor,
+            validator=RTDETRValidator,
+            trainer=RTDETRTrainer,
+            model=RTDETRDetectionModel,
+        )
+        return {"detect": detect_components}
--- a/ultralytics/models/rtdetr/predict.py
+++ b/ultralytics/models/rtdetr/predict.py
@@ -0,0 +1,92 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+import torch
+
+from ultralytics.data.augment import LetterBox
+from ultralytics.engine.predictor import BasePredictor
+from ultralytics.engine.results import Results
+from ultralytics.utils import ops
+
+
+class RTDETRPredictor(BasePredictor):
+    """
+    Predictor for RT-DETR (Real-Time Detection Transformer) models, built on top of BasePredictor.
+
+    It overrides the image pre-transform (a LetterBox resize to `imgsz`) and the post-processing step, which
+    turns raw (box, class-score) predictions into Results objects by confidence and class filtering.
+
+    Attributes:
+        imgsz (int): Image size for inference (must be square and scale-filled).
+        args (dict): Argument overrides for the predictor.
+        model (torch.nn.Module): The loaded RT-DETR model.
+        batch (list): Current batch of processed inputs.
+
+    Methods:
+        postprocess: Convert raw model predictions into per-image Results with scaled boxes.
+        pre_transform: Resize input images with LetterBox before they are fed to the model.
+
+    Examples:
+        >>> from ultralytics.utils import ASSETS
+        >>> from ultralytics.models.rtdetr import RTDETRPredictor
+        >>> args = dict(model="rtdetr-l.pt", source=ASSETS)
+        >>> predictor = RTDETRPredictor(overrides=args)
+        >>> predictor.predict_cli()
+    """
+
+    def postprocess(self, preds, img, orig_imgs):
+        """
+        Turn raw model output into one Results object per image.
+
+        Each prediction row is split into 4 box values and the per-class scores. Rows are kept when their best class
+        score exceeds `self.args.conf` (and, if `self.args.classes` is set, when that class is requested), ordered by
+        score, capped at `self.args.max_det`, and scaled by the original image width and height.
+
+        Args:
+            preds (list | tuple | torch.Tensor): Model output; element 0 (or the bare tensor) holds boxes and scores.
+            img (torch.Tensor): Processed input images; not used by this method.
+            orig_imgs (list | torch.Tensor): Original, unprocessed images.
+
+        Returns:
+            results (list[Results]): One Results object per image whose boxes are rows of
+                (x1, y1, x2, y2, score, class) scaled by the original image size.
+        """
+        # Export inference yields a bare tensor, PyTorch inference a list/tuple whose first element is the tensor
+        raw = preds[0] if isinstance(preds, (list, tuple)) else preds
+        n_cols = raw.shape[-1]
+        boxes_all, scores_all = raw.split((4, n_cols - 4), dim=-1)
+
+        if not isinstance(orig_imgs, list):  # a tensor batch goes through convert_torch2numpy_batch first
+            orig_imgs = ops.convert_torch2numpy_batch(orig_imgs)
+
+        results = []
+        for box, cls_scores, orig_img, img_path in zip(boxes_all, scores_all, orig_imgs, self.batch[0]):
+            box = ops.xywh2xyxy(box)
+            best, cls = cls_scores.max(-1, keepdim=True)  # best score and its class index per row
+            keep = best.squeeze(-1) > self.args.conf
+            if self.args.classes is not None:
+                wanted = torch.tensor(self.args.classes, device=cls.device)
+                keep = (cls == wanted).any(1) & keep
+            det = torch.cat([box, best, cls], dim=-1)[keep]
+            det = det[det[:, 4].argsort(descending=True)][: self.args.max_det]
+            oh, ow = orig_img.shape[:2]
+            det[..., [0, 2]] *= ow  # x coordinates -> original width
+            det[..., [1, 3]] *= oh  # y coordinates -> original height
+            results.append(Results(orig_img, path=img_path, names=self.model.names, boxes=det))
+        return results
+
+    def pre_transform(self, im):
+        """
+        Resize the input images with LetterBox before they are passed to the model.
+
+        The transform is built with `auto=False` and `scale_fill=True` for the target size `self.imgsz`; the size
+        is expected to be square (640).
+
+        Args:
+            im (list[np.ndarray]  | torch.Tensor): Input images of shape (N, 3, H, W) for tensor,
+                [(H, W, 3) x N] for list.
+
+        Returns:
+            (list): The transformed images, one per input image.
+        """
+        resize = LetterBox(self.imgsz, auto=False, scale_fill=True)
+        return [resize(image=frame) for frame in im]
--- a/ultralytics/models/rtdetr/train.py
+++ b/ultralytics/models/rtdetr/train.py
@@ -0,0 +1,92 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+from __future__ import annotations
+
+from copy import copy
+
+from ultralytics.models.yolo.detect import DetectionTrainer
+from ultralytics.nn.tasks import RTDETRDetectionModel
+from ultralytics.utils import RANK, colorstr
+
+from .val import RTDETRDataset, RTDETRValidator
+
+
+class RTDETRTrainer(DetectionTrainer):
+    """
+    Trainer for Baidu's RT-DETR real-time object detector.
+
+    Specialises the YOLO DetectionTrainer: it builds an RTDETRDetectionModel, RTDETRDataset and RTDETRValidator.
+
+    Attributes:
+        loss_names (tuple): Names of the loss components used for training.
+        data (dict): Dataset configuration containing class count and other parameters.
+        args (dict): Training arguments and hyperparameters.
+        save_dir (Path): Directory to save training results.
+        test_loader (DataLoader): DataLoader for validation/testing data.
+
+    Methods:
+        get_model: Initialize and return an RT-DETR model for object detection tasks.
+        build_dataset: Build and return an RT-DETR dataset for training or validation.
+        get_validator: Set the loss names and return an RTDETRValidator.
+
+    Notes:
+        - F.grid_sample used in RT-DETR does not support the `deterministic=True` argument.
+        - AMP training can lead to NaN outputs and may produce errors during bipartite graph matching.
+
+    Examples:
+        >>> from ultralytics.models.rtdetr.train import RTDETRTrainer
+        >>> args = dict(model="rtdetr-l.yaml", data="coco8.yaml", imgsz=640, epochs=3)
+        >>> trainer = RTDETRTrainer(overrides=args)
+        >>> trainer.train()
+    """
+
+    def get_model(self, cfg: dict | None = None, weights: str | None = None, verbose: bool = True):
+        """
+        Create the RT-DETR network and optionally load pre-trained weights into it.
+
+        Args:
+            cfg (dict, optional): Model configuration.
+            weights (str, optional): Path to pre-trained model weights.
+            verbose (bool): Request verbose construction; only honoured when RANK is -1.
+
+        Returns:
+            (RTDETRDetectionModel): Initialized model.
+        """
+        show_summary = verbose and RANK == -1  # verbosity is suppressed whenever RANK is not -1
+        net = RTDETRDetectionModel(cfg, nc=self.data["nc"], ch=self.data["channels"], verbose=show_summary)
+        if weights:
+            net.load(weights)
+        return net
+
+    def build_dataset(self, img_path: str, mode: str = "val", batch: int | None = None):
+        """
+        Construct the RTDETRDataset for one split; augmentation is on only when `mode` is "train".
+
+        Args:
+            img_path (str): Path to the folder containing images.
+            mode (str): Dataset mode, either 'train' or 'val'.
+            batch (int, optional): Batch size, forwarded as `batch_size` (`rect` is always False here).
+
+        Returns:
+            (RTDETRDataset): Dataset object for the specific mode.
+        """
+        training = mode == "train"
+        return RTDETRDataset(
+            img_path=img_path,
+            imgsz=self.args.imgsz,
+            batch_size=batch,
+            augment=training,
+            hyp=self.args,
+            rect=False,
+            cache=self.args.cache or None,
+            single_cls=self.args.single_cls or False,
+            prefix=colorstr(f"{mode}: "),
+            classes=self.args.classes,
+            data=self.data,
+            fraction=self.args.fraction if training else 1.0,
+        )
+
+    def get_validator(self):
+        """Set the RT-DETR loss names and return an RTDETRValidator built from a copy of the training args."""
+        self.loss_names = ("giou_loss", "cls_loss", "l1_loss")
+        return RTDETRValidator(self.test_loader, save_dir=self.save_dir, args=copy(self.args))
--- a/ultralytics/models/rtdetr/val.py
+++ b/ultralytics/models/rtdetr/val.py
@@ -0,0 +1,218 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+import torch
+
+from ultralytics.data import YOLODataset
+from ultralytics.data.augment import Compose, Format, v8_transforms
+from ultralytics.models.yolo.detect import DetectionValidator
+from ultralytics.utils import colorstr, ops
+
+__all__ = ("RTDETRValidator",)  # star-import exports only the validator; RTDETRDataset must be imported by name
+
+
+class RTDETRDataset(YOLODataset):
+    """
+    Dataset for RT-DETR (Real-Time Detection Transformer) training and validation, derived from YOLODataset.
+
+    Compared with the parent class it only swaps the transform pipeline: augmented samples go through
+    `v8_transforms` with `stretch=True`, and every sample is finished with a normalized xywh `Format` step.
+
+    Attributes:
+        augment (bool): Whether to apply data augmentation.
+        rect (bool): Whether to use rectangular training.
+        use_segments (bool): Whether to use segmentation masks.
+        use_keypoints (bool): Whether to use keypoint annotations.
+        imgsz (int): Target image size for training.
+
+    Methods:
+        load_image: Load one image from dataset index.
+        build_transforms: Build transformation pipeline for the dataset.
+
+    Examples:
+        Initialize an RT-DETR dataset
+        >>> dataset = RTDETRDataset(img_path="path/to/images", imgsz=640)
+        >>> image, hw = dataset.load_image(0)
+    """
+
+    def __init__(self, *args, data=None, **kwargs):
+        """
+        Initialize the dataset by forwarding every argument to YOLODataset.
+
+        No RT-DETR specific state is set up here; the constructor only makes the `data` keyword explicit.
+
+        Args:
+            *args (Any): Variable length argument list passed to the parent YOLODataset class.
+            data (dict | None): Dictionary containing dataset information, passed through as `data`.
+            **kwargs (Any): Additional keyword arguments passed to the parent YOLODataset class.
+        """
+        super().__init__(*args, data=data, **kwargs)
+
+    def load_image(self, i, rect_mode=False):
+        """
+        Load the image stored at dataset index `i` by delegating to the parent implementation.
+
+        Args:
+            i (int): Index of the image to load.
+            rect_mode (bool, optional): Whether to use rectangular mode for batch inference.
+
+        Returns:
+            The value returned by the parent `load_image`, forwarded unchanged. Per the usage example below it
+            unpacks into the loaded image and its resized (height, width).
+
+        Examples:
+            Load an image from the dataset
+            >>> dataset = RTDETRDataset(img_path="path/to/images")
+            >>> image, hw = dataset.load_image(0)
+        """
+        return super().load_image(i=i, rect_mode=rect_mode)
+
+    def build_transforms(self, hyp=None):
+        """
+        Assemble the transform pipeline, always ending with a normalized xywh `Format` step.
+
+        Args:
+            hyp (Any): Hyperparameters read as attributes (e.g. `hyp.mask_ratio`), so the None default cannot be used.
+
+        Returns:
+            (Compose): Composition of transformation functions.
+        """
+        if self.augment:
+            # Mosaic, mixup and cutmix keep their configured values unless rectangular mode is on
+            mixing_ok = not self.rect
+            hyp.mosaic = hyp.mosaic if mixing_ok else 0.0
+            hyp.mixup = hyp.mixup if mixing_ok else 0.0
+            hyp.cutmix = hyp.cutmix if mixing_ok else 0.0
+            transforms = v8_transforms(self, self.imgsz, hyp, stretch=True)
+        else:
+            # Without augmentation the pipeline starts empty (no LetterBox step is added here)
+            transforms = Compose([])
+        label_format = Format(
+            bbox_format="xywh",
+            normalize=True,
+            return_mask=self.use_segments,
+            return_keypoint=self.use_keypoints,
+            batch_idx=True,
+            mask_ratio=hyp.mask_ratio,
+            mask_overlap=hyp.overlap_mask,
+        )
+        transforms.append(label_format)
+        return transforms
+
+
+class RTDETRValidator(DetectionValidator):
+    """
+    Validator for the RT-DETR (Real-Time Detection Transformer) detection model, extending DetectionValidator.
+
+    The class builds an RTDETR-specific dataset for validation and post-processes predictions by confidence
+    thresholding and sorting; no Non-maximum suppression is applied.
+
+    Attributes:
+        args (Namespace): Configuration arguments for validation.
+        data (dict): Dataset configuration dictionary.
+
+    Methods:
+        build_dataset: Build an RTDETR Dataset for validation.
+        postprocess: Scale, sort and confidence-filter the raw prediction outputs.
+
+    Examples:
+        Initialize and run RT-DETR validation
+        >>> from ultralytics.models.rtdetr import RTDETRValidator
+        >>> args = dict(model="rtdetr-l.pt", data="coco8.yaml")
+        >>> validator = RTDETRValidator(args=args)
+        >>> validator()
+
+    Notes:
+        For further details on the attributes and methods, refer to the parent DetectionValidator class.
+    """
+
+    def build_dataset(self, img_path, mode="val", batch=None):
+        """
+        Build an RTDETR Dataset with augmentation and rectangular batching disabled.
+
+        Args:
+            img_path (str): Path to the folder containing images.
+            mode (str, optional): Split name; it is only used for the log prefix here.
+            batch (int, optional): Size of batches, forwarded as `batch_size`.
+
+        Returns:
+            (RTDETRDataset): Dataset configured for RT-DETR validation.
+        """
+        return RTDETRDataset(
+            img_path=img_path,
+            imgsz=self.args.imgsz,
+            batch_size=batch,
+            augment=False,  # no augmentation
+            hyp=self.args,
+            rect=False,  # no rect
+            cache=self.args.cache or None,
+            prefix=colorstr(f"{mode}: "),
+            data=self.data,
+        )
+
+    def postprocess(
+        self, preds: torch.Tensor | list[torch.Tensor] | tuple[torch.Tensor]
+    ) -> list[dict[str, torch.Tensor]]:
+        """
+        Convert raw predictions into per-image detections: scale boxes, sort by confidence, drop low scores.
+
+        Args:
+            preds (torch.Tensor | list | tuple): Raw predictions from the model. If tensor, should have shape
+                (batch_size, num_predictions, num_classes + 4) where last dimension contains bbox coords and class scores.
+
+        Returns:
+            (list[dict[str, torch.Tensor]]): List of dictionaries for each image, each containing:
+                - 'bboxes': Tensor of shape (N, 4) with xyxy bounding boxes scaled by `self.args.imgsz`
+                - 'conf': Tensor of shape (N,) with confidence scores, in descending order
+                - 'cls': Tensor of shape (N,) with class indices
+        """
+        if not isinstance(preds, (list, tuple)):  # list for PyTorch inference but list[0] Tensor for export inference
+            preds = [preds, None]
+
+        bs, _, nd = preds[0].shape
+        bboxes, scores = preds[0].split((4, nd - 4), dim=-1)
+        bboxes = bboxes * self.args.imgsz  # not `*=`: split() returns views, in-place would rescale the input tensor
+        outputs = [torch.zeros((0, 6), device=bboxes.device)] * bs
+        for i, bbox in enumerate(bboxes):  # (300, 4)
+            bbox = ops.xywh2xyxy(bbox)
+            score, cls = scores[i].max(-1)  # (300, )
+            pred = torch.cat([bbox, score[..., None], cls[..., None]], dim=-1)
+            # Sort by confidence to correctly get internal metrics
+            pred = pred[score.argsort(descending=True)]
+            # Threshold on the sorted rows' own confidence column; a mask from the unsorted `score` is misaligned
+            outputs[i] = pred[pred[:, 4] > self.args.conf]
+
+        return [{"bboxes": x[:, :4], "conf": x[:, 4], "cls": x[:, 5]} for x in outputs]
+
+    def pred_to_json(self, predn: dict[str, torch.Tensor], pbatch: dict[str, Any]) -> None:
+        """
+        Append the predictions of one image to `self.jdict` as COCO-style json records.
+
+        Args:
+            predn (dict[str, torch.Tensor]): Predictions dictionary containing 'bboxes', 'conf', and 'cls' keys
+                with bounding box coordinates, confidence scores, and class predictions.
+            pbatch (dict[str, Any]): Batch dictionary; 'im_file' and 'ori_shape' are read here.
+        """
+        path = Path(pbatch["im_file"])
+        stem = path.stem
+        image_id = int(stem) if stem.isnumeric() else stem
+        box = predn["bboxes"].clone()
+        box[..., [0, 2]] *= pbatch["ori_shape"][1] / self.args.imgsz  # native-space pred
+        box[..., [1, 3]] *= pbatch["ori_shape"][0] / self.args.imgsz  # native-space pred
+        box = ops.xyxy2xywh(box)  # xywh
+        box[:, :2] -= box[:, 2:] / 2  # xy center to top-left corner
+        for b, s, c in zip(box.tolist(), predn["conf"].tolist(), predn["cls"].tolist()):
+            self.jdict.append(
+                {
+                    "image_id": image_id,
+                    "file_name": path.name,
+                    "category_id": self.class_map[int(c)],
+                    "bbox": [round(x, 3) for x in b],
+                    "score": round(s, 5),
+                }
+            )