", self.on_canvas_click)
-
- self.rg_data.clear(), self.current_box.clear()
-
- def on_canvas_click(self, event) -> None:
- """Handle mouse clicks to add points for bounding boxes on the canvas."""
- self.current_box.append((event.x, event.y))
- self.canvas.create_oval(event.x - 3, event.y - 3, event.x + 3, event.y + 3, fill="red")
- if len(self.current_box) == 4:
- self.rg_data.append(self.current_box.copy())
- self.draw_box(self.current_box)
- self.current_box.clear()
-
- def draw_box(self, box: list[tuple[int, int]]) -> None:
- """Draw a bounding box on the canvas using the provided coordinates."""
- for i in range(4):
- self.canvas.create_line(box[i], box[(i + 1) % 4], fill="blue", width=2)
-
- def remove_last_bounding_box(self) -> None:
- """Remove the last bounding box from the list and redraw the canvas."""
- if not self.rg_data:
- self.messagebox.showwarning("Warning", "No bounding boxes to remove.")
- return
- self.rg_data.pop()
- self.redraw_canvas()
-
- def redraw_canvas(self) -> None:
- """Redraw the canvas with the image and all bounding boxes."""
- self.canvas.delete("all")
- self.canvas.create_image(0, 0, anchor=self.tk.NW, image=self.canvas_image)
- for box in self.rg_data:
- self.draw_box(box)
-
- def save_to_json(self) -> None:
- """Save the selected parking zone points to a JSON file with scaled coordinates."""
- scale_w, scale_h = self.imgw / self.canvas.winfo_width(), self.imgh / self.canvas.winfo_height()
- data = [{"points": [(int(x * scale_w), int(y * scale_h)) for x, y in box]} for box in self.rg_data]
-
- from io import StringIO  # Function-level import, only needed when saving coordinates
-
- write_buffer = StringIO()
- json.dump(data, write_buffer, indent=4)
- with open("bounding_boxes.json", "w", encoding="utf-8") as f:
- f.write(write_buffer.getvalue())
- self.messagebox.showinfo("Success", "Bounding boxes saved to bounding_boxes.json")
-
-
-class ParkingManagement(BaseSolution):
- """
- Manages parking occupancy and availability using YOLO model for real-time monitoring and visualization.
-
- This class extends BaseSolution to provide functionality for parking lot management, including detection of
- occupied spaces, visualization of parking regions, and display of occupancy statistics.
-
- Attributes:
- json_file (str): Path to the JSON file containing parking region details.
- json (list[dict]): Loaded JSON data containing parking region information.
- pr_info (dict[str, int]): Dictionary storing parking information (Occupancy and Available spaces).
- arc (tuple[int, int, int]): RGB color tuple for available region visualization.
- occ (tuple[int, int, int]): RGB color tuple for occupied region visualization.
- dc (tuple[int, int, int]): RGB color tuple for centroid visualization of detected objects.
-
- Methods:
- process: Process the input image for parking lot management and visualization.
-
- Examples:
- >>> from ultralytics.solutions import ParkingManagement
- >>> parking_manager = ParkingManagement(model="yolo11n.pt", json_file="parking_regions.json")
- >>> print(f"Occupied spaces: {parking_manager.pr_info['Occupancy']}")
- >>> print(f"Available spaces: {parking_manager.pr_info['Available']}")
- """
-
- def __init__(self, **kwargs: Any) -> None:
- """Initialize the parking management system with a YOLO model and visualization settings."""
- super().__init__(**kwargs)
-
- self.json_file = self.CFG["json_file"] # Load parking regions JSON data
- if self.json_file is None:
- LOGGER.warning("json_file argument missing. Parking region details required.")
- raise ValueError("❌ JSON file path cannot be empty")
-
- with open(self.json_file) as f:
- self.json = json.load(f)
-
- self.pr_info = {"Occupancy": 0, "Available": 0} # Dictionary for parking information
-
- self.arc = (0, 0, 255) # Available region color
- self.occ = (0, 255, 0) # Occupied region color
- self.dc = (255, 0, 189) # Centroid color for each box
-
- def process(self, im0: np.ndarray) -> SolutionResults:
- """
- Process the input image for parking lot management and visualization.
-
- This function analyzes the input image, extracts tracks, and determines the occupancy status of parking
- regions defined in the JSON file. It annotates the image with occupied and available parking spots,
- and updates the parking information.
-
- Args:
- im0 (np.ndarray): The input inference image.
-
- Returns:
- (SolutionResults): Contains processed image `plot_im`, 'filled_slots' (number of occupied parking slots),
- 'available_slots' (number of available parking slots), and 'total_tracks' (total number of tracked objects).
-
- Examples:
- >>> parking_manager = ParkingManagement(json_file="parking_regions.json")
- >>> image = cv2.imread("parking_lot.jpg")
- >>> results = parking_manager.process(image)
- """
- self.extract_tracks(im0) # Extract tracks from im0
- es, fs = len(self.json), 0 # Empty slots, filled slots
- annotator = SolutionAnnotator(im0, self.line_width) # Initialize annotator
-
- for region in self.json:
- # Convert points to a NumPy array with the correct dtype and reshape properly
- pts_array = np.array(region["points"], dtype=np.int32).reshape((-1, 1, 2))
- rg_occupied = False # Occupied region initialization
- for box, cls in zip(self.boxes, self.clss):
- xc, yc = int((box[0] + box[2]) / 2), int((box[1] + box[3]) / 2)
- dist = cv2.pointPolygonTest(pts_array, (xc, yc), False)
- if dist >= 0:
- # cv2.circle(im0, (xc, yc), radius=self.line_width * 4, color=self.dc, thickness=-1)
- annotator.display_objects_labels(
- im0, self.model.names[int(cls)], (104, 31, 17), (255, 255, 255), xc, yc, 10
- )
- rg_occupied = True
- break
- fs, es = (fs + 1, es - 1) if rg_occupied else (fs, es)
- # Plot regions
- cv2.polylines(im0, [pts_array], isClosed=True, color=self.occ if rg_occupied else self.arc, thickness=2)
-
- self.pr_info["Occupancy"], self.pr_info["Available"] = fs, es
-
- annotator.display_analytics(im0, self.pr_info, (104, 31, 17), (255, 255, 255), 10)
-
- plot_im = annotator.result()
- self.display_output(plot_im) # Display output with base class function
-
- # Return SolutionResults
- return SolutionResults(
- plot_im=plot_im,
- filled_slots=self.pr_info["Occupancy"],
- available_slots=self.pr_info["Available"],
- total_tracks=len(self.track_ids),
- )
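# --- Illustrative sketch (not part of the original diff) ---
# A minimal video loop for ParkingManagement, assuming a "parking_regions.json"
# produced by ParkingPtsSelection.save_to_json() above, i.e. a list of
# {"points": [[x, y], ...]} entries. File names here are hypothetical.
import cv2
from ultralytics.solutions import ParkingManagement

parking_manager = ParkingManagement(model="yolo11n.pt", json_file="parking_regions.json")
cap = cv2.VideoCapture("parking_lot.mp4")
while cap.isOpened():
    success, frame = cap.read()
    if not success:
        break
    results = parking_manager.process(frame)
    print(f"Occupied: {results.filled_slots}, Available: {results.available_slots}")
cap.release()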
diff --git a/ultralytics/solutions/queue_management.py b/ultralytics/solutions/queue_management.py
deleted file mode 100644
index 4cdbfa8..0000000
--- a/ultralytics/solutions/queue_management.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from typing import Any
-
-from ultralytics.solutions.solutions import BaseSolution, SolutionAnnotator, SolutionResults
-from ultralytics.utils.plotting import colors
-
-
-class QueueManager(BaseSolution):
- """
- Manages queue counting in real-time video streams based on object tracks.
-
- This class extends BaseSolution to provide functionality for tracking and counting objects within a specified
- region in video frames.
-
- Attributes:
- counts (int): The current count of objects in the queue.
- rect_color (tuple[int, int, int]): RGB color tuple for drawing the queue region rectangle.
- region_length (int): The number of points defining the queue region.
- track_line (list[tuple[int, int]]): List of track line coordinates.
- track_history (dict[int, list[tuple[int, int]]]): Dictionary storing tracking history for each object.
-
- Methods:
- initialize_region: Initialize the queue region.
- process: Process a single frame for queue management.
- extract_tracks: Extract object tracks from the current frame.
- store_tracking_history: Store the tracking history for an object.
- display_output: Display the processed output.
-
- Examples:
- >>> cap = cv2.VideoCapture("path/to/video.mp4")
- >>> queue_manager = QueueManager(region=[(100, 100), (200, 100), (200, 200), (100, 200)])
- >>> while cap.isOpened():
- >>> success, im0 = cap.read()
- >>> if not success:
- >>> break
- >>> results = queue_manager.process(im0)
- """
-
- def __init__(self, **kwargs: Any) -> None:
- """Initialize the QueueManager with parameters for tracking and counting objects in a video stream."""
- super().__init__(**kwargs)
- self.initialize_region()
- self.counts = 0 # Queue counts information
- self.rect_color = (255, 255, 255) # Rectangle color for visualization
- self.region_length = len(self.region) # Store region length for further usage
-
- def process(self, im0) -> SolutionResults:
- """
- Process queue management for a single frame of video.
-
- Args:
- im0 (np.ndarray): Input image for processing, typically a frame from a video stream.
-
- Returns:
- (SolutionResults): Contains processed image `plot_im`, 'queue_count' (int, number of objects in the queue) and
- 'total_tracks' (int, total number of tracked objects).
-
- Examples:
- >>> queue_manager = QueueManager()
- >>> frame = cv2.imread("frame.jpg")
- >>> results = queue_manager.process(frame)
- """
- self.counts = 0 # Reset counts every frame
- self.extract_tracks(im0) # Extract tracks from the current frame
- annotator = SolutionAnnotator(im0, line_width=self.line_width) # Initialize annotator
- annotator.draw_region(reg_pts=self.region, color=self.rect_color, thickness=self.line_width * 2) # Draw region
-
- for box, track_id, cls, conf in zip(self.boxes, self.track_ids, self.clss, self.confs):
- # Draw bounding box and counting region
- annotator.box_label(box, label=self.adjust_box_label(cls, conf, track_id), color=colors(track_id, True))
- self.store_tracking_history(track_id, box) # Store track history
-
- # Cache frequently accessed attributes
- track_history = self.track_history.get(track_id, [])
-
- # Store previous position of track and check if the object is inside the counting region
- prev_position = None
- if len(track_history) > 1:
- prev_position = track_history[-2]
- if self.region_length >= 3 and prev_position and self.r_s.contains(self.Point(self.track_line[-1])):
- self.counts += 1
-
- # Display queue counts
- annotator.queue_counts_display(
- f"Queue Counts : {self.counts}",
- points=self.region,
- region_color=self.rect_color,
- txt_color=(104, 31, 17),
- )
- plot_im = annotator.result()
- self.display_output(plot_im) # Display output with base class function
-
- # Return a SolutionResults object with processed data
- return SolutionResults(plot_im=plot_im, queue_count=self.counts, total_tracks=len(self.track_ids))
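# --- Illustrative sketch (not part of the original diff) ---
# Single-frame queue check, assuming the region is given as (x, y) point tuples
# (the format consumed by initialize_region). The frame path is hypothetical.
import cv2
from ultralytics.solutions import QueueManager

queue_manager = QueueManager(model="yolo11n.pt", region=[(20, 400), (1080, 400), (1080, 360), (20, 360)])
frame = cv2.imread("frame.jpg")
results = queue_manager.process(frame)
print(f"Objects in queue: {results.queue_count}")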
diff --git a/ultralytics/solutions/region_counter.py b/ultralytics/solutions/region_counter.py
deleted file mode 100644
index 2f4d6fa..0000000
--- a/ultralytics/solutions/region_counter.py
+++ /dev/null
@@ -1,136 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from __future__ import annotations
-
-from typing import Any
-
-import numpy as np
-
-from ultralytics.solutions.solutions import BaseSolution, SolutionAnnotator, SolutionResults
-from ultralytics.utils.plotting import colors
-
-
-class RegionCounter(BaseSolution):
- """
- A class for real-time counting of objects within user-defined regions in a video stream.
-
- This class inherits from `BaseSolution` and provides functionality to define polygonal regions in a video frame,
- track objects, and count those objects that pass through each defined region. Useful for applications requiring
- counting in specified areas, such as monitoring zones or segmented sections.
-
- Attributes:
- region_template (dict): Template for creating new counting regions with default attributes including name,
- polygon coordinates, and display colors.
- counting_regions (list): List storing all defined regions, where each entry is based on `region_template`
- and includes specific region settings like name, coordinates, and color.
- region_counts (dict): Dictionary storing the count of objects for each named region.
-
- Methods:
- add_region: Add a new counting region with specified attributes.
- process: Process video frames to count objects in each region.
- initialize_regions: Initialize one or more counting zones from the configured region(s).
-
- Examples:
- Initialize a RegionCounter and add a counting region
- >>> counter = RegionCounter()
- >>> counter.add_region("Zone1", [(100, 100), (200, 100), (200, 200), (100, 200)], (255, 0, 0), (255, 255, 255))
- >>> results = counter.process(frame)
- >>> print(f"Total tracks: {results.total_tracks}")
- """
-
- def __init__(self, **kwargs: Any) -> None:
- """Initialize the RegionCounter for real-time object counting in user-defined regions."""
- super().__init__(**kwargs)
- self.region_template = {
- "name": "Default Region",
- "polygon": None,
- "counts": 0,
- "region_color": (255, 255, 255),
- "text_color": (0, 0, 0),
- }
- self.region_counts = {}
- self.counting_regions = []
- self.initialize_regions()
-
- def add_region(
- self,
- name: str,
- polygon_points: list[tuple],
- region_color: tuple[int, int, int],
- text_color: tuple[int, int, int],
- ) -> dict[str, Any]:
- """
- Add a new region to the counting list based on the provided template with specific attributes.
-
- Args:
- name (str): Name assigned to the new region.
- polygon_points (list[tuple]): List of (x, y) coordinates defining the region's polygon.
- region_color (tuple[int, int, int]): BGR color for region visualization.
- text_color (tuple[int, int, int]): BGR color for the text within the region.
-
- Returns:
- (dict[str, Any]): The added region entry, including its name, polygon, and display colors.
- """
- region = self.region_template.copy()
- region.update(
- {
- "name": name,
- "polygon": self.Polygon(polygon_points),
- "region_color": region_color,
- "text_color": text_color,
- }
- )
- self.counting_regions.append(region)
- return region
-
- def initialize_regions(self):
- """Initialize regions only once."""
- if self.region is None:
- self.initialize_region()
- if not isinstance(self.region, dict): # Ensure self.region is initialized and structured as a dictionary
- self.region = {"Region#01": self.region}
- for i, (name, pts) in enumerate(self.region.items()):
- region = self.add_region(name, pts, colors(i, True), (255, 255, 255))
- region["prepared_polygon"] = self.prep(region["polygon"])
-
- def process(self, im0: np.ndarray) -> SolutionResults:
- """
- Process the input frame to detect and count objects within each defined region.
-
- Args:
- im0 (np.ndarray): Input image frame where objects and regions are annotated.
-
- Returns:
- (SolutionResults): Contains processed image `plot_im`, 'total_tracks' (int, total number of tracked objects),
- and 'region_counts' (dict, counts of objects per region).
- """
- self.extract_tracks(im0)
- annotator = SolutionAnnotator(im0, line_width=self.line_width)
-
- for box, cls, track_id, conf in zip(self.boxes, self.clss, self.track_ids, self.confs):
- annotator.box_label(box, label=self.adjust_box_label(cls, conf, track_id), color=colors(track_id, True))
- center = self.Point(((box[0] + box[2]) / 2, (box[1] + box[3]) / 2))
- for region in self.counting_regions:
- if region["prepared_polygon"].contains(center):
- region["counts"] += 1
- self.region_counts[region["name"]] = region["counts"]
-
- # Display region counts
- for region in self.counting_regions:
- poly = region["polygon"]
- pts = list(map(tuple, np.array(poly.exterior.coords, dtype=np.int32)))
- (x1, y1), (x2, y2) = [(int(poly.centroid.x), int(poly.centroid.y))] * 2  # Duplicate the centroid so adaptive_label centers the count there
- annotator.draw_region(pts, region["region_color"], self.line_width * 2)
- annotator.adaptive_label(
- [x1, y1, x2, y2],
- label=str(region["counts"]),
- color=region["region_color"],
- txt_color=region["text_color"],
- margin=self.line_width * 4,
- shape="rect",
- )
- region["counts"] = 0 # Reset for next frame
- plot_im = annotator.result()
- self.display_output(plot_im)
-
- return SolutionResults(plot_im=plot_im, total_tracks=len(self.track_ids), region_counts=self.region_counts)
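# --- Illustrative sketch (not part of the original diff) ---
# Counting in several named zones at once. As initialize_regions above shows,
# `region` may be a dict mapping names to point lists (a plain list is wrapped
# as "Region#01"); accepting a dict at the config layer is assumed here, and the
# names, coordinates, and example output are hypothetical.
import cv2
from ultralytics.solutions import RegionCounter

regions = {
    "Entrance": [(50, 50), (250, 50), (250, 250), (50, 250)],
    "Checkout": [(400, 50), (600, 50), (600, 250), (400, 250)],
}
counter = RegionCounter(model="yolo11n.pt", region=regions)
frame = cv2.imread("store.jpg")
results = counter.process(frame)
print(results.region_counts)  # e.g. {"Entrance": 3, "Checkout": 1}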
diff --git a/ultralytics/solutions/security_alarm.py b/ultralytics/solutions/security_alarm.py
deleted file mode 100644
index d34f78d..0000000
--- a/ultralytics/solutions/security_alarm.py
+++ /dev/null
@@ -1,156 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from typing import Any
-
-from ultralytics.solutions.solutions import BaseSolution, SolutionAnnotator, SolutionResults
-from ultralytics.utils import LOGGER
-from ultralytics.utils.plotting import colors
-
-
-class SecurityAlarm(BaseSolution):
- """
- A class to manage security alarm functionalities for real-time monitoring.
-
- This class extends the BaseSolution class and provides features to monitor objects in a frame, send email
- notifications when specific thresholds are exceeded for total detections, and annotate the output frame for
- visualization.
-
- Attributes:
- email_sent (bool): Flag to track if an email has already been sent for the current event.
- records (int): Threshold for the number of detected objects to trigger an alert.
- server (smtplib.SMTP): SMTP server connection for sending email alerts.
- to_email (str): Recipient's email address for alerts.
- from_email (str): Sender's email address for alerts.
-
- Methods:
- authenticate: Set up email server authentication for sending alerts.
- send_email: Send an email notification with details and an image attachment.
- process: Monitor the frame, process detections, and trigger alerts if thresholds are crossed.
-
- Examples:
- >>> security = SecurityAlarm()
- >>> security.authenticate("abc@gmail.com", "1111222233334444", "xyz@gmail.com")
- >>> frame = cv2.imread("frame.jpg")
- >>> results = security.process(frame)
- """
-
- def __init__(self, **kwargs: Any) -> None:
- """
- Initialize the SecurityAlarm class with parameters for real-time object monitoring.
-
- Args:
- **kwargs (Any): Additional keyword arguments passed to the parent class.
- """
- super().__init__(**kwargs)
- self.email_sent = False
- self.records = self.CFG["records"]
- self.server = None
- self.to_email = ""
- self.from_email = ""
-
- def authenticate(self, from_email: str, password: str, to_email: str) -> None:
- """
- Authenticate the email server for sending alert notifications.
-
- Args:
- from_email (str): Sender's email address.
- password (str): Password for the sender's email account.
- to_email (str): Recipient's email address.
-
- This method initializes a secure connection with the SMTP server and logs in using the provided credentials.
-
- Examples:
- >>> alarm = SecurityAlarm()
- >>> alarm.authenticate("sender@example.com", "password123", "recipient@example.com")
- """
- import smtplib
-
- self.server = smtplib.SMTP("smtp.gmail.com", 587)
- self.server.starttls()
- self.server.login(from_email, password)
- self.to_email = to_email
- self.from_email = from_email
-
- def send_email(self, im0, records: int = 5) -> None:
- """
- Send an email notification with an image attachment indicating the number of objects detected.
-
- Args:
- im0 (np.ndarray): The input image or frame to be attached to the email.
- records (int, optional): The number of detected objects to be included in the email message.
-
- This method encodes the input image, composes the email message with details about the detection, and sends it
- to the specified recipient.
-
- Examples:
- >>> alarm = SecurityAlarm()
- >>> frame = cv2.imread("path/to/image.jpg")
- >>> alarm.send_email(frame, records=10)
- """
- from email.mime.image import MIMEImage
- from email.mime.multipart import MIMEMultipart
- from email.mime.text import MIMEText
-
- import cv2
-
- img_bytes = cv2.imencode(".jpg", im0)[1].tobytes() # Encode the image as JPEG
-
- # Create the email
- message = MIMEMultipart()
- message["From"] = self.from_email
- message["To"] = self.to_email
- message["Subject"] = "Security Alert"
-
- # Add the text message body
- message_body = f"Ultralytics ALERT!!! {records} objects have been detected!!"
- message.attach(MIMEText(message_body))
-
- # Attach the image
- image_attachment = MIMEImage(img_bytes, name="ultralytics.jpg")
- message.attach(image_attachment)
-
- # Send the email
- try:
- self.server.send_message(message)
- LOGGER.info("Email sent successfully!")
- except Exception as e:
- LOGGER.error(f"Failed to send email: {e}")
-
- def process(self, im0) -> SolutionResults:
- """
- Monitor the frame, process object detections, and trigger alerts if thresholds are exceeded.
-
- Args:
- im0 (np.ndarray): The input image or frame to be processed and annotated.
-
- Returns:
- (SolutionResults): Contains processed image `plot_im`, 'total_tracks' (total number of tracked objects) and
- 'email_sent' (whether an email alert was triggered).
-
- This method processes the input frame, extracts detections, annotates the frame with bounding boxes, and sends
- an email notification if the number of detected objects surpasses the specified threshold and an alert has not
- already been sent.
-
- Examples:
- >>> alarm = SecurityAlarm()
- >>> frame = cv2.imread("path/to/image.jpg")
- >>> results = alarm.process(frame)
- """
- self.extract_tracks(im0) # Extract tracks
- annotator = SolutionAnnotator(im0, line_width=self.line_width) # Initialize annotator
-
- # Iterate over bounding boxes and classes index
- for box, cls in zip(self.boxes, self.clss):
- # Draw bounding box
- annotator.box_label(box, label=self.names[cls], color=colors(cls, True))
-
- total_det = len(self.clss)
- if total_det >= self.records and not self.email_sent: # Only send email if not sent before
- self.send_email(im0, total_det)
- self.email_sent = True
-
- plot_im = annotator.result()
- self.display_output(plot_im) # Display output with base class function
-
- # Return a SolutionResults
- return SolutionResults(plot_im=plot_im, total_tracks=len(self.track_ids), email_sent=self.email_sent)
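# --- Illustrative sketch (not part of the original diff) ---
# End-to-end use, with the alert threshold supplied via the "records" config key
# read in __init__. Gmail typically requires an app password for SMTP logins.
# Credentials and file names below are placeholders.
import cv2
from ultralytics.solutions import SecurityAlarm

alarm = SecurityAlarm(model="yolo11n.pt", records=5)  # alert once >= 5 objects are detected
alarm.authenticate("sender@example.com", "app-password", "recipient@example.com")
frame = cv2.imread("frame.jpg")
results = alarm.process(frame)
print(f"Email sent: {results.email_sent}")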
diff --git a/ultralytics/solutions/similarity_search.py b/ultralytics/solutions/similarity_search.py
deleted file mode 100644
index 37a13ec..0000000
--- a/ultralytics/solutions/similarity_search.py
+++ /dev/null
@@ -1,224 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from __future__ import annotations
-
-import os
-from pathlib import Path
-from typing import Any
-
-import numpy as np
-from PIL import Image
-
-from ultralytics.data.utils import IMG_FORMATS
-from ultralytics.utils import LOGGER, TORCH_VERSION
-from ultralytics.utils.checks import check_requirements
-from ultralytics.utils.torch_utils import TORCH_2_4, select_device
-
-os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" # Avoid OpenMP conflict on some systems
-
-
-class VisualAISearch:
- """
- A semantic image search system that leverages OpenCLIP for generating high-quality image and text embeddings and
- FAISS for fast similarity-based retrieval.
-
- This class aligns image and text embeddings in a shared semantic space, enabling users to search large collections
- of images using natural language queries with high accuracy and speed.
-
- Attributes:
- data (str): Directory containing images.
- device (str): Computation device, e.g., 'cpu' or 'cuda'.
- faiss_index (str): Path to the FAISS index file.
- data_path_npy (str): Path to the numpy file storing image paths.
- data_dir (Path): Path object for the data directory.
- model: Loaded CLIP model.
- index: FAISS index for similarity search.
- image_paths (list[str]): List of image file paths.
-
- Methods:
- extract_image_feature: Extract CLIP embedding from an image.
- extract_text_feature: Extract CLIP embedding from text.
- load_or_build_index: Load existing FAISS index or build new one.
- search: Perform semantic search for similar images.
-
- Examples:
- Initialize and search for images
- >>> searcher = VisualAISearch(data="path/to/images", device="cuda")
- >>> results = searcher.search("a cat sitting on a chair", k=10)
- """
-
- def __init__(self, **kwargs: Any) -> None:
- """Initialize the VisualAISearch class with FAISS index and CLIP model."""
- assert TORCH_2_4, f"VisualAISearch requires torch>=2.4 (found torch=={TORCH_VERSION})"
- from ultralytics.nn.text_model import build_text_model
-
- check_requirements("faiss-cpu")
-
- self.faiss = __import__("faiss")
- self.faiss_index = "faiss.index"
- self.data_path_npy = "paths.npy"
- self.data_dir = Path(kwargs.get("data", "images"))
- self.device = select_device(kwargs.get("device", "cpu"))
-
- if not self.data_dir.exists():
- from ultralytics.utils import ASSETS_URL
-
- LOGGER.warning(f"{self.data_dir} not found. Downloading images.zip from {ASSETS_URL}/images.zip")
- from ultralytics.utils.downloads import safe_download
-
- safe_download(url=f"{ASSETS_URL}/images.zip", unzip=True, retry=3)
- self.data_dir = Path("images")
-
- self.model = build_text_model("clip:ViT-B/32", device=self.device)
-
- self.index = None
- self.image_paths = []
-
- self.load_or_build_index()
-
- def extract_image_feature(self, path: Path) -> np.ndarray:
- """Extract CLIP image embedding from the given image path."""
- return self.model.encode_image(Image.open(path)).cpu().numpy()
-
- def extract_text_feature(self, text: str) -> np.ndarray:
- """Extract CLIP text embedding from the given text query."""
- return self.model.encode_text(self.model.tokenize([text])).cpu().numpy()
-
- def load_or_build_index(self) -> None:
- """
- Load existing FAISS index or build a new one from image features.
-
- Checks if FAISS index and image paths exist on disk. If found, loads them directly. Otherwise, builds a new
- index by extracting features from all images in the data directory, normalizes the features, and saves both the
- index and image paths for future use.
- """
- # Check if the FAISS index and corresponding image paths already exist
- if Path(self.faiss_index).exists() and Path(self.data_path_npy).exists():
- LOGGER.info("Loading existing FAISS index...")
- self.index = self.faiss.read_index(self.faiss_index) # Load the FAISS index from disk
- self.image_paths = np.load(self.data_path_npy) # Load the saved image path list
- return # Exit the function as the index is successfully loaded
-
- # If the index doesn't exist, start building it from scratch
- LOGGER.info("Building FAISS index from images...")
- vectors = [] # List to store feature vectors of images
-
- # Iterate over all image files in the data directory
- for file in self.data_dir.iterdir():
- # Skip files that are not valid image formats
- if file.suffix.lower().lstrip(".") not in IMG_FORMATS:
- continue
- try:
- # Extract feature vector for the image and add to the list
- vectors.append(self.extract_image_feature(file))
- self.image_paths.append(file.name) # Store the corresponding image name
- except Exception as e:
- LOGGER.warning(f"Skipping {file.name}: {e}")
-
- # If no vectors were successfully created, raise an error
- if not vectors:
- raise RuntimeError("No image embeddings could be generated.")
-
- vectors = np.vstack(vectors).astype("float32") # Stack all vectors into a NumPy array and convert to float32
- self.faiss.normalize_L2(vectors) # Normalize vectors to unit length for cosine similarity
-
- self.index = self.faiss.IndexFlatIP(vectors.shape[1]) # Create a new FAISS index using inner product
- self.index.add(vectors) # Add the normalized vectors to the FAISS index
- self.faiss.write_index(self.index, self.faiss_index) # Save the newly built FAISS index to disk
- np.save(self.data_path_npy, np.array(self.image_paths)) # Save the list of image paths to disk
-
- LOGGER.info(f"Indexed {len(self.image_paths)} images.")
-
- def search(self, query: str, k: int = 30, similarity_thresh: float = 0.1) -> list[str]:
- """
- Return top-k semantically similar images to the given query.
-
- Args:
- query (str): Natural language text query to search for.
- k (int, optional): Maximum number of results to return.
- similarity_thresh (float, optional): Minimum similarity threshold for filtering results.
-
- Returns:
- (list[str]): List of image filenames ranked by similarity score.
-
- Examples:
- Search for images matching a query
- >>> searcher = VisualAISearch(data="images")
- >>> results = searcher.search("red car", k=5, similarity_thresh=0.2)
- """
- text_feat = self.extract_text_feature(query).astype("float32")
- self.faiss.normalize_L2(text_feat)
-
- D, index = self.index.search(text_feat, k)
- results = [
- (self.image_paths[i], float(D[0][idx])) for idx, i in enumerate(index[0]) if D[0][idx] >= similarity_thresh
- ]
- results.sort(key=lambda x: x[1], reverse=True)
-
- LOGGER.info("\nRanked Results:")
- for name, score in results:
- LOGGER.info(f" - {name} | Similarity: {score:.4f}")
-
- return [r[0] for r in results]
-
- def __call__(self, query: str) -> list[str]:
- """Direct call interface for the search function."""
- return self.search(query)
-
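# --- Illustrative note (not part of the original diff) ---
# load_or_build_index() L2-normalizes the embeddings and uses faiss.IndexFlatIP,
# so the inner product of unit vectors equals cosine similarity. A tiny check:
import numpy as np

a = np.array([3.0, 4.0])
b = np.array([4.0, 3.0])
a /= np.linalg.norm(a)  # [0.6, 0.8]
b /= np.linalg.norm(b)  # [0.8, 0.6]
print(float(a @ b))  # ~0.96, the cosine similarity of the original vectors (24 / 25)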
-
-class SearchApp:
- """
- A Flask-based web interface for semantic image search with natural language queries.
-
- This class provides a clean, responsive frontend that enables users to input natural language queries and
- instantly view the most relevant images retrieved from the indexed database.
-
- Attributes:
- render_template: Flask template rendering function.
- request: Flask request object.
- searcher (VisualAISearch): Instance of the VisualAISearch class.
- app (Flask): Flask application instance.
-
- Methods:
- index: Process user queries and display search results.
- run: Start the Flask web application.
-
- Examples:
- Start a search application
- >>> app = SearchApp(data="path/to/images", device="cuda")
- >>> app.run(debug=True)
- """
-
- def __init__(self, data: str = "images", device: str | None = None) -> None:
- """
- Initialize the SearchApp with VisualAISearch backend.
-
- Args:
- data (str, optional): Path to directory containing images to index and search.
- device (str, optional): Device to run inference on (e.g. 'cpu', 'cuda').
- """
- check_requirements("flask>=3.0.1")
- from flask import Flask, render_template, request
-
- self.render_template = render_template
- self.request = request
- self.searcher = VisualAISearch(data=data, device=device)
- self.app = Flask(
- __name__,
- template_folder="templates",
- static_folder=Path(data).resolve(), # Absolute path to serve images
- static_url_path="/images", # URL prefix for images
- )
- self.app.add_url_rule("/", view_func=self.index, methods=["GET", "POST"])
-
- def index(self) -> str:
- """Process user query and display search results in the web interface."""
- results = []
- if self.request.method == "POST":
- query = self.request.form.get("query", "").strip()
- results = self.searcher(query)
- return self.render_template("similarity-search.html", results=results)
-
- def run(self, debug: bool = False) -> None:
- """Start the Flask web application server."""
- self.app.run(debug=debug)
diff --git a/ultralytics/solutions/solutions.py b/ultralytics/solutions/solutions.py
deleted file mode 100644
index b7da9e6..0000000
--- a/ultralytics/solutions/solutions.py
+++ /dev/null
@@ -1,827 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from __future__ import annotations
-
-import math
-from collections import Counter, defaultdict
-from functools import lru_cache
-from typing import Any
-
-import cv2
-import numpy as np
-
-from ultralytics import YOLO
-from ultralytics.solutions.config import SolutionConfig
-from ultralytics.utils import ASSETS_URL, LOGGER, ops
-from ultralytics.utils.checks import check_imshow, check_requirements
-from ultralytics.utils.plotting import Annotator
-
-
-class BaseSolution:
- """
- A base class for managing Ultralytics Solutions.
-
- This class provides core functionality for various Ultralytics Solutions, including model loading, object tracking,
- and region initialization. It serves as the foundation for implementing specific computer vision solutions such as
- object counting, pose estimation, and analytics.
-
- Attributes:
- LineString: Class for creating line string geometries from shapely.
- Polygon: Class for creating polygon geometries from shapely.
- Point: Class for creating point geometries from shapely.
- prep: Prepared geometry function from shapely for optimized spatial operations.
- CFG (dict[str, Any]): Configuration dictionary loaded from YAML file and updated with kwargs.
- LOGGER: Logger instance for solution-specific logging.
- annotator: Annotator instance for drawing on images.
- tracks: YOLO tracking results from the latest inference.
- track_data: Extracted tracking data (boxes or OBB) from tracks.
- boxes (list): Bounding box coordinates from tracking results.
- clss (list[int]): Class indices from tracking results.
- track_ids (list[int]): Track IDs from tracking results.
- confs (list[float]): Confidence scores from tracking results.
- track_line: Current track line for storing tracking history.
- masks: Segmentation masks from tracking results.
- r_s: Region or line geometry object for spatial operations.
- frame_no (int): Current frame number for logging purposes.
- region (list[tuple[int, int]]): List of coordinate tuples defining region of interest.
- line_width (int): Width of lines used in visualizations.
- model (YOLO): Loaded YOLO model instance.
- names (dict[int, str]): Dictionary mapping class indices to class names.
- classes (list[int]): List of class indices to track.
- show_conf (bool): Flag to show confidence scores in annotations.
- show_labels (bool): Flag to show class labels in annotations.
- device (str): Device for model inference.
- track_add_args (dict[str, Any]): Additional arguments for tracking configuration.
- env_check (bool): Flag indicating whether environment supports image display.
- track_history (defaultdict): Dictionary storing tracking history for each object.
- profilers (tuple): Profiler instances for performance monitoring.
-
- Methods:
- adjust_box_label: Generate formatted label for bounding box.
- extract_tracks: Apply object tracking and extract tracks from input image.
- store_tracking_history: Store object tracking history for given track ID and bounding box.
- initialize_region: Initialize counting region and line segment based on configuration.
- display_output: Display processing results including frames or saved results.
- process: Process method to be implemented by each Solution subclass.
-
- Examples:
- >>> solution = BaseSolution(model="yolo11n.pt", region=[(0, 0), (100, 0), (100, 100), (0, 100)])
- >>> solution.initialize_region()
- >>> image = cv2.imread("image.jpg")
- >>> solution.extract_tracks(image)
- >>> solution.display_output(image)
- """
-
- def __init__(self, is_cli: bool = False, **kwargs: Any) -> None:
- """
- Initialize the BaseSolution class with configuration settings and YOLO model.
-
- Args:
- is_cli (bool): Enable CLI mode if set to True.
- **kwargs (Any): Additional configuration parameters that override defaults.
- """
- self.CFG = vars(SolutionConfig().update(**kwargs))
- self.LOGGER = LOGGER # Store logger object to be used in multiple solution classes
-
- check_requirements("shapely>=2.0.0")
- from shapely.geometry import LineString, Point, Polygon
- from shapely.prepared import prep
-
- self.LineString = LineString
- self.Polygon = Polygon
- self.Point = Point
- self.prep = prep
- self.annotator = None # Initialize annotator
- self.tracks = None
- self.track_data = None
- self.boxes = []
- self.clss = []
- self.track_ids = []
- self.track_line = None
- self.masks = None
- self.r_s = None
- self.frame_no = -1 # Only for logging
-
- self.LOGGER.info(f"Ultralytics Solutions: ✅ {self.CFG}")
- self.region = self.CFG["region"] # Store region data for other classes usage
- self.line_width = self.CFG["line_width"]
-
- # Load Model and store additional information (classes, show_conf, show_label)
- if self.CFG["model"] is None:
- self.CFG["model"] = "yolo11n.pt"
- self.model = YOLO(self.CFG["model"])
- self.names = self.model.names
- self.classes = self.CFG["classes"]
- self.show_conf = self.CFG["show_conf"]
- self.show_labels = self.CFG["show_labels"]
- self.device = self.CFG["device"]
-
- self.track_add_args = { # Tracker additional arguments for advance configuration
- k: self.CFG[k] for k in {"iou", "conf", "device", "max_det", "half", "tracker"}
- } # verbose must be passed to track method; setting it False in YOLO still logs the track information.
-
- if is_cli and self.CFG["source"] is None:
- d_s = "solutions_ci_demo.mp4" if "-pose" not in self.CFG["model"] else "solution_ci_pose_demo.mp4"
- self.LOGGER.warning(f"source not provided. using default source {ASSETS_URL}/{d_s}")
- from ultralytics.utils.downloads import safe_download
-
- safe_download(f"{ASSETS_URL}/{d_s}") # download source from ultralytics assets
- self.CFG["source"] = d_s # set default source
-
- # Initialize environment and region setup
- self.env_check = check_imshow(warn=True)
- self.track_history = defaultdict(list)
-
- self.profilers = (
- ops.Profile(device=self.device), # track
- ops.Profile(device=self.device), # solution
- )
-
- def adjust_box_label(self, cls: int, conf: float, track_id: int | None = None) -> str | None:
- """
- Generate a formatted label for a bounding box.
-
- This method constructs a label string for a bounding box using the class index and confidence score.
- Optionally includes the track ID if provided. The label format adapts based on the display settings
- defined in `self.show_conf` and `self.show_labels`.
-
- Args:
- cls (int): The class index of the detected object.
- conf (float): The confidence score of the detection.
- track_id (int, optional): The unique identifier for the tracked object.
-
- Returns:
- (str | None): The formatted label string if `self.show_labels` is True; otherwise, None.
- """
- name = ("" if track_id is None else f"{track_id} ") + self.names[cls]
- return (f"{name} {conf:.2f}" if self.show_conf else name) if self.show_labels else None
-
- def extract_tracks(self, im0: np.ndarray) -> None:
- """
- Apply object tracking and extract tracks from an input image or frame.
-
- Args:
- im0 (np.ndarray): The input image or frame.
-
- Examples:
- >>> solution = BaseSolution()
- >>> frame = cv2.imread("path/to/image.jpg")
- >>> solution.extract_tracks(frame)
- """
- with self.profilers[0]:
- self.tracks = self.model.track(
- source=im0, persist=True, classes=self.classes, verbose=False, **self.track_add_args
- )[0]
- is_obb = self.tracks.obb is not None
- self.track_data = self.tracks.obb if is_obb else self.tracks.boxes # Extract tracks for OBB or object detection
-
- if self.track_data and self.track_data.is_track:
- self.boxes = (self.track_data.xyxyxyxy if is_obb else self.track_data.xyxy).cpu()
- self.clss = self.track_data.cls.cpu().tolist()
- self.track_ids = self.track_data.id.int().cpu().tolist()
- self.confs = self.track_data.conf.cpu().tolist()
- else:
- self.LOGGER.warning("no tracks found!")
- self.boxes, self.clss, self.track_ids, self.confs = [], [], [], []
-
- def store_tracking_history(self, track_id: int, box) -> None:
- """
- Store the tracking history of an object.
-
- This method updates the tracking history for a given object by appending the center point of its
- bounding box to the track line. It maintains a maximum of 30 points in the tracking history.
-
- Args:
- track_id (int): The unique identifier for the tracked object.
- box (list[float]): The bounding box coordinates of the object in the format [x1, y1, x2, y2].
-
- Examples:
- >>> solution = BaseSolution()
- >>> solution.store_tracking_history(1, [100, 200, 300, 400])
- """
- # Store tracking history
- self.track_line = self.track_history[track_id]
- self.track_line.append(tuple(box.mean(dim=0)) if box.numel() > 4 else (box[:4:2].mean(), box[1:4:2].mean()))
- if len(self.track_line) > 30:
- self.track_line.pop(0)
-
- def initialize_region(self) -> None:
- """Initialize the counting region and line segment based on configuration settings."""
- if self.region is None:
- self.region = [(10, 200), (540, 200), (540, 180), (10, 180)]
- self.r_s = (
- self.Polygon(self.region) if len(self.region) >= 3 else self.LineString(self.region)
- ) # region or line
-
- def display_output(self, plot_im: np.ndarray) -> None:
- """
- Display the results of the processing, which could involve showing frames, printing counts, or saving results.
-
- This method is responsible for visualizing the output of the object detection and tracking process. It displays
- the processed frame with annotations, and allows for user interaction to close the display.
-
- Args:
- plot_im (np.ndarray): The image or frame that has been processed and annotated.
-
- Examples:
- >>> solution = BaseSolution()
- >>> frame = cv2.imread("path/to/image.jpg")
- >>> solution.display_output(frame)
-
- Notes:
- - This method will only display output if the 'show' configuration is set to True and the environment
- supports image display.
- - The display can be closed by pressing the 'q' key.
- """
- if self.CFG.get("show") and self.env_check:
- cv2.imshow("Ultralytics Solutions", plot_im)
- if cv2.waitKey(1) & 0xFF == ord("q"):
- cv2.destroyAllWindows() # Closes current frame window
- return
-
- def process(self, *args: Any, **kwargs: Any):
- """Process method should be implemented by each Solution subclass."""
-
- def __call__(self, *args: Any, **kwargs: Any):
- """Allow instances to be called like a function with flexible arguments."""
- with self.profilers[1]:
- result = self.process(*args, **kwargs) # Call the subclass-specific process method
- track_or_predict = "predict" if type(self).__name__ == "ObjectCropper" else "track"
- track_or_predict_speed = self.profilers[0].dt * 1e3
- solution_speed = (self.profilers[1].dt - self.profilers[0].dt) * 1e3 # solution time = process - track
- result.speed = {track_or_predict: track_or_predict_speed, "solution": solution_speed}
- if self.CFG["verbose"]:
- self.frame_no += 1
- counts = Counter(self.clss) # Only for logging.
- LOGGER.info(
- f"{self.frame_no}: {result.plot_im.shape[0]}x{result.plot_im.shape[1]} {solution_speed:.1f}ms,"
- f" {', '.join([f'{v} {self.names[k]}' for k, v in counts.items()])}\n"
- f"Speed: {track_or_predict_speed:.1f}ms {track_or_predict}, "
- f"{solution_speed:.1f}ms solution per image at shape "
- f"(1, {getattr(self.model, 'ch', 3)}, {result.plot_im.shape[0]}, {result.plot_im.shape[1]})\n"
- )
- return result
-
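# --- Illustrative sketch (not part of the original diff) ---
# A minimal, hypothetical BaseSolution subclass showing the pattern the built-in
# solutions follow: implement process(), reuse extract_tracks()/display_output(),
# and return a SolutionResults so that __call__ can attach speed metrics and logging.
# Assumes the ultralytics.solutions.solutions module shown here is importable.
import cv2
from ultralytics.solutions.solutions import BaseSolution, SolutionAnnotator, SolutionResults


class ObjectHighlighter(BaseSolution):  # hypothetical example, not an Ultralytics class
    def process(self, im0):
        self.extract_tracks(im0)  # populates self.boxes, self.clss, self.track_ids, self.confs
        annotator = SolutionAnnotator(im0, line_width=self.line_width)
        for box, cls, track_id, conf in zip(self.boxes, self.clss, self.track_ids, self.confs):
            annotator.box_label(box, label=self.adjust_box_label(cls, conf, track_id), color=(0, 255, 0))
        plot_im = annotator.result()
        self.display_output(plot_im)
        return SolutionResults(plot_im=plot_im, total_tracks=len(self.track_ids))


highlighter = ObjectHighlighter(model="yolo11n.pt", show=False)
results = highlighter(cv2.imread("frame.jpg"))  # __call__ wraps process() with profiling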
-
-class SolutionAnnotator(Annotator):
- """
- A specialized annotator class for visualizing and analyzing computer vision tasks.
-
- This class extends the base Annotator class, providing additional methods for drawing regions, centroids, tracking
- trails, and visual annotations for Ultralytics Solutions. It offers comprehensive visualization capabilities for
- various computer vision applications including object detection, tracking, pose estimation, and analytics.
-
- Attributes:
- im (np.ndarray): The image being annotated.
- line_width (int): Thickness of lines used in annotations.
- font_size (int): Size of the font used for text annotations.
- font (str): Path to the font file used for text rendering.
- pil (bool): Whether to use PIL for text rendering.
- example (str): An example attribute for demonstration purposes.
-
- Methods:
- draw_region: Draw a region using specified points, colors, and thickness.
- queue_counts_display: Display queue counts in the specified region.
- display_analytics: Display overall statistics for parking lot management.
- estimate_pose_angle: Calculate the angle between three points in an object pose.
- draw_specific_kpts: Draw specific keypoints on the image.
- plot_workout_information: Draw a labeled text box on the image.
- plot_angle_and_count_and_stage: Visualize angle, step count, and stage for workout monitoring.
- plot_distance_and_line: Display the distance between centroids and connect them with a line.
- display_objects_labels: Annotate bounding boxes with object class labels.
- sweep_annotator: Visualize a vertical sweep line and optional label.
- visioneye: Map and connect object centroids to a visual "eye" point.
- adaptive_label: Draw a label with a circular or rectangular background centered within a bounding box.
-
- Examples:
- >>> annotator = SolutionAnnotator(image)
- >>> annotator.draw_region([(0, 0), (100, 100)], color=(0, 255, 0), thickness=5)
- >>> annotator.display_analytics(
- ... image, text={"Available Spots": 5}, txt_color=(0, 0, 0), bg_color=(255, 255, 255), margin=10
- ... )
- """
-
- def __init__(
- self,
- im: np.ndarray,
- line_width: int | None = None,
- font_size: int | None = None,
- font: str = "Arial.ttf",
- pil: bool = False,
- example: str = "abc",
- ):
- """
- Initialize the SolutionAnnotator class with an image for annotation.
-
- Args:
- im (np.ndarray): The image to be annotated.
- line_width (int, optional): Line thickness for drawing on the image.
- font_size (int, optional): Font size for text annotations.
- font (str): Path to the font file.
- pil (bool): Indicates whether to use PIL for rendering text.
- example (str): An example parameter for demonstration purposes.
- """
- super().__init__(im, line_width, font_size, font, pil, example)
-
- def draw_region(
- self,
- reg_pts: list[tuple[int, int]] | None = None,
- color: tuple[int, int, int] = (0, 255, 0),
- thickness: int = 5,
- ):
- """
- Draw a region or line on the image.
-
- Args:
- reg_pts (list[tuple[int, int]], optional): Region points (2 points for a line, 4+ points for a region).
- color (tuple[int, int, int]): RGB color value for the region.
- thickness (int): Line thickness for drawing the region.
- """
- cv2.polylines(self.im, [np.array(reg_pts, dtype=np.int32)], isClosed=True, color=color, thickness=thickness)
-
- # Draw small circles at the corner points
- for point in reg_pts:
- cv2.circle(self.im, (point[0], point[1]), thickness * 2, color, -1) # -1 fills the circle
-
- def queue_counts_display(
- self,
- label: str,
- points: list[tuple[int, int]] | None = None,
- region_color: tuple[int, int, int] = (255, 255, 255),
- txt_color: tuple[int, int, int] = (0, 0, 0),
- ):
- """
- Display queue counts on an image centered at the points with customizable font size and colors.
-
- Args:
- label (str): Queue counts label.
- points (list[tuple[int, int]], optional): Region points for center point calculation to display text.
- region_color (tuple[int, int, int]): RGB queue region color.
- txt_color (tuple[int, int, int]): RGB text display color.
- """
- x_values = [point[0] for point in points]
- y_values = [point[1] for point in points]
- center_x = sum(x_values) // len(points)
- center_y = sum(y_values) // len(points)
-
- text_size = cv2.getTextSize(label, 0, fontScale=self.sf, thickness=self.tf)[0]
- text_width = text_size[0]
- text_height = text_size[1]
-
- rect_width = text_width + 20
- rect_height = text_height + 20
- rect_top_left = (center_x - rect_width // 2, center_y - rect_height // 2)
- rect_bottom_right = (center_x + rect_width // 2, center_y + rect_height // 2)
- cv2.rectangle(self.im, rect_top_left, rect_bottom_right, region_color, -1)
-
- text_x = center_x - text_width // 2
- text_y = center_y + text_height // 2
-
- # Draw text
- cv2.putText(
- self.im,
- label,
- (text_x, text_y),
- 0,
- fontScale=self.sf,
- color=txt_color,
- thickness=self.tf,
- lineType=cv2.LINE_AA,
- )
-
- def display_analytics(
- self,
- im0: np.ndarray,
- text: dict[str, Any],
- txt_color: tuple[int, int, int],
- bg_color: tuple[int, int, int],
- margin: int,
- ):
- """
- Display overall statistics for solutions such as parking management and object counting.
-
- Args:
- im0 (np.ndarray): Inference image.
- text (dict[str, Any]): Labels dictionary.
- txt_color (tuple[int, int, int]): Display color for text foreground.
- bg_color (tuple[int, int, int]): Display color for text background.
- margin (int): Gap between text and rectangle for better display.
- """
- horizontal_gap = int(im0.shape[1] * 0.02)
- vertical_gap = int(im0.shape[0] * 0.01)
- text_y_offset = 0
- for label, value in text.items():
- txt = f"{label}: {value}"
- text_size = cv2.getTextSize(txt, 0, self.sf, self.tf)[0]
- if text_size[0] < 5 or text_size[1] < 5:
- text_size = (5, 5)
- text_x = im0.shape[1] - text_size[0] - margin * 2 - horizontal_gap
- text_y = text_y_offset + text_size[1] + margin * 2 + vertical_gap
- rect_x1 = text_x - margin * 2
- rect_y1 = text_y - text_size[1] - margin * 2
- rect_x2 = text_x + text_size[0] + margin * 2
- rect_y2 = text_y + margin * 2
- cv2.rectangle(im0, (rect_x1, rect_y1), (rect_x2, rect_y2), bg_color, -1)
- cv2.putText(im0, txt, (text_x, text_y), 0, self.sf, txt_color, self.tf, lineType=cv2.LINE_AA)
- text_y_offset = rect_y2
-
- @staticmethod
- @lru_cache(maxsize=256)
- def estimate_pose_angle(a: list[float], b: list[float], c: list[float]) -> float:
- """
- Calculate the angle between three points for workout monitoring.
-
- Args:
- a (list[float]): The coordinates of the first point.
- b (list[float]): The coordinates of the second point (vertex).
- c (list[float]): The coordinates of the third point.
-
- Returns:
- (float): The angle in degrees between the three points.
- """
- radians = math.atan2(c[1] - b[1], c[0] - b[0]) - math.atan2(a[1] - b[1], a[0] - b[0])
- angle = abs(radians * 180.0 / math.pi)
- return angle if angle <= 180.0 else (360 - angle)
-
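# --- Illustrative note (not part of the original diff) ---
# Worked example for estimate_pose_angle: with a=(1, 0), b=(0, 0), c=(0, 1) it computes
# atan2(1, 0) - atan2(0, 1) = pi/2 - 0, i.e. a 90.0 degree angle at vertex b.
# Arguments must be hashable (e.g. tuples) because of the lru_cache decorator.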
- def draw_specific_kpts(
- self,
- keypoints: list[list[float]],
- indices: list[int] | None = None,
- radius: int = 2,
- conf_thresh: float = 0.25,
- ) -> np.ndarray:
- """
- Draw specific keypoints for gym steps counting.
-
- Args:
- keypoints (list[list[float]]): Keypoints data to be plotted, each in format [x, y, confidence].
- indices (list[int], optional): Keypoint indices to be plotted.
- radius (int): Keypoint radius.
- conf_thresh (float): Confidence threshold for keypoints.
-
- Returns:
- (np.ndarray): Image with drawn keypoints.
-
- Notes:
- Keypoint format: [x, y] or [x, y, confidence].
- Modifies self.im in-place.
- """
- indices = indices or [2, 5, 7]
- points = [(int(k[0]), int(k[1])) for i, k in enumerate(keypoints) if i in indices and k[2] >= conf_thresh]
-
- # Draw lines between consecutive points
- for start, end in zip(points[:-1], points[1:]):
- cv2.line(self.im, start, end, (0, 255, 0), 2, lineType=cv2.LINE_AA)
-
- # Draw circles for keypoints
- for pt in points:
- cv2.circle(self.im, pt, radius, (0, 0, 255), -1, lineType=cv2.LINE_AA)
-
- return self.im
-
- def plot_workout_information(
- self,
- display_text: str,
- position: tuple[int, int],
- color: tuple[int, int, int] = (104, 31, 17),
- txt_color: tuple[int, int, int] = (255, 255, 255),
- ) -> int:
- """
- Draw workout text with a background on the image.
-
- Args:
- display_text (str): The text to be displayed.
- position (tuple[int, int]): Coordinates (x, y) on the image where the text will be placed.
- color (tuple[int, int, int]): Text background color.
- txt_color (tuple[int, int, int]): Text foreground color.
-
- Returns:
- (int): The height of the text.
- """
- (text_width, text_height), _ = cv2.getTextSize(display_text, 0, fontScale=self.sf, thickness=self.tf)
-
- # Draw background rectangle
- cv2.rectangle(
- self.im,
- (position[0], position[1] - text_height - 5),
- (position[0] + text_width + 10, position[1] - text_height - 5 + text_height + 10 + self.tf),
- color,
- -1,
- )
- # Draw text
- cv2.putText(self.im, display_text, position, 0, self.sf, txt_color, self.tf)
-
- return text_height
-
- def plot_angle_and_count_and_stage(
- self,
- angle_text: str,
- count_text: str,
- stage_text: str,
- center_kpt: list[int],
- color: tuple[int, int, int] = (104, 31, 17),
- txt_color: tuple[int, int, int] = (255, 255, 255),
- ):
- """
- Plot the pose angle, count value, and step stage for workout monitoring.
-
- Args:
- angle_text (str): Angle value for workout monitoring.
- count_text (str): Counts value for workout monitoring.
- stage_text (str): Stage decision for workout monitoring.
- center_kpt (list[int]): Center keypoint (x, y) coordinates used to anchor the text.
- color (tuple[int, int, int]): Text background color.
- txt_color (tuple[int, int, int]): Text foreground color.
- """
- # Format text
- angle_text, count_text, stage_text = f" {angle_text:.2f}", f"Steps : {count_text}", f" {stage_text}"
-
- # Draw angle, count and stage text
- angle_height = self.plot_workout_information(
- angle_text, (int(center_kpt[0]), int(center_kpt[1])), color, txt_color
- )
- count_height = self.plot_workout_information(
- count_text, (int(center_kpt[0]), int(center_kpt[1]) + angle_height + 20), color, txt_color
- )
- self.plot_workout_information(
- stage_text, (int(center_kpt[0]), int(center_kpt[1]) + angle_height + count_height + 40), color, txt_color
- )
-
- def plot_distance_and_line(
- self,
- pixels_distance: float,
- centroids: list[tuple[int, int]],
- line_color: tuple[int, int, int] = (104, 31, 17),
- centroid_color: tuple[int, int, int] = (255, 0, 255),
- ):
- """
- Plot the distance and line between two centroids on the frame.
-
- Args:
- pixels_distance (float): Distance in pixels between the two bounding box centroids.
- centroids (list[tuple[int, int]]): Bounding box centroids data.
- line_color (tuple[int, int, int]): Distance line color.
- centroid_color (tuple[int, int, int]): Bounding box centroid color.
- """
- # Get the text size
- text = f"Pixels Distance: {pixels_distance:.2f}"
- (text_width_m, text_height_m), _ = cv2.getTextSize(text, 0, self.sf, self.tf)
-
- # Define corners with 10-pixel margin and draw rectangle
- cv2.rectangle(self.im, (15, 25), (15 + text_width_m + 20, 25 + text_height_m + 20), line_color, -1)
-
- # Calculate the position for the text with a 10-pixel margin and draw text
- text_position = (25, 25 + text_height_m + 10)
- cv2.putText(
- self.im,
- text,
- text_position,
- 0,
- self.sf,
- (255, 255, 255),
- self.tf,
- cv2.LINE_AA,
- )
-
- cv2.line(self.im, centroids[0], centroids[1], line_color, 3)
- cv2.circle(self.im, centroids[0], 6, centroid_color, -1)
- cv2.circle(self.im, centroids[1], 6, centroid_color, -1)
-
- def display_objects_labels(
- self,
- im0: np.ndarray,
- text: str,
- txt_color: tuple[int, int, int],
- bg_color: tuple[int, int, int],
- x_center: float,
- y_center: float,
- margin: int,
- ):
- """
- Display bounding box labels in the parking management app.
-
- Args:
- im0 (np.ndarray): Inference image.
- text (str): Object/class name.
- txt_color (tuple[int, int, int]): Display color for text foreground.
- bg_color (tuple[int, int, int]): Display color for text background.
- x_center (float): The x position center point for bounding box.
- y_center (float): The y position center point for bounding box.
- margin (int): The gap between text and rectangle for better display.
- """
- text_size = cv2.getTextSize(text, 0, fontScale=self.sf, thickness=self.tf)[0]
- text_x = x_center - text_size[0] // 2
- text_y = y_center + text_size[1] // 2
-
- rect_x1 = text_x - margin
- rect_y1 = text_y - text_size[1] - margin
- rect_x2 = text_x + text_size[0] + margin
- rect_y2 = text_y + margin
- cv2.rectangle(
- im0,
- (int(rect_x1), int(rect_y1)),
- (int(rect_x2), int(rect_y2)),
- tuple(map(int, bg_color)), # Ensure color values are int
- -1,
- )
-
- cv2.putText(
- im0,
- text,
- (int(text_x), int(text_y)),
- 0,
- self.sf,
- tuple(map(int, txt_color)), # Ensure color values are int
- self.tf,
- lineType=cv2.LINE_AA,
- )
-
- def sweep_annotator(
- self,
- line_x: int = 0,
- line_y: int = 0,
- label: str | None = None,
- color: tuple[int, int, int] = (221, 0, 186),
- txt_color: tuple[int, int, int] = (255, 255, 255),
- ):
- """
- Draw a sweep annotation line and an optional label.
-
- Args:
- line_x (int): The x-coordinate of the sweep line.
- line_y (int): The y-coordinate limit of the sweep line.
- label (str, optional): Text label to be drawn in center of sweep line. If None, no label is drawn.
- color (tuple[int, int, int]): RGB color for the line and label background.
- txt_color (tuple[int, int, int]): RGB color for the label text.
- """
- # Draw the sweep line
- cv2.line(self.im, (line_x, 0), (line_x, line_y), color, self.tf * 2)
-
- # Draw label, if provided
- if label:
- (text_width, text_height), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, self.sf, self.tf)
- cv2.rectangle(
- self.im,
- (line_x - text_width // 2 - 10, line_y // 2 - text_height // 2 - 10),
- (line_x + text_width // 2 + 10, line_y // 2 + text_height // 2 + 10),
- color,
- -1,
- )
- cv2.putText(
- self.im,
- label,
- (line_x - text_width // 2, line_y // 2 + text_height // 2),
- cv2.FONT_HERSHEY_SIMPLEX,
- self.sf,
- txt_color,
- self.tf,
- )
-
- def visioneye(
- self,
- box: list[float],
- center_point: tuple[int, int],
- color: tuple[int, int, int] = (235, 219, 11),
- pin_color: tuple[int, int, int] = (255, 0, 255),
- ):
- """
- Map and connect an object's bounding-box centroid to a fixed vision-eye point.
-
- Args:
- box (list[float]): Bounding box coordinates in format [x1, y1, x2, y2].
- center_point (tuple[int, int]): Center point for vision eye view.
- color (tuple[int, int, int]): Object centroid and line color.
- pin_color (tuple[int, int, int]): Visioneye point color.
- """
- center_bbox = int((box[0] + box[2]) / 2), int((box[1] + box[3]) / 2)
- cv2.circle(self.im, center_point, self.tf * 2, pin_color, -1)
- cv2.circle(self.im, center_bbox, self.tf * 2, color, -1)
- cv2.line(self.im, center_point, center_bbox, color, self.tf)
-
- def adaptive_label(
- self,
- box: tuple[float, float, float, float],
- label: str = "",
- color: tuple[int, int, int] = (128, 128, 128),
- txt_color: tuple[int, int, int] = (255, 255, 255),
- shape: str = "rect",
- margin: int = 5,
- ):
- """
- Draw a label with a background rectangle or circle centered within a given bounding box.
-
- Args:
- box (tuple[float, float, float, float]): The bounding box coordinates (x1, y1, x2, y2).
- label (str): The text label to be displayed.
- color (tuple[int, int, int]): The background color of the rectangle (B, G, R).
- txt_color (tuple[int, int, int]): The color of the text (R, G, B).
-            shape (str): The shape of the label, i.e. "circle" or "rect".
- margin (int): The margin between the text and the rectangle border.
- """
- if shape == "circle" and len(label) > 3:
- LOGGER.warning(f"Length of label is {len(label)}, only first 3 letters will be used for circle annotation.")
- label = label[:3]
-
- x_center, y_center = int((box[0] + box[2]) / 2), int((box[1] + box[3]) / 2) # Calculate center of the bbox
- text_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, self.sf - 0.15, self.tf)[0] # Get size of the text
-        text_x, text_y = x_center - text_size[0] // 2, y_center + text_size[1] // 2  # Text origin (baseline-left) for cv2.putText
-
- if shape == "circle":
- cv2.circle(
- self.im,
- (x_center, y_center),
- int(((text_size[0] ** 2 + text_size[1] ** 2) ** 0.5) / 2) + margin, # Calculate the radius
- color,
- -1,
- )
- else:
- cv2.rectangle(
- self.im,
- (text_x - margin, text_y - text_size[1] - margin), # Calculate coordinates of the rectangle
- (text_x + text_size[0] + margin, text_y + margin), # Calculate coordinates of the rectangle
- color,
- -1,
- )
-
- # Draw the text on top of the rectangle
- cv2.putText(
- self.im,
- label,
- (text_x, text_y), # Calculate top-left corner of the text
- cv2.FONT_HERSHEY_SIMPLEX,
- self.sf - 0.15,
- self.get_txt_color(color, txt_color),
- self.tf,
- lineType=cv2.LINE_AA,
- )
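For the `circle` shape above, the background radius is half the text's diagonal plus the margin, so short labels always fit. A quick standalone check of that rule (illustrative values):

```python
import cv2

label, font_scale, thickness, margin = "car", 0.5, 1, 5
(w, h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness)
radius = int(((w**2 + h**2) ** 0.5) / 2) + margin  # half the text diagonal plus the margin
print(w, h, radius)
```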
-
-
-class SolutionResults:
- """
- A class to encapsulate the results of Ultralytics Solutions.
-
- This class is designed to store and manage various outputs generated by the solution pipeline, including counts,
- angles, workout stages, and other analytics data. It provides a structured way to access and manipulate results
- from different computer vision solutions such as object counting, pose estimation, and tracking analytics.
-
- Attributes:
-        plot_im (np.ndarray): Processed image with counts, blurring, or other solution-specific effects applied.
- in_count (int): The total number of "in" counts in a video stream.
- out_count (int): The total number of "out" counts in a video stream.
- classwise_count (dict[str, int]): A dictionary containing counts of objects categorized by class.
- queue_count (int): The count of objects in a queue or waiting area.
- workout_count (int): The count of workout repetitions.
- workout_angle (float): The angle calculated during a workout exercise.
- workout_stage (str): The current stage of the workout.
- pixels_distance (float): The calculated distance in pixels between two points or objects.
- available_slots (int): The number of available slots in a monitored area.
- filled_slots (int): The number of filled slots in a monitored area.
- email_sent (bool): A flag indicating whether an email notification was sent.
- total_tracks (int): The total number of tracked objects.
- region_counts (dict[str, int]): The count of objects within a specific region.
- speed_dict (dict[str, float]): A dictionary containing speed information for tracked objects.
-        total_crop_objects (int): Total number of objects cropped by the ObjectCropper class.
- speed (dict[str, float]): Performance timing information for tracking and solution processing.
- """
-
- def __init__(self, **kwargs):
- """
- Initialize a SolutionResults object with default or user-specified values.
-
- Args:
- **kwargs (Any): Optional arguments to override default attribute values.
- """
- self.plot_im = None
- self.in_count = 0
- self.out_count = 0
- self.classwise_count = {}
- self.queue_count = 0
- self.workout_count = 0
- self.workout_angle = 0.0
- self.workout_stage = None
- self.pixels_distance = 0.0
- self.available_slots = 0
- self.filled_slots = 0
- self.email_sent = False
- self.total_tracks = 0
- self.region_counts = {}
- self.speed_dict = {} # for speed estimation
- self.total_crop_objects = 0
- self.speed = {}
-
- # Override with user-defined values
- self.__dict__.update(kwargs)
-
- def __str__(self) -> str:
- """
- Return a formatted string representation of the SolutionResults object.
-
- Returns:
- (str): A string representation listing non-null attributes.
- """
- attrs = {
- k: v
- for k, v in self.__dict__.items()
- if k != "plot_im" and v not in [None, {}, 0, 0.0, False] # Exclude `plot_im` explicitly
- }
- return ", ".join(f"{k}={v}" for k, v in attrs.items())
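For reference, a minimal usage sketch of the `SolutionResults` container removed above: it simply stores whatever keyword arguments a solution passes and prints only the non-empty fields (import path as it existed before this removal).

```python
from ultralytics.solutions.solutions import SolutionResults

# Minimal sketch: the container keeps any keyword arguments and hides empty fields in __str__.
results = SolutionResults(in_count=4, out_count=2, classwise_count={"car": 6})
print(results)           # in_count=4, out_count=2, classwise_count={'car': 6}
print(results.in_count)  # 4
```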
diff --git a/ultralytics/solutions/speed_estimation.py b/ultralytics/solutions/speed_estimation.py
deleted file mode 100644
index 0da4223..0000000
--- a/ultralytics/solutions/speed_estimation.py
+++ /dev/null
@@ -1,117 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from collections import deque
-from math import sqrt
-from typing import Any
-
-from ultralytics.solutions.solutions import BaseSolution, SolutionAnnotator, SolutionResults
-from ultralytics.utils.plotting import colors
-
-
-class SpeedEstimator(BaseSolution):
- """
- A class to estimate the speed of objects in a real-time video stream based on their tracks.
-
- This class extends the BaseSolution class and provides functionality for estimating object speeds using
- tracking data in video streams. Speed is calculated based on pixel displacement over time and converted
- to real-world units using a configurable meters-per-pixel scale factor.
-
- Attributes:
- fps (float): Video frame rate for time calculations.
- frame_count (int): Global frame counter for tracking temporal information.
- trk_frame_ids (dict): Maps track IDs to their first frame index.
- spd (dict): Final speed per object in km/h once locked.
- trk_hist (dict): Maps track IDs to deque of position history.
- locked_ids (set): Track IDs whose speed has been finalized.
- max_hist (int): Required frame history before computing speed.
- meter_per_pixel (float): Real-world meters represented by one pixel for scene scale conversion.
- max_speed (int): Maximum allowed object speed; values above this will be capped.
-
- Methods:
- process: Process input frames to estimate object speeds based on tracking data.
- store_tracking_history: Store the tracking history for an object.
- extract_tracks: Extract tracks from the current frame.
- display_output: Display the output with annotations.
-
- Examples:
- Initialize speed estimator and process a frame
- >>> estimator = SpeedEstimator(meter_per_pixel=0.04, max_speed=120)
- >>> frame = cv2.imread("frame.jpg")
- >>> results = estimator.process(frame)
- >>> cv2.imshow("Speed Estimation", results.plot_im)
- """
-
- def __init__(self, **kwargs: Any) -> None:
- """
- Initialize the SpeedEstimator object with speed estimation parameters and data structures.
-
- Args:
- **kwargs (Any): Additional keyword arguments passed to the parent class.
- """
- super().__init__(**kwargs)
-
- self.fps = self.CFG["fps"] # Video frame rate for time calculations
- self.frame_count = 0 # Global frame counter
- self.trk_frame_ids = {} # Track ID → first frame index
- self.spd = {} # Final speed per object (km/h), once locked
- self.trk_hist = {} # Track ID → deque of (time, position)
- self.locked_ids = set() # Track IDs whose speed has been finalized
- self.max_hist = self.CFG["max_hist"] # Required frame history before computing speed
- self.meter_per_pixel = self.CFG["meter_per_pixel"] # Scene scale, depends on camera details
- self.max_speed = self.CFG["max_speed"] # Maximum speed adjustment
-
- def process(self, im0) -> SolutionResults:
- """
- Process an input frame to estimate object speeds based on tracking data.
-
- Args:
- im0 (np.ndarray): Input image for processing with shape (H, W, C) for RGB images.
-
- Returns:
- (SolutionResults): Contains processed image `plot_im` and `total_tracks` (number of tracked objects).
-
- Examples:
- Process a frame for speed estimation
- >>> estimator = SpeedEstimator()
- >>> image = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)
- >>> results = estimator.process(image)
- """
- self.frame_count += 1
- self.extract_tracks(im0)
- annotator = SolutionAnnotator(im0, line_width=self.line_width)
-
- for box, track_id, _, _ in zip(self.boxes, self.track_ids, self.clss, self.confs):
- self.store_tracking_history(track_id, box)
-
- if track_id not in self.trk_hist: # Initialize history if new track found
- self.trk_hist[track_id] = deque(maxlen=self.max_hist)
- self.trk_frame_ids[track_id] = self.frame_count
-
- if track_id not in self.locked_ids: # Update history until speed is locked
- trk_hist = self.trk_hist[track_id]
- trk_hist.append(self.track_line[-1])
-
- # Compute and lock speed once enough history is collected
- if len(trk_hist) == self.max_hist:
- p0, p1 = trk_hist[0], trk_hist[-1] # First and last points of track
- dt = (self.frame_count - self.trk_frame_ids[track_id]) / self.fps # Time in seconds
- if dt > 0:
- dx, dy = p1[0] - p0[0], p1[1] - p0[1] # Pixel displacement
- pixel_distance = sqrt(dx * dx + dy * dy) # Calculate pixel distance
- meters = pixel_distance * self.meter_per_pixel # Convert to meters
- self.spd[track_id] = int(
- min((meters / dt) * 3.6, self.max_speed)
- ) # Convert to km/h and store final speed
- self.locked_ids.add(track_id) # Prevent further updates
- self.trk_hist.pop(track_id, None) # Free memory
- self.trk_frame_ids.pop(track_id, None) # Remove frame start reference
-
- if track_id in self.spd:
- speed_label = f"{self.spd[track_id]} km/h"
- annotator.box_label(box, label=speed_label, color=colors(track_id, True)) # Draw bounding box
-
- plot_im = annotator.result()
- self.display_output(plot_im) # Display output with base class function
-
- # Return results with processed image and tracking summary
- return SolutionResults(plot_im=plot_im, total_tracks=len(self.track_ids))
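The speed logic removed above locks a value once `max_hist` positions are collected: pixel displacement over the elapsed frames is scaled by `meter_per_pixel` and converted to km/h, capped at `max_speed`. A small standalone sketch with assumed numbers:

```python
from math import sqrt

# Illustrative values only; they mirror the SpeedEstimator.process computation above.
fps, meter_per_pixel, max_speed = 30.0, 0.04, 120
p0, p1 = (100, 400), (220, 355)  # first and last tracked center points (pixels)
frames_elapsed = 15              # frame_count - first frame index for this track

dt = frames_elapsed / fps                                   # elapsed time in seconds
pixels = sqrt((p1[0] - p0[0]) ** 2 + (p1[1] - p0[1]) ** 2)  # pixel displacement
meters = pixels * meter_per_pixel                           # convert to meters
speed_kmh = min((meters / dt) * 3.6, max_speed)             # m/s -> km/h, capped
print(f"{speed_kmh:.1f} km/h")  # ~36.9 km/h for these numbers
```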
diff --git a/ultralytics/solutions/streamlit_inference.py b/ultralytics/solutions/streamlit_inference.py
deleted file mode 100644
index 44e2029..0000000
--- a/ultralytics/solutions/streamlit_inference.py
+++ /dev/null
@@ -1,262 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-import io
-import os
-from typing import Any
-
-import cv2
-import torch
-
-from ultralytics import YOLO
-from ultralytics.utils import LOGGER
-from ultralytics.utils.checks import check_requirements
-from ultralytics.utils.downloads import GITHUB_ASSETS_STEMS
-
-torch.classes.__path__ = [] # Torch module __path__._path issue: https://github.com/datalab-to/marker/issues/442
-
-
-class Inference:
- """
-    A class to perform object detection, image classification, image segmentation, and pose estimation inference.
-
- This class provides functionalities for loading models, configuring settings, uploading video files, and performing
- real-time inference using Streamlit and Ultralytics YOLO models.
-
- Attributes:
- st (module): Streamlit module for UI creation.
- temp_dict (dict): Temporary dictionary to store the model path and other configuration.
- model_path (str): Path to the loaded model.
- model (YOLO): The YOLO model instance.
- source (str): Selected video source (webcam or video file).
- enable_trk (bool): Enable tracking option.
- conf (float): Confidence threshold for detection.
- iou (float): IoU threshold for non-maximum suppression.
- org_frame (Any): Container for the original frame to be displayed.
- ann_frame (Any): Container for the annotated frame to be displayed.
- vid_file_name (str | int): Name of the uploaded video file or webcam index.
- selected_ind (list[int]): List of selected class indices for detection.
-
- Methods:
- web_ui: Set up the Streamlit web interface with custom HTML elements.
- sidebar: Configure the Streamlit sidebar for model and inference settings.
- source_upload: Handle video file uploads through the Streamlit interface.
- configure: Configure the model and load selected classes for inference.
- inference: Perform real-time object detection inference.
-
- Examples:
- Create an Inference instance with a custom model
- >>> inf = Inference(model="path/to/model.pt")
- >>> inf.inference()
-
- Create an Inference instance with default settings
- >>> inf = Inference()
- >>> inf.inference()
- """
-
- def __init__(self, **kwargs: Any) -> None:
- """
- Initialize the Inference class, checking Streamlit requirements and setting up the model path.
-
- Args:
- **kwargs (Any): Additional keyword arguments for model configuration.
- """
- check_requirements("streamlit>=1.29.0") # scope imports for faster ultralytics package load speeds
- import streamlit as st
-
- self.st = st # Reference to the Streamlit module
- self.source = None # Video source selection (webcam or video file)
- self.img_file_names = [] # List of image file names
- self.enable_trk = False # Flag to toggle object tracking
- self.conf = 0.25 # Confidence threshold for detection
- self.iou = 0.45 # Intersection-over-Union (IoU) threshold for non-maximum suppression
- self.org_frame = None # Container for the original frame display
- self.ann_frame = None # Container for the annotated frame display
- self.vid_file_name = None # Video file name or webcam index
- self.selected_ind: list[int] = [] # List of selected class indices for detection
- self.model = None # YOLO model instance
-
- self.temp_dict = {"model": None, **kwargs}
- self.model_path = None # Model file path
- if self.temp_dict["model"] is not None:
- self.model_path = self.temp_dict["model"]
-
- LOGGER.info(f"Ultralytics Solutions: ✅ {self.temp_dict}")
-
- def web_ui(self) -> None:
- """Set up the Streamlit web interface with custom HTML elements."""
- menu_style_cfg = """""" # Hide main menu style
-
- # Main title of streamlit application
-        main_title_cfg = """Ultralytics YOLO Streamlit Application"""
-
- # Subtitle of streamlit application
-        sub_title_cfg = """Experience real-time object detection on your webcam, videos, and images
-        with the power of Ultralytics YOLO! 🚀"""
-
- # Set html page configuration and append custom HTML
- self.st.set_page_config(page_title="Ultralytics Streamlit App", layout="wide")
- self.st.markdown(menu_style_cfg, unsafe_allow_html=True)
- self.st.markdown(main_title_cfg, unsafe_allow_html=True)
- self.st.markdown(sub_title_cfg, unsafe_allow_html=True)
-
- def sidebar(self) -> None:
- """Configure the Streamlit sidebar for model and inference settings."""
- with self.st.sidebar: # Add Ultralytics LOGO
- logo = "https://raw.githubusercontent.com/ultralytics/assets/main/logo/Ultralytics_Logotype_Original.svg"
- self.st.image(logo, width=250)
-
- self.st.sidebar.title("User Configuration") # Add elements to vertical setting menu
- self.source = self.st.sidebar.selectbox(
- "Source",
- ("webcam", "video", "image"),
- ) # Add source selection dropdown
- if self.source in ["webcam", "video"]:
- self.enable_trk = self.st.sidebar.radio("Enable Tracking", ("Yes", "No")) == "Yes" # Enable object tracking
- self.conf = float(
- self.st.sidebar.slider("Confidence Threshold", 0.0, 1.0, self.conf, 0.01)
- ) # Slider for confidence
- self.iou = float(self.st.sidebar.slider("IoU Threshold", 0.0, 1.0, self.iou, 0.01)) # Slider for NMS threshold
-
- if self.source != "image": # Only create columns for video/webcam
- col1, col2 = self.st.columns(2) # Create two columns for displaying frames
- self.org_frame = col1.empty() # Container for original frame
- self.ann_frame = col2.empty() # Container for annotated frame
-
- def source_upload(self) -> None:
- """Handle video file uploads through the Streamlit interface."""
- from ultralytics.data.utils import IMG_FORMATS, VID_FORMATS # scope import
-
- self.vid_file_name = ""
- if self.source == "video":
- vid_file = self.st.sidebar.file_uploader("Upload Video File", type=VID_FORMATS)
- if vid_file is not None:
- g = io.BytesIO(vid_file.read()) # BytesIO Object
- with open("ultralytics.mp4", "wb") as out: # Open temporary file as bytes
- out.write(g.read()) # Read bytes into file
- self.vid_file_name = "ultralytics.mp4"
- elif self.source == "webcam":
- self.vid_file_name = 0 # Use webcam index 0
- elif self.source == "image":
- import tempfile # scope import
-
- if imgfiles := self.st.sidebar.file_uploader(
- "Upload Image Files", type=IMG_FORMATS, accept_multiple_files=True
- ):
- for imgfile in imgfiles: # Save each uploaded image to a temporary file
- with tempfile.NamedTemporaryFile(delete=False, suffix=f".{imgfile.name.split('.')[-1]}") as tf:
- tf.write(imgfile.read())
- self.img_file_names.append({"path": tf.name, "name": imgfile.name})
-
- def configure(self) -> None:
- """Configure the model and load selected classes for inference."""
- # Add dropdown menu for model selection
- M_ORD, T_ORD = ["yolo11n", "yolo11s", "yolo11m", "yolo11l", "yolo11x"], ["", "-seg", "-pose", "-obb", "-cls"]
- available_models = sorted(
- [
- x.replace("yolo", "YOLO")
- for x in GITHUB_ASSETS_STEMS
- if any(x.startswith(b) for b in M_ORD) and "grayscale" not in x
- ],
- key=lambda x: (M_ORD.index(x[:7].lower()), T_ORD.index(x[7:].lower() or "")),
- )
- if self.model_path: # Insert user provided custom model in available_models
- available_models.insert(0, self.model_path)
- selected_model = self.st.sidebar.selectbox("Model", available_models)
-
- with self.st.spinner("Model is downloading..."):
- if selected_model.endswith((".pt", ".onnx", ".torchscript", ".mlpackage", ".engine")) or any(
- fmt in selected_model for fmt in ("openvino_model", "rknn_model")
- ):
- model_path = selected_model
- else:
- model_path = f"{selected_model.lower()}.pt" # Default to .pt if no model provided during function call.
- self.model = YOLO(model_path) # Load the YOLO model
- class_names = list(self.model.names.values()) # Convert dictionary to list of class names
- self.st.success("Model loaded successfully!")
-
- # Multiselect box with class names and get indices of selected classes
- selected_classes = self.st.sidebar.multiselect("Classes", class_names, default=class_names[:3])
- self.selected_ind = [class_names.index(option) for option in selected_classes]
-
-        if not isinstance(self.selected_ind, list): # Ensure selected_ind is a list
- self.selected_ind = list(self.selected_ind)
-
- def image_inference(self) -> None:
- """Perform inference on uploaded images."""
- for img_info in self.img_file_names:
- img_path = img_info["path"]
- image = cv2.imread(img_path) # Load and display the original image
- if image is not None:
- self.st.markdown(f"#### Processed: {img_info['name']}")
- col1, col2 = self.st.columns(2)
- with col1:
- self.st.image(image, channels="BGR", caption="Original Image")
- results = self.model(image, conf=self.conf, iou=self.iou, classes=self.selected_ind)
- annotated_image = results[0].plot()
- with col2:
- self.st.image(annotated_image, channels="BGR", caption="Predicted Image")
- try: # Clean up temporary file
- os.unlink(img_path)
- except FileNotFoundError:
- pass # File doesn't exist, ignore
- else:
- self.st.error("Could not load the uploaded image.")
-
- def inference(self) -> None:
- """Perform real-time object detection inference on video or webcam feed."""
- self.web_ui() # Initialize the web interface
- self.sidebar() # Create the sidebar
- self.source_upload() # Upload the video source
- self.configure() # Configure the app
-
- if self.st.sidebar.button("Start"):
- if self.source == "image":
- if self.img_file_names:
- self.image_inference()
- else:
- self.st.info("Please upload an image file to perform inference.")
- return
-
- stop_button = self.st.sidebar.button("Stop") # Button to stop the inference
- cap = cv2.VideoCapture(self.vid_file_name) # Capture the video
- if not cap.isOpened():
- self.st.error("Could not open webcam or video source.")
- return
-
- while cap.isOpened():
- success, frame = cap.read()
- if not success:
- self.st.warning("Failed to read frame from webcam. Please verify the webcam is connected properly.")
- break
-
- # Process frame with model
- if self.enable_trk:
- results = self.model.track(
- frame, conf=self.conf, iou=self.iou, classes=self.selected_ind, persist=True
- )
- else:
- results = self.model(frame, conf=self.conf, iou=self.iou, classes=self.selected_ind)
-
- annotated_frame = results[0].plot() # Add annotations on frame
-
- if stop_button:
- cap.release() # Release the capture
- self.st.stop() # Stop streamlit app
-
- self.org_frame.image(frame, channels="BGR", caption="Original Frame") # Display original frame
-                self.ann_frame.image(annotated_frame, channels="BGR", caption="Predicted Frame") # Display processed frame
-
- cap.release() # Release the capture
- cv2.destroyAllWindows() # Destroy all OpenCV windows
-
-
-if __name__ == "__main__":
- import sys # Import the sys module for accessing command-line arguments
-
- # Check if a model name is provided as a command-line argument
- args = len(sys.argv)
- model = sys.argv[1] if args > 1 else None # Assign first argument as the model name if provided
- # Create an instance of the Inference class and run inference
- Inference(model=model).inference()
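The deleted module's `__main__` block shows the intended entry point: construct `Inference` and call `inference()`. A hedged sketch of that usage; it assumes Streamlit is installed, the UI only renders when served by Streamlit's runner, and the exact launch command depends on your setup.

```python
# Sketch only: typically served via Streamlit's runner, e.g. `streamlit run this_script.py -- yolo11n.pt`
from ultralytics.solutions.streamlit_inference import Inference  # path as it existed before this removal

Inference(model="yolo11n.pt").inference()  # builds the sidebar, handles source upload, and starts inference
```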
diff --git a/ultralytics/solutions/templates/similarity-search.html b/ultralytics/solutions/templates/similarity-search.html
deleted file mode 100644
index 6a24179..0000000
--- a/ultralytics/solutions/templates/similarity-search.html
+++ /dev/null
@@ -1,167 +0,0 @@
-<!-- Semantic Image Search page template (condensed): title, heading, and results loop -->
-    Semantic Image Search
-    Semantic Image Search with AI
-                    {% for img in results %}
-                    {% endfor %}
diff --git a/ultralytics/solutions/trackzone.py b/ultralytics/solutions/trackzone.py
deleted file mode 100644
index 5505317..0000000
--- a/ultralytics/solutions/trackzone.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from typing import Any
-
-import cv2
-import numpy as np
-
-from ultralytics.solutions.solutions import BaseSolution, SolutionAnnotator, SolutionResults
-from ultralytics.utils.plotting import colors
-
-
-class TrackZone(BaseSolution):
- """
- A class to manage region-based object tracking in a video stream.
-
- This class extends the BaseSolution class and provides functionality for tracking objects within a specific region
- defined by a polygonal area. Objects outside the region are excluded from tracking.
-
- Attributes:
- region (np.ndarray): The polygonal region for tracking, represented as a convex hull of points.
- line_width (int): Width of the lines used for drawing bounding boxes and region boundaries.
- names (list[str]): List of class names that the model can detect.
- boxes (list[np.ndarray]): Bounding boxes of tracked objects.
- track_ids (list[int]): Unique identifiers for each tracked object.
- clss (list[int]): Class indices of tracked objects.
-
- Methods:
- process: Process each frame of the video, applying region-based tracking.
- extract_tracks: Extract tracking information from the input frame.
- display_output: Display the processed output.
-
- Examples:
- >>> tracker = TrackZone()
- >>> frame = cv2.imread("frame.jpg")
- >>> results = tracker.process(frame)
- >>> cv2.imshow("Tracked Frame", results.plot_im)
- """
-
- def __init__(self, **kwargs: Any) -> None:
- """
- Initialize the TrackZone class for tracking objects within a defined region in video streams.
-
- Args:
- **kwargs (Any): Additional keyword arguments passed to the parent class.
- """
- super().__init__(**kwargs)
- default_region = [(75, 75), (565, 75), (565, 285), (75, 285)]
- self.region = cv2.convexHull(np.array(self.region or default_region, dtype=np.int32))
- self.mask = None
-
- def process(self, im0: np.ndarray) -> SolutionResults:
- """
- Process the input frame to track objects within a defined region.
-
- This method initializes the annotator, creates a mask for the specified region, extracts tracks
- only from the masked area, and updates tracking information. Objects outside the region are ignored.
-
- Args:
- im0 (np.ndarray): The input image or frame to be processed.
-
- Returns:
- (SolutionResults): Contains processed image `plot_im` and `total_tracks` (int) representing the
- total number of tracked objects within the defined region.
-
- Examples:
- >>> tracker = TrackZone()
- >>> frame = cv2.imread("path/to/image.jpg")
- >>> results = tracker.process(frame)
- """
- annotator = SolutionAnnotator(im0, line_width=self.line_width) # Initialize annotator
-
- if self.mask is None: # Create a mask for the region
- self.mask = np.zeros_like(im0[:, :, 0])
- cv2.fillPoly(self.mask, [self.region], 255)
- masked_frame = cv2.bitwise_and(im0, im0, mask=self.mask)
- self.extract_tracks(masked_frame)
-
- # Draw the region boundary
- cv2.polylines(im0, [self.region], isClosed=True, color=(255, 255, 255), thickness=self.line_width * 2)
-
- # Iterate over boxes, track ids, classes indexes list and draw bounding boxes
- for box, track_id, cls, conf in zip(self.boxes, self.track_ids, self.clss, self.confs):
- annotator.box_label(
- box, label=self.adjust_box_label(cls, conf, track_id=track_id), color=colors(track_id, True)
- )
-
- plot_im = annotator.result()
- self.display_output(plot_im) # Display output with base class function
-
- # Return a SolutionResults
- return SolutionResults(plot_im=plot_im, total_tracks=len(self.track_ids))
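A hedged usage sketch for the removed `TrackZone` solution: pass a polygon via the standard `region` argument so only detections inside it are tracked (the keyword arguments shown are the usual solution options and may differ in your version).

```python
import cv2

from ultralytics import solutions

# Illustrative region; adjust the points to your scene.
zone = [(150, 100), (1100, 100), (1100, 600), (150, 600)]
trackzone = solutions.TrackZone(model="yolo11n.pt", region=zone, show=False)

frame = cv2.imread("frame.jpg")  # any BGR frame
results = trackzone.process(frame)
print(results.total_tracks)
```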
diff --git a/ultralytics/solutions/vision_eye.py b/ultralytics/solutions/vision_eye.py
deleted file mode 100644
index 7732345..0000000
--- a/ultralytics/solutions/vision_eye.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from typing import Any
-
-from ultralytics.solutions.solutions import BaseSolution, SolutionAnnotator, SolutionResults
-from ultralytics.utils.plotting import colors
-
-
-class VisionEye(BaseSolution):
- """
- A class to manage object detection and vision mapping in images or video streams.
-
- This class extends the BaseSolution class and provides functionality for detecting objects,
- mapping vision points, and annotating results with bounding boxes and labels.
-
- Attributes:
- vision_point (tuple[int, int]): Coordinates (x, y) where vision will view objects and draw tracks.
-
- Methods:
- process: Process the input image to detect objects, annotate them, and apply vision mapping.
-
- Examples:
- >>> vision_eye = VisionEye()
- >>> frame = cv2.imread("frame.jpg")
- >>> results = vision_eye.process(frame)
- >>> print(f"Total detected instances: {results.total_tracks}")
- """
-
- def __init__(self, **kwargs: Any) -> None:
- """
- Initialize the VisionEye class for detecting objects and applying vision mapping.
-
- Args:
- **kwargs (Any): Keyword arguments passed to the parent class and for configuring vision_point.
- """
- super().__init__(**kwargs)
- # Set the vision point where the system will view objects and draw tracks
- self.vision_point = self.CFG["vision_point"]
-
- def process(self, im0) -> SolutionResults:
- """
- Perform object detection, vision mapping, and annotation on the input image.
-
- Args:
- im0 (np.ndarray): The input image for detection and annotation.
-
- Returns:
- (SolutionResults): Object containing the annotated image and tracking statistics.
- - plot_im: Annotated output image with bounding boxes and vision mapping
- - total_tracks: Number of tracked objects in the frame
-
- Examples:
- >>> vision_eye = VisionEye()
- >>> frame = cv2.imread("image.jpg")
- >>> results = vision_eye.process(frame)
- >>> print(f"Detected {results.total_tracks} objects")
- """
- self.extract_tracks(im0) # Extract tracks (bounding boxes, classes, and masks)
- annotator = SolutionAnnotator(im0, self.line_width)
-
- for cls, t_id, box, conf in zip(self.clss, self.track_ids, self.boxes, self.confs):
- # Annotate the image with bounding boxes, labels, and vision mapping
- annotator.box_label(box, label=self.adjust_box_label(cls, conf, t_id), color=colors(int(t_id), True))
- annotator.visioneye(box, self.vision_point)
-
- plot_im = annotator.result()
- self.display_output(plot_im) # Display the annotated output using the base class function
-
- # Return a SolutionResults object with the annotated image and tracking statistics
- return SolutionResults(plot_im=plot_im, total_tracks=len(self.track_ids))
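Similarly, a hedged sketch for the removed `VisionEye` solution, assuming `vision_point` can be supplied as a configuration keyword (it is read from `self.CFG` above).

```python
import cv2

from ultralytics import solutions

visioneye = solutions.VisionEye(model="yolo11n.pt", vision_point=(50, 50), show=False)

frame = cv2.imread("frame.jpg")
results = visioneye.process(frame)
print(f"Total detected instances: {results.total_tracks}")
```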
diff --git a/ultralytics/trackers/README.md b/ultralytics/trackers/README.md
deleted file mode 100644
index de6acbd..0000000
--- a/ultralytics/trackers/README.md
+++ /dev/null
@@ -1,295 +0,0 @@
-
-
-# Multi-Object Tracking with Ultralytics YOLO
-
-
-
-[Object tracking](https://www.ultralytics.com/glossary/object-tracking), a key aspect of [video analytics](https://en.wikipedia.org/wiki/Video_content_analysis), involves identifying the location and class of objects within video frames and assigning a unique ID to each detected object as it moves. This capability enables a wide range of applications, from surveillance and security systems to [real-time](https://www.ultralytics.com/glossary/real-time-inference) sports analysis and autonomous vehicle navigation. Learn more about tracking on our [tracking documentation page](https://docs.ultralytics.com/modes/track/).
-
-## 🎯 Why Choose Ultralytics YOLO for Object Tracking?
-
-Ultralytics YOLO trackers provide output consistent with standard [object detection](https://docs.ultralytics.com/tasks/detect/) but add persistent object IDs. This simplifies the process of tracking objects in video streams and performing subsequent analyses. Here’s why Ultralytics YOLO is an excellent choice for your object tracking needs:
-
-- **Efficiency:** Process video streams in real-time without sacrificing accuracy.
-- **Flexibility:** Supports multiple robust tracking algorithms and configurations.
-- **Ease of Use:** Offers straightforward [Python API](https://docs.ultralytics.com/usage/python/) and [CLI](https://docs.ultralytics.com/usage/cli/) options for rapid integration and deployment.
-- **Customizability:** Easily integrates with [custom-trained YOLO models](https://docs.ultralytics.com/modes/train/), enabling deployment in specialized, domain-specific applications.
-
-**Watch:** Object Detection and Tracking with Ultralytics YOLOv8.
-
-[Watch on YouTube](https://www.youtube.com/watch?v=hHyHmOtmEgs)
-
-## ✨ Features at a Glance
-
-Ultralytics YOLO extends its powerful object detection features to deliver robust and versatile object tracking:
-
-- **Real-Time Tracking:** Seamlessly track objects in high-frame-rate videos.
-- **Multiple Tracker Support:** Choose from a selection of established tracking algorithms.
-- **Customizable Tracker Configurations:** Adapt the tracking algorithm to specific requirements by adjusting various parameters.
-
-## 🛠️ Available Trackers
-
-Ultralytics YOLO supports the following tracking algorithms. Enable them by passing the relevant YAML configuration file, such as `tracker=tracker_type.yaml`:
-
-- **BoT-SORT:** Use [`botsort.yaml`](https://github.com/ultralytics/ultralytics/blob/main/ultralytics/cfg/trackers/botsort.yaml) to enable this tracker. Based on the [BoT-SORT paper](https://arxiv.org/abs/2206.14651) and its official [code implementation](https://github.com/NirAharon/BoT-SORT).
-- **ByteTrack:** Use [`bytetrack.yaml`](https://github.com/ultralytics/ultralytics/blob/main/ultralytics/cfg/trackers/bytetrack.yaml) to enable this tracker. Based on the [ByteTrack paper](https://arxiv.org/abs/2110.06864) and its official [code implementation](https://github.com/FoundationVision/ByteTrack).
-
-The default tracker is **BoT-SORT**.
-
-## ⚙️ Usage
-
-To run the tracker on video streams, use a trained Detect, Segment, or Pose model like [Ultralytics YOLO11n](https://docs.ultralytics.com/models/yolo11/), YOLO11n-seg, or YOLO11n-pose.
-
-```python
-# Python
-from ultralytics import YOLO
-
-# Load an official or custom model
-model = YOLO("yolo11n.pt") # Load an official Detect model
-# model = YOLO("yolo11n-seg.pt") # Load an official Segment model
-# model = YOLO("yolo11n-pose.pt") # Load an official Pose model
-# model = YOLO("path/to/best.pt") # Load a custom trained model
-
-# Perform tracking with the model
-results = model.track(source="https://youtu.be/LNwODJXcvt4", show=True) # Tracking with default tracker
-# results = model.track(source="https://youtu.be/LNwODJXcvt4", show=True, tracker="bytetrack.yaml") # Tracking with ByteTrack tracker
-```
-
-```bash
-# CLI
-# Perform tracking with various models using the command line interface
-yolo track model=yolo11n.pt source="https://youtu.be/LNwODJXcvt4" # Official Detect model
-# yolo track model=yolo11n-seg.pt source="https://youtu.be/LNwODJXcvt4" # Official Segment model
-# yolo track model=yolo11n-pose.pt source="https://youtu.be/LNwODJXcvt4" # Official Pose model
-# yolo track model=path/to/best.pt source="https://youtu.be/LNwODJXcvt4" # Custom trained model
-
-# Track using ByteTrack tracker
-# yolo track model=path/to/best.pt tracker="bytetrack.yaml"
-```
-
-As shown above, tracking is available for all [Detect](https://docs.ultralytics.com/tasks/detect/), [Segment](https://docs.ultralytics.com/tasks/segment/), and [Pose](https://docs.ultralytics.com/tasks/pose/) models when run on videos or streaming sources.
-
-## 🔧 Configuration
-
-### Tracking Arguments
-
-Tracking configuration shares properties with the Predict mode, such as `conf` (confidence threshold), `iou` ([Intersection over Union](https://www.ultralytics.com/glossary/intersection-over-union-iou) threshold), and `show` (display results). For additional configurations, refer to the [Predict mode documentation](https://docs.ultralytics.com/modes/predict/).
-
-```python
-# Python
-from ultralytics import YOLO
-
-# Configure the tracking parameters and run the tracker
-model = YOLO("yolo11n.pt")
-results = model.track(source="https://youtu.be/LNwODJXcvt4", conf=0.3, iou=0.5, show=True)
-```
-
-```bash
-# CLI
-# Configure tracking parameters and run the tracker using the command line interface
-yolo track model=yolo11n.pt source="https://youtu.be/LNwODJXcvt4" conf=0.3 iou=0.5 show
-```
-
-### Tracker Selection
-
-Ultralytics allows you to use a modified tracker configuration file. Create a copy of a tracker config file (e.g., `custom_tracker.yaml`) from [ultralytics/cfg/trackers](https://github.com/ultralytics/ultralytics/tree/main/ultralytics/cfg/trackers) and adjust any configurations (except `tracker_type`) according to your needs.
-
-```python
-# Python
-from ultralytics import YOLO
-
-# Load the model and run the tracker with a custom configuration file
-model = YOLO("yolo11n.pt")
-results = model.track(source="https://youtu.be/LNwODJXcvt4", tracker="custom_tracker.yaml")
-```
-
-```bash
-# CLI
-# Load the model and run the tracker with a custom configuration file using the command line interface
-yolo track model=yolo11n.pt source="https://youtu.be/LNwODJXcvt4" tracker='custom_tracker.yaml'
-```
-
-For a comprehensive list of tracking arguments, consult the [Tracking Configuration files](https://github.com/ultralytics/ultralytics/tree/main/ultralytics/cfg/trackers) in the repository.
-
-## 🐍 Python Examples
-
-### Persisting Tracks Loop
-
-This Python script uses [OpenCV (`cv2`)](https://opencv.org/) and Ultralytics YOLO11 to perform object tracking on video frames. Ensure you have installed the necessary packages (`opencv-python` and `ultralytics`). The [`persist=True`](https://docs.ultralytics.com/modes/predict/#tracking) argument indicates that the current frame is the next in a sequence, allowing the tracker to maintain track continuity from the previous frame.
-
-```python
-# Python
-import cv2
-
-from ultralytics import YOLO
-
-# Load the YOLO11 model
-model = YOLO("yolo11n.pt")
-
-# Open the video file
-video_path = "path/to/video.mp4"
-cap = cv2.VideoCapture(video_path)
-
-# Loop through the video frames
-while cap.isOpened():
- # Read a frame from the video
- success, frame = cap.read()
-
- if success:
- # Run YOLO11 tracking on the frame, persisting tracks between frames
- results = model.track(frame, persist=True)
-
- # Visualize the results on the frame
- annotated_frame = results[0].plot()
-
- # Display the annotated frame
- cv2.imshow("YOLO11 Tracking", annotated_frame)
-
- # Break the loop if 'q' is pressed
- if cv2.waitKey(1) & 0xFF == ord("q"):
- break
- else:
- # Break the loop if the end of the video is reached
- break
-
-# Release the video capture object and close the display window
-cap.release()
-cv2.destroyAllWindows()
-```
-
-Note the use of `model.track(frame)` instead of `model(frame)`, which specifically enables object tracking. This script processes each video frame, visualizes the tracking results, and displays them. Press 'q' to exit the loop.
-
-### Plotting Tracks Over Time
-
-Visualizing object tracks across consecutive frames offers valuable insights into movement patterns within a video. Ultralytics YOLO11 makes plotting these tracks efficient.
-
-The following example demonstrates how to use YOLO11's tracking capabilities to plot the movement of detected objects. The script opens a video, reads it frame by frame, and uses the YOLO model built on [PyTorch](https://pytorch.org/) to identify and track objects. By storing the center points of the detected [bounding boxes](https://www.ultralytics.com/glossary/bounding-box) and connecting them, we can draw lines representing the paths of tracked objects using [NumPy](https://numpy.org/) for numerical operations.
-
-```python
-# Python
-from collections import defaultdict
-
-import cv2
-import numpy as np
-
-from ultralytics import YOLO
-
-# Load the YOLO11 model
-model = YOLO("yolo11n.pt")
-
-# Open the video file
-video_path = "path/to/video.mp4"
-cap = cv2.VideoCapture(video_path)
-
-# Store the track history
-track_history = defaultdict(lambda: [])
-
-# Loop through the video frames
-while cap.isOpened():
- # Read a frame from the video
- success, frame = cap.read()
-
- if success:
- # Run YOLO11 tracking on the frame, persisting tracks between frames
- result = model.track(frame, persist=True)[0]
-
- # Get the boxes and track IDs
- if result.boxes and result.boxes.is_track:
- boxes = result.boxes.xywh.cpu()
- track_ids = result.boxes.id.int().cpu().tolist()
-
- # Visualize the result on the frame
- frame = result.plot()
-
- # Plot the tracks
- for box, track_id in zip(boxes, track_ids):
- x, y, w, h = box
- track = track_history[track_id]
- track.append((float(x), float(y))) # x, y center point
-                if len(track) > 30:  # keep only the last 30 center points
- track.pop(0)
-
- # Draw the tracking lines
- points = np.hstack(track).astype(np.int32).reshape((-1, 1, 2))
- cv2.polylines(frame, [points], isClosed=False, color=(230, 230, 230), thickness=10)
-
- # Display the annotated frame
- cv2.imshow("YOLO11 Tracking", frame)
-
- # Break the loop if 'q' is pressed
- if cv2.waitKey(1) & 0xFF == ord("q"):
- break
- else:
- # Break the loop if the end of the video is reached
- break
-
-# Release the video capture object and close the display window
-cap.release()
-cv2.destroyAllWindows()
-```
-
-### Multithreaded Tracking
-
-Multithreaded tracking allows running object tracking on multiple video streams simultaneously, which is highly beneficial for systems handling inputs from several cameras, improving efficiency through concurrent processing.
-
-This Python script utilizes Python's [`threading`](https://docs.python.org/3/library/threading.html) module for concurrent tracker execution. Each thread manages tracking for a single video file.
-
-The `run_tracker_in_thread` function accepts a model name and a video source. It runs the tracker on that source in streaming mode, consuming results frame by frame within its own thread.
-
-This example uses two models, `yolo11n.pt` and `yolo11n-seg.pt`, each tracking objects in one of the two sources defined in `SOURCES` (a local video file and webcam index `0`), respectively.
-
-Setting `daemon=True` in `threading.Thread` ensures threads exit when the main program finishes. Threads are started with `start()` and the main thread waits for their completion using `join()`.
-
-Finally, `cv2.destroyAllWindows()` closes all OpenCV windows after the threads finish.
-
-```python
-# Python
-import threading
-
-import cv2
-
-from ultralytics import YOLO
-
-# Define model names and video sources
-MODEL_NAMES = ["yolo11n.pt", "yolo11n-seg.pt"]
-SOURCES = ["path/to/video.mp4", "0"] # local video, 0 for webcam
-
-
-def run_tracker_in_thread(model_name, filename):
- """
- Run YOLO tracker in its own thread for concurrent processing.
-
- Args:
-        model_name (str): Name or path of the YOLO11 model file to load.
- filename (str): The path to the video file or the identifier for the webcam/external camera source.
- """
- model = YOLO(model_name)
- results = model.track(filename, save=True, stream=True)
- for r in results:
-        pass  # Consume the generator so frames are processed as they stream
-
-
-# Create and start tracker threads using a for loop
-tracker_threads = []
-for video_file, model_name in zip(SOURCES, MODEL_NAMES):
- thread = threading.Thread(target=run_tracker_in_thread, args=(model_name, video_file), daemon=True)
- tracker_threads.append(thread)
- thread.start()
-
-# Wait for all tracker threads to finish
-for thread in tracker_threads:
- thread.join()
-
-# Clean up and close windows
-cv2.destroyAllWindows()
-```
-
-This setup can be easily scaled to handle more video streams by creating additional threads following the same pattern. Explore more applications in our [blog post on object tracking](https://www.ultralytics.com/blog/object-detection-and-tracking-with-ultralytics-yolov8).
-
-## 🤝 Contribute New Trackers
-
-Are you experienced in multi-object tracking and have implemented or adapted an algorithm with Ultralytics YOLO? We encourage you to contribute to our Trackers section in [ultralytics/cfg/trackers](https://github.com/ultralytics/ultralytics/tree/main/ultralytics/cfg/trackers)! Your contributions can help expand the tracking solutions available within the Ultralytics [ecosystem](https://docs.ultralytics.com/).
-
-To contribute, please review our [Contributing Guide](https://docs.ultralytics.com/help/contributing/) for instructions on submitting a [Pull Request (PR)](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests) 🛠️. We look forward to your contributions!
-
-Let's work together to enhance the tracking capabilities of Ultralytics YOLO and provide more powerful tools for the [computer vision](https://www.ultralytics.com/glossary/computer-vision-cv) and [deep learning](https://www.ultralytics.com/glossary/deep-learning-dl) community 🙏!
diff --git a/ultralytics/trackers/__init__.py b/ultralytics/trackers/__init__.py
deleted file mode 100644
index 2919511..0000000
--- a/ultralytics/trackers/__init__.py
+++ /dev/null
@@ -1,7 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from .bot_sort import BOTSORT
-from .byte_tracker import BYTETracker
-from .track import register_tracker
-
-__all__ = "register_tracker", "BOTSORT", "BYTETracker" # allow simpler import
diff --git a/ultralytics/trackers/basetrack.py b/ultralytics/trackers/basetrack.py
deleted file mode 100644
index d254883..0000000
--- a/ultralytics/trackers/basetrack.py
+++ /dev/null
@@ -1,117 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-"""Module defines the base classes and structures for object tracking in YOLO."""
-
-from collections import OrderedDict
-from typing import Any
-
-import numpy as np
-
-
-class TrackState:
- """
- Enumeration class representing the possible states of an object being tracked.
-
- Attributes:
- New (int): State when the object is newly detected.
- Tracked (int): State when the object is successfully tracked in subsequent frames.
- Lost (int): State when the object is no longer tracked.
- Removed (int): State when the object is removed from tracking.
-
- Examples:
- >>> state = TrackState.New
- >>> if state == TrackState.New:
- >>> print("Object is newly detected.")
- """
-
- New = 0
- Tracked = 1
- Lost = 2
- Removed = 3
-
-
-class BaseTrack:
- """
- Base class for object tracking, providing foundational attributes and methods.
-
- Attributes:
- _count (int): Class-level counter for unique track IDs.
- track_id (int): Unique identifier for the track.
- is_activated (bool): Flag indicating whether the track is currently active.
- state (TrackState): Current state of the track.
- history (OrderedDict): Ordered history of the track's states.
- features (list): List of features extracted from the object for tracking.
- curr_feature (Any): The current feature of the object being tracked.
- score (float): The confidence score of the tracking.
- start_frame (int): The frame number where tracking started.
- frame_id (int): The most recent frame ID processed by the track.
- time_since_update (int): Frames passed since the last update.
- location (tuple): The location of the object in the context of multi-camera tracking.
-
- Methods:
- end_frame: Returns the ID of the last frame where the object was tracked.
- next_id: Increments and returns the next global track ID.
- activate: Abstract method to activate the track.
- predict: Abstract method to predict the next state of the track.
- update: Abstract method to update the track with new data.
- mark_lost: Marks the track as lost.
- mark_removed: Marks the track as removed.
- reset_id: Resets the global track ID counter.
-
- Examples:
- Initialize a new track and mark it as lost:
- >>> track = BaseTrack()
- >>> track.mark_lost()
- >>> print(track.state) # Output: 2 (TrackState.Lost)
- """
-
- _count = 0
-
- def __init__(self):
- """Initialize a new track with a unique ID and foundational tracking attributes."""
- self.track_id = 0
- self.is_activated = False
- self.state = TrackState.New
- self.history = OrderedDict()
- self.features = []
- self.curr_feature = None
- self.score = 0
- self.start_frame = 0
- self.frame_id = 0
- self.time_since_update = 0
- self.location = (np.inf, np.inf)
-
- @property
- def end_frame(self) -> int:
- """Return the ID of the most recent frame where the object was tracked."""
- return self.frame_id
-
- @staticmethod
- def next_id() -> int:
- """Increment and return the next unique global track ID for object tracking."""
- BaseTrack._count += 1
- return BaseTrack._count
-
- def activate(self, *args: Any) -> None:
- """Activate the track with provided arguments, initializing necessary attributes for tracking."""
- raise NotImplementedError
-
- def predict(self) -> None:
- """Predict the next state of the track based on the current state and tracking model."""
- raise NotImplementedError
-
- def update(self, *args: Any, **kwargs: Any) -> None:
- """Update the track with new observations and data, modifying its state and attributes accordingly."""
- raise NotImplementedError
-
- def mark_lost(self) -> None:
- """Mark the track as lost by updating its state to TrackState.Lost."""
- self.state = TrackState.Lost
-
- def mark_removed(self) -> None:
- """Mark the track as removed by setting its state to TrackState.Removed."""
- self.state = TrackState.Removed
-
- @staticmethod
- def reset_id() -> None:
- """Reset the global track ID counter to its initial value."""
- BaseTrack._count = 0
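`BaseTrack` is an abstract contract: subclasses supply `activate`, `predict`, and `update`, while ID allocation and state flags come from the base class. A toy sketch of that contract (`SimpleTrack` is hypothetical and only illustrative):

```python
import numpy as np

from ultralytics.trackers.basetrack import BaseTrack, TrackState


class SimpleTrack(BaseTrack):
    """Hypothetical minimal subclass illustrating the BaseTrack contract."""

    def activate(self, box: np.ndarray, frame_id: int) -> None:
        self.track_id = self.next_id()  # allocate a unique global ID
        self.start_frame = self.frame_id = frame_id
        self.state = TrackState.Tracked
        self.is_activated = True
        self.history[frame_id] = box

    def predict(self) -> None:
        pass  # a real tracker would propagate a motion model (e.g. a Kalman filter) here

    def update(self, box: np.ndarray, frame_id: int) -> None:
        self.frame_id = frame_id
        self.history[frame_id] = box


track = SimpleTrack()
track.activate(np.array([10, 20, 50, 80]), frame_id=1)
track.mark_lost()
print(track.track_id, track.state)  # e.g. 1 2 (TrackState.Lost)
```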
diff --git a/ultralytics/trackers/bot_sort.py b/ultralytics/trackers/bot_sort.py
deleted file mode 100644
index 30f9463..0000000
--- a/ultralytics/trackers/bot_sort.py
+++ /dev/null
@@ -1,274 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from __future__ import annotations
-
-from collections import deque
-from typing import Any
-
-import numpy as np
-import torch
-
-from ultralytics.utils.ops import xywh2xyxy
-from ultralytics.utils.plotting import save_one_box
-
-from .basetrack import TrackState
-from .byte_tracker import BYTETracker, STrack
-from .utils import matching
-from .utils.gmc import GMC
-from .utils.kalman_filter import KalmanFilterXYWH
-
-
-class BOTrack(STrack):
- """
- An extended version of the STrack class for YOLO, adding object tracking features.
-
- This class extends the STrack class to include additional functionalities for object tracking, such as feature
- smoothing, Kalman filter prediction, and reactivation of tracks.
-
- Attributes:
- shared_kalman (KalmanFilterXYWH): A shared Kalman filter for all instances of BOTrack.
- smooth_feat (np.ndarray): Smoothed feature vector.
- curr_feat (np.ndarray): Current feature vector.
- features (deque): A deque to store feature vectors with a maximum length defined by `feat_history`.
- alpha (float): Smoothing factor for the exponential moving average of features.
- mean (np.ndarray): The mean state of the Kalman filter.
- covariance (np.ndarray): The covariance matrix of the Kalman filter.
-
- Methods:
- update_features: Update features vector and smooth it using exponential moving average.
- predict: Predict the mean and covariance using Kalman filter.
- re_activate: Reactivate a track with updated features and optionally new ID.
- update: Update the track with new detection and frame ID.
- tlwh: Property that gets the current position in tlwh format `(top left x, top left y, width, height)`.
- multi_predict: Predict the mean and covariance of multiple object tracks using shared Kalman filter.
- convert_coords: Convert tlwh bounding box coordinates to xywh format.
- tlwh_to_xywh: Convert bounding box to xywh format `(center x, center y, width, height)`.
-
- Examples:
- Create a BOTrack instance and update its features
-        >>> bo_track = BOTrack(xywh=[100, 50, 80, 40], score=0.9, cls=1, feat=np.random.rand(128))
-        >>> bo_track.predict()
-        >>> new_track = BOTrack(xywh=[110, 60, 80, 40], score=0.85, cls=1, feat=np.random.rand(128))
- >>> bo_track.update(new_track, frame_id=2)
- """
-
- shared_kalman = KalmanFilterXYWH()
-
- def __init__(
- self, xywh: np.ndarray, score: float, cls: int, feat: np.ndarray | None = None, feat_history: int = 50
- ):
- """
- Initialize a BOTrack object with temporal parameters, such as feature history, alpha, and current features.
-
- Args:
- xywh (np.ndarray): Bounding box coordinates in xywh format (center x, center y, width, height).
- score (float): Confidence score of the detection.
- cls (int): Class ID of the detected object.
- feat (np.ndarray, optional): Feature vector associated with the detection.
- feat_history (int): Maximum length of the feature history deque.
-
- Examples:
- Initialize a BOTrack object with bounding box, score, class ID, and feature vector
- >>> xywh = np.array([100, 150, 60, 50])
- >>> score = 0.9
- >>> cls = 1
- >>> feat = np.random.rand(128)
- >>> bo_track = BOTrack(xywh, score, cls, feat)
- """
- super().__init__(xywh, score, cls)
-
- self.smooth_feat = None
- self.curr_feat = None
- if feat is not None:
- self.update_features(feat)
- self.features = deque([], maxlen=feat_history)
- self.alpha = 0.9
-
- def update_features(self, feat: np.ndarray) -> None:
- """Update the feature vector and apply exponential moving average smoothing."""
- feat /= np.linalg.norm(feat)
- self.curr_feat = feat
- if self.smooth_feat is None:
- self.smooth_feat = feat
- else:
- self.smooth_feat = self.alpha * self.smooth_feat + (1 - self.alpha) * feat
- self.features.append(feat)
- self.smooth_feat /= np.linalg.norm(self.smooth_feat)
-
- def predict(self) -> None:
- """Predict the object's future state using the Kalman filter to update its mean and covariance."""
- mean_state = self.mean.copy()
- if self.state != TrackState.Tracked:
- mean_state[6] = 0
- mean_state[7] = 0
-
- self.mean, self.covariance = self.kalman_filter.predict(mean_state, self.covariance)
-
- def re_activate(self, new_track: BOTrack, frame_id: int, new_id: bool = False) -> None:
- """Reactivate a track with updated features and optionally assign a new ID."""
- if new_track.curr_feat is not None:
- self.update_features(new_track.curr_feat)
- super().re_activate(new_track, frame_id, new_id)
-
- def update(self, new_track: BOTrack, frame_id: int) -> None:
- """Update the track with new detection information and the current frame ID."""
- if new_track.curr_feat is not None:
- self.update_features(new_track.curr_feat)
- super().update(new_track, frame_id)
-
- @property
- def tlwh(self) -> np.ndarray:
- """Return the current bounding box position in `(top left x, top left y, width, height)` format."""
- if self.mean is None:
- return self._tlwh.copy()
- ret = self.mean[:4].copy()
- ret[:2] -= ret[2:] / 2
- return ret
-
- @staticmethod
- def multi_predict(stracks: list[BOTrack]) -> None:
- """Predict the mean and covariance for multiple object tracks using a shared Kalman filter."""
- if len(stracks) <= 0:
- return
- multi_mean = np.asarray([st.mean.copy() for st in stracks])
- multi_covariance = np.asarray([st.covariance for st in stracks])
- for i, st in enumerate(stracks):
- if st.state != TrackState.Tracked:
- multi_mean[i][6] = 0
- multi_mean[i][7] = 0
- multi_mean, multi_covariance = BOTrack.shared_kalman.multi_predict(multi_mean, multi_covariance)
- for i, (mean, cov) in enumerate(zip(multi_mean, multi_covariance)):
- stracks[i].mean = mean
- stracks[i].covariance = cov
-
- def convert_coords(self, tlwh: np.ndarray) -> np.ndarray:
- """Convert tlwh bounding box coordinates to xywh format."""
- return self.tlwh_to_xywh(tlwh)
-
- @staticmethod
- def tlwh_to_xywh(tlwh: np.ndarray) -> np.ndarray:
- """Convert bounding box from tlwh (top-left-width-height) to xywh (center-x-center-y-width-height) format."""
- ret = np.asarray(tlwh).copy()
- ret[:2] += ret[2:] / 2
- return ret
-
-
-class BOTSORT(BYTETracker):
- """
- An extended version of the BYTETracker class for YOLO, designed for object tracking with ReID and GMC algorithm.
-
- Attributes:
- proximity_thresh (float): Threshold for spatial proximity (IoU) between tracks and detections.
- appearance_thresh (float): Threshold for appearance similarity (ReID embeddings) between tracks and detections.
- encoder (Any): Object to handle ReID embeddings, set to None if ReID is not enabled.
- gmc (GMC): An instance of the GMC algorithm for data association.
- args (Any): Parsed command-line arguments containing tracking parameters.
-
- Methods:
- get_kalmanfilter: Return an instance of KalmanFilterXYWH for object tracking.
-        init_track: Initialize object tracks from detection results and an optional image for ReID features.
- get_dists: Get distances between tracks and detections using IoU and (optionally) ReID.
- multi_predict: Predict and track multiple objects with a YOLO model.
- reset: Reset the BOTSORT tracker to its initial state.
-
- Examples:
- Initialize BOTSORT and process detections
- >>> bot_sort = BOTSORT(args, frame_rate=30)
-        >>> bot_sort.init_track(results, img)
- >>> bot_sort.multi_predict(tracks)
-
- Note:
- The class is designed to work with a YOLO object detection model and supports ReID only if enabled via args.
- """
-
- def __init__(self, args: Any, frame_rate: int = 30):
- """
- Initialize BOTSORT object with ReID module and GMC algorithm.
-
- Args:
- args (Any): Parsed command-line arguments containing tracking parameters.
- frame_rate (int): Frame rate of the video being processed.
-
- Examples:
- Initialize BOTSORT with command-line arguments and a specified frame rate:
- >>> args = parse_args()
- >>> bot_sort = BOTSORT(args, frame_rate=30)
- """
- super().__init__(args, frame_rate)
- self.gmc = GMC(method=args.gmc_method)
-
- # ReID module
- self.proximity_thresh = args.proximity_thresh
- self.appearance_thresh = args.appearance_thresh
- self.encoder = (
- (lambda feats, s: [f.cpu().numpy() for f in feats]) # native features do not require any model
- if args.with_reid and self.args.model == "auto"
- else ReID(args.model)
- if args.with_reid
- else None
- )
-
- def get_kalmanfilter(self) -> KalmanFilterXYWH:
- """Return an instance of KalmanFilterXYWH for predicting and updating object states in the tracking process."""
- return KalmanFilterXYWH()
-
- def init_track(self, results, img: np.ndarray | None = None) -> list[BOTrack]:
- """Initialize object tracks using detection bounding boxes, scores, class labels, and optional ReID features."""
- if len(results) == 0:
- return []
- bboxes = results.xywhr if hasattr(results, "xywhr") else results.xywh
- bboxes = np.concatenate([bboxes, np.arange(len(bboxes)).reshape(-1, 1)], axis=-1)
- if self.args.with_reid and self.encoder is not None:
- features_keep = self.encoder(img, bboxes)
- return [BOTrack(xywh, s, c, f) for (xywh, s, c, f) in zip(bboxes, results.conf, results.cls, features_keep)]
- else:
- return [BOTrack(xywh, s, c) for (xywh, s, c) in zip(bboxes, results.conf, results.cls)]
-
- def get_dists(self, tracks: list[BOTrack], detections: list[BOTrack]) -> np.ndarray:
- """Calculate distances between tracks and detections using IoU and optionally ReID embeddings."""
- dists = matching.iou_distance(tracks, detections)
- dists_mask = dists > (1 - self.proximity_thresh)
-
- if self.args.fuse_score:
- dists = matching.fuse_score(dists, detections)
-
- if self.args.with_reid and self.encoder is not None:
- emb_dists = matching.embedding_distance(tracks, detections) / 2.0
- emb_dists[emb_dists > (1 - self.appearance_thresh)] = 1.0
- emb_dists[dists_mask] = 1.0
- dists = np.minimum(dists, emb_dists)
- return dists
-
- def multi_predict(self, tracks: list[BOTrack]) -> None:
- """Predict the mean and covariance of multiple object tracks using a shared Kalman filter."""
- BOTrack.multi_predict(tracks)
-
- def reset(self) -> None:
- """Reset the BOTSORT tracker to its initial state, clearing all tracked objects and internal states."""
- super().reset()
- self.gmc.reset_params()
-
-
-class ReID:
- """YOLO model as encoder for re-identification."""
-
- def __init__(self, model: str):
- """
- Initialize encoder for re-identification.
-
- Args:
- model (str): Path to the YOLO model for re-identification.
- """
- from ultralytics import YOLO
-
- self.model = YOLO(model)
- self.model(embed=[len(self.model.model.model) - 2 if ".pt" in model else -1], verbose=False, save=False) # init
-
- def __call__(self, img: np.ndarray, dets: np.ndarray) -> list[np.ndarray]:
- """Extract embeddings for detected objects."""
- feats = self.model.predictor(
- [save_one_box(det, img, save=False) for det in xywh2xyxy(torch.from_numpy(dets[:, :4]))]
- )
- if len(feats) != dets.shape[0] and feats[0].shape[0] == dets.shape[0]:
- feats = feats[0] # batched prediction with non-PyTorch backend
- return [f.cpu().numpy() for f in feats]
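The get_dists method above fuses a spatial cue (IoU distance) with an appearance cue (ReID embedding distance), keeping the appearance term only where both the proximity and appearance gates pass. A minimal numpy sketch of that fusion, with placeholder threshold values standing in for args.proximity_thresh and args.appearance_thresh:

```python
import numpy as np


def fuse_iou_and_appearance(iou_dist: np.ndarray, emb_dist: np.ndarray,
                            proximity_thresh: float = 0.5,
                            appearance_thresh: float = 0.25) -> np.ndarray:
    """Sketch of BOT-SORT-style cost fusion on (num_tracks, num_dets) distance matrices."""
    emb = emb_dist / 2.0                           # scale embedding distance as in get_dists
    emb[emb > (1 - appearance_thresh)] = 1.0       # reject pairs that look too different
    emb[iou_dist > (1 - proximity_thresh)] = 1.0   # reject pairs that are spatially far apart
    return np.minimum(iou_dist, emb)               # keep the cheaper of the two cues


# Toy example: two tracks vs. two detections
iou = np.array([[0.2, 0.9], [0.8, 0.3]])
emb = np.array([[0.1, 0.9], [0.9, 0.2]])
print(fuse_iou_and_appearance(iou, emb))
```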
diff --git a/ultralytics/trackers/byte_tracker.py b/ultralytics/trackers/byte_tracker.py
deleted file mode 100644
index cdc7dcb..0000000
--- a/ultralytics/trackers/byte_tracker.py
+++ /dev/null
@@ -1,485 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from __future__ import annotations
-
-from typing import Any
-
-import numpy as np
-
-from ..utils import LOGGER
-from ..utils.ops import xywh2ltwh
-from .basetrack import BaseTrack, TrackState
-from .utils import matching
-from .utils.kalman_filter import KalmanFilterXYAH
-
-
-class STrack(BaseTrack):
- """
- Single object tracking representation that uses Kalman filtering for state estimation.
-
- This class is responsible for storing all the information regarding individual tracklets and performs state updates
- and predictions based on Kalman filter.
-
- Attributes:
- shared_kalman (KalmanFilterXYAH): Shared Kalman filter used across all STrack instances for prediction.
- _tlwh (np.ndarray): Private attribute to store top-left corner coordinates and width and height of bounding box.
- kalman_filter (KalmanFilterXYAH): Instance of Kalman filter used for this particular object track.
- mean (np.ndarray): Mean state estimate vector.
- covariance (np.ndarray): Covariance of state estimate.
- is_activated (bool): Boolean flag indicating if the track has been activated.
- score (float): Confidence score of the track.
- tracklet_len (int): Length of the tracklet.
- cls (Any): Class label for the object.
- idx (int): Index or identifier for the object.
- frame_id (int): Current frame ID.
- start_frame (int): Frame where the object was first detected.
- angle (float | None): Optional angle information for oriented bounding boxes.
-
- Methods:
- predict: Predict the next state of the object using Kalman filter.
- multi_predict: Predict the next states for multiple tracks.
- multi_gmc: Update multiple track states using a homography matrix.
- activate: Activate a new tracklet.
- re_activate: Reactivate a previously lost tracklet.
- update: Update the state of a matched track.
- convert_coords: Convert bounding box to x-y-aspect-height format.
- tlwh_to_xyah: Convert tlwh bounding box to xyah format.
-
- Examples:
- Initialize and activate a new track
- >>> track = STrack(xywh=[100, 200, 50, 80, 0], score=0.9, cls="person")
- >>> track.activate(kalman_filter=KalmanFilterXYAH(), frame_id=1)
- """
-
- shared_kalman = KalmanFilterXYAH()
-
- def __init__(self, xywh: list[float], score: float, cls: Any):
- """
- Initialize a new STrack instance.
-
- Args:
- xywh (list[float]): Bounding box coordinates and dimensions in the format (x, y, w, h, [a], idx), where
- (x, y) is the center, (w, h) are width and height, [a] is optional aspect ratio, and idx is the id.
- score (float): Confidence score of the detection.
- cls (Any): Class label for the detected object.
-
- Examples:
- >>> xywh = [100.0, 150.0, 50.0, 75.0, 1]
- >>> score = 0.9
- >>> cls = "person"
- >>> track = STrack(xywh, score, cls)
- """
- super().__init__()
- # xywh+idx or xywha+idx
- assert len(xywh) in {5, 6}, f"expected 5 or 6 values but got {len(xywh)}"
- self._tlwh = np.asarray(xywh2ltwh(xywh[:4]), dtype=np.float32)
- self.kalman_filter = None
- self.mean, self.covariance = None, None
- self.is_activated = False
-
- self.score = score
- self.tracklet_len = 0
- self.cls = cls
- self.idx = xywh[-1]
- self.angle = xywh[4] if len(xywh) == 6 else None
-
- def predict(self):
- """Predict the next state (mean and covariance) of the object using the Kalman filter."""
- mean_state = self.mean.copy()
- if self.state != TrackState.Tracked:
- mean_state[7] = 0
- self.mean, self.covariance = self.kalman_filter.predict(mean_state, self.covariance)
-
- @staticmethod
- def multi_predict(stracks: list[STrack]):
- """Perform multi-object predictive tracking using Kalman filter for the provided list of STrack instances."""
- if len(stracks) <= 0:
- return
- multi_mean = np.asarray([st.mean.copy() for st in stracks])
- multi_covariance = np.asarray([st.covariance for st in stracks])
- for i, st in enumerate(stracks):
- if st.state != TrackState.Tracked:
- multi_mean[i][7] = 0
- multi_mean, multi_covariance = STrack.shared_kalman.multi_predict(multi_mean, multi_covariance)
- for i, (mean, cov) in enumerate(zip(multi_mean, multi_covariance)):
- stracks[i].mean = mean
- stracks[i].covariance = cov
-
- @staticmethod
- def multi_gmc(stracks: list[STrack], H: np.ndarray = np.eye(2, 3)):
- """Update state tracks positions and covariances using a homography matrix for multiple tracks."""
- if stracks:
- multi_mean = np.asarray([st.mean.copy() for st in stracks])
- multi_covariance = np.asarray([st.covariance for st in stracks])
-
- R = H[:2, :2]
- R8x8 = np.kron(np.eye(4, dtype=float), R)
- t = H[:2, 2]
-
- for i, (mean, cov) in enumerate(zip(multi_mean, multi_covariance)):
- mean = R8x8.dot(mean)
- mean[:2] += t
- cov = R8x8.dot(cov).dot(R8x8.transpose())
-
- stracks[i].mean = mean
- stracks[i].covariance = cov
-
- def activate(self, kalman_filter: KalmanFilterXYAH, frame_id: int):
- """Activate a new tracklet using the provided Kalman filter and initialize its state and covariance."""
- self.kalman_filter = kalman_filter
- self.track_id = self.next_id()
- self.mean, self.covariance = self.kalman_filter.initiate(self.convert_coords(self._tlwh))
-
- self.tracklet_len = 0
- self.state = TrackState.Tracked
- if frame_id == 1:
- self.is_activated = True
- self.frame_id = frame_id
- self.start_frame = frame_id
-
- def re_activate(self, new_track: STrack, frame_id: int, new_id: bool = False):
- """Reactivate a previously lost track using new detection data and update its state and attributes."""
- self.mean, self.covariance = self.kalman_filter.update(
- self.mean, self.covariance, self.convert_coords(new_track.tlwh)
- )
- self.tracklet_len = 0
- self.state = TrackState.Tracked
- self.is_activated = True
- self.frame_id = frame_id
- if new_id:
- self.track_id = self.next_id()
- self.score = new_track.score
- self.cls = new_track.cls
- self.angle = new_track.angle
- self.idx = new_track.idx
-
- def update(self, new_track: STrack, frame_id: int):
- """
- Update the state of a matched track.
-
- Args:
- new_track (STrack): The new track containing updated information.
- frame_id (int): The ID of the current frame.
-
- Examples:
- Update the state of a track with new detection information
- >>> track = STrack([100, 200, 50, 80, 1], 0.9, "person")
- >>> new_track = STrack([105, 205, 55, 85, 1], 0.95, "person")
- >>> track.update(new_track, 2)
- """
- self.frame_id = frame_id
- self.tracklet_len += 1
-
- new_tlwh = new_track.tlwh
- self.mean, self.covariance = self.kalman_filter.update(
- self.mean, self.covariance, self.convert_coords(new_tlwh)
- )
- self.state = TrackState.Tracked
- self.is_activated = True
-
- self.score = new_track.score
- self.cls = new_track.cls
- self.angle = new_track.angle
- self.idx = new_track.idx
-
- def convert_coords(self, tlwh: np.ndarray) -> np.ndarray:
- """Convert a bounding box's top-left-width-height format to its x-y-aspect-height equivalent."""
- return self.tlwh_to_xyah(tlwh)
-
- @property
- def tlwh(self) -> np.ndarray:
- """Get the bounding box in top-left-width-height format from the current state estimate."""
- if self.mean is None:
- return self._tlwh.copy()
- ret = self.mean[:4].copy()
- ret[2] *= ret[3]
- ret[:2] -= ret[2:] / 2
- return ret
-
- @property
- def xyxy(self) -> np.ndarray:
- """Convert bounding box from (top left x, top left y, width, height) to (min x, min y, max x, max y) format."""
- ret = self.tlwh.copy()
- ret[2:] += ret[:2]
- return ret
-
- @staticmethod
- def tlwh_to_xyah(tlwh: np.ndarray) -> np.ndarray:
- """Convert bounding box from tlwh format to center-x-center-y-aspect-height (xyah) format."""
- ret = np.asarray(tlwh).copy()
- ret[:2] += ret[2:] / 2
- ret[2] /= ret[3]
- return ret
-
- @property
- def xywh(self) -> np.ndarray:
- """Get the current position of the bounding box in (center x, center y, width, height) format."""
- ret = np.asarray(self.tlwh).copy()
- ret[:2] += ret[2:] / 2
- return ret
-
- @property
- def xywha(self) -> np.ndarray:
- """Get position in (center x, center y, width, height, angle) format, warning if angle is missing."""
- if self.angle is None:
- LOGGER.warning("`angle` attr not found, returning `xywh` instead.")
- return self.xywh
- return np.concatenate([self.xywh, self.angle[None]])
-
- @property
- def result(self) -> list[float]:
- """Get the current tracking results in the appropriate bounding box format."""
- coords = self.xyxy if self.angle is None else self.xywha
- return coords.tolist() + [self.track_id, self.score, self.cls, self.idx]
-
- def __repr__(self) -> str:
- """Return a string representation of the STrack object including start frame, end frame, and track ID."""
- return f"OT_{self.track_id}_({self.start_frame}-{self.end_frame})"
-
-
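A single STrack moves through activate → predict → update across consecutive frames. A minimal lifecycle sketch, assuming the ultralytics.trackers modules shown in this diff are importable; the coordinate and score values are arbitrary:

```python
import numpy as np
from ultralytics.trackers.byte_tracker import STrack
from ultralytics.trackers.utils.kalman_filter import KalmanFilterXYAH

det = STrack([100.0, 200.0, 50.0, 80.0, 0], score=0.9, cls=0)  # (x, y, w, h, idx)
det.activate(KalmanFilterXYAH(), frame_id=1)                    # start a new tracklet on frame 1

new_det = STrack([104.0, 202.0, 50.0, 80.0, 0], score=0.95, cls=0)
det.predict()                                                   # constant-velocity prediction
det.update(new_det, frame_id=2)                                 # correct with the matched detection
print(det.track_id, det.xyxy)                                   # assigned track ID and current box
```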
-class BYTETracker:
- """
- BYTETracker: A tracking algorithm built on top of YOLOv8 for object detection and tracking.
-
- This class encapsulates the functionality for initializing, updating, and managing the tracks for detected objects in a
- video sequence. It maintains the state of tracked, lost, and removed tracks over frames, utilizes Kalman filtering for
- predicting the new object locations, and performs data association.
-
- Attributes:
- tracked_stracks (list[STrack]): List of successfully activated tracks.
- lost_stracks (list[STrack]): List of lost tracks.
- removed_stracks (list[STrack]): List of removed tracks.
- frame_id (int): The current frame ID.
- args (Namespace): Command-line arguments.
- max_time_lost (int): Maximum number of frames a track may remain 'lost' before it is removed.
- kalman_filter (KalmanFilterXYAH): Kalman Filter object.
-
- Methods:
- update: Update object tracker with new detections.
- get_kalmanfilter: Return a Kalman filter object for tracking bounding boxes.
- init_track: Initialize object tracking with detections.
- get_dists: Calculate the distance between tracks and detections.
- multi_predict: Predict the location of tracks.
- reset_id: Reset the ID counter of STrack.
- reset: Reset the tracker by clearing all tracks.
- joint_stracks: Combine two lists of stracks.
- sub_stracks: Filter out the stracks present in the second list from the first list.
- remove_duplicate_stracks: Remove duplicate stracks based on IoU.
-
- Examples:
- Initialize BYTETracker and update with detection results
- >>> tracker = BYTETracker(args, frame_rate=30)
- >>> results = yolo_model.detect(image)
- >>> tracked_objects = tracker.update(results)
- """
-
- def __init__(self, args, frame_rate: int = 30):
- """
- Initialize a BYTETracker instance for object tracking.
-
- Args:
- args (Namespace): Command-line arguments containing tracking parameters.
- frame_rate (int): Frame rate of the video sequence.
-
- Examples:
- Initialize BYTETracker with command-line arguments and a frame rate of 30
- >>> args = Namespace(track_buffer=30)
- >>> tracker = BYTETracker(args, frame_rate=30)
- """
- self.tracked_stracks = [] # type: list[STrack]
- self.lost_stracks = [] # type: list[STrack]
- self.removed_stracks = [] # type: list[STrack]
-
- self.frame_id = 0
- self.args = args
- self.max_time_lost = int(frame_rate / 30.0 * args.track_buffer)
- self.kalman_filter = self.get_kalmanfilter()
- self.reset_id()
-
- def update(self, results, img: np.ndarray | None = None, feats: np.ndarray | None = None) -> np.ndarray:
- """Update the tracker with new detections and return the current list of tracked objects."""
- self.frame_id += 1
- activated_stracks = []
- refind_stracks = []
- lost_stracks = []
- removed_stracks = []
-
- scores = results.conf
- remain_inds = scores >= self.args.track_high_thresh
- inds_low = scores > self.args.track_low_thresh
- inds_high = scores < self.args.track_high_thresh
-
- inds_second = inds_low & inds_high
- results_second = results[inds_second]
- results = results[remain_inds]
- feats_keep = feats_second = img
- if feats is not None and len(feats):
- feats_keep = feats[remain_inds]
- feats_second = feats[inds_second]
-
- detections = self.init_track(results, feats_keep)
- # Add newly detected tracklets to tracked_stracks
- unconfirmed = []
- tracked_stracks = [] # type: list[STrack]
- for track in self.tracked_stracks:
- if not track.is_activated:
- unconfirmed.append(track)
- else:
- tracked_stracks.append(track)
- # Step 2: First association, with high score detection boxes
- strack_pool = self.joint_stracks(tracked_stracks, self.lost_stracks)
- # Predict the current location with KF
- self.multi_predict(strack_pool)
- if hasattr(self, "gmc") and img is not None:
- # use try-except here to bypass errors from gmc module
- try:
- warp = self.gmc.apply(img, results.xyxy)
- except Exception:
- warp = np.eye(2, 3)
- STrack.multi_gmc(strack_pool, warp)
- STrack.multi_gmc(unconfirmed, warp)
-
- dists = self.get_dists(strack_pool, detections)
- matches, u_track, u_detection = matching.linear_assignment(dists, thresh=self.args.match_thresh)
-
- for itracked, idet in matches:
- track = strack_pool[itracked]
- det = detections[idet]
- if track.state == TrackState.Tracked:
- track.update(det, self.frame_id)
- activated_stracks.append(track)
- else:
- track.re_activate(det, self.frame_id, new_id=False)
- refind_stracks.append(track)
- # Step 3: Second association, matching remaining tracked stracks with low score detection boxes
- detections_second = self.init_track(results_second, feats_second)
- r_tracked_stracks = [strack_pool[i] for i in u_track if strack_pool[i].state == TrackState.Tracked]
- # TODO
- dists = matching.iou_distance(r_tracked_stracks, detections_second)
- matches, u_track, u_detection_second = matching.linear_assignment(dists, thresh=0.5)
- for itracked, idet in matches:
- track = r_tracked_stracks[itracked]
- det = detections_second[idet]
- if track.state == TrackState.Tracked:
- track.update(det, self.frame_id)
- activated_stracks.append(track)
- else:
- track.re_activate(det, self.frame_id, new_id=False)
- refind_stracks.append(track)
-
- for it in u_track:
- track = r_tracked_stracks[it]
- if track.state != TrackState.Lost:
- track.mark_lost()
- lost_stracks.append(track)
- # Deal with unconfirmed tracks, usually tracks with only one beginning frame
- detections = [detections[i] for i in u_detection]
- dists = self.get_dists(unconfirmed, detections)
- matches, u_unconfirmed, u_detection = matching.linear_assignment(dists, thresh=0.7)
- for itracked, idet in matches:
- unconfirmed[itracked].update(detections[idet], self.frame_id)
- activated_stracks.append(unconfirmed[itracked])
- for it in u_unconfirmed:
- track = unconfirmed[it]
- track.mark_removed()
- removed_stracks.append(track)
- # Step 4: Init new stracks
- for inew in u_detection:
- track = detections[inew]
- if track.score < self.args.new_track_thresh:
- continue
- track.activate(self.kalman_filter, self.frame_id)
- activated_stracks.append(track)
- # Step 5: Update state
- for track in self.lost_stracks:
- if self.frame_id - track.end_frame > self.max_time_lost:
- track.mark_removed()
- removed_stracks.append(track)
-
- self.tracked_stracks = [t for t in self.tracked_stracks if t.state == TrackState.Tracked]
- self.tracked_stracks = self.joint_stracks(self.tracked_stracks, activated_stracks)
- self.tracked_stracks = self.joint_stracks(self.tracked_stracks, refind_stracks)
- self.lost_stracks = self.sub_stracks(self.lost_stracks, self.tracked_stracks)
- self.lost_stracks.extend(lost_stracks)
- self.lost_stracks = self.sub_stracks(self.lost_stracks, self.removed_stracks)
- self.tracked_stracks, self.lost_stracks = self.remove_duplicate_stracks(self.tracked_stracks, self.lost_stracks)
- self.removed_stracks.extend(removed_stracks)
- if len(self.removed_stracks) > 1000:
- self.removed_stracks = self.removed_stracks[-999:]  # clip removed stracks to 1000 maximum
-
- return np.asarray([x.result for x in self.tracked_stracks if x.is_activated], dtype=np.float32)
-
- def get_kalmanfilter(self) -> KalmanFilterXYAH:
- """Return a Kalman filter object for tracking bounding boxes using KalmanFilterXYAH."""
- return KalmanFilterXYAH()
-
- def init_track(self, results, img: np.ndarray | None = None) -> list[STrack]:
- """Initialize object tracking with given detections, scores, and class labels using the STrack algorithm."""
- if len(results) == 0:
- return []
- bboxes = results.xywhr if hasattr(results, "xywhr") else results.xywh
- bboxes = np.concatenate([bboxes, np.arange(len(bboxes)).reshape(-1, 1)], axis=-1)
- return [STrack(xywh, s, c) for (xywh, s, c) in zip(bboxes, results.conf, results.cls)]
-
- def get_dists(self, tracks: list[STrack], detections: list[STrack]) -> np.ndarray:
- """Calculate the distance between tracks and detections using IoU and optionally fuse scores."""
- dists = matching.iou_distance(tracks, detections)
- if self.args.fuse_score:
- dists = matching.fuse_score(dists, detections)
- return dists
-
- def multi_predict(self, tracks: list[STrack]):
- """Predict the next states for multiple tracks using Kalman filter."""
- STrack.multi_predict(tracks)
-
- @staticmethod
- def reset_id():
- """Reset the ID counter for STrack instances to ensure unique track IDs across tracking sessions."""
- STrack.reset_id()
-
- def reset(self):
- """Reset the tracker by clearing all tracked, lost, and removed tracks and reinitializing the Kalman filter."""
- self.tracked_stracks = [] # type: list[STrack]
- self.lost_stracks = [] # type: list[STrack]
- self.removed_stracks = [] # type: list[STrack]
- self.frame_id = 0
- self.kalman_filter = self.get_kalmanfilter()
- self.reset_id()
-
- @staticmethod
- def joint_stracks(tlista: list[STrack], tlistb: list[STrack]) -> list[STrack]:
- """Combine two lists of STrack objects into a single list, ensuring no duplicates based on track IDs."""
- exists = {}
- res = []
- for t in tlista:
- exists[t.track_id] = 1
- res.append(t)
- for t in tlistb:
- tid = t.track_id
- if not exists.get(tid, 0):
- exists[tid] = 1
- res.append(t)
- return res
-
- @staticmethod
- def sub_stracks(tlista: list[STrack], tlistb: list[STrack]) -> list[STrack]:
- """Filter out the stracks present in the second list from the first list."""
- track_ids_b = {t.track_id for t in tlistb}
- return [t for t in tlista if t.track_id not in track_ids_b]
-
- @staticmethod
- def remove_duplicate_stracks(stracksa: list[STrack], stracksb: list[STrack]) -> tuple[list[STrack], list[STrack]]:
- """Remove duplicate stracks from two lists based on Intersection over Union (IoU) distance."""
- pdist = matching.iou_distance(stracksa, stracksb)
- pairs = np.where(pdist < 0.15)
- dupa, dupb = [], []
- for p, q in zip(*pairs):
- timep = stracksa[p].frame_id - stracksa[p].start_frame
- timeq = stracksb[q].frame_id - stracksb[q].start_frame
- if timep > timeq:
- dupb.append(q)
- else:
- dupa.append(p)
- resa = [t for i, t in enumerate(stracksa) if i not in dupa]
- resb = [t for i, t in enumerate(stracksb) if i not in dupb]
- return resa, resb
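The core BYTE idea in update() above is to split detections by confidence and associate in two passes: high-score boxes first, then the low-score leftovers against still-unmatched tracks. A small numpy sketch of the split, with made-up threshold values standing in for args.track_high_thresh and args.track_low_thresh:

```python
import numpy as np

# Hypothetical thresholds; the real values come from the tracker config
TRACK_HIGH_THRESH, TRACK_LOW_THRESH = 0.5, 0.1

scores = np.array([0.92, 0.45, 0.07, 0.60, 0.15])
first_pass = scores >= TRACK_HIGH_THRESH                                    # confident detections, matched first
second_pass = (scores > TRACK_LOW_THRESH) & (scores < TRACK_HIGH_THRESH)    # low-score leftovers, matched second

print(np.flatnonzero(first_pass))   # [0 3]
print(np.flatnonzero(second_pass))  # [1 4]
```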
diff --git a/ultralytics/trackers/track.py b/ultralytics/trackers/track.py
deleted file mode 100644
index 8720f73..0000000
--- a/ultralytics/trackers/track.py
+++ /dev/null
@@ -1,119 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from functools import partial
-from pathlib import Path
-
-import torch
-
-from ultralytics.utils import YAML, IterableSimpleNamespace
-from ultralytics.utils.checks import check_yaml
-
-from .bot_sort import BOTSORT
-from .byte_tracker import BYTETracker
-
-# A mapping of tracker types to corresponding tracker classes
-TRACKER_MAP = {"bytetrack": BYTETracker, "botsort": BOTSORT}
-
-
-def on_predict_start(predictor: object, persist: bool = False) -> None:
- """
- Initialize trackers for object tracking during prediction.
-
- Args:
- predictor (ultralytics.engine.predictor.BasePredictor): The predictor object to initialize trackers for.
- persist (bool, optional): Whether to persist the trackers if they already exist.
-
- Examples:
- Initialize trackers for a predictor object
- >>> predictor = SomePredictorClass()
- >>> on_predict_start(predictor, persist=True)
- """
- if predictor.args.task == "classify":
- raise ValueError("❌ Classification doesn't support 'mode=track'")
-
- if hasattr(predictor, "trackers") and persist:
- return
-
- tracker = check_yaml(predictor.args.tracker)
- cfg = IterableSimpleNamespace(**YAML.load(tracker))
-
- if cfg.tracker_type not in {"bytetrack", "botsort"}:
- raise AssertionError(f"Only 'bytetrack' and 'botsort' are supported for now, but got '{cfg.tracker_type}'")
-
- predictor._feats = None # reset in case used earlier
- if hasattr(predictor, "_hook"):
- predictor._hook.remove()
- if cfg.tracker_type == "botsort" and cfg.with_reid and cfg.model == "auto":
- from ultralytics.nn.modules.head import Detect
-
- if not (
- isinstance(predictor.model.model, torch.nn.Module)
- and isinstance(predictor.model.model.model[-1], Detect)
- and not predictor.model.model.model[-1].end2end
- ):
- cfg.model = "yolo11n-cls.pt"
- else:
- # Register hook to extract input of Detect layer
- def pre_hook(module, input):
- predictor._feats = list(input[0]) # unroll to new list to avoid mutation in forward
-
- predictor._hook = predictor.model.model.model[-1].register_forward_pre_hook(pre_hook)
-
- trackers = []
- for _ in range(predictor.dataset.bs):
- tracker = TRACKER_MAP[cfg.tracker_type](args=cfg, frame_rate=30)
- trackers.append(tracker)
- if predictor.dataset.mode != "stream": # only need one tracker for other modes
- break
- predictor.trackers = trackers
- predictor.vid_path = [None] * predictor.dataset.bs # for determining when to reset tracker on new video
-
-
-def on_predict_postprocess_end(predictor: object, persist: bool = False) -> None:
- """
- Postprocess detected boxes and update with object tracking.
-
- Args:
- predictor (object): The predictor object containing the predictions.
- persist (bool, optional): Whether to persist the trackers if they already exist.
-
- Examples:
- Postprocess predictions and update with tracking
- >>> predictor = YourPredictorClass()
- >>> on_predict_postprocess_end(predictor, persist=True)
- """
- is_obb = predictor.args.task == "obb"
- is_stream = predictor.dataset.mode == "stream"
- for i, result in enumerate(predictor.results):
- tracker = predictor.trackers[i if is_stream else 0]
- vid_path = predictor.save_dir / Path(result.path).name
- if not persist and predictor.vid_path[i if is_stream else 0] != vid_path:
- tracker.reset()
- predictor.vid_path[i if is_stream else 0] = vid_path
-
- det = (result.obb if is_obb else result.boxes).cpu().numpy()
- tracks = tracker.update(det, result.orig_img, getattr(result, "feats", None))
- if len(tracks) == 0:
- continue
- idx = tracks[:, -1].astype(int)
- predictor.results[i] = result[idx]
-
- update_args = {"obb" if is_obb else "boxes": torch.as_tensor(tracks[:, :-1])}
- predictor.results[i].update(**update_args)
-
-
-def register_tracker(model: object, persist: bool) -> None:
- """
- Register tracking callbacks to the model for object tracking during prediction.
-
- Args:
- model (object): The model object to register tracking callbacks for.
- persist (bool): Whether to persist the trackers if they already exist.
-
- Examples:
- Register tracking callbacks to a YOLO model
- >>> model = YOLOModel()
- >>> register_tracker(model, persist=True)
- """
- model.add_callback("on_predict_start", partial(on_predict_start, persist=persist))
- model.add_callback("on_predict_postprocess_end", partial(on_predict_postprocess_end, persist=persist))
diff --git a/ultralytics/trackers/utils/__init__.py b/ultralytics/trackers/utils/__init__.py
deleted file mode 100644
index 77a19dc..0000000
--- a/ultralytics/trackers/utils/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
diff --git a/ultralytics/trackers/utils/gmc.py b/ultralytics/trackers/utils/gmc.py
deleted file mode 100644
index 0eab5f2..0000000
--- a/ultralytics/trackers/utils/gmc.py
+++ /dev/null
@@ -1,350 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from __future__ import annotations
-
-import copy
-
-import cv2
-import numpy as np
-
-from ultralytics.utils import LOGGER
-
-
-class GMC:
- """
- Generalized Motion Compensation (GMC) class for tracking and object detection in video frames.
-
- This class provides methods for tracking and detecting objects based on several tracking algorithms including ORB,
- SIFT, ECC, and Sparse Optical Flow. It also supports downscaling of frames for computational efficiency.
-
- Attributes:
- method (str): The tracking method to use. Options include 'orb', 'sift', 'ecc', 'sparseOptFlow', 'none'.
- downscale (int): Factor by which to downscale the frames for processing.
- prevFrame (np.ndarray): Previous frame for tracking.
- prevKeyPoints (list): Keypoints from the previous frame.
- prevDescriptors (np.ndarray): Descriptors from the previous frame.
- initializedFirstFrame (bool): Flag indicating if the first frame has been processed.
-
- Methods:
- apply: Apply the chosen method to a raw frame and optionally use provided detections.
- apply_ecc: Apply the ECC algorithm to a raw frame.
- apply_features: Apply feature-based methods like ORB or SIFT to a raw frame.
- apply_sparseoptflow: Apply the Sparse Optical Flow method to a raw frame.
- reset_params: Reset the internal parameters of the GMC object.
-
- Examples:
- Create a GMC object and apply it to a frame
- >>> gmc = GMC(method="sparseOptFlow", downscale=2)
- >>> frame = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)
- >>> warp_matrix = gmc.apply(frame)
- >>> print(warp_matrix.shape)
- (2, 3)
- """
-
- def __init__(self, method: str = "sparseOptFlow", downscale: int = 2) -> None:
- """
- Initialize a Generalized Motion Compensation (GMC) object with tracking method and downscale factor.
-
- Args:
- method (str): The tracking method to use. Options include 'orb', 'sift', 'ecc', 'sparseOptFlow', 'none'.
- downscale (int): Downscale factor for processing frames.
-
- Examples:
- Initialize a GMC object with the 'sparseOptFlow' method and a downscale factor of 2
- >>> gmc = GMC(method="sparseOptFlow", downscale=2)
- """
- super().__init__()
-
- self.method = method
- self.downscale = max(1, downscale)
-
- if self.method == "orb":
- self.detector = cv2.FastFeatureDetector_create(20)
- self.extractor = cv2.ORB_create()
- self.matcher = cv2.BFMatcher(cv2.NORM_HAMMING)
-
- elif self.method == "sift":
- self.detector = cv2.SIFT_create(nOctaveLayers=3, contrastThreshold=0.02, edgeThreshold=20)
- self.extractor = cv2.SIFT_create(nOctaveLayers=3, contrastThreshold=0.02, edgeThreshold=20)
- self.matcher = cv2.BFMatcher(cv2.NORM_L2)
-
- elif self.method == "ecc":
- number_of_iterations = 5000
- termination_eps = 1e-6
- self.warp_mode = cv2.MOTION_EUCLIDEAN
- self.criteria = (cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, number_of_iterations, termination_eps)
-
- elif self.method == "sparseOptFlow":
- self.feature_params = dict(
- maxCorners=1000, qualityLevel=0.01, minDistance=1, blockSize=3, useHarrisDetector=False, k=0.04
- )
-
- elif self.method in {"none", "None", None}:
- self.method = None
- else:
- raise ValueError(f"Unknown GMC method: {method}")
-
- self.prevFrame = None
- self.prevKeyPoints = None
- self.prevDescriptors = None
- self.initializedFirstFrame = False
-
- def apply(self, raw_frame: np.ndarray, detections: list | None = None) -> np.ndarray:
- """
- Apply the selected motion compensation method to a raw frame and estimate the frame-to-frame transform.
-
- Args:
- raw_frame (np.ndarray): The raw frame to be processed, with shape (H, W, C).
- detections (list, optional): List of detections to be used in the processing.
-
- Returns:
- (np.ndarray): Transformation matrix with shape (2, 3).
-
- Examples:
- >>> gmc = GMC(method="sparseOptFlow")
- >>> raw_frame = np.random.rand(480, 640, 3)
- >>> transformation_matrix = gmc.apply(raw_frame)
- >>> print(transformation_matrix.shape)
- (2, 3)
- """
- if self.method in {"orb", "sift"}:
- return self.apply_features(raw_frame, detections)
- elif self.method == "ecc":
- return self.apply_ecc(raw_frame)
- elif self.method == "sparseOptFlow":
- return self.apply_sparseoptflow(raw_frame)
- else:
- return np.eye(2, 3)
-
- def apply_ecc(self, raw_frame: np.ndarray) -> np.ndarray:
- """
- Apply the ECC (Enhanced Correlation Coefficient) algorithm to a raw frame for motion compensation.
-
- Args:
- raw_frame (np.ndarray): The raw frame to be processed, with shape (H, W, C).
-
- Returns:
- (np.ndarray): Transformation matrix with shape (2, 3).
-
- Examples:
- >>> gmc = GMC(method="ecc")
- >>> frame = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)
- >>> warp_matrix = gmc.apply_ecc(frame)  # first frame initializes state and returns identity
- >>> print(warp_matrix)
- [[1. 0. 0.]
- [0. 1. 0.]]
- """
- height, width, c = raw_frame.shape
- frame = cv2.cvtColor(raw_frame, cv2.COLOR_BGR2GRAY) if c == 3 else raw_frame
- H = np.eye(2, 3, dtype=np.float32)
-
- # Downscale image for computational efficiency
- if self.downscale > 1.0:
- frame = cv2.GaussianBlur(frame, (3, 3), 1.5)
- frame = cv2.resize(frame, (width // self.downscale, height // self.downscale))
-
- # Handle first frame initialization
- if not self.initializedFirstFrame:
- self.prevFrame = frame.copy()
- self.initializedFirstFrame = True
- return H
-
- # Run the ECC algorithm to find transformation matrix
- try:
- (_, H) = cv2.findTransformECC(self.prevFrame, frame, H, self.warp_mode, self.criteria, None, 1)
- except Exception as e:
- LOGGER.warning(f"find transform failed. Set warp as identity {e}")
-
- return H
-
- def apply_features(self, raw_frame: np.ndarray, detections: list | None = None) -> np.ndarray:
- """
- Apply feature-based methods like ORB or SIFT to a raw frame.
-
- Args:
- raw_frame (np.ndarray): The raw frame to be processed, with shape (H, W, C).
- detections (list, optional): List of detections to be used in the processing.
-
- Returns:
- (np.ndarray): Transformation matrix with shape (2, 3).
-
- Examples:
- >>> gmc = GMC(method="orb")
- >>> raw_frame = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)
- >>> transformation_matrix = gmc.apply_features(raw_frame)
- >>> print(transformation_matrix.shape)
- (2, 3)
- """
- height, width, c = raw_frame.shape
- frame = cv2.cvtColor(raw_frame, cv2.COLOR_BGR2GRAY) if c == 3 else raw_frame
- H = np.eye(2, 3)
-
- # Downscale image for computational efficiency
- if self.downscale > 1.0:
- frame = cv2.resize(frame, (width // self.downscale, height // self.downscale))
- width = width // self.downscale
- height = height // self.downscale
-
- # Create mask for keypoint detection, excluding border regions
- mask = np.zeros_like(frame)
- mask[int(0.02 * height) : int(0.98 * height), int(0.02 * width) : int(0.98 * width)] = 255
-
- # Exclude detection regions from mask to avoid tracking detected objects
- if detections is not None:
- for det in detections:
- tlbr = (det[:4] / self.downscale).astype(np.int_)
- mask[tlbr[1] : tlbr[3], tlbr[0] : tlbr[2]] = 0
-
- # Find keypoints and compute descriptors
- keypoints = self.detector.detect(frame, mask)
- keypoints, descriptors = self.extractor.compute(frame, keypoints)
-
- # Handle first frame initialization
- if not self.initializedFirstFrame:
- self.prevFrame = frame.copy()
- self.prevKeyPoints = copy.copy(keypoints)
- self.prevDescriptors = copy.copy(descriptors)
- self.initializedFirstFrame = True
- return H
-
- # Match descriptors between previous and current frame
- knnMatches = self.matcher.knnMatch(self.prevDescriptors, descriptors, 2)
-
- # Filter matches based on spatial distance constraints
- matches = []
- spatialDistances = []
- maxSpatialDistance = 0.25 * np.array([width, height])
-
- # Handle empty matches case
- if len(knnMatches) == 0:
- self.prevFrame = frame.copy()
- self.prevKeyPoints = copy.copy(keypoints)
- self.prevDescriptors = copy.copy(descriptors)
- return H
-
- # Apply Lowe's ratio test and spatial distance filtering
- for m, n in knnMatches:
- if m.distance < 0.9 * n.distance:
- prevKeyPointLocation = self.prevKeyPoints[m.queryIdx].pt
- currKeyPointLocation = keypoints[m.trainIdx].pt
-
- spatialDistance = (
- prevKeyPointLocation[0] - currKeyPointLocation[0],
- prevKeyPointLocation[1] - currKeyPointLocation[1],
- )
-
- if (np.abs(spatialDistance[0]) < maxSpatialDistance[0]) and (
- np.abs(spatialDistance[1]) < maxSpatialDistance[1]
- ):
- spatialDistances.append(spatialDistance)
- matches.append(m)
-
- # Filter outliers using statistical analysis
- meanSpatialDistances = np.mean(spatialDistances, 0)
- stdSpatialDistances = np.std(spatialDistances, 0)
- inliers = (spatialDistances - meanSpatialDistances) < 2.5 * stdSpatialDistances
-
- # Extract good matches and corresponding points
- goodMatches = []
- prevPoints = []
- currPoints = []
- for i in range(len(matches)):
- if inliers[i, 0] and inliers[i, 1]:
- goodMatches.append(matches[i])
- prevPoints.append(self.prevKeyPoints[matches[i].queryIdx].pt)
- currPoints.append(keypoints[matches[i].trainIdx].pt)
-
- prevPoints = np.array(prevPoints)
- currPoints = np.array(currPoints)
-
- # Estimate transformation matrix using RANSAC
- if prevPoints.shape[0] > 4:
- H, inliers = cv2.estimateAffinePartial2D(prevPoints, currPoints, cv2.RANSAC)
-
- # Scale translation components back to original resolution
- if self.downscale > 1.0:
- H[0, 2] *= self.downscale
- H[1, 2] *= self.downscale
- else:
- LOGGER.warning("not enough matching points")
-
- # Store current frame data for next iteration
- self.prevFrame = frame.copy()
- self.prevKeyPoints = copy.copy(keypoints)
- self.prevDescriptors = copy.copy(descriptors)
-
- return H
-
- def apply_sparseoptflow(self, raw_frame: np.ndarray) -> np.ndarray:
- """
- Apply Sparse Optical Flow method to a raw frame.
-
- Args:
- raw_frame (np.ndarray): The raw frame to be processed, with shape (H, W, C).
-
- Returns:
- (np.ndarray): Transformation matrix with shape (2, 3).
-
- Examples:
- >>> gmc = GMC()
- >>> frame = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)
- >>> warp_matrix = gmc.apply_sparseoptflow(frame)  # first frame initializes state and returns identity
- >>> print(warp_matrix)
- [[1. 0. 0.]
- [0. 1. 0.]]
- """
- height, width, c = raw_frame.shape
- frame = cv2.cvtColor(raw_frame, cv2.COLOR_BGR2GRAY) if c == 3 else raw_frame
- H = np.eye(2, 3)
-
- # Downscale image for computational efficiency
- if self.downscale > 1.0:
- frame = cv2.resize(frame, (width // self.downscale, height // self.downscale))
-
- # Find good features to track
- keypoints = cv2.goodFeaturesToTrack(frame, mask=None, **self.feature_params)
-
- # Handle first frame initialization
- if not self.initializedFirstFrame or self.prevKeyPoints is None:
- self.prevFrame = frame.copy()
- self.prevKeyPoints = copy.copy(keypoints)
- self.initializedFirstFrame = True
- return H
-
- # Calculate optical flow using Lucas-Kanade method
- matchedKeypoints, status, _ = cv2.calcOpticalFlowPyrLK(self.prevFrame, frame, self.prevKeyPoints, None)
-
- # Extract successfully tracked points
- prevPoints = []
- currPoints = []
-
- for i in range(len(status)):
- if status[i]:
- prevPoints.append(self.prevKeyPoints[i])
- currPoints.append(matchedKeypoints[i])
-
- prevPoints = np.array(prevPoints)
- currPoints = np.array(currPoints)
-
- # Estimate transformation matrix using RANSAC
- if (prevPoints.shape[0] > 4) and (prevPoints.shape[0] == currPoints.shape[0]):
- H, _ = cv2.estimateAffinePartial2D(prevPoints, currPoints, cv2.RANSAC)
-
- # Scale translation components back to original resolution
- if self.downscale > 1.0:
- H[0, 2] *= self.downscale
- H[1, 2] *= self.downscale
- else:
- LOGGER.warning("not enough matching points")
-
- # Store current frame data for next iteration
- self.prevFrame = frame.copy()
- self.prevKeyPoints = copy.copy(keypoints)
-
- return H
-
- def reset_params(self) -> None:
- """Reset the internal parameters including previous frame, keypoints, and descriptors."""
- self.prevFrame = None
- self.prevKeyPoints = None
- self.prevDescriptors = None
- self.initializedFirstFrame = False
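The 2x3 matrices returned by apply() are consumed elsewhere (for example by STrack.multi_gmc) to move track states along with the camera. A small sketch of applying such an affine warp to xyxy boxes, using a made-up pure-translation matrix:

```python
import numpy as np


def warp_boxes(xyxy: np.ndarray, H: np.ndarray) -> np.ndarray:
    """Sketch: apply a 2x3 affine warp (as returned by GMC.apply) to (N, 4) xyxy boxes."""
    R, t = H[:2, :2], H[:2, 2]
    pts = xyxy.reshape(-1, 2)      # treat box corners as points
    warped = pts @ R.T + t         # rotate/scale, then translate
    return warped.reshape(-1, 4)


H = np.array([[1.0, 0.0, 5.0],     # pure translation: camera panned 5 px right, 2 px down
              [0.0, 1.0, 2.0]])
boxes = np.array([[10.0, 20.0, 50.0, 80.0]])
print(warp_boxes(boxes, H))        # [[15. 22. 55. 82.]]
```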
diff --git a/ultralytics/trackers/utils/kalman_filter.py b/ultralytics/trackers/utils/kalman_filter.py
deleted file mode 100644
index 82fd515..0000000
--- a/ultralytics/trackers/utils/kalman_filter.py
+++ /dev/null
@@ -1,493 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-import numpy as np
-import scipy.linalg
-
-
-class KalmanFilterXYAH:
- """
- A KalmanFilterXYAH class for tracking bounding boxes in image space using a Kalman filter.
-
- Implements a simple Kalman filter for tracking bounding boxes in image space. The 8-dimensional state space
- (x, y, a, h, vx, vy, va, vh) contains the bounding box center position (x, y), aspect ratio a, height h, and their
- respective velocities. Object motion follows a constant velocity model, and bounding box location (x, y, a, h) is
- taken as a direct observation of the state space (linear observation model).
-
- Attributes:
- _motion_mat (np.ndarray): The motion matrix for the Kalman filter.
- _update_mat (np.ndarray): The update matrix for the Kalman filter.
- _std_weight_position (float): Standard deviation weight for position.
- _std_weight_velocity (float): Standard deviation weight for velocity.
-
- Methods:
- initiate: Create a track from an unassociated measurement.
- predict: Run the Kalman filter prediction step.
- project: Project the state distribution to measurement space.
- multi_predict: Run the Kalman filter prediction step (vectorized version).
- update: Run the Kalman filter correction step.
- gating_distance: Compute the gating distance between state distribution and measurements.
-
- Examples:
- Initialize the Kalman filter and create a track from a measurement
- >>> kf = KalmanFilterXYAH()
- >>> measurement = np.array([100, 200, 1.5, 50])
- >>> mean, covariance = kf.initiate(measurement)
- >>> print(mean)
- >>> print(covariance)
- """
-
- def __init__(self):
- """
- Initialize Kalman filter model matrices with motion and observation uncertainty weights.
-
- The Kalman filter is initialized with an 8-dimensional state space (x, y, a, h, vx, vy, va, vh), where (x, y)
- represents the bounding box center position, 'a' is the aspect ratio, 'h' is the height, and their respective
- velocities are (vx, vy, va, vh). The filter uses a constant velocity model for object motion and a linear
- observation model for bounding box location.
-
- Examples:
- Initialize a Kalman filter for tracking:
- >>> kf = KalmanFilterXYAH()
- """
- ndim, dt = 4, 1.0
-
- # Create Kalman filter model matrices
- self._motion_mat = np.eye(2 * ndim, 2 * ndim)
- for i in range(ndim):
- self._motion_mat[i, ndim + i] = dt
- self._update_mat = np.eye(ndim, 2 * ndim)
-
- # Motion and observation uncertainty are chosen relative to the current state estimate
- self._std_weight_position = 1.0 / 20
- self._std_weight_velocity = 1.0 / 160
-
- def initiate(self, measurement: np.ndarray):
- """
- Create a track from an unassociated measurement.
-
- Args:
- measurement (np.ndarray): Bounding box coordinates (x, y, a, h) with center position (x, y), aspect ratio a,
- and height h.
-
- Returns:
- mean (np.ndarray): Mean vector (8-dimensional) of the new track. Unobserved velocities are initialized to 0 mean.
- covariance (np.ndarray): Covariance matrix (8x8 dimensional) of the new track.
-
- Examples:
- >>> kf = KalmanFilterXYAH()
- >>> measurement = np.array([100, 50, 1.5, 200])
- >>> mean, covariance = kf.initiate(measurement)
- """
- mean_pos = measurement
- mean_vel = np.zeros_like(mean_pos)
- mean = np.r_[mean_pos, mean_vel]
-
- std = [
- 2 * self._std_weight_position * measurement[3],
- 2 * self._std_weight_position * measurement[3],
- 1e-2,
- 2 * self._std_weight_position * measurement[3],
- 10 * self._std_weight_velocity * measurement[3],
- 10 * self._std_weight_velocity * measurement[3],
- 1e-5,
- 10 * self._std_weight_velocity * measurement[3],
- ]
- covariance = np.diag(np.square(std))
- return mean, covariance
-
- def predict(self, mean: np.ndarray, covariance: np.ndarray):
- """
- Run Kalman filter prediction step.
-
- Args:
- mean (np.ndarray): The 8-dimensional mean vector of the object state at the previous time step.
- covariance (np.ndarray): The 8x8-dimensional covariance matrix of the object state at the previous time step.
-
- Returns:
- mean (np.ndarray): Mean vector of the predicted state. Unobserved velocities are initialized to 0 mean.
- covariance (np.ndarray): Covariance matrix of the predicted state.
-
- Examples:
- >>> kf = KalmanFilterXYAH()
- >>> mean = np.array([0, 0, 1, 1, 0, 0, 0, 0])
- >>> covariance = np.eye(8)
- >>> predicted_mean, predicted_covariance = kf.predict(mean, covariance)
- """
- std_pos = [
- self._std_weight_position * mean[3],
- self._std_weight_position * mean[3],
- 1e-2,
- self._std_weight_position * mean[3],
- ]
- std_vel = [
- self._std_weight_velocity * mean[3],
- self._std_weight_velocity * mean[3],
- 1e-5,
- self._std_weight_velocity * mean[3],
- ]
- motion_cov = np.diag(np.square(np.r_[std_pos, std_vel]))
-
- mean = np.dot(mean, self._motion_mat.T)
- covariance = np.linalg.multi_dot((self._motion_mat, covariance, self._motion_mat.T)) + motion_cov
-
- return mean, covariance
-
- def project(self, mean: np.ndarray, covariance: np.ndarray):
- """
- Project state distribution to measurement space.
-
- Args:
- mean (np.ndarray): The state's mean vector (8 dimensional array).
- covariance (np.ndarray): The state's covariance matrix (8x8 dimensional).
-
- Returns:
- mean (np.ndarray): Projected mean of the given state estimate.
- covariance (np.ndarray): Projected covariance matrix of the given state estimate.
-
- Examples:
- >>> kf = KalmanFilterXYAH()
- >>> mean = np.array([0, 0, 1, 1, 0, 0, 0, 0])
- >>> covariance = np.eye(8)
- >>> projected_mean, projected_covariance = kf.project(mean, covariance)
- """
- std = [
- self._std_weight_position * mean[3],
- self._std_weight_position * mean[3],
- 1e-1,
- self._std_weight_position * mean[3],
- ]
- innovation_cov = np.diag(np.square(std))
-
- mean = np.dot(self._update_mat, mean)
- covariance = np.linalg.multi_dot((self._update_mat, covariance, self._update_mat.T))
- return mean, covariance + innovation_cov
-
- def multi_predict(self, mean: np.ndarray, covariance: np.ndarray):
- """
- Run Kalman filter prediction step for multiple object states (Vectorized version).
-
- Args:
- mean (np.ndarray): The Nx8 dimensional mean matrix of the object states at the previous time step.
- covariance (np.ndarray): The Nx8x8 covariance matrix of the object states at the previous time step.
-
- Returns:
- mean (np.ndarray): Mean matrix of the predicted states with shape (N, 8).
- covariance (np.ndarray): Covariance matrix of the predicted states with shape (N, 8, 8).
-
- Examples:
- >>> mean = np.random.rand(10, 8) # 10 object states
- >>> covariance = np.random.rand(10, 8, 8) # Covariance matrices for 10 object states
- >>> predicted_mean, predicted_covariance = kalman_filter.multi_predict(mean, covariance)
- """
- std_pos = [
- self._std_weight_position * mean[:, 3],
- self._std_weight_position * mean[:, 3],
- 1e-2 * np.ones_like(mean[:, 3]),
- self._std_weight_position * mean[:, 3],
- ]
- std_vel = [
- self._std_weight_velocity * mean[:, 3],
- self._std_weight_velocity * mean[:, 3],
- 1e-5 * np.ones_like(mean[:, 3]),
- self._std_weight_velocity * mean[:, 3],
- ]
- sqr = np.square(np.r_[std_pos, std_vel]).T
-
- motion_cov = [np.diag(sqr[i]) for i in range(len(mean))]
- motion_cov = np.asarray(motion_cov)
-
- mean = np.dot(mean, self._motion_mat.T)
- left = np.dot(self._motion_mat, covariance).transpose((1, 0, 2))
- covariance = np.dot(left, self._motion_mat.T) + motion_cov
-
- return mean, covariance
-
- def update(self, mean: np.ndarray, covariance: np.ndarray, measurement: np.ndarray):
- """
- Run Kalman filter correction step.
-
- Args:
- mean (np.ndarray): The predicted state's mean vector (8 dimensional).
- covariance (np.ndarray): The state's covariance matrix (8x8 dimensional).
- measurement (np.ndarray): The 4 dimensional measurement vector (x, y, a, h), where (x, y) is the center
- position, a the aspect ratio, and h the height of the bounding box.
-
- Returns:
- new_mean (np.ndarray): Measurement-corrected state mean.
- new_covariance (np.ndarray): Measurement-corrected state covariance.
-
- Examples:
- >>> kf = KalmanFilterXYAH()
- >>> mean = np.array([0, 0, 1, 1, 0, 0, 0, 0])
- >>> covariance = np.eye(8)
- >>> measurement = np.array([1, 1, 1, 1])
- >>> new_mean, new_covariance = kf.update(mean, covariance, measurement)
- """
- projected_mean, projected_cov = self.project(mean, covariance)
-
- chol_factor, lower = scipy.linalg.cho_factor(projected_cov, lower=True, check_finite=False)
- kalman_gain = scipy.linalg.cho_solve(
- (chol_factor, lower), np.dot(covariance, self._update_mat.T).T, check_finite=False
- ).T
- innovation = measurement - projected_mean
-
- new_mean = mean + np.dot(innovation, kalman_gain.T)
- new_covariance = covariance - np.linalg.multi_dot((kalman_gain, projected_cov, kalman_gain.T))
- return new_mean, new_covariance
-
- def gating_distance(
- self,
- mean: np.ndarray,
- covariance: np.ndarray,
- measurements: np.ndarray,
- only_position: bool = False,
- metric: str = "maha",
- ) -> np.ndarray:
- """
- Compute gating distance between state distribution and measurements.
-
- A suitable distance threshold can be obtained from `chi2inv95`. If `only_position` is False, the chi-square
- distribution has 4 degrees of freedom, otherwise 2.
-
- Args:
- mean (np.ndarray): Mean vector over the state distribution (8 dimensional).
- covariance (np.ndarray): Covariance of the state distribution (8x8 dimensional).
- measurements (np.ndarray): An (N, 4) matrix of N measurements, each in format (x, y, a, h) where (x, y) is the
- bounding box center position, a the aspect ratio, and h the height.
- only_position (bool, optional): If True, distance computation is done with respect to box center position only.
- metric (str, optional): The metric to use for calculating the distance. Options are 'gaussian' for the squared
- Euclidean distance and 'maha' for the squared Mahalanobis distance.
-
- Returns:
- (np.ndarray): Returns an array of length N, where the i-th element contains the squared distance between
- (mean, covariance) and `measurements[i]`.
-
- Examples:
- Compute gating distance using Mahalanobis metric:
- >>> kf = KalmanFilterXYAH()
- >>> mean = np.array([0, 0, 1, 1, 0, 0, 0, 0])
- >>> covariance = np.eye(8)
- >>> measurements = np.array([[1, 1, 1, 1], [2, 2, 1, 1]])
- >>> distances = kf.gating_distance(mean, covariance, measurements, only_position=False, metric="maha")
- """
- mean, covariance = self.project(mean, covariance)
- if only_position:
- mean, covariance = mean[:2], covariance[:2, :2]
- measurements = measurements[:, :2]
-
- d = measurements - mean
- if metric == "gaussian":
- return np.sum(d * d, axis=1)
- elif metric == "maha":
- cholesky_factor = np.linalg.cholesky(covariance)
- z = scipy.linalg.solve_triangular(cholesky_factor, d.T, lower=True, check_finite=False, overwrite_b=True)
- return np.sum(z * z, axis=0) # square maha
- else:
- raise ValueError("Invalid distance metric")
-
-
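As the gating_distance docstring notes, the squared Mahalanobis distances it returns are compared against a chi-square threshold (chi2inv95-style, 4 degrees of freedom when position, aspect, and height are all used). A minimal gating sketch, assuming the kalman_filter module from this diff is importable; the measurement values are arbitrary:

```python
import numpy as np
from scipy.stats import chi2

from ultralytics.trackers.utils.kalman_filter import KalmanFilterXYAH

GATING_THRESHOLD = chi2.ppf(0.95, df=4)  # ~9.4877, the 95% gate for a 4D measurement

kf = KalmanFilterXYAH()
mean, cov = kf.initiate(np.array([100.0, 50.0, 0.5, 80.0]))
measurements = np.array([
    [102.0, 51.0, 0.5, 80.0],   # close to the track state
    [400.0, 300.0, 0.5, 80.0],  # far away
])
d2 = kf.gating_distance(mean, cov, measurements, metric="maha")
print(d2 < GATING_THRESHOLD)  # [ True False]
```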
-class KalmanFilterXYWH(KalmanFilterXYAH):
- """
- A KalmanFilterXYWH class for tracking bounding boxes in image space using a Kalman filter.
-
- Implements a Kalman filter for tracking bounding boxes with state space (x, y, w, h, vx, vy, vw, vh), where
- (x, y) is the center position, w is the width, h is the height, and vx, vy, vw, vh are their respective velocities.
- The object motion follows a constant velocity model, and the bounding box location (x, y, w, h) is taken as a direct
- observation of the state space (linear observation model).
-
- Attributes:
- _motion_mat (np.ndarray): The motion matrix for the Kalman filter.
- _update_mat (np.ndarray): The update matrix for the Kalman filter.
- _std_weight_position (float): Standard deviation weight for position.
- _std_weight_velocity (float): Standard deviation weight for velocity.
-
- Methods:
- initiate: Create a track from an unassociated measurement.
- predict: Run the Kalman filter prediction step.
- project: Project the state distribution to measurement space.
- multi_predict: Run the Kalman filter prediction step in a vectorized manner.
- update: Run the Kalman filter correction step.
-
- Examples:
- Create a Kalman filter and initialize a track
- >>> kf = KalmanFilterXYWH()
- >>> measurement = np.array([100, 50, 20, 40])
- >>> mean, covariance = kf.initiate(measurement)
- >>> print(mean)
- >>> print(covariance)
- """
-
- def initiate(self, measurement: np.ndarray):
- """
- Create track from unassociated measurement.
-
- Args:
- measurement (np.ndarray): Bounding box coordinates (x, y, w, h) with center position (x, y), width, and height.
-
- Returns:
- mean (np.ndarray): Mean vector (8 dimensional) of the new track. Unobserved velocities are initialized to 0 mean.
- covariance (np.ndarray): Covariance matrix (8x8 dimensional) of the new track.
-
- Examples:
- >>> kf = KalmanFilterXYWH()
- >>> measurement = np.array([100.0, 50.0, 20.0, 40.0])
- >>> mean, covariance = kf.initiate(measurement)
- >>> print(mean)
- [100. 50. 20. 40. 0. 0. 0. 0.]
- >>> print(covariance.shape)
- (8, 8)
- """
- mean_pos = measurement
- mean_vel = np.zeros_like(mean_pos)
- mean = np.r_[mean_pos, mean_vel]
-
- std = [
- 2 * self._std_weight_position * measurement[2],
- 2 * self._std_weight_position * measurement[3],
- 2 * self._std_weight_position * measurement[2],
- 2 * self._std_weight_position * measurement[3],
- 10 * self._std_weight_velocity * measurement[2],
- 10 * self._std_weight_velocity * measurement[3],
- 10 * self._std_weight_velocity * measurement[2],
- 10 * self._std_weight_velocity * measurement[3],
- ]
- covariance = np.diag(np.square(std))
- return mean, covariance
-
- def predict(self, mean: np.ndarray, covariance: np.ndarray):
- """
- Run Kalman filter prediction step.
-
- Args:
- mean (np.ndarray): The 8-dimensional mean vector of the object state at the previous time step.
- covariance (np.ndarray): The 8x8-dimensional covariance matrix of the object state at the previous time step.
-
- Returns:
- mean (np.ndarray): Mean vector of the predicted state. Unobserved velocities are initialized to 0 mean.
- covariance (np.ndarray): Covariance matrix of the predicted state.
-
- Examples:
- >>> kf = KalmanFilterXYWH()
- >>> mean = np.array([0, 0, 1, 1, 0, 0, 0, 0])
- >>> covariance = np.eye(8)
- >>> predicted_mean, predicted_covariance = kf.predict(mean, covariance)
- """
- std_pos = [
- self._std_weight_position * mean[2],
- self._std_weight_position * mean[3],
- self._std_weight_position * mean[2],
- self._std_weight_position * mean[3],
- ]
- std_vel = [
- self._std_weight_velocity * mean[2],
- self._std_weight_velocity * mean[3],
- self._std_weight_velocity * mean[2],
- self._std_weight_velocity * mean[3],
- ]
- motion_cov = np.diag(np.square(np.r_[std_pos, std_vel]))
-
- mean = np.dot(mean, self._motion_mat.T)
- covariance = np.linalg.multi_dot((self._motion_mat, covariance, self._motion_mat.T)) + motion_cov
-
- return mean, covariance
-
- def project(self, mean: np.ndarray, covariance: np.ndarray):
- """
- Project state distribution to measurement space.
-
- Args:
- mean (np.ndarray): The state's mean vector (8 dimensional array).
- covariance (np.ndarray): The state's covariance matrix (8x8 dimensional).
-
- Returns:
- mean (np.ndarray): Projected mean of the given state estimate.
- covariance (np.ndarray): Projected covariance matrix of the given state estimate.
-
- Examples:
- >>> kf = KalmanFilterXYWH()
- >>> mean = np.array([0, 0, 1, 1, 0, 0, 0, 0])
- >>> covariance = np.eye(8)
- >>> projected_mean, projected_cov = kf.project(mean, covariance)
- """
- std = [
- self._std_weight_position * mean[2],
- self._std_weight_position * mean[3],
- self._std_weight_position * mean[2],
- self._std_weight_position * mean[3],
- ]
- innovation_cov = np.diag(np.square(std))
-
- mean = np.dot(self._update_mat, mean)
- covariance = np.linalg.multi_dot((self._update_mat, covariance, self._update_mat.T))
- return mean, covariance + innovation_cov
-
- def multi_predict(self, mean: np.ndarray, covariance: np.ndarray):
- """
- Run Kalman filter prediction step (Vectorized version).
-
- Args:
- mean (np.ndarray): The Nx8 dimensional mean matrix of the object states at the previous time step.
- covariance (np.ndarray): The Nx8x8 covariance matrix of the object states at the previous time step.
-
- Returns:
- mean (np.ndarray): Mean matrix of the predicted states with shape (N, 8).
- covariance (np.ndarray): Covariance matrix of the predicted states with shape (N, 8, 8).
-
- Examples:
- >>> mean = np.random.rand(5, 8) # 5 objects with 8-dimensional state vectors
- >>> covariance = np.random.rand(5, 8, 8) # 5 objects with 8x8 covariance matrices
- >>> kf = KalmanFilterXYWH()
- >>> predicted_mean, predicted_covariance = kf.multi_predict(mean, covariance)
- """
- std_pos = [
- self._std_weight_position * mean[:, 2],
- self._std_weight_position * mean[:, 3],
- self._std_weight_position * mean[:, 2],
- self._std_weight_position * mean[:, 3],
- ]
- std_vel = [
- self._std_weight_velocity * mean[:, 2],
- self._std_weight_velocity * mean[:, 3],
- self._std_weight_velocity * mean[:, 2],
- self._std_weight_velocity * mean[:, 3],
- ]
- sqr = np.square(np.r_[std_pos, std_vel]).T
-
- motion_cov = [np.diag(sqr[i]) for i in range(len(mean))]
- motion_cov = np.asarray(motion_cov)
-
- mean = np.dot(mean, self._motion_mat.T)
- left = np.dot(self._motion_mat, covariance).transpose((1, 0, 2))
- covariance = np.dot(left, self._motion_mat.T) + motion_cov
-
- return mean, covariance
-
- def update(self, mean: np.ndarray, covariance: np.ndarray, measurement: np.ndarray):
- """
- Run Kalman filter correction step.
-
- Args:
- mean (np.ndarray): The predicted state's mean vector (8 dimensional).
- covariance (np.ndarray): The state's covariance matrix (8x8 dimensional).
- measurement (np.ndarray): The 4 dimensional measurement vector (x, y, w, h), where (x, y) is the center
- position, w the width, and h the height of the bounding box.
-
- Returns:
- new_mean (np.ndarray): Measurement-corrected state mean.
- new_covariance (np.ndarray): Measurement-corrected state covariance.
-
- Examples:
- >>> kf = KalmanFilterXYWH()
- >>> mean = np.array([0, 0, 1, 1, 0, 0, 0, 0])
- >>> covariance = np.eye(8)
- >>> measurement = np.array([0.5, 0.5, 1.2, 1.2])
- >>> new_mean, new_covariance = kf.update(mean, covariance, measurement)
- """
- return super().update(mean, covariance, measurement)
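A compact predict/correct cycle ties the two filter classes above together. The sketch below uses the XYWH variant; the numbers are arbitrary and the import path assumes the module from this diff is available:

```python
import numpy as np

from ultralytics.trackers.utils.kalman_filter import KalmanFilterXYWH

kf = KalmanFilterXYWH()
mean, cov = kf.initiate(np.array([100.0, 50.0, 20.0, 40.0]))  # (x, y, w, h)

# One constant-velocity prediction, then a correction with a slightly shifted observation
mean, cov = kf.predict(mean, cov)
mean, cov = kf.update(mean, cov, np.array([104.0, 52.0, 20.0, 40.0]))

print(mean[:4])   # corrected (x, y, w, h), pulled toward the new measurement
print(mean[4:6])  # inferred (vx, vy) velocities
```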
diff --git a/ultralytics/trackers/utils/matching.py b/ultralytics/trackers/utils/matching.py
deleted file mode 100644
index e85a78c..0000000
--- a/ultralytics/trackers/utils/matching.py
+++ /dev/null
@@ -1,157 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-import numpy as np
-import scipy
-from scipy.spatial.distance import cdist
-
-from ultralytics.utils.metrics import batch_probiou, bbox_ioa
-
-try:
- import lap # for linear_assignment
-
- assert lap.__version__ # verify package is not a directory
-except (ImportError, AssertionError, AttributeError):
- from ultralytics.utils.checks import check_requirements
-
- check_requirements("lap>=0.5.12") # https://github.com/gatagat/lap
- import lap
-
-
-def linear_assignment(cost_matrix: np.ndarray, thresh: float, use_lap: bool = True):
- """
- Perform linear assignment using either the scipy or lap.lapjv method.
-
- Args:
- cost_matrix (np.ndarray): The matrix containing cost values for assignments, with shape (N, M).
- thresh (float): Threshold for considering an assignment valid.
- use_lap (bool): Use lap.lapjv for the assignment. If False, scipy.optimize.linear_sum_assignment is used.
-
- Returns:
- matched_indices (np.ndarray): Array of matched indices of shape (K, 2), where K is the number of matches.
- unmatched_a (np.ndarray): Array of unmatched indices from the first set, with shape (L,).
- unmatched_b (np.ndarray): Array of unmatched indices from the second set, with shape (M,).
-
- Examples:
- >>> cost_matrix = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
- >>> thresh = 5.0
- >>> matched_indices, unmatched_a, unmatched_b = linear_assignment(cost_matrix, thresh, use_lap=True)
- """
- if cost_matrix.size == 0:
- return np.empty((0, 2), dtype=int), tuple(range(cost_matrix.shape[0])), tuple(range(cost_matrix.shape[1]))
-
- if use_lap:
- # Use lap.lapjv
- # https://github.com/gatagat/lap
- _, x, y = lap.lapjv(cost_matrix, extend_cost=True, cost_limit=thresh)
- matches = [[ix, mx] for ix, mx in enumerate(x) if mx >= 0]
- unmatched_a = np.where(x < 0)[0]
- unmatched_b = np.where(y < 0)[0]
- else:
- # Use scipy.optimize.linear_sum_assignment
- # https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.linear_sum_assignment.html
- x, y = scipy.optimize.linear_sum_assignment(cost_matrix) # row x, col y
- matches = np.asarray([[x[i], y[i]] for i in range(len(x)) if cost_matrix[x[i], y[i]] <= thresh])
- if len(matches) == 0:
- unmatched_a = list(np.arange(cost_matrix.shape[0]))
- unmatched_b = list(np.arange(cost_matrix.shape[1]))
- else:
- unmatched_a = list(frozenset(np.arange(cost_matrix.shape[0])) - frozenset(matches[:, 0]))
- unmatched_b = list(frozenset(np.arange(cost_matrix.shape[1])) - frozenset(matches[:, 1]))
-
- return matches, unmatched_a, unmatched_b
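The SciPy fallback branch above can be exercised on a toy cost matrix without installing lap; a minimal sketch:

import numpy as np
from scipy.optimize import linear_sum_assignment

cost = np.array([[1.0, 9.0], [9.0, 2.0], [9.0, 9.0]])  # 3 tracks x 2 detections
thresh = 5.0

rows, cols = linear_sum_assignment(cost)                # optimal row/col pairing
matches = np.asarray([[r, c] for r, c in zip(rows, cols) if cost[r, c] <= thresh])
unmatched_a = sorted(set(range(cost.shape[0])) - set(matches[:, 0]))
unmatched_b = sorted(set(range(cost.shape[1])) - set(matches[:, 1]))
print(matches.tolist(), unmatched_a, unmatched_b)       # [[0, 0], [1, 1]] [2] []
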
-
-
-def iou_distance(atracks: list, btracks: list) -> np.ndarray:
- """
- Compute cost based on Intersection over Union (IoU) between tracks.
-
- Args:
- atracks (list[STrack] | list[np.ndarray]): List of tracks 'a' or bounding boxes.
- btracks (list[STrack] | list[np.ndarray]): List of tracks 'b' or bounding boxes.
-
- Returns:
- (np.ndarray): Cost matrix computed based on IoU with shape (len(atracks), len(btracks)).
-
- Examples:
- Compute IoU distance between two sets of tracks
- >>> atracks = [np.array([0, 0, 10, 10]), np.array([20, 20, 30, 30])]
- >>> btracks = [np.array([5, 5, 15, 15]), np.array([25, 25, 35, 35])]
- >>> cost_matrix = iou_distance(atracks, btracks)
- """
- if atracks and isinstance(atracks[0], np.ndarray) or btracks and isinstance(btracks[0], np.ndarray):
- atlbrs = atracks
- btlbrs = btracks
- else:
- atlbrs = [track.xywha if track.angle is not None else track.xyxy for track in atracks]
- btlbrs = [track.xywha if track.angle is not None else track.xyxy for track in btracks]
-
- ious = np.zeros((len(atlbrs), len(btlbrs)), dtype=np.float32)
- if len(atlbrs) and len(btlbrs):
- if len(atlbrs[0]) == 5 and len(btlbrs[0]) == 5:
- ious = batch_probiou(
- np.ascontiguousarray(atlbrs, dtype=np.float32),
- np.ascontiguousarray(btlbrs, dtype=np.float32),
- ).numpy()
- else:
- ious = bbox_ioa(
- np.ascontiguousarray(atlbrs, dtype=np.float32),
- np.ascontiguousarray(btlbrs, dtype=np.float32),
- iou=True,
- )
- return 1 - ious # cost matrix
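For plain axis-aligned xyxy boxes the cost construction reduces to pairwise IoU. A standalone NumPy sketch (not calling the deleted bbox_ioa helper, and skipping the rotated-box batch_probiou branch):

import numpy as np

def iou_cost(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """Return a (len(a), len(b)) cost matrix of 1 - IoU for xyxy boxes."""
    tl = np.maximum(a[:, None, :2], b[None, :, :2])     # pairwise top-left corners
    br = np.minimum(a[:, None, 2:], b[None, :, 2:])     # pairwise bottom-right corners
    inter = np.prod(np.clip(br - tl, 0, None), axis=2)  # intersection areas
    area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
    area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)
    union = area_a[:, None] + area_b[None, :] - inter
    return 1.0 - inter / np.clip(union, 1e-9, None)

atracks = np.array([[0, 0, 10, 10], [20, 20, 30, 30]], dtype=np.float32)
btracks = np.array([[5, 5, 15, 15], [25, 25, 35, 35]], dtype=np.float32)
print(iou_cost(atracks, btracks).round(3))  # ~0.857 on the diagonal (overlapping pairs), 1.0 off it
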
-
-
-def embedding_distance(tracks: list, detections: list, metric: str = "cosine") -> np.ndarray:
- """
- Compute distance between tracks and detections based on embeddings.
-
- Args:
- tracks (list[STrack]): List of tracks, where each track contains embedding features.
- detections (list[BaseTrack]): List of detections, where each detection contains embedding features.
- metric (str): Metric for distance computation. Supported metrics include 'cosine', 'euclidean', etc.
-
- Returns:
- (np.ndarray): Cost matrix computed based on embeddings with shape (N, M), where N is the number of tracks
- and M is the number of detections.
-
- Examples:
- Compute the embedding distance between tracks and detections using cosine metric
- >>> tracks = [STrack(...), STrack(...)] # List of track objects with embedding features
- >>> detections = [BaseTrack(...), BaseTrack(...)] # List of detection objects with embedding features
- >>> cost_matrix = embedding_distance(tracks, detections, metric="cosine")
- """
- cost_matrix = np.zeros((len(tracks), len(detections)), dtype=np.float32)
- if cost_matrix.size == 0:
- return cost_matrix
- det_features = np.asarray([track.curr_feat for track in detections], dtype=np.float32)
- # for i, track in enumerate(tracks):
- # cost_matrix[i, :] = np.maximum(0.0, cdist(track.smooth_feat.reshape(1,-1), det_features, metric))
- track_features = np.asarray([track.smooth_feat for track in tracks], dtype=np.float32)
- cost_matrix = np.maximum(0.0, cdist(track_features, det_features, metric)) # Normalized features
- return cost_matrix
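The same cosine-distance cost can be reproduced with cdist alone; random feature matrices stand in for the STrack/BaseTrack embeddings here:

import numpy as np
from scipy.spatial.distance import cdist

rng = np.random.default_rng(0)
track_features = rng.normal(size=(3, 128)).astype(np.float32)  # 3 tracks, 128-D embeddings
det_features = rng.normal(size=(5, 128)).astype(np.float32)    # 5 detections

cost_matrix = np.maximum(0.0, cdist(track_features, det_features, "cosine"))
print(cost_matrix.shape)  # (3, 5), lower values mean more similar embeddings
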
-
-
-def fuse_score(cost_matrix: np.ndarray, detections: list) -> np.ndarray:
- """
- Fuse cost matrix with detection scores to produce a single similarity matrix.
-
- Args:
- cost_matrix (np.ndarray): The matrix containing cost values for assignments, with shape (N, M).
- detections (list[BaseTrack]): List of detections, each containing a score attribute.
-
- Returns:
- (np.ndarray): Fused similarity matrix with shape (N, M).
-
- Examples:
- Fuse a cost matrix with detection scores
- >>> cost_matrix = np.random.rand(5, 10) # 5 tracks and 10 detections
- >>> detections = [BaseTrack(score=np.random.rand()) for _ in range(10)]
- >>> fused_matrix = fuse_score(cost_matrix, detections)
- """
- if cost_matrix.size == 0:
- return cost_matrix
- iou_sim = 1 - cost_matrix
- det_scores = np.array([det.score for det in detections])
- det_scores = np.expand_dims(det_scores, axis=0).repeat(cost_matrix.shape[0], axis=0)
- fuse_sim = iou_sim * det_scores
- return 1 - fuse_sim # fuse_cost
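And the score fusion reduced to bare arrays, with plain floats standing in for detection objects:

import numpy as np

cost_matrix = np.random.rand(5, 10)        # 5 tracks x 10 detections (IoU cost)
det_scores = np.random.rand(10)            # per-detection confidence scores

iou_sim = 1 - cost_matrix
fuse_sim = iou_sim * det_scores[None, :]   # broadcast scores across the track axis
fused_cost = 1 - fuse_sim
print(fused_cost.shape)                    # (5, 10)
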
diff --git a/ultralytics/utils/__init__.py b/ultralytics/utils/__init__.py
deleted file mode 100644
index f97d5b9..0000000
--- a/ultralytics/utils/__init__.py
+++ /dev/null
@@ -1,1450 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from __future__ import annotations
-
-import contextlib
-import importlib.metadata
-import inspect
-import json
-import logging
-import os
-import platform
-import re
-import socket
-import sys
-import threading
-import time
-from functools import lru_cache
-from pathlib import Path
-from threading import Lock
-from types import SimpleNamespace
-from urllib.parse import unquote
-
-import cv2
-import numpy as np
-import torch
-
-from ultralytics import __version__
-from ultralytics.utils.git import GitRepo
-from ultralytics.utils.patches import imread, imshow, imwrite, torch_save # for patches
-from ultralytics.utils.tqdm import TQDM # noqa
-
-# PyTorch Multi-GPU DDP Constants
-RANK = int(os.getenv("RANK", -1))
-LOCAL_RANK = int(os.getenv("LOCAL_RANK", -1)) # https://pytorch.org/docs/stable/elastic/run.html
-
-# Other Constants
-ARGV = sys.argv or ["", ""] # sometimes sys.argv = []
-FILE = Path(__file__).resolve()
-ROOT = FILE.parents[1] # YOLO
-ASSETS = ROOT / "assets" # default images
-ASSETS_URL = "https://github.com/ultralytics/assets/releases/download/v0.0.0" # assets GitHub URL
-DEFAULT_CFG_PATH = ROOT / "cfg/default.yaml"
-NUM_THREADS = min(8, max(1, os.cpu_count() - 1)) # number of YOLO multiprocessing threads
-AUTOINSTALL = str(os.getenv("YOLO_AUTOINSTALL", True)).lower() == "true" # global auto-install mode
-VERBOSE = str(os.getenv("YOLO_VERBOSE", True)).lower() == "true" # global verbose mode
-LOGGING_NAME = "ultralytics"
-MACOS, LINUX, WINDOWS = (platform.system() == x for x in ["Darwin", "Linux", "Windows"]) # environment booleans
-MACOS_VERSION = platform.mac_ver()[0] if MACOS else None
-NOT_MACOS14 = not (MACOS and MACOS_VERSION.startswith("14."))
-ARM64 = platform.machine() in {"arm64", "aarch64"} # ARM64 booleans
-PYTHON_VERSION = platform.python_version()
-TORCH_VERSION = str(torch.__version__) # Normalize torch.__version__ (PyTorch>1.9 returns TorchVersion objects)
-TORCHVISION_VERSION = importlib.metadata.version("torchvision") # faster than importing torchvision
-IS_VSCODE = os.environ.get("TERM_PROGRAM", False) == "vscode"
-RKNN_CHIPS = frozenset(
- {
- "rk3588",
- "rk3576",
- "rk3566",
- "rk3568",
- "rk3562",
- "rv1103",
- "rv1106",
- "rv1103b",
- "rv1106b",
- "rk2118",
- }
-) # Rockchip processors available for export
-HELP_MSG = """
- Examples for running Ultralytics:
-
- 1. Install the ultralytics package:
-
- pip install ultralytics
-
- 2. Use the Python SDK:
-
- from ultralytics import YOLO
-
- # Load a model
- model = YOLO("yolo11n.yaml") # build a new model from scratch
- model = YOLO("yolo11n.pt") # load a pretrained model (recommended for training)
-
- # Use the model
- results = model.train(data="coco8.yaml", epochs=3) # train the model
- results = model.val() # evaluate model performance on the validation set
- results = model("https://ultralytics.com/images/bus.jpg") # predict on an image
- success = model.export(format="onnx") # export the model to ONNX format
-
- 3. Use the command line interface (CLI):
-
- Ultralytics 'yolo' CLI commands use the following syntax:
-
- yolo TASK MODE ARGS
-
- Where TASK (optional) is one of [detect, segment, classify, pose, obb]
- MODE (required) is one of [train, val, predict, export, track, benchmark]
- ARGS (optional) are any number of custom "arg=value" pairs like "imgsz=320" that override defaults.
- See all ARGS at https://docs.ultralytics.com/usage/cfg or with "yolo cfg"
-
- - Train a detection model for 10 epochs with an initial learning_rate of 0.01
- yolo detect train data=coco8.yaml model=yolo11n.pt epochs=10 lr0=0.01
-
- - Predict a YouTube video using a pretrained segmentation model at image size 320:
- yolo segment predict model=yolo11n-seg.pt source='https://youtu.be/LNwODJXcvt4' imgsz=320
-
- - Val a pretrained detection model at batch-size 1 and image size 640:
- yolo detect val model=yolo11n.pt data=coco8.yaml batch=1 imgsz=640
-
- - Export a YOLO11n classification model to ONNX format at image size 224 by 128 (no TASK required)
- yolo export model=yolo11n-cls.pt format=onnx imgsz=224,128
-
- - Run special commands:
- yolo help
- yolo checks
- yolo version
- yolo settings
- yolo copy-cfg
- yolo cfg
-
- Docs: https://docs.ultralytics.com
- Community: https://community.ultralytics.com
- GitHub: https://github.com/ultralytics/ultralytics
- """
-
-# Settings and Environment Variables
-torch.set_printoptions(linewidth=320, precision=4, profile="default")
-np.set_printoptions(linewidth=320, formatter=dict(float_kind="{:11.5g}".format)) # format short g, %precision=5
-cv2.setNumThreads(0) # prevent OpenCV from multithreading (incompatible with PyTorch DataLoader)
-os.environ["NUMEXPR_MAX_THREADS"] = str(NUM_THREADS) # NumExpr max threads
-os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" # suppress verbose TF compiler warnings in Colab
-os.environ["TORCH_CPP_LOG_LEVEL"] = "ERROR" # suppress "NNPACK.cpp could not initialize NNPACK" warnings
-os.environ["KINETO_LOG_LEVEL"] = "5" # suppress verbose PyTorch profiler output when computing FLOPs
-
-# Precompiled type tuples for faster isinstance() checks
-FLOAT_OR_INT = (float, int)
-STR_OR_PATH = (str, Path)
-
-
-class DataExportMixin:
- """
- Mixin class for exporting validation metrics or prediction results in various formats.
-
- This class provides utilities to export performance metrics (e.g., mAP, precision, recall) or prediction results
- from classification, object detection, segmentation, or pose estimation tasks into various formats: Polars
- DataFrame, CSV and JSON.
-
- Methods:
- to_df: Convert summary to a Polars DataFrame.
- to_csv: Export results as a CSV string.
- to_json: Export results as a JSON string.
- tojson: Deprecated alias for `to_json()`.
-
- Examples:
- >>> model = YOLO("yolo11n.pt")
- >>> results = model("image.jpg")
- >>> df = results.to_df()
- >>> print(df)
- >>> csv_data = results.to_csv()
- """
-
- def to_df(self, normalize=False, decimals=5):
- """
- Create a polars DataFrame from the prediction results summary or validation metrics.
-
- Args:
- normalize (bool, optional): Normalize numerical values for easier comparison.
- decimals (int, optional): Decimal places to round floats.
-
- Returns:
- (DataFrame): DataFrame containing the summary data.
- """
- import polars as pl # scope for faster 'import ultralytics'
-
- return pl.DataFrame(self.summary(normalize=normalize, decimals=decimals))
-
- def to_csv(self, normalize=False, decimals=5):
- """
- Export results or metrics to CSV string format.
-
- Args:
- normalize (bool, optional): Normalize numeric values.
- decimals (int, optional): Decimal precision.
-
- Returns:
- (str): CSV content as string.
- """
- import polars as pl
-
- df = self.to_df(normalize=normalize, decimals=decimals)
-
- try:
- return df.write_csv()
- except Exception:
- # Minimal string conversion for any remaining complex types
- def _to_str_simple(v):
- if v is None:
- return ""
- elif isinstance(v, (dict, list, tuple, set)):
- return repr(v)
- else:
- return str(v)
-
- df_str = df.select(
- [pl.col(c).map_elements(_to_str_simple, return_dtype=pl.String).alias(c) for c in df.columns]
- )
- return df_str.write_csv()
-
- def to_json(self, normalize=False, decimals=5):
- """
- Export results to JSON format.
-
- Args:
- normalize (bool, optional): Normalize numeric values.
- decimals (int, optional): Decimal precision.
-
- Returns:
- (str): JSON-formatted string of the results.
- """
- return self.to_df(normalize=normalize, decimals=decimals).write_json()
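A sketch of how this mixin's to_df/to_csv path behaves, using a stand-in object whose summary() payload is invented purely for illustration (Polars assumed installed):

import polars as pl

class DummyResults:
    """Minimal stand-in exposing a summary() like real Results/metrics objects."""

    def summary(self, normalize=False, decimals=5):
        return [{"name": "person", "confidence": round(0.91234, decimals)},
                {"name": "car", "confidence": round(0.85678, decimals)}]

    def to_df(self, normalize=False, decimals=5):
        return pl.DataFrame(self.summary(normalize=normalize, decimals=decimals))

res = DummyResults()
print(res.to_df())              # two-row Polars DataFrame
print(res.to_df().write_csv())  # CSV string, matching the mixin's to_csv() fast path
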
-
-
-class SimpleClass:
- """
- A simple base class for creating objects with string representations of their attributes.
-
- This class provides a foundation for creating objects that can be easily printed or represented as strings,
- showing all their non-callable attributes. It's useful for debugging and introspection of object states.
-
- Methods:
- __str__: Return a human-readable string representation of the object.
- __repr__: Return a machine-readable string representation of the object.
- __getattr__: Provide a custom attribute access error message with helpful information.
-
- Examples:
- >>> class MyClass(SimpleClass):
- ... def __init__(self):
- ... self.x = 10
- ... self.y = "hello"
- >>> obj = MyClass()
- >>> print(obj)
- __main__.MyClass object with attributes:
-
- x: 10
- y: 'hello'
-
- Notes:
- - This class is designed to be subclassed. It provides a convenient way to inspect object attributes.
- - The string representation includes the module and class name of the object.
- - Callable attributes and attributes starting with an underscore are excluded from the string representation.
- """
-
- def __str__(self):
- """Return a human-readable string representation of the object."""
- attr = []
- for a in dir(self):
- v = getattr(self, a)
- if not callable(v) and not a.startswith("_"):
- if isinstance(v, SimpleClass):
- # Display only the module and class name for subclasses
- s = f"{a}: {v.__module__}.{v.__class__.__name__} object"
- else:
- s = f"{a}: {repr(v)}"
- attr.append(s)
- return f"{self.__module__}.{self.__class__.__name__} object with attributes:\n\n" + "\n".join(attr)
-
- def __repr__(self):
- """Return a machine-readable string representation of the object."""
- return self.__str__()
-
- def __getattr__(self, attr):
- """Provide a custom attribute access error message with helpful information."""
- name = self.__class__.__name__
- raise AttributeError(f"'{name}' object has no attribute '{attr}'. See valid attributes below.\n{self.__doc__}")
-
-
-class IterableSimpleNamespace(SimpleNamespace):
- """
- An iterable SimpleNamespace class that provides enhanced functionality for attribute access and iteration.
-
- This class extends the SimpleNamespace class with additional methods for iteration, string representation,
- and attribute access. It is designed to be used as a convenient container for storing and accessing
- configuration parameters.
-
- Methods:
- __iter__: Return an iterator of key-value pairs from the namespace's attributes.
- __str__: Return a human-readable string representation of the object.
- __getattr__: Provide a custom attribute access error message with helpful information.
- get: Retrieve the value of a specified key, or a default value if the key doesn't exist.
-
- Examples:
- >>> cfg = IterableSimpleNamespace(a=1, b=2, c=3)
- >>> for k, v in cfg:
- ... print(f"{k}: {v}")
- a: 1
- b: 2
- c: 3
- >>> print(cfg)
- a=1
- b=2
- c=3
- >>> cfg.get("b")
- 2
- >>> cfg.get("d", "default")
- 'default'
-
- Notes:
- This class is particularly useful for storing configuration parameters in a more accessible
- and iterable format compared to a standard dictionary.
- """
-
- def __iter__(self):
- """Return an iterator of key-value pairs from the namespace's attributes."""
- return iter(vars(self).items())
-
- def __str__(self):
- """Return a human-readable string representation of the object."""
- return "\n".join(f"{k}={v}" for k, v in vars(self).items())
-
- def __getattr__(self, attr):
- """Provide a custom attribute access error message with helpful information."""
- name = self.__class__.__name__
- raise AttributeError(
- f"""
- '{name}' object has no attribute '{attr}'. This may be caused by a modified or out of date ultralytics
- 'default.yaml' file.\nPlease update your code with 'pip install -U ultralytics' and if necessary replace
- {DEFAULT_CFG_PATH} with the latest version from
- https://github.com/ultralytics/ultralytics/blob/main/ultralytics/cfg/default.yaml
- """
- )
-
- def get(self, key, default=None):
- """Return the value of the specified key if it exists; otherwise, return the default value."""
- return getattr(self, key, default)
-
-
-def plt_settings(rcparams=None, backend="Agg"):
- """
- Decorator to temporarily set rc parameters and the backend for a plotting function.
-
- Args:
- rcparams (dict, optional): Dictionary of rc parameters to set.
- backend (str, optional): Name of the backend to use.
-
- Returns:
- (Callable): Decorated function with temporarily set rc parameters and backend.
-
- Examples:
- >>> @plt_settings({"font.size": 12})
- >>> def plot_function():
- ... plt.figure()
- ... plt.plot([1, 2, 3])
- ... plt.show()
-
- >>> @plt_settings({"font.size": 12}, backend="TkAgg")
- >>> def plot_function_tk():
- ... plt.figure()
- ... plt.plot([1, 2, 3])
- ... plt.show()
- """
- if rcparams is None:
- rcparams = {"font.size": 11}
-
- def decorator(func):
- """Decorator to apply temporary rc parameters and backend to a function."""
-
- def wrapper(*args, **kwargs):
- """Set rc parameters and backend, call the original function, and restore the settings."""
- import matplotlib.pyplot as plt # scope for faster 'import ultralytics'
-
- original_backend = plt.get_backend()
- switch = backend.lower() != original_backend.lower()
- if switch:
- plt.close("all") # automatic closing of figures when switching backends is deprecated since matplotlib 3.8
- plt.switch_backend(backend)
-
- # Plot with backend and always revert to original backend
- try:
- with plt.rc_context(rcparams):
- result = func(*args, **kwargs)
- finally:
- if switch:
- plt.close("all")
- plt.switch_backend(original_backend)
- return result
-
- return wrapper
-
- return decorator
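Decorator usage in practice might look like the sketch below, which saves a figure so the default Agg backend suffices (the output file name is illustrative):

import matplotlib.pyplot as plt

@plt_settings({"font.size": 8}, backend="Agg")
def save_plot(path="demo.png"):
    plt.figure()
    plt.plot([1, 2, 3], [1, 4, 9])
    plt.savefig(path)
    plt.close()

save_plot()  # runs with font.size=8 on Agg, then restores the previous backend
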
-
-
-def set_logging(name="LOGGING_NAME", verbose=True):
- """
- Set up logging with UTF-8 encoding and configurable verbosity.
-
- This function configures logging for the Ultralytics library, setting the appropriate logging level and
- formatter based on the verbosity flag and the current process rank. It handles special cases for Windows
- environments where UTF-8 encoding might not be the default.
-
- Args:
- name (str): Name of the logger.
- verbose (bool): Flag to set logging level to INFO if True, ERROR otherwise.
-
- Returns:
- (logging.Logger): Configured logger object.
-
- Examples:
- >>> set_logging(name="ultralytics", verbose=True)
- >>> logger = logging.getLogger("ultralytics")
- >>> logger.info("This is an info message")
-
- Notes:
- - On Windows, this function attempts to reconfigure stdout to use UTF-8 encoding if possible.
- - If reconfiguration is not possible, it falls back to a custom formatter that handles non-UTF-8 environments.
- - The function sets up a StreamHandler with the appropriate formatter and level.
- - The logger's propagate flag is set to False to prevent duplicate logging in parent loggers.
- """
- level = logging.INFO if verbose and RANK in {-1, 0} else logging.ERROR # rank in world for Multi-GPU trainings
-
- class PrefixFormatter(logging.Formatter):
- def format(self, record):
- """Format log records with prefixes based on level."""
- # Apply prefixes based on log level
- if record.levelno == logging.WARNING:
- prefix = "WARNING" if WINDOWS else "WARNING ⚠️"
- record.msg = f"{prefix} {record.msg}"
- elif record.levelno == logging.ERROR:
- prefix = "ERROR" if WINDOWS else "ERROR ❌"
- record.msg = f"{prefix} {record.msg}"
-
- # Handle emojis in message based on platform
- formatted_message = super().format(record)
- return emojis(formatted_message)
-
- formatter = PrefixFormatter("%(message)s")
-
- # Handle Windows UTF-8 encoding issues
- if WINDOWS and hasattr(sys.stdout, "encoding") and sys.stdout.encoding != "utf-8":
- with contextlib.suppress(Exception):
- # Attempt to reconfigure stdout to use UTF-8 encoding if possible
- if hasattr(sys.stdout, "reconfigure"):
- sys.stdout.reconfigure(encoding="utf-8")
- # For environments where reconfigure is not available, wrap stdout in a TextIOWrapper
- elif hasattr(sys.stdout, "buffer"):
- import io
-
- sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8")
-
- # Create and configure the StreamHandler with the appropriate formatter and level
- stream_handler = logging.StreamHandler(sys.stdout)
- stream_handler.setFormatter(formatter)
- stream_handler.setLevel(level)
-
- # Set up the logger
- logger = logging.getLogger(name)
- logger.setLevel(level)
- logger.addHandler(stream_handler)
- logger.propagate = False
- return logger
-
-
-# Set logger
-LOGGER = set_logging(LOGGING_NAME, verbose=VERBOSE) # define globally (used in train.py, val.py, predict.py, etc.)
-logging.getLogger("sentry_sdk").setLevel(logging.CRITICAL + 1)
-
-
-def emojis(string=""):
- """Return platform-dependent emoji-safe version of string."""
- return string.encode().decode("ascii", "ignore") if WINDOWS else string
-
-
-class ThreadingLocked:
- """
- A decorator class for ensuring thread-safe execution of a function or method.
-
- This class can be used as a decorator to make sure that if the decorated function is called from multiple threads,
- only one thread at a time will be able to execute the function.
-
- Attributes:
- lock (threading.Lock): A lock object used to manage access to the decorated function.
-
- Examples:
- >>> from ultralytics.utils import ThreadingLocked
- >>> @ThreadingLocked()
- >>> def my_function():
- ... # Your code here
- """
-
- def __init__(self):
- """Initialize the decorator class with a threading lock."""
- self.lock = threading.Lock()
-
- def __call__(self, f):
- """Run thread-safe execution of function or method."""
- from functools import wraps
-
- @wraps(f)
- def decorated(*args, **kwargs):
- """Apply thread-safety to the decorated function or method."""
- with self.lock:
- return f(*args, **kwargs)
-
- return decorated
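A runnable sketch of the decorator serializing access to shared state across threads (the counter and thread count are illustrative):

import threading

counter = {"value": 0}

@ThreadingLocked()
def bump(n=100_000):
    for _ in range(n):
        counter["value"] += 1  # serialized by the decorator's lock

threads = [threading.Thread(target=bump) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(counter["value"])  # 400000, no lost updates
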
-
-
-class YAML:
- """
- YAML utility class for efficient file operations with automatic C-implementation detection.
-
- This class provides optimized YAML loading and saving operations using PyYAML's fastest available implementation
- (C-based when possible). It implements a singleton pattern with lazy initialization, allowing direct class method
- usage without explicit instantiation. The class handles file path creation, validation, and character encoding
- issues automatically.
-
- The implementation prioritizes performance through:
- - Automatic C-based loader/dumper selection when available
- - Singleton pattern to reuse the same instance
- - Lazy initialization to defer import costs until needed
- - Fallback mechanisms for handling problematic YAML content
-
- Attributes:
- _instance: Internal singleton instance storage.
- yaml: Reference to the PyYAML module.
- SafeLoader: Best available YAML loader (CSafeLoader if available).
- SafeDumper: Best available YAML dumper (CSafeDumper if available).
-
- Examples:
- >>> data = YAML.load("config.yaml")
- >>> data["new_value"] = 123
- >>> YAML.save("updated_config.yaml", data)
- >>> YAML.print(data)
- """
-
- _instance = None
-
- @classmethod
- def _get_instance(cls):
- """Initialize singleton instance on first use."""
- if cls._instance is None:
- cls._instance = cls()
- return cls._instance
-
- def __init__(self):
- """Initialize with optimal YAML implementation (C-based when available)."""
- import yaml
-
- self.yaml = yaml
- # Use C-based implementation if available for better performance
- try:
- self.SafeLoader = yaml.CSafeLoader
- self.SafeDumper = yaml.CSafeDumper
- except (AttributeError, ImportError):
- self.SafeLoader = yaml.SafeLoader
- self.SafeDumper = yaml.SafeDumper
-
- @classmethod
- def save(cls, file="data.yaml", data=None, header=""):
- """
- Save Python object as YAML file.
-
- Args:
- file (str | Path): Path to save YAML file.
- data (dict | None): Dict or compatible object to save.
- header (str): Optional string to add at file beginning.
- """
- instance = cls._get_instance()
- if data is None:
- data = {}
-
- # Create parent directories if needed
- file = Path(file)
- file.parent.mkdir(parents=True, exist_ok=True)
-
- # Convert non-serializable objects to strings
- valid_types = int, float, str, bool, list, tuple, dict, type(None)
- for k, v in data.items():
- if not isinstance(v, valid_types):
- data[k] = str(v)
-
- # Write YAML file
- with open(file, "w", errors="ignore", encoding="utf-8") as f:
- if header:
- f.write(header)
- instance.yaml.dump(data, f, sort_keys=False, allow_unicode=True, Dumper=instance.SafeDumper)
-
- @classmethod
- def load(cls, file="data.yaml", append_filename=False):
- """
- Load YAML file to Python object with robust error handling.
-
- Args:
- file (str | Path): Path to YAML file.
- append_filename (bool): Whether to add filename to returned dict.
-
- Returns:
- (dict): Loaded YAML content.
- """
- instance = cls._get_instance()
- assert str(file).endswith((".yaml", ".yml")), f"Not a YAML file: {file}"
-
- # Read file content
- with open(file, errors="ignore", encoding="utf-8") as f:
- s = f.read()
-
- # Try loading YAML with fallback for problematic characters
- try:
- data = instance.yaml.load(s, Loader=instance.SafeLoader) or {}
- except Exception:
- # Remove problematic characters and retry
- s = re.sub(r"[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD\U00010000-\U0010ffff]+", "", s)
- data = instance.yaml.load(s, Loader=instance.SafeLoader) or {}
-
- # Check for accidental user-error None strings (should be 'null' in YAML)
- if "None" in data.values():
- data = {k: None if v == "None" else v for k, v in data.items()}
-
- if append_filename:
- data["yaml_file"] = str(file)
- return data
-
- @classmethod
- def print(cls, yaml_file):
- """
- Pretty print YAML file or object to console.
-
- Args:
- yaml_file (str | Path | dict): Path to YAML file or dict to print.
- """
- instance = cls._get_instance()
-
- # Load file if path provided
- yaml_dict = cls.load(yaml_file) if isinstance(yaml_file, (str, Path)) else yaml_file
-
- # Use -1 for unlimited width in C implementation
- dump = instance.yaml.dump(yaml_dict, sort_keys=False, allow_unicode=True, width=-1, Dumper=instance.SafeDumper)
-
- LOGGER.info(f"Printing '{colorstr('bold', 'black', yaml_file)}'\n\n{dump}")
-
-
-# Default configuration
-DEFAULT_CFG_DICT = YAML.load(DEFAULT_CFG_PATH)
-DEFAULT_CFG_KEYS = DEFAULT_CFG_DICT.keys()
-DEFAULT_CFG = IterableSimpleNamespace(**DEFAULT_CFG_DICT)
-
-
-def read_device_model() -> str:
- """
- Read the device model information from the system and cache it for quick access.
-
- Returns:
- (str): Kernel release information.
- """
- return platform.release().lower()
-
-
-def is_ubuntu() -> bool:
- """
- Check if the OS is Ubuntu.
-
- Returns:
- (bool): True if OS is Ubuntu, False otherwise.
- """
- try:
- with open("/etc/os-release") as f:
- return "ID=ubuntu" in f.read()
- except FileNotFoundError:
- return False
-
-
-def is_colab():
- """
- Check if the current script is running inside a Google Colab notebook.
-
- Returns:
- (bool): True if running inside a Colab notebook, False otherwise.
- """
- return "COLAB_RELEASE_TAG" in os.environ or "COLAB_BACKEND_VERSION" in os.environ
-
-
-def is_kaggle():
- """
- Check if the current script is running inside a Kaggle kernel.
-
- Returns:
- (bool): True if running inside a Kaggle kernel, False otherwise.
- """
- return os.environ.get("PWD") == "/kaggle/working" and os.environ.get("KAGGLE_URL_BASE") == "https://www.kaggle.com"
-
-
-def is_jupyter():
- """
- Check if the current script is running inside a Jupyter Notebook.
-
- Returns:
- (bool): True if running inside a Jupyter Notebook, False otherwise.
-
- Notes:
- - Only works on Colab and Kaggle; other environments such as JupyterLab and Paperspace are not reliably detectable.
- - The '"get_ipython" in globals()' check gives false positives when the IPython package is installed manually.
- """
- return IS_COLAB or IS_KAGGLE
-
-
-def is_runpod():
- """
- Check if the current script is running inside a RunPod container.
-
- Returns:
- (bool): True if running in RunPod, False otherwise.
- """
- return "RUNPOD_POD_ID" in os.environ
-
-
-def is_docker() -> bool:
- """
- Determine if the script is running inside a Docker container.
-
- Returns:
- (bool): True if the script is running inside a Docker container, False otherwise.
- """
- try:
- return os.path.exists("/.dockerenv")
- except Exception:
- return False
-
-
-def is_raspberrypi() -> bool:
- """
- Determine if the Python environment is running on a Raspberry Pi.
-
- Returns:
- (bool): True if running on a Raspberry Pi, False otherwise.
- """
- return "rpi" in DEVICE_MODEL
-
-
-@lru_cache(maxsize=3)
-def is_jetson(jetpack=None) -> bool:
- """
- Determine if the Python environment is running on an NVIDIA Jetson device.
-
- Args:
- jetpack (int | None): If specified, check for specific JetPack version (4, 5, 6).
-
- Returns:
- (bool): True if running on an NVIDIA Jetson device, False otherwise.
- """
- if jetson := ("tegra" in DEVICE_MODEL):
- if jetpack:
- try:
- content = open("/etc/nv_tegra_release").read()
- version_map = {4: "R32", 5: "R35", 6: "R36"} # JetPack to L4T major version mapping
- return jetpack in version_map and version_map[jetpack] in content
- except Exception:
- return False
- return jetson
-
-
-def is_online() -> bool:
- """
- Fast online check using DNS (v4/v6) resolution (Cloudflare + Google).
-
- Returns:
- (bool): True if connection is successful, False otherwise.
- """
- if str(os.getenv("YOLO_OFFLINE", "")).lower() == "true":
- return False
-
- for host in ("one.one.one.one", "dns.google"):
- try:
- socket.getaddrinfo(host, 0, socket.AF_UNSPEC, 0, 0, socket.AI_ADDRCONFIG)
- return True
- except OSError:
- continue
- return False
-
-
-def is_pip_package(filepath: str = __name__) -> bool:
- """
- Determine if the file at the given filepath is part of a pip package.
-
- Args:
- filepath (str): The filepath to check.
-
- Returns:
- (bool): True if the file is part of a pip package, False otherwise.
- """
- import importlib.util
-
- # Get the spec for the module
- spec = importlib.util.find_spec(filepath)
-
- # Return whether the spec is not None and the origin is not None (indicating it is a package)
- return spec is not None and spec.origin is not None
-
-
-def is_dir_writeable(dir_path: str | Path) -> bool:
- """
- Check if a directory is writeable.
-
- Args:
- dir_path (str | Path): The path to the directory.
-
- Returns:
- (bool): True if the directory is writeable, False otherwise.
- """
- return os.access(str(dir_path), os.W_OK)
-
-
-def is_pytest_running():
- """
- Determine whether pytest is currently running or not.
-
- Returns:
- (bool): True if pytest is running, False otherwise.
- """
- return ("PYTEST_CURRENT_TEST" in os.environ) or ("pytest" in sys.modules) or ("pytest" in Path(ARGV[0]).stem)
-
-
-def is_github_action_running() -> bool:
- """
- Determine if the current environment is a GitHub Actions runner.
-
- Returns:
- (bool): True if the current environment is a GitHub Actions runner, False otherwise.
- """
- return "GITHUB_ACTIONS" in os.environ and "GITHUB_WORKFLOW" in os.environ and "RUNNER_OS" in os.environ
-
-
-def get_default_args(func):
- """
- Return a dictionary of default arguments for a function.
-
- Args:
- func (callable): The function to inspect.
-
- Returns:
- (dict): A dictionary where each key is a parameter name, and each value is the default value of that parameter.
- """
- signature = inspect.signature(func)
- return {k: v.default for k, v in signature.parameters.items() if v.default is not inspect.Parameter.empty}
-
-
-def get_ubuntu_version():
- """
- Retrieve the Ubuntu version if the OS is Ubuntu.
-
- Returns:
- (str): Ubuntu version or None if not an Ubuntu OS.
- """
- if is_ubuntu():
- try:
- with open("/etc/os-release") as f:
- return re.search(r'VERSION_ID="(\d+\.\d+)"', f.read())[1]
- except (FileNotFoundError, AttributeError):
- return None
-
-
-def get_user_config_dir(sub_dir="Ultralytics"):
- """
- Return a writable config dir, preferring YOLO_CONFIG_DIR and being OS-aware.
-
- Args:
- sub_dir (str): The name of the subdirectory to create.
-
- Returns:
- (Path): The path to the user config directory.
- """
- if env_dir := os.getenv("YOLO_CONFIG_DIR"):
- p = Path(env_dir).expanduser() / sub_dir
- elif LINUX:
- p = Path(os.getenv("XDG_CONFIG_HOME", Path.home() / ".config")) / sub_dir
- elif WINDOWS:
- p = Path.home() / "AppData" / "Roaming" / sub_dir
- elif MACOS:
- p = Path.home() / "Library" / "Application Support" / sub_dir
- else:
- raise ValueError(f"Unsupported operating system: {platform.system()}")
-
- if p.exists(): # already created → trust it
- return p
- if is_dir_writeable(p.parent): # create if possible
- p.mkdir(parents=True, exist_ok=True)
- return p
-
- # Fallbacks for Docker, GCP/AWS functions where only /tmp is writeable
- for alt in [Path("/tmp") / sub_dir, Path.cwd() / sub_dir]:
- if alt.exists():
- return alt
- if is_dir_writeable(alt.parent):
- alt.mkdir(parents=True, exist_ok=True)
- LOGGER.warning(
- f"user config directory '{p}' is not writeable, using '{alt}'. Set YOLO_CONFIG_DIR to override."
- )
- return alt
-
- # Last fallback → CWD
- p = Path.cwd() / sub_dir
- p.mkdir(parents=True, exist_ok=True)
- return p
-
-
-# Define constants (required below)
-DEVICE_MODEL = read_device_model() # is_jetson() and is_raspberrypi() depend on this constant
-ONLINE = is_online()
-IS_COLAB = is_colab()
-IS_KAGGLE = is_kaggle()
-IS_DOCKER = is_docker()
-IS_JETSON = is_jetson()
-IS_JUPYTER = is_jupyter()
-IS_PIP_PACKAGE = is_pip_package()
-IS_RASPBERRYPI = is_raspberrypi()
-GIT = GitRepo()
-USER_CONFIG_DIR = get_user_config_dir() # Ultralytics settings dir
-SETTINGS_FILE = USER_CONFIG_DIR / "settings.json"
-
-
-def colorstr(*input):
- r"""
- Color a string based on the provided color and style arguments using ANSI escape codes.
-
- This function can be called in two ways:
- - colorstr('color', 'style', 'your string')
- - colorstr('your string')
-
- In the second form, 'blue' and 'bold' will be applied by default.
-
- Args:
- *input (str | Path): A sequence of strings where the first n-1 strings are color and style arguments,
- and the last string is the one to be colored.
-
- Returns:
- (str): The input string wrapped with ANSI escape codes for the specified color and style.
-
- Notes:
- Supported Colors and Styles:
- - Basic Colors: 'black', 'red', 'green', 'yellow', 'blue', 'magenta', 'cyan', 'white'
- - Bright Colors: 'bright_black', 'bright_red', 'bright_green', 'bright_yellow',
- 'bright_blue', 'bright_magenta', 'bright_cyan', 'bright_white'
- - Misc: 'end', 'bold', 'underline'
-
- Examples:
- >>> colorstr("blue", "bold", "hello world")
- >>> "\033[34m\033[1mhello world\033[0m"
-
- References:
- https://en.wikipedia.org/wiki/ANSI_escape_code
- """
- *args, string = input if len(input) > 1 else ("blue", "bold", input[0]) # color arguments, string
- colors = {
- "black": "\033[30m", # basic colors
- "red": "\033[31m",
- "green": "\033[32m",
- "yellow": "\033[33m",
- "blue": "\033[34m",
- "magenta": "\033[35m",
- "cyan": "\033[36m",
- "white": "\033[37m",
- "bright_black": "\033[90m", # bright colors
- "bright_red": "\033[91m",
- "bright_green": "\033[92m",
- "bright_yellow": "\033[93m",
- "bright_blue": "\033[94m",
- "bright_magenta": "\033[95m",
- "bright_cyan": "\033[96m",
- "bright_white": "\033[97m",
- "end": "\033[0m", # misc
- "bold": "\033[1m",
- "underline": "\033[4m",
- }
- return "".join(colors[x] for x in args) + f"{string}" + colors["end"]
-
-
-def remove_colorstr(input_string):
- """
- Remove ANSI escape codes from a string, effectively un-coloring it.
-
- Args:
- input_string (str): The string to remove color and style from.
-
- Returns:
- (str): A new string with all ANSI escape codes removed.
-
- Examples:
- >>> remove_colorstr(colorstr("blue", "bold", "hello world"))
- >>> "hello world"
- """
- ansi_escape = re.compile(r"\x1B\[[0-9;]*[A-Za-z]")
- return ansi_escape.sub("", input_string)
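A quick round trip with the two helpers (rendering the color requires an ANSI-capable terminal):

colored = colorstr("red", "bold", "training complete")
print(colored)                   # renders bold red in ANSI-capable terminals
print(remove_colorstr(colored))  # 'training complete' with the escape codes stripped
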
-
-
-class TryExcept(contextlib.ContextDecorator):
- """
- Ultralytics TryExcept class for handling exceptions gracefully.
-
- This class can be used as a decorator or context manager to catch exceptions and optionally print warning messages.
- It allows code to continue execution even when exceptions occur, which is useful for non-critical operations.
-
- Attributes:
- msg (str): Optional message to display when an exception occurs.
- verbose (bool): Whether to print the exception message.
-
- Examples:
- As a decorator:
- >>> @TryExcept(msg="Error occurred in func", verbose=True)
- >>> def func():
- >>> # Function logic here
- >>> pass
-
- As a context manager:
- >>> with TryExcept(msg="Error occurred in block", verbose=True):
- >>> # Code block here
- >>> pass
- """
-
- def __init__(self, msg="", verbose=True):
- """Initialize TryExcept class with optional message and verbosity settings."""
- self.msg = msg
- self.verbose = verbose
-
- def __enter__(self):
- """Execute when entering TryExcept context, initialize instance."""
- pass
-
- def __exit__(self, exc_type, value, traceback):
- """Define behavior when exiting a 'with' block, print error message if necessary."""
- if self.verbose and value:
- LOGGER.warning(f"{self.msg}{': ' if self.msg else ''}{value}")
- return True
-
-
-class Retry(contextlib.ContextDecorator):
- """
- Retry class for function execution with exponential backoff.
-
- This decorator can be used to retry a function on exceptions, up to a specified number of times with an
- exponentially increasing delay between retries. It's useful for handling transient failures in network
- operations or other unreliable processes.
-
- Attributes:
- times (int): Maximum number of retry attempts.
- delay (int): Initial delay between retries in seconds.
-
- Examples:
- Example usage as a decorator:
- >>> @Retry(times=3, delay=2)
- >>> def test_func():
- >>> # Replace with function logic that may raise exceptions
- >>> return True
- """
-
- def __init__(self, times=3, delay=2):
- """Initialize Retry class with specified number of retries and delay."""
- self.times = times
- self.delay = delay
- self._attempts = 0
-
- def __call__(self, func):
- """Decorator implementation for Retry with exponential backoff."""
-
- def wrapped_func(*args, **kwargs):
- """Apply retries to the decorated function or method."""
- self._attempts = 0
- while self._attempts < self.times:
- try:
- return func(*args, **kwargs)
- except Exception as e:
- self._attempts += 1
- LOGGER.warning(f"Retry {self._attempts}/{self.times} failed: {e}")
- if self._attempts >= self.times:
- raise e
- time.sleep(self.delay * (2**self._attempts)) # exponential backoff delay
-
- return wrapped_func
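Usage sketch with a deliberately flaky function; the failure counter exists only to force the first two attempts to raise:

attempts = {"n": 0}

@Retry(times=3, delay=1)
def flaky_download():
    attempts["n"] += 1
    if attempts["n"] < 3:
        raise ConnectionError("transient network error")  # first two calls fail
    return "ok"

print(flaky_download())  # sleeps 2s then 4s between attempts, returns 'ok' on the third try
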
-
-
-def threaded(func):
- """
- Multi-thread a target function by default and return the thread or function result.
-
- This decorator provides flexible execution of the target function, either in a separate thread or synchronously.
- By default, the function runs in a thread, but this can be controlled via the 'threaded=False' keyword argument
- which is removed from kwargs before calling the function.
-
- Args:
- func (callable): The function to be potentially executed in a separate thread.
-
- Returns:
- (callable): A wrapper function that either returns a daemon thread or the direct function result.
-
- Examples:
- >>> @threaded
- ... def process_data(data):
- ... return data
- >>>
- >>> thread = process_data(my_data) # Runs in background thread
- >>> result = process_data(my_data, threaded=False) # Runs synchronously, returns function result
- """
-
- def wrapper(*args, **kwargs):
- """Multi-thread a given function based on 'threaded' kwarg and return the thread or function result."""
- if kwargs.pop("threaded", True): # run in thread
- thread = threading.Thread(target=func, args=args, kwargs=kwargs, daemon=True)
- thread.start()
- return thread
- else:
- return func(*args, **kwargs)
-
- return wrapper
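Both call modes of the decorator in one sketch:

import time

@threaded
def slow_square(x):
    time.sleep(0.1)
    return x * x

t = slow_square(4)                     # returns a daemon Thread; the result is discarded
t.join()
print(slow_square(4, threaded=False))  # 16, runs synchronously and returns the value
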
-
-
-def set_sentry():
- """
- Initialize the Sentry SDK for error tracking and reporting.
-
- Only used if sentry_sdk package is installed and sync=True in settings. Run 'yolo settings' to see and update
- settings.
-
- Conditions required to send errors (ALL conditions must be met or no errors will be reported):
- - sentry_sdk package is installed
- - sync=True in YOLO settings
- - pytest is not running
- - running in a pip package installation
- - running in a non-git directory
- - running with rank -1 or 0
- - online environment
- - CLI used to run package (checked with 'yolo' as the name of the main CLI command)
- """
- if (
- not SETTINGS["sync"]
- or RANK not in {-1, 0}
- or Path(ARGV[0]).name != "yolo"
- or TESTS_RUNNING
- or not ONLINE
- or not IS_PIP_PACKAGE
- or GIT.is_repo
- ):
- return
- # If sentry_sdk package is not installed then return and do not use Sentry
- try:
- import sentry_sdk # noqa
- except ImportError:
- return
-
- def before_send(event, hint):
- """
- Modify the event before sending it to Sentry based on specific exception types and messages.
-
- Args:
- event (dict): The event dictionary containing information about the error.
- hint (dict): A dictionary containing additional information about the error.
-
- Returns:
- (dict | None): The modified event or None if the event should not be sent to Sentry.
- """
- if "exc_info" in hint:
- exc_type, exc_value, _ = hint["exc_info"]
- if exc_type in {KeyboardInterrupt, FileNotFoundError} or "out of memory" in str(exc_value):
- return None # do not send event
-
- event["tags"] = {
- "sys_argv": ARGV[0],
- "sys_argv_name": Path(ARGV[0]).name,
- "install": "git" if GIT.is_repo else "pip" if IS_PIP_PACKAGE else "other",
- "os": ENVIRONMENT,
- }
- return event
-
- sentry_sdk.init(
- dsn="https://888e5a0778212e1d0314c37d4b9aae5d@o4504521589325824.ingest.us.sentry.io/4504521592406016",
- debug=False,
- auto_enabling_integrations=False,
- traces_sample_rate=1.0,
- release=__version__,
- environment="runpod" if is_runpod() else "production",
- before_send=before_send,
- ignore_errors=[KeyboardInterrupt, FileNotFoundError],
- )
- sentry_sdk.set_user({"id": SETTINGS["uuid"]}) # SHA-256 anonymized UUID hash
-
-
-class JSONDict(dict):
- """
- A dictionary-like class that provides JSON persistence for its contents.
-
- This class extends the built-in dictionary to automatically save its contents to a JSON file whenever they are
- modified. It ensures thread-safe operations using a lock and handles JSON serialization of Path objects.
-
- Attributes:
- file_path (Path): The path to the JSON file used for persistence.
- lock (threading.Lock): A lock object to ensure thread-safe operations.
-
- Methods:
- _load: Load the data from the JSON file into the dictionary.
- _save: Save the current state of the dictionary to the JSON file.
- __setitem__: Store a key-value pair and persist it to disk.
- __delitem__: Remove an item and update the persistent storage.
- update: Update the dictionary and persist changes.
- clear: Clear all entries and update the persistent storage.
-
- Examples:
- >>> json_dict = JSONDict("data.json")
- >>> json_dict["key"] = "value"
- >>> print(json_dict["key"])
- value
- >>> del json_dict["key"]
- >>> json_dict.update({"new_key": "new_value"})
- >>> json_dict.clear()
- """
-
- def __init__(self, file_path: str | Path = "data.json"):
- """Initialize a JSONDict object with a specified file path for JSON persistence."""
- super().__init__()
- self.file_path = Path(file_path)
- self.lock = Lock()
- self._load()
-
- def _load(self):
- """Load the data from the JSON file into the dictionary."""
- try:
- if self.file_path.exists():
- with open(self.file_path) as f:
- self.update(json.load(f))
- except json.JSONDecodeError:
- LOGGER.warning(f"Error decoding JSON from {self.file_path}. Starting with an empty dictionary.")
- except Exception as e:
- LOGGER.error(f"Error reading from {self.file_path}: {e}")
-
- def _save(self):
- """Save the current state of the dictionary to the JSON file."""
- try:
- self.file_path.parent.mkdir(parents=True, exist_ok=True)
- with open(self.file_path, "w", encoding="utf-8") as f:
- json.dump(dict(self), f, indent=2, default=self._json_default)
- except Exception as e:
- LOGGER.error(f"Error writing to {self.file_path}: {e}")
-
- @staticmethod
- def _json_default(obj):
- """Handle JSON serialization of Path objects."""
- if isinstance(obj, Path):
- return str(obj)
- raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")
-
- def __setitem__(self, key, value):
- """Store a key-value pair and persist to disk."""
- with self.lock:
- super().__setitem__(key, value)
- self._save()
-
- def __delitem__(self, key):
- """Remove an item and update the persistent storage."""
- with self.lock:
- super().__delitem__(key)
- self._save()
-
- def __str__(self):
- """Return a pretty-printed JSON string representation of the dictionary."""
- contents = json.dumps(dict(self), indent=2, ensure_ascii=False, default=self._json_default)
- return f'JSONDict("{self.file_path}"):\n{contents}'
-
- def update(self, *args, **kwargs):
- """Update the dictionary and persist changes."""
- with self.lock:
- super().update(*args, **kwargs)
- self._save()
-
- def clear(self):
- """Clear all entries and update the persistent storage."""
- with self.lock:
- super().clear()
- self._save()
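A persistence round trip in a temporary directory shows the write-through behavior (the file path is illustrative):

import tempfile
from pathlib import Path

path = Path(tempfile.gettempdir()) / "jsondict_demo.json"
d1 = JSONDict(path)
d1["epochs"] = 100        # written to disk immediately

d2 = JSONDict(path)       # a fresh instance reloads the same file
print(d2["epochs"])       # 100
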
-
-
-class SettingsManager(JSONDict):
- """
- SettingsManager class for managing and persisting Ultralytics settings.
-
- This class extends JSONDict to provide JSON persistence for settings, ensuring thread-safe operations and default
- values. It validates settings on initialization and provides methods to update or reset settings. The settings
- include directories for datasets, weights, and runs, as well as various integration flags.
-
- Attributes:
- file (Path): The path to the JSON file used for persistence.
- version (str): The version of the settings schema.
- defaults (dict): A dictionary containing default settings.
- help_msg (str): A help message for users on how to view and update settings.
-
- Methods:
- _validate_settings: Validate the current settings and reset if necessary.
- update: Update settings, validating keys and types.
- reset: Reset the settings to default and save them.
-
- Examples:
- Initialize and update settings:
- >>> settings = SettingsManager()
- >>> settings.update(runs_dir="/new/runs/dir")
- >>> print(settings["runs_dir"])
- /new/runs/dir
- """
-
- def __init__(self, file=SETTINGS_FILE, version="0.0.6"):
- """Initialize the SettingsManager with default settings and load user settings."""
- import hashlib
- import uuid
-
- from ultralytics.utils.torch_utils import torch_distributed_zero_first
-
- root = GIT.root or Path()
- datasets_root = (root.parent if GIT.root and is_dir_writeable(root.parent) else root).resolve()
-
- self.file = Path(file)
- self.version = version
- self.defaults = {
- "settings_version": version, # Settings schema version
- "datasets_dir": str(datasets_root / "datasets"), # Datasets directory
- "weights_dir": str(root / "weights"), # Model weights directory
- "runs_dir": str(root / "runs"), # Experiment runs directory
- "uuid": hashlib.sha256(str(uuid.getnode()).encode()).hexdigest(), # SHA-256 anonymized UUID hash
- "sync": True, # Enable synchronization
- "api_key": "", # Ultralytics API Key
- "openai_api_key": "", # OpenAI API Key
- "clearml": True, # ClearML integration
- "comet": True, # Comet integration
- "dvc": True, # DVC integration
- "hub": True, # Ultralytics HUB integration
- "mlflow": True, # MLflow integration
- "neptune": True, # Neptune integration
- "raytune": True, # Ray Tune integration
- "tensorboard": False, # TensorBoard logging
- "wandb": False, # Weights & Biases logging
- "vscode_msg": True, # VSCode message
- "openvino_msg": True, # OpenVINO export on Intel CPU message
- }
-
- self.help_msg = (
- f"\nView Ultralytics Settings with 'yolo settings' or at '{self.file}'"
- "\nUpdate Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. "
- "For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings."
- )
-
- with torch_distributed_zero_first(LOCAL_RANK):
- super().__init__(self.file)
-
- if not self.file.exists() or not self: # Check if file doesn't exist or is empty
- LOGGER.info(f"Creating new Ultralytics Settings v{version} file ✅ {self.help_msg}")
- self.reset()
-
- self._validate_settings()
-
- def _validate_settings(self):
- """Validate the current settings and reset if necessary."""
- correct_keys = frozenset(self.keys()) == frozenset(self.defaults.keys())
- correct_types = all(isinstance(self.get(k), type(v)) for k, v in self.defaults.items())
- correct_version = self.get("settings_version", "") == self.version
-
- if not (correct_keys and correct_types and correct_version):
- LOGGER.warning(
- "Ultralytics settings reset to default values. This may be due to a possible problem "
- f"with your settings or a recent ultralytics package update. {self.help_msg}"
- )
- self.reset()
-
- if self.get("datasets_dir") == self.get("runs_dir"):
- LOGGER.warning(
- f"Ultralytics setting 'datasets_dir: {self.get('datasets_dir')}' "
- f"must be different than 'runs_dir: {self.get('runs_dir')}'. "
- f"Please change one to avoid possible issues during training. {self.help_msg}"
- )
-
- def __setitem__(self, key, value):
- """Update one key: value pair."""
- self.update({key: value})
-
- def update(self, *args, **kwargs):
- """Update settings, validating keys and types."""
- for arg in args:
- if isinstance(arg, dict):
- kwargs.update(arg)
- for k, v in kwargs.items():
- if k not in self.defaults:
- raise KeyError(f"No Ultralytics setting '{k}'. {self.help_msg}")
- t = type(self.defaults[k])
- if not isinstance(v, t):
- raise TypeError(
- f"Ultralytics setting '{k}' must be '{t.__name__}' type, not '{type(v).__name__}'. {self.help_msg}"
- )
- super().update(*args, **kwargs)
-
- def reset(self):
- """Reset the settings to default and save them."""
- self.clear()
- self.update(self.defaults)
-
-
-def deprecation_warn(arg, new_arg=None):
- """Issue a deprecation warning when a deprecated argument is used, suggesting an updated argument."""
- msg = f"'{arg}' is deprecated and will be removed in the future."
- if new_arg is not None:
- msg += f" Use '{new_arg}' instead."
- LOGGER.warning(msg)
-
-
-def clean_url(url):
- """Strip auth from URL, i.e. https://url.com/file.txt?auth -> https://url.com/file.txt."""
- url = Path(url).as_posix().replace(":/", "://") # Pathlib turns :// -> :/, as_posix() for Windows
- return unquote(url).split("?", 1)[0] # '%2F' to '/', split https://url.com/file.txt?auth
-
-
-def url2file(url):
- """Convert URL to filename, i.e. https://url.com/file.txt?auth -> file.txt."""
- return Path(clean_url(url)).name
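Quick demonstration of the two URL helpers on a sample URL:

url = "https://ultralytics.com/assets/file.txt?token=abc123"
print(clean_url(url))  # https://ultralytics.com/assets/file.txt
print(url2file(url))   # file.txt
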
-
-
-def vscode_msg(ext="ultralytics.ultralytics-snippets") -> str:
- """Display a message to install Ultralytics-Snippets for VS Code if not already installed."""
- path = (USER_CONFIG_DIR.parents[2] if WINDOWS else USER_CONFIG_DIR.parents[1]) / ".vscode/extensions"
- obs_file = path / ".obsolete" # file tracks uninstalled extensions, while source directory remains
- installed = any(path.glob(f"{ext}*")) and ext not in (obs_file.read_text("utf-8") if obs_file.exists() else "")
- url = "https://docs.ultralytics.com/integrations/vscode"
- return "" if installed else f"{colorstr('VS Code:')} view Ultralytics VS Code Extension ⚡ at {url}"
-
-
-# Run below code on utils init ------------------------------------------------------------------------------------
-
-# Check first-install steps
-PREFIX = colorstr("Ultralytics: ")
-SETTINGS = SettingsManager() # initialize settings
-PERSISTENT_CACHE = JSONDict(USER_CONFIG_DIR / "persistent_cache.json") # initialize persistent cache
-DATASETS_DIR = Path(SETTINGS["datasets_dir"]) # global datasets directory
-WEIGHTS_DIR = Path(SETTINGS["weights_dir"]) # global weights directory
-RUNS_DIR = Path(SETTINGS["runs_dir"]) # global runs directory
-ENVIRONMENT = (
- "Colab"
- if IS_COLAB
- else "Kaggle"
- if IS_KAGGLE
- else "Jupyter"
- if IS_JUPYTER
- else "Docker"
- if IS_DOCKER
- else platform.system()
-)
-TESTS_RUNNING = is_pytest_running() or is_github_action_running()
-set_sentry()
-
-# Apply monkey patches
-torch.save = torch_save
-if WINDOWS:
- # Apply cv2 patches for non-ASCII and non-UTF characters in image paths
- cv2.imread, cv2.imwrite, cv2.imshow = imread, imwrite, imshow
diff --git a/ultralytics/utils/__pycache__/__init__.cpython-310.pyc b/ultralytics/utils/__pycache__/__init__.cpython-310.pyc
deleted file mode 100644
index ba71194..0000000
Binary files a/ultralytics/utils/__pycache__/__init__.cpython-310.pyc and /dev/null differ
diff --git a/ultralytics/utils/__pycache__/autobatch.cpython-310.pyc b/ultralytics/utils/__pycache__/autobatch.cpython-310.pyc
deleted file mode 100644
index 27fd61c..0000000
Binary files a/ultralytics/utils/__pycache__/autobatch.cpython-310.pyc and /dev/null differ
diff --git a/ultralytics/utils/__pycache__/checks.cpython-310.pyc b/ultralytics/utils/__pycache__/checks.cpython-310.pyc
deleted file mode 100644
index 808e940..0000000
Binary files a/ultralytics/utils/__pycache__/checks.cpython-310.pyc and /dev/null differ
diff --git a/ultralytics/utils/__pycache__/cpu.cpython-310.pyc b/ultralytics/utils/__pycache__/cpu.cpython-310.pyc
deleted file mode 100644
index a474098..0000000
Binary files a/ultralytics/utils/__pycache__/cpu.cpython-310.pyc and /dev/null differ
diff --git a/ultralytics/utils/__pycache__/dist.cpython-310.pyc b/ultralytics/utils/__pycache__/dist.cpython-310.pyc
deleted file mode 100644
index 3b0477d..0000000
Binary files a/ultralytics/utils/__pycache__/dist.cpython-310.pyc and /dev/null differ
diff --git a/ultralytics/utils/__pycache__/downloads.cpython-310.pyc b/ultralytics/utils/__pycache__/downloads.cpython-310.pyc
deleted file mode 100644
index 54af114..0000000
Binary files a/ultralytics/utils/__pycache__/downloads.cpython-310.pyc and /dev/null differ
diff --git a/ultralytics/utils/__pycache__/errors.cpython-310.pyc b/ultralytics/utils/__pycache__/errors.cpython-310.pyc
deleted file mode 100644
index a819cab..0000000
Binary files a/ultralytics/utils/__pycache__/errors.cpython-310.pyc and /dev/null differ
diff --git a/ultralytics/utils/__pycache__/events.cpython-310.pyc b/ultralytics/utils/__pycache__/events.cpython-310.pyc
deleted file mode 100644
index f837209..0000000
Binary files a/ultralytics/utils/__pycache__/events.cpython-310.pyc and /dev/null differ
diff --git a/ultralytics/utils/__pycache__/files.cpython-310.pyc b/ultralytics/utils/__pycache__/files.cpython-310.pyc
deleted file mode 100644
index 8ffb38d..0000000
Binary files a/ultralytics/utils/__pycache__/files.cpython-310.pyc and /dev/null differ
diff --git a/ultralytics/utils/__pycache__/git.cpython-310.pyc b/ultralytics/utils/__pycache__/git.cpython-310.pyc
deleted file mode 100644
index 84c553d..0000000
Binary files a/ultralytics/utils/__pycache__/git.cpython-310.pyc and /dev/null differ
diff --git a/ultralytics/utils/__pycache__/instance.cpython-310.pyc b/ultralytics/utils/__pycache__/instance.cpython-310.pyc
deleted file mode 100644
index 8de8105..0000000
Binary files a/ultralytics/utils/__pycache__/instance.cpython-310.pyc and /dev/null differ
diff --git a/ultralytics/utils/__pycache__/loss.cpython-310.pyc b/ultralytics/utils/__pycache__/loss.cpython-310.pyc
deleted file mode 100644
index aa0dbce..0000000
Binary files a/ultralytics/utils/__pycache__/loss.cpython-310.pyc and /dev/null differ
diff --git a/ultralytics/utils/__pycache__/metrics.cpython-310.pyc b/ultralytics/utils/__pycache__/metrics.cpython-310.pyc
deleted file mode 100644
index e8de956..0000000
Binary files a/ultralytics/utils/__pycache__/metrics.cpython-310.pyc and /dev/null differ
diff --git a/ultralytics/utils/__pycache__/nms.cpython-310.pyc b/ultralytics/utils/__pycache__/nms.cpython-310.pyc
deleted file mode 100644
index 4085b3f..0000000
Binary files a/ultralytics/utils/__pycache__/nms.cpython-310.pyc and /dev/null differ
diff --git a/ultralytics/utils/__pycache__/ops.cpython-310.pyc b/ultralytics/utils/__pycache__/ops.cpython-310.pyc
deleted file mode 100644
index 64cb8d9..0000000
Binary files a/ultralytics/utils/__pycache__/ops.cpython-310.pyc and /dev/null differ
diff --git a/ultralytics/utils/__pycache__/patches.cpython-310.pyc b/ultralytics/utils/__pycache__/patches.cpython-310.pyc
deleted file mode 100644
index 81b64c3..0000000
Binary files a/ultralytics/utils/__pycache__/patches.cpython-310.pyc and /dev/null differ
diff --git a/ultralytics/utils/__pycache__/plotting.cpython-310.pyc b/ultralytics/utils/__pycache__/plotting.cpython-310.pyc
deleted file mode 100644
index 94a759e..0000000
Binary files a/ultralytics/utils/__pycache__/plotting.cpython-310.pyc and /dev/null differ
diff --git a/ultralytics/utils/__pycache__/tal.cpython-310.pyc b/ultralytics/utils/__pycache__/tal.cpython-310.pyc
deleted file mode 100644
index 5b2a39b..0000000
Binary files a/ultralytics/utils/__pycache__/tal.cpython-310.pyc and /dev/null differ
diff --git a/ultralytics/utils/__pycache__/torch_utils.cpython-310.pyc b/ultralytics/utils/__pycache__/torch_utils.cpython-310.pyc
deleted file mode 100644
index 8708a1d..0000000
Binary files a/ultralytics/utils/__pycache__/torch_utils.cpython-310.pyc and /dev/null differ
diff --git a/ultralytics/utils/__pycache__/tqdm.cpython-310.pyc b/ultralytics/utils/__pycache__/tqdm.cpython-310.pyc
deleted file mode 100644
index 1d1c1b8..0000000
Binary files a/ultralytics/utils/__pycache__/tqdm.cpython-310.pyc and /dev/null differ
diff --git a/ultralytics/utils/autobatch.py b/ultralytics/utils/autobatch.py
deleted file mode 100644
index ef67cb4..0000000
--- a/ultralytics/utils/autobatch.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-"""Functions for estimating the best YOLO batch size to use a fraction of the available CUDA memory in PyTorch."""
-
-from __future__ import annotations
-
-import os
-from copy import deepcopy
-
-import numpy as np
-import torch
-
-from ultralytics.utils import DEFAULT_CFG, LOGGER, colorstr
-from ultralytics.utils.torch_utils import autocast, profile_ops
-
-
-def check_train_batch_size(
- model: torch.nn.Module,
- imgsz: int = 640,
- amp: bool = True,
- batch: int | float = -1,
- max_num_obj: int = 1,
-) -> int:
- """
- Compute optimal YOLO training batch size using the autobatch() function.
-
- Args:
- model (torch.nn.Module): YOLO model to check batch size for.
- imgsz (int, optional): Image size used for training.
- amp (bool, optional): Use automatic mixed precision if True.
- batch (int | float, optional): Fraction of GPU memory to use. If -1, use default.
- max_num_obj (int, optional): The maximum number of objects from dataset.
-
- Returns:
- (int): Optimal batch size computed using the autobatch() function.
-
- Notes:
- If 0.0 < batch < 1.0, it's used as the fraction of GPU memory to use.
- Otherwise, a default fraction of 0.6 is used.
- """
- with autocast(enabled=amp):
- return autobatch(
- deepcopy(model).train(), imgsz, fraction=batch if 0.0 < batch < 1.0 else 0.6, max_num_obj=max_num_obj
- )
-
-
-def autobatch(
- model: torch.nn.Module,
- imgsz: int = 640,
- fraction: float = 0.60,
- batch_size: int = DEFAULT_CFG.batch,
- max_num_obj: int = 1,
-) -> int:
- """
- Automatically estimate the best YOLO batch size to use a fraction of the available CUDA memory.
-
- Args:
- model (torch.nn.Module): YOLO model to compute batch size for.
- imgsz (int, optional): The image size used as input for the YOLO model.
- fraction (float, optional): The fraction of available CUDA memory to use.
- batch_size (int, optional): The default batch size to use if an error is detected.
- max_num_obj (int, optional): The maximum number of objects from dataset.
-
- Returns:
- (int): The optimal batch size.
- """
- # Check device
- prefix = colorstr("AutoBatch: ")
- LOGGER.info(f"{prefix}Computing optimal batch size for imgsz={imgsz} at {fraction * 100}% CUDA memory utilization.")
- device = next(model.parameters()).device # get model device
- if device.type in {"cpu", "mps"}:
- LOGGER.warning(f"{prefix}intended for CUDA devices, using default batch-size {batch_size}")
- return batch_size
- if torch.backends.cudnn.benchmark:
- LOGGER.warning(f"{prefix}Requires torch.backends.cudnn.benchmark=False, using default batch-size {batch_size}")
- return batch_size
-
- # Inspect CUDA memory
- gb = 1 << 30 # bytes to GiB (1024 ** 3)
- d = f"CUDA:{os.getenv('CUDA_VISIBLE_DEVICES', '0').strip()[0]}" # 'CUDA:0'
- properties = torch.cuda.get_device_properties(device) # device properties
- t = properties.total_memory / gb # GiB total
- r = torch.cuda.memory_reserved(device) / gb # GiB reserved
- a = torch.cuda.memory_allocated(device) / gb # GiB allocated
- f = t - (r + a) # GiB free
- LOGGER.info(f"{prefix}{d} ({properties.name}) {t:.2f}G total, {r:.2f}G reserved, {a:.2f}G allocated, {f:.2f}G free")
-
- # Profile batch sizes
- batch_sizes = [1, 2, 4, 8, 16] if t < 16 else [1, 2, 4, 8, 16, 32, 64]
- try:
- img = [torch.empty(b, 3, imgsz, imgsz) for b in batch_sizes]
- results = profile_ops(img, model, n=1, device=device, max_num_obj=max_num_obj)
-
- # Fit a solution
- xy = [
- [x, y[2]]
- for i, (x, y) in enumerate(zip(batch_sizes, results))
- if y # valid result
- and isinstance(y[2], (int, float)) # is numeric
- and 0 < y[2] < t # between 0 and GPU limit
- and (i == 0 or not results[i - 1] or y[2] > results[i - 1][2]) # first item or increasing memory
- ]
- fit_x, fit_y = zip(*xy) if xy else ([], [])
-        p = np.polyfit(fit_x, fit_y, deg=1)  # first-degree polynomial fit of memory vs batch size
-        b = int((round(f * fraction) - p[1]) / p[0])  # solve the fit for the optimal batch size
- if None in results: # some sizes failed
- i = results.index(None) # first fail index
-            if b >= batch_sizes[i]:  # estimated batch size at or above the failure point
- b = batch_sizes[max(i - 1, 0)] # select prior safe point
- if b < 1 or b > 1024: # b outside of safe range
- LOGGER.warning(f"{prefix}batch={b} outside safe range, using default batch-size {batch_size}.")
- b = batch_size
-
- fraction = (np.polyval(p, b) + r + a) / t # predicted fraction
- LOGGER.info(f"{prefix}Using batch-size {b} for {d} {t * fraction:.2f}G/{t:.2f}G ({fraction * 100:.0f}%) ✅")
- return b
- except Exception as e:
- LOGGER.warning(f"{prefix}error detected: {e}, using default batch-size {batch_size}.")
- return batch_size
- finally:
- torch.cuda.empty_cache()
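Note on the "fit a solution" step above: autobatch profiles a few batch sizes, fits a first-degree polynomial to the measured memory, and solves that line for the batch size that lands at the target memory fraction. A minimal, self-contained sketch with made-up measurements (the numbers are illustrative only, not from any real profiling run):

import numpy as np

batch_sizes = [1, 2, 4, 8, 16]          # profiled batch sizes
mem_gib = [0.5, 0.9, 1.7, 3.3, 6.5]     # hypothetical memory used at each size (GiB)
free_gib, fraction = 10.0, 0.60         # free CUDA memory and target utilization

p = np.polyfit(batch_sizes, mem_gib, deg=1)           # memory ≈ p[0] * batch + p[1]
b = int((round(free_gib * fraction) - p[1]) / p[0])   # batch size at the target memory
print(b)  # -> 14 with these example numbers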
diff --git a/ultralytics/utils/autodevice.py b/ultralytics/utils/autodevice.py
deleted file mode 100644
index a0971bc..0000000
--- a/ultralytics/utils/autodevice.py
+++ /dev/null
@@ -1,206 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from __future__ import annotations
-
-from typing import Any
-
-from ultralytics.utils import LOGGER
-from ultralytics.utils.checks import check_requirements
-
-
-class GPUInfo:
- """
- Manages NVIDIA GPU information via pynvml with robust error handling.
-
- Provides methods to query detailed GPU statistics (utilization, memory, temp, power) and select the most idle
- GPUs based on configurable criteria. It safely handles the absence or initialization failure of the pynvml
- library by logging warnings and disabling related features, preventing application crashes.
-
-    If NVML is unavailable or fails to initialize, GPU statistics are disabled and GPU selection returns an empty
-    list. Manages NVML initialization and shutdown internally.
-
- Attributes:
- pynvml (module | None): The `pynvml` module if successfully imported and initialized, otherwise `None`.
- nvml_available (bool): Indicates if `pynvml` is ready for use. True if import and `nvmlInit()` succeeded,
- False otherwise.
- gpu_stats (list[dict[str, Any]]): A list of dictionaries, each holding stats for one GPU. Populated on
- initialization and by `refresh_stats()`. Keys include: 'index', 'name', 'utilization' (%),
- 'memory_used' (MiB), 'memory_total' (MiB), 'memory_free' (MiB), 'temperature' (C), 'power_draw' (W),
- 'power_limit' (W or 'N/A'). Empty if NVML is unavailable or queries fail.
-
- Methods:
- refresh_stats: Refresh the internal gpu_stats list by querying NVML.
- print_status: Print GPU status in a compact table format using current stats.
- select_idle_gpu: Select the most idle GPUs based on utilization and free memory.
- shutdown: Shut down NVML if it was initialized.
-
- Examples:
- Initialize GPUInfo and print status
- >>> gpu_info = GPUInfo()
- >>> gpu_info.print_status()
-
- Select idle GPUs with minimum memory requirements
- >>> selected = gpu_info.select_idle_gpu(count=2, min_memory_fraction=0.2)
- >>> print(f"Selected GPU indices: {selected}")
- """
-
- def __init__(self):
- """Initialize GPUInfo, attempting to import and initialize pynvml."""
- self.pynvml: Any | None = None
- self.nvml_available: bool = False
- self.gpu_stats: list[dict[str, Any]] = []
-
- try:
- check_requirements("nvidia-ml-py>=12.0.0")
- self.pynvml = __import__("pynvml")
- self.pynvml.nvmlInit()
- self.nvml_available = True
- self.refresh_stats()
- except Exception as e:
- LOGGER.warning(f"Failed to initialize pynvml, GPU stats disabled: {e}")
-
- def __del__(self):
- """Ensure NVML is shut down when the object is garbage collected."""
- self.shutdown()
-
- def shutdown(self):
- """Shut down NVML if it was initialized."""
- if self.nvml_available and self.pynvml:
- try:
- self.pynvml.nvmlShutdown()
- except Exception:
- pass
- self.nvml_available = False
-
- def refresh_stats(self):
- """Refresh the internal gpu_stats list by querying NVML."""
- self.gpu_stats = []
- if not self.nvml_available or not self.pynvml:
- return
-
- try:
- device_count = self.pynvml.nvmlDeviceGetCount()
- self.gpu_stats.extend(self._get_device_stats(i) for i in range(device_count))
- except Exception as e:
- LOGGER.warning(f"Error during device query: {e}")
- self.gpu_stats = []
-
- def _get_device_stats(self, index: int) -> dict[str, Any]:
- """Get stats for a single GPU device."""
- handle = self.pynvml.nvmlDeviceGetHandleByIndex(index)
- memory = self.pynvml.nvmlDeviceGetMemoryInfo(handle)
- util = self.pynvml.nvmlDeviceGetUtilizationRates(handle)
-
- def safe_get(func, *args, default=-1, divisor=1):
- try:
- val = func(*args)
- return val // divisor if divisor != 1 and isinstance(val, (int, float)) else val
- except Exception:
- return default
-
- temp_type = getattr(self.pynvml, "NVML_TEMPERATURE_GPU", -1)
-
- return {
- "index": index,
- "name": self.pynvml.nvmlDeviceGetName(handle),
- "utilization": util.gpu if util else -1,
- "memory_used": memory.used >> 20 if memory else -1, # Convert bytes to MiB
- "memory_total": memory.total >> 20 if memory else -1,
- "memory_free": memory.free >> 20 if memory else -1,
- "temperature": safe_get(self.pynvml.nvmlDeviceGetTemperature, handle, temp_type),
- "power_draw": safe_get(self.pynvml.nvmlDeviceGetPowerUsage, handle, divisor=1000), # Convert mW to W
- "power_limit": safe_get(self.pynvml.nvmlDeviceGetEnforcedPowerLimit, handle, divisor=1000),
- }
-
- def print_status(self):
- """Print GPU status in a compact table format using current stats."""
- self.refresh_stats()
- if not self.gpu_stats:
- LOGGER.warning("No GPU stats available.")
- return
-
- stats = self.gpu_stats
- name_len = max(len(gpu.get("name", "N/A")) for gpu in stats)
- hdr = f"{'Idx':<3} {'Name':<{name_len}} {'Util':>6} {'Mem (MiB)':>15} {'Temp':>5} {'Pwr (W)':>10}"
- LOGGER.info(f"\n--- GPU Status ---\n{hdr}\n{'-' * len(hdr)}")
-
- for gpu in stats:
- u = f"{gpu['utilization']:>5}%" if gpu["utilization"] >= 0 else " N/A "
- m = f"{gpu['memory_used']:>6}/{gpu['memory_total']:<6}" if gpu["memory_used"] >= 0 else " N/A / N/A "
- t = f"{gpu['temperature']}C" if gpu["temperature"] >= 0 else " N/A "
- p = f"{gpu['power_draw']:>3}/{gpu['power_limit']:<3}" if gpu["power_draw"] >= 0 else " N/A "
-
- LOGGER.info(f"{gpu.get('index'):<3d} {gpu.get('name', 'N/A'):<{name_len}} {u:>6} {m:>15} {t:>5} {p:>10}")
-
- LOGGER.info(f"{'-' * len(hdr)}\n")
-
- def select_idle_gpu(
- self, count: int = 1, min_memory_fraction: float = 0, min_util_fraction: float = 0
- ) -> list[int]:
- """
- Select the most idle GPUs based on utilization and free memory.
-
- Args:
- count (int): The number of idle GPUs to select.
- min_memory_fraction (float): Minimum free memory required as a fraction of total memory.
- min_util_fraction (float): Minimum free utilization rate required from 0.0 - 1.0.
-
- Returns:
- (list[int]): Indices of the selected GPUs, sorted by idleness (lowest utilization first).
-
- Notes:
- Returns fewer than 'count' if not enough qualify or exist.
-            Returns an empty list if NVML stats are unavailable or no GPUs are found.
- """
- assert min_memory_fraction <= 1.0, f"min_memory_fraction must be <= 1.0, got {min_memory_fraction}"
- assert min_util_fraction <= 1.0, f"min_util_fraction must be <= 1.0, got {min_util_fraction}"
- LOGGER.info(
- f"Searching for {count} idle GPUs with free memory >= {min_memory_fraction * 100:.1f}% and free utilization >= {min_util_fraction * 100:.1f}%..."
- )
-
- if count <= 0:
- return []
-
- self.refresh_stats()
- if not self.gpu_stats:
- LOGGER.warning("NVML stats unavailable.")
- return []
-
- # Filter and sort eligible GPUs
- eligible_gpus = [
- gpu
- for gpu in self.gpu_stats
- if gpu.get("memory_free", 0) / gpu.get("memory_total", 1) >= min_memory_fraction
- and (100 - gpu.get("utilization", 100)) >= min_util_fraction * 100
- ]
- eligible_gpus.sort(key=lambda x: (x.get("utilization", 101), -x.get("memory_free", 0)))
-
- # Select top 'count' indices
- selected = [gpu["index"] for gpu in eligible_gpus[:count]]
-
- if selected:
- LOGGER.info(f"Selected idle CUDA devices {selected}")
- else:
- LOGGER.warning(
- f"No GPUs met criteria (Free Mem >= {min_memory_fraction * 100:.1f}% and Free Util >= {min_util_fraction * 100:.1f}%)."
- )
-
- return selected
-
-
-if __name__ == "__main__":
- required_free_mem_fraction = 0.2 # Require 20% free VRAM
- required_free_util_fraction = 0.2 # Require 20% free utilization
- num_gpus_to_select = 1
-
- gpu_info = GPUInfo()
- gpu_info.print_status()
-
- if selected := gpu_info.select_idle_gpu(
- count=num_gpus_to_select,
- min_memory_fraction=required_free_mem_fraction,
- min_util_fraction=required_free_util_fraction,
- ):
- print(f"\n==> Using selected GPU indices: {selected}")
- devices = [f"cuda:{idx}" for idx in selected]
- print(f" Target devices: {devices}")
diff --git a/ultralytics/utils/benchmarks.py b/ultralytics/utils/benchmarks.py
deleted file mode 100644
index da8f263..0000000
--- a/ultralytics/utils/benchmarks.py
+++ /dev/null
@@ -1,728 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-"""
-Benchmark YOLO model formats for speed and accuracy.
-
-Usage:
- from ultralytics.utils.benchmarks import ProfileModels, benchmark
- ProfileModels(['yolo11n.yaml', 'yolov8s.yaml']).run()
- benchmark(model='yolo11n.pt', imgsz=160)
-
-Format | `format=argument` | Model
---- | --- | ---
-PyTorch | - | yolo11n.pt
-TorchScript | `torchscript` | yolo11n.torchscript
-ONNX | `onnx` | yolo11n.onnx
-OpenVINO | `openvino` | yolo11n_openvino_model/
-TensorRT | `engine` | yolo11n.engine
-CoreML | `coreml` | yolo11n.mlpackage
-TensorFlow SavedModel | `saved_model` | yolo11n_saved_model/
-TensorFlow GraphDef | `pb` | yolo11n.pb
-TensorFlow Lite | `tflite` | yolo11n.tflite
-TensorFlow Edge TPU | `edgetpu` | yolo11n_edgetpu.tflite
-TensorFlow.js | `tfjs` | yolo11n_web_model/
-PaddlePaddle | `paddle` | yolo11n_paddle_model/
-MNN | `mnn` | yolo11n.mnn
-NCNN | `ncnn` | yolo11n_ncnn_model/
-IMX | `imx` | yolo11n_imx_model/
-RKNN | `rknn` | yolo11n_rknn_model/
-"""
-
-from __future__ import annotations
-
-import glob
-import os
-import platform
-import re
-import shutil
-import time
-from pathlib import Path
-
-import numpy as np
-import torch.cuda
-
-from ultralytics import YOLO, YOLOWorld
-from ultralytics.cfg import TASK2DATA, TASK2METRIC
-from ultralytics.engine.exporter import export_formats
-from ultralytics.utils import ARM64, ASSETS, IS_JETSON, LINUX, LOGGER, MACOS, TQDM, WEIGHTS_DIR, YAML
-from ultralytics.utils.checks import IS_PYTHON_3_13, check_imgsz, check_requirements, check_yolo, is_rockchip
-from ultralytics.utils.downloads import safe_download
-from ultralytics.utils.files import file_size
-from ultralytics.utils.torch_utils import get_cpu_info, select_device
-
-
-def benchmark(
- model=WEIGHTS_DIR / "yolo11n.pt",
- data=None,
- imgsz=160,
- half=False,
- int8=False,
- device="cpu",
- verbose=False,
- eps=1e-3,
- format="",
- **kwargs,
-):
- """
- Benchmark a YOLO model across different formats for speed and accuracy.
-
- Args:
- model (str | Path): Path to the model file or directory.
- data (str | None): Dataset to evaluate on, inherited from TASK2DATA if not passed.
- imgsz (int): Image size for the benchmark.
- half (bool): Use half-precision for the model if True.
- int8 (bool): Use int8-precision for the model if True.
- device (str): Device to run the benchmark on, either 'cpu' or 'cuda'.
- verbose (bool | float): If True or a float, assert benchmarks pass with given metric.
- eps (float): Epsilon value for divide by zero prevention.
-        format (str): Export format for benchmarking. If not supplied, all formats are benchmarked.
- **kwargs (Any): Additional keyword arguments for exporter.
-
- Returns:
- (polars.DataFrame): A polars DataFrame with benchmark results for each format, including file size, metric,
- and inference time.
-
- Examples:
- Benchmark a YOLO model with default settings:
- >>> from ultralytics.utils.benchmarks import benchmark
- >>> benchmark(model="yolo11n.pt", imgsz=640)
- """
- imgsz = check_imgsz(imgsz)
- assert imgsz[0] == imgsz[1] if isinstance(imgsz, list) else True, "benchmark() only supports square imgsz."
-
- import polars as pl # scope for faster 'import ultralytics'
-
- pl.Config.set_tbl_cols(-1) # Show all columns
- pl.Config.set_tbl_rows(-1) # Show all rows
- pl.Config.set_tbl_width_chars(-1) # No width limit
- pl.Config.set_tbl_hide_column_data_types(True) # Hide data types
- pl.Config.set_tbl_hide_dataframe_shape(True) # Hide shape info
- pl.Config.set_tbl_formatting("ASCII_BORDERS_ONLY_CONDENSED")
-
- device = select_device(device, verbose=False)
- if isinstance(model, (str, Path)):
- model = YOLO(model)
- is_end2end = getattr(model.model.model[-1], "end2end", False)
- data = data or TASK2DATA[model.task] # task to dataset, i.e. coco8.yaml for task=detect
- key = TASK2METRIC[model.task] # task to metric, i.e. metrics/mAP50-95(B) for task=detect
-
- y = []
- t0 = time.time()
-
- format_arg = format.lower()
- if format_arg:
- formats = frozenset(export_formats()["Argument"])
-        assert format_arg in formats, f"Expected format to be one of {formats}, but got '{format_arg}'."
- for name, format, suffix, cpu, gpu, _ in zip(*export_formats().values()):
- emoji, filename = "❌", None # export defaults
- try:
- if format_arg and format_arg != format:
- continue
-
- # Checks
- if format == "pb":
- assert model.task != "obb", "TensorFlow GraphDef not supported for OBB task"
- elif format == "edgetpu":
- assert LINUX and not ARM64, "Edge TPU export only supported on non-aarch64 Linux"
- elif format in {"coreml", "tfjs"}:
- assert MACOS or (LINUX and not ARM64), (
- "CoreML and TF.js export only supported on macOS and non-aarch64 Linux"
- )
- if format == "coreml":
- assert not IS_PYTHON_3_13, "CoreML not supported on Python 3.13"
- if format in {"saved_model", "pb", "tflite", "edgetpu", "tfjs"}:
- assert not isinstance(model, YOLOWorld), "YOLOWorldv2 TensorFlow exports not supported by onnx2tf yet"
- # assert not IS_PYTHON_MINIMUM_3_12, "TFLite exports not supported on Python>=3.12 yet"
- if format == "paddle":
- assert not isinstance(model, YOLOWorld), "YOLOWorldv2 Paddle exports not supported yet"
- assert model.task != "obb", "Paddle OBB bug https://github.com/PaddlePaddle/Paddle/issues/72024"
- assert not is_end2end, "End-to-end models not supported by PaddlePaddle yet"
- assert (LINUX and not IS_JETSON) or MACOS, "Windows and Jetson Paddle exports not supported yet"
- if format == "mnn":
- assert not isinstance(model, YOLOWorld), "YOLOWorldv2 MNN exports not supported yet"
- if format == "ncnn":
- assert not isinstance(model, YOLOWorld), "YOLOWorldv2 NCNN exports not supported yet"
- if format == "imx":
- assert not is_end2end
- assert not isinstance(model, YOLOWorld), "YOLOWorldv2 IMX exports not supported"
- assert model.task == "detect", "IMX only supported for detection task"
- assert "C2f" in model.__str__(), "IMX only supported for YOLOv8n and YOLO11n"
- if format == "rknn":
- assert not isinstance(model, YOLOWorld), "YOLOWorldv2 RKNN exports not supported yet"
- assert not is_end2end, "End-to-end models not supported by RKNN yet"
- assert LINUX, "RKNN only supported on Linux"
- assert not is_rockchip(), "RKNN Inference only supported on Rockchip devices"
- if "cpu" in device.type:
- assert cpu, "inference not supported on CPU"
- if "cuda" in device.type:
- assert gpu, "inference not supported on GPU"
-
- # Export
- if format == "-":
- filename = model.pt_path or model.ckpt_path or model.model_name
- exported_model = model # PyTorch format
- else:
- filename = model.export(
- imgsz=imgsz, format=format, half=half, int8=int8, data=data, device=device, verbose=False, **kwargs
- )
- exported_model = YOLO(filename, task=model.task)
- assert suffix in str(filename), "export failed"
- emoji = "❎" # indicates export succeeded
-
- # Predict
- assert model.task != "pose" or format != "pb", "GraphDef Pose inference is not supported"
- assert format not in {"edgetpu", "tfjs"}, "inference not supported"
- assert format != "coreml" or platform.system() == "Darwin", "inference only supported on macOS>=10.13"
- if format == "ncnn":
- assert not is_end2end, "End-to-end torch.topk operation is not supported for NCNN prediction yet"
- exported_model.predict(ASSETS / "bus.jpg", imgsz=imgsz, device=device, half=half, verbose=False)
-
- # Validate
- results = exported_model.val(
- data=data,
- batch=1,
- imgsz=imgsz,
- plots=False,
- device=device,
- half=half,
- int8=int8,
- verbose=False,
- conf=0.001, # all the pre-set benchmark mAP values are based on conf=0.001
- )
- metric, speed = results.results_dict[key], results.speed["inference"]
- fps = round(1000 / (speed + eps), 2) # frames per second
- y.append([name, "✅", round(file_size(filename), 1), round(metric, 4), round(speed, 2), fps])
- except Exception as e:
- if verbose:
- assert type(e) is AssertionError, f"Benchmark failure for {name}: {e}"
- LOGGER.error(f"Benchmark failure for {name}: {e}")
- y.append([name, emoji, round(file_size(filename), 1), None, None, None]) # mAP, t_inference
-
- # Print results
- check_yolo(device=device) # print system info
- df = pl.DataFrame(y, schema=["Format", "Status❔", "Size (MB)", key, "Inference time (ms/im)", "FPS"], orient="row")
- df = df.with_row_index(" ", offset=1) # add index info
- df_display = df.with_columns(pl.all().cast(pl.String).fill_null("-"))
-
- name = model.model_name
- dt = time.time() - t0
- legend = "Benchmarks legend: - ✅ Success - ❎ Export passed but validation failed - ❌️ Export failed"
- s = f"\nBenchmarks complete for {name} on {data} at imgsz={imgsz} ({dt:.2f}s)\n{legend}\n{df_display}\n"
- LOGGER.info(s)
- with open("benchmarks.log", "a", errors="ignore", encoding="utf-8") as f:
- f.write(s)
-
- if verbose and isinstance(verbose, float):
- metrics = df[key].to_numpy() # values to compare to floor
- floor = verbose # minimum metric floor to pass, i.e. = 0.29 mAP for YOLOv5n
- assert all(x > floor for x in metrics if not np.isnan(x)), f"Benchmark failure: metric(s) < floor {floor}"
-
- return df_display
-
-
-class RF100Benchmark:
- """
-    Benchmark YOLO model performance on the Roboflow 100 (RF100) dataset collection.
-
- This class provides functionality to benchmark YOLO models on the RF100 dataset collection.
-
- Attributes:
- ds_names (list[str]): Names of datasets used for benchmarking.
- ds_cfg_list (list[Path]): List of paths to dataset configuration files.
- rf (Roboflow): Roboflow instance for accessing datasets.
- val_metrics (list[str]): Metrics used for validation.
-
- Methods:
- set_key: Set Roboflow API key for accessing datasets.
- parse_dataset: Parse dataset links and download datasets.
- fix_yaml: Fix train and validation paths in YAML files.
- evaluate: Evaluate model performance on validation results.
- """
-
- def __init__(self):
- """Initialize the RF100Benchmark class for benchmarking YOLO model performance across various formats."""
- self.ds_names = []
- self.ds_cfg_list = []
- self.rf = None
- self.val_metrics = ["class", "images", "targets", "precision", "recall", "map50", "map95"]
-
- def set_key(self, api_key: str):
- """
- Set Roboflow API key for processing.
-
- Args:
- api_key (str): The API key.
-
- Examples:
- Set the Roboflow API key for accessing datasets:
- >>> benchmark = RF100Benchmark()
- >>> benchmark.set_key("your_roboflow_api_key")
- """
- check_requirements("roboflow")
- from roboflow import Roboflow
-
- self.rf = Roboflow(api_key=api_key)
-
- def parse_dataset(self, ds_link_txt: str = "datasets_links.txt"):
- """
- Parse dataset links and download datasets.
-
- Args:
- ds_link_txt (str): Path to the file containing dataset links.
-
- Returns:
- ds_names (list[str]): List of dataset names.
- ds_cfg_list (list[Path]): List of paths to dataset configuration files.
-
- Examples:
- >>> benchmark = RF100Benchmark()
- >>> benchmark.set_key("api_key")
- >>> benchmark.parse_dataset("datasets_links.txt")
- """
- (shutil.rmtree("rf-100"), os.mkdir("rf-100")) if os.path.exists("rf-100") else os.mkdir("rf-100")
- os.chdir("rf-100")
- os.mkdir("ultralytics-benchmarks")
- safe_download("https://github.com/ultralytics/assets/releases/download/v0.0.0/datasets_links.txt")
-
- with open(ds_link_txt, encoding="utf-8") as file:
- for line in file:
- try:
- _, url, workspace, project, version = re.split("/+", line.strip())
- self.ds_names.append(project)
- proj_version = f"{project}-{version}"
- if not Path(proj_version).exists():
- self.rf.workspace(workspace).project(project).version(version).download("yolov8")
- else:
- LOGGER.info("Dataset already downloaded.")
- self.ds_cfg_list.append(Path.cwd() / proj_version / "data.yaml")
- except Exception:
- continue
-
- return self.ds_names, self.ds_cfg_list
-
- @staticmethod
- def fix_yaml(path: Path):
- """Fix the train and validation paths in a given YAML file."""
- yaml_data = YAML.load(path)
- yaml_data["train"] = "train/images"
- yaml_data["val"] = "valid/images"
- YAML.dump(yaml_data, path)
-
- def evaluate(self, yaml_path: str, val_log_file: str, eval_log_file: str, list_ind: int):
- """
- Evaluate model performance on validation results.
-
- Args:
- yaml_path (str): Path to the YAML configuration file.
- val_log_file (str): Path to the validation log file.
- eval_log_file (str): Path to the evaluation log file.
- list_ind (int): Index of the current dataset in the list.
-
- Returns:
- (float): The mean average precision (mAP) value for the evaluated model.
-
- Examples:
- Evaluate a model on a specific dataset
- >>> benchmark = RF100Benchmark()
- >>> benchmark.evaluate("path/to/data.yaml", "path/to/val_log.txt", "path/to/eval_log.txt", 0)
- """
- skip_symbols = ["🚀", "⚠️", "💡", "❌"]
- class_names = YAML.load(yaml_path)["names"]
- with open(val_log_file, encoding="utf-8") as f:
- lines = f.readlines()
- eval_lines = []
- for line in lines:
- if any(symbol in line for symbol in skip_symbols):
- continue
- entries = line.split(" ")
- entries = list(filter(lambda val: val != "", entries))
- entries = [e.strip("\n") for e in entries]
- eval_lines.extend(
- {
- "class": entries[0],
- "images": entries[1],
- "targets": entries[2],
- "precision": entries[3],
- "recall": entries[4],
- "map50": entries[5],
- "map95": entries[6],
- }
- for e in entries
- if e in class_names or (e == "all" and "(AP)" not in entries and "(AR)" not in entries)
- )
- map_val = 0.0
- if len(eval_lines) > 1:
- LOGGER.info("Multiple dicts found")
- for lst in eval_lines:
- if lst["class"] == "all":
- map_val = lst["map50"]
- else:
- LOGGER.info("Single dict found")
- map_val = [res["map50"] for res in eval_lines][0]
-
- with open(eval_log_file, "a", encoding="utf-8") as f:
- f.write(f"{self.ds_names[list_ind]}: {map_val}\n")
-
- return float(map_val)
-
-
-class ProfileModels:
- """
- ProfileModels class for profiling different models on ONNX and TensorRT.
-
- This class profiles the performance of different models, returning results such as model speed and FLOPs.
-
- Attributes:
- paths (list[str]): Paths of the models to profile.
- num_timed_runs (int): Number of timed runs for the profiling.
- num_warmup_runs (int): Number of warmup runs before profiling.
- min_time (float): Minimum number of seconds to profile for.
- imgsz (int): Image size used in the models.
- half (bool): Flag to indicate whether to use FP16 half-precision for TensorRT profiling.
- trt (bool): Flag to indicate whether to profile using TensorRT.
- device (torch.device): Device used for profiling.
-
- Methods:
- run: Profile YOLO models for speed and accuracy across various formats.
- get_files: Get all relevant model files.
- get_onnx_model_info: Extract metadata from an ONNX model.
- iterative_sigma_clipping: Apply sigma clipping to remove outliers.
- profile_tensorrt_model: Profile a TensorRT model.
- profile_onnx_model: Profile an ONNX model.
- generate_table_row: Generate a table row with model metrics.
- generate_results_dict: Generate a dictionary of profiling results.
- print_table: Print a formatted table of results.
-
- Examples:
- Profile models and print results
- >>> from ultralytics.utils.benchmarks import ProfileModels
- >>> profiler = ProfileModels(["yolo11n.yaml", "yolov8s.yaml"], imgsz=640)
- >>> profiler.run()
- """
-
- def __init__(
- self,
- paths: list[str],
- num_timed_runs: int = 100,
- num_warmup_runs: int = 10,
- min_time: float = 60,
- imgsz: int = 640,
- half: bool = True,
- trt: bool = True,
- device: torch.device | str | None = None,
- ):
- """
- Initialize the ProfileModels class for profiling models.
-
- Args:
- paths (list[str]): List of paths of the models to be profiled.
- num_timed_runs (int): Number of timed runs for the profiling.
- num_warmup_runs (int): Number of warmup runs before the actual profiling starts.
- min_time (float): Minimum time in seconds for profiling a model.
- imgsz (int): Size of the image used during profiling.
- half (bool): Flag to indicate whether to use FP16 half-precision for TensorRT profiling.
- trt (bool): Flag to indicate whether to profile using TensorRT.
- device (torch.device | str | None): Device used for profiling. If None, it is determined automatically.
-
- Notes:
-            The FP16 'half' option is not applied to ONNX profiling, since FP16 is slower than FP32 on CPU.
-
- Examples:
- Initialize and profile models
- >>> from ultralytics.utils.benchmarks import ProfileModels
- >>> profiler = ProfileModels(["yolo11n.yaml", "yolov8s.yaml"], imgsz=640)
- >>> profiler.run()
- """
- self.paths = paths
- self.num_timed_runs = num_timed_runs
- self.num_warmup_runs = num_warmup_runs
- self.min_time = min_time
- self.imgsz = imgsz
- self.half = half
- self.trt = trt # run TensorRT profiling
- self.device = device if isinstance(device, torch.device) else select_device(device)
-
- def run(self):
- """
- Profile YOLO models for speed and accuracy across various formats including ONNX and TensorRT.
-
- Returns:
- (list[dict]): List of dictionaries containing profiling results for each model.
-
- Examples:
- Profile models and print results
- >>> from ultralytics.utils.benchmarks import ProfileModels
- >>> profiler = ProfileModels(["yolo11n.yaml", "yolov8s.yaml"])
- >>> results = profiler.run()
- """
- files = self.get_files()
-
- if not files:
- LOGGER.warning("No matching *.pt or *.onnx files found.")
- return []
-
- table_rows = []
- output = []
- for file in files:
- engine_file = file.with_suffix(".engine")
- if file.suffix in {".pt", ".yaml", ".yml"}:
- model = YOLO(str(file))
- model.fuse() # to report correct params and GFLOPs in model.info()
- model_info = model.info()
- if self.trt and self.device.type != "cpu" and not engine_file.is_file():
- engine_file = model.export(
- format="engine",
- half=self.half,
- imgsz=self.imgsz,
- device=self.device,
- verbose=False,
- )
- onnx_file = model.export(
- format="onnx",
- imgsz=self.imgsz,
- device=self.device,
- verbose=False,
- )
- elif file.suffix == ".onnx":
- model_info = self.get_onnx_model_info(file)
- onnx_file = file
- else:
- continue
-
- t_engine = self.profile_tensorrt_model(str(engine_file))
- t_onnx = self.profile_onnx_model(str(onnx_file))
- table_rows.append(self.generate_table_row(file.stem, t_onnx, t_engine, model_info))
- output.append(self.generate_results_dict(file.stem, t_onnx, t_engine, model_info))
-
- self.print_table(table_rows)
- return output
-
- def get_files(self):
- """
- Return a list of paths for all relevant model files given by the user.
-
- Returns:
- (list[Path]): List of Path objects for the model files.
- """
- files = []
- for path in self.paths:
- path = Path(path)
- if path.is_dir():
- extensions = ["*.pt", "*.onnx", "*.yaml"]
- files.extend([file for ext in extensions for file in glob.glob(str(path / ext))])
- elif path.suffix in {".pt", ".yaml", ".yml"}: # add non-existing
- files.append(str(path))
- else:
- files.extend(glob.glob(str(path)))
-
- LOGGER.info(f"Profiling: {sorted(files)}")
- return [Path(file) for file in sorted(files)]
-
- @staticmethod
- def get_onnx_model_info(onnx_file: str):
- """Extract metadata from an ONNX model file including parameters, GFLOPs, and input shape."""
- return 0.0, 0.0, 0.0, 0.0 # return (num_layers, num_params, num_gradients, num_flops)
-
- @staticmethod
- def iterative_sigma_clipping(data: np.ndarray, sigma: float = 2, max_iters: int = 3):
- """
- Apply iterative sigma clipping to data to remove outliers.
-
- Args:
- data (np.ndarray): Input data array.
- sigma (float): Number of standard deviations to use for clipping.
- max_iters (int): Maximum number of iterations for the clipping process.
-
- Returns:
- (np.ndarray): Clipped data array with outliers removed.
- """
- data = np.array(data)
- for _ in range(max_iters):
- mean, std = np.mean(data), np.std(data)
- clipped_data = data[(data > mean - sigma * std) & (data < mean + sigma * std)]
- if len(clipped_data) == len(data):
- break
- data = clipped_data
- return data
-
- def profile_tensorrt_model(self, engine_file: str, eps: float = 1e-3):
- """
- Profile YOLO model performance with TensorRT, measuring average run time and standard deviation.
-
- Args:
- engine_file (str): Path to the TensorRT engine file.
- eps (float): Small epsilon value to prevent division by zero.
-
- Returns:
- mean_time (float): Mean inference time in milliseconds.
- std_time (float): Standard deviation of inference time in milliseconds.
- """
- if not self.trt or not Path(engine_file).is_file():
- return 0.0, 0.0
-
- # Model and input
- model = YOLO(engine_file)
- input_data = np.zeros((self.imgsz, self.imgsz, 3), dtype=np.uint8) # use uint8 for Classify
-
- # Warmup runs
- elapsed = 0.0
- for _ in range(3):
- start_time = time.time()
- for _ in range(self.num_warmup_runs):
- model(input_data, imgsz=self.imgsz, verbose=False)
- elapsed = time.time() - start_time
-
- # Compute number of runs as higher of min_time or num_timed_runs
- num_runs = max(round(self.min_time / (elapsed + eps) * self.num_warmup_runs), self.num_timed_runs * 50)
-
- # Timed runs
- run_times = []
- for _ in TQDM(range(num_runs), desc=engine_file):
- results = model(input_data, imgsz=self.imgsz, verbose=False)
-            run_times.append(results[0].speed["inference"])  # inference time, already in milliseconds
-
- run_times = self.iterative_sigma_clipping(np.array(run_times), sigma=2, max_iters=3) # sigma clipping
- return np.mean(run_times), np.std(run_times)
-
- def profile_onnx_model(self, onnx_file: str, eps: float = 1e-3):
- """
- Profile an ONNX model, measuring average inference time and standard deviation across multiple runs.
-
- Args:
- onnx_file (str): Path to the ONNX model file.
- eps (float): Small epsilon value to prevent division by zero.
-
- Returns:
- mean_time (float): Mean inference time in milliseconds.
- std_time (float): Standard deviation of inference time in milliseconds.
- """
- check_requirements("onnxruntime")
- import onnxruntime as ort
-
- # Session with either 'TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider'
- sess_options = ort.SessionOptions()
- sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
- sess_options.intra_op_num_threads = 8 # Limit the number of threads
- sess = ort.InferenceSession(onnx_file, sess_options, providers=["CPUExecutionProvider"])
-
- input_tensor = sess.get_inputs()[0]
- input_type = input_tensor.type
- dynamic = not all(isinstance(dim, int) and dim >= 0 for dim in input_tensor.shape) # dynamic input shape
- input_shape = (1, 3, self.imgsz, self.imgsz) if dynamic else input_tensor.shape
-
- # Mapping ONNX datatype to numpy datatype
- if "float16" in input_type:
- input_dtype = np.float16
- elif "float" in input_type:
- input_dtype = np.float32
- elif "double" in input_type:
- input_dtype = np.float64
- elif "int64" in input_type:
- input_dtype = np.int64
- elif "int32" in input_type:
- input_dtype = np.int32
- else:
- raise ValueError(f"Unsupported ONNX datatype {input_type}")
-
- input_data = np.random.rand(*input_shape).astype(input_dtype)
- input_name = input_tensor.name
- output_name = sess.get_outputs()[0].name
-
- # Warmup runs
- elapsed = 0.0
- for _ in range(3):
- start_time = time.time()
- for _ in range(self.num_warmup_runs):
- sess.run([output_name], {input_name: input_data})
- elapsed = time.time() - start_time
-
- # Compute number of runs as higher of min_time or num_timed_runs
- num_runs = max(round(self.min_time / (elapsed + eps) * self.num_warmup_runs), self.num_timed_runs)
-
- # Timed runs
- run_times = []
- for _ in TQDM(range(num_runs), desc=onnx_file):
- start_time = time.time()
- sess.run([output_name], {input_name: input_data})
- run_times.append((time.time() - start_time) * 1000) # Convert to milliseconds
-
- run_times = self.iterative_sigma_clipping(np.array(run_times), sigma=2, max_iters=5) # sigma clipping
- return np.mean(run_times), np.std(run_times)
-
- def generate_table_row(
- self,
- model_name: str,
- t_onnx: tuple[float, float],
- t_engine: tuple[float, float],
- model_info: tuple[float, float, float, float],
- ):
- """
- Generate a table row string with model performance metrics.
-
- Args:
- model_name (str): Name of the model.
- t_onnx (tuple): ONNX model inference time statistics (mean, std).
- t_engine (tuple): TensorRT engine inference time statistics (mean, std).
- model_info (tuple): Model information (layers, params, gradients, flops).
-
- Returns:
- (str): Formatted table row string with model metrics.
- """
- layers, params, gradients, flops = model_info
- return (
- f"| {model_name:18s} | {self.imgsz} | - | {t_onnx[0]:.1f}±{t_onnx[1]:.1f} ms | {t_engine[0]:.1f}±"
- f"{t_engine[1]:.1f} ms | {params / 1e6:.1f} | {flops:.1f} |"
- )
-
- @staticmethod
- def generate_results_dict(
- model_name: str,
- t_onnx: tuple[float, float],
- t_engine: tuple[float, float],
- model_info: tuple[float, float, float, float],
- ):
- """
- Generate a dictionary of profiling results.
-
- Args:
- model_name (str): Name of the model.
- t_onnx (tuple): ONNX model inference time statistics (mean, std).
- t_engine (tuple): TensorRT engine inference time statistics (mean, std).
- model_info (tuple): Model information (layers, params, gradients, flops).
-
- Returns:
- (dict): Dictionary containing profiling results.
- """
- layers, params, gradients, flops = model_info
- return {
- "model/name": model_name,
- "model/parameters": params,
- "model/GFLOPs": round(flops, 3),
- "model/speed_ONNX(ms)": round(t_onnx[0], 3),
- "model/speed_TensorRT(ms)": round(t_engine[0], 3),
- }
-
- @staticmethod
- def print_table(table_rows: list[str]):
- """
- Print a formatted table of model profiling results.
-
- Args:
- table_rows (list[str]): List of formatted table row strings.
- """
- gpu = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "GPU"
- headers = [
- "Model",
- "size
(pixels)",
- "mAPval
50-95",
- f"Speed
CPU ({get_cpu_info()}) ONNX
(ms)",
- f"Speed
{gpu} TensorRT
(ms)",
- "params
(M)",
- "FLOPs
(B)",
- ]
- header = "|" + "|".join(f" {h} " for h in headers) + "|"
- separator = "|" + "|".join("-" * (len(h) + 2) for h in headers) + "|"
-
- LOGGER.info(f"\n\n{header}")
- LOGGER.info(separator)
- for row in table_rows:
- LOGGER.info(row)
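Aside on iterative_sigma_clipping() above: it repeatedly discards timings outside mean ± sigma·std until a pass removes nothing, which keeps a single slow outlier from skewing the reported mean. A stand-alone sketch with invented timings:

import numpy as np

def sigma_clip(data, sigma=2, max_iters=3):
    data = np.asarray(data, dtype=float)
    for _ in range(max_iters):
        mean, std = data.mean(), data.std()
        kept = data[(data > mean - sigma * std) & (data < mean + sigma * std)]
        if kept.size == data.size:  # converged: nothing was clipped this pass
            break
        data = kept
    return data

times_ms = [4.1, 4.0, 4.2, 4.1, 4.0, 4.2, 4.1, 25.0]  # one slow outlier
print(sigma_clip(times_ms))  # -> only the ~4 ms timings remain; 25.0 is dropped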
diff --git a/ultralytics/utils/callbacks/__init__.py b/ultralytics/utils/callbacks/__init__.py
deleted file mode 100644
index 920cc4f..0000000
--- a/ultralytics/utils/callbacks/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from .base import add_integration_callbacks, default_callbacks, get_default_callbacks
-
-__all__ = "add_integration_callbacks", "default_callbacks", "get_default_callbacks"
diff --git a/ultralytics/utils/callbacks/__pycache__/__init__.cpython-310.pyc b/ultralytics/utils/callbacks/__pycache__/__init__.cpython-310.pyc
deleted file mode 100644
index b8e3ff5..0000000
Binary files a/ultralytics/utils/callbacks/__pycache__/__init__.cpython-310.pyc and /dev/null differ
diff --git a/ultralytics/utils/callbacks/__pycache__/base.cpython-310.pyc b/ultralytics/utils/callbacks/__pycache__/base.cpython-310.pyc
deleted file mode 100644
index db97163..0000000
Binary files a/ultralytics/utils/callbacks/__pycache__/base.cpython-310.pyc and /dev/null differ
diff --git a/ultralytics/utils/callbacks/__pycache__/hub.cpython-310.pyc b/ultralytics/utils/callbacks/__pycache__/hub.cpython-310.pyc
deleted file mode 100644
index 0a17e4c..0000000
Binary files a/ultralytics/utils/callbacks/__pycache__/hub.cpython-310.pyc and /dev/null differ
diff --git a/ultralytics/utils/callbacks/__pycache__/platform.cpython-310.pyc b/ultralytics/utils/callbacks/__pycache__/platform.cpython-310.pyc
deleted file mode 100644
index e9e5217..0000000
Binary files a/ultralytics/utils/callbacks/__pycache__/platform.cpython-310.pyc and /dev/null differ
diff --git a/ultralytics/utils/callbacks/base.py b/ultralytics/utils/callbacks/base.py
deleted file mode 100644
index 46e529b..0000000
--- a/ultralytics/utils/callbacks/base.py
+++ /dev/null
@@ -1,235 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-"""Base callbacks for Ultralytics training, validation, prediction, and export processes."""
-
-from collections import defaultdict
-from copy import deepcopy
-
-# Trainer callbacks ----------------------------------------------------------------------------------------------------
-
-
-def on_pretrain_routine_start(trainer):
- """Called before the pretraining routine starts."""
- pass
-
-
-def on_pretrain_routine_end(trainer):
- """Called after the pretraining routine ends."""
- pass
-
-
-def on_train_start(trainer):
- """Called when the training starts."""
- pass
-
-
-def on_train_epoch_start(trainer):
- """Called at the start of each training epoch."""
- pass
-
-
-def on_train_batch_start(trainer):
- """Called at the start of each training batch."""
- pass
-
-
-def optimizer_step(trainer):
- """Called when the optimizer takes a step."""
- pass
-
-
-def on_before_zero_grad(trainer):
- """Called before the gradients are set to zero."""
- pass
-
-
-def on_train_batch_end(trainer):
- """Called at the end of each training batch."""
- pass
-
-
-def on_train_epoch_end(trainer):
- """Called at the end of each training epoch."""
- pass
-
-
-def on_fit_epoch_end(trainer):
- """Called at the end of each fit epoch (train + val)."""
- pass
-
-
-def on_model_save(trainer):
- """Called when the model is saved."""
- pass
-
-
-def on_train_end(trainer):
- """Called when the training ends."""
- pass
-
-
-def on_params_update(trainer):
- """Called when the model parameters are updated."""
- pass
-
-
-def teardown(trainer):
- """Called during the teardown of the training process."""
- pass
-
-
-# Validator callbacks --------------------------------------------------------------------------------------------------
-
-
-def on_val_start(validator):
- """Called when the validation starts."""
- pass
-
-
-def on_val_batch_start(validator):
- """Called at the start of each validation batch."""
- pass
-
-
-def on_val_batch_end(validator):
- """Called at the end of each validation batch."""
- pass
-
-
-def on_val_end(validator):
- """Called when the validation ends."""
- pass
-
-
-# Predictor callbacks --------------------------------------------------------------------------------------------------
-
-
-def on_predict_start(predictor):
- """Called when the prediction starts."""
- pass
-
-
-def on_predict_batch_start(predictor):
- """Called at the start of each prediction batch."""
- pass
-
-
-def on_predict_batch_end(predictor):
- """Called at the end of each prediction batch."""
- pass
-
-
-def on_predict_postprocess_end(predictor):
- """Called after the post-processing of the prediction ends."""
- pass
-
-
-def on_predict_end(predictor):
- """Called when the prediction ends."""
- pass
-
-
-# Exporter callbacks ---------------------------------------------------------------------------------------------------
-
-
-def on_export_start(exporter):
- """Called when the model export starts."""
- pass
-
-
-def on_export_end(exporter):
- """Called when the model export ends."""
- pass
-
-
-default_callbacks = {
- # Run in trainer
- "on_pretrain_routine_start": [on_pretrain_routine_start],
- "on_pretrain_routine_end": [on_pretrain_routine_end],
- "on_train_start": [on_train_start],
- "on_train_epoch_start": [on_train_epoch_start],
- "on_train_batch_start": [on_train_batch_start],
- "optimizer_step": [optimizer_step],
- "on_before_zero_grad": [on_before_zero_grad],
- "on_train_batch_end": [on_train_batch_end],
- "on_train_epoch_end": [on_train_epoch_end],
- "on_fit_epoch_end": [on_fit_epoch_end], # fit = train + val
- "on_model_save": [on_model_save],
- "on_train_end": [on_train_end],
- "on_params_update": [on_params_update],
- "teardown": [teardown],
- # Run in validator
- "on_val_start": [on_val_start],
- "on_val_batch_start": [on_val_batch_start],
- "on_val_batch_end": [on_val_batch_end],
- "on_val_end": [on_val_end],
- # Run in predictor
- "on_predict_start": [on_predict_start],
- "on_predict_batch_start": [on_predict_batch_start],
- "on_predict_postprocess_end": [on_predict_postprocess_end],
- "on_predict_batch_end": [on_predict_batch_end],
- "on_predict_end": [on_predict_end],
- # Run in exporter
- "on_export_start": [on_export_start],
- "on_export_end": [on_export_end],
-}
-
-
-def get_default_callbacks():
- """
- Get the default callbacks for Ultralytics training, validation, prediction, and export processes.
-
- Returns:
- (dict): Dictionary of default callbacks for various training events. Each key represents an event during the
- training process, and the corresponding value is a list of callback functions executed when that event
- occurs.
-
- Examples:
- >>> callbacks = get_default_callbacks()
- >>> print(list(callbacks.keys())) # show all available callback events
- ['on_pretrain_routine_start', 'on_pretrain_routine_end', ...]
- """
- return defaultdict(list, deepcopy(default_callbacks))
-
-
-def add_integration_callbacks(instance):
- """
- Add integration callbacks to the instance's callbacks dictionary.
-
- This function loads and adds various integration callbacks to the provided instance. The specific callbacks added
- depend on the type of instance provided. All instances receive HUB callbacks, while Trainer instances also receive
- additional callbacks for various integrations like ClearML, Comet, DVC, MLflow, Neptune, Ray Tune, TensorBoard,
- and Weights & Biases.
-
- Args:
- instance (Trainer | Predictor | Validator | Exporter): The object instance to which callbacks will be added.
- The type of instance determines which callbacks are loaded.
-
- Examples:
- >>> from ultralytics.engine.trainer import BaseTrainer
- >>> trainer = BaseTrainer()
- >>> add_integration_callbacks(trainer)
- """
- from .hub import callbacks as hub_cb
- from .platform import callbacks as platform_cb
-
- # Load Ultralytics callbacks
- callbacks_list = [hub_cb, platform_cb]
-
- # Load training callbacks
- if "Trainer" in instance.__class__.__name__:
- from .clearml import callbacks as clear_cb
- from .comet import callbacks as comet_cb
- from .dvc import callbacks as dvc_cb
- from .mlflow import callbacks as mlflow_cb
- from .neptune import callbacks as neptune_cb
- from .raytune import callbacks as tune_cb
- from .tensorboard import callbacks as tb_cb
- from .wb import callbacks as wb_cb
-
- callbacks_list.extend([clear_cb, comet_cb, dvc_cb, mlflow_cb, neptune_cb, tune_cb, tb_cb, wb_cb])
-
- # Add the callbacks to the callbacks dictionary
- for callbacks in callbacks_list:
- for k, v in callbacks.items():
- if v not in instance.callbacks[k]:
- instance.callbacks[k].append(v)
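For context, the registry returned by get_default_callbacks() is a defaultdict of lists keyed by event name: integrations append functions under an event key, and the trainer fires everything registered for that event. A minimal sketch with a hypothetical trainer stand-in (the dict below is not the real Trainer API):

from collections import defaultdict

callbacks = defaultdict(list)  # stands in for get_default_callbacks()

def my_on_train_start(trainer):
    print(f"training started for {trainer['name']}")

callbacks["on_train_start"].append(my_on_train_start)  # what an integration does

trainer = {"name": "yolo11n"}  # hypothetical stand-in for a trainer instance
for cb in callbacks["on_train_start"]:  # fire every callback registered for this event
    cb(trainer)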
diff --git a/ultralytics/utils/callbacks/clearml.py b/ultralytics/utils/callbacks/clearml.py
deleted file mode 100644
index 446ee01..0000000
--- a/ultralytics/utils/callbacks/clearml.py
+++ /dev/null
@@ -1,154 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from ultralytics.utils import LOGGER, SETTINGS, TESTS_RUNNING
-
-try:
- assert not TESTS_RUNNING # do not log pytest
- assert SETTINGS["clearml"] is True # verify integration is enabled
- import clearml
- from clearml import Task
-
- assert hasattr(clearml, "__version__") # verify package is not directory
-
-except (ImportError, AssertionError):
- clearml = None
-
-
-def _log_debug_samples(files, title: str = "Debug Samples") -> None:
- """
- Log files (images) as debug samples in the ClearML task.
-
- Args:
- files (list[Path]): A list of file paths in PosixPath format.
- title (str): A title that groups together images with the same values.
- """
- import re
-
- if task := Task.current_task():
- for f in files:
- if f.exists():
- it = re.search(r"_batch(\d+)", f.name)
- iteration = int(it.groups()[0]) if it else 0
- task.get_logger().report_image(
- title=title, series=f.name.replace(it.group(), ""), local_path=str(f), iteration=iteration
- )
-
-
-def _log_plot(title: str, plot_path: str) -> None:
- """
- Log an image as a plot in the plot section of ClearML.
-
- Args:
- title (str): The title of the plot.
- plot_path (str): The path to the saved image file.
- """
- import matplotlib.image as mpimg
- import matplotlib.pyplot as plt
-
- img = mpimg.imread(plot_path)
- fig = plt.figure()
- ax = fig.add_axes([0, 0, 1, 1], frameon=False, aspect="auto", xticks=[], yticks=[]) # no ticks
- ax.imshow(img)
-
- Task.current_task().get_logger().report_matplotlib_figure(
- title=title, series="", figure=fig, report_interactive=False
- )
-
-
-def on_pretrain_routine_start(trainer) -> None:
- """Initialize and connect ClearML task at the start of pretraining routine."""
- try:
- if task := Task.current_task():
- # WARNING: make sure the automatic pytorch and matplotlib bindings are disabled!
- # We are logging these plots and model files manually in the integration
- from clearml.binding.frameworks.pytorch_bind import PatchPyTorchModelIO
- from clearml.binding.matplotlib_bind import PatchedMatplotlib
-
- PatchPyTorchModelIO.update_current_task(None)
- PatchedMatplotlib.update_current_task(None)
- else:
- task = Task.init(
- project_name=trainer.args.project or "Ultralytics",
- task_name=trainer.args.name,
- tags=["Ultralytics"],
- output_uri=True,
- reuse_last_task_id=False,
- auto_connect_frameworks={"pytorch": False, "matplotlib": False},
- )
- LOGGER.warning(
- "ClearML Initialized a new task. If you want to run remotely, "
- "please add clearml-init and connect your arguments before initializing YOLO."
- )
- task.connect(vars(trainer.args), name="General")
- except Exception as e:
- LOGGER.warning(f"ClearML installed but not initialized correctly, not logging this run. {e}")
-
-
-def on_train_epoch_end(trainer) -> None:
- """Log debug samples for the first epoch and report current training progress."""
- if task := Task.current_task():
- # Log debug samples for first epoch only
- if trainer.epoch == 1:
- _log_debug_samples(sorted(trainer.save_dir.glob("train_batch*.jpg")), "Mosaic")
- # Report the current training progress
- for k, v in trainer.label_loss_items(trainer.tloss, prefix="train").items():
- task.get_logger().report_scalar("train", k, v, iteration=trainer.epoch)
- for k, v in trainer.lr.items():
- task.get_logger().report_scalar("lr", k, v, iteration=trainer.epoch)
-
-
-def on_fit_epoch_end(trainer) -> None:
- """Report model information and metrics to logger at the end of an epoch."""
- if task := Task.current_task():
- # Report epoch time and validation metrics
- task.get_logger().report_scalar(
- title="Epoch Time", series="Epoch Time", value=trainer.epoch_time, iteration=trainer.epoch
- )
- for k, v in trainer.metrics.items():
- title = k.split("/")[0]
- task.get_logger().report_scalar(title, k, v, iteration=trainer.epoch)
- if trainer.epoch == 0:
- from ultralytics.utils.torch_utils import model_info_for_loggers
-
- for k, v in model_info_for_loggers(trainer).items():
- task.get_logger().report_single_value(k, v)
-
-
-def on_val_end(validator) -> None:
- """Log validation results including labels and predictions."""
- if Task.current_task():
- # Log validation labels and predictions
- _log_debug_samples(sorted(validator.save_dir.glob("val*.jpg")), "Validation")
-
-
-def on_train_end(trainer) -> None:
- """Log final model and training results on training completion."""
- if task := Task.current_task():
- # Log final results, confusion matrix and PR plots
- files = [
- "results.png",
- "confusion_matrix.png",
- "confusion_matrix_normalized.png",
- *(f"{x}_curve.png" for x in ("F1", "PR", "P", "R")),
- ]
- files = [(trainer.save_dir / f) for f in files if (trainer.save_dir / f).exists()] # filter existing files
- for f in files:
- _log_plot(title=f.stem, plot_path=f)
- # Report final metrics
- for k, v in trainer.validator.metrics.results_dict.items():
- task.get_logger().report_single_value(k, v)
- # Log the final model
- task.update_output_model(model_path=str(trainer.best), model_name=trainer.args.name, auto_delete_file=False)
-
-
-callbacks = (
- {
- "on_pretrain_routine_start": on_pretrain_routine_start,
- "on_train_epoch_end": on_train_epoch_end,
- "on_fit_epoch_end": on_fit_epoch_end,
- "on_val_end": on_val_end,
- "on_train_end": on_train_end,
- }
- if clearml
- else {}
-)
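Small aside on _log_debug_samples() above: the batch iteration is recovered from the image filename with a regex, and the matched fragment is stripped to form the series name. A stand-alone sketch with an example filename:

import re

name = "train_batch2.jpg"  # example debug-image filename
it = re.search(r"_batch(\d+)", name)
iteration = int(it.groups()[0]) if it else 0            # -> 2
series = name.replace(it.group(), "") if it else name   # -> "train.jpg"
print(iteration, series)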
diff --git a/ultralytics/utils/callbacks/comet.py b/ultralytics/utils/callbacks/comet.py
deleted file mode 100644
index f094113..0000000
--- a/ultralytics/utils/callbacks/comet.py
+++ /dev/null
@@ -1,639 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from __future__ import annotations
-
-from collections.abc import Callable
-from types import SimpleNamespace
-from typing import Any
-
-import cv2
-import numpy as np
-
-from ultralytics.utils import LOGGER, RANK, SETTINGS, TESTS_RUNNING, ops
-from ultralytics.utils.metrics import ClassifyMetrics, DetMetrics, OBBMetrics, PoseMetrics, SegmentMetrics
-
-try:
- assert not TESTS_RUNNING # do not log pytest
- assert SETTINGS["comet"] is True # verify integration is enabled
- import comet_ml
-
- assert hasattr(comet_ml, "__version__") # verify package is not directory
-
- import os
- from pathlib import Path
-
- # Ensures certain logging functions only run for supported tasks
- COMET_SUPPORTED_TASKS = ["detect", "segment"]
-
- # Names of plots created by Ultralytics that are logged to Comet
- CONFUSION_MATRIX_PLOT_NAMES = "confusion_matrix", "confusion_matrix_normalized"
- EVALUATION_PLOT_NAMES = "F1_curve", "P_curve", "R_curve", "PR_curve"
- LABEL_PLOT_NAMES = ["labels"]
- SEGMENT_METRICS_PLOT_PREFIX = "Box", "Mask"
- POSE_METRICS_PLOT_PREFIX = "Box", "Pose"
- DETECTION_METRICS_PLOT_PREFIX = ["Box"]
- RESULTS_TABLE_NAME = "results.csv"
- ARGS_YAML_NAME = "args.yaml"
-
- _comet_image_prediction_count = 0
-
-except (ImportError, AssertionError):
- comet_ml = None
-
-
-def _get_comet_mode() -> str:
- """Return the Comet mode from environment variables, defaulting to 'online'."""
- comet_mode = os.getenv("COMET_MODE")
- if comet_mode is not None:
- LOGGER.warning(
- "The COMET_MODE environment variable is deprecated. "
- "Please use COMET_START_ONLINE to set the Comet experiment mode. "
- "To start an offline Comet experiment, use 'export COMET_START_ONLINE=0'. "
- "If COMET_START_ONLINE is not set or is set to '1', an online Comet experiment will be created."
- )
- return comet_mode
-
- return "online"
-
-
-def _get_comet_model_name() -> str:
- """Return the Comet model name from environment variable or default to 'Ultralytics'."""
- return os.getenv("COMET_MODEL_NAME", "Ultralytics")
-
-
-def _get_eval_batch_logging_interval() -> int:
- """Get the evaluation batch logging interval from environment variable or use default value 1."""
- return int(os.getenv("COMET_EVAL_BATCH_LOGGING_INTERVAL", 1))
-
-
-def _get_max_image_predictions_to_log() -> int:
- """Get the maximum number of image predictions to log from environment variables."""
- return int(os.getenv("COMET_MAX_IMAGE_PREDICTIONS", 100))
-
-
-def _scale_confidence_score(score: float) -> float:
- """Scale the confidence score by a factor specified in environment variable."""
- scale = float(os.getenv("COMET_MAX_CONFIDENCE_SCORE", 100.0))
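- # e.g. with the default COMET_MAX_CONFIDENCE_SCORE=100.0, a raw confidence of 0.85 is logged as 85.0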
- return score * scale
-
-
-def _should_log_confusion_matrix() -> bool:
- """Determine if the confusion matrix should be logged based on environment variable settings."""
- return os.getenv("COMET_EVAL_LOG_CONFUSION_MATRIX", "false").lower() == "true"
-
-
-def _should_log_image_predictions() -> bool:
- """Determine whether to log image predictions based on environment variable."""
- return os.getenv("COMET_EVAL_LOG_IMAGE_PREDICTIONS", "true").lower() == "true"
-
-
-def _resume_or_create_experiment(args: SimpleNamespace) -> None:
- """
- Resume CometML experiment or create a new experiment based on args.
-
- Ensures that the experiment object is only created in a single process during distributed training.
-
- Args:
- args (SimpleNamespace): Training arguments containing project configuration and other parameters.
- """
- if RANK not in {-1, 0}:
- return
-
- # Set environment variable (if not set by the user) to configure the Comet experiment's online mode under the hood.
- # If COMET_START_ONLINE is set by the user, it overrides the COMET_MODE value.
- if os.getenv("COMET_START_ONLINE") is None:
- comet_mode = _get_comet_mode()
- os.environ["COMET_START_ONLINE"] = "1" if comet_mode != "offline" else "0"
-
- try:
- _project_name = os.getenv("COMET_PROJECT_NAME", args.project)
- experiment = comet_ml.start(project_name=_project_name)
- experiment.log_parameters(vars(args))
- experiment.log_others(
- {
- "eval_batch_logging_interval": _get_eval_batch_logging_interval(),
- "log_confusion_matrix_on_eval": _should_log_confusion_matrix(),
- "log_image_predictions": _should_log_image_predictions(),
- "max_image_predictions": _get_max_image_predictions_to_log(),
- }
- )
- experiment.log_other("Created from", "ultralytics")
-
- except Exception as e:
- LOGGER.warning(f"Comet installed but not initialized correctly, not logging this run. {e}")
-
-
-def _fetch_trainer_metadata(trainer) -> dict:
- """
- Return metadata for YOLO training including epoch and asset saving status.
-
- Args:
- trainer (ultralytics.engine.trainer.BaseTrainer): The YOLO trainer object containing training state and config.
-
- Returns:
- (dict): Dictionary containing current epoch, step, save assets flag, and final epoch flag.
- """
- curr_epoch = trainer.epoch + 1
-
- train_num_steps_per_epoch = len(trainer.train_loader.dataset) // trainer.batch_size
- curr_step = curr_epoch * train_num_steps_per_epoch
- final_epoch = curr_epoch == trainer.epochs
-
- save = trainer.args.save
- save_period = trainer.args.save_period
- save_interval = curr_epoch % save_period == 0
- save_assets = save and save_period > 0 and save_interval and not final_epoch
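- # Illustrative example: with save=True, save_period=5, epochs=100 and trainer.epoch=9, curr_epoch is 10 and
- # 10 % 5 == 0, so save_assets is True and model assets are logged for this epoch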
-
- return dict(curr_epoch=curr_epoch, curr_step=curr_step, save_assets=save_assets, final_epoch=final_epoch)
-
-
-def _scale_bounding_box_to_original_image_shape(
- box, resized_image_shape, original_image_shape, ratio_pad
-) -> list[float]:
- """
- Scale bounding box from resized image coordinates to original image coordinates.
-
- YOLO resizes images during training and the label values are normalized based on this resized shape.
- This function rescales the bounding box labels to the original image shape.
-
- Args:
- box (torch.Tensor): Bounding box in normalized xywh format.
- resized_image_shape (tuple): Shape of the resized image (height, width).
- original_image_shape (tuple): Shape of the original image (height, width).
- ratio_pad (tuple): Ratio and padding information for scaling.
-
- Returns:
- (list[float]): Scaled bounding box coordinates in xywh format with top-left corner adjustment.
- """
- resized_image_height, resized_image_width = resized_image_shape
-
- # Convert normalized xywh format predictions to xyxy in resized scale format
- box = ops.xywhn2xyxy(box, h=resized_image_height, w=resized_image_width)
- # Scale box predictions from resized image scale back to original image scale
- box = ops.scale_boxes(resized_image_shape, box, original_image_shape, ratio_pad)
- # Convert bounding box format from xyxy to xywh for Comet logging
- box = ops.xyxy2xywh(box)
- # Adjust xy center to correspond top-left corner
- box[:2] -= box[2:] / 2
- box = box.tolist()
-
- return box
-
-
-def _format_ground_truth_annotations_for_detection(img_idx, image_path, batch, class_name_map=None) -> dict | None:
- """
- Format ground truth annotations for object detection.
-
- This function processes ground truth annotations from a batch of images for object detection tasks. It extracts
- bounding boxes, class labels, and other metadata for a specific image in the batch, and formats them for
- visualization or evaluation.
-
- Args:
- img_idx (int): Index of the image in the batch to process.
- image_path (str | Path): Path to the image file.
- batch (dict): Batch dictionary containing detection data with keys:
- - 'batch_idx': Tensor of batch indices
- - 'bboxes': Tensor of bounding boxes in normalized xywh format
- - 'cls': Tensor of class labels
- - 'ori_shape': Original image shapes
- - 'resized_shape': Resized image shapes
- - 'ratio_pad': Ratio and padding information
- class_name_map (dict, optional): Mapping from class indices to class names.
-
- Returns:
- (dict | None): Formatted ground truth annotations with the following structure:
- - 'boxes': List of box coordinates [x, y, width, height]
- - 'label': Label string with format "gt_{class_name}"
- - 'score': Confidence score (always 1.0, scaled by _scale_confidence_score)
- Returns None if no bounding boxes are found for the image.
- """
- indices = batch["batch_idx"] == img_idx
- bboxes = batch["bboxes"][indices]
- if len(bboxes) == 0:
- LOGGER.debug(f"Comet Image: {image_path} has no bounding boxes labels")
- return None
-
- cls_labels = batch["cls"][indices].squeeze(1).tolist()
- if class_name_map:
- cls_labels = [str(class_name_map[label]) for label in cls_labels]
-
- original_image_shape = batch["ori_shape"][img_idx]
- resized_image_shape = batch["resized_shape"][img_idx]
- ratio_pad = batch["ratio_pad"][img_idx]
-
- data = []
- for box, label in zip(bboxes, cls_labels):
- box = _scale_bounding_box_to_original_image_shape(box, resized_image_shape, original_image_shape, ratio_pad)
- data.append(
- {
- "boxes": [box],
- "label": f"gt_{label}",
- "score": _scale_confidence_score(1.0),
- }
- )
-
- return {"name": "ground_truth", "data": data}
-
-
-def _format_prediction_annotations(image_path, metadata, class_label_map=None, class_map=None) -> dict | None:
- """
- Format YOLO predictions for object detection visualization.
-
- Args:
- image_path (Path): Path to the image file.
- metadata (dict): Prediction metadata containing bounding boxes and class information.
- class_label_map (dict, optional): Mapping from class indices to class names.
- class_map (dict, optional): Additional class mapping for label conversion.
-
- Returns:
- (dict | None): Formatted prediction annotations or None if no predictions exist.
- """
- stem = image_path.stem
- image_id = int(stem) if stem.isnumeric() else stem
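- # e.g. a COCO-style filename "000000397133.jpg" yields the integer id 397133, while "zidane.jpg" keeps the string "zidane"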
-
- predictions = metadata.get(image_id)
- if not predictions:
- LOGGER.debug(f"Comet Image: {image_path} has no bounding boxes predictions")
- return None
-
- # apply the class mapping that was used for the predicted classes when the JSON was created
- if class_label_map and class_map:
- class_label_map = {class_map[k]: v for k, v in class_label_map.items()}
- try:
- # import pycocotools-style utilities (from faster_coco_eval) to decode compressed annotations, e.g. segmentation masks
- from faster_coco_eval.core.mask import decode # noqa
- except ImportError:
- decode = None
-
- data = []
- for prediction in predictions:
- boxes = prediction["bbox"]
- score = _scale_confidence_score(prediction["score"])
- cls_label = prediction["category_id"]
- if class_label_map:
- cls_label = str(class_label_map[cls_label])
-
- annotation_data = {"boxes": [boxes], "label": cls_label, "score": score}
-
- if decode is not None:
- # do segmentation processing only if we are able to decode it
- segments = prediction.get("segmentation", None)
- if segments is not None:
- segments = _extract_segmentation_annotation(segments, decode)
- if segments is not None:
- annotation_data["points"] = segments
-
- data.append(annotation_data)
-
- return {"name": "prediction", "data": data}
-
-
-def _extract_segmentation_annotation(segmentation_raw: str, decode: Callable) -> list[list[Any]] | None:
- """
- Extract segmentation annotation from compressed segmentations as list of polygons.
-
- Args:
- segmentation_raw (str): Raw segmentation data in compressed format.
- decode (Callable): Function to decode the compressed segmentation data.
-
- Returns:
- (list[list[Any]] | None): List of polygon points or None if extraction fails.
- """
- try:
- mask = decode(segmentation_raw)
- contours, _ = cv2.findContours(mask, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
- annotations = [np.array(polygon).squeeze() for polygon in contours if len(polygon) >= 3]
- return [annotation.ravel().tolist() for annotation in annotations]
- except Exception as e:
- LOGGER.warning(f"Comet Failed to extract segmentation annotation: {e}")
- return None
-
-
-def _fetch_annotations(img_idx, image_path, batch, prediction_metadata_map, class_label_map, class_map) -> list | None:
- """
- Join the ground truth and prediction annotations if they exist.
-
- Args:
- img_idx (int): Index of the image in the batch.
- image_path (Path): Path to the image file.
- batch (dict): Batch data containing ground truth annotations.
- prediction_metadata_map (dict): Map of prediction metadata by image ID.
- class_label_map (dict): Mapping from class indices to class names.
- class_map (dict): Additional class mapping for label conversion.
-
- Returns:
- (list | None): List of annotation dictionaries or None if no annotations exist.
- """
- ground_truth_annotations = _format_ground_truth_annotations_for_detection(
- img_idx, image_path, batch, class_label_map
- )
- prediction_annotations = _format_prediction_annotations(
- image_path, prediction_metadata_map, class_label_map, class_map
- )
-
- annotations = [
- annotation for annotation in [ground_truth_annotations, prediction_annotations] if annotation is not None
- ]
- return [annotations] if annotations else None
-
-
-def _create_prediction_metadata_map(model_predictions) -> dict:
- """Create metadata map for model predictions by grouping them based on image ID."""
- pred_metadata_map = {}
- for prediction in model_predictions:
- pred_metadata_map.setdefault(prediction["image_id"], [])
- pred_metadata_map[prediction["image_id"]].append(prediction)
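- # e.g. two predictions sharing image_id 3 are grouped as {3: [pred_a, pred_b]}, so later per-image lookup is a single dict access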
-
- return pred_metadata_map
-
-
-def _log_confusion_matrix(experiment, trainer, curr_step, curr_epoch) -> None:
- """Log the confusion matrix to Comet experiment."""
- conf_mat = trainer.validator.confusion_matrix.matrix
- names = list(trainer.data["names"].values()) + ["background"]
- experiment.log_confusion_matrix(
- matrix=conf_mat, labels=names, max_categories=len(names), epoch=curr_epoch, step=curr_step
- )
-
-
-def _log_images(experiment, image_paths, curr_step: int | None, annotations=None) -> None:
- """
- Log images to the experiment with optional annotations.
-
- This function logs images to a Comet ML experiment, optionally including annotation data for visualization
- such as bounding boxes or segmentation masks.
-
- Args:
- experiment (comet_ml.CometExperiment): The Comet ML experiment to log images to.
- image_paths (list[Path]): List of paths to images that will be logged.
- curr_step (int): Current training step/iteration for tracking in the experiment timeline.
- annotations (list[list[dict]], optional): Nested list of annotation dictionaries for each image. Each
- annotation contains visualization data like bounding boxes, labels, and confidence scores.
- """
- if annotations:
- for image_path, annotation in zip(image_paths, annotations):
- experiment.log_image(image_path, name=image_path.stem, step=curr_step, annotations=annotation)
-
- else:
- for image_path in image_paths:
- experiment.log_image(image_path, name=image_path.stem, step=curr_step)
-
-
-def _log_image_predictions(experiment, validator, curr_step) -> None:
- """
- Log ground-truth and predicted boxes for validation images during training.
-
- This function logs image predictions to a Comet ML experiment during model validation. It processes
- validation data and formats both ground truth and prediction annotations for visualization in the Comet
- dashboard. The function respects configured limits on the number of images to log.
-
- Args:
- experiment (comet_ml.CometExperiment): The Comet ML experiment to log to.
- validator (BaseValidator): The validator instance containing validation data and predictions.
- curr_step (int): The current training step for logging timeline.
-
- Notes:
- This function uses global state to track the number of logged predictions across calls.
- It only logs predictions for supported tasks defined in COMET_SUPPORTED_TASKS.
- The number of logged images is limited by the COMET_MAX_IMAGE_PREDICTIONS environment variable.
- """
- global _comet_image_prediction_count
-
- task = validator.args.task
- if task not in COMET_SUPPORTED_TASKS:
- return
-
- jdict = validator.jdict
- if not jdict:
- return
-
- predictions_metadata_map = _create_prediction_metadata_map(jdict)
- dataloader = validator.dataloader
- class_label_map = validator.names
- class_map = getattr(validator, "class_map", None)
-
- batch_logging_interval = _get_eval_batch_logging_interval()
- max_image_predictions = _get_max_image_predictions_to_log()
-
- for batch_idx, batch in enumerate(dataloader):
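- # e.g. with COMET_EVAL_BATCH_LOGGING_INTERVAL=4, only every 4th validation batch (batch_idx 3, 7, 11, ...) passes this check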
- if (batch_idx + 1) % batch_logging_interval != 0:
- continue
-
- image_paths = batch["im_file"]
- for img_idx, image_path in enumerate(image_paths):
- if _comet_image_prediction_count >= max_image_predictions:
- return
-
- image_path = Path(image_path)
- annotations = _fetch_annotations(
- img_idx,
- image_path,
- batch,
- predictions_metadata_map,
- class_label_map,
- class_map=class_map,
- )
- _log_images(
- experiment,
- [image_path],
- curr_step,
- annotations=annotations,
- )
- _comet_image_prediction_count += 1
-
-
-def _log_plots(experiment, trainer) -> None:
- """
- Log evaluation plots and label plots for the experiment.
-
- This function logs various evaluation plots and confusion matrices to the experiment tracking system. It handles
- different types of metrics (SegmentMetrics, PoseMetrics, DetMetrics, OBBMetrics) and logs the appropriate plots
- for each type.
-
- Args:
- experiment (comet_ml.CometExperiment): The Comet ML experiment to log plots to.
- trainer (ultralytics.engine.trainer.BaseTrainer): The trainer object containing validation metrics and save
- directory information.
-
- Examples:
- >>> from ultralytics.utils.callbacks.comet import _log_plots
- >>> _log_plots(experiment, trainer)
- """
- plot_filenames = None
- if isinstance(trainer.validator.metrics, SegmentMetrics):
- plot_filenames = [
- trainer.save_dir / f"{prefix}{plots}.png"
- for plots in EVALUATION_PLOT_NAMES
- for prefix in SEGMENT_METRICS_PLOT_PREFIX
- ]
- elif isinstance(trainer.validator.metrics, PoseMetrics):
- plot_filenames = [
- trainer.save_dir / f"{prefix}{plots}.png"
- for plots in EVALUATION_PLOT_NAMES
- for prefix in POSE_METRICS_PLOT_PREFIX
- ]
- elif isinstance(trainer.validator.metrics, (DetMetrics, OBBMetrics)):
- plot_filenames = [
- trainer.save_dir / f"{prefix}{plots}.png"
- for plots in EVALUATION_PLOT_NAMES
- for prefix in DETECTION_METRICS_PLOT_PREFIX
- ]
-
- if plot_filenames is not None:
- _log_images(experiment, plot_filenames, None)
-
- confusion_matrix_filenames = [trainer.save_dir / f"{plots}.png" for plots in CONFUSION_MATRIX_PLOT_NAMES]
- _log_images(experiment, confusion_matrix_filenames, None)
-
- if not isinstance(trainer.validator.metrics, ClassifyMetrics):
- label_plot_filenames = [trainer.save_dir / f"{labels}.jpg" for labels in LABEL_PLOT_NAMES]
- _log_images(experiment, label_plot_filenames, None)
-
-
-def _log_model(experiment, trainer) -> None:
- """Log the best-trained model to Comet.ml."""
- model_name = _get_comet_model_name()
- experiment.log_model(model_name, file_or_folder=str(trainer.best), file_name="best.pt", overwrite=True)
-
-
-def _log_image_batches(experiment, trainer, curr_step: int) -> None:
- """Log samples of image batches for train, validation, and test."""
- _log_images(experiment, trainer.save_dir.glob("train_batch*.jpg"), curr_step)
- _log_images(experiment, trainer.save_dir.glob("val_batch*.jpg"), curr_step)
-
-
-def _log_asset(experiment, asset_path) -> None:
- """
- Log a single asset file (e.g. a YAML config) to the given Comet experiment.
-
- Args:
- experiment (comet_ml.CometExperiment): The experiment instance to which the asset will be logged.
- asset_path (Path): The file path of the asset to log.
- """
- experiment.log_asset(asset_path)
-
-
-def _log_table(experiment, table_path) -> None:
- """
- Log a table file (e.g. results.csv) to the given Comet experiment.
-
- Args:
- experiment (comet_ml.CometExperiment): The experiment object where the table file will be logged.
- table_path (Path): The file path of the table to be logged.
- """
- experiment.log_table(str(table_path))
-
-
-def on_pretrain_routine_start(trainer) -> None:
- """Create or resume a CometML experiment at the start of a YOLO pre-training routine."""
- _resume_or_create_experiment(trainer.args)
-
-
-def on_train_epoch_end(trainer) -> None:
- """Log metrics and save batch images at the end of training epochs."""
- experiment = comet_ml.get_running_experiment()
- if not experiment:
- return
-
- metadata = _fetch_trainer_metadata(trainer)
- curr_epoch = metadata["curr_epoch"]
- curr_step = metadata["curr_step"]
-
- experiment.log_metrics(trainer.label_loss_items(trainer.tloss, prefix="train"), step=curr_step, epoch=curr_epoch)
-
-
-def on_fit_epoch_end(trainer) -> None:
- """
- Log model assets at the end of each epoch during training.
-
- This function is called at the end of each training epoch to log metrics, learning rates, and model information
- to a Comet ML experiment. It also logs model assets, confusion matrices, and image predictions based on
- configuration settings.
-
- The function retrieves the current Comet ML experiment and logs various training metrics. If it's the first epoch,
- it also logs model information. On specified save intervals, it logs the model, confusion matrix (if enabled),
- and image predictions (if enabled).
-
- Args:
- trainer (BaseTrainer): The YOLO trainer object containing training state, metrics, and configuration.
-
- Examples:
- >>> # Inside a training loop
- >>> on_fit_epoch_end(trainer) # Log metrics and assets to Comet ML
- """
- experiment = comet_ml.get_running_experiment()
- if not experiment:
- return
-
- metadata = _fetch_trainer_metadata(trainer)
- curr_epoch = metadata["curr_epoch"]
- curr_step = metadata["curr_step"]
- save_assets = metadata["save_assets"]
-
- experiment.log_metrics(trainer.metrics, step=curr_step, epoch=curr_epoch)
- experiment.log_metrics(trainer.lr, step=curr_step, epoch=curr_epoch)
- if curr_epoch == 1:
- from ultralytics.utils.torch_utils import model_info_for_loggers
-
- experiment.log_metrics(model_info_for_loggers(trainer), step=curr_step, epoch=curr_epoch)
-
- if not save_assets:
- return
-
- _log_model(experiment, trainer)
- if _should_log_confusion_matrix():
- _log_confusion_matrix(experiment, trainer, curr_step, curr_epoch)
- if _should_log_image_predictions():
- _log_image_predictions(experiment, trainer.validator, curr_step)
-
-
-def on_train_end(trainer) -> None:
- """Perform operations at the end of training."""
- experiment = comet_ml.get_running_experiment()
- if not experiment:
- return
-
- metadata = _fetch_trainer_metadata(trainer)
- curr_epoch = metadata["curr_epoch"]
- curr_step = metadata["curr_step"]
- plots = trainer.args.plots
-
- _log_model(experiment, trainer)
- if plots:
- _log_plots(experiment, trainer)
-
- _log_confusion_matrix(experiment, trainer, curr_step, curr_epoch)
- _log_image_predictions(experiment, trainer.validator, curr_step)
- _log_image_batches(experiment, trainer, curr_step)
- # log results table
- table_path = trainer.save_dir / RESULTS_TABLE_NAME
- if table_path.exists():
- _log_table(experiment, table_path)
-
- # log arguments YAML
- args_path = trainer.save_dir / ARGS_YAML_NAME
- if args_path.exists():
- _log_asset(experiment, args_path)
-
- experiment.end()
-
- global _comet_image_prediction_count
- _comet_image_prediction_count = 0
-
-
-callbacks = (
- {
- "on_pretrain_routine_start": on_pretrain_routine_start,
- "on_train_epoch_end": on_train_epoch_end,
- "on_fit_epoch_end": on_fit_epoch_end,
- "on_train_end": on_train_end,
- }
- if comet_ml
- else {}
-)
diff --git a/ultralytics/utils/callbacks/dvc.py b/ultralytics/utils/callbacks/dvc.py
deleted file mode 100644
index 35a16d7..0000000
--- a/ultralytics/utils/callbacks/dvc.py
+++ /dev/null
@@ -1,202 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from pathlib import Path
-
-from ultralytics.utils import LOGGER, SETTINGS, TESTS_RUNNING, checks
-
-try:
- assert not TESTS_RUNNING # do not log pytest
- assert SETTINGS["dvc"] is True # verify integration is enabled
- import dvclive
-
- assert checks.check_version("dvclive", "2.11.0", verbose=True)
-
- import os
- import re
-
- # DVCLive logger instance
- live = None
- _processed_plots = {}
-
- # `on_fit_epoch_end` is also called on the final validation (this probably needs to be fixed); for now this flag
- # is how we distinguish the final evaluation of the best model from last-epoch validation
- _training_epoch = False
-
-except (ImportError, AssertionError, TypeError):
- dvclive = None
-
-
-def _log_images(path: Path, prefix: str = "") -> None:
- """
- Log images at specified path with an optional prefix using DVCLive.
-
- This function logs images found at the given path to DVCLive, organizing them by batch to enable slider
- functionality in the UI. It processes image filenames to extract batch information and restructures the path
- accordingly.
-
- Args:
- path (Path): Path to the image file to be logged.
- prefix (str, optional): Optional prefix to add to the image name when logging.
-
- Examples:
- >>> from pathlib import Path
- >>> _log_images(Path("runs/train/exp/val_batch0_pred.jpg"), prefix="validation")
- """
- if live:
- name = path.name
-
- # Group images by batch to enable sliders in UI
- if m := re.search(r"_batch(\d+)", name):
- ni = m[1]
- new_stem = re.sub(r"_batch(\d+)", "_batch", path.stem)
- name = (Path(new_stem) / ni).with_suffix(path.suffix)
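- # e.g. "val_batch0_pred.jpg" is logged as "val_batch_pred/0.jpg", so DVCLive renders one slider entry per batch index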
-
- live.log_image(os.path.join(prefix, name), path)
-
-
-def _log_plots(plots: dict, prefix: str = "") -> None:
- """
- Log plot images for training progress if they have not been previously processed.
-
- Args:
- plots (dict): Dictionary containing plot information with timestamps.
- prefix (str, optional): Optional prefix to add to the logged image paths.
- """
- for name, params in plots.items():
- timestamp = params["timestamp"]
- if _processed_plots.get(name) != timestamp:
- _log_images(name, prefix)
- _processed_plots[name] = timestamp
-
-
-def _log_confusion_matrix(validator) -> None:
- """
- Log confusion matrix for a validator using DVCLive.
-
- This function processes the confusion matrix from a validator object and logs it to DVCLive by converting
- the matrix into lists of target and prediction labels.
-
- Args:
- validator (BaseValidator): The validator object containing the confusion matrix and class names. Must have
- attributes: confusion_matrix.matrix, confusion_matrix.task, and names.
- """
- targets = []
- preds = []
- matrix = validator.confusion_matrix.matrix
- names = list(validator.names.values())
- if validator.confusion_matrix.task == "detect":
- names += ["background"]
-
- for ti, pred in enumerate(matrix.T.astype(int)):
- for pi, num in enumerate(pred):
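- # expand each cell count into that many (target, prediction) label pairs for the sklearn-style plot below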
- targets.extend([names[ti]] * num)
- preds.extend([names[pi]] * num)
-
- live.log_sklearn_plot("confusion_matrix", targets, preds, name="cf.json", normalized=True)
-
-
-def on_pretrain_routine_start(trainer) -> None:
- """Initialize DVCLive logger for training metadata during pre-training routine."""
- try:
- global live
- live = dvclive.Live(save_dvc_exp=True, cache_images=True)
- LOGGER.info("DVCLive is detected and auto logging is enabled (run 'yolo settings dvc=False' to disable).")
- except Exception as e:
- LOGGER.warning(f"DVCLive installed but not initialized correctly, not logging this run. {e}")
-
-
-def on_pretrain_routine_end(trainer) -> None:
- """Log plots related to the training process at the end of the pretraining routine."""
- _log_plots(trainer.plots, "train")
-
-
-def on_train_start(trainer) -> None:
- """Log the training parameters if DVCLive logging is active."""
- if live:
- live.log_params(trainer.args)
-
-
-def on_train_epoch_start(trainer) -> None:
- """Set the global variable _training_epoch value to True at the start of training each epoch."""
- global _training_epoch
- _training_epoch = True
-
-
-def on_fit_epoch_end(trainer) -> None:
- """
- Log training metrics, model info, and advance to next step at the end of each fit epoch.
-
- This function is called at the end of each fit epoch during training. It logs various metrics including
- training loss items, validation metrics, and learning rates. On the first epoch, it also logs model
- information. Additionally, it logs training and validation plots and advances the DVCLive step counter.
-
- Args:
- trainer (BaseTrainer): The trainer object containing training state, metrics, and plots.
-
- Notes:
- This function only performs logging operations when DVCLive logging is active and during a training epoch.
- The global variable _training_epoch is used to track whether the current epoch is a training epoch.
- """
- global _training_epoch
- if live and _training_epoch:
- all_metrics = {**trainer.label_loss_items(trainer.tloss, prefix="train"), **trainer.metrics, **trainer.lr}
- for metric, value in all_metrics.items():
- live.log_metric(metric, value)
-
- if trainer.epoch == 0:
- from ultralytics.utils.torch_utils import model_info_for_loggers
-
- for metric, value in model_info_for_loggers(trainer).items():
- live.log_metric(metric, value, plot=False)
-
- _log_plots(trainer.plots, "train")
- _log_plots(trainer.validator.plots, "val")
-
- live.next_step()
- _training_epoch = False
-
-
-def on_train_end(trainer) -> None:
- """
- Log best metrics, plots, and confusion matrix at the end of training.
-
- This function is called at the conclusion of the training process to log final metrics, visualizations, and
- model artifacts if DVCLive logging is active. It captures the best model performance metrics, training plots,
- validation plots, and confusion matrix for later analysis.
-
- Args:
- trainer (BaseTrainer): The trainer object containing training state, metrics, and validation results.
-
- Examples:
- >>> # Inside a custom training loop
- >>> from ultralytics.utils.callbacks.dvc import on_train_end
- >>> on_train_end(trainer) # Log final metrics and artifacts
- """
- if live:
- # At the end, log the best metrics; the trainer runs the validator on the best model internally.
- all_metrics = {**trainer.label_loss_items(trainer.tloss, prefix="train"), **trainer.metrics, **trainer.lr}
- for metric, value in all_metrics.items():
- live.log_metric(metric, value, plot=False)
-
- _log_plots(trainer.plots, "val")
- _log_plots(trainer.validator.plots, "val")
- _log_confusion_matrix(trainer.validator)
-
- if trainer.best.exists():
- live.log_artifact(trainer.best, copy=True, type="model")
-
- live.end()
-
-
-callbacks = (
- {
- "on_pretrain_routine_start": on_pretrain_routine_start,
- "on_pretrain_routine_end": on_pretrain_routine_end,
- "on_train_start": on_train_start,
- "on_train_epoch_start": on_train_epoch_start,
- "on_fit_epoch_end": on_fit_epoch_end,
- "on_train_end": on_train_end,
- }
- if dvclive
- else {}
-)
diff --git a/ultralytics/utils/callbacks/hub.py b/ultralytics/utils/callbacks/hub.py
deleted file mode 100644
index 2b57cd1..0000000
--- a/ultralytics/utils/callbacks/hub.py
+++ /dev/null
@@ -1,110 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-import json
-from time import time
-
-from ultralytics.hub import HUB_WEB_ROOT, PREFIX, HUBTrainingSession
-from ultralytics.utils import LOGGER, RANK, SETTINGS
-from ultralytics.utils.events import events
-
-
-def on_pretrain_routine_start(trainer):
- """Create a remote Ultralytics HUB session to log local model training."""
- if RANK in {-1, 0} and SETTINGS["hub"] is True and SETTINGS["api_key"] and trainer.hub_session is None:
- trainer.hub_session = HUBTrainingSession.create_session(trainer.args.model, trainer.args)
-
-
-def on_pretrain_routine_end(trainer):
- """Initialize timers for upload rate limiting before training begins."""
- if session := getattr(trainer, "hub_session", None):
- # Start timer for upload rate limit
- session.timers = {"metrics": time(), "ckpt": time()} # start timer for session rate limiting
-
-
-def on_fit_epoch_end(trainer):
- """Upload training progress metrics to Ultralytics HUB at the end of each epoch."""
- if session := getattr(trainer, "hub_session", None):
- # Upload metrics after validation ends
- all_plots = {
- **trainer.label_loss_items(trainer.tloss, prefix="train"),
- **trainer.metrics,
- }
- if trainer.epoch == 0:
- from ultralytics.utils.torch_utils import model_info_for_loggers
-
- all_plots = {**all_plots, **model_info_for_loggers(trainer)}
-
- session.metrics_queue[trainer.epoch] = json.dumps(all_plots)
-
- # If any metrics failed to upload previously, add them to the queue to attempt uploading again
- if session.metrics_upload_failed_queue:
- session.metrics_queue.update(session.metrics_upload_failed_queue)
-
- if time() - session.timers["metrics"] > session.rate_limits["metrics"]:
- session.upload_metrics()
- session.timers["metrics"] = time() # reset timer
- session.metrics_queue = {} # reset queue
-
-
-def on_model_save(trainer):
- """Upload model checkpoints to Ultralytics HUB with rate limiting."""
- if session := getattr(trainer, "hub_session", None):
- # Upload checkpoints with rate limiting
- is_best = trainer.best_fitness == trainer.fitness
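- # rate_limits["ckpt"] is in seconds; checkpoints produced more frequently than this interval are skipped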
- if time() - session.timers["ckpt"] > session.rate_limits["ckpt"]:
- LOGGER.info(f"{PREFIX}Uploading checkpoint {HUB_WEB_ROOT}/models/{session.model.id}")
- session.upload_model(trainer.epoch, trainer.last, is_best)
- session.timers["ckpt"] = time() # reset timer
-
-
-def on_train_end(trainer):
- """Upload final model and metrics to Ultralytics HUB at the end of training."""
- if session := getattr(trainer, "hub_session", None):
- # Upload final model and metrics with exponential backoff
- LOGGER.info(f"{PREFIX}Syncing final model...")
- session.upload_model(
- trainer.epoch,
- trainer.best,
- map=trainer.metrics.get("metrics/mAP50-95(B)", 0),
- final=True,
- )
- session.alive = False # stop heartbeats
- LOGGER.info(f"{PREFIX}Done ✅\n{PREFIX}View model at {session.model_url} 🚀")
-
-
-def on_train_start(trainer):
- """Run events on train start."""
- events(trainer.args, trainer.device)
-
-
-def on_val_start(validator):
- """Run events on validation start."""
- if not validator.training:
- events(validator.args, validator.device)
-
-
-def on_predict_start(predictor):
- """Run events on predict start."""
- events(predictor.args, predictor.device)
-
-
-def on_export_start(exporter):
- """Run events on export start."""
- events(exporter.args, exporter.device)
-
-
-callbacks = (
- {
- "on_pretrain_routine_start": on_pretrain_routine_start,
- "on_pretrain_routine_end": on_pretrain_routine_end,
- "on_fit_epoch_end": on_fit_epoch_end,
- "on_model_save": on_model_save,
- "on_train_end": on_train_end,
- "on_train_start": on_train_start,
- "on_val_start": on_val_start,
- "on_predict_start": on_predict_start,
- "on_export_start": on_export_start,
- }
- if SETTINGS["hub"] is True
- else {}
-)
diff --git a/ultralytics/utils/callbacks/mlflow.py b/ultralytics/utils/callbacks/mlflow.py
deleted file mode 100644
index f570240..0000000
--- a/ultralytics/utils/callbacks/mlflow.py
+++ /dev/null
@@ -1,135 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-"""
-MLflow Logging for Ultralytics YOLO.
-
-This module enables MLflow logging for Ultralytics YOLO. It logs metrics, parameters, and model artifacts.
-For setting up, a tracking URI should be specified. The logging can be customized using environment variables.
-
-Commands:
- 1. To set a project name:
- `export MLFLOW_EXPERIMENT_NAME=<your_experiment_name>` or use the project=<project_name> argument
-
- 2. To set a run name:
- `export MLFLOW_RUN=<your_run_name>` or use the name=<run_name> argument
-
- 3. To start a local MLflow server:
- mlflow server --backend-store-uri runs/mlflow
- It will by default start a local server at http://127.0.0.1:5000.
- To specify a different URI, set the MLFLOW_TRACKING_URI environment variable.
-
- 4. To kill all running MLflow server instances:
- ps aux | grep 'mlflow' | grep -v 'grep' | awk '{print $2}' | xargs kill -9
-"""
-
-from ultralytics.utils import LOGGER, RUNS_DIR, SETTINGS, TESTS_RUNNING, colorstr
-
-try:
- import os
-
- assert not TESTS_RUNNING or "test_mlflow" in os.environ.get("PYTEST_CURRENT_TEST", "") # do not log pytest
- assert SETTINGS["mlflow"] is True # verify integration is enabled
- import mlflow
-
- assert hasattr(mlflow, "__version__") # verify package is not directory
- from pathlib import Path
-
- PREFIX = colorstr("MLflow: ")
-
-except (ImportError, AssertionError):
- mlflow = None
-
-
-def sanitize_dict(x: dict) -> dict:
- """Sanitize dictionary keys by removing parentheses and converting values to floats."""
- return {k.replace("(", "").replace(")", ""): float(v) for k, v in x.items()}
-
-
-def on_pretrain_routine_end(trainer):
- """
- Log training parameters to MLflow at the end of the pretraining routine.
-
- This function sets up MLflow logging based on environment variables and trainer arguments. It sets the tracking URI,
- experiment name, and run name, then starts the MLflow run if not already active. It finally logs the parameters
- from the trainer.
-
- Args:
- trainer (ultralytics.engine.trainer.BaseTrainer): The training object with arguments and parameters to log.
-
- Environment Variables:
- MLFLOW_TRACKING_URI: The URI for MLflow tracking. If not set, defaults to 'runs/mlflow'.
- MLFLOW_EXPERIMENT_NAME: The name of the MLflow experiment. If not set, defaults to trainer.args.project.
- MLFLOW_RUN: The name of the MLflow run. If not set, defaults to trainer.args.name.
- MLFLOW_KEEP_RUN_ACTIVE: Boolean indicating whether to keep the MLflow run active after training ends.
- """
- global mlflow
-
- uri = os.environ.get("MLFLOW_TRACKING_URI") or str(RUNS_DIR / "mlflow")
- LOGGER.debug(f"{PREFIX} tracking uri: {uri}")
- mlflow.set_tracking_uri(uri)
-
- # Set experiment and run names
- experiment_name = os.environ.get("MLFLOW_EXPERIMENT_NAME") or trainer.args.project or "/Shared/Ultralytics"
- run_name = os.environ.get("MLFLOW_RUN") or trainer.args.name
- mlflow.set_experiment(experiment_name)
-
- mlflow.autolog()
- try:
- active_run = mlflow.active_run() or mlflow.start_run(run_name=run_name)
- LOGGER.info(f"{PREFIX}logging run_id({active_run.info.run_id}) to {uri}")
- if Path(uri).is_dir():
- LOGGER.info(f"{PREFIX}view at http://127.0.0.1:5000 with 'mlflow server --backend-store-uri {uri}'")
- LOGGER.info(f"{PREFIX}disable with 'yolo settings mlflow=False'")
- mlflow.log_params(dict(trainer.args))
- except Exception as e:
- LOGGER.warning(f"{PREFIX}Failed to initialize: {e}")
- LOGGER.warning(f"{PREFIX}Not tracking this run")
-
-
-def on_train_epoch_end(trainer):
- """Log training metrics at the end of each train epoch to MLflow."""
- if mlflow:
- mlflow.log_metrics(
- metrics={
- **sanitize_dict(trainer.lr),
- **sanitize_dict(trainer.label_loss_items(trainer.tloss, prefix="train")),
- },
- step=trainer.epoch,
- )
-
-
-def on_fit_epoch_end(trainer):
- """Log training metrics at the end of each fit epoch to MLflow."""
- if mlflow:
- mlflow.log_metrics(metrics=sanitize_dict(trainer.metrics), step=trainer.epoch)
-
-
-def on_train_end(trainer):
- """Log model artifacts at the end of training."""
- if not mlflow:
- return
- mlflow.log_artifact(str(trainer.best.parent)) # log save_dir/weights directory with best.pt and last.pt
- for f in trainer.save_dir.glob("*"): # log all other files in save_dir
- if f.suffix in {".png", ".jpg", ".csv", ".pt", ".yaml"}:
- mlflow.log_artifact(str(f))
- keep_run_active = os.environ.get("MLFLOW_KEEP_RUN_ACTIVE", "False").lower() == "true"
- if keep_run_active:
- LOGGER.info(f"{PREFIX}mlflow run still alive, remember to close it using mlflow.end_run()")
- else:
- mlflow.end_run()
- LOGGER.debug(f"{PREFIX}mlflow run ended")
-
- LOGGER.info(
- f"{PREFIX}results logged to {mlflow.get_tracking_uri()}\n{PREFIX}disable with 'yolo settings mlflow=False'"
- )
-
-
-callbacks = (
- {
- "on_pretrain_routine_end": on_pretrain_routine_end,
- "on_train_epoch_end": on_train_epoch_end,
- "on_fit_epoch_end": on_fit_epoch_end,
- "on_train_end": on_train_end,
- }
- if mlflow
- else {}
-)
diff --git a/ultralytics/utils/callbacks/neptune.py b/ultralytics/utils/callbacks/neptune.py
deleted file mode 100644
index b27964b..0000000
--- a/ultralytics/utils/callbacks/neptune.py
+++ /dev/null
@@ -1,134 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from ultralytics.utils import LOGGER, SETTINGS, TESTS_RUNNING
-
-try:
- assert not TESTS_RUNNING # do not log pytest
- assert SETTINGS["neptune"] is True # verify integration is enabled
-
- import neptune
- from neptune.types import File
-
- assert hasattr(neptune, "__version__")
-
- run = None # NeptuneAI experiment logger instance
-
-except (ImportError, AssertionError):
- neptune = None
-
-
-def _log_scalars(scalars: dict, step: int = 0) -> None:
- """
- Log scalars to the NeptuneAI experiment logger.
-
- Args:
- scalars (dict): Dictionary of scalar values to log to NeptuneAI.
- step (int, optional): The current step or iteration number for logging.
-
- Examples:
- >>> metrics = {"mAP": 0.85, "loss": 0.32}
- >>> _log_scalars(metrics, step=100)
- """
- if run:
- for k, v in scalars.items():
- run[k].append(value=v, step=step)
-
-
-def _log_images(imgs_dict: dict, group: str = "") -> None:
- """
- Log images to the NeptuneAI experiment logger.
-
- This function logs image data to Neptune.ai when a valid Neptune run is active. Images are organized
- under the specified group name.
-
- Args:
- imgs_dict (dict): Dictionary of images to log, with keys as image names and values as image data.
- group (str, optional): Group name to organize images under in the Neptune UI.
-
- Examples:
- >>> # Log validation images
- >>> _log_images({"val_batch": img_tensor}, group="validation")
- """
- if run:
- for k, v in imgs_dict.items():
- run[f"{group}/{k}"].upload(File(v))
-
-
-def _log_plot(title: str, plot_path: str) -> None:
- """Log plots to the NeptuneAI experiment logger."""
- import matplotlib.image as mpimg
- import matplotlib.pyplot as plt
-
- img = mpimg.imread(plot_path)
- fig = plt.figure()
- ax = fig.add_axes([0, 0, 1, 1], frameon=False, aspect="auto", xticks=[], yticks=[]) # no ticks
- ax.imshow(img)
- run[f"Plots/{title}"].upload(fig)
-
-
-def on_pretrain_routine_start(trainer) -> None:
- """Initialize NeptuneAI run and log hyperparameters before training starts."""
- try:
- global run
- run = neptune.init_run(
- project=trainer.args.project or "Ultralytics",
- name=trainer.args.name,
- tags=["Ultralytics"],
- )
- run["Configuration/Hyperparameters"] = {k: "" if v is None else v for k, v in vars(trainer.args).items()}
- except Exception as e:
- LOGGER.warning(f"NeptuneAI installed but not initialized correctly, not logging this run. {e}")
-
-
-def on_train_epoch_end(trainer) -> None:
- """Log training metrics and learning rate at the end of each training epoch."""
- _log_scalars(trainer.label_loss_items(trainer.tloss, prefix="train"), trainer.epoch + 1)
- _log_scalars(trainer.lr, trainer.epoch + 1)
- if trainer.epoch == 1:
- _log_images({f.stem: str(f) for f in trainer.save_dir.glob("train_batch*.jpg")}, "Mosaic")
-
-
-def on_fit_epoch_end(trainer) -> None:
- """Log model info and validation metrics at the end of each fit epoch."""
- if run and trainer.epoch == 0:
- from ultralytics.utils.torch_utils import model_info_for_loggers
-
- run["Configuration/Model"] = model_info_for_loggers(trainer)
- _log_scalars(trainer.metrics, trainer.epoch + 1)
-
-
-def on_val_end(validator) -> None:
- """Log validation images at the end of validation."""
- if run:
- # Log val_labels and val_pred
- _log_images({f.stem: str(f) for f in validator.save_dir.glob("val*.jpg")}, "Validation")
-
-
-def on_train_end(trainer) -> None:
- """Log final results, plots, and model weights at the end of training."""
- if run:
- # Log final results, CM matrix + PR plots
- files = [
- "results.png",
- "confusion_matrix.png",
- "confusion_matrix_normalized.png",
- *(f"{x}_curve.png" for x in ("F1", "PR", "P", "R")),
- ]
- files = [(trainer.save_dir / f) for f in files if (trainer.save_dir / f).exists()] # filter
- for f in files:
- _log_plot(title=f.stem, plot_path=f)
- # Log the final model
- run[f"weights/{trainer.args.name or trainer.args.task}/{trainer.best.name}"].upload(File(str(trainer.best)))
-
-
-callbacks = (
- {
- "on_pretrain_routine_start": on_pretrain_routine_start,
- "on_train_epoch_end": on_train_epoch_end,
- "on_fit_epoch_end": on_fit_epoch_end,
- "on_val_end": on_val_end,
- "on_train_end": on_train_end,
- }
- if neptune
- else {}
-)
diff --git a/ultralytics/utils/callbacks/platform.py b/ultralytics/utils/callbacks/platform.py
deleted file mode 100644
index 8e983f3..0000000
--- a/ultralytics/utils/callbacks/platform.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from ultralytics.utils import RANK, SETTINGS
-
-
-def on_pretrain_routine_start(trainer):
- """Initialize and start console logging immediately at the very beginning."""
- if RANK in {-1, 0}:
- from ultralytics.utils.logger import DEFAULT_LOG_PATH, ConsoleLogger, SystemLogger
-
- trainer.system_logger = SystemLogger()
- trainer.console_logger = ConsoleLogger(DEFAULT_LOG_PATH)
- trainer.console_logger.start_capture()
-
-
-def on_pretrain_routine_end(trainer):
- """Handle pre-training routine completion event."""
- pass
-
-
-def on_fit_epoch_end(trainer):
- """Handle end of training epoch event and collect system metrics."""
- if RANK in {-1, 0} and hasattr(trainer, "system_logger"):
- system_metrics = trainer.system_logger.get_metrics()
- print(system_metrics) # for debug
-
-
-def on_model_save(trainer):
- """Handle model checkpoint save event."""
- pass
-
-
-def on_train_end(trainer):
- """Stop console capture and finalize logs."""
- if logger := getattr(trainer, "console_logger", None):
- logger.stop_capture()
-
-
-def on_train_start(trainer):
- """Handle training start event."""
- pass
-
-
-def on_val_start(validator):
- """Handle validation start event."""
- pass
-
-
-def on_predict_start(predictor):
- """Handle prediction start event."""
- pass
-
-
-def on_export_start(exporter):
- """Handle model export start event."""
- pass
-
-
-callbacks = (
- {
- "on_pretrain_routine_start": on_pretrain_routine_start,
- "on_pretrain_routine_end": on_pretrain_routine_end,
- "on_fit_epoch_end": on_fit_epoch_end,
- "on_model_save": on_model_save,
- "on_train_end": on_train_end,
- "on_train_start": on_train_start,
- "on_val_start": on_val_start,
- "on_predict_start": on_predict_start,
- "on_export_start": on_export_start,
- }
- if SETTINGS.get("platform", False) is True # disabled for debugging
- else {}
-)
diff --git a/ultralytics/utils/callbacks/raytune.py b/ultralytics/utils/callbacks/raytune.py
deleted file mode 100644
index 4a75a70..0000000
--- a/ultralytics/utils/callbacks/raytune.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from ultralytics.utils import SETTINGS
-
-try:
- assert SETTINGS["raytune"] is True # verify integration is enabled
- import ray
- from ray import tune
- from ray.air import session
-
-except (ImportError, AssertionError):
- tune = None
-
-
-def on_fit_epoch_end(trainer):
- """
- Report training metrics to Ray Tune at epoch end when a Ray session is active.
-
- Captures metrics from the trainer object and sends them to Ray Tune with the current epoch number,
- enabling hyperparameter optimization. It only executes within an active Ray Tune session.
-
- Args:
- trainer (ultralytics.engine.trainer.BaseTrainer): The Ultralytics trainer object containing metrics and epochs.
-
- Examples:
- >>> # Called automatically by the Ultralytics training loop
- >>> on_fit_epoch_end(trainer)
-
- References:
- Ray Tune docs: https://docs.ray.io/en/latest/tune/index.html
- """
- if ray.train._internal.session.get_session(): # check if Ray Tune session is active
- metrics = trainer.metrics
- session.report({**metrics, **{"epoch": trainer.epoch + 1}})
-
-
-callbacks = (
- {
- "on_fit_epoch_end": on_fit_epoch_end,
- }
- if tune
- else {}
-)
diff --git a/ultralytics/utils/callbacks/tensorboard.py b/ultralytics/utils/callbacks/tensorboard.py
deleted file mode 100644
index 5dbe3e1..0000000
--- a/ultralytics/utils/callbacks/tensorboard.py
+++ /dev/null
@@ -1,131 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from ultralytics.utils import LOGGER, SETTINGS, TESTS_RUNNING, colorstr, torch_utils
-
-try:
- assert not TESTS_RUNNING # do not log pytest
- assert SETTINGS["tensorboard"] is True # verify integration is enabled
- WRITER = None # TensorBoard SummaryWriter instance
- PREFIX = colorstr("TensorBoard: ")
-
- # Imports below only required if TensorBoard enabled
- import warnings
- from copy import deepcopy
-
- import torch
- from torch.utils.tensorboard import SummaryWriter
-
-except (ImportError, AssertionError, TypeError, AttributeError):
- # TypeError for handling 'Descriptors cannot not be created directly.' protobuf errors in Windows
- # AttributeError: module 'tensorflow' has no attribute 'io' if 'tensorflow' not installed
- SummaryWriter = None
-
-
-def _log_scalars(scalars: dict, step: int = 0) -> None:
- """
- Log scalar values to TensorBoard.
-
- Args:
- scalars (dict): Dictionary of scalar values to log to TensorBoard. Keys are scalar names and values are the
- corresponding scalar values.
- step (int): Global step value to record with the scalar values. Used for x-axis in TensorBoard graphs.
-
- Examples:
- Log training metrics
- >>> metrics = {"loss": 0.5, "accuracy": 0.95}
- >>> _log_scalars(metrics, step=100)
- """
- if WRITER:
- for k, v in scalars.items():
- WRITER.add_scalar(k, v, step)
-
-
-def _log_tensorboard_graph(trainer) -> None:
- """
- Log model graph to TensorBoard.
-
- This function attempts to visualize the model architecture in TensorBoard by tracing the model with a dummy input
- tensor. It first tries a simple method suitable for YOLO models, and if that fails, falls back to a more complex
- approach for models like RTDETR that may require special handling.
-
- Args:
- trainer (ultralytics.engine.trainer.BaseTrainer): The trainer object containing the model to visualize.
- Must have attributes model and args with imgsz.
-
- Notes:
- This function requires TensorBoard integration to be enabled and the global WRITER to be initialized.
- It handles potential warnings from the PyTorch JIT tracer and attempts to gracefully handle different
- model architectures.
- """
- # Input image
- imgsz = trainer.args.imgsz
- imgsz = (imgsz, imgsz) if isinstance(imgsz, int) else imgsz
- p = next(trainer.model.parameters()) # for device, type
- im = torch.zeros((1, 3, *imgsz), device=p.device, dtype=p.dtype) # input image (must be zeros, not empty)
-
- with warnings.catch_warnings():
- warnings.simplefilter("ignore", category=UserWarning) # suppress jit trace warning
- warnings.simplefilter("ignore", category=torch.jit.TracerWarning) # suppress jit trace warning
-
- # Try simple method first (YOLO)
- try:
- trainer.model.eval() # place in .eval() mode to avoid BatchNorm statistics changes
- WRITER.add_graph(torch.jit.trace(torch_utils.unwrap_model(trainer.model), im, strict=False), [])
- LOGGER.info(f"{PREFIX}model graph visualization added ✅")
- return
-
- except Exception:
- # Fallback to TorchScript export steps (RTDETR)
- try:
- model = deepcopy(torch_utils.unwrap_model(trainer.model))
- model.eval()
- model = model.fuse(verbose=False)
- for m in model.modules():
- if hasattr(m, "export"): # Detect, RTDETRDecoder (Segment and Pose use Detect base class)
- m.export = True
- m.format = "torchscript"
- model(im) # dry run
- WRITER.add_graph(torch.jit.trace(model, im, strict=False), [])
- LOGGER.info(f"{PREFIX}model graph visualization added ✅")
- except Exception as e:
- LOGGER.warning(f"{PREFIX}TensorBoard graph visualization failure {e}")
-
-
-def on_pretrain_routine_start(trainer) -> None:
- """Initialize TensorBoard logging with SummaryWriter."""
- if SummaryWriter:
- try:
- global WRITER
- WRITER = SummaryWriter(str(trainer.save_dir))
- LOGGER.info(f"{PREFIX}Start with 'tensorboard --logdir {trainer.save_dir}', view at http://localhost:6006/")
- except Exception as e:
- LOGGER.warning(f"{PREFIX}TensorBoard not initialized correctly, not logging this run. {e}")
-
-
-def on_train_start(trainer) -> None:
- """Log TensorBoard graph."""
- if WRITER:
- _log_tensorboard_graph(trainer)
-
-
-def on_train_epoch_end(trainer) -> None:
- """Log scalar statistics at the end of a training epoch."""
- _log_scalars(trainer.label_loss_items(trainer.tloss, prefix="train"), trainer.epoch + 1)
- _log_scalars(trainer.lr, trainer.epoch + 1)
-
-
-def on_fit_epoch_end(trainer) -> None:
- """Log epoch metrics at end of training epoch."""
- _log_scalars(trainer.metrics, trainer.epoch + 1)
-
-
-callbacks = (
- {
- "on_pretrain_routine_start": on_pretrain_routine_start,
- "on_train_start": on_train_start,
- "on_fit_epoch_end": on_fit_epoch_end,
- "on_train_epoch_end": on_train_epoch_end,
- }
- if SummaryWriter
- else {}
-)
diff --git a/ultralytics/utils/callbacks/wb.py b/ultralytics/utils/callbacks/wb.py
deleted file mode 100644
index d97de5d..0000000
--- a/ultralytics/utils/callbacks/wb.py
+++ /dev/null
@@ -1,191 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from ultralytics.utils import SETTINGS, TESTS_RUNNING
-from ultralytics.utils.torch_utils import model_info_for_loggers
-
-try:
- assert not TESTS_RUNNING # do not log pytest
- assert SETTINGS["wandb"] is True # verify integration is enabled
- import wandb as wb
-
- assert hasattr(wb, "__version__") # verify package is not directory
- _processed_plots = {}
-
-except (ImportError, AssertionError):
- wb = None
-
-
-def _custom_table(x, y, classes, title="Precision Recall Curve", x_title="Recall", y_title="Precision"):
- """
- Create and log a custom metric visualization to wandb.plot.pr_curve.
-
- This function crafts a custom metric visualization that mimics the behavior of the default wandb precision-recall
- curve while allowing for enhanced customization. The visual metric is useful for monitoring model performance across
- different classes.
-
- Args:
- x (list): Values for the x-axis; expected to have length N.
- y (list): Corresponding values for the y-axis; also expected to have length N.
- classes (list): Labels identifying the class of each point; length N.
- title (str, optional): Title for the plot.
- x_title (str, optional): Label for the x-axis.
- y_title (str, optional): Label for the y-axis.
-
- Returns:
- (wandb.Object): A wandb object suitable for logging, showcasing the crafted metric visualization.
- """
- import polars as pl # scope for faster 'import ultralytics'
- import polars.selectors as cs
-
- df = pl.DataFrame({"class": classes, "y": y, "x": x}).with_columns(cs.numeric().round(3))
- data = df.select(["class", "y", "x"]).rows()
-
- fields = {"x": "x", "y": "y", "class": "class"}
- string_fields = {"title": title, "x-axis-title": x_title, "y-axis-title": y_title}
- return wb.plot_table(
- "wandb/area-under-curve/v0",
- wb.Table(data=data, columns=["class", "y", "x"]),
- fields=fields,
- string_fields=string_fields,
- )
-
-
-def _plot_curve(
- x,
- y,
- names=None,
- id="precision-recall",
- title="Precision Recall Curve",
- x_title="Recall",
- y_title="Precision",
- num_x=100,
- only_mean=False,
-):
- """
- Log a metric curve visualization.
-
- This function generates a metric curve based on input data and logs the visualization to wandb.
- The curve can represent aggregated data (mean) or individual class data, depending on the 'only_mean' flag.
-
- Args:
- x (np.ndarray): Data points for the x-axis with length N.
- y (np.ndarray): Corresponding data points for the y-axis with shape (C, N), where C is the number of classes.
- names (list, optional): Names of the classes corresponding to the y-axis data; length C.
- id (str, optional): Unique identifier for the logged data in wandb.
- title (str, optional): Title for the visualization plot.
- x_title (str, optional): Label for the x-axis.
- y_title (str, optional): Label for the y-axis.
- num_x (int, optional): Number of interpolated data points for visualization.
- only_mean (bool, optional): Flag to indicate if only the mean curve should be plotted.
-
- Notes:
- The function leverages the '_custom_table' function to generate the actual visualization.
- """
- import numpy as np
-
- # Create new x
- if names is None:
- names = []
- x_new = np.linspace(x[0], x[-1], num_x).round(5)
-
- # Create arrays for logging
- x_log = x_new.tolist()
- y_log = np.interp(x_new, x, np.mean(y, axis=0)).round(3).tolist()
-
- if only_mean:
- table = wb.Table(data=list(zip(x_log, y_log)), columns=[x_title, y_title])
- wb.run.log({title: wb.plot.line(table, x_title, y_title, title=title)})
- else:
- classes = ["mean"] * len(x_log)
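- # the first num_x rows of the table hold the mean curve; each class then appends its own num_x interpolated points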
- for i, yi in enumerate(y):
- x_log.extend(x_new) # add new x
- y_log.extend(np.interp(x_new, x, yi)) # interpolate y to new x
- classes.extend([names[i]] * len(x_new)) # add class names
- wb.log({id: _custom_table(x_log, y_log, classes, title, x_title, y_title)}, commit=False)
-
-
-def _log_plots(plots, step):
- """
- Log plots to WandB at a specific step if they haven't been logged already.
-
- This function checks each plot in the input dictionary against previously processed plots and logs
- new or updated plots to WandB at the specified step.
-
- Args:
- plots (dict): Dictionary of plots to log, where keys are plot names and values are dictionaries
- containing plot metadata including timestamps.
- step (int): The step/epoch at which to log the plots in the WandB run.
-
- Notes:
- The function uses a shallow copy of the plots dictionary to prevent modification during iteration.
- Plots are identified by their stem name (filename without extension).
- Each plot is logged as a WandB Image object.
- """
- for name, params in plots.copy().items(): # shallow copy to prevent plots dict changing during iteration
- timestamp = params["timestamp"]
- if _processed_plots.get(name) != timestamp:
- wb.run.log({name.stem: wb.Image(str(name))}, step=step)
- _processed_plots[name] = timestamp
-
-
-def on_pretrain_routine_start(trainer):
- """Initialize and start wandb project if module is present."""
- if not wb.run:
- wb.init(
- project=str(trainer.args.project).replace("/", "-") if trainer.args.project else "Ultralytics",
- name=str(trainer.args.name).replace("/", "-"),
- config=vars(trainer.args),
- )
-
-
-def on_fit_epoch_end(trainer):
- """Log training metrics and model information at the end of an epoch."""
- wb.run.log(trainer.metrics, step=trainer.epoch + 1)
- _log_plots(trainer.plots, step=trainer.epoch + 1)
- _log_plots(trainer.validator.plots, step=trainer.epoch + 1)
- if trainer.epoch == 0:
- wb.run.log(model_info_for_loggers(trainer), step=trainer.epoch + 1)
-
-
-def on_train_epoch_end(trainer):
- """Log metrics and save images at the end of each training epoch."""
- wb.run.log(trainer.label_loss_items(trainer.tloss, prefix="train"), step=trainer.epoch + 1)
- wb.run.log(trainer.lr, step=trainer.epoch + 1)
- if trainer.epoch == 1:
- _log_plots(trainer.plots, step=trainer.epoch + 1)
-
-
-def on_train_end(trainer):
- """Save the best model as an artifact and log final plots at the end of training."""
- _log_plots(trainer.validator.plots, step=trainer.epoch + 1)
- _log_plots(trainer.plots, step=trainer.epoch + 1)
- art = wb.Artifact(type="model", name=f"run_{wb.run.id}_model")
- if trainer.best.exists():
- art.add_file(trainer.best)
- wb.run.log_artifact(art, aliases=["best"])
- # Check if we actually have plots to save
- if trainer.args.plots and hasattr(trainer.validator.metrics, "curves_results"):
- for curve_name, curve_values in zip(trainer.validator.metrics.curves, trainer.validator.metrics.curves_results):
- x, y, x_title, y_title = curve_values
- _plot_curve(
- x,
- y,
- names=list(trainer.validator.metrics.names.values()),
- id=f"curves/{curve_name}",
- title=curve_name,
- x_title=x_title,
- y_title=y_title,
- )
- wb.run.finish() # required or run continues on dashboard
-
-
-callbacks = (
- {
- "on_pretrain_routine_start": on_pretrain_routine_start,
- "on_train_epoch_end": on_train_epoch_end,
- "on_fit_epoch_end": on_fit_epoch_end,
- "on_train_end": on_train_end,
- }
- if wb
- else {}
-)
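The `if wb else {}` guard above is the import-gating pattern used throughout these callback modules: hooks are only registered when the optional dependency imported cleanly. A simplified sketch of the idea (not the exact guard from the top of this file):

try:
    import wandb as wb  # optional dependency
    assert hasattr(wb, "__version__")  # verify it is the real package, not a local stub
except (ImportError, AssertionError):
    wb = None

def on_train_end(trainer):  # stand-in hook for illustration
    print("training finished")

callbacks = {"on_train_end": on_train_end} if wb else {}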
diff --git a/ultralytics/utils/checks.py b/ultralytics/utils/checks.py
deleted file mode 100644
index d801d14..0000000
--- a/ultralytics/utils/checks.py
+++ /dev/null
@@ -1,964 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from __future__ import annotations
-
-import functools
-import glob
-import inspect
-import math
-import os
-import platform
-import re
-import shutil
-import subprocess
-import time
-from importlib import metadata
-from pathlib import Path
-from types import SimpleNamespace
-
-import cv2
-import numpy as np
-import torch
-
-from ultralytics.utils import (
- ARM64,
- ASSETS,
- AUTOINSTALL,
- GIT,
- IS_COLAB,
- IS_JETSON,
- IS_KAGGLE,
- IS_PIP_PACKAGE,
- LINUX,
- LOGGER,
- MACOS,
- ONLINE,
- PYTHON_VERSION,
- RKNN_CHIPS,
- ROOT,
- TORCH_VERSION,
- TORCHVISION_VERSION,
- USER_CONFIG_DIR,
- WINDOWS,
- Retry,
- ThreadingLocked,
- TryExcept,
- clean_url,
- colorstr,
- downloads,
- is_github_action_running,
- url2file,
-)
-
-
-def parse_requirements(file_path=ROOT.parent / "requirements.txt", package=""):
- """
- Parse a requirements.txt file, ignoring lines that start with '#' and any text after '#'.
-
- Args:
- file_path (Path): Path to the requirements.txt file.
- package (str, optional): Python package to use instead of requirements.txt file.
-
- Returns:
- requirements (list[SimpleNamespace]): List of parsed requirements as SimpleNamespace objects with `name` and
- `specifier` attributes.
-
- Examples:
- >>> from ultralytics.utils.checks import parse_requirements
- >>> parse_requirements(package="ultralytics")
- """
- if package:
- requires = [x for x in metadata.distribution(package).requires if "extra == " not in x]
- else:
- requires = Path(file_path).read_text().splitlines()
-
- requirements = []
- for line in requires:
- line = line.strip()
- if line and not line.startswith("#"):
- line = line.partition("#")[0].strip() # ignore inline comments
- if match := re.match(r"([a-zA-Z0-9-_]+)\s*([<>!=~]+.*)?", line):
- requirements.append(SimpleNamespace(name=match[1], specifier=match[2].strip() if match[2] else ""))
-
- return requirements
-
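The parsing loop above reduces to comment stripping plus one regex; a self-contained sketch on a few hypothetical requirement lines (nothing is read from a real requirements.txt):

import re
from types import SimpleNamespace

lines = ["numpy>=1.23.0", "opencv-python", "torch>=1.8.0  # inline comment", "# full-line comment"]
requirements = []
for line in lines:
    line = line.strip()
    if line and not line.startswith("#"):
        line = line.partition("#")[0].strip()  # drop inline comments
        if match := re.match(r"([a-zA-Z0-9-_]+)\s*([<>!=~]+.*)?", line):
            requirements.append(SimpleNamespace(name=match[1], specifier=match[2].strip() if match[2] else ""))

print([(r.name, r.specifier) for r in requirements])
# [('numpy', '>=1.23.0'), ('opencv-python', ''), ('torch', '>=1.8.0')]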
-
-@functools.lru_cache
-def parse_version(version="0.0.0") -> tuple:
- """
- Convert a version string to a tuple of integers, ignoring any extra non-numeric string attached to the version.
-
- Args:
- version (str): Version string, i.e. '2.0.1+cpu'
-
- Returns:
- (tuple): Tuple of integers representing the numeric part of the version, i.e. (2, 0, 1)
- """
- try:
- return tuple(map(int, re.findall(r"\d+", version)[:3])) # '2.0.1+cpu' -> (2, 0, 1)
- except Exception as e:
- LOGGER.warning(f"failure for parse_version({version}), returning (0, 0, 0): {e}")
- return 0, 0, 0
-
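A quick illustration of the digit extraction above, using only re (the version strings are examples):

import re

for v in ("2.0.1+cpu", "8.3.0", "1.13"):
    print(v, "->", tuple(map(int, re.findall(r"\d+", v)[:3])))
# 2.0.1+cpu -> (2, 0, 1)
# 8.3.0 -> (8, 3, 0)
# 1.13 -> (1, 13)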
-
-def is_ascii(s) -> bool:
- """
- Check if a string is composed of only ASCII characters.
-
- Args:
- s (str | list | tuple | dict): Input to be checked (all are converted to string for checking).
-
- Returns:
- (bool): True if the string is composed only of ASCII characters, False otherwise.
- """
- return all(ord(c) < 128 for c in str(s))
-
-
-def check_imgsz(imgsz, stride=32, min_dim=1, max_dim=2, floor=0):
- """
- Verify image size is a multiple of the given stride in each dimension. If the image size is not a multiple of the
- stride, update it to the nearest multiple of the stride that is greater than or equal to the given floor value.
-
- Args:
- imgsz (int | list[int]): Image size.
- stride (int): Stride value.
- min_dim (int): Minimum number of dimensions.
- max_dim (int): Maximum number of dimensions.
- floor (int): Minimum allowed value for image size.
-
- Returns:
- (list[int] | int): Updated image size.
- """
- # Convert stride to integer if it is a tensor
- stride = int(stride.max() if isinstance(stride, torch.Tensor) else stride)
-
- # Convert image size to list if it is an integer
- if isinstance(imgsz, int):
- imgsz = [imgsz]
- elif isinstance(imgsz, (list, tuple)):
- imgsz = list(imgsz)
- elif isinstance(imgsz, str): # i.e. '640' or '[640,640]'
- imgsz = [int(imgsz)] if imgsz.isnumeric() else eval(imgsz)
- else:
- raise TypeError(
- f"'imgsz={imgsz}' is of invalid type {type(imgsz).__name__}. "
- f"Valid imgsz types are int i.e. 'imgsz=640' or list i.e. 'imgsz=[640,640]'"
- )
-
- # Apply max_dim
- if len(imgsz) > max_dim:
- msg = (
- "'train' and 'val' imgsz must be an integer, while 'predict' and 'export' imgsz may be a [h, w] list "
- "or an integer, i.e. 'yolo export imgsz=640,480' or 'yolo export imgsz=640'"
- )
- if max_dim != 1:
- raise ValueError(f"imgsz={imgsz} is not a valid image size. {msg}")
- LOGGER.warning(f"updating to 'imgsz={max(imgsz)}'. {msg}")
- imgsz = [max(imgsz)]
- # Make image size a multiple of the stride
- sz = [max(math.ceil(x / stride) * stride, floor) for x in imgsz]
-
- # Print warning message if image size was updated
- if sz != imgsz:
- LOGGER.warning(f"imgsz={imgsz} must be multiple of max stride {stride}, updating to {sz}")
-
- # Add missing dimensions if necessary
- sz = [sz[0], sz[0]] if min_dim == 2 and len(sz) == 1 else sz[0] if min_dim == 1 and len(sz) == 1 else sz
-
- return sz
-
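The core of the check is the stride rounding near the end above: each dimension is bumped up to the nearest multiple of the stride, never below the floor. A tiny sketch with illustrative values:

import math

def round_to_stride(imgsz, stride=32, floor=0):
    return [max(math.ceil(x / stride) * stride, floor) for x in imgsz]

print(round_to_stride([630, 480]))        # [640, 480]
print(round_to_stride([100], stride=64))  # [128]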
-
-@functools.lru_cache
-def check_uv():
- """Check if uv package manager is installed and can run successfully."""
- try:
- return subprocess.run(["uv", "-V"], capture_output=True).returncode == 0
- except FileNotFoundError:
- return False
-
-
-@functools.lru_cache
-def check_version(
- current: str = "0.0.0",
- required: str = "0.0.0",
- name: str = "version",
- hard: bool = False,
- verbose: bool = False,
- msg: str = "",
-) -> bool:
- """
- Check current version against the required version or range.
-
- Args:
- current (str): Current version or package name to get version from.
- required (str): Required version or range (in pip-style format).
- name (str): Name to be used in warning message.
- hard (bool): If True, raise an AssertionError if the requirement is not met.
- verbose (bool): If True, print warning message if requirement is not met.
- msg (str): Extra message to display if verbose.
-
- Returns:
- (bool): True if requirement is met, False otherwise.
-
- Examples:
- Check if current version is exactly 22.04
- >>> check_version(current="22.04", required="==22.04")
-
- Check if current version is greater than or equal to 22.04
- >>> check_version(current="22.10", required="22.04") # assumes '>=' inequality if none passed
-
- Check if current version is less than or equal to 22.04
- >>> check_version(current="22.04", required="<=22.04")
-
- Check if current version is between 20.04 (inclusive) and 22.04 (exclusive)
- >>> check_version(current="21.10", required=">20.04,<22.04")
- """
- if not current: # if current is '' or None
- LOGGER.warning(f"invalid check_version({current}, {required}) requested, please check values.")
- return True
- elif not current[0].isdigit(): # current is package name rather than version string, i.e. current='ultralytics'
- try:
- name = current # assigned package name to 'name' arg
- current = metadata.version(current) # get version string from package name
- except metadata.PackageNotFoundError as e:
- if hard:
- raise ModuleNotFoundError(f"{current} package is required but not installed") from e
- else:
- return False
-
- if not required: # if required is '' or None
- return True
-
- if "sys_platform" in required and ( # i.e. required='<2.4.0,>=1.8.0; sys_platform == "win32"'
- (WINDOWS and "win32" not in required)
- or (LINUX and "linux" not in required)
- or (MACOS and "macos" not in required and "darwin" not in required)
- ):
- return True
-
- op = ""
- version = ""
- result = True
- c = parse_version(current) # '1.2.3' -> (1, 2, 3)
- for r in required.strip(",").split(","):
- op, version = re.match(r"([^0-9]*)([\d.]+)", r).groups() # split '>=22.04' -> ('>=', '22.04')
- if not op:
- op = ">=" # assume >= if no op passed
- v = parse_version(version) # '1.2.3' -> (1, 2, 3)
- if op == "==" and c != v:
- result = False
- elif op == "!=" and c == v:
- result = False
- elif op == ">=" and not (c >= v):
- result = False
- elif op == "<=" and not (c <= v):
- result = False
- elif op == ">" and not (c > v):
- result = False
- elif op == "<" and not (c < v):
- result = False
- if not result:
- warning = f"{name}{required} is required, but {name}=={current} is currently installed {msg}"
- if hard:
- raise ModuleNotFoundError(warning) # assert version requirements met
- if verbose:
- LOGGER.warning(warning)
- return result
-
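A minimal sketch of how a pip-style constraint such as '>=1.8.0,<2.4.0' is split into (op, version) pairs and checked against the current version tuple (the versions below are made up):

import re

def parse_version(version):
    return tuple(map(int, re.findall(r"\d+", version)[:3]))

current, required = "2.1.0", ">=1.8.0,<2.4.0"
c = parse_version(current)
ok = True
for r in required.strip(",").split(","):
    op, version = re.match(r"([^0-9]*)([\d.]+)", r).groups()
    op = op or ">="  # assume >= when no operator is given
    v = parse_version(version)
    ok &= {"==": c == v, "!=": c != v, ">=": c >= v, "<=": c <= v, ">": c > v, "<": c < v}[op]
print(ok)  # True: 2.1.0 satisfies >=1.8.0,<2.4.0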
-
-def check_latest_pypi_version(package_name="ultralytics"):
- """
- Return the latest version of a PyPI package without downloading or installing it.
-
- Args:
- package_name (str): The name of the package to find the latest version for.
-
- Returns:
- (str): The latest version of the package.
- """
- import requests # scoped as slow import
-
- try:
- requests.packages.urllib3.disable_warnings() # Disable the InsecureRequestWarning
- response = requests.get(f"https://pypi.org/pypi/{package_name}/json", timeout=3)
- if response.status_code == 200:
- return response.json()["info"]["version"]
- except Exception:
- return None
-
-
-def check_pip_update_available():
- """
- Check if a new version of the ultralytics package is available on PyPI.
-
- Returns:
- (bool): True if an update is available, False otherwise.
- """
- if ONLINE and IS_PIP_PACKAGE:
- try:
- from ultralytics import __version__
-
- latest = check_latest_pypi_version()
- if check_version(__version__, f"<{latest}"): # check if current version is < latest version
- LOGGER.info(
- f"New https://pypi.org/project/ultralytics/{latest} available 😃 "
- f"Update with 'pip install -U ultralytics'"
- )
- return True
- except Exception:
- pass
- return False
-
-
-@ThreadingLocked()
-@functools.lru_cache
-def check_font(font="Arial.ttf"):
- """
- Find font locally or download to user's configuration directory if it does not already exist.
-
- Args:
- font (str): Path or name of font.
-
- Returns:
- (Path): Resolved font file path.
- """
- from matplotlib import font_manager # scope for faster 'import ultralytics'
-
- # Check USER_CONFIG_DIR
- name = Path(font).name
- file = USER_CONFIG_DIR / name
- if file.exists():
- return file
-
- # Check system fonts
- matches = [s for s in font_manager.findSystemFonts() if font in s]
- if any(matches):
- return matches[0]
-
- # Download to USER_CONFIG_DIR if missing
- url = f"https://github.com/ultralytics/assets/releases/download/v0.0.0/{name}"
- if downloads.is_url(url, check=True):
- downloads.safe_download(url=url, file=file)
- return file
-
-
-def check_python(minimum: str = "3.8.0", hard: bool = True, verbose: bool = False) -> bool:
- """
- Check current python version against the required minimum version.
-
- Args:
- minimum (str): Required minimum version of python.
- hard (bool): If True, raise an AssertionError if the requirement is not met.
- verbose (bool): If True, print warning message if requirement is not met.
-
- Returns:
- (bool): Whether the installed Python version meets the minimum constraints.
- """
- return check_version(PYTHON_VERSION, minimum, name="Python", hard=hard, verbose=verbose)
-
-
-@TryExcept()
-def check_requirements(requirements=ROOT.parent / "requirements.txt", exclude=(), install=True, cmds=""):
- """
- Check if installed dependencies meet Ultralytics YOLO models requirements and attempt to auto-update if needed.
-
- Args:
- requirements (Path | str | list[str] | tuple[str]): Path to a requirements.txt file, a single package
- requirement as a string, or a list of package requirements as strings.
- exclude (tuple): Tuple of package names to exclude from checking.
- install (bool): If True, attempt to auto-update packages that don't meet requirements.
- cmds (str): Additional commands to pass to the pip install command when auto-updating.
-
- Examples:
- >>> from ultralytics.utils.checks import check_requirements
-
- Check a requirements.txt file
- >>> check_requirements("path/to/requirements.txt")
-
- Check a single package
- >>> check_requirements("ultralytics>=8.0.0")
-
- Check multiple packages
- >>> check_requirements(["numpy", "ultralytics>=8.0.0"])
- """
- prefix = colorstr("red", "bold", "requirements:")
- if isinstance(requirements, Path): # requirements.txt file
- file = requirements.resolve()
- assert file.exists(), f"{prefix} {file} not found, check failed."
- requirements = [f"{x.name}{x.specifier}" for x in parse_requirements(file) if x.name not in exclude]
- elif isinstance(requirements, str):
- requirements = [requirements]
-
- pkgs = []
- for r in requirements:
- r_stripped = r.rpartition("/")[-1].replace(".git", "") # replace git+https://org/repo.git -> 'repo'
- match = re.match(r"([a-zA-Z0-9-_]+)([<>!=~]+.*)?", r_stripped)
- name, required = match[1], match[2].strip() if match[2] else ""
- try:
- assert check_version(metadata.version(name), required) # exception if requirements not met
- except (AssertionError, metadata.PackageNotFoundError):
- pkgs.append(r)
-
- @Retry(times=2, delay=1)
- def attempt_install(packages, commands, use_uv):
- """Attempt package installation with uv if available, falling back to pip."""
- if use_uv:
- base = (
- f"uv pip install --no-cache-dir {packages} {commands} "
- f"--index-strategy=unsafe-best-match --break-system-packages --prerelease=allow"
- )
- try:
- return subprocess.check_output(base, shell=True, stderr=subprocess.PIPE, text=True)
- except subprocess.CalledProcessError as e:
- if e.stderr and "No virtual environment found" in e.stderr:
- return subprocess.check_output(
- base.replace("uv pip install", "uv pip install --system"),
- shell=True,
- stderr=subprocess.PIPE,
- text=True,
- )
- raise
- return subprocess.check_output(f"pip install --no-cache-dir {packages} {commands}", shell=True, text=True)
-
- s = " ".join(f'"{x}"' for x in pkgs) # console string
- if s:
- if install and AUTOINSTALL: # check environment variable
- # Note uv fails on arm64 macOS and Raspberry Pi runners
- n = len(pkgs) # number of packages updates
- LOGGER.info(f"{prefix} Ultralytics requirement{'s' * (n > 1)} {pkgs} not found, attempting AutoUpdate...")
- try:
- t = time.time()
- assert ONLINE, "AutoUpdate skipped (offline)"
- LOGGER.info(attempt_install(s, cmds, use_uv=not ARM64 and check_uv()))
- dt = time.time() - t
- LOGGER.info(f"{prefix} AutoUpdate success ✅ {dt:.1f}s")
- LOGGER.warning(
- f"{prefix} {colorstr('bold', 'Restart runtime or rerun command for updates to take effect')}\n"
- )
- except Exception as e:
- LOGGER.warning(f"{prefix} ❌ {e}")
- return False
- else:
- return False
-
- return True
-
-
-def check_torchvision():
- """
- Check the installed versions of PyTorch and Torchvision to ensure they're compatible.
-
- This function checks the installed versions of PyTorch and Torchvision, and warns if they're incompatible according
- to the compatibility table based on: https://github.com/pytorch/vision#installation.
- """
- compatibility_table = {
- "2.9": ["0.24"],
- "2.8": ["0.23"],
- "2.7": ["0.22"],
- "2.6": ["0.21"],
- "2.5": ["0.20"],
- "2.4": ["0.19"],
- "2.3": ["0.18"],
- "2.2": ["0.17"],
- "2.1": ["0.16"],
- "2.0": ["0.15"],
- "1.13": ["0.14"],
- "1.12": ["0.13"],
- }
-
- # Check major and minor versions
- v_torch = ".".join(TORCH_VERSION.split("+", 1)[0].split(".")[:2])
- if v_torch in compatibility_table:
- compatible_versions = compatibility_table[v_torch]
- v_torchvision = ".".join(TORCHVISION_VERSION.split("+", 1)[0].split(".")[:2])
- if all(v_torchvision != v for v in compatible_versions):
- LOGGER.warning(
- f"torchvision=={v_torchvision} is incompatible with torch=={v_torch}.\n"
- f"Run 'pip install torchvision=={compatible_versions[0]}' to fix torchvision or "
- "'pip install -U torch torchvision' to update both.\n"
- "For a full compatibility table see https://github.com/pytorch/vision#installation"
- )
-
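The lookup key is just the major.minor prefix of each version string, with any local build tag stripped; an illustration with example strings (not the installed versions):

for v in ("2.1.0+cu118", "0.16.2", "2.4.0"):
    print(v, "->", ".".join(v.split("+", 1)[0].split(".")[:2]))
# 2.1.0+cu118 -> 2.1
# 0.16.2 -> 0.16
# 2.4.0 -> 2.4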
-
-def check_suffix(file="yolo11n.pt", suffix=".pt", msg=""):
- """
- Check file(s) for acceptable suffix.
-
- Args:
- file (str | list[str]): File or list of files to check.
- suffix (str | tuple): Acceptable suffix or tuple of suffixes.
- msg (str): Additional message to display in case of error.
- """
- if file and suffix:
- if isinstance(suffix, str):
- suffix = {suffix}
- for f in file if isinstance(file, (list, tuple)) else [file]:
- if s := str(f).rpartition(".")[-1].lower().strip(): # file suffix
- assert f".{s}" in suffix, f"{msg}{f} acceptable suffix is {suffix}, not .{s}"
-
-
-def check_yolov5u_filename(file: str, verbose: bool = True):
- """
- Replace legacy YOLOv5 filenames with updated YOLOv5u filenames.
-
- Args:
- file (str): Filename to check and potentially update.
- verbose (bool): Whether to print information about the replacement.
-
- Returns:
- (str): Updated filename.
- """
- if "yolov3" in file or "yolov5" in file:
- if "u.yaml" in file:
- file = file.replace("u.yaml", ".yaml") # i.e. yolov5nu.yaml -> yolov5n.yaml
- elif ".pt" in file and "u" not in file:
- original_file = file
- file = re.sub(r"(.*yolov5([nsmlx]))\.pt", "\\1u.pt", file) # i.e. yolov5n.pt -> yolov5nu.pt
- file = re.sub(r"(.*yolov5([nsmlx])6)\.pt", "\\1u.pt", file) # i.e. yolov5n6.pt -> yolov5n6u.pt
- file = re.sub(r"(.*yolov3(|-tiny|-spp))\.pt", "\\1u.pt", file) # i.e. yolov3-spp.pt -> yolov3-sppu.pt
- if file != original_file and verbose:
- LOGGER.info(
- f"PRO TIP 💡 Replace 'model={original_file}' with new 'model={file}'.\nYOLOv5 'u' models are "
- f"trained with https://github.com/ultralytics/ultralytics and feature improved performance vs "
- f"standard YOLOv5 models trained with https://github.com/ultralytics/yolov5.\n"
- )
- return file
-
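A quick check of the three substitutions above on hypothetical legacy filenames:

import re

for name in ("yolov5n.pt", "yolov5s6.pt", "yolov3-spp.pt", "yolov5nu.pt"):
    out = re.sub(r"(.*yolov5([nsmlx]))\.pt", "\\1u.pt", name)
    out = re.sub(r"(.*yolov5([nsmlx])6)\.pt", "\\1u.pt", out)
    out = re.sub(r"(.*yolov3(|-tiny|-spp))\.pt", "\\1u.pt", out)
    print(name, "->", out)
# yolov5n.pt -> yolov5nu.pt, yolov5s6.pt -> yolov5s6u.pt,
# yolov3-spp.pt -> yolov3-sppu.pt, yolov5nu.pt -> yolov5nu.pt (already updated, unchanged)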
-
-def check_model_file_from_stem(model="yolo11n"):
- """
- Return a model filename from a valid model stem.
-
- Args:
- model (str): Model stem to check.
-
- Returns:
- (str | Path): Model filename with appropriate suffix.
- """
- path = Path(model)
- if not path.suffix and path.stem in downloads.GITHUB_ASSETS_STEMS:
- return path.with_suffix(".pt") # add suffix, i.e. yolo11n -> yolo11n.pt
- return model
-
-
-def check_file(file, suffix="", download=True, download_dir=".", hard=True):
- """
- Search/download file (if necessary), check suffix (if provided), and return path.
-
- Args:
- file (str): File name or path.
- suffix (str | tuple): Acceptable suffix or tuple of suffixes to validate against the file.
- download (bool): Whether to download the file if it doesn't exist locally.
- download_dir (str): Directory to download the file to.
- hard (bool): Whether to raise an error if the file is not found.
-
- Returns:
- (str): Path to the file.
- """
- check_suffix(file, suffix) # optional
- file = str(file).strip() # convert to string and strip spaces
- file = check_yolov5u_filename(file) # yolov5n -> yolov5nu
- if (
- not file
- or ("://" not in file and Path(file).exists()) # '://' check required in Windows Python<3.10
- or file.lower().startswith("grpc://")
- ): # file exists or gRPC Triton images
- return file
- elif download and file.lower().startswith(("https://", "http://", "rtsp://", "rtmp://", "tcp://")): # download
- url = file # warning: Pathlib turns :// -> :/
- file = Path(download_dir) / url2file(file) # '%2F' to '/', split https://url.com/file.txt?auth
- if file.exists():
- LOGGER.info(f"Found {clean_url(url)} locally at {file}") # file already exists
- else:
- downloads.safe_download(url=url, file=file, unzip=False)
- return str(file)
- else: # search
- files = glob.glob(str(ROOT / "**" / file), recursive=True) or glob.glob(str(ROOT.parent / file)) # find file
- if not files and hard:
- raise FileNotFoundError(f"'{file}' does not exist")
- elif len(files) > 1 and hard:
- raise FileNotFoundError(f"Multiple files match '{file}', specify exact path: {files}")
- return files[0] if len(files) else [] # return file
-
-
-def check_yaml(file, suffix=(".yaml", ".yml"), hard=True):
- """
- Search/download YAML file (if necessary) and return path, checking suffix.
-
- Args:
- file (str | Path): File name or path.
- suffix (tuple): Tuple of acceptable YAML file suffixes.
- hard (bool): Whether to raise an error if the file is not found or multiple files are found.
-
- Returns:
- (str): Path to the YAML file.
- """
- return check_file(file, suffix, hard=hard)
-
-
-def check_is_path_safe(basedir, path):
- """
- Check if the resolved path is under the intended directory to prevent path traversal.
-
- Args:
- basedir (Path | str): The intended directory.
- path (Path | str): The path to check.
-
- Returns:
- (bool): True if the path is safe, False otherwise.
- """
- base_dir_resolved = Path(basedir).resolve()
- path_resolved = Path(path).resolve()
-
- return path_resolved.exists() and path_resolved.parts[: len(base_dir_resolved.parts)] == base_dir_resolved.parts
-
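A small sketch of the prefix comparison above with hypothetical paths; the temporary directory exists only so the resolved paths can be checked for existence:

import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    base = Path(tmp) / "datasets"
    (base / "coco8").mkdir(parents=True)  # a directory inside the base
    Path(tmp, "secrets").mkdir()          # a sibling outside the base

    def is_path_safe(basedir, path):
        b, p = Path(basedir).resolve(), Path(path).resolve()
        return p.exists() and p.parts[: len(b.parts)] == b.parts

    print(is_path_safe(base, base / "coco8"))           # True
    print(is_path_safe(base, base / ".." / "secrets"))  # False: resolves outside the base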
-
-@functools.lru_cache
-def check_imshow(warn=False):
- """
- Check if environment supports image displays.
-
- Args:
- warn (bool): Whether to warn if environment doesn't support image displays.
-
- Returns:
- (bool): True if environment supports image displays, False otherwise.
- """
- try:
- if LINUX:
- assert not IS_COLAB and not IS_KAGGLE
- assert "DISPLAY" in os.environ, "The DISPLAY environment variable isn't set."
- cv2.imshow("test", np.zeros((8, 8, 3), dtype=np.uint8)) # show a small 8x8 test image
- cv2.waitKey(1)
- cv2.destroyAllWindows()
- cv2.waitKey(1)
- return True
- except Exception as e:
- if warn:
- LOGGER.warning(f"Environment does not support cv2.imshow() or PIL Image.show()\n{e}")
- return False
-
-
-def check_yolo(verbose=True, device=""):
- """
- Return a human-readable YOLO software and hardware summary.
-
- Args:
- verbose (bool): Whether to print verbose information.
- device (str | torch.device): Device to use for YOLO.
- """
- import psutil # scoped as slow import
-
- from ultralytics.utils.torch_utils import select_device
-
- if IS_COLAB:
- shutil.rmtree("sample_data", ignore_errors=True) # remove colab /sample_data directory
-
- if verbose:
- # System info
- gib = 1 << 30 # bytes per GiB
- ram = psutil.virtual_memory().total
- total, used, free = shutil.disk_usage("/")
- s = f"({os.cpu_count()} CPUs, {ram / gib:.1f} GB RAM, {(total - free) / gib:.1f}/{total / gib:.1f} GB disk)"
- try:
- from IPython import display
-
- display.clear_output() # clear display if notebook
- except ImportError:
- pass
- else:
- s = ""
-
- if GIT.is_repo:
- check_multiple_install() # check conflicting installation if using local clone
-
- select_device(device=device, newline=False)
- LOGGER.info(f"Setup complete ✅ {s}")
-
-
-def collect_system_info():
- """
- Collect and print relevant system information including OS, Python, RAM, CPU, and CUDA.
-
- Returns:
- (dict): Dictionary containing system information.
- """
- import psutil # scoped as slow import
-
- from ultralytics.utils import ENVIRONMENT # scope to avoid circular import
- from ultralytics.utils.torch_utils import get_cpu_info, get_gpu_info
-
- gib = 1 << 30 # bytes per GiB
- cuda = torch.cuda.is_available()
- check_yolo()
- total, used, free = shutil.disk_usage("/")
-
- info_dict = {
- "OS": platform.platform(),
- "Environment": ENVIRONMENT,
- "Python": PYTHON_VERSION,
- "Install": "git" if GIT.is_repo else "pip" if IS_PIP_PACKAGE else "other",
- "Path": str(ROOT),
- "RAM": f"{psutil.virtual_memory().total / gib:.2f} GB",
- "Disk": f"{(total - free) / gib:.1f}/{total / gib:.1f} GB",
- "CPU": get_cpu_info(),
- "CPU count": os.cpu_count(),
- "GPU": get_gpu_info(index=0) if cuda else None,
- "GPU count": torch.cuda.device_count() if cuda else None,
- "CUDA": torch.version.cuda if cuda else None,
- }
- LOGGER.info("\n" + "\n".join(f"{k:<23}{v}" for k, v in info_dict.items()) + "\n")
-
- package_info = {}
- for r in parse_requirements(package="ultralytics"):
- try:
- current = metadata.version(r.name)
- is_met = "✅ " if check_version(current, str(r.specifier), name=r.name, hard=True) else "❌ "
- except metadata.PackageNotFoundError:
- current = "(not installed)"
- is_met = "❌ "
- package_info[r.name] = f"{is_met}{current}{r.specifier}"
- LOGGER.info(f"{r.name:<23}{package_info[r.name]}")
-
- info_dict["Package Info"] = package_info
-
- if is_github_action_running():
- github_info = {
- "RUNNER_OS": os.getenv("RUNNER_OS"),
- "GITHUB_EVENT_NAME": os.getenv("GITHUB_EVENT_NAME"),
- "GITHUB_WORKFLOW": os.getenv("GITHUB_WORKFLOW"),
- "GITHUB_ACTOR": os.getenv("GITHUB_ACTOR"),
- "GITHUB_REPOSITORY": os.getenv("GITHUB_REPOSITORY"),
- "GITHUB_REPOSITORY_OWNER": os.getenv("GITHUB_REPOSITORY_OWNER"),
- }
- LOGGER.info("\n" + "\n".join(f"{k}: {v}" for k, v in github_info.items()))
- info_dict["GitHub Info"] = github_info
-
- return info_dict
-
-
-def check_amp(model):
- """
- Check the PyTorch Automatic Mixed Precision (AMP) functionality of a YOLO model.
-
- If the checks fail, it means there are anomalies with AMP on the system that may cause NaN losses or zero-mAP
- results, so AMP will be disabled during training.
-
- Args:
- model (torch.nn.Module): A YOLO model instance.
-
- Returns:
- (bool): Returns True if the AMP functionality works correctly with YOLO11 model, else False.
-
- Examples:
- >>> from ultralytics import YOLO
- >>> from ultralytics.utils.checks import check_amp
- >>> model = YOLO("yolo11n.pt").model.cuda()
- >>> check_amp(model)
- """
- from ultralytics.utils.torch_utils import autocast
-
- device = next(model.parameters()).device # get model device
- prefix = colorstr("AMP: ")
- if device.type in {"cpu", "mps"}:
- return False # AMP only used on CUDA devices
- else:
- # GPUs that have issues with AMP
- pattern = re.compile(
- r"(nvidia|geforce|quadro|tesla).*?(1660|1650|1630|t400|t550|t600|t1000|t1200|t2000|k40m)", re.IGNORECASE
- )
-
- gpu = torch.cuda.get_device_name(device)
- if bool(pattern.search(gpu)):
- LOGGER.warning(
- f"{prefix}checks failed ❌. AMP training on {gpu} GPU may cause "
- f"NaN losses or zero-mAP results, so AMP will be disabled during training."
- )
- return False
-
- def amp_allclose(m, im):
- """All close FP32 vs AMP results."""
- batch = [im] * 8
- imgsz = max(256, int(model.stride.max() * 4)) # max stride P5-32 and P6-64
- a = m(batch, imgsz=imgsz, device=device, verbose=False)[0].boxes.data # FP32 inference
- with autocast(enabled=True):
- b = m(batch, imgsz=imgsz, device=device, verbose=False)[0].boxes.data # AMP inference
- del m
- return a.shape == b.shape and torch.allclose(a, b.float(), atol=0.5) # close to 0.5 absolute tolerance
-
- im = ASSETS / "bus.jpg" # image to check
- LOGGER.info(f"{prefix}running Automatic Mixed Precision (AMP) checks...")
- warning_msg = "Setting 'amp=True'. If you experience zero-mAP or NaN losses you can disable AMP with amp=False."
- try:
- from ultralytics import YOLO
-
- assert amp_allclose(YOLO("yolo11n.pt"), im)
- LOGGER.info(f"{prefix}checks passed ✅")
- except ConnectionError:
- LOGGER.warning(f"{prefix}checks skipped. Offline and unable to download YOLO11n for AMP checks. {warning_msg}")
- except (AttributeError, ModuleNotFoundError):
- LOGGER.warning(
- f"{prefix}checks skipped. "
- f"Unable to load YOLO11n for AMP checks due to possible Ultralytics package modifications. {warning_msg}"
- )
- except AssertionError:
- LOGGER.error(
- f"{prefix}checks failed. Anomalies were detected with AMP on your system that may lead to "
- f"NaN losses or zero-mAP results, so AMP will be disabled during training."
- )
- return False
- return True
-
-
-def check_multiple_install():
- """Check if there are multiple Ultralytics installations."""
- import sys
-
- try:
- result = subprocess.run([sys.executable, "-m", "pip", "show", "ultralytics"], capture_output=True, text=True)
- install_msg = (
- f"Install your local copy in editable mode with 'pip install -e {ROOT.parent}' to avoid "
- "issues. See https://docs.ultralytics.com/quickstart/"
- )
- if result.returncode != 0:
- if "not found" in result.stderr.lower(): # Package not pip-installed but locally imported
- LOGGER.warning(f"Ultralytics not found via pip but importing from: {ROOT}. {install_msg}")
- return
- yolo_path = (Path(re.findall(r"location:\s+(.+)", result.stdout, flags=re.I)[-1]) / "ultralytics").resolve()
- if not yolo_path.samefile(ROOT.resolve()):
- LOGGER.warning(
- f"Multiple Ultralytics installations detected. The `yolo` command uses: {yolo_path}, "
- f"but current session imports from: {ROOT}. This may cause version conflicts. {install_msg}"
- )
- except Exception:
- return
-
-
-def print_args(args: dict | None = None, show_file=True, show_func=False):
- """
- Print function arguments (optional args dict).
-
- Args:
- args (dict, optional): Arguments to print.
- show_file (bool): Whether to show the file name.
- show_func (bool): Whether to show the function name.
- """
-
- def strip_auth(v):
- """Clean longer Ultralytics HUB URLs by stripping potential authentication information."""
- return clean_url(v) if (isinstance(v, str) and v.startswith("http") and len(v) > 100) else v
-
- x = inspect.currentframe().f_back # previous frame
- file, _, func, _, _ = inspect.getframeinfo(x)
- if args is None: # get args automatically
- args, _, _, frm = inspect.getargvalues(x)
- args = {k: v for k, v in frm.items() if k in args}
- try:
- file = Path(file).resolve().relative_to(ROOT).with_suffix("")
- except ValueError:
- file = Path(file).stem
- s = (f"{file}: " if show_file else "") + (f"{func}: " if show_func else "")
- LOGGER.info(colorstr(s) + ", ".join(f"{k}={strip_auth(v)}" for k, v in sorted(args.items())))
-
-
-def cuda_device_count() -> int:
- """
- Get the number of NVIDIA GPUs available in the environment.
-
- Returns:
- (int): The number of NVIDIA GPUs available.
- """
- if IS_JETSON:
- # NVIDIA Jetson does not fully support nvidia-smi, so count devices with PyTorch instead
- return torch.cuda.device_count()
- else:
- try:
- # Run the nvidia-smi command and capture its output
- output = subprocess.check_output(
- ["nvidia-smi", "--query-gpu=count", "--format=csv,noheader,nounits"], encoding="utf-8"
- )
-
- # Take the first line and strip any leading/trailing white space
- first_line = output.strip().split("\n", 1)[0]
-
- return int(first_line)
- except (subprocess.CalledProcessError, FileNotFoundError, ValueError):
- # If the command fails, nvidia-smi is not found, or output is not an integer, assume no GPUs are available
- return 0
-
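The parsing step is easiest to see with a hard-coded sample rather than a live nvidia-smi call (so this runs on machines without NVIDIA GPUs); the count line is typically repeated once per GPU, hence only the first line is used:

sample_output = "2\n2\n"  # hypothetical nvidia-smi --query-gpu=count output on a 2-GPU machine
first_line = sample_output.strip().split("\n", 1)[0]
print(int(first_line))    # 2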
-
-def cuda_is_available() -> bool:
- """
- Check if CUDA is available in the environment.
-
- Returns:
- (bool): True if one or more NVIDIA GPUs are available, False otherwise.
- """
- return cuda_device_count() > 0
-
-
-def is_rockchip():
- """
- Check if the current environment is running on a Rockchip SoC.
-
- Returns:
- (bool): True if running on a Rockchip SoC, False otherwise.
- """
- if LINUX and ARM64:
- try:
- with open("/proc/device-tree/compatible") as f:
- dev_str = f.read()
- *_, soc = dev_str.split(",")
- if soc.replace("\x00", "") in RKNN_CHIPS:
- return True
- except OSError:
- return False
- else:
- return False
-
-
-def is_intel():
- """
- Check if the system has Intel hardware (CPU or GPU).
-
- Returns:
- (bool): True if Intel hardware is detected, False otherwise.
- """
- from ultralytics.utils.torch_utils import get_cpu_info
-
- # Check CPU
- if "intel" in get_cpu_info().lower():
- return True
-
- # Check GPU via xpu-smi
- try:
- result = subprocess.run(["xpu-smi", "discovery"], capture_output=True, text=True, timeout=5)
- return "intel" in result.stdout.lower()
- except Exception: # broad clause, as xpu-smi may be missing or fail on non-Intel systems
- return False
-
-
-def is_sudo_available() -> bool:
- """
- Check if the sudo command is available in the environment.
-
- Returns:
- (bool): True if the sudo command is available, False otherwise.
- """
- if WINDOWS:
- return False
- cmd = "sudo --version"
- return subprocess.run(cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL).returncode == 0
-
-
-# Run checks and define constants
-check_python("3.8", hard=False, verbose=True) # check python version
-check_torchvision() # check torch-torchvision compatibility
-
-# Define constants
-IS_PYTHON_3_8 = PYTHON_VERSION.startswith("3.8")
-IS_PYTHON_3_12 = PYTHON_VERSION.startswith("3.12")
-IS_PYTHON_3_13 = PYTHON_VERSION.startswith("3.13")
-
-IS_PYTHON_MINIMUM_3_10 = check_python("3.10", hard=False)
-IS_PYTHON_MINIMUM_3_12 = check_python("3.12", hard=False)
diff --git a/ultralytics/utils/cpu.py b/ultralytics/utils/cpu.py
deleted file mode 100644
index 0915df8..0000000
--- a/ultralytics/utils/cpu.py
+++ /dev/null
@@ -1,90 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from __future__ import annotations
-
-import platform
-import re
-import subprocess
-import sys
-from pathlib import Path
-
-
-class CPUInfo:
- """
- Provide cross-platform CPU brand and model information.
-
- Query platform-specific sources to retrieve a human-readable CPU descriptor and normalize it for consistent
- presentation across macOS, Linux, and Windows. If platform-specific probing fails, generic platform identifiers are
- used to ensure a stable string is always returned.
-
- Methods:
- name: Return the normalized CPU name using platform-specific sources with robust fallbacks.
- _clean: Normalize and prettify common vendor brand strings and frequency patterns.
- __str__: Return the normalized CPU name for string contexts.
-
- Examples:
- >>> CPUInfo.name()
- 'Apple M4 Pro'
- >>> str(CPUInfo())
- 'Intel Core i7-9750H 2.60GHz'
- """
-
- @staticmethod
- def name() -> str:
- """Return a normalized CPU model string from platform-specific sources."""
- try:
- if sys.platform == "darwin":
- # Query macOS sysctl for the CPU brand string
- s = subprocess.run(
- ["sysctl", "-n", "machdep.cpu.brand_string"], capture_output=True, text=True
- ).stdout.strip()
- if s:
- return CPUInfo._clean(s)
- elif sys.platform.startswith("linux"):
- # Parse /proc/cpuinfo for the first "model name" entry
- p = Path("/proc/cpuinfo")
- if p.exists():
- for line in p.read_text(errors="ignore").splitlines():
- if "model name" in line:
- return CPUInfo._clean(line.split(":", 1)[1])
- elif sys.platform.startswith("win"):
- try:
- import winreg as wr
-
- with wr.OpenKey(wr.HKEY_LOCAL_MACHINE, r"HARDWARE\DESCRIPTION\System\CentralProcessor\0") as k:
- val, _ = wr.QueryValueEx(k, "ProcessorNameString")
- if val:
- return CPUInfo._clean(val)
- except Exception:
- # Fall through to generic platform fallbacks on Windows registry access failure
- pass
- # Generic platform fallbacks
- s = platform.processor() or getattr(platform.uname(), "processor", "") or platform.machine()
- return CPUInfo._clean(s or "Unknown CPU")
- except Exception:
- # Ensure a string is always returned even on unexpected failures
- s = platform.processor() or platform.machine() or ""
- return CPUInfo._clean(s or "Unknown CPU")
-
- @staticmethod
- def _clean(s: str) -> str:
- """Normalize and prettify a raw CPU descriptor string."""
- s = re.sub(r"\s+", " ", s.strip())
- s = s.replace("(TM)", "").replace("(tm)", "").replace("(R)", "").replace("(r)", "").strip()
- # Normalize common Intel pattern to 'Model Freq'
- m = re.search(r"(Intel.*?i\d[\w-]*) CPU @ ([\d.]+GHz)", s, re.I)
- if m:
- return f"{m.group(1)} {m.group(2)}"
- # Normalize common AMD Ryzen pattern to 'Model Freq'
- m = re.search(r"(AMD.*?Ryzen.*?[\w-]*) CPU @ ([\d.]+GHz)", s, re.I)
- if m:
- return f"{m.group(1)} {m.group(2)}"
- return s
-
- def __str__(self) -> str:
- """Return the normalized CPU name."""
- return self.name()
-
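A small demonstration of the normalization in CPUInfo._clean above, applied to a made-up raw descriptor:

import re

def clean(s):
    s = re.sub(r"\s+", " ", s.strip())
    s = s.replace("(TM)", "").replace("(tm)", "").replace("(R)", "").replace("(r)", "").strip()
    m = re.search(r"(Intel.*?i\d[\w-]*) CPU @ ([\d.]+GHz)", s, re.I)
    return f"{m.group(1)} {m.group(2)}" if m else s

print(clean("Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz"))  # Intel Core i7-9750H 2.60GHz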
-
-if __name__ == "__main__":
- print(CPUInfo.name())
diff --git a/ultralytics/utils/dist.py b/ultralytics/utils/dist.py
deleted file mode 100644
index 30d7c04..0000000
--- a/ultralytics/utils/dist.py
+++ /dev/null
@@ -1,127 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-import os
-import shutil
-import sys
-import tempfile
-
-from . import USER_CONFIG_DIR
-from .torch_utils import TORCH_1_9
-
-
-def find_free_network_port() -> int:
- """
- Find a free port on localhost.
-
- It is useful in single-node training when we don't want to connect to a real main node but have to set the
- `MASTER_PORT` environment variable.
-
- Returns:
- (int): The available network port number.
- """
- import socket
-
- with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
- s.bind(("127.0.0.1", 0))
- return s.getsockname()[1] # port
-
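A hedged usage sketch: grab a free local port the same way and export it for single-node DDP (MASTER_PORT is the environment variable torch.distributed reads for rendezvous):

import os
import socket

with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
    s.bind(("127.0.0.1", 0))  # port 0 asks the OS for any free port
    port = s.getsockname()[1]

os.environ["MASTER_PORT"] = str(port)
print(port)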
-
-def generate_ddp_file(trainer):
- """
- Generate a DDP (Distributed Data Parallel) file for multi-GPU training.
-
- This function creates a temporary Python file that enables distributed training across multiple GPUs.
- The file contains the necessary configuration to initialize the trainer in a distributed environment.
-
- Args:
- trainer (ultralytics.engine.trainer.BaseTrainer): The trainer containing training configuration and arguments.
- Must have args attribute and be a class instance.
-
- Returns:
- (str): Path to the generated temporary DDP file.
-
- Notes:
- The generated file is saved in the USER_CONFIG_DIR/DDP directory and includes:
- - Trainer class import
- - Configuration overrides from the trainer arguments
- - Model path configuration
- - Training initialization code
- """
- module, name = f"{trainer.__class__.__module__}.{trainer.__class__.__name__}".rsplit(".", 1)
-
- content = f"""
-# Ultralytics Multi-GPU training temp file (should be automatically deleted after use)
-overrides = {vars(trainer.args)}
-
-if __name__ == "__main__":
- from {module} import {name}
- from ultralytics.utils import DEFAULT_CFG_DICT
-
- cfg = DEFAULT_CFG_DICT.copy()
- cfg.update(save_dir='') # handle the extra key 'save_dir'
- trainer = {name}(cfg=cfg, overrides=overrides)
- trainer.args.model = "{getattr(trainer.hub_session, "model_url", trainer.args.model)}"
- results = trainer.train()
-"""
- (USER_CONFIG_DIR / "DDP").mkdir(exist_ok=True)
- with tempfile.NamedTemporaryFile(
- prefix="_temp_",
- suffix=f"{id(trainer)}.py",
- mode="w+",
- encoding="utf-8",
- dir=USER_CONFIG_DIR / "DDP",
- delete=False,
- ) as file:
- file.write(content)
- return file.name
-
-
-def generate_ddp_command(trainer):
- """
- Generate command for distributed training.
-
- Args:
- trainer (ultralytics.engine.trainer.BaseTrainer): The trainer containing configuration for distributed training.
-
- Returns:
- cmd (list[str]): The command to execute for distributed training.
- file (str): Path to the temporary file created for DDP training.
- """
- import __main__ # noqa local import to avoid https://github.com/Lightning-AI/pytorch-lightning/issues/15218
-
- if not trainer.resume:
- shutil.rmtree(trainer.save_dir) # remove the save_dir
- file = generate_ddp_file(trainer)
- dist_cmd = "torch.distributed.run" if TORCH_1_9 else "torch.distributed.launch"
- port = find_free_network_port()
- cmd = [
- sys.executable,
- "-m",
- dist_cmd,
- "--nproc_per_node",
- f"{trainer.world_size}",
- "--master_port",
- f"{port}",
- file,
- ]
- return cmd, file
-
-
-def ddp_cleanup(trainer, file):
- """
- Delete temporary file if created during distributed data parallel (DDP) training.
-
- This function checks if the provided file contains the trainer's ID in its name, indicating it was created
- as a temporary file for DDP training, and deletes it if so.
-
- Args:
- trainer (ultralytics.engine.trainer.BaseTrainer): The trainer used for distributed training.
- file (str): Path to the file that might need to be deleted.
-
- Examples:
- >>> trainer = YOLOTrainer()
- >>> file = "/tmp/ddp_temp_123456789.py"
- >>> ddp_cleanup(trainer, file)
- """
- if f"{id(trainer)}.py" in file: # if temp_file suffix in file
- os.remove(file)
diff --git a/ultralytics/utils/downloads.py b/ultralytics/utils/downloads.py
deleted file mode 100644
index 6257d21..0000000
--- a/ultralytics/utils/downloads.py
+++ /dev/null
@@ -1,541 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from __future__ import annotations
-
-import re
-import shutil
-import subprocess
-from itertools import repeat
-from multiprocessing.pool import ThreadPool
-from pathlib import Path
-from urllib import parse, request
-
-from ultralytics.utils import LOGGER, TQDM, checks, clean_url, emojis, is_online, url2file
-
-# Define Ultralytics GitHub assets maintained at https://github.com/ultralytics/assets
-GITHUB_ASSETS_REPO = "ultralytics/assets"
-GITHUB_ASSETS_NAMES = frozenset(
- [f"yolov8{k}{suffix}.pt" for k in "nsmlx" for suffix in ("", "-cls", "-seg", "-pose", "-obb", "-oiv7")]
- + [f"yolo11{k}{suffix}.pt" for k in "nsmlx" for suffix in ("", "-cls", "-seg", "-pose", "-obb")]
- + [f"yolo12{k}{suffix}.pt" for k in "nsmlx" for suffix in ("",)] # detect models only currently
- + [f"yolov5{k}{resolution}u.pt" for k in "nsmlx" for resolution in ("", "6")]
- + [f"yolov3{k}u.pt" for k in ("", "-spp", "-tiny")]
- + [f"yolov8{k}-world.pt" for k in "smlx"]
- + [f"yolov8{k}-worldv2.pt" for k in "smlx"]
- + [f"yoloe-v8{k}{suffix}.pt" for k in "sml" for suffix in ("-seg", "-seg-pf")]
- + [f"yoloe-11{k}{suffix}.pt" for k in "sml" for suffix in ("-seg", "-seg-pf")]
- + [f"yolov9{k}.pt" for k in "tsmce"]
- + [f"yolov10{k}.pt" for k in "nsmblx"]
- + [f"yolo_nas_{k}.pt" for k in "sml"]
- + [f"sam_{k}.pt" for k in "bl"]
- + [f"sam2_{k}.pt" for k in "blst"]
- + [f"sam2.1_{k}.pt" for k in "blst"]
- + [f"FastSAM-{k}.pt" for k in "sx"]
- + [f"rtdetr-{k}.pt" for k in "lx"]
- + [
- "mobile_sam.pt",
- "mobileclip_blt.ts",
- "yolo11n-grayscale.pt",
- "calibration_image_sample_data_20x128x128x3_float32.npy.zip",
- ]
-)
-GITHUB_ASSETS_STEMS = frozenset(k.rpartition(".")[0] for k in GITHUB_ASSETS_NAMES)
-
-
-def is_url(url: str | Path, check: bool = False) -> bool:
- """
- Validate if the given string is a URL and optionally check if the URL exists online.
-
- Args:
- url (str): The string to be validated as a URL.
- check (bool, optional): If True, performs an additional check to see if the URL exists online.
-
- Returns:
- (bool): True for a valid URL. If 'check' is True, also returns True if the URL exists online.
-
- Examples:
- >>> valid = is_url("https://www.example.com")
- >>> valid_and_exists = is_url("https://www.example.com", check=True)
- """
- try:
- url = str(url)
- result = parse.urlparse(url)
- assert all([result.scheme, result.netloc]) # check if is url
- if check:
- with request.urlopen(url) as response:
- return response.getcode() == 200 # check if exists online
- return True
- except Exception:
- return False
-
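The check=False path boils down to validating that both a scheme and a netloc are present; a sketch with example strings and no network access:

from urllib import parse

for candidate in ("https://ultralytics.com/images/bus.jpg", "not a url", "file.txt"):
    r = parse.urlparse(str(candidate))
    print(candidate, "->", all([r.scheme, r.netloc]))
# only the first prints True; the others lack a scheme and netloc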
-
-def delete_dsstore(path: str | Path, files_to_delete: tuple[str, ...] = (".DS_Store", "__MACOSX")) -> None:
- """
- Delete all specified system files in a directory.
-
- Args:
- path (str | Path): The directory path where the files should be deleted.
- files_to_delete (tuple): The files to be deleted.
-
- Examples:
- >>> from ultralytics.utils.downloads import delete_dsstore
- >>> delete_dsstore("path/to/dir")
-
- Notes:
- ".DS_Store" files are created by the Apple operating system and contain metadata about folders and files. They
- are hidden system files and can cause issues when transferring files between different operating systems.
- """
- for file in files_to_delete:
- matches = list(Path(path).rglob(file))
- LOGGER.info(f"Deleting {file} files: {matches}")
- for f in matches:
- f.unlink()
-
-
-def zip_directory(
- directory: str | Path,
- compress: bool = True,
- exclude: tuple[str, ...] = (".DS_Store", "__MACOSX"),
- progress: bool = True,
-) -> Path:
- """
- Zip the contents of a directory, excluding specified files.
-
- The resulting zip file is named after the directory and placed alongside it.
-
- Args:
- directory (str | Path): The path to the directory to be zipped.
- compress (bool): Whether to compress the files while zipping.
- exclude (tuple, optional): A tuple of filename strings to be excluded.
- progress (bool, optional): Whether to display a progress bar.
-
- Returns:
- (Path): The path to the resulting zip file.
-
- Examples:
- >>> from ultralytics.utils.downloads import zip_directory
- >>> file = zip_directory("path/to/dir")
- """
- from zipfile import ZIP_DEFLATED, ZIP_STORED, ZipFile
-
- delete_dsstore(directory)
- directory = Path(directory)
- if not directory.is_dir():
- raise FileNotFoundError(f"Directory '{directory}' does not exist.")
-
- # Zip with progress bar
- files = [f for f in directory.rglob("*") if f.is_file() and all(x not in f.name for x in exclude)] # files to zip
- zip_file = directory.with_suffix(".zip")
- compression = ZIP_DEFLATED if compress else ZIP_STORED
- with ZipFile(zip_file, "w", compression) as f:
- for file in TQDM(files, desc=f"Zipping {directory} to {zip_file}...", unit="files", disable=not progress):
- f.write(file, file.relative_to(directory))
-
- return zip_file # return path to zip file
-
-
-def unzip_file(
- file: str | Path,
- path: str | Path | None = None,
- exclude: tuple[str, ...] = (".DS_Store", "__MACOSX"),
- exist_ok: bool = False,
- progress: bool = True,
-) -> Path:
- """
- Unzip a *.zip file to the specified path, excluding specified files.
-
- If the zipfile does not contain a single top-level directory, the function will create a new
- directory with the same name as the zipfile (without the extension) to extract its contents.
- If a path is not provided, the function will use the parent directory of the zipfile as the default path.
-
- Args:
- file (str | Path): The path to the zipfile to be extracted.
- path (str | Path, optional): The path to extract the zipfile to.
- exclude (tuple, optional): A tuple of filename strings to be excluded.
- exist_ok (bool, optional): Whether to overwrite existing contents if they exist.
- progress (bool, optional): Whether to display a progress bar.
-
- Returns:
- (Path): The path to the directory where the zipfile was extracted.
-
- Raises:
- BadZipFile: If the provided file does not exist or is not a valid zipfile.
-
- Examples:
- >>> from ultralytics.utils.downloads import unzip_file
- >>> directory = unzip_file("path/to/file.zip")
- """
- from zipfile import BadZipFile, ZipFile, is_zipfile
-
- if not (Path(file).exists() and is_zipfile(file)):
- raise BadZipFile(f"File '{file}' does not exist or is a bad zip file.")
- if path is None:
- path = Path(file).parent # default path
-
- # Unzip the file contents
- with ZipFile(file) as zipObj:
- files = [f for f in zipObj.namelist() if all(x not in f for x in exclude)]
- top_level_dirs = {Path(f).parts[0] for f in files}
-
- # Decide to unzip directly or unzip into a directory
- unzip_as_dir = len(top_level_dirs) == 1 # (len(files) > 1 and not files[0].endswith("/"))
- if unzip_as_dir:
- # Zip has 1 top-level directory
- extract_path = path # i.e. ../datasets
- path = Path(path) / list(top_level_dirs)[0] # i.e. extract coco8/ dir to ../datasets/
- else:
- # Zip has multiple files at top level
- path = extract_path = Path(path) / Path(file).stem # i.e. extract multiple files to ../datasets/coco8/
-
- # Check if destination directory already exists and contains files
- if path.exists() and any(path.iterdir()) and not exist_ok:
- # If it exists and is not empty, return the path without unzipping
- LOGGER.warning(f"Skipping {file} unzip as destination directory {path} is not empty.")
- return path
-
- for f in TQDM(files, desc=f"Unzipping {file} to {Path(path).resolve()}...", unit="files", disable=not progress):
- # Ensure the file is within the extract_path to avoid path traversal security vulnerability
- if ".." in Path(f).parts:
- LOGGER.warning(f"Potentially insecure file path: {f}, skipping extraction.")
- continue
- zipObj.extract(f, extract_path)
-
- return path # return unzip dir
-
-
-def check_disk_space(
- file_bytes: int,
- path: str | Path = Path.cwd(),
- sf: float = 1.5,
- hard: bool = True,
-) -> bool:
- """
- Check if there is sufficient disk space to download and store a file.
-
- Args:
- file_bytes (int): The file size in bytes.
- path (str | Path, optional): The path or drive to check the available free space on.
- sf (float, optional): Safety factor, the multiplier for the required free space.
- hard (bool, optional): Whether to raise an error on insufficient disk space.
-
- Returns:
- (bool): True if there is sufficient disk space, False otherwise.
- """
- total, used, free = shutil.disk_usage(path) # bytes
- if file_bytes * sf < free:
- return True # sufficient space
-
- # Insufficient space
- text = (
- f"Insufficient free disk space {free / (1 << 30):.3f} GB < {file_bytes * sf / (1 << 30):.3f} GB required. "
- f"Please free {(file_bytes * sf - free) / (1 << 30):.3f} GB additional disk space and try again."
- )
- if hard:
- raise MemoryError(text)
- LOGGER.warning(text)
- return False
-
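The arithmetic above is just a safety-factor comparison; an illustration with made-up numbers and no real disk query:

file_bytes = 2 * (1 << 30)     # hypothetical 2 GiB download
sf = 1.5                       # require 1.5x the file size to be free
free = 10 * (1 << 30)          # pretend 10 GiB are free
print(file_bytes * sf < free)  # True -> enough space, download proceeds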
-
-def get_google_drive_file_info(link: str) -> tuple[str, str | None]:
- """
- Retrieve the direct download link and filename for a shareable Google Drive file link.
-
- Args:
- link (str): The shareable link of the Google Drive file.
-
- Returns:
- url (str): Direct download URL for the Google Drive file.
- filename (str | None): Original filename of the Google Drive file. If filename extraction fails, returns None.
-
- Examples:
- >>> from ultralytics.utils.downloads import get_google_drive_file_info
- >>> link = "https://drive.google.com/file/d/1cqT-cJgANNrhIHCrEufUYhQ4RqiWG_lJ/view?usp=drive_link"
- >>> url, filename = get_google_drive_file_info(link)
- """
- import requests # scoped as slow import
-
- file_id = link.split("/d/")[1].split("/view", 1)[0]
- drive_url = f"https://drive.google.com/uc?export=download&id={file_id}"
- filename = None
-
- # Start session
- with requests.Session() as session:
- response = session.get(drive_url, stream=True)
- if "quota exceeded" in str(response.content.lower()):
- raise ConnectionError(
- emojis(
- f"❌ Google Drive file download quota exceeded. "
- f"Please try again later or download this file manually at {link}."
- )
- )
- for k, v in response.cookies.items():
- if k.startswith("download_warning"):
- drive_url += f"&confirm={v}" # v is token
- if cd := response.headers.get("content-disposition"):
- filename = re.findall('filename="(.+)"', cd)[0]
- return drive_url, filename
-
-
-def safe_download(
- url: str | Path,
- file: str | Path | None = None,
- dir: str | Path | None = None,
- unzip: bool = True,
- delete: bool = False,
- curl: bool = False,
- retry: int = 3,
- min_bytes: float = 1e0,
- exist_ok: bool = False,
- progress: bool = True,
-) -> Path | str:
- """
- Download files from a URL with options for retrying, unzipping, and deleting the downloaded file. Enhanced with
- robust partial download detection using Content-Length validation.
-
- Args:
- url (str): The URL of the file to be downloaded.
- file (str, optional): The filename of the downloaded file.
- If not provided, the file will be saved with the same name as the URL.
- dir (str | Path, optional): The directory to save the downloaded file.
- If not provided, the file will be saved in the current working directory.
- unzip (bool, optional): Whether to unzip the downloaded file.
- delete (bool, optional): Whether to delete the downloaded file after unzipping.
- curl (bool, optional): Whether to use curl command line tool for downloading.
- retry (int, optional): The number of times to retry the download in case of failure.
- min_bytes (float, optional): The minimum number of bytes that the downloaded file should have, to be considered
- a successful download.
- exist_ok (bool, optional): Whether to overwrite existing contents during unzipping.
- progress (bool, optional): Whether to display a progress bar during the download.
-
- Returns:
- (Path | str): The path to the downloaded file or extracted directory.
-
- Examples:
- >>> from ultralytics.utils.downloads import safe_download
- >>> link = "https://ultralytics.com/assets/bus.jpg"
- >>> path = safe_download(link)
- """
- gdrive = url.startswith("https://drive.google.com/") # check if the URL is a Google Drive link
- if gdrive:
- url, file = get_google_drive_file_info(url)
-
- f = Path(dir or ".") / (file or url2file(url)) # URL converted to filename
- if "://" not in str(url) and Path(url).is_file(): # URL exists ('://' check required in Windows Python<3.10)
- f = Path(url) # filename
- elif not f.is_file(): # URL and file do not exist
- uri = (url if gdrive else clean_url(url)).replace( # cleaned and aliased url
- "https://github.com/ultralytics/assets/releases/download/v0.0.0/",
- "https://ultralytics.com/assets/", # assets alias
- )
- desc = f"Downloading {uri} to '{f}'"
- f.parent.mkdir(parents=True, exist_ok=True) # make directory if missing
- curl_installed = shutil.which("curl")
- for i in range(retry + 1):
- try:
- if (curl or i > 0) and curl_installed: # curl download with retry, continue
- s = "sS" * (not progress) # silent
- r = subprocess.run(["curl", "-#", f"-{s}L", url, "-o", f, "--retry", "3", "-C", "-"]).returncode
- assert r == 0, f"Curl return value {r}"
- expected_size = None # Can't get size with curl
- else: # urllib download
- with request.urlopen(url) as response:
- expected_size = int(response.getheader("Content-Length", 0))
- if i == 0 and expected_size > 1048576:
- check_disk_space(expected_size, path=f.parent)
- buffer_size = max(8192, min(1048576, expected_size // 1000)) if expected_size else 8192
- with TQDM(
- total=expected_size,
- desc=desc,
- disable=not progress,
- unit="B",
- unit_scale=True,
- unit_divisor=1024,
- ) as pbar:
- with open(f, "wb") as f_opened:
- while True:
- data = response.read(buffer_size)
- if not data:
- break
- f_opened.write(data)
- pbar.update(len(data))
-
- if f.exists():
- file_size = f.stat().st_size
- if file_size > min_bytes:
- # Check if download is complete (only if we have expected_size)
- if expected_size and file_size != expected_size:
- LOGGER.warning(
- f"Partial download: {file_size}/{expected_size} bytes ({file_size / expected_size * 100:.1f}%)"
- )
- else:
- break # success
- f.unlink() # remove partial downloads
- except MemoryError:
- raise # Re-raise immediately - no point retrying if insufficient disk space
- except Exception as e:
- if i == 0 and not is_online():
- raise ConnectionError(emojis(f"❌ Download failure for {uri}. Environment is not online.")) from e
- elif i >= retry:
- raise ConnectionError(emojis(f"❌ Download failure for {uri}. Retry limit reached.")) from e
- LOGGER.warning(f"Download failure, retrying {i + 1}/{retry} {uri}...")
-
- if unzip and f.exists() and f.suffix in {"", ".zip", ".tar", ".gz"}:
- from zipfile import is_zipfile
-
- unzip_dir = (dir or f.parent).resolve() # unzip to dir if provided else unzip in place
- if is_zipfile(f):
- unzip_dir = unzip_file(file=f, path=unzip_dir, exist_ok=exist_ok, progress=progress) # unzip
- elif f.suffix in {".tar", ".gz"}:
- LOGGER.info(f"Unzipping {f} to {unzip_dir}...")
- subprocess.run(["tar", "xf" if f.suffix == ".tar" else "xfz", f, "--directory", unzip_dir], check=True)
- if delete:
- f.unlink() # remove zip
- return unzip_dir
- return f
-
-
-def get_github_assets(
- repo: str = "ultralytics/assets",
- version: str = "latest",
- retry: bool = False,
-) -> tuple[str, list[str]]:
- """
- Retrieve the specified version's tag and assets from a GitHub repository.
-
- If the version is not specified, the function fetches the latest release assets.
-
- Args:
- repo (str, optional): The GitHub repository in the format 'owner/repo'.
- version (str, optional): The release version to fetch assets from.
- retry (bool, optional): Flag to retry the request in case of a failure.
-
- Returns:
- tag (str): The release tag.
- assets (list[str]): A list of asset names.
-
- Examples:
- >>> tag, assets = get_github_assets(repo="ultralytics/assets", version="latest")
- """
- import requests # scoped as slow import
-
- if version != "latest":
- version = f"tags/{version}" # i.e. tags/v6.2
- url = f"https://api.github.com/repos/{repo}/releases/{version}"
- r = requests.get(url) # github api
- if r.status_code != 200 and r.reason != "rate limit exceeded" and retry: # failed and not 403 rate limit exceeded
- r = requests.get(url) # try again
- if r.status_code != 200:
- LOGGER.warning(f"GitHub assets check failure for {url}: {r.status_code} {r.reason}")
- return "", []
- data = r.json()
- return data["tag_name"], [x["name"] for x in data["assets"]] # tag, assets i.e. ['yolo11n.pt', 'yolov8s.pt', ...]
-
-
-def attempt_download_asset(
- file: str | Path,
- repo: str = "ultralytics/assets",
- release: str = "v8.3.0",
- **kwargs,
-) -> str:
- """
- Attempt to download a file from GitHub release assets if it is not found locally.
-
- Args:
- file (str | Path): The filename or file path to be downloaded.
- repo (str, optional): The GitHub repository in the format 'owner/repo'.
- release (str, optional): The specific release version to be downloaded.
- **kwargs (Any): Additional keyword arguments for the download process.
-
- Returns:
- (str): The path to the downloaded file.
-
- Examples:
- >>> file_path = attempt_download_asset("yolo11n.pt", repo="ultralytics/assets", release="latest")
- """
- from ultralytics.utils import SETTINGS # scoped for circular import
-
- # YOLOv3/5u updates
- file = str(file)
- file = checks.check_yolov5u_filename(file)
- file = Path(file.strip().replace("'", ""))
- if file.exists():
- return str(file)
- elif (SETTINGS["weights_dir"] / file).exists():
- return str(SETTINGS["weights_dir"] / file)
- else:
- # URL specified
- name = Path(parse.unquote(str(file))).name # decode '%2F' to '/' etc.
- download_url = f"https://github.com/{repo}/releases/download"
- if str(file).startswith(("http:/", "https:/")): # download
- url = str(file).replace(":/", "://") # Pathlib turns :// -> :/
- file = url2file(name) # parse authentication https://url.com/file.txt?auth...
- if Path(file).is_file():
- LOGGER.info(f"Found {clean_url(url)} locally at {file}") # file already exists
- else:
- safe_download(url=url, file=file, min_bytes=1e5, **kwargs)
-
- elif repo == GITHUB_ASSETS_REPO and name in GITHUB_ASSETS_NAMES:
- safe_download(url=f"{download_url}/{release}/{name}", file=file, min_bytes=1e5, **kwargs)
-
- else:
- tag, assets = get_github_assets(repo, release)
- if not assets:
- tag, assets = get_github_assets(repo) # latest release
- if name in assets:
- safe_download(url=f"{download_url}/{tag}/{name}", file=file, min_bytes=1e5, **kwargs)
-
- return str(file)
-
-
-def download(
- url: str | list[str] | Path,
- dir: Path = Path.cwd(),
- unzip: bool = True,
- delete: bool = False,
- curl: bool = False,
- threads: int = 1,
- retry: int = 3,
- exist_ok: bool = False,
-) -> None:
- """
- Download files from specified URLs to a given directory.
-
- Supports concurrent downloads if multiple threads are specified.
-
- Args:
-        url (str | list[str] | Path): The URL or list of URLs of the files to be downloaded.
- dir (Path, optional): The directory where the files will be saved.
- unzip (bool, optional): Flag to unzip the files after downloading.
- delete (bool, optional): Flag to delete the zip files after extraction.
- curl (bool, optional): Flag to use curl for downloading.
- threads (int, optional): Number of threads to use for concurrent downloads.
- retry (int, optional): Number of retries in case of download failure.
- exist_ok (bool, optional): Whether to overwrite existing contents during unzipping.
-
- Examples:
- >>> download("https://ultralytics.com/assets/example.zip", dir="path/to/dir", unzip=True)
- """
- dir = Path(dir)
- dir.mkdir(parents=True, exist_ok=True) # make directory
- urls = [url] if isinstance(url, (str, Path)) else url
- if threads > 1:
- LOGGER.info(f"Downloading {len(urls)} file(s) with {threads} threads to {dir}...")
- with ThreadPool(threads) as pool:
- pool.map(
- lambda x: safe_download(
- url=x[0],
- dir=x[1],
- unzip=unzip,
- delete=delete,
- curl=curl,
- retry=retry,
- exist_ok=exist_ok,
- progress=True,
- ),
- zip(urls, repeat(dir)),
- )
- pool.close()
- pool.join()
- else:
- for u in urls:
- safe_download(url=u, dir=dir, unzip=unzip, delete=delete, curl=curl, retry=retry, exist_ok=exist_ok)
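
A minimal usage sketch of the two removed entry points, assuming the public asset URLs referenced in the docstrings above; the URLs and target directories are illustrative only. `download()` simply dispatches each URL to `safe_download()`, optionally through a thread pool.

# Sketch: single-file and multi-threaded use of the removed download helpers (URLs/paths illustrative).
from pathlib import Path

from ultralytics.utils.downloads import download, safe_download

# One file: download, unzip if it is an archive, keep the archive afterwards
path = safe_download("https://ultralytics.com/assets/coco8.zip", dir=Path("datasets"), unzip=True, delete=False)

# Several files: each URL is handed to safe_download() through a thread pool
download(
    ["https://ultralytics.com/assets/bus.jpg", "https://ultralytics.com/assets/zidane.jpg"],
    dir=Path("assets"),
    threads=2,
)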
diff --git a/ultralytics/utils/errors.py b/ultralytics/utils/errors.py
deleted file mode 100644
index 036c23e..0000000
--- a/ultralytics/utils/errors.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from ultralytics.utils import emojis
-
-
-class HUBModelError(Exception):
- """
- Exception raised when a model cannot be found or retrieved from ultralytics HUB.
-
- This custom exception is used specifically for handling errors related to model fetching in Ultralytics YOLO.
- The error message is processed to include emojis for better user experience.
-
- Attributes:
- message (str): The error message displayed when the exception is raised.
-
- Methods:
- __init__: Initialize the HUBModelError with a custom message.
-
- Examples:
- >>> try:
- ... # Code that might fail to find a model
- ... raise HUBModelError("Custom model not found message")
- ... except HUBModelError as e:
- ... print(e) # Displays the emoji-enhanced error message
- """
-
- def __init__(self, message: str = "Model not found. Please check model URL and try again."):
- """
- Initialize a HUBModelError exception.
-
- This exception is raised when a requested model is not found or cannot be retrieved from ultralytics HUB.
- The message is processed to include emojis for better user experience.
-
- Args:
- message (str, optional): The error message to display when the exception is raised.
-
- Examples:
- >>> try:
- ... raise HUBModelError("Custom model error message")
- ... except HUBModelError as e:
- ... print(e)
- """
- super().__init__(emojis(message))
diff --git a/ultralytics/utils/events.py b/ultralytics/utils/events.py
deleted file mode 100644
index d267911..0000000
--- a/ultralytics/utils/events.py
+++ /dev/null
@@ -1,115 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-import json
-import random
-import time
-from pathlib import Path
-from threading import Thread
-from urllib.request import Request, urlopen
-
-from ultralytics import SETTINGS, __version__
-from ultralytics.utils import ARGV, ENVIRONMENT, GIT, IS_PIP_PACKAGE, ONLINE, PYTHON_VERSION, RANK, TESTS_RUNNING
-from ultralytics.utils.downloads import GITHUB_ASSETS_NAMES
-from ultralytics.utils.torch_utils import get_cpu_info
-
-
-def _post(url: str, data: dict, timeout: float = 5.0) -> None:
- """Send a one-shot JSON POST request."""
- try:
- body = json.dumps(data, separators=(",", ":")).encode() # compact JSON
- req = Request(url, data=body, headers={"Content-Type": "application/json"})
- urlopen(req, timeout=timeout).close()
- except Exception:
- pass
-
-
-class Events:
- """
- Collect and send anonymous usage analytics with rate-limiting.
-
- Event collection and transmission are enabled when sync is enabled in settings, the current process is rank -1 or 0,
- tests are not running, the environment is online, and the installation source is either pip or the official
- Ultralytics GitHub repository.
-
- Attributes:
- url (str): Measurement Protocol endpoint for receiving anonymous events.
- events (list[dict]): In-memory queue of event payloads awaiting transmission.
- rate_limit (float): Minimum time in seconds between POST requests.
- t (float): Timestamp of the last transmission in seconds since the epoch.
- metadata (dict): Static metadata describing runtime, installation source, and environment.
- enabled (bool): Flag indicating whether analytics collection is active.
-
- Methods:
- __init__: Initialize the event queue, rate limiter, and runtime metadata.
- __call__: Queue an event and trigger a non-blocking send when the rate limit elapses.
- """
-
- url = "https://www.google-analytics.com/mp/collect?measurement_id=G-X8NCJYTQXM&api_secret=QLQrATrNSwGRFRLE-cbHJw"
-
- def __init__(self) -> None:
- """Initialize the Events instance with queue, rate limiter, and environment metadata."""
- self.events = [] # pending events
- self.rate_limit = 30.0 # rate limit (seconds)
- self.t = 0.0 # last send timestamp (seconds)
- self.metadata = {
- "cli": Path(ARGV[0]).name == "yolo",
- "install": "git" if GIT.is_repo else "pip" if IS_PIP_PACKAGE else "other",
- "python": PYTHON_VERSION.rsplit(".", 1)[0], # i.e. 3.13
- "CPU": get_cpu_info(),
- # "GPU": get_gpu_info(index=0) if cuda else None,
- "version": __version__,
- "env": ENVIRONMENT,
- "session_id": round(random.random() * 1e15),
- "engagement_time_msec": 1000,
- }
- self.enabled = (
- SETTINGS["sync"]
- and RANK in {-1, 0}
- and not TESTS_RUNNING
- and ONLINE
- and (IS_PIP_PACKAGE or GIT.origin == "https://github.com/ultralytics/ultralytics.git")
- )
-
- def __call__(self, cfg, device=None) -> None:
- """
- Queue an event and flush the queue asynchronously when the rate limit elapses.
-
- Args:
- cfg (IterableSimpleNamespace): The configuration object containing mode and task information.
- device (torch.device | str, optional): The device type (e.g., 'cpu', 'cuda').
- """
- if not self.enabled:
- # Events disabled, do nothing
- return
-
- # Attempt to enqueue a new event
- if len(self.events) < 25: # Queue limited to 25 events to bound memory and traffic
- params = {
- **self.metadata,
- "task": cfg.task,
- "model": cfg.model if cfg.model in GITHUB_ASSETS_NAMES else "custom",
- "device": str(device),
- }
- if cfg.mode == "export":
- params["format"] = cfg.format
- self.events.append({"name": cfg.mode, "params": params})
-
- # Check rate limit and return early if under limit
- t = time.time()
- if (t - self.t) < self.rate_limit:
- return
-
-        # Rate limit elapsed: send a snapshot of queued events in a background thread
- payload_events = list(self.events) # snapshot to avoid race with queue reset
- Thread(
- target=_post,
- args=(self.url, {"client_id": SETTINGS["uuid"], "events": payload_events}), # SHA-256 anonymized
- daemon=True,
- ).start()
-
- # Reset queue and rate limit timer
- self.events = []
- self.t = t
-
-
-events = Events()
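
The queue-and-flush pattern implemented by `Events.__call__` above can be isolated in a small sketch. The class below is a simplified stand-in, not the Ultralytics API, and the `send` callable is a hypothetical placeholder for the real Measurement Protocol POST.

# Simplified stand-in for the rate-limited event queue used by Events above (not the Ultralytics API).
import time
from threading import Thread

class RateLimitedQueue:
    def __init__(self, rate_limit=30.0, max_events=25):
        self.events, self.rate_limit, self.max_events, self.t = [], rate_limit, max_events, 0.0

    def __call__(self, event: dict, send) -> None:
        if len(self.events) < self.max_events:
            self.events.append(event)  # cap the queue to bound memory and traffic
        now = time.time()
        if now - self.t < self.rate_limit:
            return  # under the rate limit: keep queueing
        Thread(target=send, args=(list(self.events),), daemon=True).start()  # send a snapshot, non-blocking
        self.events, self.t = [], now  # reset queue and timer

queue = RateLimitedQueue()
queue({"name": "train", "params": {"task": "detect"}}, send=print)  # first call flushes immediately (t starts at 0)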
diff --git a/ultralytics/utils/export/__init__.py b/ultralytics/utils/export/__init__.py
deleted file mode 100644
index 5e028e6..0000000
--- a/ultralytics/utils/export/__init__.py
+++ /dev/null
@@ -1,239 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from __future__ import annotations
-
-import json
-from pathlib import Path
-
-import torch
-
-from ultralytics.utils import IS_JETSON, LOGGER
-
-from .imx import torch2imx # noqa
-
-
-def torch2onnx(
- torch_model: torch.nn.Module,
- im: torch.Tensor,
- onnx_file: str,
- opset: int = 14,
- input_names: list[str] = ["images"],
- output_names: list[str] = ["output0"],
- dynamic: bool | dict = False,
-) -> None:
- """
- Export a PyTorch model to ONNX format.
-
- Args:
- torch_model (torch.nn.Module): The PyTorch model to export.
- im (torch.Tensor): Example input tensor for the model.
- onnx_file (str): Path to save the exported ONNX file.
- opset (int): ONNX opset version to use for export.
- input_names (list[str]): List of input tensor names.
- output_names (list[str]): List of output tensor names.
- dynamic (bool | dict, optional): Whether to enable dynamic axes.
-
- Notes:
- Setting `do_constant_folding=True` may cause issues with DNN inference for torch>=1.12.
- """
- torch.onnx.export(
- torch_model,
- im,
- onnx_file,
- verbose=False,
- opset_version=opset,
- do_constant_folding=True, # WARNING: DNN inference with torch>=1.12 may require do_constant_folding=False
- input_names=input_names,
- output_names=output_names,
- dynamic_axes=dynamic or None,
- )
-
-
-def onnx2engine(
- onnx_file: str,
- engine_file: str | None = None,
- workspace: int | None = None,
- half: bool = False,
- int8: bool = False,
- dynamic: bool = False,
- shape: tuple[int, int, int, int] = (1, 3, 640, 640),
- dla: int | None = None,
- dataset=None,
- metadata: dict | None = None,
- verbose: bool = False,
- prefix: str = "",
-) -> None:
- """
- Export a YOLO model to TensorRT engine format.
-
- Args:
- onnx_file (str): Path to the ONNX file to be converted.
- engine_file (str, optional): Path to save the generated TensorRT engine file.
- workspace (int, optional): Workspace size in GB for TensorRT.
- half (bool, optional): Enable FP16 precision.
- int8 (bool, optional): Enable INT8 precision.
- dynamic (bool, optional): Enable dynamic input shapes.
- shape (tuple[int, int, int, int], optional): Input shape (batch, channels, height, width).
- dla (int, optional): DLA core to use (Jetson devices only).
- dataset (ultralytics.data.build.InfiniteDataLoader, optional): Dataset for INT8 calibration.
- metadata (dict, optional): Metadata to include in the engine file.
- verbose (bool, optional): Enable verbose logging.
- prefix (str, optional): Prefix for log messages.
-
- Raises:
- ValueError: If DLA is enabled on non-Jetson devices or required precision is not set.
- RuntimeError: If the ONNX file cannot be parsed.
-
- Notes:
- TensorRT version compatibility is handled for workspace size and engine building.
- INT8 calibration requires a dataset and generates a calibration cache.
- Metadata is serialized and written to the engine file if provided.
- """
- import tensorrt as trt # noqa
-
- engine_file = engine_file or Path(onnx_file).with_suffix(".engine")
-
- logger = trt.Logger(trt.Logger.INFO)
- if verbose:
- logger.min_severity = trt.Logger.Severity.VERBOSE
-
- # Engine builder
- builder = trt.Builder(logger)
- config = builder.create_builder_config()
- workspace_bytes = int((workspace or 0) * (1 << 30))
- is_trt10 = int(trt.__version__.split(".", 1)[0]) >= 10 # is TensorRT >= 10
- if is_trt10 and workspace_bytes > 0:
- config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace_bytes)
- elif workspace_bytes > 0: # TensorRT versions 7, 8
- config.max_workspace_size = workspace_bytes
- flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
- network = builder.create_network(flag)
- half = builder.platform_has_fast_fp16 and half
- int8 = builder.platform_has_fast_int8 and int8
-
- # Optionally switch to DLA if enabled
- if dla is not None:
- if not IS_JETSON:
- raise ValueError("DLA is only available on NVIDIA Jetson devices")
- LOGGER.info(f"{prefix} enabling DLA on core {dla}...")
- if not half and not int8:
- raise ValueError(
- "DLA requires either 'half=True' (FP16) or 'int8=True' (INT8) to be enabled. Please enable one of them and try again."
- )
- config.default_device_type = trt.DeviceType.DLA
- config.DLA_core = int(dla)
- config.set_flag(trt.BuilderFlag.GPU_FALLBACK)
-
- # Read ONNX file
- parser = trt.OnnxParser(network, logger)
- if not parser.parse_from_file(onnx_file):
- raise RuntimeError(f"failed to load ONNX file: {onnx_file}")
-
- # Network inputs
- inputs = [network.get_input(i) for i in range(network.num_inputs)]
- outputs = [network.get_output(i) for i in range(network.num_outputs)]
- for inp in inputs:
- LOGGER.info(f'{prefix} input "{inp.name}" with shape{inp.shape} {inp.dtype}')
- for out in outputs:
- LOGGER.info(f'{prefix} output "{out.name}" with shape{out.shape} {out.dtype}')
-
- if dynamic:
- profile = builder.create_optimization_profile()
- min_shape = (1, shape[1], 32, 32) # minimum input shape
- max_shape = (*shape[:2], *(int(max(2, workspace or 2) * d) for d in shape[2:])) # max input shape
- for inp in inputs:
- profile.set_shape(inp.name, min=min_shape, opt=shape, max=max_shape)
- config.add_optimization_profile(profile)
- if int8:
- config.set_calibration_profile(profile)
-
- LOGGER.info(f"{prefix} building {'INT8' if int8 else 'FP' + ('16' if half else '32')} engine as {engine_file}")
- if int8:
- config.set_flag(trt.BuilderFlag.INT8)
- config.profiling_verbosity = trt.ProfilingVerbosity.DETAILED
-
- class EngineCalibrator(trt.IInt8Calibrator):
- """
- Custom INT8 calibrator for TensorRT engine optimization.
-
- This calibrator provides the necessary interface for TensorRT to perform INT8 quantization calibration
- using a dataset. It handles batch generation, caching, and calibration algorithm selection.
-
- Attributes:
- dataset: Dataset for calibration.
- data_iter: Iterator over the calibration dataset.
- algo (trt.CalibrationAlgoType): Calibration algorithm type.
- batch (int): Batch size for calibration.
- cache (Path): Path to save the calibration cache.
-
- Methods:
- get_algorithm: Get the calibration algorithm to use.
- get_batch_size: Get the batch size to use for calibration.
- get_batch: Get the next batch to use for calibration.
- read_calibration_cache: Use existing cache instead of calibrating again.
- write_calibration_cache: Write calibration cache to disk.
- """
-
- def __init__(
- self,
- dataset, # ultralytics.data.build.InfiniteDataLoader
- cache: str = "",
- ) -> None:
- """Initialize the INT8 calibrator with dataset and cache path."""
- trt.IInt8Calibrator.__init__(self)
- self.dataset = dataset
- self.data_iter = iter(dataset)
- self.algo = (
- trt.CalibrationAlgoType.ENTROPY_CALIBRATION_2 # DLA quantization needs ENTROPY_CALIBRATION_2
- if dla is not None
- else trt.CalibrationAlgoType.MINMAX_CALIBRATION
- )
- self.batch = dataset.batch_size
- self.cache = Path(cache)
-
- def get_algorithm(self) -> trt.CalibrationAlgoType:
- """Get the calibration algorithm to use."""
- return self.algo
-
- def get_batch_size(self) -> int:
- """Get the batch size to use for calibration."""
- return self.batch or 1
-
- def get_batch(self, names) -> list[int] | None:
- """Get the next batch to use for calibration, as a list of device memory pointers."""
- try:
- im0s = next(self.data_iter)["img"] / 255.0
- im0s = im0s.to("cuda") if im0s.device.type == "cpu" else im0s
- return [int(im0s.data_ptr())]
- except StopIteration:
- # Return None to signal to TensorRT there is no calibration data remaining
- return None
-
- def read_calibration_cache(self) -> bytes | None:
- """Use existing cache instead of calibrating again, otherwise, implicitly return None."""
- if self.cache.exists() and self.cache.suffix == ".cache":
- return self.cache.read_bytes()
-
- def write_calibration_cache(self, cache: bytes) -> None:
- """Write calibration cache to disk."""
- _ = self.cache.write_bytes(cache)
-
- # Load dataset w/ builder (for batching) and calibrate
- config.int8_calibrator = EngineCalibrator(
- dataset=dataset,
- cache=str(Path(onnx_file).with_suffix(".cache")),
- )
-
- elif half:
- config.set_flag(trt.BuilderFlag.FP16)
-
- # Write file
- build = builder.build_serialized_network if is_trt10 else builder.build_engine
- with build(network, config) as engine, open(engine_file, "wb") as t:
- # Metadata
- if metadata is not None:
- meta = json.dumps(metadata)
- t.write(len(meta).to_bytes(4, byteorder="little", signed=True))
- t.write(meta.encode())
- # Model
- t.write(engine if is_trt10 else engine.serialize())
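
A hedged sketch of chaining the two helpers above: the model and file names are illustrative, and a CUDA machine with TensorRT installed is assumed. The real Exporter performs additional preparation (layer fusion, export-mode flags) that is omitted here.

# Sketch: ONNX export followed by TensorRT engine build (illustrative names; TensorRT + CUDA assumed).
import torch

from ultralytics import YOLO
from ultralytics.utils.export import onnx2engine, torch2onnx

model = YOLO("yolo11n.pt").model.eval()  # underlying nn.Module
im = torch.zeros(1, 3, 640, 640)  # example input used for tracing
torch2onnx(model, im, "yolo11n.onnx", opset=14)
onnx2engine("yolo11n.onnx", "yolo11n.engine", half=True, shape=(1, 3, 640, 640))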
diff --git a/ultralytics/utils/export/__pycache__/__init__.cpython-310.pyc b/ultralytics/utils/export/__pycache__/__init__.cpython-310.pyc
deleted file mode 100644
index bb56cb5..0000000
Binary files a/ultralytics/utils/export/__pycache__/__init__.cpython-310.pyc and /dev/null differ
diff --git a/ultralytics/utils/export/__pycache__/imx.cpython-310.pyc b/ultralytics/utils/export/__pycache__/imx.cpython-310.pyc
deleted file mode 100644
index 7bb0716..0000000
Binary files a/ultralytics/utils/export/__pycache__/imx.cpython-310.pyc and /dev/null differ
diff --git a/ultralytics/utils/export/imx.py b/ultralytics/utils/export/imx.py
deleted file mode 100644
index a72ea31..0000000
--- a/ultralytics/utils/export/imx.py
+++ /dev/null
@@ -1,289 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from __future__ import annotations
-
-import subprocess
-import types
-from pathlib import Path
-
-import torch
-
-from ultralytics.nn.modules import Detect, Pose
-from ultralytics.utils import LOGGER
-from ultralytics.utils.tal import make_anchors
-from ultralytics.utils.torch_utils import copy_attr
-
-
-class FXModel(torch.nn.Module):
- """
- A custom model class for torch.fx compatibility.
-
- This class extends `torch.nn.Module` and is designed to ensure compatibility with torch.fx for tracing and graph
- manipulation. It copies attributes from an existing model and explicitly sets the model attribute to ensure proper
- copying.
-
- Attributes:
- model (nn.Module): The original model's layers.
- """
-
- def __init__(self, model, imgsz=(640, 640)):
- """
- Initialize the FXModel.
-
- Args:
- model (nn.Module): The original model to wrap for torch.fx compatibility.
- imgsz (tuple[int, int]): The input image size (height, width). Default is (640, 640).
- """
- super().__init__()
- copy_attr(self, model)
- # Explicitly set `model` since `copy_attr` somehow does not copy it.
- self.model = model.model
- self.imgsz = imgsz
-
- def forward(self, x):
- """
- Forward pass through the model.
-
- This method performs the forward pass through the model, handling the dependencies between layers and saving
- intermediate outputs.
-
- Args:
- x (torch.Tensor): The input tensor to the model.
-
- Returns:
- (torch.Tensor): The output tensor from the model.
- """
- y = [] # outputs
- for m in self.model:
- if m.f != -1: # if not from previous layer
- # from earlier layers
- x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]
- if isinstance(m, Detect):
- m._inference = types.MethodType(_inference, m) # bind method to Detect
- m.anchors, m.strides = (
- x.transpose(0, 1)
- for x in make_anchors(
- torch.cat([s / m.stride.unsqueeze(-1) for s in self.imgsz], dim=1), m.stride, 0.5
- )
- )
- if type(m) is Pose:
-                m.forward = types.MethodType(pose_forward, m)  # bind method to Pose
- x = m(x) # run
- y.append(x) # save output
- return x
-
-
-def _inference(self, x: list[torch.Tensor]) -> tuple[torch.Tensor]:
- """Decode boxes and cls scores for imx object detection."""
- x_cat = torch.cat([xi.view(x[0].shape[0], self.no, -1) for xi in x], 2)
- box, cls = x_cat.split((self.reg_max * 4, self.nc), 1)
- dbox = self.decode_bboxes(self.dfl(box), self.anchors.unsqueeze(0)) * self.strides
- return dbox.transpose(1, 2), cls.sigmoid().permute(0, 2, 1)
-
-
-def pose_forward(self, x: list[torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
- """Forward pass for imx pose estimation, including keypoint decoding."""
- bs = x[0].shape[0] # batch size
- kpt = torch.cat([self.cv4[i](x[i]).view(bs, self.nk, -1) for i in range(self.nl)], -1) # (bs, 17*3, h*w)
- x = Detect.forward(self, x)
- pred_kpt = self.kpts_decode(bs, kpt)
- return (*x, pred_kpt.permute(0, 2, 1))
-
-
-class NMSWrapper(torch.nn.Module):
- """Wrap PyTorch Module with multiclass_nms layer from sony_custom_layers."""
-
- def __init__(
- self,
- model: torch.nn.Module,
- score_threshold: float = 0.001,
- iou_threshold: float = 0.7,
- max_detections: int = 300,
- task: str = "detect",
- ):
- """
- Initialize NMSWrapper with PyTorch Module and NMS parameters.
-
- Args:
- model (torch.nn.Module): Model instance.
- score_threshold (float): Score threshold for non-maximum suppression.
- iou_threshold (float): Intersection over union threshold for non-maximum suppression.
- max_detections (int): The number of detections to return.
- task (str): Task type, either 'detect' or 'pose'.
- """
- super().__init__()
- self.model = model
- self.score_threshold = score_threshold
- self.iou_threshold = iou_threshold
- self.max_detections = max_detections
- self.task = task
-
- def forward(self, images):
- """Forward pass with model inference and NMS post-processing."""
- from sony_custom_layers.pytorch import multiclass_nms_with_indices
-
- # model inference
- outputs = self.model(images)
- boxes, scores = outputs[0], outputs[1]
- nms_outputs = multiclass_nms_with_indices(
- boxes=boxes,
- scores=scores,
- score_threshold=self.score_threshold,
- iou_threshold=self.iou_threshold,
- max_detections=self.max_detections,
- )
- if self.task == "pose":
- kpts = outputs[2] # (bs, max_detections, kpts 17*3)
- out_kpts = torch.gather(kpts, 1, nms_outputs.indices.unsqueeze(-1).expand(-1, -1, kpts.size(-1)))
- return nms_outputs.boxes, nms_outputs.scores, nms_outputs.labels, out_kpts
- return nms_outputs.boxes, nms_outputs.scores, nms_outputs.labels, nms_outputs.n_valid
-
-
-def torch2imx(
- model: torch.nn.Module,
- file: Path | str,
- conf: float,
- iou: float,
- max_det: int,
- metadata: dict | None = None,
- gptq: bool = False,
- dataset=None,
- prefix: str = "",
-):
- """
- Export YOLO model to IMX format for deployment on Sony IMX500 devices.
-
- This function quantizes a YOLO model using Model Compression Toolkit (MCT) and exports it
- to IMX format compatible with Sony IMX500 edge devices. It supports both YOLOv8n and YOLO11n
- models for detection and pose estimation tasks.
-
- Args:
- model (torch.nn.Module): The YOLO model to export. Must be YOLOv8n or YOLO11n.
- file (Path | str): Output file path for the exported model.
- conf (float): Confidence threshold for NMS post-processing.
- iou (float): IoU threshold for NMS post-processing.
- max_det (int): Maximum number of detections to return.
- metadata (dict | None, optional): Metadata to embed in the ONNX model. Defaults to None.
- gptq (bool, optional): Whether to use Gradient-Based Post Training Quantization.
- If False, uses standard Post Training Quantization. Defaults to False.
- dataset (optional): Representative dataset for quantization calibration. Defaults to None.
- prefix (str, optional): Logging prefix string. Defaults to "".
-
- Returns:
- f (Path): Path to the exported IMX model directory
-
- Raises:
- ValueError: If the model is not a supported YOLOv8n or YOLO11n variant.
-
-    Examples:
-        >>> from pathlib import Path
-        >>> from ultralytics import YOLO
-        >>> model = YOLO("yolo11n.pt")
-        >>> path = torch2imx(model.model, Path("model.imx"), conf=0.25, iou=0.45, max_det=300)
-
-    Notes:
- - Requires model_compression_toolkit, onnx, edgemdt_tpc, and sony_custom_layers packages
- - Only supports YOLOv8n and YOLO11n models (detection and pose tasks)
- - Output includes quantized ONNX model, IMX binary, and labels.txt file
- """
- import model_compression_toolkit as mct
- import onnx
- from edgemdt_tpc import get_target_platform_capabilities
-
- LOGGER.info(f"\n{prefix} starting export with model_compression_toolkit {mct.__version__}...")
-
- def representative_dataset_gen(dataloader=dataset):
- for batch in dataloader:
- img = batch["img"]
- img = img / 255.0
- yield [img]
-
- tpc = get_target_platform_capabilities(tpc_version="4.0", device_type="imx500")
-
- bit_cfg = mct.core.BitWidthConfig()
- if "C2PSA" in model.__str__(): # YOLO11
- if model.task == "detect":
- layer_names = ["sub", "mul_2", "add_14", "cat_21"]
- weights_memory = 2585350.2439
- n_layers = 238 # 238 layers for fused YOLO11n
- elif model.task == "pose":
- layer_names = ["sub", "mul_2", "add_14", "cat_22", "cat_23", "mul_4", "add_15"]
- weights_memory = 2437771.67
- n_layers = 257 # 257 layers for fused YOLO11n-pose
- else: # YOLOv8
- if model.task == "detect":
- layer_names = ["sub", "mul", "add_6", "cat_17"]
- weights_memory = 2550540.8
- n_layers = 168 # 168 layers for fused YOLOv8n
- elif model.task == "pose":
- layer_names = ["add_7", "mul_2", "cat_19", "mul", "sub", "add_6", "cat_18"]
- weights_memory = 2482451.85
-            n_layers = 187  # 187 layers for fused YOLOv8n-pose
-
- # Check if the model has the expected number of layers
- if len(list(model.modules())) != n_layers:
- raise ValueError("IMX export only supported for YOLOv8n and YOLO11n models.")
-
- for layer_name in layer_names:
- bit_cfg.set_manual_activation_bit_width([mct.core.common.network_editors.NodeNameFilter(layer_name)], 16)
-
- config = mct.core.CoreConfig(
- mixed_precision_config=mct.core.MixedPrecisionQuantizationConfig(num_of_images=10),
- quantization_config=mct.core.QuantizationConfig(concat_threshold_update=True),
- bit_width_config=bit_cfg,
- )
-
- resource_utilization = mct.core.ResourceUtilization(weights_memory=weights_memory)
-
- quant_model = (
- mct.gptq.pytorch_gradient_post_training_quantization( # Perform Gradient-Based Post Training Quantization
- model=model,
- representative_data_gen=representative_dataset_gen,
- target_resource_utilization=resource_utilization,
- gptq_config=mct.gptq.get_pytorch_gptq_config(
- n_epochs=1000, use_hessian_based_weights=False, use_hessian_sample_attention=False
- ),
- core_config=config,
- target_platform_capabilities=tpc,
- )[0]
- if gptq
- else mct.ptq.pytorch_post_training_quantization( # Perform post training quantization
- in_module=model,
- representative_data_gen=representative_dataset_gen,
- target_resource_utilization=resource_utilization,
- core_config=config,
- target_platform_capabilities=tpc,
- )[0]
- )
-
- quant_model = NMSWrapper(
- model=quant_model,
- score_threshold=conf or 0.001,
- iou_threshold=iou,
- max_detections=max_det,
- task=model.task,
- )
-
- f = Path(str(file).replace(file.suffix, "_imx_model"))
- f.mkdir(exist_ok=True)
-    onnx_model = f / Path(str(file.name).replace(file.suffix, "_imx.onnx"))  # quantized ONNX path inside the IMX output directory
- mct.exporter.pytorch_export_model(
- model=quant_model, save_model_path=onnx_model, repr_dataset=representative_dataset_gen
- )
-
- model_onnx = onnx.load(onnx_model) # load onnx model
- for k, v in metadata.items():
- meta = model_onnx.metadata_props.add()
- meta.key, meta.value = k, str(v)
-
- onnx.save(model_onnx, onnx_model)
-
- subprocess.run(
- ["imxconv-pt", "-i", str(onnx_model), "-o", str(f), "--no-input-persistency", "--overwrite-output"],
- check=True,
- )
-
- # Needed for imx models.
- with open(f / "labels.txt", "w", encoding="utf-8") as file:
- file.writelines([f"{name}\n" for _, name in model.names.items()])
-
- return f
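
To make the keypoint handling in `NMSWrapper.forward` above concrete: for the pose task, NMS returns per-image indices into the original predictions, and the matching keypoints are selected with `torch.gather` so their order matches the surviving boxes. The shapes below are illustrative.

# Sketch of the keypoint gather used by NMSWrapper for the pose task (shapes illustrative).
import torch

bs, n_preds, max_det, kpt_dim = 2, 100, 10, 51  # 51 = 17 keypoints * 3 (x, y, visibility)
kpts = torch.randn(bs, n_preds, kpt_dim)  # keypoints for every prediction
indices = torch.randint(0, n_preds, (bs, max_det))  # indices kept by NMS, per image
out_kpts = torch.gather(kpts, 1, indices.unsqueeze(-1).expand(-1, -1, kpt_dim))
print(out_kpts.shape)  # torch.Size([2, 10, 51])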
diff --git a/ultralytics/utils/files.py b/ultralytics/utils/files.py
deleted file mode 100644
index e7bce39..0000000
--- a/ultralytics/utils/files.py
+++ /dev/null
@@ -1,223 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from __future__ import annotations
-
-import contextlib
-import glob
-import os
-import shutil
-import tempfile
-from contextlib import contextmanager
-from datetime import datetime
-from pathlib import Path
-
-
-class WorkingDirectory(contextlib.ContextDecorator):
- """
- A context manager and decorator for temporarily changing the working directory.
-
- This class allows for the temporary change of the working directory using a context manager or decorator.
- It ensures that the original working directory is restored after the context or decorated function completes.
-
- Attributes:
- dir (Path | str): The new directory to switch to.
- cwd (Path): The original current working directory before the switch.
-
- Methods:
- __enter__: Changes the current directory to the specified directory.
- __exit__: Restores the original working directory on context exit.
-
- Examples:
- Using as a context manager:
- >>> with WorkingDirectory('/path/to/new/dir'):
- >>> # Perform operations in the new directory
- >>> pass
-
- Using as a decorator:
- >>> @WorkingDirectory('/path/to/new/dir')
- >>> def some_function():
- >>> # Perform operations in the new directory
- >>> pass
- """
-
- def __init__(self, new_dir: str | Path):
- """Initialize the WorkingDirectory context manager with the target directory."""
- self.dir = new_dir # new dir
- self.cwd = Path.cwd().resolve() # current dir
-
- def __enter__(self):
- """Change the current working directory to the specified directory upon entering the context."""
- os.chdir(self.dir)
-
- def __exit__(self, exc_type, exc_val, exc_tb): # noqa
- """Restore the original working directory when exiting the context."""
- os.chdir(self.cwd)
-
-
-@contextmanager
-def spaces_in_path(path: str | Path):
- """
- Context manager to handle paths with spaces in their names.
-
- If a path contains spaces, it replaces them with underscores, copies the file/directory to the new path, executes
- the context code block, then copies the file/directory back to its original location.
-
- Args:
- path (str | Path): The original path that may contain spaces.
-
- Yields:
- (Path | str): Temporary path with spaces replaced by underscores if spaces were present, otherwise the
- original path.
-
- Examples:
- >>> with spaces_in_path('/path/with spaces') as new_path:
- >>> # Your code here
- >>> pass
- """
- # If path has spaces, replace them with underscores
- if " " in str(path):
- string = isinstance(path, str) # input type
- path = Path(path)
-
- # Create a temporary directory and construct the new path
- with tempfile.TemporaryDirectory() as tmp_dir:
- tmp_path = Path(tmp_dir) / path.name.replace(" ", "_")
-
- # Copy file/directory
- if path.is_dir():
- shutil.copytree(path, tmp_path)
- elif path.is_file():
- tmp_path.parent.mkdir(parents=True, exist_ok=True)
- shutil.copy2(path, tmp_path)
-
- try:
- # Yield the temporary path
- yield str(tmp_path) if string else tmp_path
-
- finally:
- # Copy file/directory back
- if tmp_path.is_dir():
- shutil.copytree(tmp_path, path, dirs_exist_ok=True)
- elif tmp_path.is_file():
- shutil.copy2(tmp_path, path) # Copy back the file
-
- else:
- # If there are no spaces, just yield the original path
- yield path
-
-
-def increment_path(path: str | Path, exist_ok: bool = False, sep: str = "", mkdir: bool = False) -> Path:
- """
- Increment a file or directory path, i.e., runs/exp --> runs/exp{sep}2, runs/exp{sep}3, ... etc.
-
- If the path exists and `exist_ok` is not True, the path will be incremented by appending a number and `sep` to
- the end of the path. If the path is a file, the file extension will be preserved. If the path is a directory, the
- number will be appended directly to the end of the path.
-
- Args:
- path (str | Path): Path to increment.
- exist_ok (bool, optional): If True, the path will not be incremented and returned as-is.
- sep (str, optional): Separator to use between the path and the incrementation number.
- mkdir (bool, optional): Create a directory if it does not exist.
-
- Returns:
- (Path): Incremented path.
-
- Examples:
- Increment a directory path:
- >>> from pathlib import Path
- >>> path = Path("runs/exp")
- >>> new_path = increment_path(path)
- >>> print(new_path)
- runs/exp2
-
- Increment a file path:
- >>> path = Path("runs/exp/results.txt")
- >>> new_path = increment_path(path)
- >>> print(new_path)
- runs/exp/results2.txt
- """
- path = Path(path) # os-agnostic
- if path.exists() and not exist_ok:
- path, suffix = (path.with_suffix(""), path.suffix) if path.is_file() else (path, "")
-
-        # Append an incrementing suffix until an unused path is found
- for n in range(2, 9999):
- p = f"{path}{sep}{n}{suffix}" # increment path
- if not os.path.exists(p):
- break
- path = Path(p)
-
- if mkdir:
- path.mkdir(parents=True, exist_ok=True) # make directory
-
- return path
-
-
-def file_age(path: str | Path = __file__) -> int:
- """Return days since the last modification of the specified file."""
- dt = datetime.now() - datetime.fromtimestamp(Path(path).stat().st_mtime) # delta
- return dt.days # + dt.seconds / 86400 # fractional days
-
-
-def file_date(path: str | Path = __file__) -> str:
- """Return the file modification date in 'YYYY-M-D' format."""
- t = datetime.fromtimestamp(Path(path).stat().st_mtime)
- return f"{t.year}-{t.month}-{t.day}"
-
-
-def file_size(path: str | Path) -> float:
- """Return the size of a file or directory in megabytes (MB)."""
- if isinstance(path, (str, Path)):
- mb = 1 << 20 # bytes to MiB (1024 ** 2)
- path = Path(path)
- if path.is_file():
- return path.stat().st_size / mb
- elif path.is_dir():
- return sum(f.stat().st_size for f in path.glob("**/*") if f.is_file()) / mb
- return 0.0
-
-
-def get_latest_run(search_dir: str = ".") -> str:
- """Return the path to the most recent 'last.pt' file in the specified directory for resuming training."""
- last_list = glob.glob(f"{search_dir}/**/last*.pt", recursive=True)
- return max(last_list, key=os.path.getctime) if last_list else ""
-
-
-def update_models(model_names: tuple = ("yolo11n.pt",), source_dir: Path = Path("."), update_names: bool = False):
- """
- Update and re-save specified YOLO models in an 'updated_models' subdirectory.
-
- Args:
- model_names (tuple, optional): Model filenames to update.
- source_dir (Path, optional): Directory containing models and target subdirectory.
- update_names (bool, optional): Update model names from a data YAML.
-
- Examples:
- Update specified YOLO models and save them in 'updated_models' subdirectory:
- >>> from ultralytics.utils.files import update_models
- >>> model_names = ("yolo11n.pt", "yolov8s.pt")
- >>> update_models(model_names, source_dir=Path("/models"), update_names=True)
- """
- from ultralytics import YOLO
- from ultralytics.nn.autobackend import default_class_names
-
- target_dir = source_dir / "updated_models"
- target_dir.mkdir(parents=True, exist_ok=True) # Ensure target directory exists
-
- for model_name in model_names:
- model_path = source_dir / model_name
- print(f"Loading model from {model_path}")
-
- # Load model
- model = YOLO(model_path)
- model.half()
- if update_names: # update model names from a dataset YAML
- model.model.names = default_class_names("coco8.yaml")
-
- # Define new save path
- save_path = target_dir / model_name
-
- # Save model using model.save()
- print(f"Re-saving {model_name} model to {save_path}")
- model.save(save_path)
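
A short sketch of the removed path utilities used together; the paths are illustrative.

# Sketch: the removed path helpers in combination (paths illustrative).
from pathlib import Path

from ultralytics.utils.files import WorkingDirectory, increment_path, spaces_in_path

save_dir = increment_path(Path("runs/exp"), mkdir=True)  # runs/exp, then runs/exp2, runs/exp3, ...

with WorkingDirectory(save_dir):  # temporarily chdir into the run directory
    Path("results.txt").write_text("done")

with spaces_in_path("datasets/my data") as safe_path:  # operate on a space-free temporary copy
    print(safe_path)  # e.g. /tmp/.../my_data; changes are copied back on exit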
diff --git a/ultralytics/utils/git.py b/ultralytics/utils/git.py
deleted file mode 100644
index 9cfc951..0000000
--- a/ultralytics/utils/git.py
+++ /dev/null
@@ -1,139 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from __future__ import annotations
-
-from functools import cached_property
-from pathlib import Path
-
-
-class GitRepo:
- """
- Represent a local Git repository and expose branch, commit, and remote metadata.
-
- This class discovers the repository root by searching for a .git entry from the given path upward, resolves the
- actual .git directory (including worktrees), and reads Git metadata directly from on-disk files. It does not
- invoke the git binary and therefore works in restricted environments. All metadata properties are resolved
- lazily and cached; construct a new instance to refresh state.
-
- Attributes:
- root (Path | None): Repository root directory containing the .git entry; None if not in a repository.
- gitdir (Path | None): Resolved .git directory path; handles worktrees; None if unresolved.
- head (str | None): Raw contents of HEAD; a SHA for detached HEAD or "ref: " for branch heads.
- is_repo (bool): Whether the provided path resides inside a Git repository.
- branch (str | None): Current branch name when HEAD points to a branch; None for detached HEAD or non-repo.
- commit (str | None): Current commit SHA for HEAD; None if not determinable.
- origin (str | None): URL of the "origin" remote as read from gitdir/config; None if unset or unavailable.
-
- Examples:
- Initialize from the current working directory and read metadata
- >>> from pathlib import Path
- >>> repo = GitRepo(Path.cwd())
- >>> repo.is_repo
- True
- >>> repo.branch, repo.commit[:7], repo.origin
- ('main', '1a2b3c4', 'https://example.com/owner/repo.git')
-
- Notes:
- - Resolves metadata by reading files: HEAD, packed-refs, and config; no subprocess calls are used.
- - Caches properties on first access using cached_property; recreate the object to reflect repository changes.
- """
-
- def __init__(self, path: Path = Path(__file__).resolve()):
- """
- Initialize a Git repository context by discovering the repository root from a starting path.
-
- Args:
- path (Path, optional): File or directory path used as the starting point to locate the repository root.
- """
- self.root = self._find_root(path)
- self.gitdir = self._gitdir(self.root) if self.root else None
-
- @staticmethod
- def _find_root(p: Path) -> Path | None:
- """Return repo root or None."""
- return next((d for d in [p] + list(p.parents) if (d / ".git").exists()), None)
-
- @staticmethod
- def _gitdir(root: Path) -> Path | None:
- """Resolve actual .git directory (handles worktrees)."""
- g = root / ".git"
- if g.is_dir():
- return g
- if g.is_file():
- t = g.read_text(errors="ignore").strip()
- if t.startswith("gitdir:"):
- return (root / t.split(":", 1)[1].strip()).resolve()
- return None
-
- def _read(self, p: Path | None) -> str | None:
- """Read and strip file if exists."""
- return p.read_text(errors="ignore").strip() if p and p.exists() else None
-
- @cached_property
- def head(self) -> str | None:
- """HEAD file contents."""
- return self._read(self.gitdir / "HEAD" if self.gitdir else None)
-
- def _ref_commit(self, ref: str) -> str | None:
- """Commit for ref (handles packed-refs)."""
- rf = self.gitdir / ref
- s = self._read(rf)
- if s:
- return s
- pf = self.gitdir / "packed-refs"
- b = pf.read_bytes().splitlines() if pf.exists() else []
- tgt = ref.encode()
- for line in b:
- if line[:1] in (b"#", b"^") or b" " not in line:
- continue
- sha, name = line.split(b" ", 1)
- if name.strip() == tgt:
- return sha.decode()
- return None
-
- @property
- def is_repo(self) -> bool:
- """True if inside a git repo."""
- return self.gitdir is not None
-
- @cached_property
- def branch(self) -> str | None:
- """Current branch or None."""
- if not self.is_repo or not self.head or not self.head.startswith("ref: "):
- return None
- ref = self.head[5:].strip()
- return ref[len("refs/heads/") :] if ref.startswith("refs/heads/") else ref
-
- @cached_property
- def commit(self) -> str | None:
- """Current commit SHA or None."""
- if not self.is_repo or not self.head:
- return None
- return self._ref_commit(self.head[5:].strip()) if self.head.startswith("ref: ") else self.head
-
- @cached_property
- def origin(self) -> str | None:
- """Origin URL or None."""
- if not self.is_repo:
- return None
- cfg = self.gitdir / "config"
- remote, url = None, None
- for s in (self._read(cfg) or "").splitlines():
- t = s.strip()
- if t.startswith("[") and t.endswith("]"):
- remote = t.lower()
- elif t.lower().startswith("url =") and remote == '[remote "origin"]':
- url = t.split("=", 1)[1].strip()
- break
- return url
-
-
-if __name__ == "__main__":
- import time
-
- g = GitRepo()
- if g.is_repo:
- t0 = time.perf_counter()
- print(f"repo={g.root}\nbranch={g.branch}\ncommit={g.commit}\norigin={g.origin}")
- dt = (time.perf_counter() - t0) * 1000
- print(f"\n⏱️ Profiling: total {dt:.3f} ms")
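
To make the packed-refs handling in `GitRepo._ref_commit` above concrete, here is a self-contained sketch with a fabricated packed-refs payload: loose refs live as files under .git/refs/..., while packed refs are "<sha> <refname>" lines in .git/packed-refs.

# Sketch of the packed-refs lookup in GitRepo._ref_commit ('#' marks comments, '^' marks peeled-tag lines).
packed = b"""# pack-refs with: peeled fully-peeled sorted
1a2b3c4d5e6f7a8b9c0d1e2f3a4b5c6d7e8f9a0b refs/heads/main
^deadbeefdeadbeefdeadbeefdeadbeefdeadbeef
"""
target = b"refs/heads/main"
for line in packed.splitlines():
    if line[:1] in (b"#", b"^") or b" " not in line:
        continue  # skip comments and peeled-tag entries
    sha, name = line.split(b" ", 1)
    if name.strip() == target:
        print(sha.decode())  # 1a2b3c4d5e6f7a8b9c0d1e2f3a4b5c6d7e8f9a0b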
diff --git a/ultralytics/utils/instance.py b/ultralytics/utils/instance.py
deleted file mode 100644
index bfc1d54..0000000
--- a/ultralytics/utils/instance.py
+++ /dev/null
@@ -1,505 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from __future__ import annotations
-
-from collections import abc
-from itertools import repeat
-from numbers import Number
-
-import numpy as np
-
-from .ops import ltwh2xywh, ltwh2xyxy, resample_segments, xywh2ltwh, xywh2xyxy, xyxy2ltwh, xyxy2xywh
-
-
-def _ntuple(n):
- """Create a function that converts input to n-tuple by repeating singleton values."""
-
- def parse(x):
- """Parse input to return n-tuple by repeating singleton values n times."""
- return x if isinstance(x, abc.Iterable) else tuple(repeat(x, n))
-
- return parse
-
-
-to_2tuple = _ntuple(2)
-to_4tuple = _ntuple(4)
-
-# `xyxy` means left top and right bottom
-# `xywh` means center x, center y and width, height (YOLO format)
-# `ltwh` means left top and width, height (COCO format)
-_formats = ["xyxy", "xywh", "ltwh"]
-
-__all__ = ("Bboxes", "Instances") # tuple or list
-
-
-class Bboxes:
- """
- A class for handling bounding boxes in multiple formats.
-
- The class supports various bounding box formats like 'xyxy', 'xywh', and 'ltwh' and provides methods for format
- conversion, scaling, and area calculation. Bounding box data should be provided as numpy arrays.
-
- Attributes:
- bboxes (np.ndarray): The bounding boxes stored in a 2D numpy array with shape (N, 4).
- format (str): The format of the bounding boxes ('xyxy', 'xywh', or 'ltwh').
-
- Methods:
- convert: Convert bounding box format from one type to another.
- areas: Calculate the area of bounding boxes.
- mul: Multiply bounding box coordinates by scale factor(s).
- add: Add offset to bounding box coordinates.
- concatenate: Concatenate multiple Bboxes objects.
-
- Examples:
- Create bounding boxes in YOLO format
- >>> bboxes = Bboxes(np.array([[100, 50, 150, 100]]), format="xywh")
- >>> bboxes.convert("xyxy")
- >>> print(bboxes.areas())
-
- Notes:
- This class does not handle normalization or denormalization of bounding boxes.
- """
-
- def __init__(self, bboxes: np.ndarray, format: str = "xyxy") -> None:
- """
- Initialize the Bboxes class with bounding box data in a specified format.
-
- Args:
- bboxes (np.ndarray): Array of bounding boxes with shape (N, 4) or (4,).
- format (str): Format of the bounding boxes, one of 'xyxy', 'xywh', or 'ltwh'.
- """
- assert format in _formats, f"Invalid bounding box format: {format}, format must be one of {_formats}"
- bboxes = bboxes[None, :] if bboxes.ndim == 1 else bboxes
- assert bboxes.ndim == 2
- assert bboxes.shape[1] == 4
- self.bboxes = bboxes
- self.format = format
-
- def convert(self, format: str) -> None:
- """
- Convert bounding box format from one type to another.
-
- Args:
- format (str): Target format for conversion, one of 'xyxy', 'xywh', or 'ltwh'.
- """
- assert format in _formats, f"Invalid bounding box format: {format}, format must be one of {_formats}"
- if self.format == format:
- return
- elif self.format == "xyxy":
- func = xyxy2xywh if format == "xywh" else xyxy2ltwh
- elif self.format == "xywh":
- func = xywh2xyxy if format == "xyxy" else xywh2ltwh
- else:
- func = ltwh2xyxy if format == "xyxy" else ltwh2xywh
- self.bboxes = func(self.bboxes)
- self.format = format
-
- def areas(self) -> np.ndarray:
- """Calculate the area of bounding boxes."""
- return (
- (self.bboxes[:, 2] - self.bboxes[:, 0]) * (self.bboxes[:, 3] - self.bboxes[:, 1]) # format xyxy
- if self.format == "xyxy"
- else self.bboxes[:, 3] * self.bboxes[:, 2] # format xywh or ltwh
- )
-
- def mul(self, scale: int | tuple | list) -> None:
- """
- Multiply bounding box coordinates by scale factor(s).
-
- Args:
- scale (int | tuple | list): Scale factor(s) for four coordinates. If int, the same scale is applied to
- all coordinates.
- """
- if isinstance(scale, Number):
- scale = to_4tuple(scale)
- assert isinstance(scale, (tuple, list))
- assert len(scale) == 4
- self.bboxes[:, 0] *= scale[0]
- self.bboxes[:, 1] *= scale[1]
- self.bboxes[:, 2] *= scale[2]
- self.bboxes[:, 3] *= scale[3]
-
- def add(self, offset: int | tuple | list) -> None:
- """
- Add offset to bounding box coordinates.
-
- Args:
- offset (int | tuple | list): Offset(s) for four coordinates. If int, the same offset is applied to
- all coordinates.
- """
- if isinstance(offset, Number):
- offset = to_4tuple(offset)
- assert isinstance(offset, (tuple, list))
- assert len(offset) == 4
- self.bboxes[:, 0] += offset[0]
- self.bboxes[:, 1] += offset[1]
- self.bboxes[:, 2] += offset[2]
- self.bboxes[:, 3] += offset[3]
-
- def __len__(self) -> int:
- """Return the number of bounding boxes."""
- return len(self.bboxes)
-
- @classmethod
- def concatenate(cls, boxes_list: list[Bboxes], axis: int = 0) -> Bboxes:
- """
- Concatenate a list of Bboxes objects into a single Bboxes object.
-
- Args:
- boxes_list (list[Bboxes]): A list of Bboxes objects to concatenate.
- axis (int, optional): The axis along which to concatenate the bounding boxes.
-
- Returns:
- (Bboxes): A new Bboxes object containing the concatenated bounding boxes.
-
- Notes:
- The input should be a list or tuple of Bboxes objects.
- """
- assert isinstance(boxes_list, (list, tuple))
- if not boxes_list:
- return cls(np.empty(0))
- assert all(isinstance(box, Bboxes) for box in boxes_list)
-
- if len(boxes_list) == 1:
- return boxes_list[0]
- return cls(np.concatenate([b.bboxes for b in boxes_list], axis=axis))
-
- def __getitem__(self, index: int | np.ndarray | slice) -> Bboxes:
- """
- Retrieve a specific bounding box or a set of bounding boxes using indexing.
-
- Args:
- index (int | slice | np.ndarray): The index, slice, or boolean array to select the desired bounding boxes.
-
- Returns:
- (Bboxes): A new Bboxes object containing the selected bounding boxes.
-
- Notes:
- When using boolean indexing, make sure to provide a boolean array with the same length as the number of
- bounding boxes.
- """
- if isinstance(index, int):
- return Bboxes(self.bboxes[index].reshape(1, -1))
- b = self.bboxes[index]
- assert b.ndim == 2, f"Indexing on Bboxes with {index} failed to return a matrix!"
- return Bboxes(b)
-
-
-class Instances:
- """
- Container for bounding boxes, segments, and keypoints of detected objects in an image.
-
- This class provides a unified interface for handling different types of object annotations including bounding
- boxes, segmentation masks, and keypoints. It supports various operations like scaling, normalization, clipping,
- and format conversion.
-
- Attributes:
- _bboxes (Bboxes): Internal object for handling bounding box operations.
- keypoints (np.ndarray): Keypoints with shape (N, 17, 3) in format (x, y, visible).
- normalized (bool): Flag indicating whether the bounding box coordinates are normalized.
- segments (np.ndarray): Segments array with shape (N, M, 2) after resampling.
-
- Methods:
- convert_bbox: Convert bounding box format.
- scale: Scale coordinates by given factors.
- denormalize: Convert normalized coordinates to absolute coordinates.
- normalize: Convert absolute coordinates to normalized coordinates.
- add_padding: Add padding to coordinates.
- flipud: Flip coordinates vertically.
- fliplr: Flip coordinates horizontally.
- clip: Clip coordinates to stay within image boundaries.
- remove_zero_area_boxes: Remove boxes with zero area.
- update: Update instance variables.
- concatenate: Concatenate multiple Instances objects.
-
- Examples:
- Create instances with bounding boxes and segments
- >>> instances = Instances(
- ... bboxes=np.array([[10, 10, 30, 30], [20, 20, 40, 40]]),
- ... segments=[np.array([[5, 5], [10, 10]]), np.array([[15, 15], [20, 20]])],
- ... keypoints=np.array([[[5, 5, 1], [10, 10, 1]], [[15, 15, 1], [20, 20, 1]]]),
- ... )
- """
-
- def __init__(
- self,
- bboxes: np.ndarray,
- segments: np.ndarray = None,
- keypoints: np.ndarray = None,
- bbox_format: str = "xywh",
- normalized: bool = True,
- ) -> None:
- """
- Initialize the Instances object with bounding boxes, segments, and keypoints.
-
- Args:
- bboxes (np.ndarray): Bounding boxes with shape (N, 4).
- segments (np.ndarray, optional): Segmentation masks.
- keypoints (np.ndarray, optional): Keypoints with shape (N, 17, 3) in format (x, y, visible).
- bbox_format (str): Format of bboxes.
- normalized (bool): Whether the coordinates are normalized.
- """
- self._bboxes = Bboxes(bboxes=bboxes, format=bbox_format)
- self.keypoints = keypoints
- self.normalized = normalized
- self.segments = segments
-
- def convert_bbox(self, format: str) -> None:
- """
- Convert bounding box format.
-
- Args:
- format (str): Target format for conversion, one of 'xyxy', 'xywh', or 'ltwh'.
- """
- self._bboxes.convert(format=format)
-
- @property
- def bbox_areas(self) -> np.ndarray:
- """Calculate the area of bounding boxes."""
- return self._bboxes.areas()
-
- def scale(self, scale_w: float, scale_h: float, bbox_only: bool = False):
- """
- Scale coordinates by given factors.
-
- Args:
- scale_w (float): Scale factor for width.
- scale_h (float): Scale factor for height.
- bbox_only (bool, optional): Whether to scale only bounding boxes.
- """
- self._bboxes.mul(scale=(scale_w, scale_h, scale_w, scale_h))
- if bbox_only:
- return
- self.segments[..., 0] *= scale_w
- self.segments[..., 1] *= scale_h
- if self.keypoints is not None:
- self.keypoints[..., 0] *= scale_w
- self.keypoints[..., 1] *= scale_h
-
- def denormalize(self, w: int, h: int) -> None:
- """
- Convert normalized coordinates to absolute coordinates.
-
- Args:
- w (int): Image width.
- h (int): Image height.
- """
- if not self.normalized:
- return
- self._bboxes.mul(scale=(w, h, w, h))
- self.segments[..., 0] *= w
- self.segments[..., 1] *= h
- if self.keypoints is not None:
- self.keypoints[..., 0] *= w
- self.keypoints[..., 1] *= h
- self.normalized = False
-
- def normalize(self, w: int, h: int) -> None:
- """
- Convert absolute coordinates to normalized coordinates.
-
- Args:
- w (int): Image width.
- h (int): Image height.
- """
- if self.normalized:
- return
- self._bboxes.mul(scale=(1 / w, 1 / h, 1 / w, 1 / h))
- self.segments[..., 0] /= w
- self.segments[..., 1] /= h
- if self.keypoints is not None:
- self.keypoints[..., 0] /= w
- self.keypoints[..., 1] /= h
- self.normalized = True
-
- def add_padding(self, padw: int, padh: int) -> None:
- """
- Add padding to coordinates.
-
- Args:
- padw (int): Padding width.
- padh (int): Padding height.
- """
- assert not self.normalized, "you should add padding with absolute coordinates."
- self._bboxes.add(offset=(padw, padh, padw, padh))
- self.segments[..., 0] += padw
- self.segments[..., 1] += padh
- if self.keypoints is not None:
- self.keypoints[..., 0] += padw
- self.keypoints[..., 1] += padh
-
- def __getitem__(self, index: int | np.ndarray | slice) -> Instances:
- """
- Retrieve a specific instance or a set of instances using indexing.
-
- Args:
- index (int | slice | np.ndarray): The index, slice, or boolean array to select the desired instances.
-
- Returns:
- (Instances): A new Instances object containing the selected boxes, segments, and keypoints if present.
-
- Notes:
- When using boolean indexing, make sure to provide a boolean array with the same length as the number of
- instances.
- """
- segments = self.segments[index] if len(self.segments) else self.segments
- keypoints = self.keypoints[index] if self.keypoints is not None else None
- bboxes = self.bboxes[index]
- bbox_format = self._bboxes.format
- return Instances(
- bboxes=bboxes,
- segments=segments,
- keypoints=keypoints,
- bbox_format=bbox_format,
- normalized=self.normalized,
- )
-
- def flipud(self, h: int) -> None:
- """
- Flip coordinates vertically.
-
- Args:
- h (int): Image height.
- """
- if self._bboxes.format == "xyxy":
- y1 = self.bboxes[:, 1].copy()
- y2 = self.bboxes[:, 3].copy()
- self.bboxes[:, 1] = h - y2
- self.bboxes[:, 3] = h - y1
- else:
- self.bboxes[:, 1] = h - self.bboxes[:, 1]
- self.segments[..., 1] = h - self.segments[..., 1]
- if self.keypoints is not None:
- self.keypoints[..., 1] = h - self.keypoints[..., 1]
-
- def fliplr(self, w: int) -> None:
- """
- Flip coordinates horizontally.
-
- Args:
- w (int): Image width.
- """
- if self._bboxes.format == "xyxy":
- x1 = self.bboxes[:, 0].copy()
- x2 = self.bboxes[:, 2].copy()
- self.bboxes[:, 0] = w - x2
- self.bboxes[:, 2] = w - x1
- else:
- self.bboxes[:, 0] = w - self.bboxes[:, 0]
- self.segments[..., 0] = w - self.segments[..., 0]
- if self.keypoints is not None:
- self.keypoints[..., 0] = w - self.keypoints[..., 0]
-
- def clip(self, w: int, h: int) -> None:
- """
- Clip coordinates to stay within image boundaries.
-
- Args:
- w (int): Image width.
- h (int): Image height.
- """
- ori_format = self._bboxes.format
- self.convert_bbox(format="xyxy")
- self.bboxes[:, [0, 2]] = self.bboxes[:, [0, 2]].clip(0, w)
- self.bboxes[:, [1, 3]] = self.bboxes[:, [1, 3]].clip(0, h)
- if ori_format != "xyxy":
- self.convert_bbox(format=ori_format)
- self.segments[..., 0] = self.segments[..., 0].clip(0, w)
- self.segments[..., 1] = self.segments[..., 1].clip(0, h)
- if self.keypoints is not None:
- # Set out of bounds visibility to zero
- self.keypoints[..., 2][
- (self.keypoints[..., 0] < 0)
- | (self.keypoints[..., 0] > w)
- | (self.keypoints[..., 1] < 0)
- | (self.keypoints[..., 1] > h)
- ] = 0.0
- self.keypoints[..., 0] = self.keypoints[..., 0].clip(0, w)
- self.keypoints[..., 1] = self.keypoints[..., 1].clip(0, h)
-
- def remove_zero_area_boxes(self) -> np.ndarray:
- """
-        Remove zero-area boxes, i.e. boxes that may have zero width or height after clipping.
-
- Returns:
- (np.ndarray): Boolean array indicating which boxes were kept.
- """
- good = self.bbox_areas > 0
- if not all(good):
- self._bboxes = self._bboxes[good]
- if len(self.segments):
- self.segments = self.segments[good]
- if self.keypoints is not None:
- self.keypoints = self.keypoints[good]
- return good
-
-    def update(self, bboxes: np.ndarray, segments: np.ndarray | None = None, keypoints: np.ndarray | None = None) -> None:
- """
- Update instance variables.
-
- Args:
- bboxes (np.ndarray): New bounding boxes.
- segments (np.ndarray, optional): New segments.
- keypoints (np.ndarray, optional): New keypoints.
- """
- self._bboxes = Bboxes(bboxes, format=self._bboxes.format)
- if segments is not None:
- self.segments = segments
- if keypoints is not None:
- self.keypoints = keypoints
-
- def __len__(self) -> int:
- """Return the number of instances."""
- return len(self.bboxes)
-
- @classmethod
- def concatenate(cls, instances_list: list[Instances], axis=0) -> Instances:
- """
- Concatenate a list of Instances objects into a single Instances object.
-
- Args:
- instances_list (list[Instances]): A list of Instances objects to concatenate.
- axis (int, optional): The axis along which the arrays will be concatenated.
-
- Returns:
- (Instances): A new Instances object containing the concatenated bounding boxes, segments, and keypoints
- if present.
-
- Notes:
- The `Instances` objects in the list should have the same properties, such as the format of the bounding
- boxes, whether keypoints are present, and if the coordinates are normalized.
- """
- assert isinstance(instances_list, (list, tuple))
- if not instances_list:
- return cls(np.empty(0))
- assert all(isinstance(instance, Instances) for instance in instances_list)
-
- if len(instances_list) == 1:
- return instances_list[0]
-
- use_keypoint = instances_list[0].keypoints is not None
- bbox_format = instances_list[0]._bboxes.format
- normalized = instances_list[0].normalized
-
- cat_boxes = np.concatenate([ins.bboxes for ins in instances_list], axis=axis)
- seg_len = [b.segments.shape[1] for b in instances_list]
-        if len(frozenset(seg_len)) > 1:  # resample segments if their lengths differ
- max_len = max(seg_len)
- cat_segments = np.concatenate(
- [
- resample_segments(list(b.segments), max_len)
- if len(b.segments)
- else np.zeros((0, max_len, 2), dtype=np.float32) # re-generating empty segments
- for b in instances_list
- ],
- axis=axis,
- )
- else:
- cat_segments = np.concatenate([b.segments for b in instances_list], axis=axis)
- cat_keypoints = np.concatenate([b.keypoints for b in instances_list], axis=axis) if use_keypoint else None
- return cls(cat_boxes, cat_segments, cat_keypoints, bbox_format, normalized)
-
- @property
- def bboxes(self) -> np.ndarray:
- """Return bounding boxes."""
- return self._bboxes.bboxes
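
As a quick illustration of the removed `Instances` API, here is a minimal usage sketch. It assumes the constructor accepts the same keyword arguments used in `__getitem__` above and that `segments`/`keypoints` may be omitted; the box values are made up for the example.

```python
import numpy as np

# Hypothetical usage of the removed Instances container (values are illustrative).
boxes = np.array([[0.5, 0.5, 0.2, 0.4]], dtype=np.float32)  # one normalized xywh box
inst = Instances(bboxes=boxes, bbox_format="xywh", normalized=True)

inst.convert_bbox(format="xyxy")  # switch to corner format
inst.denormalize(w=640, h=480)    # scale to absolute pixel coordinates
inst.fliplr(w=640)                # horizontal-flip augmentation
inst.clip(w=640, h=480)           # keep coordinates inside the image
kept = inst.remove_zero_area_boxes()
print(len(inst), inst.bboxes, kept)
```
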
diff --git a/ultralytics/utils/logger.py b/ultralytics/utils/logger.py
deleted file mode 100644
index 6494ec5..0000000
--- a/ultralytics/utils/logger.py
+++ /dev/null
@@ -1,408 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-import logging
-import queue
-import shutil
-import sys
-import threading
-import time
-from datetime import datetime
-from pathlib import Path
-
-from ultralytics.utils import MACOS, RANK
-from ultralytics.utils.checks import check_requirements
-
-# Initialize default log file
-DEFAULT_LOG_PATH = Path("train.log")
-if RANK in {-1, 0} and DEFAULT_LOG_PATH.exists():
- DEFAULT_LOG_PATH.unlink(missing_ok=True)
-
-
-class ConsoleLogger:
- """
- Console output capture with API/file streaming and deduplication.
-
- Captures stdout/stderr output and streams it to either an API endpoint or local file, with intelligent
- deduplication to reduce noise from repetitive console output.
-
- Attributes:
- destination (str | Path): Target destination for streaming (URL or Path object).
- is_api (bool): Whether destination is an API endpoint (True) or local file (False).
- original_stdout: Reference to original sys.stdout for restoration.
- original_stderr: Reference to original sys.stderr for restoration.
- log_queue (queue.Queue): Thread-safe queue for buffering log messages.
- active (bool): Whether console capture is currently active.
- worker_thread (threading.Thread): Background thread for processing log queue.
- last_line (str): Last processed line for deduplication.
- last_time (float): Timestamp of last processed line.
- last_progress_line (str): Last progress bar line for progress deduplication.
- last_was_progress (bool): Whether the last line was a progress bar.
-
- Examples:
- Basic file logging:
- >>> logger = ConsoleLogger("training.log")
- >>> logger.start_capture()
- >>> print("This will be logged")
- >>> logger.stop_capture()
-
- API streaming:
- >>> logger = ConsoleLogger("https://api.example.com/logs")
- >>> logger.start_capture()
- >>> # All output streams to API
- >>> logger.stop_capture()
- """
-
- def __init__(self, destination):
- """
- Initialize with API endpoint or local file path.
-
- Args:
- destination (str | Path): API endpoint URL (http/https) or local file path for streaming output.
- """
- self.destination = destination
- self.is_api = isinstance(destination, str) and destination.startswith(("http://", "https://"))
- if not self.is_api:
- self.destination = Path(destination)
-
- # Console capture
- self.original_stdout = sys.stdout
- self.original_stderr = sys.stderr
- self.log_queue = queue.Queue(maxsize=1000)
- self.active = False
- self.worker_thread = None
-
- # State tracking
- self.last_line = ""
- self.last_time = 0.0
- self.last_progress_line = "" # Track last progress line for deduplication
- self.last_was_progress = False # Track if last line was a progress bar
-
- def start_capture(self):
- """Start capturing console output and redirect stdout/stderr to custom capture objects."""
- if self.active:
- return
-
- self.active = True
- sys.stdout = self._ConsoleCapture(self.original_stdout, self._queue_log)
- sys.stderr = self._ConsoleCapture(self.original_stderr, self._queue_log)
-
- # Hook Ultralytics logger
- try:
- handler = self._LogHandler(self._queue_log)
- logging.getLogger("ultralytics").addHandler(handler)
- except Exception:
- pass
-
- self.worker_thread = threading.Thread(target=self._stream_worker, daemon=True)
- self.worker_thread.start()
-
- def stop_capture(self):
- """Stop capturing console output and restore original stdout/stderr."""
- if not self.active:
- return
-
- self.active = False
- sys.stdout = self.original_stdout
- sys.stderr = self.original_stderr
- self.log_queue.put(None)
-
- def _queue_log(self, text):
- """Queue console text with deduplication and timestamp processing."""
- if not self.active:
- return
-
- current_time = time.time()
-
- # Handle carriage returns and process lines
- if "\r" in text:
- text = text.split("\r")[-1]
-
- lines = text.split("\n")
- if lines and lines[-1] == "":
- lines.pop()
-
- for line in lines:
- line = line.rstrip()
-
-            # Skip partially drawn progress bars (lines still containing thin bar characters)
-            if "─" in line:
- continue
-
- # Deduplicate completed progress bars only if they match the previous progress line
- if " ━━" in line:
- progress_core = line.split(" ━━")[0].strip()
- if progress_core == self.last_progress_line and self.last_was_progress:
- continue
- self.last_progress_line = progress_core
- self.last_was_progress = True
- else:
- # Skip empty line after progress bar
- if not line and self.last_was_progress:
- self.last_was_progress = False
- continue
- self.last_was_progress = False
-
- # General deduplication
- if line == self.last_line and current_time - self.last_time < 0.1:
- continue
-
- self.last_line = line
- self.last_time = current_time
-
- # Add timestamp if needed
- if not line.startswith("[20"):
- timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
- line = f"[{timestamp}] {line}"
-
-            # Queue with overflow protection; the line is dropped if the queue cannot accept it
-            self._safe_put(f"{line}\n")
-
- def _safe_put(self, item):
- """Safely put item in queue with overflow handling."""
- try:
- self.log_queue.put_nowait(item)
- return True
- except queue.Full:
- try:
- self.log_queue.get_nowait() # Drop oldest
- self.log_queue.put_nowait(item)
- return True
- except queue.Empty:
- return False
-
- def _stream_worker(self):
- """Background worker for streaming logs to destination."""
- while self.active:
- try:
- log_text = self.log_queue.get(timeout=1)
- if log_text is None:
- break
- self._write_log(log_text)
- except queue.Empty:
- continue
-
- def _write_log(self, text):
- """Write log to API endpoint or local file destination."""
- try:
- if self.is_api:
- import requests # scoped as slow import
-
- payload = {"timestamp": datetime.now().isoformat(), "message": text.strip()}
- requests.post(str(self.destination), json=payload, timeout=5)
- else:
- self.destination.parent.mkdir(parents=True, exist_ok=True)
- with self.destination.open("a", encoding="utf-8") as f:
- f.write(text)
- except Exception as e:
- print(f"Platform logging error: {e}", file=self.original_stderr)
-
- class _ConsoleCapture:
- """Lightweight stdout/stderr capture."""
-
- __slots__ = ("original", "callback")
-
- def __init__(self, original, callback):
- self.original = original
- self.callback = callback
-
- def write(self, text):
- self.original.write(text)
- self.callback(text)
-
- def flush(self):
- self.original.flush()
-
- class _LogHandler(logging.Handler):
- """Lightweight logging handler."""
-
- __slots__ = ("callback",)
-
- def __init__(self, callback):
- super().__init__()
- self.callback = callback
-
- def emit(self, record):
- self.callback(self.format(record) + "\n")
-
-
-class SystemLogger:
- """
- Log dynamic system metrics for training monitoring.
-
- Captures real-time system metrics including CPU, RAM, disk I/O, network I/O, and NVIDIA GPU statistics for
- training performance monitoring and analysis.
-
- Attributes:
- pynvml: NVIDIA pynvml module instance if successfully imported, None otherwise.
- nvidia_initialized (bool): Whether NVIDIA GPU monitoring is available and initialized.
- net_start: Initial network I/O counters for calculating cumulative usage.
- disk_start: Initial disk I/O counters for calculating cumulative usage.
-
- Examples:
- Basic usage:
- >>> logger = SystemLogger()
- >>> metrics = logger.get_metrics()
- >>> print(f"CPU: {metrics['cpu']}%, RAM: {metrics['ram']}%")
- >>> if metrics["gpus"]:
- ... gpu0 = metrics["gpus"]["0"]
- ... print(f"GPU0: {gpu0['usage']}% usage, {gpu0['temp']}°C")
-
- Training loop integration:
- >>> system_logger = SystemLogger()
- >>> for epoch in range(epochs):
- ... # Training code here
- ... metrics = system_logger.get_metrics()
- ... # Log to database/file
- """
-
- def __init__(self):
- """Initialize the system logger."""
- import psutil # scoped as slow import
-
- self.pynvml = None
- self.nvidia_initialized = self._init_nvidia()
- self.net_start = psutil.net_io_counters()
- self.disk_start = psutil.disk_io_counters()
-
- def _init_nvidia(self):
- """Initialize NVIDIA GPU monitoring with pynvml."""
- try:
- assert not MACOS
- check_requirements("nvidia-ml-py>=12.0.0")
- self.pynvml = __import__("pynvml")
- self.pynvml.nvmlInit()
- return True
- except Exception:
- return False
-
- def get_metrics(self):
- """
- Get current system metrics.
-
- Collects comprehensive system metrics including CPU usage, RAM usage, disk I/O statistics,
- network I/O statistics, and GPU metrics (if available). Example output:
-
- ```python
- metrics = {
- "cpu": 45.2,
- "ram": 78.9,
- "disk": {"read_mb": 156.7, "write_mb": 89.3, "used_gb": 256.8},
- "network": {"recv_mb": 157.2, "sent_mb": 89.1},
- "gpus": {
-                "0": {"usage": 95.6, "memory": 85.4, "temp": 72, "power": 285},
-                "1": {"usage": 94.1, "memory": 82.7, "temp": 70, "power": 278},
- },
- }
- ```
-
- - cpu (float): CPU usage percentage (0-100%)
- - ram (float): RAM usage percentage (0-100%)
- - disk (dict):
- - read_mb (float): Cumulative disk read in MB since initialization
- - write_mb (float): Cumulative disk write in MB since initialization
- - used_gb (float): Total disk space used in GB
- - network (dict):
- - recv_mb (float): Cumulative network received in MB since initialization
- - sent_mb (float): Cumulative network sent in MB since initialization
-        - gpus (dict): GPU metrics keyed by device index string (e.g., "0", "1"), each containing:
- - usage (int): GPU utilization percentage (0-100%)
- - memory (float): CUDA memory usage percentage (0-100%)
- - temp (int): GPU temperature in degrees Celsius
- - power (int): GPU power consumption in watts
-
- Returns:
- metrics (dict): System metrics containing 'cpu', 'ram', 'disk', 'network', 'gpus' with respective usage data.
- """
- import psutil # scoped as slow import
-
- net = psutil.net_io_counters()
- disk = psutil.disk_io_counters()
- memory = psutil.virtual_memory()
- disk_usage = shutil.disk_usage("/")
-
- metrics = {
- "cpu": round(psutil.cpu_percent(), 3),
- "ram": round(memory.percent, 3),
- "disk": {
- "read_mb": round((disk.read_bytes - self.disk_start.read_bytes) / (1 << 20), 3),
- "write_mb": round((disk.write_bytes - self.disk_start.write_bytes) / (1 << 20), 3),
- "used_gb": round(disk_usage.used / (1 << 30), 3),
- },
- "network": {
- "recv_mb": round((net.bytes_recv - self.net_start.bytes_recv) / (1 << 20), 3),
- "sent_mb": round((net.bytes_sent - self.net_start.bytes_sent) / (1 << 20), 3),
- },
- "gpus": {},
- }
-
- # Add GPU metrics (NVIDIA only)
- if self.nvidia_initialized:
- metrics["gpus"].update(self._get_nvidia_metrics())
-
- return metrics
-
- def _get_nvidia_metrics(self):
- """Get NVIDIA GPU metrics including utilization, memory, temperature, and power."""
- gpus = {}
- if not self.nvidia_initialized or not self.pynvml:
- return gpus
- try:
- device_count = self.pynvml.nvmlDeviceGetCount()
- for i in range(device_count):
- handle = self.pynvml.nvmlDeviceGetHandleByIndex(i)
- util = self.pynvml.nvmlDeviceGetUtilizationRates(handle)
- memory = self.pynvml.nvmlDeviceGetMemoryInfo(handle)
- temp = self.pynvml.nvmlDeviceGetTemperature(handle, self.pynvml.NVML_TEMPERATURE_GPU)
- power = self.pynvml.nvmlDeviceGetPowerUsage(handle) // 1000
-
- gpus[str(i)] = {
- "usage": round(util.gpu, 3),
- "memory": round((memory.used / memory.total) * 100, 3),
- "temp": temp,
- "power": power,
- }
- except Exception:
- pass
- return gpus
-
-
-if __name__ == "__main__":
- print("SystemLogger Real-time Metrics Monitor")
- print("Press Ctrl+C to stop\n")
-
- logger = SystemLogger()
-
- try:
- while True:
- metrics = logger.get_metrics()
-
- # Clear screen (works on most terminals)
- print("\033[H\033[J", end="")
-
- # Display system metrics
- print(f"CPU: {metrics['cpu']:5.1f}%")
- print(f"RAM: {metrics['ram']:5.1f}%")
- print(f"Disk Read: {metrics['disk']['read_mb']:8.1f} MB")
- print(f"Disk Write: {metrics['disk']['write_mb']:7.1f} MB")
- print(f"Disk Used: {metrics['disk']['used_gb']:8.1f} GB")
- print(f"Net Recv: {metrics['network']['recv_mb']:9.1f} MB")
- print(f"Net Sent: {metrics['network']['sent_mb']:9.1f} MB")
-
- # Display GPU metrics if available
- if metrics["gpus"]:
- print("\nGPU Metrics:")
- for gpu_id, gpu_data in metrics["gpus"].items():
- print(
- f" GPU {gpu_id}: {gpu_data['usage']:3}% | "
- f"Mem: {gpu_data['memory']:5.1f}% | "
- f"Temp: {gpu_data['temp']:2}°C | "
- f"Power: {gpu_data['power']:3}W"
- )
- else:
- print("\nGPU: No NVIDIA GPUs detected")
-
- time.sleep(1)
-
- except KeyboardInterrupt:
- print("\n\nStopped monitoring.")
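
For context, a hedged sketch of how the two removed classes could be combined in a training script: console output is mirrored to a log file while system metrics are appended as JSON lines. The file names and the loop are illustrative, not part of the original module.

```python
import json
import time

console = ConsoleLogger("train.log")  # capture stdout/stderr to a local file
console.start_capture()
try:
    sys_logger = SystemLogger()
    for step in range(3):  # stand-in for a real training loop
        print(f"step {step}: training...")  # captured and streamed to train.log
        with open("system_metrics.jsonl", "a", encoding="utf-8") as f:
            f.write(json.dumps(sys_logger.get_metrics()) + "\n")
        time.sleep(1)
finally:
    console.stop_capture()
```
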
diff --git a/ultralytics/utils/loss.py b/ultralytics/utils/loss.py
deleted file mode 100644
index 95628da..0000000
--- a/ultralytics/utils/loss.py
+++ /dev/null
@@ -1,857 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from __future__ import annotations
-
-from typing import Any
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-from ultralytics.utils.metrics import OKS_SIGMA
-from ultralytics.utils.ops import crop_mask, xywh2xyxy, xyxy2xywh
-from ultralytics.utils.tal import RotatedTaskAlignedAssigner, TaskAlignedAssigner, dist2bbox, dist2rbox, make_anchors
-from ultralytics.utils.torch_utils import autocast
-
-from .metrics import bbox_iou, probiou
-from .tal import bbox2dist
-
-
-class VarifocalLoss(nn.Module):
- """
- Varifocal loss by Zhang et al.
-
- Implements the Varifocal Loss function for addressing class imbalance in object detection by focusing on
- hard-to-classify examples and balancing positive/negative samples.
-
- Attributes:
- gamma (float): The focusing parameter that controls how much the loss focuses on hard-to-classify examples.
- alpha (float): The balancing factor used to address class imbalance.
-
- References:
- https://arxiv.org/abs/2008.13367
- """
-
- def __init__(self, gamma: float = 2.0, alpha: float = 0.75):
- """Initialize the VarifocalLoss class with focusing and balancing parameters."""
- super().__init__()
- self.gamma = gamma
- self.alpha = alpha
-
- def forward(self, pred_score: torch.Tensor, gt_score: torch.Tensor, label: torch.Tensor) -> torch.Tensor:
- """Compute varifocal loss between predictions and ground truth."""
- weight = self.alpha * pred_score.sigmoid().pow(self.gamma) * (1 - label) + gt_score * label
- with autocast(enabled=False):
- loss = (
- (F.binary_cross_entropy_with_logits(pred_score.float(), gt_score.float(), reduction="none") * weight)
- .mean(1)
- .sum()
- )
- return loss
-
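
For reference, with $p = \sigma(\text{pred\_score})$ and $q$ the IoU-aware target score (`gt_score`), the weighting above yields the per-element Varifocal Loss

$$\mathrm{VFL}(p, q) = \begin{cases} -q\,\big(q\log p + (1-q)\log(1-p)\big) & q > 0 \ \text{(foreground)} \\ -\alpha\,p^{\gamma}\,\log(1-p) & q = 0 \ \text{(background)} \end{cases}$$

which is then averaged over classes and summed over the batch, matching `loss.mean(1).sum()` in the code above.
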
-
-class FocalLoss(nn.Module):
- """
-    Focal loss computed on raw logits via binary cross-entropy, i.e. criterion = FocalLoss(gamma=1.5, alpha=0.25).
-
- Implements the Focal Loss function for addressing class imbalance by down-weighting easy examples and focusing
- on hard negatives during training.
-
- Attributes:
- gamma (float): The focusing parameter that controls how much the loss focuses on hard-to-classify examples.
- alpha (torch.Tensor): The balancing factor used to address class imbalance.
- """
-
- def __init__(self, gamma: float = 1.5, alpha: float = 0.25):
- """Initialize FocalLoss class with focusing and balancing parameters."""
- super().__init__()
- self.gamma = gamma
- self.alpha = torch.tensor(alpha)
-
- def forward(self, pred: torch.Tensor, label: torch.Tensor) -> torch.Tensor:
- """Calculate focal loss with modulating factors for class imbalance."""
- loss = F.binary_cross_entropy_with_logits(pred, label, reduction="none")
- # p_t = torch.exp(-loss)
- # loss *= self.alpha * (1.000001 - p_t) ** self.gamma # non-zero power for gradient stability
-
- # TF implementation https://github.com/tensorflow/addons/blob/v0.7.1/tensorflow_addons/losses/focal_loss.py
- pred_prob = pred.sigmoid() # prob from logits
- p_t = label * pred_prob + (1 - label) * (1 - pred_prob)
- modulating_factor = (1.0 - p_t) ** self.gamma
- loss *= modulating_factor
- if (self.alpha > 0).any():
- self.alpha = self.alpha.to(device=pred.device, dtype=pred.dtype)
- alpha_factor = label * self.alpha + (1 - label) * (1 - self.alpha)
- loss *= alpha_factor
- return loss.mean(1).sum()
-
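
Written out, the modulating-factor form above is the standard focal loss (Lin et al., 2017), applied elementwise to sigmoid probabilities and then averaged over classes and summed over the batch:

$$\mathrm{FL}(p_t) = -\,\alpha_t\,(1 - p_t)^{\gamma}\,\log(p_t), \qquad p_t = y\,p + (1-y)(1-p), \quad \alpha_t = y\,\alpha + (1-y)(1-\alpha)$$

where $p = \sigma(\text{pred})$ and $y \in \{0, 1\}$ is the label.
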
-
-class DFLoss(nn.Module):
- """Criterion class for computing Distribution Focal Loss (DFL)."""
-
- def __init__(self, reg_max: int = 16) -> None:
- """Initialize the DFL module with regularization maximum."""
- super().__init__()
- self.reg_max = reg_max
-
- def __call__(self, pred_dist: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
- """Return sum of left and right DFL losses from https://ieeexplore.ieee.org/document/9792391."""
- target = target.clamp_(0, self.reg_max - 1 - 0.01)
- tl = target.long() # target left
- tr = tl + 1 # target right
- wl = tr - target # weight left
- wr = 1 - wl # weight right
- return (
- F.cross_entropy(pred_dist, tl.view(-1), reduction="none").view(tl.shape) * wl
- + F.cross_entropy(pred_dist, tr.view(-1), reduction="none").view(tl.shape) * wr
- ).mean(-1, keepdim=True)
-
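
A quick numeric check of the bin interpolation performed in `DFLoss.__call__` above (the target value is made up for illustration):

```python
import torch

target = torch.tensor([3.7])  # continuous regression target, illustrative value
tl = target.long()            # left bin  -> 3
tr = tl + 1                   # right bin -> 4
wl = tr - target              # weight on left bin  -> 0.3
wr = 1 - wl                   # weight on right bin -> 0.7
# DFL = wl * CE(pred, tl) + wr * CE(pred, tr); the weighted bin indices recover the
# continuous target: 3 * 0.3 + 4 * 0.7 = 3.7
```
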
-
-class BboxLoss(nn.Module):
- """Criterion class for computing training losses for bounding boxes."""
-
- def __init__(self, reg_max: int = 16):
- """Initialize the BboxLoss module with regularization maximum and DFL settings."""
- super().__init__()
- self.dfl_loss = DFLoss(reg_max) if reg_max > 1 else None
-
- def forward(
- self,
- pred_dist: torch.Tensor,
- pred_bboxes: torch.Tensor,
- anchor_points: torch.Tensor,
- target_bboxes: torch.Tensor,
- target_scores: torch.Tensor,
- target_scores_sum: torch.Tensor,
- fg_mask: torch.Tensor,
- ) -> tuple[torch.Tensor, torch.Tensor]:
- """Compute IoU and DFL losses for bounding boxes."""
- weight = target_scores.sum(-1)[fg_mask].unsqueeze(-1)
- iou = bbox_iou(pred_bboxes[fg_mask], target_bboxes[fg_mask], xywh=False, CIoU=True)
- loss_iou = ((1.0 - iou) * weight).sum() / target_scores_sum
-
- # DFL loss
- if self.dfl_loss:
- target_ltrb = bbox2dist(anchor_points, target_bboxes, self.dfl_loss.reg_max - 1)
- loss_dfl = self.dfl_loss(pred_dist[fg_mask].view(-1, self.dfl_loss.reg_max), target_ltrb[fg_mask]) * weight
- loss_dfl = loss_dfl.sum() / target_scores_sum
- else:
- loss_dfl = torch.tensor(0.0).to(pred_dist.device)
-
- return loss_iou, loss_dfl
-
-
-class RotatedBboxLoss(BboxLoss):
- """Criterion class for computing training losses for rotated bounding boxes."""
-
- def __init__(self, reg_max: int):
- """Initialize the RotatedBboxLoss module with regularization maximum and DFL settings."""
- super().__init__(reg_max)
-
- def forward(
- self,
- pred_dist: torch.Tensor,
- pred_bboxes: torch.Tensor,
- anchor_points: torch.Tensor,
- target_bboxes: torch.Tensor,
- target_scores: torch.Tensor,
- target_scores_sum: torch.Tensor,
- fg_mask: torch.Tensor,
- ) -> tuple[torch.Tensor, torch.Tensor]:
- """Compute IoU and DFL losses for rotated bounding boxes."""
- weight = target_scores.sum(-1)[fg_mask].unsqueeze(-1)
- iou = probiou(pred_bboxes[fg_mask], target_bboxes[fg_mask])
- loss_iou = ((1.0 - iou) * weight).sum() / target_scores_sum
-
- # DFL loss
- if self.dfl_loss:
- target_ltrb = bbox2dist(anchor_points, xywh2xyxy(target_bboxes[..., :4]), self.dfl_loss.reg_max - 1)
- loss_dfl = self.dfl_loss(pred_dist[fg_mask].view(-1, self.dfl_loss.reg_max), target_ltrb[fg_mask]) * weight
- loss_dfl = loss_dfl.sum() / target_scores_sum
- else:
- loss_dfl = torch.tensor(0.0).to(pred_dist.device)
-
- return loss_iou, loss_dfl
-
-
-class KeypointLoss(nn.Module):
- """Criterion class for computing keypoint losses."""
-
- def __init__(self, sigmas: torch.Tensor) -> None:
- """Initialize the KeypointLoss class with keypoint sigmas."""
- super().__init__()
- self.sigmas = sigmas
-
- def forward(
- self, pred_kpts: torch.Tensor, gt_kpts: torch.Tensor, kpt_mask: torch.Tensor, area: torch.Tensor
- ) -> torch.Tensor:
- """Calculate keypoint loss factor and Euclidean distance loss for keypoints."""
- d = (pred_kpts[..., 0] - gt_kpts[..., 0]).pow(2) + (pred_kpts[..., 1] - gt_kpts[..., 1]).pow(2)
- kpt_loss_factor = kpt_mask.shape[1] / (torch.sum(kpt_mask != 0, dim=1) + 1e-9)
- # e = d / (2 * (area * self.sigmas) ** 2 + 1e-9) # from formula
- e = d / ((2 * self.sigmas).pow(2) * (area + 1e-9) * 2) # from cocoeval
- return (kpt_loss_factor.view(-1, 1) * ((1 - torch.exp(-e)) * kpt_mask)).mean()
-
-
-class v8DetectionLoss:
- """Criterion class for computing training losses for YOLOv8 object detection."""
-
- def __init__(self, model, tal_topk: int = 10): # model must be de-paralleled
- """Initialize v8DetectionLoss with model parameters and task-aligned assignment settings."""
- device = next(model.parameters()).device # get model device
- h = model.args # hyperparameters
-
- m = model.model[-1] # Detect() module
- self.bce = nn.BCEWithLogitsLoss(reduction="none")
- self.hyp = h
- self.stride = m.stride # model strides
- self.nc = m.nc # number of classes
- self.no = m.nc + m.reg_max * 4
- self.reg_max = m.reg_max
- self.device = device
-
- self.use_dfl = m.reg_max > 1
-
- self.assigner = TaskAlignedAssigner(topk=tal_topk, num_classes=self.nc, alpha=0.5, beta=6.0)
- self.bbox_loss = BboxLoss(m.reg_max).to(device)
- self.proj = torch.arange(m.reg_max, dtype=torch.float, device=device)
-
- def preprocess(self, targets: torch.Tensor, batch_size: int, scale_tensor: torch.Tensor) -> torch.Tensor:
- """Preprocess targets by converting to tensor format and scaling coordinates."""
- nl, ne = targets.shape
- if nl == 0:
- out = torch.zeros(batch_size, 0, ne - 1, device=self.device)
- else:
- i = targets[:, 0] # image index
- _, counts = i.unique(return_counts=True)
- counts = counts.to(dtype=torch.int32)
- out = torch.zeros(batch_size, counts.max(), ne - 1, device=self.device)
- for j in range(batch_size):
- matches = i == j
- if n := matches.sum():
- out[j, :n] = targets[matches, 1:]
- out[..., 1:5] = xywh2xyxy(out[..., 1:5].mul_(scale_tensor))
- return out
-
- def bbox_decode(self, anchor_points: torch.Tensor, pred_dist: torch.Tensor) -> torch.Tensor:
- """Decode predicted object bounding box coordinates from anchor points and distribution."""
- if self.use_dfl:
- b, a, c = pred_dist.shape # batch, anchors, channels
- pred_dist = pred_dist.view(b, a, 4, c // 4).softmax(3).matmul(self.proj.type(pred_dist.dtype))
- # pred_dist = pred_dist.view(b, a, c // 4, 4).transpose(2,3).softmax(3).matmul(self.proj.type(pred_dist.dtype))
- # pred_dist = (pred_dist.view(b, a, c // 4, 4).softmax(2) * self.proj.type(pred_dist.dtype).view(1, 1, -1, 1)).sum(2)
- return dist2bbox(pred_dist, anchor_points, xywh=False)
-
- def __call__(self, preds: Any, batch: dict[str, torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]:
- """Calculate the sum of the loss for box, cls and dfl multiplied by batch size."""
- loss = torch.zeros(3, device=self.device) # box, cls, dfl
- feats = preds[1] if isinstance(preds, tuple) else preds
- pred_distri, pred_scores = torch.cat([xi.view(feats[0].shape[0], self.no, -1) for xi in feats], 2).split(
- (self.reg_max * 4, self.nc), 1
- )
-
- pred_scores = pred_scores.permute(0, 2, 1).contiguous()
- pred_distri = pred_distri.permute(0, 2, 1).contiguous()
-
- dtype = pred_scores.dtype
- batch_size = pred_scores.shape[0]
- imgsz = torch.tensor(feats[0].shape[2:], device=self.device, dtype=dtype) * self.stride[0] # image size (h,w)
- anchor_points, stride_tensor = make_anchors(feats, self.stride, 0.5)
-
- # Targets
- targets = torch.cat((batch["batch_idx"].view(-1, 1), batch["cls"].view(-1, 1), batch["bboxes"]), 1)
- targets = self.preprocess(targets, batch_size, scale_tensor=imgsz[[1, 0, 1, 0]])
- gt_labels, gt_bboxes = targets.split((1, 4), 2) # cls, xyxy
- mask_gt = gt_bboxes.sum(2, keepdim=True).gt_(0.0)
-
- # Pboxes
- pred_bboxes = self.bbox_decode(anchor_points, pred_distri) # xyxy, (b, h*w, 4)
- # dfl_conf = pred_distri.view(batch_size, -1, 4, self.reg_max).detach().softmax(-1)
- # dfl_conf = (dfl_conf.amax(-1).mean(-1) + dfl_conf.amax(-1).amin(-1)) / 2
-
- _, target_bboxes, target_scores, fg_mask, _ = self.assigner(
- # pred_scores.detach().sigmoid() * 0.8 + dfl_conf.unsqueeze(-1) * 0.2,
- pred_scores.detach().sigmoid(),
- (pred_bboxes.detach() * stride_tensor).type(gt_bboxes.dtype),
- anchor_points * stride_tensor,
- gt_labels,
- gt_bboxes,
- mask_gt,
- )
-
- target_scores_sum = max(target_scores.sum(), 1)
-
- # Cls loss
- # loss[1] = self.varifocal_loss(pred_scores, target_scores, target_labels) / target_scores_sum # VFL way
- loss[1] = self.bce(pred_scores, target_scores.to(dtype)).sum() / target_scores_sum # BCE
-
- # Bbox loss
- if fg_mask.sum():
- loss[0], loss[2] = self.bbox_loss(
- pred_distri,
- pred_bboxes,
- anchor_points,
- target_bboxes / stride_tensor,
- target_scores,
- target_scores_sum,
- fg_mask,
- )
-
- loss[0] *= self.hyp.box # box gain
- loss[1] *= self.hyp.cls # cls gain
- loss[2] *= self.hyp.dfl # dfl gain
-
- return loss * batch_size, loss.detach() # loss(box, cls, dfl)
-
-
-class v8SegmentationLoss(v8DetectionLoss):
- """Criterion class for computing training losses for YOLOv8 segmentation."""
-
- def __init__(self, model): # model must be de-paralleled
- """Initialize the v8SegmentationLoss class with model parameters and mask overlap setting."""
- super().__init__(model)
- self.overlap = model.args.overlap_mask
-
- def __call__(self, preds: Any, batch: dict[str, torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]:
- """Calculate and return the combined loss for detection and segmentation."""
- loss = torch.zeros(4, device=self.device) # box, seg, cls, dfl
- feats, pred_masks, proto = preds if len(preds) == 3 else preds[1]
- batch_size, _, mask_h, mask_w = proto.shape # batch size, number of masks, mask height, mask width
- pred_distri, pred_scores = torch.cat([xi.view(feats[0].shape[0], self.no, -1) for xi in feats], 2).split(
- (self.reg_max * 4, self.nc), 1
- )
-
- # B, grids, ..
- pred_scores = pred_scores.permute(0, 2, 1).contiguous()
- pred_distri = pred_distri.permute(0, 2, 1).contiguous()
- pred_masks = pred_masks.permute(0, 2, 1).contiguous()
-
- dtype = pred_scores.dtype
- imgsz = torch.tensor(feats[0].shape[2:], device=self.device, dtype=dtype) * self.stride[0] # image size (h,w)
- anchor_points, stride_tensor = make_anchors(feats, self.stride, 0.5)
-
- # Targets
- try:
- batch_idx = batch["batch_idx"].view(-1, 1)
- targets = torch.cat((batch_idx, batch["cls"].view(-1, 1), batch["bboxes"]), 1)
- targets = self.preprocess(targets, batch_size, scale_tensor=imgsz[[1, 0, 1, 0]])
- gt_labels, gt_bboxes = targets.split((1, 4), 2) # cls, xyxy
- mask_gt = gt_bboxes.sum(2, keepdim=True).gt_(0.0)
- except RuntimeError as e:
- raise TypeError(
- "ERROR ❌ segment dataset incorrectly formatted or not a segment dataset.\n"
- "This error can occur when incorrectly training a 'segment' model on a 'detect' dataset, "
- "i.e. 'yolo train model=yolo11n-seg.pt data=coco8.yaml'.\nVerify your dataset is a "
- "correctly formatted 'segment' dataset using 'data=coco8-seg.yaml' "
- "as an example.\nSee https://docs.ultralytics.com/datasets/segment/ for help."
- ) from e
-
- # Pboxes
- pred_bboxes = self.bbox_decode(anchor_points, pred_distri) # xyxy, (b, h*w, 4)
-
- _, target_bboxes, target_scores, fg_mask, target_gt_idx = self.assigner(
- pred_scores.detach().sigmoid(),
- (pred_bboxes.detach() * stride_tensor).type(gt_bboxes.dtype),
- anchor_points * stride_tensor,
- gt_labels,
- gt_bboxes,
- mask_gt,
- )
-
- target_scores_sum = max(target_scores.sum(), 1)
-
- # Cls loss
- # loss[1] = self.varifocal_loss(pred_scores, target_scores, target_labels) / target_scores_sum # VFL way
- loss[2] = self.bce(pred_scores, target_scores.to(dtype)).sum() / target_scores_sum # BCE
-
- if fg_mask.sum():
- # Bbox loss
- loss[0], loss[3] = self.bbox_loss(
- pred_distri,
- pred_bboxes,
- anchor_points,
- target_bboxes / stride_tensor,
- target_scores,
- target_scores_sum,
- fg_mask,
- )
- # Masks loss
- masks = batch["masks"].to(self.device).float()
- if tuple(masks.shape[-2:]) != (mask_h, mask_w): # downsample
- masks = F.interpolate(masks[None], (mask_h, mask_w), mode="nearest")[0]
-
- loss[1] = self.calculate_segmentation_loss(
- fg_mask, masks, target_gt_idx, target_bboxes, batch_idx, proto, pred_masks, imgsz, self.overlap
- )
-
- # WARNING: lines below prevent Multi-GPU DDP 'unused gradient' PyTorch errors, do not remove
- else:
- loss[1] += (proto * 0).sum() + (pred_masks * 0).sum() # inf sums may lead to nan loss
-
- loss[0] *= self.hyp.box # box gain
- loss[1] *= self.hyp.box # seg gain
- loss[2] *= self.hyp.cls # cls gain
- loss[3] *= self.hyp.dfl # dfl gain
-
- return loss * batch_size, loss.detach() # loss(box, seg, cls, dfl)
-
- @staticmethod
- def single_mask_loss(
- gt_mask: torch.Tensor, pred: torch.Tensor, proto: torch.Tensor, xyxy: torch.Tensor, area: torch.Tensor
- ) -> torch.Tensor:
- """
- Compute the instance segmentation loss for a single image.
-
- Args:
- gt_mask (torch.Tensor): Ground truth mask of shape (N, H, W), where N is the number of objects.
- pred (torch.Tensor): Predicted mask coefficients of shape (N, 32).
- proto (torch.Tensor): Prototype masks of shape (32, H, W).
- xyxy (torch.Tensor): Ground truth bounding boxes in xyxy format, normalized to [0, 1], of shape (N, 4).
- area (torch.Tensor): Area of each ground truth bounding box of shape (N,).
-
- Returns:
- (torch.Tensor): The calculated mask loss for a single image.
-
- Notes:
- The function uses the equation pred_mask = torch.einsum('in,nhw->ihw', pred, proto) to produce the
- predicted masks from the prototype masks and predicted mask coefficients.
- """
- pred_mask = torch.einsum("in,nhw->ihw", pred, proto) # (n, 32) @ (32, 80, 80) -> (n, 80, 80)
- loss = F.binary_cross_entropy_with_logits(pred_mask, gt_mask, reduction="none")
- return (crop_mask(loss, xyxy).mean(dim=(1, 2)) / area).sum()
-
- def calculate_segmentation_loss(
- self,
- fg_mask: torch.Tensor,
- masks: torch.Tensor,
- target_gt_idx: torch.Tensor,
- target_bboxes: torch.Tensor,
- batch_idx: torch.Tensor,
- proto: torch.Tensor,
- pred_masks: torch.Tensor,
- imgsz: torch.Tensor,
- overlap: bool,
- ) -> torch.Tensor:
- """
- Calculate the loss for instance segmentation.
-
- Args:
- fg_mask (torch.Tensor): A binary tensor of shape (BS, N_anchors) indicating which anchors are positive.
- masks (torch.Tensor): Ground truth masks of shape (BS, H, W) if `overlap` is False, otherwise (BS, ?, H, W).
- target_gt_idx (torch.Tensor): Indexes of ground truth objects for each anchor of shape (BS, N_anchors).
- target_bboxes (torch.Tensor): Ground truth bounding boxes for each anchor of shape (BS, N_anchors, 4).
- batch_idx (torch.Tensor): Batch indices of shape (N_labels_in_batch, 1).
- proto (torch.Tensor): Prototype masks of shape (BS, 32, H, W).
- pred_masks (torch.Tensor): Predicted masks for each anchor of shape (BS, N_anchors, 32).
- imgsz (torch.Tensor): Size of the input image as a tensor of shape (2), i.e., (H, W).
- overlap (bool): Whether the masks in `masks` tensor overlap.
-
- Returns:
- (torch.Tensor): The calculated loss for instance segmentation.
-
- Notes:
- The batch loss can be computed for improved speed at higher memory usage.
- For example, pred_mask can be computed as follows:
- pred_mask = torch.einsum('in,nhw->ihw', pred, proto) # (i, 32) @ (32, 160, 160) -> (i, 160, 160)
- """
- _, _, mask_h, mask_w = proto.shape
- loss = 0
-
- # Normalize to 0-1
- target_bboxes_normalized = target_bboxes / imgsz[[1, 0, 1, 0]]
-
- # Areas of target bboxes
- marea = xyxy2xywh(target_bboxes_normalized)[..., 2:].prod(2)
-
- # Normalize to mask size
- mxyxy = target_bboxes_normalized * torch.tensor([mask_w, mask_h, mask_w, mask_h], device=proto.device)
-
- for i, single_i in enumerate(zip(fg_mask, target_gt_idx, pred_masks, proto, mxyxy, marea, masks)):
- fg_mask_i, target_gt_idx_i, pred_masks_i, proto_i, mxyxy_i, marea_i, masks_i = single_i
- if fg_mask_i.any():
- mask_idx = target_gt_idx_i[fg_mask_i]
- if overlap:
- gt_mask = masks_i == (mask_idx + 1).view(-1, 1, 1)
- gt_mask = gt_mask.float()
- else:
- gt_mask = masks[batch_idx.view(-1) == i][mask_idx]
-
- loss += self.single_mask_loss(
- gt_mask, pred_masks_i[fg_mask_i], proto_i, mxyxy_i[fg_mask_i], marea_i[fg_mask_i]
- )
-
-            # WARNING: lines below prevent Multi-GPU DDP 'unused gradient' PyTorch errors, do not remove
- else:
- loss += (proto * 0).sum() + (pred_masks * 0).sum() # inf sums may lead to nan loss
-
- return loss / fg_mask.sum()
-
-
-class v8PoseLoss(v8DetectionLoss):
- """Criterion class for computing training losses for YOLOv8 pose estimation."""
-
- def __init__(self, model): # model must be de-paralleled
- """Initialize v8PoseLoss with model parameters and keypoint-specific loss functions."""
- super().__init__(model)
- self.kpt_shape = model.model[-1].kpt_shape
- self.bce_pose = nn.BCEWithLogitsLoss()
- is_pose = self.kpt_shape == [17, 3]
- nkpt = self.kpt_shape[0] # number of keypoints
- sigmas = torch.from_numpy(OKS_SIGMA).to(self.device) if is_pose else torch.ones(nkpt, device=self.device) / nkpt
- self.keypoint_loss = KeypointLoss(sigmas=sigmas)
-
- def __call__(self, preds: Any, batch: dict[str, torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]:
- """Calculate the total loss and detach it for pose estimation."""
-        loss = torch.zeros(5, device=self.device)  # box, pose (kpt_location), kobj (kpt_visibility), cls, dfl
- feats, pred_kpts = preds if isinstance(preds[0], list) else preds[1]
- pred_distri, pred_scores = torch.cat([xi.view(feats[0].shape[0], self.no, -1) for xi in feats], 2).split(
- (self.reg_max * 4, self.nc), 1
- )
-
- # B, grids, ..
- pred_scores = pred_scores.permute(0, 2, 1).contiguous()
- pred_distri = pred_distri.permute(0, 2, 1).contiguous()
- pred_kpts = pred_kpts.permute(0, 2, 1).contiguous()
-
- dtype = pred_scores.dtype
- imgsz = torch.tensor(feats[0].shape[2:], device=self.device, dtype=dtype) * self.stride[0] # image size (h,w)
- anchor_points, stride_tensor = make_anchors(feats, self.stride, 0.5)
-
- # Targets
- batch_size = pred_scores.shape[0]
- batch_idx = batch["batch_idx"].view(-1, 1)
- targets = torch.cat((batch_idx, batch["cls"].view(-1, 1), batch["bboxes"]), 1)
- targets = self.preprocess(targets, batch_size, scale_tensor=imgsz[[1, 0, 1, 0]])
- gt_labels, gt_bboxes = targets.split((1, 4), 2) # cls, xyxy
- mask_gt = gt_bboxes.sum(2, keepdim=True).gt_(0.0)
-
- # Pboxes
- pred_bboxes = self.bbox_decode(anchor_points, pred_distri) # xyxy, (b, h*w, 4)
- pred_kpts = self.kpts_decode(anchor_points, pred_kpts.view(batch_size, -1, *self.kpt_shape)) # (b, h*w, 17, 3)
-
- _, target_bboxes, target_scores, fg_mask, target_gt_idx = self.assigner(
- pred_scores.detach().sigmoid(),
- (pred_bboxes.detach() * stride_tensor).type(gt_bboxes.dtype),
- anchor_points * stride_tensor,
- gt_labels,
- gt_bboxes,
- mask_gt,
- )
-
- target_scores_sum = max(target_scores.sum(), 1)
-
- # Cls loss
- # loss[1] = self.varifocal_loss(pred_scores, target_scores, target_labels) / target_scores_sum # VFL way
- loss[3] = self.bce(pred_scores, target_scores.to(dtype)).sum() / target_scores_sum # BCE
-
- # Bbox loss
- if fg_mask.sum():
- target_bboxes /= stride_tensor
- loss[0], loss[4] = self.bbox_loss(
- pred_distri, pred_bboxes, anchor_points, target_bboxes, target_scores, target_scores_sum, fg_mask
- )
- keypoints = batch["keypoints"].to(self.device).float().clone()
- keypoints[..., 0] *= imgsz[1]
- keypoints[..., 1] *= imgsz[0]
-
- loss[1], loss[2] = self.calculate_keypoints_loss(
- fg_mask, target_gt_idx, keypoints, batch_idx, stride_tensor, target_bboxes, pred_kpts
- )
-
- loss[0] *= self.hyp.box # box gain
- loss[1] *= self.hyp.pose # pose gain
- loss[2] *= self.hyp.kobj # kobj gain
- loss[3] *= self.hyp.cls # cls gain
- loss[4] *= self.hyp.dfl # dfl gain
-
-        return loss * batch_size, loss.detach()  # loss(box, pose, kobj, cls, dfl)
-
- @staticmethod
- def kpts_decode(anchor_points: torch.Tensor, pred_kpts: torch.Tensor) -> torch.Tensor:
- """Decode predicted keypoints to image coordinates."""
- y = pred_kpts.clone()
- y[..., :2] *= 2.0
- y[..., 0] += anchor_points[:, [0]] - 0.5
- y[..., 1] += anchor_points[:, [1]] - 0.5
- return y
-
- def calculate_keypoints_loss(
- self,
- masks: torch.Tensor,
- target_gt_idx: torch.Tensor,
- keypoints: torch.Tensor,
- batch_idx: torch.Tensor,
- stride_tensor: torch.Tensor,
- target_bboxes: torch.Tensor,
- pred_kpts: torch.Tensor,
- ) -> tuple[torch.Tensor, torch.Tensor]:
- """
- Calculate the keypoints loss for the model.
-
- This function calculates the keypoints loss and keypoints object loss for a given batch. The keypoints loss is
- based on the difference between the predicted keypoints and ground truth keypoints. The keypoints object loss is
- a binary classification loss that classifies whether a keypoint is present or not.
-
- Args:
- masks (torch.Tensor): Binary mask tensor indicating object presence, shape (BS, N_anchors).
- target_gt_idx (torch.Tensor): Index tensor mapping anchors to ground truth objects, shape (BS, N_anchors).
- keypoints (torch.Tensor): Ground truth keypoints, shape (N_kpts_in_batch, N_kpts_per_object, kpts_dim).
- batch_idx (torch.Tensor): Batch index tensor for keypoints, shape (N_kpts_in_batch, 1).
- stride_tensor (torch.Tensor): Stride tensor for anchors, shape (N_anchors, 1).
- target_bboxes (torch.Tensor): Ground truth boxes in (x1, y1, x2, y2) format, shape (BS, N_anchors, 4).
- pred_kpts (torch.Tensor): Predicted keypoints, shape (BS, N_anchors, N_kpts_per_object, kpts_dim).
-
- Returns:
- kpts_loss (torch.Tensor): The keypoints loss.
- kpts_obj_loss (torch.Tensor): The keypoints object loss.
- """
- batch_idx = batch_idx.flatten()
- batch_size = len(masks)
-
-        # Find the maximum number of labeled objects in a single image
- max_kpts = torch.unique(batch_idx, return_counts=True)[1].max()
-
- # Create a tensor to hold batched keypoints
- batched_keypoints = torch.zeros(
- (batch_size, max_kpts, keypoints.shape[1], keypoints.shape[2]), device=keypoints.device
- )
-
- # TODO: any idea how to vectorize this?
- # Fill batched_keypoints with keypoints based on batch_idx
- for i in range(batch_size):
- keypoints_i = keypoints[batch_idx == i]
- batched_keypoints[i, : keypoints_i.shape[0]] = keypoints_i
-
- # Expand dimensions of target_gt_idx to match the shape of batched_keypoints
- target_gt_idx_expanded = target_gt_idx.unsqueeze(-1).unsqueeze(-1)
-
- # Use target_gt_idx_expanded to select keypoints from batched_keypoints
- selected_keypoints = batched_keypoints.gather(
- 1, target_gt_idx_expanded.expand(-1, -1, keypoints.shape[1], keypoints.shape[2])
- )
-
- # Divide coordinates by stride
- selected_keypoints[..., :2] /= stride_tensor.view(1, -1, 1, 1)
-
- kpts_loss = 0
- kpts_obj_loss = 0
-
- if masks.any():
- gt_kpt = selected_keypoints[masks]
- area = xyxy2xywh(target_bboxes[masks])[:, 2:].prod(1, keepdim=True)
- pred_kpt = pred_kpts[masks]
- kpt_mask = gt_kpt[..., 2] != 0 if gt_kpt.shape[-1] == 3 else torch.full_like(gt_kpt[..., 0], True)
- kpts_loss = self.keypoint_loss(pred_kpt, gt_kpt, kpt_mask, area) # pose loss
-
- if pred_kpt.shape[-1] == 3:
- kpts_obj_loss = self.bce_pose(pred_kpt[..., 2], kpt_mask.float()) # keypoint obj loss
-
- return kpts_loss, kpts_obj_loss
-
-
-class v8ClassificationLoss:
- """Criterion class for computing training losses for classification."""
-
- def __call__(self, preds: Any, batch: dict[str, torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]:
- """Compute the classification loss between predictions and true labels."""
- preds = preds[1] if isinstance(preds, (list, tuple)) else preds
- loss = F.cross_entropy(preds, batch["cls"], reduction="mean")
- return loss, loss.detach()
-
-
-class v8OBBLoss(v8DetectionLoss):
- """Calculates losses for object detection, classification, and box distribution in rotated YOLO models."""
-
- def __init__(self, model):
- """Initialize v8OBBLoss with model, assigner, and rotated bbox loss; model must be de-paralleled."""
- super().__init__(model)
- self.assigner = RotatedTaskAlignedAssigner(topk=10, num_classes=self.nc, alpha=0.5, beta=6.0)
- self.bbox_loss = RotatedBboxLoss(self.reg_max).to(self.device)
-
- def preprocess(self, targets: torch.Tensor, batch_size: int, scale_tensor: torch.Tensor) -> torch.Tensor:
- """Preprocess targets for oriented bounding box detection."""
- if targets.shape[0] == 0:
- out = torch.zeros(batch_size, 0, 6, device=self.device)
- else:
- i = targets[:, 0] # image index
- _, counts = i.unique(return_counts=True)
- counts = counts.to(dtype=torch.int32)
- out = torch.zeros(batch_size, counts.max(), 6, device=self.device)
- for j in range(batch_size):
- matches = i == j
- if n := matches.sum():
- bboxes = targets[matches, 2:]
- bboxes[..., :4].mul_(scale_tensor)
- out[j, :n] = torch.cat([targets[matches, 1:2], bboxes], dim=-1)
- return out
-
- def __call__(self, preds: Any, batch: dict[str, torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]:
- """Calculate and return the loss for oriented bounding box detection."""
- loss = torch.zeros(3, device=self.device) # box, cls, dfl
- feats, pred_angle = preds if isinstance(preds[0], list) else preds[1]
-        batch_size = pred_angle.shape[0]  # batch size
- pred_distri, pred_scores = torch.cat([xi.view(feats[0].shape[0], self.no, -1) for xi in feats], 2).split(
- (self.reg_max * 4, self.nc), 1
- )
-
- # b, grids, ..
- pred_scores = pred_scores.permute(0, 2, 1).contiguous()
- pred_distri = pred_distri.permute(0, 2, 1).contiguous()
- pred_angle = pred_angle.permute(0, 2, 1).contiguous()
-
- dtype = pred_scores.dtype
- imgsz = torch.tensor(feats[0].shape[2:], device=self.device, dtype=dtype) * self.stride[0] # image size (h,w)
- anchor_points, stride_tensor = make_anchors(feats, self.stride, 0.5)
-
- # targets
- try:
- batch_idx = batch["batch_idx"].view(-1, 1)
- targets = torch.cat((batch_idx, batch["cls"].view(-1, 1), batch["bboxes"].view(-1, 5)), 1)
- rw, rh = targets[:, 4] * imgsz[0].item(), targets[:, 5] * imgsz[1].item()
- targets = targets[(rw >= 2) & (rh >= 2)] # filter rboxes of tiny size to stabilize training
- targets = self.preprocess(targets, batch_size, scale_tensor=imgsz[[1, 0, 1, 0]])
- gt_labels, gt_bboxes = targets.split((1, 5), 2) # cls, xywhr
- mask_gt = gt_bboxes.sum(2, keepdim=True).gt_(0.0)
- except RuntimeError as e:
- raise TypeError(
-                "ERROR ❌ OBB dataset incorrectly formatted or not an OBB dataset.\n"
-                "This error can occur when incorrectly training an 'OBB' model on a 'detect' dataset, "
- "i.e. 'yolo train model=yolo11n-obb.pt data=coco8.yaml'.\nVerify your dataset is a "
- "correctly formatted 'OBB' dataset using 'data=dota8.yaml' "
- "as an example.\nSee https://docs.ultralytics.com/datasets/obb/ for help."
- ) from e
-
- # Pboxes
- pred_bboxes = self.bbox_decode(anchor_points, pred_distri, pred_angle) # xyxy, (b, h*w, 4)
-
- bboxes_for_assigner = pred_bboxes.clone().detach()
- # Only the first four elements need to be scaled
- bboxes_for_assigner[..., :4] *= stride_tensor
- _, target_bboxes, target_scores, fg_mask, _ = self.assigner(
- pred_scores.detach().sigmoid(),
- bboxes_for_assigner.type(gt_bboxes.dtype),
- anchor_points * stride_tensor,
- gt_labels,
- gt_bboxes,
- mask_gt,
- )
-
- target_scores_sum = max(target_scores.sum(), 1)
-
- # Cls loss
- # loss[1] = self.varifocal_loss(pred_scores, target_scores, target_labels) / target_scores_sum # VFL way
- loss[1] = self.bce(pred_scores, target_scores.to(dtype)).sum() / target_scores_sum # BCE
-
- # Bbox loss
- if fg_mask.sum():
- target_bboxes[..., :4] /= stride_tensor
- loss[0], loss[2] = self.bbox_loss(
- pred_distri, pred_bboxes, anchor_points, target_bboxes, target_scores, target_scores_sum, fg_mask
- )
- else:
- loss[0] += (pred_angle * 0).sum()
-
- loss[0] *= self.hyp.box # box gain
- loss[1] *= self.hyp.cls # cls gain
- loss[2] *= self.hyp.dfl # dfl gain
-
- return loss * batch_size, loss.detach() # loss(box, cls, dfl)
-
- def bbox_decode(
- self, anchor_points: torch.Tensor, pred_dist: torch.Tensor, pred_angle: torch.Tensor
- ) -> torch.Tensor:
- """
- Decode predicted object bounding box coordinates from anchor points and distribution.
-
- Args:
- anchor_points (torch.Tensor): Anchor points, (h*w, 2).
- pred_dist (torch.Tensor): Predicted rotated distance, (bs, h*w, 4).
- pred_angle (torch.Tensor): Predicted angle, (bs, h*w, 1).
-
- Returns:
- (torch.Tensor): Predicted rotated bounding boxes with angles, (bs, h*w, 5).
- """
- if self.use_dfl:
- b, a, c = pred_dist.shape # batch, anchors, channels
- pred_dist = pred_dist.view(b, a, 4, c // 4).softmax(3).matmul(self.proj.type(pred_dist.dtype))
- return torch.cat((dist2rbox(pred_dist, pred_angle, anchor_points), pred_angle), dim=-1)
-
-
-class E2EDetectLoss:
- """Criterion class for computing training losses for end-to-end detection."""
-
- def __init__(self, model):
- """Initialize E2EDetectLoss with one-to-many and one-to-one detection losses using the provided model."""
- self.one2many = v8DetectionLoss(model, tal_topk=10)
- self.one2one = v8DetectionLoss(model, tal_topk=1)
-
- def __call__(self, preds: Any, batch: dict[str, torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]:
- """Calculate the sum of the loss for box, cls and dfl multiplied by batch size."""
- preds = preds[1] if isinstance(preds, tuple) else preds
- one2many = preds["one2many"]
- loss_one2many = self.one2many(one2many, batch)
- one2one = preds["one2one"]
- loss_one2one = self.one2one(one2one, batch)
- return loss_one2many[0] + loss_one2one[0], loss_one2many[1] + loss_one2one[1]
-
-
-class TVPDetectLoss:
- """Criterion class for computing training losses for text-visual prompt detection."""
-
- def __init__(self, model):
- """Initialize TVPDetectLoss with task-prompt and visual-prompt criteria using the provided model."""
- self.vp_criterion = v8DetectionLoss(model)
-        # NOTE: store the following info since it is modified in __call__
- self.ori_nc = self.vp_criterion.nc
- self.ori_no = self.vp_criterion.no
- self.ori_reg_max = self.vp_criterion.reg_max
-
- def __call__(self, preds: Any, batch: dict[str, torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]:
- """Calculate the loss for text-visual prompt detection."""
- feats = preds[1] if isinstance(preds, tuple) else preds
- assert self.ori_reg_max == self.vp_criterion.reg_max # TODO: remove it
-
- if self.ori_reg_max * 4 + self.ori_nc == feats[0].shape[1]:
- loss = torch.zeros(3, device=self.vp_criterion.device, requires_grad=True)
- return loss, loss.detach()
-
- vp_feats = self._get_vp_features(feats)
- vp_loss = self.vp_criterion(vp_feats, batch)
- box_loss = vp_loss[0][1]
- return box_loss, vp_loss[1]
-
- def _get_vp_features(self, feats: list[torch.Tensor]) -> list[torch.Tensor]:
- """Extract visual-prompt features from the model output."""
- vnc = feats[0].shape[1] - self.ori_reg_max * 4 - self.ori_nc
-
- self.vp_criterion.nc = vnc
- self.vp_criterion.no = vnc + self.vp_criterion.reg_max * 4
- self.vp_criterion.assigner.num_classes = vnc
-
- return [
- torch.cat((box, cls_vp), dim=1)
- for box, _, cls_vp in [xi.split((self.ori_reg_max * 4, self.ori_nc, vnc), dim=1) for xi in feats]
- ]
-
-
-class TVPSegmentLoss(TVPDetectLoss):
- """Criterion class for computing training losses for text-visual prompt segmentation."""
-
- def __init__(self, model):
- """Initialize TVPSegmentLoss with task-prompt and visual-prompt criteria using the provided model."""
- super().__init__(model)
- self.vp_criterion = v8SegmentationLoss(model)
-
- def __call__(self, preds: Any, batch: dict[str, torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]:
- """Calculate the loss for text-visual prompt segmentation."""
- feats, pred_masks, proto = preds if len(preds) == 3 else preds[1]
- assert self.ori_reg_max == self.vp_criterion.reg_max # TODO: remove it
-
- if self.ori_reg_max * 4 + self.ori_nc == feats[0].shape[1]:
- loss = torch.zeros(4, device=self.vp_criterion.device, requires_grad=True)
- return loss, loss.detach()
-
- vp_feats = self._get_vp_features(feats)
- vp_loss = self.vp_criterion((vp_feats, pred_masks, proto), batch)
- cls_loss = vp_loss[0][2]
- return cls_loss, vp_loss[1]
diff --git a/ultralytics/utils/metrics.py b/ultralytics/utils/metrics.py
deleted file mode 100644
index dd1feb3..0000000
--- a/ultralytics/utils/metrics.py
+++ /dev/null
@@ -1,1592 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-"""Model validation metrics."""
-
-from __future__ import annotations
-
-import math
-import warnings
-from collections import defaultdict
-from pathlib import Path
-from typing import Any
-
-import numpy as np
-import torch
-
-from ultralytics.utils import LOGGER, DataExportMixin, SimpleClass, TryExcept, checks, plt_settings
-
-OKS_SIGMA = (
- np.array([0.26, 0.25, 0.25, 0.35, 0.35, 0.79, 0.79, 0.72, 0.72, 0.62, 0.62, 1.07, 1.07, 0.87, 0.87, 0.89, 0.89])
- / 10.0
-)
-
-
-def bbox_ioa(box1: np.ndarray, box2: np.ndarray, iou: bool = False, eps: float = 1e-7) -> np.ndarray:
- """
- Calculate the intersection over box2 area given box1 and box2.
-
- Args:
- box1 (np.ndarray): A numpy array of shape (N, 4) representing N bounding boxes in x1y1x2y2 format.
- box2 (np.ndarray): A numpy array of shape (M, 4) representing M bounding boxes in x1y1x2y2 format.
- iou (bool, optional): Calculate the standard IoU if True else return inter_area/box2_area.
- eps (float, optional): A small value to avoid division by zero.
-
- Returns:
- (np.ndarray): A numpy array of shape (N, M) representing the intersection over box2 area.
- """
- # Get the coordinates of bounding boxes
- b1_x1, b1_y1, b1_x2, b1_y2 = box1.T
- b2_x1, b2_y1, b2_x2, b2_y2 = box2.T
-
- # Intersection area
- inter_area = (np.minimum(b1_x2[:, None], b2_x2) - np.maximum(b1_x1[:, None], b2_x1)).clip(0) * (
- np.minimum(b1_y2[:, None], b2_y2) - np.maximum(b1_y1[:, None], b2_y1)
- ).clip(0)
-
- # Box2 area
- area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1)
- if iou:
- box1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1)
- area = area + box1_area[:, None] - inter_area
-
- # Intersection over box2 area
- return inter_area / (area + eps)
-
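
A small sanity check of `bbox_ioa` with made-up boxes: when box1 fully contains box2, the intersection over the box2 area is 1, while the symmetric IoU is much smaller.

```python
import numpy as np

box1 = np.array([[0.0, 0.0, 10.0, 10.0]])  # 10 x 10 box
box2 = np.array([[2.0, 2.0, 4.0, 4.0]])    # 2 x 2 box inside box1
print(bbox_ioa(box1, box2))            # ~[[1.0]]  intersection / box2 area
print(bbox_ioa(box1, box2, iou=True))  # ~[[0.04]] intersection / union = 4 / 100
```
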
-
-def box_iou(box1: torch.Tensor, box2: torch.Tensor, eps: float = 1e-7) -> torch.Tensor:
- """
- Calculate intersection-over-union (IoU) of boxes.
-
- Args:
- box1 (torch.Tensor): A tensor of shape (N, 4) representing N bounding boxes in (x1, y1, x2, y2) format.
- box2 (torch.Tensor): A tensor of shape (M, 4) representing M bounding boxes in (x1, y1, x2, y2) format.
- eps (float, optional): A small value to avoid division by zero.
-
- Returns:
- (torch.Tensor): An NxM tensor containing the pairwise IoU values for every element in box1 and box2.
-
- References:
- https://github.com/pytorch/vision/blob/main/torchvision/ops/boxes.py
- """
- # NOTE: Need .float() to get accurate iou values
- # inter(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2)
- (a1, a2), (b1, b2) = box1.float().unsqueeze(1).chunk(2, 2), box2.float().unsqueeze(0).chunk(2, 2)
- inter = (torch.min(a2, b2) - torch.max(a1, b1)).clamp_(0).prod(2)
-
- # IoU = inter / (area1 + area2 - inter)
- return inter / ((a2 - a1).prod(2) + (b2 - b1).prod(2) - inter + eps)
-
-
-def bbox_iou(
- box1: torch.Tensor,
- box2: torch.Tensor,
- xywh: bool = True,
- GIoU: bool = False,
- DIoU: bool = False,
- CIoU: bool = False,
- eps: float = 1e-7,
-) -> torch.Tensor:
- """
- Calculate the Intersection over Union (IoU) between bounding boxes.
-
- This function supports various shapes for `box1` and `box2` as long as the last dimension is 4.
- For instance, you may pass tensors shaped like (4,), (N, 4), (B, N, 4), or (B, N, 1, 4).
- Internally, the code will split the last dimension into (x, y, w, h) if `xywh=True`,
- or (x1, y1, x2, y2) if `xywh=False`.
-
- Args:
- box1 (torch.Tensor): A tensor representing one or more bounding boxes, with the last dimension being 4.
- box2 (torch.Tensor): A tensor representing one or more bounding boxes, with the last dimension being 4.
- xywh (bool, optional): If True, input boxes are in (x, y, w, h) format. If False, input boxes are in
- (x1, y1, x2, y2) format.
- GIoU (bool, optional): If True, calculate Generalized IoU.
- DIoU (bool, optional): If True, calculate Distance IoU.
- CIoU (bool, optional): If True, calculate Complete IoU.
- eps (float, optional): A small value to avoid division by zero.
-
- Returns:
- (torch.Tensor): IoU, GIoU, DIoU, or CIoU values depending on the specified flags.
- """
- # Get the coordinates of bounding boxes
- if xywh: # transform from xywh to xyxy
- (x1, y1, w1, h1), (x2, y2, w2, h2) = box1.chunk(4, -1), box2.chunk(4, -1)
- w1_, h1_, w2_, h2_ = w1 / 2, h1 / 2, w2 / 2, h2 / 2
- b1_x1, b1_x2, b1_y1, b1_y2 = x1 - w1_, x1 + w1_, y1 - h1_, y1 + h1_
- b2_x1, b2_x2, b2_y1, b2_y2 = x2 - w2_, x2 + w2_, y2 - h2_, y2 + h2_
- else: # x1, y1, x2, y2 = box1
- b1_x1, b1_y1, b1_x2, b1_y2 = box1.chunk(4, -1)
- b2_x1, b2_y1, b2_x2, b2_y2 = box2.chunk(4, -1)
- w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps
- w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps
-
- # Intersection area
- inter = (b1_x2.minimum(b2_x2) - b1_x1.maximum(b2_x1)).clamp_(0) * (
- b1_y2.minimum(b2_y2) - b1_y1.maximum(b2_y1)
- ).clamp_(0)
-
- # Union Area
- union = w1 * h1 + w2 * h2 - inter + eps
-
- # IoU
- iou = inter / union
- if CIoU or DIoU or GIoU:
- cw = b1_x2.maximum(b2_x2) - b1_x1.minimum(b2_x1) # convex (smallest enclosing box) width
- ch = b1_y2.maximum(b2_y2) - b1_y1.minimum(b2_y1) # convex height
- if CIoU or DIoU: # Distance or Complete IoU https://arxiv.org/abs/1911.08287v1
- c2 = cw.pow(2) + ch.pow(2) + eps # convex diagonal squared
- rho2 = (
- (b2_x1 + b2_x2 - b1_x1 - b1_x2).pow(2) + (b2_y1 + b2_y2 - b1_y1 - b1_y2).pow(2)
- ) / 4 # center dist**2
- if CIoU: # https://github.com/Zzh-tju/DIoU-SSD-pytorch/blob/master/utils/box/box_utils.py#L47
- v = (4 / math.pi**2) * ((w2 / h2).atan() - (w1 / h1).atan()).pow(2)
- with torch.no_grad():
- alpha = v / (v - iou + (1 + eps))
- return iou - (rho2 / c2 + v * alpha) # CIoU
- return iou - rho2 / c2 # DIoU
- c_area = cw * ch + eps # convex area
- return iou - (c_area - union) / c_area # GIoU https://arxiv.org/pdf/1902.09630.pdf
- return iou # IoU
-
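A sketch of how bbox_iou is typically used as a box-regression loss term, with the function above in scope (illustrative values; CIoU is just one of the supported variants):

    import torch

    pred = torch.tensor([[50.0, 50.0, 20.0, 20.0]])      # (1, 4) in xywh (center-x, center-y, w, h)
    target = torch.tensor([[55.0, 50.0, 20.0, 20.0]])    # same size, center shifted by 5 px
    ciou = bbox_iou(pred, target, xywh=True, CIoU=True)  # element-wise, shape (1, 1)
    loss = (1.0 - ciou).mean()                           # CIoU < plain IoU due to the center-distance penalty
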
-
-def mask_iou(mask1: torch.Tensor, mask2: torch.Tensor, eps: float = 1e-7) -> torch.Tensor:
- """
- Calculate masks IoU.
-
- Args:
- mask1 (torch.Tensor): A tensor of shape (N, n) where N is the number of ground truth objects and n is the
- product of image width and height.
- mask2 (torch.Tensor): A tensor of shape (M, n) where M is the number of predicted objects and n is the
- product of image width and height.
- eps (float, optional): A small value to avoid division by zero.
-
- Returns:
- (torch.Tensor): A tensor of shape (N, M) representing masks IoU.
- """
- intersection = torch.matmul(mask1, mask2.T).clamp_(0)
- union = (mask1.sum(1)[:, None] + mask2.sum(1)[None]) - intersection # (area1 + area2) - intersection
- return intersection / (union + eps)
-
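A small sketch for mask_iou with flattened binary masks, assuming a 4x4 image so n = 16 (illustrative values only):

    import torch

    h, w = 4, 4
    gt = torch.zeros(1, h * w)
    gt[0, :8] = 1.0           # ground-truth mask covers the top two rows
    pred = torch.zeros(2, h * w)
    pred[0, :8] = 1.0         # perfect prediction
    pred[1, 4:12] = 1.0       # overlaps the ground truth by one row
    iou = mask_iou(gt, pred)  # (1, 2), roughly [1.0, 0.3333]
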
-
-def kpt_iou(
- kpt1: torch.Tensor, kpt2: torch.Tensor, area: torch.Tensor, sigma: list[float], eps: float = 1e-7
-) -> torch.Tensor:
- """
- Calculate Object Keypoint Similarity (OKS).
-
- Args:
- kpt1 (torch.Tensor): A tensor of shape (N, 17, 3) representing ground truth keypoints.
- kpt2 (torch.Tensor): A tensor of shape (M, 17, 3) representing predicted keypoints.
- area (torch.Tensor): A tensor of shape (N,) representing areas from ground truth.
- sigma (list): A list containing 17 values representing keypoint scales.
- eps (float, optional): A small value to avoid division by zero.
-
- Returns:
- (torch.Tensor): A tensor of shape (N, M) representing keypoint similarities.
- """
- d = (kpt1[:, None, :, 0] - kpt2[..., 0]).pow(2) + (kpt1[:, None, :, 1] - kpt2[..., 1]).pow(2) # (N, M, 17)
- sigma = torch.tensor(sigma, device=kpt1.device, dtype=kpt1.dtype) # (17, )
- kpt_mask = kpt1[..., 2] != 0 # (N, 17)
- e = d / ((2 * sigma).pow(2) * (area[:, None, None] + eps) * 2) # from cocoeval
- # e = d / ((area[None, :, None] + eps) * sigma) ** 2 / 2 # from formula
- return ((-e).exp() * kpt_mask[:, None]).sum(-1) / (kpt_mask.sum(-1)[:, None] + eps)
-
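A sketch of OKS with kpt_iou, with the function above in scope; the uniform sigma of 0.025 below is a stand-in for the COCO keypoint-specific values:

    import torch

    sigma = [0.025] * 17                          # per-keypoint scales, one value per COCO keypoint
    kpt_gt = torch.rand(1, 17, 3)
    kpt_gt[..., 2] = 2.0                          # mark every ground-truth keypoint as labeled
    kpt_pred = kpt_gt.clone()
    kpt_pred[..., :2] += 0.01                     # small localization error on every keypoint
    area = torch.tensor([1.0])                    # ground-truth box areas, shape (N,)
    oks = kpt_iou(kpt_gt, kpt_pred, area, sigma)  # (1, 1), close to 1.0 for this near-perfect prediction
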
-
-def _get_covariance_matrix(boxes: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
- """
- Generate covariance matrix from oriented bounding boxes.
-
- Args:
- boxes (torch.Tensor): A tensor of shape (N, 5) representing rotated bounding boxes, with xywhr format.
-
- Returns:
- (torch.Tensor): Covariance matrices corresponding to original rotated bounding boxes.
- """
- # Gaussian bounding boxes, ignore the center points (the first two columns) because they are not needed here.
- gbbs = torch.cat((boxes[:, 2:4].pow(2) / 12, boxes[:, 4:]), dim=-1)
- a, b, c = gbbs.split(1, dim=-1)
- cos = c.cos()
- sin = c.sin()
- cos2 = cos.pow(2)
- sin2 = sin.pow(2)
- return a * cos2 + b * sin2, a * sin2 + b * cos2, (a - b) * cos * sin
-
-
-def probiou(obb1: torch.Tensor, obb2: torch.Tensor, CIoU: bool = False, eps: float = 1e-7) -> torch.Tensor:
- """
- Calculate probabilistic IoU between oriented bounding boxes.
-
- Args:
- obb1 (torch.Tensor): Ground truth OBBs, shape (N, 5), format xywhr.
- obb2 (torch.Tensor): Predicted OBBs, shape (N, 5), format xywhr.
- CIoU (bool, optional): If True, calculate CIoU.
- eps (float, optional): Small value to avoid division by zero.
-
- Returns:
- (torch.Tensor): OBB similarities, shape (N,).
-
- Notes:
- OBB format: [center_x, center_y, width, height, rotation_angle].
-
- References:
- https://arxiv.org/pdf/2106.06072v1.pdf
- """
- x1, y1 = obb1[..., :2].split(1, dim=-1)
- x2, y2 = obb2[..., :2].split(1, dim=-1)
- a1, b1, c1 = _get_covariance_matrix(obb1)
- a2, b2, c2 = _get_covariance_matrix(obb2)
-
- t1 = (
- ((a1 + a2) * (y1 - y2).pow(2) + (b1 + b2) * (x1 - x2).pow(2)) / ((a1 + a2) * (b1 + b2) - (c1 + c2).pow(2) + eps)
- ) * 0.25
- t2 = (((c1 + c2) * (x2 - x1) * (y1 - y2)) / ((a1 + a2) * (b1 + b2) - (c1 + c2).pow(2) + eps)) * 0.5
- t3 = (
- ((a1 + a2) * (b1 + b2) - (c1 + c2).pow(2))
- / (4 * ((a1 * b1 - c1.pow(2)).clamp_(0) * (a2 * b2 - c2.pow(2)).clamp_(0)).sqrt() + eps)
- + eps
- ).log() * 0.5
- bd = (t1 + t2 + t3).clamp(eps, 100.0)
- hd = (1.0 - (-bd).exp() + eps).sqrt()
- iou = 1 - hd
- if CIoU: # only include the wh aspect ratio part
- w1, h1 = obb1[..., 2:4].split(1, dim=-1)
- w2, h2 = obb2[..., 2:4].split(1, dim=-1)
- v = (4 / math.pi**2) * ((w2 / h2).atan() - (w1 / h1).atan()).pow(2)
- with torch.no_grad():
- alpha = v / (v - iou + (1 + eps))
- return iou - v * alpha # CIoU
- return iou
-
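A sketch for probiou on element-wise pairs of rotated boxes in xywhr format (illustrative values; the rotation angle is in radians):

    import math

    import torch

    obb_gt = torch.tensor([[50.0, 50.0, 20.0, 10.0, 0.0]])            # axis-aligned box
    obb_pred = torch.tensor([[50.0, 50.0, 20.0, 10.0, math.pi / 8]])  # same box rotated by 22.5 degrees
    sim = probiou(obb_gt, obb_pred)  # per-pair similarity; reaches ~1.0 only when the boxes coincide
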
-
-def batch_probiou(obb1: torch.Tensor | np.ndarray, obb2: torch.Tensor | np.ndarray, eps: float = 1e-7) -> torch.Tensor:
- """
- Calculate the probabilistic IoU between oriented bounding boxes.
-
- Args:
- obb1 (torch.Tensor | np.ndarray): A tensor of shape (N, 5) representing ground truth obbs, with xywhr format.
- obb2 (torch.Tensor | np.ndarray): A tensor of shape (M, 5) representing predicted obbs, with xywhr format.
- eps (float, optional): A small value to avoid division by zero.
-
- Returns:
- (torch.Tensor): A tensor of shape (N, M) representing obb similarities.
-
- References:
- https://arxiv.org/pdf/2106.06072v1.pdf
- """
- obb1 = torch.from_numpy(obb1) if isinstance(obb1, np.ndarray) else obb1
- obb2 = torch.from_numpy(obb2) if isinstance(obb2, np.ndarray) else obb2
-
- x1, y1 = obb1[..., :2].split(1, dim=-1)
- x2, y2 = (x.squeeze(-1)[None] for x in obb2[..., :2].split(1, dim=-1))
- a1, b1, c1 = _get_covariance_matrix(obb1)
- a2, b2, c2 = (x.squeeze(-1)[None] for x in _get_covariance_matrix(obb2))
-
- t1 = (
- ((a1 + a2) * (y1 - y2).pow(2) + (b1 + b2) * (x1 - x2).pow(2)) / ((a1 + a2) * (b1 + b2) - (c1 + c2).pow(2) + eps)
- ) * 0.25
- t2 = (((c1 + c2) * (x2 - x1) * (y1 - y2)) / ((a1 + a2) * (b1 + b2) - (c1 + c2).pow(2) + eps)) * 0.5
- t3 = (
- ((a1 + a2) * (b1 + b2) - (c1 + c2).pow(2))
- / (4 * ((a1 * b1 - c1.pow(2)).clamp_(0) * (a2 * b2 - c2.pow(2)).clamp_(0)).sqrt() + eps)
- + eps
- ).log() * 0.5
- bd = (t1 + t2 + t3).clamp(eps, 100.0)
- hd = (1.0 - (-bd).exp() + eps).sqrt()
- return 1 - hd
-
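A sketch for batch_probiou, which accepts numpy or torch input and returns the full NxM similarity matrix (illustrative values):

    import numpy as np

    gt = np.array([[50.0, 50.0, 20.0, 10.0, 0.0]], dtype=np.float32)     # (N=1, 5) xywhr
    preds = np.array([[50.0, 50.0, 20.0, 10.0, 0.0],
                      [80.0, 80.0, 20.0, 10.0, 0.5]], dtype=np.float32)  # (M=2, 5) xywhr
    sim = batch_probiou(gt, preds)  # torch.Tensor of shape (1, 2); first column ~1.0, second much lower
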
-
-def smooth_bce(eps: float = 0.1) -> tuple[float, float]:
- """
- Compute smoothed positive and negative Binary Cross-Entropy targets.
-
- Args:
- eps (float, optional): The epsilon value for label smoothing.
-
- Returns:
- pos (float): Positive label smoothing BCE target.
- neg (float): Negative label smoothing BCE target.
-
- References:
- https://github.com/ultralytics/yolov3/issues/238#issuecomment-598028441
- """
- return 1.0 - 0.5 * eps, 0.5 * eps
-
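A short sketch of how the smoothed targets are used (illustrative; the 0.1 smoothing factor is arbitrary):

    cp, cn = smooth_bce(eps=0.1)  # cp == 0.95 for positive targets, cn == 0.05 for negative targets
    # A classification-loss builder would fill its target tensor with cn and set matched entries to cp.
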
-
-class ConfusionMatrix(DataExportMixin):
- """
- A class for calculating and updating a confusion matrix for object detection and classification tasks.
-
- Attributes:
- task (str): The type of task, either 'detect' or 'classify'.
- matrix (np.ndarray): The confusion matrix, with dimensions depending on the task.
-        nc (int): The number of classes.
- names (list[str]): The names of the classes, used as labels on the plot.
- matches (dict): Contains the indices of ground truths and predictions categorized into TP, FP and FN.
- """
-
-    def __init__(self, names: dict[int, str] = {}, task: str = "detect", save_matches: bool = False):
- """
- Initialize a ConfusionMatrix instance.
-
- Args:
- names (dict[int, str], optional): Names of classes, used as labels on the plot.
- task (str, optional): Type of task, either 'detect' or 'classify'.
- save_matches (bool, optional): Save the indices of GTs, TPs, FPs, FNs for visualization.
- """
- self.task = task
- self.nc = len(names) # number of classes
- self.matrix = np.zeros((self.nc, self.nc)) if self.task == "classify" else np.zeros((self.nc + 1, self.nc + 1))
- self.names = names # name of classes
- self.matches = {} if save_matches else None
-
- def _append_matches(self, mtype: str, batch: dict[str, Any], idx: int) -> None:
- """
- Append the matches to TP, FP, FN or GT list for the last batch.
-
- This method updates the matches dictionary by appending specific batch data
- to the appropriate match type (True Positive, False Positive, or False Negative).
-
- Args:
- mtype (str): Match type identifier ('TP', 'FP', 'FN' or 'GT').
- batch (dict[str, Any]): Batch data containing detection results with keys
- like 'bboxes', 'cls', 'conf', 'keypoints', 'masks'.
- idx (int): Index of the specific detection to append from the batch.
-
- Note:
- For masks, handles both overlap and non-overlap cases. When masks.max() > 1.0,
- it indicates overlap_mask=True with shape (1, H, W), otherwise uses direct indexing.
- """
- if self.matches is None:
- return
- for k, v in batch.items():
- if k in {"bboxes", "cls", "conf", "keypoints"}:
- self.matches[mtype][k] += v[[idx]]
- elif k == "masks":
- # NOTE: masks.max() > 1.0 means overlap_mask=True with (1, H, W) shape
- self.matches[mtype][k] += [v[0] == idx + 1] if v.max() > 1.0 else [v[idx]]
-
- def process_cls_preds(self, preds: list[torch.Tensor], targets: list[torch.Tensor]) -> None:
- """
- Update confusion matrix for classification task.
-
- Args:
-            preds (list[torch.Tensor]): Predicted class indices, each of shape (N, min(nc, 5)).
-            targets (list[torch.Tensor]): Ground truth class indices.
- """
- preds, targets = torch.cat(preds)[:, 0], torch.cat(targets)
- for p, t in zip(preds.cpu().numpy(), targets.cpu().numpy()):
- self.matrix[p][t] += 1
-
- def process_batch(
- self,
- detections: dict[str, torch.Tensor],
- batch: dict[str, Any],
- conf: float = 0.25,
- iou_thres: float = 0.45,
- ) -> None:
- """
- Update confusion matrix for object detection task.
-
- Args:
- detections (dict[str, torch.Tensor]): Dictionary containing detected bounding boxes and their associated information.
- Should contain 'cls', 'conf', and 'bboxes' keys, where 'bboxes' can be
- Array[N, 4] for regular boxes or Array[N, 5] for OBB with angle.
-            batch (dict[str, Any]): Batch dictionary containing ground truth data with 'bboxes' (Array[M, 4] | Array[M, 5]) and
- 'cls' (Array[M]) keys, where M is the number of ground truth objects.
- conf (float, optional): Confidence threshold for detections.
- iou_thres (float, optional): IoU threshold for matching detections to ground truth.
- """
- gt_cls, gt_bboxes = batch["cls"], batch["bboxes"]
- if self.matches is not None: # only if visualization is enabled
- self.matches = {k: defaultdict(list) for k in {"TP", "FP", "FN", "GT"}}
- for i in range(gt_cls.shape[0]):
- self._append_matches("GT", batch, i) # store GT
- is_obb = gt_bboxes.shape[1] == 5 # check if boxes contains angle for OBB
- conf = 0.25 if conf in {None, 0.01 if is_obb else 0.001} else conf # apply 0.25 if default val conf is passed
- no_pred = detections["cls"].shape[0] == 0
- if gt_cls.shape[0] == 0: # Check if labels is empty
- if not no_pred:
- detections = {k: detections[k][detections["conf"] > conf] for k in detections}
- detection_classes = detections["cls"].int().tolist()
- for i, dc in enumerate(detection_classes):
- self.matrix[dc, self.nc] += 1 # FP
- self._append_matches("FP", detections, i)
- return
- if no_pred:
- gt_classes = gt_cls.int().tolist()
- for i, gc in enumerate(gt_classes):
- self.matrix[self.nc, gc] += 1 # FN
- self._append_matches("FN", batch, i)
- return
-
- detections = {k: detections[k][detections["conf"] > conf] for k in detections}
- gt_classes = gt_cls.int().tolist()
- detection_classes = detections["cls"].int().tolist()
- bboxes = detections["bboxes"]
- iou = batch_probiou(gt_bboxes, bboxes) if is_obb else box_iou(gt_bboxes, bboxes)
-
- x = torch.where(iou > iou_thres)
- if x[0].shape[0]:
- matches = torch.cat((torch.stack(x, 1), iou[x[0], x[1]][:, None]), 1).cpu().numpy()
- if x[0].shape[0] > 1:
- matches = matches[matches[:, 2].argsort()[::-1]]
- matches = matches[np.unique(matches[:, 1], return_index=True)[1]]
- matches = matches[matches[:, 2].argsort()[::-1]]
- matches = matches[np.unique(matches[:, 0], return_index=True)[1]]
- else:
- matches = np.zeros((0, 3))
-
- n = matches.shape[0] > 0
- m0, m1, _ = matches.transpose().astype(int)
- for i, gc in enumerate(gt_classes):
- j = m0 == i
- if n and sum(j) == 1:
- dc = detection_classes[m1[j].item()]
- self.matrix[dc, gc] += 1 # TP if class is correct else both an FP and an FN
- if dc == gc:
- self._append_matches("TP", detections, m1[j].item())
- else:
- self._append_matches("FP", detections, m1[j].item())
- self._append_matches("FN", batch, i)
- else:
- self.matrix[self.nc, gc] += 1 # FN
- self._append_matches("FN", batch, i)
-
- for i, dc in enumerate(detection_classes):
- if not any(m1 == i):
- self.matrix[dc, self.nc] += 1 # FP
- self._append_matches("FP", detections, i)
-
- def matrix(self):
- """Return the confusion matrix."""
- return self.matrix
-
- def tp_fp(self) -> tuple[np.ndarray, np.ndarray]:
- """
- Return true positives and false positives.
-
- Returns:
- tp (np.ndarray): True positives.
- fp (np.ndarray): False positives.
- """
- tp = self.matrix.diagonal() # true positives
- fp = self.matrix.sum(1) - tp # false positives
- # fn = self.matrix.sum(0) - tp # false negatives (missed detections)
- return (tp, fp) if self.task == "classify" else (tp[:-1], fp[:-1]) # remove background class if task=detect
-
- def plot_matches(self, img: torch.Tensor, im_file: str, save_dir: Path) -> None:
- """
- Plot grid of GT, TP, FP, FN for each image.
-
- Args:
- img (torch.Tensor): Image to plot onto.
- im_file (str): Image filename to save visualizations.
- save_dir (Path): Location to save the visualizations to.
- """
- if not self.matches:
- return
- from .ops import xyxy2xywh
- from .plotting import plot_images
-
- # Create batch of 4 (GT, TP, FP, FN)
- labels = defaultdict(list)
- for i, mtype in enumerate(["GT", "FP", "TP", "FN"]):
- mbatch = self.matches[mtype]
- if "conf" not in mbatch:
- mbatch["conf"] = torch.tensor([1.0] * len(mbatch["bboxes"]), device=img.device)
- mbatch["batch_idx"] = torch.ones(len(mbatch["bboxes"]), device=img.device) * i
- for k in mbatch.keys():
- labels[k] += mbatch[k]
-
- labels = {k: torch.stack(v, 0) if len(v) else torch.empty(0) for k, v in labels.items()}
- if self.task != "obb" and labels["bboxes"].shape[0]:
- labels["bboxes"] = xyxy2xywh(labels["bboxes"])
- (save_dir / "visualizations").mkdir(parents=True, exist_ok=True)
- plot_images(
- labels,
- img.repeat(4, 1, 1, 1),
- paths=["Ground Truth", "False Positives", "True Positives", "False Negatives"],
- fname=save_dir / "visualizations" / Path(im_file).name,
- names=self.names,
- max_subplots=4,
- conf_thres=0.001,
- )
-
- @TryExcept(msg="ConfusionMatrix plot failure")
- @plt_settings()
- def plot(self, normalize: bool = True, save_dir: str = "", on_plot=None):
- """
- Plot the confusion matrix using matplotlib and save it to a file.
-
- Args:
- normalize (bool, optional): Whether to normalize the confusion matrix.
- save_dir (str, optional): Directory where the plot will be saved.
- on_plot (callable, optional): An optional callback to pass plots path and data when they are rendered.
- """
- import matplotlib.pyplot as plt # scope for faster 'import ultralytics'
-
- array = self.matrix / ((self.matrix.sum(0).reshape(1, -1) + 1e-9) if normalize else 1) # normalize columns
- array[array < 0.005] = np.nan # don't annotate (would appear as 0.00)
-
- fig, ax = plt.subplots(1, 1, figsize=(12, 9))
- names, n = list(self.names.values()), self.nc
- if self.nc >= 100: # downsample for large class count
- k = max(2, self.nc // 60) # step size for downsampling, always > 1
- keep_idx = slice(None, None, k) # create slice instead of array
- names = names[keep_idx] # slice class names
- array = array[keep_idx, :][:, keep_idx] # slice matrix rows and cols
- n = (self.nc + k - 1) // k # number of retained classes
- nc = nn = n if self.task == "classify" else n + 1 # adjust for background if needed
- ticklabels = (names + ["background"]) if (0 < nn < 99) and (nn == nc) else "auto"
- xy_ticks = np.arange(len(ticklabels))
- tick_fontsize = max(6, 15 - 0.1 * nc) # Minimum size is 6
- label_fontsize = max(6, 12 - 0.1 * nc)
- title_fontsize = max(6, 12 - 0.1 * nc)
- btm = max(0.1, 0.25 - 0.001 * nc) # Minimum value is 0.1
- with warnings.catch_warnings():
- warnings.simplefilter("ignore") # suppress empty matrix RuntimeWarning: All-NaN slice encountered
- im = ax.imshow(array, cmap="Blues", vmin=0.0, interpolation="none")
- ax.xaxis.set_label_position("bottom")
- if nc < 30: # Add score for each cell of confusion matrix
- color_threshold = 0.45 * (1 if normalize else np.nanmax(array)) # text color threshold
- for i, row in enumerate(array[:nc]):
- for j, val in enumerate(row[:nc]):
- val = array[i, j]
- if np.isnan(val):
- continue
- ax.text(
- j,
- i,
- f"{val:.2f}" if normalize else f"{int(val)}",
- ha="center",
- va="center",
- fontsize=10,
- color="white" if val > color_threshold else "black",
- )
- cbar = fig.colorbar(im, ax=ax, fraction=0.046, pad=0.05)
- title = "Confusion Matrix" + " Normalized" * normalize
- ax.set_xlabel("True", fontsize=label_fontsize, labelpad=10)
- ax.set_ylabel("Predicted", fontsize=label_fontsize, labelpad=10)
- ax.set_title(title, fontsize=title_fontsize, pad=20)
- ax.set_xticks(xy_ticks)
- ax.set_yticks(xy_ticks)
- ax.tick_params(axis="x", bottom=True, top=False, labelbottom=True, labeltop=False)
- ax.tick_params(axis="y", left=True, right=False, labelleft=True, labelright=False)
- if ticklabels != "auto":
- ax.set_xticklabels(ticklabels, fontsize=tick_fontsize, rotation=90, ha="center")
- ax.set_yticklabels(ticklabels, fontsize=tick_fontsize)
- for s in {"left", "right", "bottom", "top", "outline"}:
- if s != "outline":
-                ax.spines[s].set_visible(False)  # Confusion matrix plot doesn't have an outline
- cbar.ax.spines[s].set_visible(False)
- fig.subplots_adjust(left=0, right=0.84, top=0.94, bottom=btm) # Adjust layout to ensure equal margins
- plot_fname = Path(save_dir) / f"{title.lower().replace(' ', '_')}.png"
- fig.savefig(plot_fname, dpi=250)
- plt.close(fig)
- if on_plot:
- on_plot(plot_fname)
-
- def print(self):
- """Print the confusion matrix to the console."""
- for i in range(self.matrix.shape[0]):
- LOGGER.info(" ".join(map(str, self.matrix[i])))
-
- def summary(self, normalize: bool = False, decimals: int = 5) -> list[dict[str, float]]:
- """
- Generate a summarized representation of the confusion matrix as a list of dictionaries, with optional
- normalization. This is useful for exporting the matrix to various formats such as CSV, XML, HTML, JSON, or SQL.
-
- Args:
- normalize (bool): Whether to normalize the confusion matrix values.
- decimals (int): Number of decimal places to round the output values to.
-
- Returns:
- (list[dict[str, float]]): A list of dictionaries, each representing one predicted class with corresponding values for all actual classes.
-
- Examples:
- >>> results = model.val(data="coco8.yaml", plots=True)
- >>> cm_dict = results.confusion_matrix.summary(normalize=True, decimals=5)
- >>> print(cm_dict)
- """
- import re
-
- names = list(self.names.values()) if self.task == "classify" else list(self.names.values()) + ["background"]
- clean_names, seen = [], set()
- for name in names:
- clean_name = re.sub(r"[^a-zA-Z0-9_]", "_", name)
- original_clean = clean_name
- counter = 1
- while clean_name.lower() in seen:
- clean_name = f"{original_clean}_{counter}"
- counter += 1
- seen.add(clean_name.lower())
- clean_names.append(clean_name)
- array = (self.matrix / ((self.matrix.sum(0).reshape(1, -1) + 1e-9) if normalize else 1)).round(decimals)
- return [
- dict({"Predicted": clean_names[i]}, **{clean_names[j]: array[i, j] for j in range(len(clean_names))})
- for i in range(len(clean_names))
- ]
-
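A minimal sketch of driving ConfusionMatrix directly; the class names and single-detection tensors below are made up, whereas in practice the validator supplies them per batch:

    import torch

    cm = ConfusionMatrix(names={0: "person", 1: "car"}, task="detect")
    detections = {
        "bboxes": torch.tensor([[0.0, 0.0, 10.0, 10.0]]),  # xyxy
        "conf": torch.tensor([0.9]),
        "cls": torch.tensor([0.0]),
    }
    batch = {
        "bboxes": torch.tensor([[0.0, 0.0, 10.0, 10.0]]),  # one matching ground-truth box
        "cls": torch.tensor([0.0]),
    }
    cm.process_batch(detections, batch, conf=0.25, iou_thres=0.45)
    tp, fp = cm.tp_fp()  # per-class counts; here tp == [1, 0] and fp == [0, 0]
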
-
-def smooth(y: np.ndarray, f: float = 0.05) -> np.ndarray:
- """Box filter of fraction f."""
- nf = round(len(y) * f * 2) // 2 + 1 # number of filter elements (must be odd)
- p = np.ones(nf // 2) # ones padding
- yp = np.concatenate((p * y[0], y, p * y[-1]), 0) # y padded
- return np.convolve(yp, np.ones(nf) / nf, mode="valid") # y-smoothed
-
-
-@plt_settings()
-def plot_pr_curve(
- px: np.ndarray,
- py: np.ndarray,
- ap: np.ndarray,
- save_dir: Path = Path("pr_curve.png"),
- names: dict[int, str] = {},
- on_plot=None,
-):
- """
- Plot precision-recall curve.
-
- Args:
- px (np.ndarray): X values for the PR curve.
- py (np.ndarray): Y values for the PR curve.
- ap (np.ndarray): Average precision values.
- save_dir (Path, optional): Path to save the plot.
- names (dict[int, str], optional): Dictionary mapping class indices to class names.
- on_plot (callable, optional): Function to call after plot is saved.
- """
- import matplotlib.pyplot as plt # scope for faster 'import ultralytics'
-
- fig, ax = plt.subplots(1, 1, figsize=(9, 6), tight_layout=True)
- py = np.stack(py, axis=1)
-
- if 0 < len(names) < 21: # display per-class legend if < 21 classes
- for i, y in enumerate(py.T):
- ax.plot(px, y, linewidth=1, label=f"{names[i]} {ap[i, 0]:.3f}") # plot(recall, precision)
- else:
- ax.plot(px, py, linewidth=1, color="grey") # plot(recall, precision)
-
- ax.plot(px, py.mean(1), linewidth=3, color="blue", label=f"all classes {ap[:, 0].mean():.3f} mAP@0.5")
- ax.set_xlabel("Recall")
- ax.set_ylabel("Precision")
- ax.set_xlim(0, 1)
- ax.set_ylim(0, 1)
- ax.legend(bbox_to_anchor=(1.04, 1), loc="upper left")
- ax.set_title("Precision-Recall Curve")
- fig.savefig(save_dir, dpi=250)
- plt.close(fig)
- if on_plot:
- on_plot(save_dir)
-
-
-@plt_settings()
-def plot_mc_curve(
- px: np.ndarray,
- py: np.ndarray,
- save_dir: Path = Path("mc_curve.png"),
- names: dict[int, str] = {},
- xlabel: str = "Confidence",
- ylabel: str = "Metric",
- on_plot=None,
-):
- """
- Plot metric-confidence curve.
-
- Args:
- px (np.ndarray): X values for the metric-confidence curve.
- py (np.ndarray): Y values for the metric-confidence curve.
- save_dir (Path, optional): Path to save the plot.
- names (dict[int, str], optional): Dictionary mapping class indices to class names.
- xlabel (str, optional): X-axis label.
- ylabel (str, optional): Y-axis label.
- on_plot (callable, optional): Function to call after plot is saved.
- """
- import matplotlib.pyplot as plt # scope for faster 'import ultralytics'
-
- fig, ax = plt.subplots(1, 1, figsize=(9, 6), tight_layout=True)
-
- if 0 < len(names) < 21: # display per-class legend if < 21 classes
- for i, y in enumerate(py):
- ax.plot(px, y, linewidth=1, label=f"{names[i]}") # plot(confidence, metric)
- else:
- ax.plot(px, py.T, linewidth=1, color="grey") # plot(confidence, metric)
-
- y = smooth(py.mean(0), 0.1)
- ax.plot(px, y, linewidth=3, color="blue", label=f"all classes {y.max():.2f} at {px[y.argmax()]:.3f}")
- ax.set_xlabel(xlabel)
- ax.set_ylabel(ylabel)
- ax.set_xlim(0, 1)
- ax.set_ylim(0, 1)
- ax.legend(bbox_to_anchor=(1.04, 1), loc="upper left")
- ax.set_title(f"{ylabel}-Confidence Curve")
- fig.savefig(save_dir, dpi=250)
- plt.close(fig)
- if on_plot:
- on_plot(save_dir)
-
-
-def compute_ap(recall: list[float], precision: list[float]) -> tuple[float, np.ndarray, np.ndarray]:
- """
- Compute the average precision (AP) given the recall and precision curves.
-
- Args:
- recall (list): The recall curve.
- precision (list): The precision curve.
-
- Returns:
- ap (float): Average precision.
- mpre (np.ndarray): Precision envelope curve.
- mrec (np.ndarray): Modified recall curve with sentinel values added at the beginning and end.
- """
- # Append sentinel values to beginning and end
- mrec = np.concatenate(([0.0], recall, [1.0]))
- mpre = np.concatenate(([1.0], precision, [0.0]))
-
- # Compute the precision envelope
- mpre = np.flip(np.maximum.accumulate(np.flip(mpre)))
-
- # Integrate area under curve
- method = "interp" # methods: 'continuous', 'interp'
- if method == "interp":
- x = np.linspace(0, 1, 101) # 101-point interp (COCO)
- func = np.trapezoid if checks.check_version(np.__version__, ">=2.0") else np.trapz # np.trapz deprecated
- ap = func(np.interp(x, mrec, mpre), x) # integrate
- else: # 'continuous'
- i = np.where(mrec[1:] != mrec[:-1])[0] # points where x-axis (recall) changes
- ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) # area under curve
-
- return ap, mpre, mrec
-
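A short sketch of compute_ap on a toy precision/recall curve (illustrative values):

    recall = [0.0, 0.5, 1.0]
    precision = [1.0, 0.8, 0.6]
    ap, mpre, mrec = compute_ap(recall, precision)
    # ap is the 101-point interpolated area under the precision envelope, roughly 0.8 here
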
-
-def ap_per_class(
- tp: np.ndarray,
- conf: np.ndarray,
- pred_cls: np.ndarray,
- target_cls: np.ndarray,
- plot: bool = False,
- on_plot=None,
- save_dir: Path = Path(),
- names: dict[int, str] = {},
- eps: float = 1e-16,
- prefix: str = "",
-) -> tuple:
- """
- Compute the average precision per class for object detection evaluation.
-
- Args:
- tp (np.ndarray): Binary array indicating whether the detection is correct (True) or not (False).
- conf (np.ndarray): Array of confidence scores of the detections.
- pred_cls (np.ndarray): Array of predicted classes of the detections.
- target_cls (np.ndarray): Array of true classes of the detections.
- plot (bool, optional): Whether to plot PR curves or not.
- on_plot (callable, optional): A callback to pass plots path and data when they are rendered.
- save_dir (Path, optional): Directory to save the PR curves.
- names (dict[int, str], optional): Dictionary of class names to plot PR curves.
- eps (float, optional): A small value to avoid division by zero.
- prefix (str, optional): A prefix string for saving the plot files.
-
- Returns:
- tp (np.ndarray): True positive counts at threshold given by max F1 metric for each class.
- fp (np.ndarray): False positive counts at threshold given by max F1 metric for each class.
- p (np.ndarray): Precision values at threshold given by max F1 metric for each class.
- r (np.ndarray): Recall values at threshold given by max F1 metric for each class.
- f1 (np.ndarray): F1-score values at threshold given by max F1 metric for each class.
- ap (np.ndarray): Average precision for each class at different IoU thresholds.
- unique_classes (np.ndarray): An array of unique classes that have data.
- p_curve (np.ndarray): Precision curves for each class.
- r_curve (np.ndarray): Recall curves for each class.
- f1_curve (np.ndarray): F1-score curves for each class.
- x (np.ndarray): X-axis values for the curves.
- prec_values (np.ndarray): Precision values at mAP@0.5 for each class.
- """
- # Sort by objectness
- i = np.argsort(-conf)
- tp, conf, pred_cls = tp[i], conf[i], pred_cls[i]
-
- # Find unique classes
- unique_classes, nt = np.unique(target_cls, return_counts=True)
-    nc = unique_classes.shape[0]  # number of classes that appear in the labels
-
- # Create Precision-Recall curve and compute AP for each class
- x, prec_values = np.linspace(0, 1, 1000), []
-
- # Average precision, precision and recall curves
- ap, p_curve, r_curve = np.zeros((nc, tp.shape[1])), np.zeros((nc, 1000)), np.zeros((nc, 1000))
- for ci, c in enumerate(unique_classes):
- i = pred_cls == c
- n_l = nt[ci] # number of labels
- n_p = i.sum() # number of predictions
- if n_p == 0 or n_l == 0:
- continue
-
- # Accumulate FPs and TPs
- fpc = (1 - tp[i]).cumsum(0)
- tpc = tp[i].cumsum(0)
-
- # Recall
- recall = tpc / (n_l + eps) # recall curve
- r_curve[ci] = np.interp(-x, -conf[i], recall[:, 0], left=0) # negative x, xp because xp decreases
-
- # Precision
- precision = tpc / (tpc + fpc) # precision curve
- p_curve[ci] = np.interp(-x, -conf[i], precision[:, 0], left=1) # p at pr_score
-
- # AP from recall-precision curve
- for j in range(tp.shape[1]):
- ap[ci, j], mpre, mrec = compute_ap(recall[:, j], precision[:, j])
- if j == 0:
- prec_values.append(np.interp(x, mrec, mpre)) # precision at mAP@0.5
-
- prec_values = np.array(prec_values) if prec_values else np.zeros((1, 1000)) # (nc, 1000)
-
- # Compute F1 (harmonic mean of precision and recall)
- f1_curve = 2 * p_curve * r_curve / (p_curve + r_curve + eps)
- names = {i: names[k] for i, k in enumerate(unique_classes) if k in names} # dict: only classes that have data
- if plot:
- plot_pr_curve(x, prec_values, ap, save_dir / f"{prefix}PR_curve.png", names, on_plot=on_plot)
- plot_mc_curve(x, f1_curve, save_dir / f"{prefix}F1_curve.png", names, ylabel="F1", on_plot=on_plot)
- plot_mc_curve(x, p_curve, save_dir / f"{prefix}P_curve.png", names, ylabel="Precision", on_plot=on_plot)
- plot_mc_curve(x, r_curve, save_dir / f"{prefix}R_curve.png", names, ylabel="Recall", on_plot=on_plot)
-
- i = smooth(f1_curve.mean(0), 0.1).argmax() # max F1 index
- p, r, f1 = p_curve[:, i], r_curve[:, i], f1_curve[:, i] # max-F1 precision, recall, F1 values
- tp = (r * nt).round() # true positives
- fp = (tp / (p + eps) - tp).round() # false positives
- return tp, fp, p, r, f1, ap, unique_classes.astype(int), p_curve, r_curve, f1_curve, x, prec_values
-
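A sketch of calling ap_per_class on synthetic matching results: three detections evaluated at the ten IoU thresholds 0.5:0.95, with made-up class names:

    import numpy as np

    tp = np.array([[True] * 10,               # detection 1: correct at every IoU threshold
                   [True] * 6 + [False] * 4,  # detection 2: correct only at the looser thresholds
                   [False] * 10])             # detection 3: a false positive
    conf = np.array([0.9, 0.8, 0.3])
    pred_cls = np.array([0, 0, 1])
    target_cls = np.array([0, 0, 1])
    out = ap_per_class(tp, conf, pred_cls, target_cls, names={0: "person", 1: "car"})
    tp_c, fp_c, p, r, f1, ap = out[:6]        # ap has shape (classes_with_data, 10)
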
-
-class Metric(SimpleClass):
- """
- Class for computing evaluation metrics for Ultralytics YOLO models.
-
- Attributes:
- p (list): Precision for each class. Shape: (nc,).
- r (list): Recall for each class. Shape: (nc,).
- f1 (list): F1 score for each class. Shape: (nc,).
- all_ap (list): AP scores for all classes and all IoU thresholds. Shape: (nc, 10).
- ap_class_index (list): Index of class for each AP score. Shape: (nc,).
- nc (int): Number of classes.
-
- Methods:
- ap50: AP at IoU threshold of 0.5 for all classes.
- ap: AP at IoU thresholds from 0.5 to 0.95 for all classes.
- mp: Mean precision of all classes.
- mr: Mean recall of all classes.
- map50: Mean AP at IoU threshold of 0.5 for all classes.
- map75: Mean AP at IoU threshold of 0.75 for all classes.
- map: Mean AP at IoU thresholds from 0.5 to 0.95 for all classes.
- mean_results: Mean of results, returns mp, mr, map50, map.
- class_result: Class-aware result, returns p[i], r[i], ap50[i], ap[i].
- maps: mAP of each class.
- fitness: Model fitness as a weighted combination of metrics.
- update: Update metric attributes with new evaluation results.
- curves: Provides a list of curves for accessing specific metrics like precision, recall, F1, etc.
- curves_results: Provide a list of results for accessing specific metrics like precision, recall, F1, etc.
- """
-
- def __init__(self) -> None:
-        """Initialize a Metric instance for computing evaluation metrics for Ultralytics YOLO models."""
- self.p = [] # (nc, )
- self.r = [] # (nc, )
- self.f1 = [] # (nc, )
- self.all_ap = [] # (nc, 10)
- self.ap_class_index = [] # (nc, )
- self.nc = 0
-
- @property
- def ap50(self) -> np.ndarray | list:
- """
- Return the Average Precision (AP) at an IoU threshold of 0.5 for all classes.
-
- Returns:
- (np.ndarray | list): Array of shape (nc,) with AP50 values per class, or an empty list if not available.
- """
- return self.all_ap[:, 0] if len(self.all_ap) else []
-
- @property
- def ap(self) -> np.ndarray | list:
- """
- Return the Average Precision (AP) at an IoU threshold of 0.5-0.95 for all classes.
-
- Returns:
- (np.ndarray | list): Array of shape (nc,) with AP50-95 values per class, or an empty list if not available.
- """
- return self.all_ap.mean(1) if len(self.all_ap) else []
-
- @property
- def mp(self) -> float:
- """
- Return the Mean Precision of all classes.
-
- Returns:
- (float): The mean precision of all classes.
- """
- return self.p.mean() if len(self.p) else 0.0
-
- @property
- def mr(self) -> float:
- """
- Return the Mean Recall of all classes.
-
- Returns:
- (float): The mean recall of all classes.
- """
- return self.r.mean() if len(self.r) else 0.0
-
- @property
- def map50(self) -> float:
- """
- Return the mean Average Precision (mAP) at an IoU threshold of 0.5.
-
- Returns:
- (float): The mAP at an IoU threshold of 0.5.
- """
- return self.all_ap[:, 0].mean() if len(self.all_ap) else 0.0
-
- @property
- def map75(self) -> float:
- """
- Return the mean Average Precision (mAP) at an IoU threshold of 0.75.
-
- Returns:
- (float): The mAP at an IoU threshold of 0.75.
- """
- return self.all_ap[:, 5].mean() if len(self.all_ap) else 0.0
-
- @property
- def map(self) -> float:
- """
- Return the mean Average Precision (mAP) over IoU thresholds of 0.5 - 0.95 in steps of 0.05.
-
- Returns:
- (float): The mAP over IoU thresholds of 0.5 - 0.95 in steps of 0.05.
- """
- return self.all_ap.mean() if len(self.all_ap) else 0.0
-
- def mean_results(self) -> list[float]:
- """Return mean of results, mp, mr, map50, map."""
- return [self.mp, self.mr, self.map50, self.map]
-
- def class_result(self, i: int) -> tuple[float, float, float, float]:
- """Return class-aware result, p[i], r[i], ap50[i], ap[i]."""
- return self.p[i], self.r[i], self.ap50[i], self.ap[i]
-
- @property
- def maps(self) -> np.ndarray:
- """Return mAP of each class."""
- maps = np.zeros(self.nc) + self.map
- for i, c in enumerate(self.ap_class_index):
- maps[c] = self.ap[i]
- return maps
-
- def fitness(self) -> float:
- """Return model fitness as a weighted combination of metrics."""
- w = [0.0, 0.0, 0.0, 1.0] # weights for [P, R, mAP@0.5, mAP@0.5:0.95]
- return (np.nan_to_num(np.array(self.mean_results())) * w).sum()
-
- def update(self, results: tuple):
- """
- Update the evaluation metrics with a new set of results.
-
- Args:
- results (tuple): A tuple containing evaluation metrics:
- - p (list): Precision for each class.
- - r (list): Recall for each class.
- - f1 (list): F1 score for each class.
- - all_ap (list): AP scores for all classes and all IoU thresholds.
- - ap_class_index (list): Index of class for each AP score.
- - p_curve (list): Precision curve for each class.
- - r_curve (list): Recall curve for each class.
- - f1_curve (list): F1 curve for each class.
- - px (list): X values for the curves.
- - prec_values (list): Precision values for each class.
- """
- (
- self.p,
- self.r,
- self.f1,
- self.all_ap,
- self.ap_class_index,
- self.p_curve,
- self.r_curve,
- self.f1_curve,
- self.px,
- self.prec_values,
- ) = results
-
- @property
- def curves(self) -> list:
- """Return a list of curves for accessing specific metrics curves."""
- return []
-
- @property
- def curves_results(self) -> list[list]:
- """Return a list of curves for accessing specific metrics curves."""
- return [
- [self.px, self.prec_values, "Recall", "Precision"],
- [self.px, self.f1_curve, "Confidence", "F1"],
- [self.px, self.p_curve, "Confidence", "Precision"],
- [self.px, self.r_curve, "Confidence", "Recall"],
- ]
-
-
-class DetMetrics(SimpleClass, DataExportMixin):
- """
- Utility class for computing detection metrics such as precision, recall, and mean average precision (mAP).
-
- Attributes:
- names (dict[int, str]): A dictionary of class names.
- box (Metric): An instance of the Metric class for storing detection results.
- speed (dict[str, float]): A dictionary for storing execution times of different parts of the detection process.
- task (str): The task type, set to 'detect'.
- stats (dict[str, list]): A dictionary containing lists for true positives, confidence scores, predicted classes, target classes, and target images.
- nt_per_class: Number of targets per class.
- nt_per_image: Number of targets per image.
-
- Methods:
- update_stats: Update statistics by appending new values to existing stat collections.
- process: Process predicted results for object detection and update metrics.
- clear_stats: Clear the stored statistics.
- keys: Return a list of keys for accessing specific metrics.
-        mean_results: Return the mean precision, recall, mAP50, and mAP50-95 over all classes.
- class_result: Return the result of evaluating the performance of an object detection model on a specific class.
- maps: Return mean Average Precision (mAP) scores per class.
- fitness: Return the fitness of box object.
- ap_class_index: Return the average precision index per class.
- results_dict: Return dictionary of computed performance metrics and statistics.
- curves: Return a list of curves for accessing specific metrics curves.
- curves_results: Return a list of computed performance metrics and statistics.
- summary: Generate a summarized representation of per-class detection metrics as a list of dictionaries.
- """
-
- def __init__(self, names: dict[int, str] = {}) -> None:
- """
-        Initialize a DetMetrics instance with class names.
-
- Args:
- names (dict[int, str], optional): Dictionary of class names.
- """
- self.names = names
- self.box = Metric()
- self.speed = {"preprocess": 0.0, "inference": 0.0, "loss": 0.0, "postprocess": 0.0}
- self.task = "detect"
- self.stats = dict(tp=[], conf=[], pred_cls=[], target_cls=[], target_img=[])
- self.nt_per_class = None
- self.nt_per_image = None
-
- def update_stats(self, stat: dict[str, Any]) -> None:
- """
- Update statistics by appending new values to existing stat collections.
-
- Args:
- stat (dict[str, any]): Dictionary containing new statistical values to append.
- Keys should match existing keys in self.stats.
- """
- for k in self.stats.keys():
- self.stats[k].append(stat[k])
-
- def process(self, save_dir: Path = Path("."), plot: bool = False, on_plot=None) -> dict[str, np.ndarray]:
- """
- Process predicted results for object detection and update metrics.
-
- Args:
- save_dir (Path): Directory to save plots. Defaults to Path(".").
- plot (bool): Whether to plot precision-recall curves. Defaults to False.
- on_plot (callable, optional): Function to call after plots are generated. Defaults to None.
-
- Returns:
- (dict[str, np.ndarray]): Dictionary containing concatenated statistics arrays.
- """
- stats = {k: np.concatenate(v, 0) for k, v in self.stats.items()} # to numpy
- if not stats:
- return stats
- results = ap_per_class(
- stats["tp"],
- stats["conf"],
- stats["pred_cls"],
- stats["target_cls"],
- plot=plot,
- save_dir=save_dir,
- names=self.names,
- on_plot=on_plot,
- prefix="Box",
- )[2:]
- self.box.nc = len(self.names)
- self.box.update(results)
- self.nt_per_class = np.bincount(stats["target_cls"].astype(int), minlength=len(self.names))
- self.nt_per_image = np.bincount(stats["target_img"].astype(int), minlength=len(self.names))
- return stats
-
- def clear_stats(self):
- """Clear the stored statistics."""
- for v in self.stats.values():
- v.clear()
-
- @property
- def keys(self) -> list[str]:
- """Return a list of keys for accessing specific metrics."""
- return ["metrics/precision(B)", "metrics/recall(B)", "metrics/mAP50(B)", "metrics/mAP50-95(B)"]
-
- def mean_results(self) -> list[float]:
-        """Return the mean precision, recall, mAP50, and mAP50-95 over all classes."""
- return self.box.mean_results()
-
- def class_result(self, i: int) -> tuple[float, float, float, float]:
- """Return the result of evaluating the performance of an object detection model on a specific class."""
- return self.box.class_result(i)
-
- @property
- def maps(self) -> np.ndarray:
- """Return mean Average Precision (mAP) scores per class."""
- return self.box.maps
-
- @property
- def fitness(self) -> float:
- """Return the fitness of box object."""
- return self.box.fitness()
-
- @property
- def ap_class_index(self) -> list:
- """Return the average precision index per class."""
- return self.box.ap_class_index
-
- @property
- def results_dict(self) -> dict[str, float]:
- """Return dictionary of computed performance metrics and statistics."""
- keys = self.keys + ["fitness"]
- values = ((float(x) if hasattr(x, "item") else x) for x in (self.mean_results() + [self.fitness]))
- return dict(zip(keys, values))
-
- @property
- def curves(self) -> list[str]:
- """Return a list of curves for accessing specific metrics curves."""
- return ["Precision-Recall(B)", "F1-Confidence(B)", "Precision-Confidence(B)", "Recall-Confidence(B)"]
-
- @property
- def curves_results(self) -> list[list]:
- """Return a list of computed performance metrics and statistics."""
- return self.box.curves_results
-
- def summary(self, normalize: bool = True, decimals: int = 5) -> list[dict[str, Any]]:
- """
- Generate a summarized representation of per-class detection metrics as a list of dictionaries. Includes shared
- scalar metrics (mAP, mAP50, mAP75) alongside precision, recall, and F1-score for each class.
-
- Args:
- normalize (bool): For Detect metrics, everything is normalized by default [0-1].
- decimals (int): Number of decimal places to round the metrics values to.
-
- Returns:
- (list[dict[str, Any]]): A list of dictionaries, each representing one class with corresponding metric values.
-
- Examples:
- >>> results = model.val(data="coco8.yaml")
- >>> detection_summary = results.summary()
- >>> print(detection_summary)
- """
- per_class = {
- "Box-P": self.box.p,
- "Box-R": self.box.r,
- "Box-F1": self.box.f1,
- }
- return [
- {
- "Class": self.names[self.ap_class_index[i]],
- "Images": self.nt_per_image[self.ap_class_index[i]],
- "Instances": self.nt_per_class[self.ap_class_index[i]],
- **{k: round(v[i], decimals) for k, v in per_class.items()},
- "mAP50": round(self.class_result(i)[2], decimals),
- "mAP50-95": round(self.class_result(i)[3], decimals),
- }
- for i in range(len(per_class["Box-P"]))
- ]
-
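A minimal sketch of feeding DetMetrics directly; in the real pipeline the validator accumulates these per-batch stats, and the class names below are made up:

    import numpy as np

    dm = DetMetrics(names={0: "person", 1: "car"})
    dm.update_stats({
        "tp": np.array([[True] * 10, [False] * 10]),  # (num_detections, num_iou_thresholds)
        "conf": np.array([0.9, 0.4]),
        "pred_cls": np.array([0, 1]),
        "target_cls": np.array([0, 1]),
        "target_img": np.array([0, 1]),               # classes present in each image
    })
    dm.process(plot=False)
    print(dm.results_dict)  # precision/recall/mAP50/mAP50-95 plus the fitness score
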
-
-class SegmentMetrics(DetMetrics):
- """
- Calculate and aggregate detection and segmentation metrics over a given set of classes.
-
- Attributes:
- names (dict[int, str]): Dictionary of class names.
- box (Metric): An instance of the Metric class for storing detection results.
- seg (Metric): An instance of the Metric class to calculate mask segmentation metrics.
- speed (dict[str, float]): A dictionary for storing execution times of different parts of the detection process.
- task (str): The task type, set to 'segment'.
- stats (dict[str, list]): A dictionary containing lists for true positives, confidence scores, predicted classes, target classes, and target images.
- nt_per_class: Number of targets per class.
- nt_per_image: Number of targets per image.
-
- Methods:
- process: Process the detection and segmentation metrics over the given set of predictions.
- keys: Return a list of keys for accessing metrics.
- mean_results: Return the mean metrics for bounding box and segmentation results.
- class_result: Return classification results for a specified class index.
- maps: Return mAP scores for object detection and semantic segmentation models.
- fitness: Return the fitness score for both segmentation and bounding box models.
- curves: Return a list of curves for accessing specific metrics curves.
- curves_results: Provide a list of computed performance metrics and statistics.
- summary: Generate a summarized representation of per-class segmentation metrics as a list of dictionaries.
- """
-
- def __init__(self, names: dict[int, str] = {}) -> None:
- """
-        Initialize a SegmentMetrics instance with class names.
-
- Args:
- names (dict[int, str], optional): Dictionary of class names.
- """
- DetMetrics.__init__(self, names)
- self.seg = Metric()
- self.task = "segment"
- self.stats["tp_m"] = [] # add additional stats for masks
-
- def process(self, save_dir: Path = Path("."), plot: bool = False, on_plot=None) -> dict[str, np.ndarray]:
- """
- Process the detection and segmentation metrics over the given set of predictions.
-
- Args:
- save_dir (Path): Directory to save plots. Defaults to Path(".").
- plot (bool): Whether to plot precision-recall curves. Defaults to False.
- on_plot (callable, optional): Function to call after plots are generated. Defaults to None.
-
- Returns:
- (dict[str, np.ndarray]): Dictionary containing concatenated statistics arrays.
- """
- stats = DetMetrics.process(self, save_dir, plot, on_plot=on_plot) # process box stats
- results_mask = ap_per_class(
- stats["tp_m"],
- stats["conf"],
- stats["pred_cls"],
- stats["target_cls"],
- plot=plot,
- on_plot=on_plot,
- save_dir=save_dir,
- names=self.names,
- prefix="Mask",
- )[2:]
- self.seg.nc = len(self.names)
- self.seg.update(results_mask)
- return stats
-
- @property
- def keys(self) -> list[str]:
- """Return a list of keys for accessing metrics."""
- return DetMetrics.keys.fget(self) + [
- "metrics/precision(M)",
- "metrics/recall(M)",
- "metrics/mAP50(M)",
- "metrics/mAP50-95(M)",
- ]
-
- def mean_results(self) -> list[float]:
- """Return the mean metrics for bounding box and segmentation results."""
- return DetMetrics.mean_results(self) + self.seg.mean_results()
-
- def class_result(self, i: int) -> list[float]:
- """Return classification results for a specified class index."""
- return DetMetrics.class_result(self, i) + self.seg.class_result(i)
-
- @property
- def maps(self) -> np.ndarray:
- """Return mAP scores for object detection and semantic segmentation models."""
- return DetMetrics.maps.fget(self) + self.seg.maps
-
- @property
- def fitness(self) -> float:
- """Return the fitness score for both segmentation and bounding box models."""
- return self.seg.fitness() + DetMetrics.fitness.fget(self)
-
- @property
- def curves(self) -> list[str]:
- """Return a list of curves for accessing specific metrics curves."""
- return DetMetrics.curves.fget(self) + [
- "Precision-Recall(M)",
- "F1-Confidence(M)",
- "Precision-Confidence(M)",
- "Recall-Confidence(M)",
- ]
-
- @property
- def curves_results(self) -> list[list]:
- """Return a list of computed performance metrics and statistics."""
- return DetMetrics.curves_results.fget(self) + self.seg.curves_results
-
- def summary(self, normalize: bool = True, decimals: int = 5) -> list[dict[str, Any]]:
- """
- Generate a summarized representation of per-class segmentation metrics as a list of dictionaries. Includes both
- box and mask scalar metrics (mAP, mAP50, mAP75) alongside precision, recall, and F1-score for each class.
-
- Args:
- normalize (bool): For Segment metrics, everything is normalized by default [0-1].
- decimals (int): Number of decimal places to round the metrics values to.
-
- Returns:
- (list[dict[str, Any]]): A list of dictionaries, each representing one class with corresponding metric values.
-
- Examples:
- >>> results = model.val(data="coco8-seg.yaml")
- >>> seg_summary = results.summary(decimals=4)
- >>> print(seg_summary)
- """
- per_class = {
- "Mask-P": self.seg.p,
- "Mask-R": self.seg.r,
- "Mask-F1": self.seg.f1,
- }
- summary = DetMetrics.summary(self, normalize, decimals) # get box summary
- for i, s in enumerate(summary):
- s.update({**{k: round(v[i], decimals) for k, v in per_class.items()}})
- return summary
-
-
-class PoseMetrics(DetMetrics):
- """
- Calculate and aggregate detection and pose metrics over a given set of classes.
-
- Attributes:
- names (dict[int, str]): Dictionary of class names.
- pose (Metric): An instance of the Metric class to calculate pose metrics.
- box (Metric): An instance of the Metric class for storing detection results.
- speed (dict[str, float]): A dictionary for storing execution times of different parts of the detection process.
- task (str): The task type, set to 'pose'.
- stats (dict[str, list]): A dictionary containing lists for true positives, confidence scores, predicted classes, target classes, and target images.
- nt_per_class: Number of targets per class.
- nt_per_image: Number of targets per image.
-
- Methods:
-        process: Process the detection and pose metrics over the given set of predictions.
- keys: Return a list of keys for accessing metrics.
- mean_results: Return the mean results of box and pose.
- class_result: Return the class-wise detection results for a specific class i.
- maps: Return the mean average precision (mAP) per class for both box and pose detections.
- fitness: Return combined fitness score for pose and box detection.
- curves: Return a list of curves for accessing specific metrics curves.
- curves_results: Provide a list of computed performance metrics and statistics.
- summary: Generate a summarized representation of per-class pose metrics as a list of dictionaries.
- """
-
- def __init__(self, names: dict[int, str] = {}) -> None:
- """
-        Initialize a PoseMetrics instance with class names.
-
- Args:
- names (dict[int, str], optional): Dictionary of class names.
- """
- super().__init__(names)
- self.pose = Metric()
- self.task = "pose"
- self.stats["tp_p"] = [] # add additional stats for pose
-
- def process(self, save_dir: Path = Path("."), plot: bool = False, on_plot=None) -> dict[str, np.ndarray]:
- """
- Process the detection and pose metrics over the given set of predictions.
-
- Args:
- save_dir (Path): Directory to save plots. Defaults to Path(".").
- plot (bool): Whether to plot precision-recall curves. Defaults to False.
- on_plot (callable, optional): Function to call after plots are generated.
-
- Returns:
- (dict[str, np.ndarray]): Dictionary containing concatenated statistics arrays.
- """
- stats = DetMetrics.process(self, save_dir, plot, on_plot=on_plot) # process box stats
- results_pose = ap_per_class(
- stats["tp_p"],
- stats["conf"],
- stats["pred_cls"],
- stats["target_cls"],
- plot=plot,
- on_plot=on_plot,
- save_dir=save_dir,
- names=self.names,
- prefix="Pose",
- )[2:]
- self.pose.nc = len(self.names)
- self.pose.update(results_pose)
- return stats
-
- @property
- def keys(self) -> list[str]:
- """Return a list of evaluation metric keys."""
- return DetMetrics.keys.fget(self) + [
- "metrics/precision(P)",
- "metrics/recall(P)",
- "metrics/mAP50(P)",
- "metrics/mAP50-95(P)",
- ]
-
- def mean_results(self) -> list[float]:
- """Return the mean results of box and pose."""
- return DetMetrics.mean_results(self) + self.pose.mean_results()
-
- def class_result(self, i: int) -> list[float]:
- """Return the class-wise detection results for a specific class i."""
- return DetMetrics.class_result(self, i) + self.pose.class_result(i)
-
- @property
- def maps(self) -> np.ndarray:
- """Return the mean average precision (mAP) per class for both box and pose detections."""
- return DetMetrics.maps.fget(self) + self.pose.maps
-
- @property
- def fitness(self) -> float:
- """Return combined fitness score for pose and box detection."""
- return self.pose.fitness() + DetMetrics.fitness.fget(self)
-
- @property
- def curves(self) -> list[str]:
- """Return a list of curves for accessing specific metrics curves."""
- return DetMetrics.curves.fget(self) + [
- "Precision-Recall(P)",
- "F1-Confidence(P)",
- "Precision-Confidence(P)",
- "Recall-Confidence(P)",
- ]
-
- @property
- def curves_results(self) -> list[list]:
- """Return a list of computed performance metrics and statistics."""
- return DetMetrics.curves_results.fget(self) + self.pose.curves_results
-
- def summary(self, normalize: bool = True, decimals: int = 5) -> list[dict[str, Any]]:
- """
- Generate a summarized representation of per-class pose metrics as a list of dictionaries. Includes both box and
- pose scalar metrics (mAP, mAP50, mAP75) alongside precision, recall, and F1-score for each class.
-
- Args:
- normalize (bool): For Pose metrics, everything is normalized by default [0-1].
- decimals (int): Number of decimal places to round the metrics values to.
-
- Returns:
- (list[dict[str, Any]]): A list of dictionaries, each representing one class with corresponding metric values.
-
- Examples:
- >>> results = model.val(data="coco8-pose.yaml")
- >>> pose_summary = results.summary(decimals=4)
- >>> print(pose_summary)
- """
- per_class = {
- "Pose-P": self.pose.p,
- "Pose-R": self.pose.r,
- "Pose-F1": self.pose.f1,
- }
- summary = DetMetrics.summary(self, normalize, decimals) # get box summary
- for i, s in enumerate(summary):
- s.update({**{k: round(v[i], decimals) for k, v in per_class.items()}})
- return summary
-
-
-class ClassifyMetrics(SimpleClass, DataExportMixin):
- """
- Class for computing classification metrics including top-1 and top-5 accuracy.
-
- Attributes:
- top1 (float): The top-1 accuracy.
- top5 (float): The top-5 accuracy.
- speed (dict): A dictionary containing the time taken for each step in the pipeline.
- task (str): The task type, set to 'classify'.
-
- Methods:
- process: Process target classes and predicted classes to compute metrics.
- fitness: Return mean of top-1 and top-5 accuracies as fitness score.
- results_dict: Return a dictionary with model's performance metrics and fitness score.
- keys: Return a list of keys for the results_dict property.
- curves: Return a list of curves for accessing specific metrics curves.
- curves_results: Provide a list of computed performance metrics and statistics.
- summary: Generate a single-row summary of classification metrics (Top-1 and Top-5 accuracy).
- """
-
- def __init__(self) -> None:
- """Initialize a ClassifyMetrics instance."""
- self.top1 = 0
- self.top5 = 0
- self.speed = {"preprocess": 0.0, "inference": 0.0, "loss": 0.0, "postprocess": 0.0}
- self.task = "classify"
-
- def process(self, targets: torch.Tensor, pred: torch.Tensor):
- """
- Process target classes and predicted classes to compute metrics.
-
- Args:
- targets (torch.Tensor): Target classes.
- pred (torch.Tensor): Predicted classes.
- """
- pred, targets = torch.cat(pred), torch.cat(targets)
- correct = (targets[:, None] == pred).float()
- acc = torch.stack((correct[:, 0], correct.max(1).values), dim=1) # (top1, top5) accuracy
- self.top1, self.top5 = acc.mean(0).tolist()
-
- @property
- def fitness(self) -> float:
- """Return mean of top-1 and top-5 accuracies as fitness score."""
- return (self.top1 + self.top5) / 2
-
- @property
- def results_dict(self) -> dict[str, float]:
- """Return a dictionary with model's performance metrics and fitness score."""
- return dict(zip(self.keys + ["fitness"], [self.top1, self.top5, self.fitness]))
-
- @property
- def keys(self) -> list[str]:
- """Return a list of keys for the results_dict property."""
- return ["metrics/accuracy_top1", "metrics/accuracy_top5"]
-
- @property
- def curves(self) -> list:
- """Return a list of curves for accessing specific metrics curves."""
- return []
-
- @property
- def curves_results(self) -> list:
- """Return a list of curves for accessing specific metrics curves."""
- return []
-
- def summary(self, normalize: bool = True, decimals: int = 5) -> list[dict[str, float]]:
- """
- Generate a single-row summary of classification metrics (Top-1 and Top-5 accuracy).
-
- Args:
- normalize (bool): For Classify metrics, everything is normalized by default [0-1].
- decimals (int): Number of decimal places to round the metrics values to.
-
- Returns:
- (list[dict[str, float]]): A list with one dictionary containing Top-1 and Top-5 classification accuracy.
-
- Examples:
- >>> results = model.val(data="imagenet10")
- >>> classify_summary = results.summary(decimals=4)
- >>> print(classify_summary)
- """
- return [{"top1_acc": round(self.top1, decimals), "top5_acc": round(self.top5, decimals)}]
-
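A short sketch for ClassifyMetrics; the tensors mimic the per-batch lists of targets and top-5 predictions that the classification validator collects:

    import torch

    targets = [torch.tensor([0, 1, 2])]     # per-batch ground-truth class indices
    pred = [torch.tensor([[0, 2, 1, 3, 4],  # per-batch top-5 predictions, best guess first
                          [2, 1, 0, 3, 4],
                          [2, 0, 1, 3, 4]])]
    clf = ClassifyMetrics()
    clf.process(targets, pred)
    print(clf.top1, clf.top5, clf.fitness)  # top-1 is 2/3 here, top-5 is 1.0
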
-
-class OBBMetrics(DetMetrics):
- """
- Metrics for evaluating oriented bounding box (OBB) detection.
-
- Attributes:
- names (dict[int, str]): Dictionary of class names.
- box (Metric): An instance of the Metric class for storing detection results.
- speed (dict[str, float]): A dictionary for storing execution times of different parts of the detection process.
- task (str): The task type, set to 'obb'.
- stats (dict[str, list]): A dictionary containing lists for true positives, confidence scores, predicted classes, target classes, and target images.
- nt_per_class: Number of targets per class.
- nt_per_image: Number of targets per image.
-
- References:
- https://arxiv.org/pdf/2106.06072.pdf
- """
-
- def __init__(self, names: dict[int, str] = {}) -> None:
- """
-        Initialize an OBBMetrics instance with class names.
-
- Args:
- names (dict[int, str], optional): Dictionary of class names.
- """
- DetMetrics.__init__(self, names)
- # TODO: probably remove task as well
- self.task = "obb"
diff --git a/ultralytics/utils/nms.py b/ultralytics/utils/nms.py
deleted file mode 100644
index b638640..0000000
--- a/ultralytics/utils/nms.py
+++ /dev/null
@@ -1,340 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-import sys
-import time
-
-import torch
-
-from ultralytics.utils import LOGGER
-from ultralytics.utils.metrics import batch_probiou, box_iou
-from ultralytics.utils.ops import xywh2xyxy
-
-
-def non_max_suppression(
- prediction,
- conf_thres: float = 0.25,
- iou_thres: float = 0.45,
- classes=None,
- agnostic: bool = False,
- multi_label: bool = False,
- labels=(),
- max_det: int = 300,
- nc: int = 0, # number of classes (optional)
- max_time_img: float = 0.05,
- max_nms: int = 30000,
- max_wh: int = 7680,
- rotated: bool = False,
- end2end: bool = False,
- return_idxs: bool = False,
-):
- """
- Perform non-maximum suppression (NMS) on prediction results.
-
- Applies NMS to filter overlapping bounding boxes based on confidence and IoU thresholds. Supports multiple
- detection formats including standard boxes, rotated boxes, and masks.
-
- Args:
- prediction (torch.Tensor): Predictions with shape (batch_size, num_classes + 4 + num_masks, num_boxes)
- containing boxes, classes, and optional masks.
- conf_thres (float): Confidence threshold for filtering detections. Valid values are between 0.0 and 1.0.
- iou_thres (float): IoU threshold for NMS filtering. Valid values are between 0.0 and 1.0.
- classes (list[int], optional): List of class indices to consider. If None, all classes are considered.
- agnostic (bool): Whether to perform class-agnostic NMS.
- multi_label (bool): Whether each box can have multiple labels.
- labels (list[list[Union[int, float, torch.Tensor]]]): A priori labels for each image.
- max_det (int): Maximum number of detections to keep per image.
- nc (int): Number of classes. Indices after this are considered masks.
- max_time_img (float): Maximum time in seconds for processing one image.
- max_nms (int): Maximum number of boxes for NMS.
- max_wh (int): Maximum box width and height in pixels.
- rotated (bool): Whether to handle Oriented Bounding Boxes (OBB).
- end2end (bool): Whether the model is end-to-end and doesn't require NMS.
- return_idxs (bool): Whether to return the indices of kept detections.
-
- Returns:
- output (list[torch.Tensor]): List of detections per image with shape (num_boxes, 6 + num_masks)
- containing (x1, y1, x2, y2, confidence, class, mask1, mask2, ...).
- keepi (list[torch.Tensor]): Indices of kept detections if return_idxs=True.
- """
- # Checks
- assert 0 <= conf_thres <= 1, f"Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0"
- assert 0 <= iou_thres <= 1, f"Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0"
- if isinstance(prediction, (list, tuple)):  # YOLOv8 model in validation mode, output = (inference_out, loss_out)
- prediction = prediction[0] # select only inference output
- if classes is not None:
- classes = torch.tensor(classes, device=prediction.device)
-
- if prediction.shape[-1] == 6 or end2end: # end-to-end model (BNC, i.e. 1,300,6)
- output = [pred[pred[:, 4] > conf_thres][:max_det] for pred in prediction]
- if classes is not None:
- output = [pred[(pred[:, 5:6] == classes).any(1)] for pred in output]
- return output
-
- bs = prediction.shape[0] # batch size (BCN, i.e. 1,84,6300)
- nc = nc or (prediction.shape[1] - 4) # number of classes
- extra = prediction.shape[1] - nc - 4 # number of extra info
- mi = 4 + nc # mask start index
- xc = prediction[:, 4:mi].amax(1) > conf_thres # candidates
- xinds = torch.arange(prediction.shape[-1], device=prediction.device).expand(bs, -1)[..., None] # to track idxs
-
- # Settings
- # min_wh = 2 # (pixels) minimum box width and height
- time_limit = 2.0 + max_time_img * bs # seconds to quit after
- multi_label &= nc > 1 # multiple labels per box (adds 0.5ms/img)
-
- prediction = prediction.transpose(-1, -2) # shape(1,84,6300) to shape(1,6300,84)
- if not rotated:
- prediction[..., :4] = xywh2xyxy(prediction[..., :4]) # xywh to xyxy
-
- t = time.time()
- output = [torch.zeros((0, 6 + extra), device=prediction.device)] * bs
- keepi = [torch.zeros((0, 1), device=prediction.device)] * bs # to store the kept idxs
- for xi, (x, xk) in enumerate(zip(prediction, xinds)): # image index, (preds, preds indices)
- # Apply constraints
- # x[((x[:, 2:4] < min_wh) | (x[:, 2:4] > max_wh)).any(1), 4] = 0 # width-height
- filt = xc[xi] # confidence
- x = x[filt]
- if return_idxs:
- xk = xk[filt]
-
- # Cat apriori labels if autolabelling
- if labels and len(labels[xi]) and not rotated:
- lb = labels[xi]
- v = torch.zeros((len(lb), nc + extra + 4), device=x.device)
- v[:, :4] = xywh2xyxy(lb[:, 1:5]) # box
- v[range(len(lb)), lb[:, 0].long() + 4] = 1.0 # cls
- x = torch.cat((x, v), 0)
-
- # If none remain process next image
- if not x.shape[0]:
- continue
-
- # Detections matrix nx6 (xyxy, conf, cls)
- box, cls, mask = x.split((4, nc, extra), 1)
-
- if multi_label:
- i, j = torch.where(cls > conf_thres)
- x = torch.cat((box[i], x[i, 4 + j, None], j[:, None].float(), mask[i]), 1)
- if return_idxs:
- xk = xk[i]
- else: # best class only
- conf, j = cls.max(1, keepdim=True)
- filt = conf.view(-1) > conf_thres
- x = torch.cat((box, conf, j.float(), mask), 1)[filt]
- if return_idxs:
- xk = xk[filt]
-
- # Filter by class
- if classes is not None:
- filt = (x[:, 5:6] == classes).any(1)
- x = x[filt]
- if return_idxs:
- xk = xk[filt]
-
- # Check shape
- n = x.shape[0] # number of boxes
- if not n: # no boxes
- continue
- if n > max_nms: # excess boxes
- filt = x[:, 4].argsort(descending=True)[:max_nms] # sort by confidence and remove excess boxes
- x = x[filt]
- if return_idxs:
- xk = xk[filt]
-
- c = x[:, 5:6] * (0 if agnostic else max_wh) # classes
- scores = x[:, 4] # scores
- if rotated:
- boxes = torch.cat((x[:, :2] + c, x[:, 2:4], x[:, -1:]), dim=-1) # xywhr
- i = TorchNMS.fast_nms(boxes, scores, iou_thres, iou_func=batch_probiou)
- else:
- boxes = x[:, :4] + c # boxes (offset by class)
- # Speed strategy: torchvision for val or already loaded (faster), TorchNMS for predict (lower latency)
- if "torchvision" in sys.modules:
- import torchvision # scope as slow import
-
- i = torchvision.ops.nms(boxes, scores, iou_thres)
- else:
- i = TorchNMS.nms(boxes, scores, iou_thres)
- i = i[:max_det] # limit detections
-
- output[xi] = x[i]
- if return_idxs:
- keepi[xi] = xk[i].view(-1)
- if (time.time() - t) > time_limit:
- LOGGER.warning(f"NMS time limit {time_limit:.3f}s exceeded")
- break # time limit exceeded
-
- return (output, keepi) if return_idxs else output
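
The per-class offset applied above (adding class_index * max_wh to the box coordinates before a single NMS call) is what keeps boxes of different classes from suppressing each other. A minimal sketch of that trick, assuming only torch and torchvision are available:

import torch
import torchvision

# Two heavily overlapping boxes of different classes, plus one distant box (xyxy format)
boxes = torch.tensor([[0.0, 0.0, 10.0, 10.0], [1.0, 1.0, 10.0, 10.0], [50.0, 50.0, 60.0, 60.0]])
scores = torch.tensor([0.9, 0.8, 0.7])
classes = torch.tensor([0, 1, 0])
max_wh = 7680  # larger than any expected image dimension, so class offsets never overlap

agnostic = torchvision.ops.nms(boxes, scores, iou_threshold=0.5)  # drops the overlapping second box
class_aware = torchvision.ops.nms(boxes + classes[:, None] * max_wh, scores, iou_threshold=0.5)
print(agnostic.tolist(), class_aware.tolist())  # [0, 2] [0, 1, 2]
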
-
-
-class TorchNMS:
- """
- Ultralytics custom NMS implementation optimized for YOLO.
-
- This class provides static methods for performing non-maximum suppression (NMS) operations on bounding boxes,
- including both standard NMS and batched NMS for multi-class scenarios.
-
- Methods:
- nms: Optimized NMS with early termination that matches torchvision behavior exactly.
- batched_nms: Batched NMS for class-aware suppression.
-
- Examples:
- Perform standard NMS on boxes and scores
- >>> boxes = torch.tensor([[0, 0, 10, 10], [5, 5, 15, 15]])
- >>> scores = torch.tensor([0.9, 0.8])
- >>> keep = TorchNMS.nms(boxes, scores, 0.5)
- """
-
- @staticmethod
- def fast_nms(
- boxes: torch.Tensor,
- scores: torch.Tensor,
- iou_threshold: float,
- use_triu: bool = True,
- iou_func=box_iou,
- exit_early: bool = True,
- ) -> torch.Tensor:
- """
- Fast-NMS implementation from https://arxiv.org/pdf/1904.02689 using upper triangular matrix operations.
-
- Args:
- boxes (torch.Tensor): Bounding boxes with shape (N, 4) in xyxy format.
- scores (torch.Tensor): Confidence scores with shape (N,).
- iou_threshold (float): IoU threshold for suppression.
- use_triu (bool): Whether to use torch.triu operator for upper triangular matrix operations.
- iou_func (callable): Function to compute IoU between boxes.
- exit_early (bool): Whether to exit early if there are no boxes.
-
- Returns:
- (torch.Tensor): Indices of boxes to keep after NMS.
-
- Examples:
- Apply Fast-NMS to a set of boxes
- >>> boxes = torch.tensor([[0, 0, 10, 10], [5, 5, 15, 15]])
- >>> scores = torch.tensor([0.9, 0.8])
- >>> keep = TorchNMS.fast_nms(boxes, scores, 0.5)
- """
- if boxes.numel() == 0 and exit_early:
- return torch.empty((0,), dtype=torch.int64, device=boxes.device)
-
- sorted_idx = torch.argsort(scores, descending=True)
- boxes = boxes[sorted_idx]
- ious = iou_func(boxes, boxes)
- if use_triu:
- ious = ious.triu_(diagonal=1)
- # NOTE: avoid branching on the number of boxes here so the graph remains exportable
- pick = torch.nonzero((ious >= iou_threshold).sum(0) <= 0).squeeze_(-1)
- else:
- n = boxes.shape[0]
- row_idx = torch.arange(n, device=boxes.device).view(-1, 1).expand(-1, n)
- col_idx = torch.arange(n, device=boxes.device).view(1, -1).expand(n, -1)
- upper_mask = row_idx < col_idx
- ious = ious * upper_mask
- # Zeroing these scores ensures the additional indices would not affect the final results
- scores[~((ious >= iou_threshold).sum(0) <= 0)] = 0
- # NOTE: return indices with fixed length to avoid TFLite reshape error
- pick = torch.topk(scores, scores.shape[0]).indices
- return sorted_idx[pick]
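
The use_triu branch above encodes greedy suppression as a matrix test: after sorting by score, a box survives only if no higher-scoring box overlaps it above the threshold. A small self-contained sketch of that formulation, with torchvision's box_iou standing in as the IoU function purely for illustration:

import torch
from torchvision.ops import box_iou

boxes = torch.tensor([[0.0, 0.0, 10.0, 10.0], [1.0, 1.0, 10.0, 10.0], [50.0, 50.0, 60.0, 60.0]])
scores = torch.tensor([0.9, 0.8, 0.7])
iou_threshold = 0.5

order = scores.argsort(descending=True)                        # highest score first
ious = box_iou(boxes[order], boxes[order]).triu_(diagonal=1)   # keep only IoU against higher-scoring boxes
keep = (ious >= iou_threshold).sum(0) == 0                     # suppressed if any such IoU is too high
print(order[keep].tolist())  # [0, 2]
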
-
- @staticmethod
- def nms(boxes: torch.Tensor, scores: torch.Tensor, iou_threshold: float) -> torch.Tensor:
- """
- Optimized NMS with early termination that matches torchvision behavior exactly.
-
- Args:
- boxes (torch.Tensor): Bounding boxes with shape (N, 4) in xyxy format.
- scores (torch.Tensor): Confidence scores with shape (N,).
- iou_threshold (float): IoU threshold for suppression.
-
- Returns:
- (torch.Tensor): Indices of boxes to keep after NMS.
-
- Examples:
- Apply NMS to a set of boxes
- >>> boxes = torch.tensor([[0, 0, 10, 10], [5, 5, 15, 15]])
- >>> scores = torch.tensor([0.9, 0.8])
- >>> keep = TorchNMS.nms(boxes, scores, 0.5)
- """
- if boxes.numel() == 0:
- return torch.empty((0,), dtype=torch.int64, device=boxes.device)
-
- # Pre-allocate and extract coordinates once
- x1, y1, x2, y2 = boxes.unbind(1)
- areas = (x2 - x1) * (y2 - y1)
-
- # Sort by scores descending
- order = scores.argsort(0, descending=True)
-
- # Pre-allocate keep list with maximum possible size
- keep = torch.zeros(order.numel(), dtype=torch.int64, device=boxes.device)
- keep_idx = 0
- while order.numel() > 0:
- i = order[0]
- keep[keep_idx] = i
- keep_idx += 1
-
- if order.numel() == 1:
- break
- # Vectorized IoU calculation for remaining boxes
- rest = order[1:]
- xx1 = torch.maximum(x1[i], x1[rest])
- yy1 = torch.maximum(y1[i], y1[rest])
- xx2 = torch.minimum(x2[i], x2[rest])
- yy2 = torch.minimum(y2[i], y2[rest])
-
- # Fast intersection and IoU
- w = (xx2 - xx1).clamp_(min=0)
- h = (yy2 - yy1).clamp_(min=0)
- inter = w * h
- # Early exit: skip IoU calculation if no intersection
- if inter.sum() == 0:
- # No overlaps with current box, keep all remaining boxes
- order = rest
- continue
- iou = inter / (areas[i] + areas[rest] - inter)
- # Keep boxes with IoU <= threshold
- order = rest[iou <= iou_threshold]
-
- return keep[:keep_idx]
-
- @staticmethod
- def batched_nms(
- boxes: torch.Tensor,
- scores: torch.Tensor,
- idxs: torch.Tensor,
- iou_threshold: float,
- use_fast_nms: bool = False,
- ) -> torch.Tensor:
- """
- Batched NMS for class-aware suppression.
-
- Args:
- boxes (torch.Tensor): Bounding boxes with shape (N, 4) in xyxy format.
- scores (torch.Tensor): Confidence scores with shape (N,).
- idxs (torch.Tensor): Class indices with shape (N,).
- iou_threshold (float): IoU threshold for suppression.
- use_fast_nms (bool): Whether to use the Fast-NMS implementation.
-
- Returns:
- (torch.Tensor): Indices of boxes to keep after NMS.
-
- Examples:
- Apply batched NMS across multiple classes
- >>> boxes = torch.tensor([[0, 0, 10, 10], [5, 5, 15, 15]])
- >>> scores = torch.tensor([0.9, 0.8])
- >>> idxs = torch.tensor([0, 1])
- >>> keep = TorchNMS.batched_nms(boxes, scores, idxs, 0.5)
- """
- if boxes.numel() == 0:
- return torch.empty((0,), dtype=torch.int64, device=boxes.device)
-
- # Strategy: offset boxes by class index to prevent cross-class suppression
- max_coordinate = boxes.max()
- offsets = idxs.to(boxes) * (max_coordinate + 1)
- boxes_for_nms = boxes + offsets[:, None]
-
- return (
- TorchNMS.fast_nms(boxes_for_nms, scores, iou_threshold)
- if use_fast_nms
- else TorchNMS.nms(boxes_for_nms, scores, iou_threshold)
- )
diff --git a/ultralytics/utils/ops.py b/ultralytics/utils/ops.py
deleted file mode 100644
index 43a0574..0000000
--- a/ultralytics/utils/ops.py
+++ /dev/null
@@ -1,722 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from __future__ import annotations
-
-import contextlib
-import math
-import re
-import time
-
-import cv2
-import numpy as np
-import torch
-import torch.nn.functional as F
-
-from ultralytics.utils import NOT_MACOS14
-
-
-class Profile(contextlib.ContextDecorator):
- """
- Ultralytics Profile class for timing code execution.
-
- Use as a decorator with @Profile() or as a context manager with 'with Profile():'. Provides accurate timing
- measurements with CUDA synchronization support for GPU operations.
-
- Attributes:
- t (float): Accumulated time in seconds.
- device (torch.device): Device used for model inference.
- cuda (bool): Whether CUDA is being used for timing synchronization.
-
- Examples:
- Use as a context manager to time code execution
- >>> with Profile(device=device) as dt:
- ... pass # slow operation here
- >>> print(dt) # prints "Elapsed time is 9.5367431640625e-07 s"
-
- Use as a decorator to time function execution
- >>> @Profile()
- ... def slow_function():
- ... time.sleep(0.1)
- """
-
- def __init__(self, t: float = 0.0, device: torch.device | None = None):
- """
- Initialize the Profile class.
-
- Args:
- t (float): Initial accumulated time in seconds.
- device (torch.device, optional): Device used for model inference to enable CUDA synchronization.
- """
- self.t = t
- self.device = device
- self.cuda = bool(device and str(device).startswith("cuda"))
-
- def __enter__(self):
- """Start timing."""
- self.start = self.time()
- return self
-
- def __exit__(self, type, value, traceback): # noqa
- """Stop timing."""
- self.dt = self.time() - self.start # delta-time
- self.t += self.dt # accumulate dt
-
- def __str__(self):
- """Return a human-readable string representing the accumulated elapsed time."""
- return f"Elapsed time is {self.t} s"
-
- def time(self):
- """Get current time with CUDA synchronization if applicable."""
- if self.cuda:
- torch.cuda.synchronize(self.device)
- return time.perf_counter()
-
-
-def segment2box(segment, width: int = 640, height: int = 640):
- """
- Convert segment coordinates to bounding box coordinates.
-
- Converts a single segment label to a box label by finding the minimum and maximum x and y coordinates.
- Applies inside-image constraint and clips coordinates when necessary.
-
- Args:
- segment (torch.Tensor): Segment coordinates in format (N, 2) where N is number of points.
- width (int): Width of the image in pixels.
- height (int): Height of the image in pixels.
-
- Returns:
- (np.ndarray): Bounding box coordinates in xyxy format [x1, y1, x2, y2].
- """
- x, y = segment.T # segment xy
- # Clip coordinates if 3 out of 4 sides are outside the image
- if np.array([x.min() < 0, y.min() < 0, x.max() > width, y.max() > height]).sum() >= 3:
- x = x.clip(0, width)
- y = y.clip(0, height)
- inside = (x >= 0) & (y >= 0) & (x <= width) & (y <= height)
- x = x[inside]
- y = y[inside]
- return (
- np.array([x.min(), y.min(), x.max(), y.max()], dtype=segment.dtype)
- if any(x)
- else np.zeros(4, dtype=segment.dtype)
- ) # xyxy
-
-
-def scale_boxes(img1_shape, boxes, img0_shape, ratio_pad=None, padding: bool = True, xywh: bool = False):
- """
- Rescale bounding boxes from one image shape to another.
-
- Rescales bounding boxes from img1_shape to img0_shape, accounting for padding and aspect ratio changes.
- Supports both xyxy and xywh box formats.
-
- Args:
- img1_shape (tuple): Shape of the source image (height, width).
- boxes (torch.Tensor): Bounding boxes to rescale in format (N, 4).
- img0_shape (tuple): Shape of the target image (height, width).
- ratio_pad (tuple, optional): Tuple of (ratio, pad) for scaling. If None, calculated from image shapes.
- padding (bool): Whether boxes are based on YOLO-style augmented images with padding.
- xywh (bool): Whether box format is xywh (True) or xyxy (False).
-
- Returns:
- (torch.Tensor): Rescaled bounding boxes in the same format as input.
- """
- if ratio_pad is None: # calculate from img0_shape
- gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1]) # gain = old / new
- pad_x = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1)
- pad_y = round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1)
- else:
- gain = ratio_pad[0][0]
- pad_x, pad_y = ratio_pad[1]
-
- if padding:
- boxes[..., 0] -= pad_x # x padding
- boxes[..., 1] -= pad_y # y padding
- if not xywh:
- boxes[..., 2] -= pad_x # x padding
- boxes[..., 3] -= pad_y # y padding
- boxes[..., :4] /= gain
- return boxes if xywh else clip_boxes(boxes, img0_shape)
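
A quick numeric walk-through of the gain/pad arithmetic above, assuming a 640x640 letterboxed input and an original 1280x720 frame (the two lines of math are repeated inline here rather than imported):

import torch

img1_shape, img0_shape = (640, 640), (720, 1280)  # letterboxed (h, w) and original (h, w)
gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])  # 0.5
pad_x = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1)  # 0
pad_y = round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1)  # 140

box = torch.tensor([[100.0, 200.0, 300.0, 400.0]])  # xyxy in the 640x640 letterboxed image
box[..., [0, 2]] -= pad_x
box[..., [1, 3]] -= pad_y
box /= gain
print(box.tolist())  # [[200.0, 120.0, 600.0, 520.0]] (clipping to the original frame omitted)
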
-
-
-def make_divisible(x: int, divisor):
- """
- Return the nearest number that is divisible by the given divisor.
-
- Args:
- x (int): The number to make divisible.
- divisor (int | torch.Tensor): The divisor.
-
- Returns:
- (int): The nearest number divisible by the divisor.
- """
- if isinstance(divisor, torch.Tensor):
- divisor = int(divisor.max()) # to int
- return math.ceil(x / divisor) * divisor
-
-
-def clip_boxes(boxes, shape):
- """
- Clip bounding boxes to image boundaries.
-
- Args:
- boxes (torch.Tensor | np.ndarray): Bounding boxes to clip.
- shape (tuple): Image shape as HWC or HW (supports both).
-
- Returns:
- (torch.Tensor | np.ndarray): Clipped bounding boxes.
- """
- h, w = shape[:2] # supports both HWC or HW shapes
- if isinstance(boxes, torch.Tensor): # faster individually
- if NOT_MACOS14:
- boxes[..., 0].clamp_(0, w) # x1
- boxes[..., 1].clamp_(0, h) # y1
- boxes[..., 2].clamp_(0, w) # x2
- boxes[..., 3].clamp_(0, h) # y2
- else: # Apple macOS14 MPS bug https://github.com/ultralytics/ultralytics/pull/21878
- boxes[..., 0] = boxes[..., 0].clamp(0, w)
- boxes[..., 1] = boxes[..., 1].clamp(0, h)
- boxes[..., 2] = boxes[..., 2].clamp(0, w)
- boxes[..., 3] = boxes[..., 3].clamp(0, h)
- else: # np.array (faster grouped)
- boxes[..., [0, 2]] = boxes[..., [0, 2]].clip(0, w) # x1, x2
- boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(0, h) # y1, y2
- return boxes
-
-
-def clip_coords(coords, shape):
- """
- Clip line coordinates to image boundaries.
-
- Args:
- coords (torch.Tensor | np.ndarray): Line coordinates to clip.
- shape (tuple): Image shape as HWC or HW (supports both).
-
- Returns:
- (torch.Tensor | np.ndarray): Clipped coordinates.
- """
- h, w = shape[:2] # supports both HWC or HW shapes
- if isinstance(coords, torch.Tensor):
- if NOT_MACOS14:
- coords[..., 0].clamp_(0, w) # x
- coords[..., 1].clamp_(0, h) # y
- else: # Apple macOS14 MPS bug https://github.com/ultralytics/ultralytics/pull/21878
- coords[..., 0] = coords[..., 0].clamp(0, w)
- coords[..., 1] = coords[..., 1].clamp(0, h)
- else: # np.array
- coords[..., 0] = coords[..., 0].clip(0, w) # x
- coords[..., 1] = coords[..., 1].clip(0, h) # y
- return coords
-
-
-def scale_image(masks, im0_shape, ratio_pad=None):
- """
- Rescale masks to original image size.
-
- Takes resized and padded masks and rescales them back to the original image dimensions, removing any padding
- that was applied during preprocessing.
-
- Args:
- masks (np.ndarray): Resized and padded masks with shape [H, W, N] or [H, W, 3].
- im0_shape (tuple): Original image shape as HWC or HW (supports both).
- ratio_pad (tuple, optional): Ratio and padding values as ((ratio_h, ratio_w), (pad_h, pad_w)).
-
- Returns:
- (np.ndarray): Rescaled masks with shape [H, W, N] matching original image dimensions.
- """
- # Rescale masks from im1_shape back to im0_shape
- im0_h, im0_w = im0_shape[:2] # supports both HWC or HW shapes
- im1_h, im1_w, _ = masks.shape
- if im1_h == im0_h and im1_w == im0_w:
- return masks
-
- if ratio_pad is None: # calculate from im0_shape
- gain = min(im1_h / im0_h, im1_w / im0_w) # gain = old / new
- pad = (im1_w - im0_w * gain) / 2, (im1_h - im0_h * gain) / 2 # wh padding
- else:
- pad = ratio_pad[1]
-
- pad_w, pad_h = pad
- top = int(round(pad_h - 0.1))
- left = int(round(pad_w - 0.1))
- bottom = im1_h - int(round(pad_h + 0.1))
- right = im1_w - int(round(pad_w + 0.1))
-
- if len(masks.shape) < 2:
- raise ValueError(f"masks should have 2 or 3 dimensions, but got {len(masks.shape)}")
- masks = masks[top:bottom, left:right]
- # handle the cv2.resize 512 channels limitation: https://github.com/ultralytics/ultralytics/pull/21947
- masks = [cv2.resize(array, (im0_w, im0_h)) for array in np.array_split(masks, masks.shape[-1] // 512 + 1, axis=-1)]
- masks = np.concatenate(masks, axis=-1) if len(masks) > 1 else masks[0]
- if len(masks.shape) == 2:
- masks = masks[:, :, None]
-
- return masks
-
-
-def xyxy2xywh(x):
- """
- Convert bounding box coordinates from (x1, y1, x2, y2) format to (x, y, width, height) format where (x1, y1) is the
- top-left corner and (x2, y2) is the bottom-right corner.
-
- Args:
- x (np.ndarray | torch.Tensor): Input bounding box coordinates in (x1, y1, x2, y2) format.
-
- Returns:
- (np.ndarray | torch.Tensor): Bounding box coordinates in (x, y, width, height) format.
- """
- assert x.shape[-1] == 4, f"input shape last dimension expected 4 but input shape is {x.shape}"
- y = empty_like(x) # faster than clone/copy
- x1, y1, x2, y2 = x[..., 0], x[..., 1], x[..., 2], x[..., 3]
- y[..., 0] = (x1 + x2) / 2 # x center
- y[..., 1] = (y1 + y2) / 2 # y center
- y[..., 2] = x2 - x1 # width
- y[..., 3] = y2 - y1 # height
- return y
-
-
-def xywh2xyxy(x):
- """
- Convert bounding box coordinates from (x, y, width, height) format to (x1, y1, x2, y2) format where (x1, y1) is the
- top-left corner and (x2, y2) is the bottom-right corner. Note: operating on two channels at a time is faster than per channel.
-
- Args:
- x (np.ndarray | torch.Tensor): Input bounding box coordinates in (x, y, width, height) format.
-
- Returns:
- (np.ndarray | torch.Tensor): Bounding box coordinates in (x1, y1, x2, y2) format.
- """
- assert x.shape[-1] == 4, f"input shape last dimension expected 4 but input shape is {x.shape}"
- y = empty_like(x) # faster than clone/copy
- xy = x[..., :2] # centers
- wh = x[..., 2:] / 2 # half width-height
- y[..., :2] = xy - wh # top left xy
- y[..., 2:] = xy + wh # bottom right xy
- return y
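
For intuition, a round trip between the two box formats on a single center-format box, written inline with plain tensor ops:

import torch

xywh = torch.tensor([[50.0, 50.0, 20.0, 10.0]])  # center x, center y, width, height
xy, half_wh = xywh[..., :2], xywh[..., 2:] / 2
xyxy = torch.cat((xy - half_wh, xy + half_wh), dim=-1)  # [[40., 45., 60., 55.]]
back = torch.cat(((xyxy[..., :2] + xyxy[..., 2:]) / 2, xyxy[..., 2:] - xyxy[..., :2]), dim=-1)
assert torch.equal(back, xywh)  # converting back recovers the original box exactly
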
-
-
-def xywhn2xyxy(x, w: int = 640, h: int = 640, padw: int = 0, padh: int = 0):
- """
- Convert normalized bounding box coordinates to pixel coordinates.
-
- Args:
- x (np.ndarray | torch.Tensor): Normalized bounding box coordinates in (x, y, w, h) format.
- w (int): Image width in pixels.
- h (int): Image height in pixels.
- padw (int): Padding width in pixels.
- padh (int): Padding height in pixels.
-
- Returns:
- y (np.ndarray | torch.Tensor): The coordinates of the bounding box in the format [x1, y1, x2, y2] where
- x1,y1 is the top-left corner, x2,y2 is the bottom-right corner of the bounding box.
- """
- assert x.shape[-1] == 4, f"input shape last dimension expected 4 but input shape is {x.shape}"
- y = empty_like(x) # faster than clone/copy
- xc, yc, xw, xh = x[..., 0], x[..., 1], x[..., 2], x[..., 3]
- half_w, half_h = xw / 2, xh / 2
- y[..., 0] = w * (xc - half_w) + padw # top left x
- y[..., 1] = h * (yc - half_h) + padh # top left y
- y[..., 2] = w * (xc + half_w) + padw # bottom right x
- y[..., 3] = h * (yc + half_h) + padh # bottom right y
- return y
-
-
-def xyxy2xywhn(x, w: int = 640, h: int = 640, clip: bool = False, eps: float = 0.0):
- """
- Convert bounding box coordinates from (x1, y1, x2, y2) format to (x, y, width, height, normalized) format. x, y,
- width and height are normalized to image dimensions.
-
- Args:
- x (np.ndarray | torch.Tensor): Input bounding box coordinates in (x1, y1, x2, y2) format.
- w (int): Image width in pixels.
- h (int): Image height in pixels.
- clip (bool): Whether to clip boxes to image boundaries.
- eps (float): Minimum value for box width and height.
-
- Returns:
- (np.ndarray | torch.Tensor): Normalized bounding box coordinates in (x, y, width, height) format.
- """
- if clip:
- x = clip_boxes(x, (h - eps, w - eps))
- assert x.shape[-1] == 4, f"input shape last dimension expected 4 but input shape is {x.shape}"
- y = empty_like(x) # faster than clone/copy
- x1, y1, x2, y2 = x[..., 0], x[..., 1], x[..., 2], x[..., 3]
- y[..., 0] = ((x1 + x2) / 2) / w # x center
- y[..., 1] = ((y1 + y2) / 2) / h # y center
- y[..., 2] = (x2 - x1) / w # width
- y[..., 3] = (y2 - y1) / h # height
- return y
-
-
-def xywh2ltwh(x):
- """
- Convert bounding box format from [x, y, w, h] to [x1, y1, w, h] where x1, y1 are top-left coordinates.
-
- Args:
- x (np.ndarray | torch.Tensor): Input bounding box coordinates in xywh format.
-
- Returns:
- (np.ndarray | torch.Tensor): Bounding box coordinates in xyltwh format.
- """
- y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
- y[..., 0] = x[..., 0] - x[..., 2] / 2 # top left x
- y[..., 1] = x[..., 1] - x[..., 3] / 2 # top left y
- return y
-
-
-def xyxy2ltwh(x):
- """
- Convert bounding boxes from [x1, y1, x2, y2] to [x1, y1, w, h] format.
-
- Args:
- x (np.ndarray | torch.Tensor): Input bounding box coordinates in xyxy format.
-
- Returns:
- (np.ndarray | torch.Tensor): Bounding box coordinates in xyltwh format.
- """
- y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
- y[..., 2] = x[..., 2] - x[..., 0] # width
- y[..., 3] = x[..., 3] - x[..., 1] # height
- return y
-
-
-def ltwh2xywh(x):
- """
- Convert bounding boxes from [x1, y1, w, h] to [x, y, w, h] where xy1=top-left, xy=center.
-
- Args:
- x (torch.Tensor): Input bounding box coordinates.
-
- Returns:
- (np.ndarray | torch.Tensor): Bounding box coordinates in xywh format.
- """
- y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
- y[..., 0] = x[..., 0] + x[..., 2] / 2 # center x
- y[..., 1] = x[..., 1] + x[..., 3] / 2 # center y
- return y
-
-
-def xyxyxyxy2xywhr(x):
- """
- Convert batched Oriented Bounding Boxes (OBB) from [xy1, xy2, xy3, xy4] to [xywh, rotation] format.
-
- Args:
- x (np.ndarray | torch.Tensor): Input box corners with shape (N, 8) in [xy1, xy2, xy3, xy4] format.
-
- Returns:
- (np.ndarray | torch.Tensor): Converted data in [cx, cy, w, h, rotation] format with shape (N, 5).
- Rotation values are in radians from 0 to pi/2.
- """
- is_torch = isinstance(x, torch.Tensor)
- points = x.cpu().numpy() if is_torch else x
- points = points.reshape(len(x), -1, 2)
- rboxes = []
- for pts in points:
- # NOTE: Use cv2.minAreaRect to get accurate xywhr,
- # especially some objects are cut off by augmentations in dataloader.
- (cx, cy), (w, h), angle = cv2.minAreaRect(pts)
- rboxes.append([cx, cy, w, h, angle / 180 * np.pi])
- return torch.tensor(rboxes, device=x.device, dtype=x.dtype) if is_torch else np.asarray(rboxes)
-
-
-def xywhr2xyxyxyxy(x):
- """
- Convert batched Oriented Bounding Boxes (OBB) from [xywh, rotation] to [xy1, xy2, xy3, xy4] format.
-
- Args:
- x (np.ndarray | torch.Tensor): Boxes in [cx, cy, w, h, rotation] format with shape (N, 5) or (B, N, 5).
- Rotation values should be in radians from 0 to pi/2.
-
- Returns:
- (np.ndarray | torch.Tensor): Converted corner points with shape (N, 4, 2) or (B, N, 4, 2).
- """
- cos, sin, cat, stack = (
- (torch.cos, torch.sin, torch.cat, torch.stack)
- if isinstance(x, torch.Tensor)
- else (np.cos, np.sin, np.concatenate, np.stack)
- )
-
- ctr = x[..., :2]
- w, h, angle = (x[..., i : i + 1] for i in range(2, 5))
- cos_value, sin_value = cos(angle), sin(angle)
- vec1 = [w / 2 * cos_value, w / 2 * sin_value]
- vec2 = [-h / 2 * sin_value, h / 2 * cos_value]
- vec1 = cat(vec1, -1)
- vec2 = cat(vec2, -1)
- pt1 = ctr + vec1 + vec2
- pt2 = ctr + vec1 - vec2
- pt3 = ctr - vec1 - vec2
- pt4 = ctr - vec1 + vec2
- return stack([pt1, pt2, pt3, pt4], -2)
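
A tiny numeric check of the corner construction above: a 4x2 box rotated by 90 degrees should end up spanning 2 units in x and 4 in y. Written with plain NumPy for illustration:

import numpy as np

cx, cy, w, h, angle = 0.0, 0.0, 4.0, 2.0, np.pi / 2
ctr = np.array([cx, cy])
vec1 = np.array([w / 2 * np.cos(angle), w / 2 * np.sin(angle)])   # half-width direction, rotated
vec2 = np.array([-h / 2 * np.sin(angle), h / 2 * np.cos(angle)])  # half-height direction, rotated
corners = np.stack([ctr + vec1 + vec2, ctr + vec1 - vec2, ctr - vec1 - vec2, ctr - vec1 + vec2])
print(np.round(corners, 6))  # (-1, 2), (1, 2), (1, -2), (-1, -2): width and height have swapped
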
-
-
-def ltwh2xyxy(x):
- """
- Convert bounding box from [x1, y1, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right.
-
- Args:
- x (np.ndarray | torch.Tensor): Input bounding box coordinates.
-
- Returns:
- (np.ndarray | torch.Tensor): Bounding box coordinates in xyxy format.
- """
- y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
- y[..., 2] = x[..., 2] + x[..., 0]  # bottom right x
- y[..., 3] = x[..., 3] + x[..., 1]  # bottom right y
- return y
-
-
-def segments2boxes(segments):
- """
- Convert segment labels to box labels, i.e. (cls, xy1, xy2, ...) to (cls, xywh).
-
- Args:
- segments (list): List of segments where each segment is a list of points, each point is [x, y] coordinates.
-
- Returns:
- (np.ndarray): Bounding box coordinates in xywh format.
- """
- boxes = []
- for s in segments:
- x, y = s.T # segment xy
- boxes.append([x.min(), y.min(), x.max(), y.max()]) # cls, xyxy
- return xyxy2xywh(np.array(boxes)) # cls, xywh
-
-
-def resample_segments(segments, n: int = 1000):
- """
- Resample segments to n points each using linear interpolation.
-
- Args:
- segments (list): List of (N, 2) arrays where N is the number of points in each segment.
- n (int): Number of points to resample each segment to.
-
- Returns:
- (list): Resampled segments with n points each.
- """
- for i, s in enumerate(segments):
- if len(s) == n:
- continue
- s = np.concatenate((s, s[0:1, :]), axis=0)
- x = np.linspace(0, len(s) - 1, n - len(s) if len(s) < n else n)
- xp = np.arange(len(s))
- x = np.insert(x, np.searchsorted(x, xp), xp) if len(s) < n else x
- segments[i] = (
- np.concatenate([np.interp(x, xp, s[:, i]) for i in range(2)], dtype=np.float32).reshape(2, -1).T
- ) # segment xy
- return segments
-
-
-def crop_mask(masks, boxes):
- """
- Crop masks to bounding box regions.
-
- Args:
- masks (torch.Tensor): Masks with shape (N, H, W).
- boxes (torch.Tensor): Bounding box coordinates with shape (N, 4) in relative point form.
-
- Returns:
- (torch.Tensor): Cropped masks.
- """
- _, h, w = masks.shape
- x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1) # x1 shape(n,1,1)
- r = torch.arange(w, device=masks.device, dtype=x1.dtype)[None, None, :] # rows shape(1,1,w)
- c = torch.arange(h, device=masks.device, dtype=x1.dtype)[None, :, None] # cols shape(1,h,1)
-
- return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2))
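
The return expression of crop_mask builds an inside-the-box indicator from broadcast row/column index grids; a toy run on one 4x4 mask (re-implemented inline) makes the effect visible:

import torch

masks = torch.ones(1, 4, 4)                            # one all-ones mask, shape (N, H, W)
boxes = torch.tensor([[1.0, 1.0, 3.0, 3.0]])           # crop region in xyxy
x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1)  # each has shape (N, 1, 1)
r = torch.arange(4, dtype=x1.dtype)[None, None, :]     # column indices, shape (1, 1, W)
c = torch.arange(4, dtype=x1.dtype)[None, :, None]     # row indices, shape (1, H, 1)
cropped = masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2))
print(cropped[0])  # only the 2x2 block at rows/columns 1-2 remains non-zero
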
-
-
-def process_mask(protos, masks_in, bboxes, shape, upsample: bool = False):
- """
- Apply masks to bounding boxes using mask head output.
-
- Args:
- protos (torch.Tensor): Mask prototypes with shape (mask_dim, mask_h, mask_w).
- masks_in (torch.Tensor): Mask coefficients with shape (N, mask_dim) where N is number of masks after NMS.
- bboxes (torch.Tensor): Bounding boxes with shape (N, 4) where N is number of masks after NMS.
- shape (tuple): Input image size as (height, width).
- upsample (bool): Whether to upsample masks to original image size.
-
- Returns:
- (torch.Tensor): A binary mask tensor of shape [n, h, w], where n is the number of masks after NMS, and h and w
- are the height and width of the input image. The mask is applied to the bounding boxes.
- """
- c, mh, mw = protos.shape # CHW
- ih, iw = shape
- masks = (masks_in @ protos.float().view(c, -1)).view(-1, mh, mw) # CHW
- width_ratio = mw / iw
- height_ratio = mh / ih
-
- downsampled_bboxes = bboxes.clone()
- downsampled_bboxes[:, 0] *= width_ratio
- downsampled_bboxes[:, 2] *= width_ratio
- downsampled_bboxes[:, 3] *= height_ratio
- downsampled_bboxes[:, 1] *= height_ratio
-
- masks = crop_mask(masks, downsampled_bboxes) # CHW
- if upsample:
- masks = F.interpolate(masks[None], shape, mode="bilinear", align_corners=False)[0] # CHW
- return masks.gt_(0.0)
-
-
-def process_mask_native(protos, masks_in, bboxes, shape):
- """
- Apply masks to bounding boxes using mask head output with native upsampling.
-
- Args:
- protos (torch.Tensor): Mask prototypes with shape (mask_dim, mask_h, mask_w).
- masks_in (torch.Tensor): Mask coefficients with shape (N, mask_dim) where N is number of masks after NMS.
- bboxes (torch.Tensor): Bounding boxes with shape (N, 4) where N is number of masks after NMS.
- shape (tuple): Input image size as (height, width).
-
- Returns:
- (torch.Tensor): Binary mask tensor with shape (H, W, N).
- """
- c, mh, mw = protos.shape # CHW
- masks = (masks_in @ protos.float().view(c, -1)).view(-1, mh, mw)
- masks = scale_masks(masks[None], shape)[0] # CHW
- masks = crop_mask(masks, bboxes) # CHW
- return masks.gt_(0.0)
-
-
-def scale_masks(masks, shape, padding: bool = True):
- """
- Rescale segment masks to target shape.
-
- Args:
- masks (torch.Tensor): Masks with shape (N, C, H, W).
- shape (tuple): Target height and width as (height, width).
- padding (bool): Whether masks are based on YOLO-style augmented images with padding.
-
- Returns:
- (torch.Tensor): Rescaled masks.
- """
- mh, mw = masks.shape[2:]
- gain = min(mh / shape[0], mw / shape[1]) # gain = old / new
- pad_w = mw - shape[1] * gain
- pad_h = mh - shape[0] * gain
- if padding:
- pad_w /= 2
- pad_h /= 2
- top, left = (int(round(pad_h - 0.1)), int(round(pad_w - 0.1))) if padding else (0, 0)
- bottom = mh - int(round(pad_h + 0.1))
- right = mw - int(round(pad_w + 0.1))
- return F.interpolate(masks[..., top:bottom, left:right], shape, mode="bilinear", align_corners=False) # NCHW masks
-
-
-def scale_coords(img1_shape, coords, img0_shape, ratio_pad=None, normalize: bool = False, padding: bool = True):
- """
- Rescale segment coordinates from img1_shape to img0_shape.
-
- Args:
- img1_shape (tuple): Source image shape as HWC or HW (supports both).
- coords (torch.Tensor): Coordinates to scale with shape (N, 2).
- img0_shape (tuple): Image 0 shape as HWC or HW (supports both).
- ratio_pad (tuple, optional): Ratio and padding values as ((ratio_h, ratio_w), (pad_h, pad_w)).
- normalize (bool): Whether to normalize coordinates to range [0, 1].
- padding (bool): Whether coordinates are based on YOLO-style augmented images with padding.
-
- Returns:
- (torch.Tensor): Scaled coordinates.
- """
- img0_h, img0_w = img0_shape[:2] # supports both HWC or HW shapes
- if ratio_pad is None: # calculate from img0_shape
- img1_h, img1_w = img1_shape[:2] # supports both HWC or HW shapes
- gain = min(img1_h / img0_h, img1_w / img0_w) # gain = old / new
- pad = (img1_w - img0_w * gain) / 2, (img1_h - img0_h * gain) / 2 # wh padding
- else:
- gain = ratio_pad[0][0]
- pad = ratio_pad[1]
-
- if padding:
- coords[..., 0] -= pad[0] # x padding
- coords[..., 1] -= pad[1] # y padding
- coords[..., 0] /= gain
- coords[..., 1] /= gain
- coords = clip_coords(coords, img0_shape)
- if normalize:
- coords[..., 0] /= img0_w # width
- coords[..., 1] /= img0_h # height
- return coords
-
-
-def regularize_rboxes(rboxes):
- """
- Regularize rotated bounding boxes to range [0, pi/2].
-
- Args:
- rboxes (torch.Tensor): Input rotated boxes with shape (N, 5) in xywhr format.
-
- Returns:
- (torch.Tensor): Regularized rotated boxes.
- """
- x, y, w, h, t = rboxes.unbind(dim=-1)
- # Swap edge if t >= pi/2 while not being symmetrically opposite
- swap = t % math.pi >= math.pi / 2
- w_ = torch.where(swap, h, w)
- h_ = torch.where(swap, w, h)
- t = t % (math.pi / 2)
- return torch.stack([x, y, w_, h_, t], dim=-1) # regularized boxes
-
-
-def masks2segments(masks, strategy: str = "all"):
- """
- Convert masks to segments using contour detection.
-
- Args:
- masks (torch.Tensor): Binary masks with shape (batch_size, 160, 160).
- strategy (str): Segmentation strategy, either 'all' or 'largest'.
-
- Returns:
- (list): List of segment masks as float32 arrays.
- """
- from ultralytics.data.converter import merge_multi_segment
-
- segments = []
- for x in masks.int().cpu().numpy().astype("uint8"):
- c = cv2.findContours(x, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[0]
- if c:
- if strategy == "all": # merge and concatenate all segments
- c = (
- np.concatenate(merge_multi_segment([x.reshape(-1, 2) for x in c]))
- if len(c) > 1
- else c[0].reshape(-1, 2)
- )
- elif strategy == "largest": # select largest segment
- c = np.array(c[np.array([len(x) for x in c]).argmax()]).reshape(-1, 2)
- else:
- c = np.zeros((0, 2)) # no segments found
- segments.append(c.astype("float32"))
- return segments
-
-
-def convert_torch2numpy_batch(batch: torch.Tensor) -> np.ndarray:
- """
- Convert a batch of FP32 torch tensors to NumPy uint8 arrays, changing from BCHW to BHWC layout.
-
- Args:
- batch (torch.Tensor): Input tensor batch with shape (Batch, Channels, Height, Width) and dtype torch.float32.
-
- Returns:
- (np.ndarray): Output NumPy array batch with shape (Batch, Height, Width, Channels) and dtype uint8.
- """
- return (batch.permute(0, 2, 3, 1).contiguous() * 255).clamp(0, 255).to(torch.uint8).cpu().numpy()
-
-
-def clean_str(s):
- """
- Clean a string by replacing special characters with '_' character.
-
- Args:
- s (str): A string needing special characters replaced.
-
- Returns:
- (str): A string with special characters replaced by an underscore _.
- """
- return re.sub(pattern="[|@#!¡·$€%&()=?¿^*;:,¨´><+]", repl="_", string=s)
-
-
-def empty_like(x):
- """Create empty torch.Tensor or np.ndarray with same shape as input and float32 dtype."""
- return (
- torch.empty_like(x, dtype=torch.float32) if isinstance(x, torch.Tensor) else np.empty_like(x, dtype=np.float32)
- )
diff --git a/ultralytics/utils/patches.py b/ultralytics/utils/patches.py
deleted file mode 100644
index 4527dae..0000000
--- a/ultralytics/utils/patches.py
+++ /dev/null
@@ -1,189 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-"""Monkey patches to update/extend functionality of existing functions."""
-
-from __future__ import annotations
-
-import time
-from contextlib import contextmanager
-from copy import copy
-from pathlib import Path
-from typing import Any
-
-import cv2
-import numpy as np
-import torch
-
-# OpenCV Multilanguage-friendly functions ------------------------------------------------------------------------------
-_imshow = cv2.imshow # copy to avoid recursion errors
-
-
-def imread(filename: str, flags: int = cv2.IMREAD_COLOR) -> np.ndarray | None:
- """
- Read an image from a file with multilanguage filename support.
-
- Args:
- filename (str): Path to the file to read.
- flags (int, optional): Flag that can take values of cv2.IMREAD_*. Controls how the image is read.
-
- Returns:
- (np.ndarray | None): The read image array, or None if reading fails.
-
- Examples:
- >>> img = imread("path/to/image.jpg")
- >>> img = imread("path/to/image.jpg", cv2.IMREAD_GRAYSCALE)
- """
- file_bytes = np.fromfile(filename, np.uint8)
- if filename.endswith((".tiff", ".tif")):
- success, frames = cv2.imdecodemulti(file_bytes, cv2.IMREAD_UNCHANGED)
- if success:
- # Handle RGB images in tif/tiff format
- return frames[0] if len(frames) == 1 and frames[0].ndim == 3 else np.stack(frames, axis=2)
- return None
- else:
- im = cv2.imdecode(file_bytes, flags)
- return im[..., None] if im is not None and im.ndim == 2 else im # Always ensure 3 dimensions
-
-
-def imwrite(filename: str, img: np.ndarray, params: list[int] | None = None) -> bool:
- """
- Write an image to a file with multilanguage filename support.
-
- Args:
- filename (str): Path to the file to write.
- img (np.ndarray): Image to write.
- params (list[int], optional): Additional parameters for image encoding.
-
- Returns:
- (bool): True if the file was written successfully, False otherwise.
-
- Examples:
- >>> import numpy as np
- >>> img = np.zeros((100, 100, 3), dtype=np.uint8) # Create a black image
- >>> success = imwrite("output.jpg", img) # Write image to file
- >>> print(success)
- True
- """
- try:
- cv2.imencode(Path(filename).suffix, img, params)[1].tofile(filename)
- return True
- except Exception:
- return False
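
The reason for encoding in memory and then writing with NumPy is that plain cv2.imwrite/cv2.imread can fail on non-ASCII paths (notably on Windows). A minimal sketch of the same round trip; the filename is illustrative only:

import cv2
import numpy as np

path = "изображение.jpg"  # non-ASCII filename chosen purely for illustration
img = np.zeros((64, 64, 3), dtype=np.uint8)

ok, buf = cv2.imencode(".jpg", img)  # encode in memory ...
if ok:
    buf.tofile(path)  # ... then let NumPy handle the unicode path

img_back = cv2.imdecode(np.fromfile(path, np.uint8), cv2.IMREAD_COLOR)
print(img_back.shape)  # (64, 64, 3)
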
-
-
-def imshow(winname: str, mat: np.ndarray) -> None:
- """
- Display an image in the specified window with multilanguage window name support.
-
- This function is a wrapper around OpenCV's imshow function that displays an image in a named window. It handles
- multilanguage window names by encoding them properly for OpenCV compatibility.
-
- Args:
- winname (str): Name of the window where the image will be displayed. If a window with this name already
- exists, the image will be displayed in that window.
- mat (np.ndarray): Image to be shown. Should be a valid numpy array representing an image.
-
- Examples:
- >>> import numpy as np
- >>> img = np.zeros((300, 300, 3), dtype=np.uint8) # Create a black image
- >>> img[:100, :100] = [255, 0, 0] # Add a blue square
- >>> imshow("Example Window", img) # Display the image
- """
- _imshow(winname.encode("unicode_escape").decode(), mat)
-
-
-# PyTorch functions ----------------------------------------------------------------------------------------------------
-_torch_save = torch.save
-
-
-def torch_load(*args, **kwargs):
- """
- Load a PyTorch model with updated arguments to avoid warnings.
-
- This function wraps torch.load and adds the 'weights_only' argument for PyTorch 1.13.0+ to prevent warnings.
-
- Args:
- *args (Any): Variable length argument list to pass to torch.load.
- **kwargs (Any): Arbitrary keyword arguments to pass to torch.load.
-
- Returns:
- (Any): The loaded PyTorch object.
-
- Notes:
- For PyTorch versions 1.13 and above, this function automatically sets 'weights_only=False'
- if the argument is not provided, to avoid deprecation warnings.
- """
- from ultralytics.utils.torch_utils import TORCH_1_13
-
- if TORCH_1_13 and "weights_only" not in kwargs:
- kwargs["weights_only"] = False
-
- return torch.load(*args, **kwargs)
-
-
-def torch_save(*args, **kwargs):
- """
- Save PyTorch objects with retry mechanism for robustness.
-
- This function wraps torch.save with 3 retries and exponential backoff in case of save failures, which can occur
- due to device flushing delays or antivirus scanning.
-
- Args:
- *args (Any): Positional arguments to pass to torch.save.
- **kwargs (Any): Keyword arguments to pass to torch.save.
-
- Examples:
- >>> model = torch.nn.Linear(10, 1)
- >>> torch_save(model.state_dict(), "model.pt")
- """
- for i in range(4): # 3 retries
- try:
- return _torch_save(*args, **kwargs)
- except RuntimeError as e: # Unable to save, possibly waiting for device to flush or antivirus scan
- if i == 3:
- raise e
- time.sleep((2**i) / 2) # Exponential backoff: 0.5s, 1.0s, 2.0s
-
-
-@contextmanager
-def arange_patch(args):
- """
- Workaround for ONNX torch.arange incompatibility with FP16.
-
- https://github.com/pytorch/pytorch/issues/148041.
- """
- if args.dynamic and args.half and args.format == "onnx":
- func = torch.arange
-
- def arange(*args, dtype=None, **kwargs):
- """Return a 1-D tensor of size with values from the interval and common difference."""
- return func(*args, **kwargs).to(dtype) # cast to dtype instead of passing dtype
-
- torch.arange = arange # patch
- yield
- torch.arange = func # unpatch
- else:
- yield
-
-
-@contextmanager
-def override_configs(args, overrides: dict[str, Any] | None = None):
- """
- Context manager to temporarily override configurations in args.
-
- Args:
- args (IterableSimpleNamespace): Original configuration arguments.
- overrides (dict[str, Any]): Dictionary of overrides to apply.
-
- Yields:
- (IterableSimpleNamespace): Configuration arguments with overrides applied.
- """
- if overrides:
- original_args = copy(args)
- for key, value in overrides.items():
- setattr(args, key, value)
- try:
- yield args
- finally:
- args.__dict__.update(original_args.__dict__)
- else:
- yield args
diff --git a/ultralytics/utils/plotting.py b/ultralytics/utils/plotting.py
deleted file mode 100644
index 627160a..0000000
--- a/ultralytics/utils/plotting.py
+++ /dev/null
@@ -1,1031 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from __future__ import annotations
-
-import math
-import warnings
-from pathlib import Path
-from typing import Any, Callable
-
-import cv2
-import numpy as np
-import torch
-from PIL import Image, ImageDraw, ImageFont
-from PIL import __version__ as pil_version
-
-from ultralytics.utils import IS_COLAB, IS_KAGGLE, LOGGER, TryExcept, ops, plt_settings, threaded
-from ultralytics.utils.checks import check_font, check_version, is_ascii
-from ultralytics.utils.files import increment_path
-
-
-class Colors:
- """
- Ultralytics color palette for visualization and plotting.
-
- This class provides methods to work with the Ultralytics color palette, including converting hex color codes to
- RGB values and accessing predefined color schemes for object detection and pose estimation.
-
- Attributes:
- palette (list[tuple]): List of RGB color tuples for general use.
- n (int): The number of colors in the palette.
- pose_palette (np.ndarray): A specific color palette array for pose estimation with dtype np.uint8.
-
- Examples:
- >>> from ultralytics.utils.plotting import Colors
- >>> colors = Colors()
- >>> colors(5, True) # Returns BGR format: (221, 111, 255)
- >>> colors(5, False) # Returns RGB format: (255, 111, 221)
-
- ## Ultralytics Color Palette
-
- | Index | Color | HEX | RGB |
- |-------|-------------------------------------------------------------------|-----------|-------------------|
- | 0 | | `#042aff` | (4, 42, 255) |
- | 1 | | `#0bdbeb` | (11, 219, 235) |
- | 2 | | `#f3f3f3` | (243, 243, 243) |
- | 3 | | `#00dfb7` | (0, 223, 183) |
- | 4 | | `#111f68` | (17, 31, 104) |
- | 5 | | `#ff6fdd` | (255, 111, 221) |
- | 6 | | `#ff444f` | (255, 68, 79) |
- | 7 | | `#cced00` | (204, 237, 0) |
- | 8 | | `#00f344` | (0, 243, 68) |
- | 9 | | `#bd00ff` | (189, 0, 255) |
- | 10 | | `#00b4ff` | (0, 180, 255) |
- | 11 | | `#dd00ba` | (221, 0, 186) |
- | 12 | | `#00ffff` | (0, 255, 255) |
- | 13 | | `#26c000` | (38, 192, 0) |
- | 14 | | `#01ffb3` | (1, 255, 179) |
- | 15 | | `#7d24ff` | (125, 36, 255) |
- | 16 | | `#7b0068` | (123, 0, 104) |
- | 17 | | `#ff1b6c` | (255, 27, 108) |
- | 18 | | `#fc6d2f` | (252, 109, 47) |
- | 19 | | `#a2ff0b` | (162, 255, 11) |
-
- ## Pose Color Palette
-
- | Index | Color | HEX | RGB |
- |-------|-------------------------------------------------------------------|-----------|-------------------|
- | 0 | | `#ff8000` | (255, 128, 0) |
- | 1 | | `#ff9933` | (255, 153, 51) |
- | 2 | | `#ffb266` | (255, 178, 102) |
- | 3 | | `#e6e600` | (230, 230, 0) |
- | 4 | | `#ff99ff` | (255, 153, 255) |
- | 5 | | `#99ccff` | (153, 204, 255) |
- | 6 | | `#ff66ff` | (255, 102, 255) |
- | 7 | | `#ff33ff` | (255, 51, 255) |
- | 8 | | `#66b2ff` | (102, 178, 255) |
- | 9 | | `#3399ff` | (51, 153, 255) |
- | 10 | | `#ff9999` | (255, 153, 153) |
- | 11 | | `#ff6666` | (255, 102, 102) |
- | 12 | | `#ff3333` | (255, 51, 51) |
- | 13 | | `#99ff99` | (153, 255, 153) |
- | 14 | | `#66ff66` | (102, 255, 102) |
- | 15 | | `#33ff33` | (51, 255, 51) |
- | 16 | | `#00ff00` | (0, 255, 0) |
- | 17 | | `#0000ff` | (0, 0, 255) |
- | 18 | | `#ff0000` | (255, 0, 0) |
- | 19 | | `#ffffff` | (255, 255, 255) |
-
- !!! note "Ultralytics Brand Colors"
-
- For Ultralytics brand colors see [https://www.ultralytics.com/brand](https://www.ultralytics.com/brand).
- Please use the official Ultralytics colors for all marketing materials.
- """
-
- def __init__(self):
- """Initialize colors as hex = matplotlib.colors.TABLEAU_COLORS.values()."""
- hexs = (
- "042AFF",
- "0BDBEB",
- "F3F3F3",
- "00DFB7",
- "111F68",
- "FF6FDD",
- "FF444F",
- "CCED00",
- "00F344",
- "BD00FF",
- "00B4FF",
- "DD00BA",
- "00FFFF",
- "26C000",
- "01FFB3",
- "7D24FF",
- "7B0068",
- "FF1B6C",
- "FC6D2F",
- "A2FF0B",
- )
- self.palette = [self.hex2rgb(f"#{c}") for c in hexs]
- self.n = len(self.palette)
- self.pose_palette = np.array(
- [
- [255, 128, 0],
- [255, 153, 51],
- [255, 178, 102],
- [230, 230, 0],
- [255, 153, 255],
- [153, 204, 255],
- [255, 102, 255],
- [255, 51, 255],
- [102, 178, 255],
- [51, 153, 255],
- [255, 153, 153],
- [255, 102, 102],
- [255, 51, 51],
- [153, 255, 153],
- [102, 255, 102],
- [51, 255, 51],
- [0, 255, 0],
- [0, 0, 255],
- [255, 0, 0],
- [255, 255, 255],
- ],
- dtype=np.uint8,
- )
-
- def __call__(self, i: int | torch.Tensor, bgr: bool = False) -> tuple:
- """
- Return a palette color by index as an RGB tuple, optionally converted to BGR.
-
- Args:
- i (int | torch.Tensor): Color index.
- bgr (bool, optional): Whether to return BGR format instead of RGB.
-
- Returns:
- (tuple): RGB or BGR color tuple.
- """
- c = self.palette[int(i) % self.n]
- return (c[2], c[1], c[0]) if bgr else c
-
- @staticmethod
- def hex2rgb(h: str) -> tuple:
- """Convert hex color codes to RGB values (i.e. default PIL order)."""
- return tuple(int(h[1 + i : 1 + i + 2], 16) for i in (0, 2, 4))
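
The hex parsing above just reads the string two characters at a time; inline, for reference:

h = "#FF6FDD"
rgb = tuple(int(h[1 + i : 1 + i + 2], 16) for i in (0, 2, 4))
print(rgb)  # (255, 111, 221)
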
-
-
-colors = Colors() # create instance for 'from utils.plots import colors'
-
-
-class Annotator:
- """
- Ultralytics Annotator for train/val mosaics and JPGs, and for prediction annotations.
-
- Attributes:
- im (Image.Image | np.ndarray): The image to annotate.
- pil (bool): Whether to use PIL or cv2 for drawing annotations.
- font (ImageFont.truetype | ImageFont.load_default): Font used for text annotations.
- lw (float): Line width for drawing.
- skeleton (list[list[int]]): Skeleton structure for keypoints.
- limb_color (list[int]): Color palette for limbs.
- kpt_color (list[int]): Color palette for keypoints.
- dark_colors (set): Set of colors considered dark for text contrast.
- light_colors (set): Set of colors considered light for text contrast.
-
- Examples:
- >>> from ultralytics.utils.plotting import Annotator
- >>> im0 = cv2.imread("test.png")
- >>> annotator = Annotator(im0, line_width=10)
- >>> annotator.box_label([10, 10, 100, 100], "person", (255, 0, 0))
- """
-
- def __init__(
- self,
- im,
- line_width: int | None = None,
- font_size: int | None = None,
- font: str = "Arial.ttf",
- pil: bool = False,
- example: str = "abc",
- ):
- """Initialize the Annotator class with image and line width along with color palette for keypoints and limbs."""
- non_ascii = not is_ascii(example) # non-latin labels, i.e. asian, arabic, cyrillic
- input_is_pil = isinstance(im, Image.Image)
- self.pil = pil or non_ascii or input_is_pil
- self.lw = line_width or max(round(sum(im.size if input_is_pil else im.shape) / 2 * 0.003), 2)
- if not input_is_pil:
- if im.shape[2] == 1: # handle grayscale
- im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)
- elif im.shape[2] > 3: # multispectral
- im = np.ascontiguousarray(im[..., :3])
- if self.pil: # use PIL
- self.im = im if input_is_pil else Image.fromarray(im)
- if self.im.mode not in {"RGB", "RGBA"}: # multispectral
- self.im = self.im.convert("RGB")
- self.draw = ImageDraw.Draw(self.im, "RGBA")
- try:
- font = check_font("Arial.Unicode.ttf" if non_ascii else font)
- size = font_size or max(round(sum(self.im.size) / 2 * 0.035), 12)
- self.font = ImageFont.truetype(str(font), size)
- except Exception:
- self.font = ImageFont.load_default()
- # Deprecation fix for w, h = getsize(string) -> _, _, w, h = getbbox(string)
- if check_version(pil_version, "9.2.0"):
- self.font.getsize = lambda x: self.font.getbbox(x)[2:4] # text width, height
- else: # use cv2
- assert im.data.contiguous, "Image not contiguous. Apply np.ascontiguousarray(im) to Annotator input images."
- self.im = im if im.flags.writeable else im.copy()
- self.tf = max(self.lw - 1, 1) # font thickness
- self.sf = self.lw / 3 # font scale
- # Pose
- self.skeleton = [
- [16, 14],
- [14, 12],
- [17, 15],
- [15, 13],
- [12, 13],
- [6, 12],
- [7, 13],
- [6, 7],
- [6, 8],
- [7, 9],
- [8, 10],
- [9, 11],
- [2, 3],
- [1, 2],
- [1, 3],
- [2, 4],
- [3, 5],
- [4, 6],
- [5, 7],
- ]
-
- self.limb_color = colors.pose_palette[[9, 9, 9, 9, 7, 7, 7, 0, 0, 0, 0, 0, 16, 16, 16, 16, 16, 16, 16]]
- self.kpt_color = colors.pose_palette[[16, 16, 16, 16, 16, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9]]
- self.dark_colors = {
- (235, 219, 11),
- (243, 243, 243),
- (183, 223, 0),
- (221, 111, 255),
- (0, 237, 204),
- (68, 243, 0),
- (255, 255, 0),
- (179, 255, 1),
- (11, 255, 162),
- }
- self.light_colors = {
- (255, 42, 4),
- (79, 68, 255),
- (255, 0, 189),
- (255, 180, 0),
- (186, 0, 221),
- (0, 192, 38),
- (255, 36, 125),
- (104, 0, 123),
- (108, 27, 255),
- (47, 109, 252),
- (104, 31, 17),
- }
-
- def get_txt_color(self, color: tuple = (128, 128, 128), txt_color: tuple = (255, 255, 255)) -> tuple:
- """
- Assign text color based on background color.
-
- Args:
- color (tuple, optional): The background color of the rectangle for text (B, G, R).
- txt_color (tuple, optional): The color of the text (R, G, B).
-
- Returns:
- (tuple): Text color for label.
-
- Examples:
- >>> from ultralytics.utils.plotting import Annotator
- >>> im0 = cv2.imread("test.png")
- >>> annotator = Annotator(im0, line_width=10)
- >>> annotator.get_txt_color(color=(104, 31, 17)) # return (255, 255, 255)
- """
- if color in self.dark_colors:
- return 104, 31, 17
- elif color in self.light_colors:
- return 255, 255, 255
- else:
- return txt_color
-
- def box_label(self, box, label: str = "", color: tuple = (128, 128, 128), txt_color: tuple = (255, 255, 255)):
- """
- Draw a bounding box on an image with a given label.
-
- Args:
- box (tuple): The bounding box coordinates (x1, y1, x2, y2).
- label (str, optional): The text label to be displayed.
- color (tuple, optional): The background color of the rectangle (B, G, R).
- txt_color (tuple, optional): The color of the text (R, G, B).
-
- Examples:
- >>> from ultralytics.utils.plotting import Annotator
- >>> im0 = cv2.imread("test.png")
- >>> annotator = Annotator(im0, line_width=10)
- >>> annotator.box_label(box=[10, 20, 30, 40], label="person")
- """
- txt_color = self.get_txt_color(color, txt_color)
- if isinstance(box, torch.Tensor):
- box = box.tolist()
-
- multi_points = isinstance(box[0], list) # multiple points with shape (n, 2)
- p1 = [int(b) for b in box[0]] if multi_points else (int(box[0]), int(box[1]))
- if self.pil:
- self.draw.polygon(
- [tuple(b) for b in box], width=self.lw, outline=color
- ) if multi_points else self.draw.rectangle(box, width=self.lw, outline=color)
- if label:
- w, h = self.font.getsize(label) # text width, height
- outside = p1[1] >= h # label fits outside box
- if p1[0] > self.im.size[0] - w: # size is (w, h), check if label extend beyond right side of image
- p1 = self.im.size[0] - w, p1[1]
- self.draw.rectangle(
- (p1[0], p1[1] - h if outside else p1[1], p1[0] + w + 1, p1[1] + 1 if outside else p1[1] + h + 1),
- fill=color,
- )
- # self.draw.text([box[0], box[1]], label, fill=txt_color, font=self.font, anchor='ls') # for PIL>8.0
- self.draw.text((p1[0], p1[1] - h if outside else p1[1]), label, fill=txt_color, font=self.font)
- else: # cv2
- cv2.polylines(
- self.im, [np.asarray(box, dtype=int)], True, color, self.lw
- ) if multi_points else cv2.rectangle(
- self.im, p1, (int(box[2]), int(box[3])), color, thickness=self.lw, lineType=cv2.LINE_AA
- )
- if label:
- w, h = cv2.getTextSize(label, 0, fontScale=self.sf, thickness=self.tf)[0] # text width, height
- h += 3 # add pixels to pad text
- outside = p1[1] >= h # label fits outside box
- if p1[0] > self.im.shape[1] - w: # shape is (h, w), check if label extend beyond right side of image
- p1 = self.im.shape[1] - w, p1[1]
- p2 = p1[0] + w, p1[1] - h if outside else p1[1] + h
- cv2.rectangle(self.im, p1, p2, color, -1, cv2.LINE_AA) # filled
- cv2.putText(
- self.im,
- label,
- (p1[0], p1[1] - 2 if outside else p1[1] + h - 1),
- 0,
- self.sf,
- txt_color,
- thickness=self.tf,
- lineType=cv2.LINE_AA,
- )
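
The cv2 branch above sizes the label background with cv2.getTextSize and flips it below the box when there is no room above. A self-contained sketch of that placement logic using only OpenCV primitives (all values are illustrative):

import cv2
import numpy as np

im = np.zeros((200, 200, 3), dtype=np.uint8)
box, label, color = (40, 60, 160, 140), "person 0.91", (255, 111, 221)
lw = 2
sf, tf = lw / 3, max(lw - 1, 1)  # font scale and thickness derived from line width

cv2.rectangle(im, box[:2], box[2:], color, lw, cv2.LINE_AA)
(w, h), _ = cv2.getTextSize(label, 0, fontScale=sf, thickness=tf)
h += 3  # pad the text height
p1 = (box[0], box[1])
outside = p1[1] >= h  # draw the label above the box only if it fits inside the image
p2 = (p1[0] + w, p1[1] - h if outside else p1[1] + h)
cv2.rectangle(im, p1, p2, color, -1, cv2.LINE_AA)  # filled label background
cv2.putText(im, label, (p1[0], p1[1] - 2 if outside else p1[1] + h - 1), 0, sf, (255, 255, 255), tf, cv2.LINE_AA)
cv2.imwrite("label_demo.jpg", im)
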
-
- def masks(self, masks, colors, im_gpu: torch.Tensor = None, alpha: float = 0.5, retina_masks: bool = False):
- """
- Plot masks on image.
-
- Args:
- masks (torch.Tensor | np.ndarray): Predicted masks with shape: [n, h, w]
- colors (list[list[int]]): Colors for predicted masks, [[r, g, b] * n]
- im_gpu (torch.Tensor | None): Image is in cuda, shape: [3, h, w], range: [0, 1]
- alpha (float, optional): Mask transparency: 0.0 fully transparent, 1.0 opaque.
- retina_masks (bool, optional): Whether to use high resolution masks or not.
- """
- if self.pil:
- # Convert to numpy first
- self.im = np.asarray(self.im).copy()
- if im_gpu is None:
- assert isinstance(masks, np.ndarray), "`masks` must be a np.ndarray if `im_gpu` is not provided."
- overlay = self.im.copy()
- for i, mask in enumerate(masks):
- overlay[mask.astype(bool)] = colors[i]
- self.im = cv2.addWeighted(self.im, 1 - alpha, overlay, alpha, 0)
- else:
- assert isinstance(masks, torch.Tensor), "`masks` must be a torch.Tensor if `im_gpu` is provided."
- if len(masks) == 0:
- self.im[:] = im_gpu.permute(1, 2, 0).contiguous().cpu().numpy() * 255
- if im_gpu.device != masks.device:
- im_gpu = im_gpu.to(masks.device)
- colors = torch.tensor(colors, device=masks.device, dtype=torch.float32) / 255.0 # shape(n,3)
- colors = colors[:, None, None] # shape(n,1,1,3)
- masks = masks.unsqueeze(3) # shape(n,h,w,1)
- masks_color = masks * (colors * alpha) # shape(n,h,w,3)
-
- inv_alpha_masks = (1 - masks * alpha).cumprod(0) # shape(n,h,w,1)
- mcs = masks_color.max(dim=0).values  # shape(h,w,3)
-
- im_gpu = im_gpu.flip(dims=[0]) # flip channel
- im_gpu = im_gpu.permute(1, 2, 0).contiguous() # shape(h,w,3)
- im_gpu = im_gpu * inv_alpha_masks[-1] + mcs
- im_mask = im_gpu * 255
- im_mask_np = im_mask.byte().cpu().numpy()
- self.im[:] = im_mask_np if retina_masks else ops.scale_image(im_mask_np, self.im.shape)
- if self.pil:
- # Convert im back to PIL and update draw
- self.fromarray(self.im)
-
- def kpts(
- self,
- kpts,
- shape: tuple = (640, 640),
- radius: int | None = None,
- kpt_line: bool = True,
- conf_thres: float = 0.25,
- kpt_color: tuple | None = None,
- ):
- """
- Plot keypoints on the image.
-
- Args:
- kpts (torch.Tensor): Keypoints, shape [17, 3] (x, y, confidence).
- shape (tuple, optional): Image shape (h, w).
- radius (int, optional): Keypoint radius.
- kpt_line (bool, optional): Draw lines between keypoints.
- conf_thres (float, optional): Confidence threshold.
- kpt_color (tuple, optional): Keypoint color (B, G, R).
-
- Note:
- - `kpt_line=True` currently only supports human pose plotting.
- - Modifies self.im in-place.
- - If self.pil is True, converts image to numpy array and back to PIL.
- """
- radius = radius if radius is not None else self.lw
- if self.pil:
- # Convert to numpy first
- self.im = np.asarray(self.im).copy()
- nkpt, ndim = kpts.shape
- is_pose = nkpt == 17 and ndim in {2, 3}
- kpt_line &= is_pose # `kpt_line=True` for now only supports human pose plotting
- for i, k in enumerate(kpts):
- color_k = kpt_color or (self.kpt_color[i].tolist() if is_pose else colors(i))
- x_coord, y_coord = k[0], k[1]
- if x_coord % shape[1] != 0 and y_coord % shape[0] != 0:
- if len(k) == 3:
- conf = k[2]
- if conf < conf_thres:
- continue
- cv2.circle(self.im, (int(x_coord), int(y_coord)), radius, color_k, -1, lineType=cv2.LINE_AA)
-
- if kpt_line:
- ndim = kpts.shape[-1]
- for i, sk in enumerate(self.skeleton):
- pos1 = (int(kpts[(sk[0] - 1), 0]), int(kpts[(sk[0] - 1), 1]))
- pos2 = (int(kpts[(sk[1] - 1), 0]), int(kpts[(sk[1] - 1), 1]))
- if ndim == 3:
- conf1 = kpts[(sk[0] - 1), 2]
- conf2 = kpts[(sk[1] - 1), 2]
- if conf1 < conf_thres or conf2 < conf_thres:
- continue
- if pos1[0] % shape[1] == 0 or pos1[1] % shape[0] == 0 or pos1[0] < 0 or pos1[1] < 0:
- continue
- if pos2[0] % shape[1] == 0 or pos2[1] % shape[0] == 0 or pos2[0] < 0 or pos2[1] < 0:
- continue
- cv2.line(
- self.im,
- pos1,
- pos2,
- kpt_color or self.limb_color[i].tolist(),
- thickness=int(np.ceil(self.lw / 2)),
- lineType=cv2.LINE_AA,
- )
- if self.pil:
- # Convert im back to PIL and update draw
- self.fromarray(self.im)
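-
- # Pose-plotting sketch with synthetic keypoints (values are random, not real predictions):
- # >>> import numpy as np, torch
- # >>> annotator = Annotator(np.zeros((640, 640, 3), dtype=np.uint8), line_width=2)
- # >>> kpts = torch.rand(17, 3) * torch.tensor([640.0, 640.0, 1.0])  # (x, y, conf) per keypoint
- # >>> annotator.kpts(kpts, shape=(640, 640), conf_thres=0.25)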
-
- def rectangle(self, xy, fill=None, outline=None, width: int = 1):
- """Add rectangle to image (PIL-only)."""
- self.draw.rectangle(xy, fill, outline, width)
-
- def text(self, xy, text: str, txt_color: tuple = (255, 255, 255), anchor: str = "top", box_color: tuple = ()):
- """
- Add text to an image using PIL or cv2.
-
- Args:
- xy (list[int]): Top-left coordinates for text placement.
- text (str): Text to be drawn.
- txt_color (tuple, optional): Text color (R, G, B).
- anchor (str, optional): Text anchor position ('top' or 'bottom').
- box_color (tuple, optional): Box color (R, G, B, A) with optional alpha.
- """
- if self.pil:
- w, h = self.font.getsize(text)
- if anchor == "bottom": # start y from font bottom
- xy[1] += 1 - h
- for line in text.split("\n"):
- if box_color:
- # Draw rectangle for each line
- w, h = self.font.getsize(line)
- self.draw.rectangle((xy[0], xy[1], xy[0] + w + 1, xy[1] + h + 1), fill=box_color)
- self.draw.text(xy, line, fill=txt_color, font=self.font)
- xy[1] += h
- else:
- if box_color:
- w, h = cv2.getTextSize(text, 0, fontScale=self.sf, thickness=self.tf)[0]
- h += 3 # add pixels to pad text
- outside = xy[1] >= h # label fits outside box
- p2 = xy[0] + w, xy[1] - h if outside else xy[1] + h
- cv2.rectangle(self.im, xy, p2, box_color, -1, cv2.LINE_AA) # filled
- cv2.putText(self.im, text, xy, 0, self.sf, txt_color, thickness=self.tf, lineType=cv2.LINE_AA)
-
- def fromarray(self, im):
- """Update self.im from a numpy array."""
- self.im = im if isinstance(im, Image.Image) else Image.fromarray(im)
- self.draw = ImageDraw.Draw(self.im)
-
- def result(self):
- """Return annotated image as array."""
- return np.asarray(self.im)
-
- def show(self, title: str | None = None):
- """Show the annotated image."""
- im = Image.fromarray(np.asarray(self.im)[..., ::-1]) # Convert BGR numpy array to RGB PIL Image
- if IS_COLAB or IS_KAGGLE: # can not use IS_JUPYTER as will run for all ipython environments
- try:
- display(im) # noqa - display() function only available in ipython environments
- except ImportError as e:
- LOGGER.warning(f"Unable to display image in Jupyter notebooks: {e}")
- else:
- im.show(title=title)
-
- def save(self, filename: str = "image.jpg"):
- """Save the annotated image to 'filename'."""
- cv2.imwrite(filename, np.asarray(self.im))
-
- @staticmethod
- def get_bbox_dimension(bbox: tuple | None = None):
- """
- Calculate the dimensions and area of a bounding box.
-
- Args:
- bbox (tuple): Bounding box coordinates in the format (x_min, y_min, x_max, y_max).
-
- Returns:
- width (float): Width of the bounding box.
- height (float): Height of the bounding box.
- area (float): Area enclosed by the bounding box.
-
- Examples:
- >>> from ultralytics.utils.plotting import Annotator
- >>> im0 = cv2.imread("test.png")
- >>> annotator = Annotator(im0, line_width=10)
- >>> annotator.get_bbox_dimension(bbox=[10, 20, 30, 40])
- """
- x_min, y_min, x_max, y_max = bbox
- width = x_max - x_min
- height = y_max - y_min
- return width, height, width * height
-
-
-@TryExcept()
-@plt_settings()
-def plot_labels(boxes, cls, names=(), save_dir=Path(""), on_plot=None):
- """
- Plot training labels including class histograms and box statistics.
-
- Args:
- boxes (np.ndarray): Bounding box coordinates in format [x, y, width, height].
- cls (np.ndarray): Class indices.
- names (dict, optional): Dictionary mapping class indices to class names.
- save_dir (Path, optional): Directory to save the plot.
- on_plot (Callable, optional): Function to call after plot is saved.
- """
- import matplotlib.pyplot as plt # scope for faster 'import ultralytics'
- import polars
- from matplotlib.colors import LinearSegmentedColormap
-
- # Filter matplotlib>=3.7.2 warning
- warnings.filterwarnings("ignore", category=UserWarning, message="The figure layout has changed to tight")
- warnings.filterwarnings("ignore", category=FutureWarning)
-
- # Plot dataset labels
- LOGGER.info(f"Plotting labels to {save_dir / 'labels.jpg'}... ")
- nc = int(cls.max() + 1) # number of classes
- boxes = boxes[:1000000] # limit to 1M boxes
- x = polars.DataFrame(boxes, schema=["x", "y", "width", "height"])
-
- # Matplotlib labels
- subplot_3_4_color = LinearSegmentedColormap.from_list("white_blue", ["white", "blue"])
- ax = plt.subplots(2, 2, figsize=(8, 8), tight_layout=True)[1].ravel()
- y = ax[0].hist(cls, bins=np.linspace(0, nc, nc + 1) - 0.5, rwidth=0.8)
- for i in range(nc):
- y[2].patches[i].set_color([x / 255 for x in colors(i)])
- ax[0].set_ylabel("instances")
- if 0 < len(names) < 30:
- ax[0].set_xticks(range(len(names)))
- ax[0].set_xticklabels(list(names.values()), rotation=90, fontsize=10)
- ax[0].bar_label(y[2])
- else:
- ax[0].set_xlabel("classes")
- boxes = np.column_stack([0.5 - boxes[:, 2:4] / 2, 0.5 + boxes[:, 2:4] / 2]) * 1000
- img = Image.fromarray(np.ones((1000, 1000, 3), dtype=np.uint8) * 255)
- for cls, box in zip(cls[:500], boxes[:500]):
- ImageDraw.Draw(img).rectangle(box.tolist(), width=1, outline=colors(cls)) # plot
- ax[1].imshow(img)
- ax[1].axis("off")
-
- ax[2].hist2d(x["x"], x["y"], bins=50, cmap=subplot_3_4_color)
- ax[2].set_xlabel("x")
- ax[2].set_ylabel("y")
- ax[3].hist2d(x["width"], x["height"], bins=50, cmap=subplot_3_4_color)
- ax[3].set_xlabel("width")
- ax[3].set_ylabel("height")
- for a in {0, 1, 2, 3}:
- for s in {"top", "right", "left", "bottom"}:
- ax[a].spines[s].set_visible(False)
-
- fname = save_dir / "labels.jpg"
- plt.savefig(fname, dpi=200)
- plt.close()
- if on_plot:
- on_plot(fname)
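-
-# Hypothetical call with random normalized xywh labels (purely illustrative data; writes
-# labels.jpg into the current directory):
-# >>> import numpy as np
-# >>> boxes = np.random.uniform(0.2, 0.8, (100, 4)).astype(np.float32)  # [x, y, w, h], normalized
-# >>> cls = np.random.randint(0, 3, 100)
-# >>> plot_labels(boxes, cls, names={0: "person", 1: "car", 2: "dog"}, save_dir=Path("."))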
-
-
-def save_one_box(
- xyxy,
- im,
- file: Path = Path("im.jpg"),
- gain: float = 1.02,
- pad: int = 10,
- square: bool = False,
- BGR: bool = False,
- save: bool = True,
-):
- """
- Save an image crop as {file}, with the crop size scaled by {gain} and padded by {pad} pixels. Save and/or return the crop.
-
- This function takes a bounding box and an image, and then saves a cropped portion of the image according
- to the bounding box. Optionally, the crop can be squared, and the function allows for gain and padding
- adjustments to the bounding box.
-
- Args:
- xyxy (torch.Tensor | list): A tensor or list representing the bounding box in xyxy format.
- im (np.ndarray): The input image.
- file (Path, optional): The path where the cropped image will be saved.
- gain (float, optional): A multiplicative factor to increase the size of the bounding box.
- pad (int, optional): The number of pixels to add to the width and height of the bounding box.
- square (bool, optional): If True, the bounding box will be transformed into a square.
- BGR (bool, optional): If True, the image will be returned in BGR format, otherwise in RGB.
- save (bool, optional): If True, the cropped image will be saved to disk.
-
- Returns:
- (np.ndarray): The cropped image.
-
- Examples:
- >>> from ultralytics.utils.plotting import save_one_box
- >>> xyxy = [50, 50, 150, 150]
- >>> im = cv2.imread("image.jpg")
- >>> cropped_im = save_one_box(xyxy, im, file="cropped.jpg", square=True)
- """
- if not isinstance(xyxy, torch.Tensor): # may be list
- xyxy = torch.stack(xyxy)
- b = ops.xyxy2xywh(xyxy.view(-1, 4)) # boxes
- if square:
- b[:, 2:] = b[:, 2:].max(1)[0].unsqueeze(1) # attempt rectangle to square
- b[:, 2:] = b[:, 2:] * gain + pad # box wh * gain + pad
- xyxy = ops.xywh2xyxy(b).long()
- xyxy = ops.clip_boxes(xyxy, im.shape)
- grayscale = im.shape[2] == 1 # grayscale image
- crop = im[int(xyxy[0, 1]) : int(xyxy[0, 3]), int(xyxy[0, 0]) : int(xyxy[0, 2]), :: (1 if BGR or grayscale else -1)]
- if save:
- file.parent.mkdir(parents=True, exist_ok=True) # make directory
- f = str(increment_path(file).with_suffix(".jpg"))
- # cv2.imwrite(f, crop) # save BGR, https://github.com/ultralytics/yolov5/issues/7007 chroma subsampling issue
- crop = crop.squeeze(-1) if grayscale else crop[..., ::-1] if BGR else crop
- Image.fromarray(crop).save(f, quality=95, subsampling=0) # save RGB
- return crop
-
-
-@threaded
-def plot_images(
- labels: dict[str, Any],
- images: torch.Tensor | np.ndarray = np.zeros((0, 3, 640, 640), dtype=np.float32),
- paths: list[str] | None = None,
- fname: str = "images.jpg",
- names: dict[int, str] | None = None,
- on_plot: Callable | None = None,
- max_size: int = 1920,
- max_subplots: int = 16,
- save: bool = True,
- conf_thres: float = 0.25,
-) -> np.ndarray | None:
- """
- Plot image grid with labels, bounding boxes, masks, and keypoints.
-
- Args:
- labels (dict[str, Any]): Dictionary containing detection data with keys like 'cls', 'bboxes', 'conf', 'masks', 'keypoints', 'batch_idx', 'img'.
- images (torch.Tensor | np.ndarray): Batch of images to plot. Shape: (batch_size, channels, height, width).
- paths (Optional[list[str]]): List of file paths for each image in the batch.
- fname (str): Output filename for the plotted image grid.
- names (Optional[dict[int, str]]): Dictionary mapping class indices to class names.
- on_plot (Optional[Callable]): Optional callback function to be called after saving the plot.
- max_size (int): Maximum size of the output image grid.
- max_subplots (int): Maximum number of subplots in the image grid.
- save (bool): Whether to save the plotted image grid to a file.
- conf_thres (float): Confidence threshold for displaying detections.
-
- Returns:
- (np.ndarray): Plotted image grid as a numpy array if save is False, None otherwise.
-
- Note:
- This function supports both tensor and numpy array inputs. It will automatically
- convert tensor inputs to numpy arrays for processing.
- """
- for k in {"cls", "bboxes", "conf", "masks", "keypoints", "batch_idx", "images"}:
- if k not in labels:
- continue
- if k == "cls" and labels[k].ndim == 2:
- labels[k] = labels[k].squeeze(1) # squeeze if shape is (n, 1)
- if isinstance(labels[k], torch.Tensor):
- labels[k] = labels[k].cpu().numpy()
-
- cls = labels.get("cls", np.zeros(0, dtype=np.int64))
- batch_idx = labels.get("batch_idx", np.zeros(cls.shape, dtype=np.int64))
- bboxes = labels.get("bboxes", np.zeros(0, dtype=np.float32))
- confs = labels.get("conf", None)
- masks = labels.get("masks", np.zeros(0, dtype=np.uint8))
- kpts = labels.get("keypoints", np.zeros(0, dtype=np.float32))
- images = labels.get("img", images) # default to input images
-
- if len(images) and isinstance(images, torch.Tensor):
- images = images.cpu().float().numpy()
- if images.shape[1] > 3:
- images = images[:, :3] # crop multispectral images to first 3 channels
-
- bs, _, h, w = images.shape # batch size, _, height, width
- bs = min(bs, max_subplots) # limit plot images
- ns = np.ceil(bs**0.5) # number of subplots (square)
- if np.max(images[0]) <= 1:
- images *= 255 # de-normalise (optional)
-
- # Build Image
- mosaic = np.full((int(ns * h), int(ns * w), 3), 255, dtype=np.uint8) # init
- for i in range(bs):
- x, y = int(w * (i // ns)), int(h * (i % ns)) # block origin
- mosaic[y : y + h, x : x + w, :] = images[i].transpose(1, 2, 0)
-
- # Resize (optional)
- scale = max_size / ns / max(h, w)
- if scale < 1:
- h = math.ceil(scale * h)
- w = math.ceil(scale * w)
- mosaic = cv2.resize(mosaic, tuple(int(x * ns) for x in (w, h)))
-
- # Annotate
- fs = int((h + w) * ns * 0.01) # font size
- fs = max(fs, 18) # ensure that the font size is large enough to be easily readable.
- annotator = Annotator(mosaic, line_width=round(fs / 10), font_size=fs, pil=True, example=str(names))
- for i in range(bs):
- x, y = int(w * (i // ns)), int(h * (i % ns)) # block origin
- annotator.rectangle([x, y, x + w, y + h], None, (255, 255, 255), width=2) # borders
- if paths:
- annotator.text([x + 5, y + 5], text=Path(paths[i]).name[:40], txt_color=(220, 220, 220)) # filenames
- if len(cls) > 0:
- idx = batch_idx == i
- classes = cls[idx].astype("int")
- labels = confs is None
-
- if len(bboxes):
- boxes = bboxes[idx]
- conf = confs[idx] if confs is not None else None # check for confidence presence (label vs pred)
- if len(boxes):
- if boxes[:, :4].max() <= 1.1: # if normalized with tolerance 0.1
- boxes[..., [0, 2]] *= w # scale to pixels
- boxes[..., [1, 3]] *= h
- elif scale < 1: # absolute coords need scale if image scales
- boxes[..., :4] *= scale
- boxes[..., 0] += x
- boxes[..., 1] += y
- is_obb = boxes.shape[-1] == 5 # xywhr
- # TODO: this transformation might be unnecessary
- boxes = ops.xywhr2xyxyxyxy(boxes) if is_obb else ops.xywh2xyxy(boxes)
- for j, box in enumerate(boxes.astype(np.int64).tolist()):
- c = classes[j]
- color = colors(c)
- c = names.get(c, c) if names else c
- if labels or conf[j] > conf_thres:
- label = f"{c}" if labels else f"{c} {conf[j]:.1f}"
- annotator.box_label(box, label, color=color)
-
- elif len(classes):
- for c in classes:
- color = colors(c)
- c = names.get(c, c) if names else c
- annotator.text([x, y], f"{c}", txt_color=color, box_color=(64, 64, 64, 128))
-
- # Plot keypoints
- if len(kpts):
- kpts_ = kpts[idx].copy()
- if len(kpts_):
- if kpts_[..., 0].max() <= 1.01 or kpts_[..., 1].max() <= 1.01: # if normalized with tolerance .01
- kpts_[..., 0] *= w # scale to pixels
- kpts_[..., 1] *= h
- elif scale < 1: # absolute coords need scale if image scales
- kpts_ *= scale
- kpts_[..., 0] += x
- kpts_[..., 1] += y
- for j in range(len(kpts_)):
- if labels or conf[j] > conf_thres:
- annotator.kpts(kpts_[j], conf_thres=conf_thres)
-
- # Plot masks
- if len(masks):
- if idx.shape[0] == masks.shape[0] and masks.max() <= 1: # overlap_mask=False
- image_masks = masks[idx]
- else: # overlap_mask=True
- image_masks = masks[[i]] # (1, 640, 640)
- nl = idx.sum()
- index = np.arange(1, nl + 1).reshape((nl, 1, 1))
- image_masks = (image_masks == index).astype(np.float32)
-
- im = np.asarray(annotator.im).copy()
- for j in range(len(image_masks)):
- if labels or conf[j] > conf_thres:
- color = colors(classes[j])
- mh, mw = image_masks[j].shape
- if mh != h or mw != w:
- mask = image_masks[j].astype(np.uint8)
- mask = cv2.resize(mask, (w, h))
- mask = mask.astype(bool)
- else:
- mask = image_masks[j].astype(bool)
- try:
- im[y : y + h, x : x + w, :][mask] = (
- im[y : y + h, x : x + w, :][mask] * 0.4 + np.array(color) * 0.6
- )
- except Exception:
- pass
- annotator.fromarray(im)
- if not save:
- return np.asarray(annotator.im)
- annotator.im.save(fname) # save
- if on_plot:
- on_plot(fname)
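-
-# Hypothetical call on a random two-image batch; every value below is dummy data and the
-# output filename is arbitrary:
-# >>> import numpy as np, torch
-# >>> labels = {
-# ...     "img": torch.rand(2, 3, 640, 640),  # 0-1 images are de-normalized internally
-# ...     "batch_idx": np.array([0, 1]),
-# ...     "cls": np.array([0, 1]),
-# ...     "bboxes": np.array([[0.5, 0.5, 0.4, 0.4], [0.3, 0.3, 0.2, 0.2]], dtype=np.float32),
-# ... }
-# >>> plot_images(labels, names={0: "person", 1: "car"}, fname="demo_batch.jpg")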
-
-
-@plt_settings()
-def plot_results(file: str = "path/to/results.csv", dir: str = "", on_plot: Callable | None = None):
- """
- Plot training results from a results CSV file. The function supports various types of data including segmentation,
- pose estimation, and classification. Plots are saved as 'results.png' in the directory where the CSV is located.
-
- Args:
- file (str, optional): Path to the CSV file containing the training results.
- dir (str, optional): Directory where the CSV file is located if 'file' is not provided.
- on_plot (callable, optional): Callback function to be executed after plotting. Takes filename as an argument.
-
- Examples:
- >>> from ultralytics.utils.plotting import plot_results
- >>> plot_results("path/to/results.csv")
- """
- import matplotlib.pyplot as plt # scope for faster 'import ultralytics'
- import polars as pl
- from scipy.ndimage import gaussian_filter1d
-
- save_dir = Path(file).parent if file else Path(dir)
- files = list(save_dir.glob("results*.csv"))
- assert len(files), f"No results.csv files found in {save_dir.resolve()}, nothing to plot."
-
- loss_keys, metric_keys = [], []
- for i, f in enumerate(files):
- try:
- data = pl.read_csv(f, infer_schema_length=None)
- if i == 0:
- for c in data.columns:
- if "loss" in c:
- loss_keys.append(c)
- elif "metric" in c:
- metric_keys.append(c)
- loss_mid, metric_mid = len(loss_keys) // 2, len(metric_keys) // 2
- columns = (
- loss_keys[:loss_mid] + metric_keys[:metric_mid] + loss_keys[loss_mid:] + metric_keys[metric_mid:]
- )
- fig, ax = plt.subplots(2, len(columns) // 2, figsize=(len(columns) + 2, 6), tight_layout=True)
- ax = ax.ravel()
- x = data.select(data.columns[0]).to_numpy().flatten()
- for i, j in enumerate(columns):
- y = data.select(j).to_numpy().flatten().astype("float")
- ax[i].plot(x, y, marker=".", label=f.stem, linewidth=2, markersize=8) # actual results
- ax[i].plot(x, gaussian_filter1d(y, sigma=3), ":", label="smooth", linewidth=2) # smoothing line
- ax[i].set_title(j, fontsize=12)
- except Exception as e:
- LOGGER.error(f"Plotting error for {f}: {e}")
- ax[1].legend()
- fname = save_dir / "results.png"
- fig.savefig(fname, dpi=200)
- plt.close()
- if on_plot:
- on_plot(fname)
-
-
-def plt_color_scatter(v, f, bins: int = 20, cmap: str = "viridis", alpha: float = 0.8, edgecolors: str = "none"):
- """
- Plot a scatter plot with points colored based on a 2D histogram.
-
- Args:
- v (array-like): Values for the x-axis.
- f (array-like): Values for the y-axis.
- bins (int, optional): Number of bins for the histogram.
- cmap (str, optional): Colormap for the scatter plot.
- alpha (float, optional): Alpha for the scatter plot.
- edgecolors (str, optional): Edge colors for the scatter plot.
-
- Examples:
- >>> v = np.random.rand(100)
- >>> f = np.random.rand(100)
- >>> plt_color_scatter(v, f)
- """
- import matplotlib.pyplot as plt # scope for faster 'import ultralytics'
-
- # Calculate 2D histogram and corresponding colors
- hist, xedges, yedges = np.histogram2d(v, f, bins=bins)
- colors = [
- hist[
- min(np.digitize(v[i], xedges, right=True) - 1, hist.shape[0] - 1),
- min(np.digitize(f[i], yedges, right=True) - 1, hist.shape[1] - 1),
- ]
- for i in range(len(v))
- ]
-
- # Scatter plot
- plt.scatter(v, f, c=colors, cmap=cmap, alpha=alpha, edgecolors=edgecolors)
-
-
-@plt_settings()
-def plot_tune_results(csv_file: str = "tune_results.csv", exclude_zero_fitness_points: bool = True):
- """
- Plot the evolution results stored in a 'tune_results.csv' file. The function generates a scatter plot for each key
- in the CSV, color-coded based on fitness scores. The best-performing configurations are highlighted on the plots.
-
- Args:
- csv_file (str, optional): Path to the CSV file containing the tuning results.
- exclude_zero_fitness_points (bool, optional): Don't include points with zero fitness in tuning plots.
-
- Examples:
- >>> plot_tune_results("path/to/tune_results.csv")
- """
- import matplotlib.pyplot as plt # scope for faster 'import ultralytics'
- import polars as pl
- from scipy.ndimage import gaussian_filter1d
-
- def _save_one_file(file):
- """Save one matplotlib plot to 'file'."""
- plt.savefig(file, dpi=200)
- plt.close()
- LOGGER.info(f"Saved {file}")
-
- # Scatter plots for each hyperparameter
- csv_file = Path(csv_file)
- data = pl.read_csv(csv_file, infer_schema_length=None)
- num_metrics_columns = 1
- keys = [x.strip() for x in data.columns][num_metrics_columns:]
- x = data.to_numpy()
- fitness = x[:, 0] # fitness
- if exclude_zero_fitness_points:
- mask = fitness > 0 # exclude zero-fitness points
- x, fitness = x[mask], fitness[mask]
- j = np.argmax(fitness) # max fitness index
- n = math.ceil(len(keys) ** 0.5) # columns and rows in plot
- plt.figure(figsize=(10, 10), tight_layout=True)
- for i, k in enumerate(keys):
- v = x[:, i + num_metrics_columns]
- mu = v[j] # best single result
- plt.subplot(n, n, i + 1)
- plt_color_scatter(v, fitness, cmap="viridis", alpha=0.8, edgecolors="none")
- plt.plot(mu, fitness.max(), "k+", markersize=15)
- plt.title(f"{k} = {mu:.3g}", fontdict={"size": 9}) # mark best result for this hyperparameter
- plt.tick_params(axis="both", labelsize=8) # Set axis label size to 8
- if i % n != 0:
- plt.yticks([])
- _save_one_file(csv_file.with_name("tune_scatter_plots.png"))
-
- # Fitness vs iteration
- x = range(1, len(fitness) + 1)
- plt.figure(figsize=(10, 6), tight_layout=True)
- plt.plot(x, fitness, marker="o", linestyle="none", label="fitness")
- plt.plot(x, gaussian_filter1d(fitness, sigma=3), ":", label="smoothed", linewidth=2) # smoothing line
- plt.title("Fitness vs Iteration")
- plt.xlabel("Iteration")
- plt.ylabel("Fitness")
- plt.grid(True)
- plt.legend()
- _save_one_file(csv_file.with_name("tune_fitness.png"))
-
-
-@plt_settings()
-def feature_visualization(x, module_type: str, stage: int, n: int = 32, save_dir: Path = Path("runs/detect/exp")):
- """
- Visualize feature maps of a given model module during inference.
-
- Args:
- x (torch.Tensor): Features to be visualized.
- module_type (str): Module type.
- stage (int): Module stage within the model.
- n (int, optional): Maximum number of feature maps to plot.
- save_dir (Path, optional): Directory to save results.
- """
- import matplotlib.pyplot as plt # scope for faster 'import ultralytics'
-
- for m in {"Detect", "Segment", "Pose", "Classify", "OBB", "RTDETRDecoder"}: # all model heads
- if m in module_type:
- return
- if isinstance(x, torch.Tensor):
- _, channels, height, width = x.shape # batch, channels, height, width
- if height > 1 and width > 1:
- f = save_dir / f"stage{stage}_{module_type.rsplit('.', 1)[-1]}_features.png" # filename
-
- blocks = torch.chunk(x[0].cpu(), channels, dim=0) # select batch index 0, block by channels
- n = min(n, channels) # number of plots
- _, ax = plt.subplots(math.ceil(n / 8), 8, tight_layout=True) # n/8 rows x 8 cols
- ax = ax.ravel()
- plt.subplots_adjust(wspace=0.05, hspace=0.05)
- for i in range(n):
- ax[i].imshow(blocks[i].squeeze()) # cmap='gray'
- ax[i].axis("off")
-
- LOGGER.info(f"Saving {f}... ({n}/{channels})")
- plt.savefig(f, dpi=300, bbox_inches="tight")
- plt.close()
- np.save(str(f.with_suffix(".npy")), x[0].cpu().numpy()) # npy save
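-
-# Standalone sketch on a random feature map (normally this is called internally during
-# inference; the module_type string and save_dir here are illustrative):
-# >>> x = torch.rand(1, 64, 80, 80)  # batch of one, 64 channels
-# >>> feature_visualization(x, module_type="model.4.C2f", stage=4, n=16, save_dir=Path("."))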
diff --git a/ultralytics/utils/tal.py b/ultralytics/utils/tal.py
deleted file mode 100644
index 580ce2a..0000000
--- a/ultralytics/utils/tal.py
+++ /dev/null
@@ -1,417 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-import torch
-import torch.nn as nn
-
-from . import LOGGER
-from .metrics import bbox_iou, probiou
-from .ops import xywhr2xyxyxyxy
-from .torch_utils import TORCH_1_11
-
-
-class TaskAlignedAssigner(nn.Module):
- """
- A task-aligned assigner for object detection.
-
- This class assigns ground-truth (gt) objects to anchors based on the task-aligned metric, which combines both
- classification and localization information.
-
- Attributes:
- topk (int): The number of top candidates to consider.
- num_classes (int): The number of object classes.
- alpha (float): The alpha parameter for the classification component of the task-aligned metric.
- beta (float): The beta parameter for the localization component of the task-aligned metric.
- eps (float): A small value to prevent division by zero.
- """
-
- def __init__(self, topk: int = 13, num_classes: int = 80, alpha: float = 1.0, beta: float = 6.0, eps: float = 1e-9):
- """
- Initialize a TaskAlignedAssigner object with customizable hyperparameters.
-
- Args:
- topk (int, optional): The number of top candidates to consider.
- num_classes (int, optional): The number of object classes.
- alpha (float, optional): The alpha parameter for the classification component of the task-aligned metric.
- beta (float, optional): The beta parameter for the localization component of the task-aligned metric.
- eps (float, optional): A small value to prevent division by zero.
- """
- super().__init__()
- self.topk = topk
- self.num_classes = num_classes
- self.alpha = alpha
- self.beta = beta
- self.eps = eps
-
- @torch.no_grad()
- def forward(self, pd_scores, pd_bboxes, anc_points, gt_labels, gt_bboxes, mask_gt):
- """
- Compute the task-aligned assignment.
-
- Args:
- pd_scores (torch.Tensor): Predicted classification scores with shape (bs, num_total_anchors, num_classes).
- pd_bboxes (torch.Tensor): Predicted bounding boxes with shape (bs, num_total_anchors, 4).
- anc_points (torch.Tensor): Anchor points with shape (num_total_anchors, 2).
- gt_labels (torch.Tensor): Ground truth labels with shape (bs, n_max_boxes, 1).
- gt_bboxes (torch.Tensor): Ground truth boxes with shape (bs, n_max_boxes, 4).
- mask_gt (torch.Tensor): Mask for valid ground truth boxes with shape (bs, n_max_boxes, 1).
-
- Returns:
- target_labels (torch.Tensor): Target labels with shape (bs, num_total_anchors).
- target_bboxes (torch.Tensor): Target bounding boxes with shape (bs, num_total_anchors, 4).
- target_scores (torch.Tensor): Target scores with shape (bs, num_total_anchors, num_classes).
- fg_mask (torch.Tensor): Foreground mask with shape (bs, num_total_anchors).
- target_gt_idx (torch.Tensor): Target ground truth indices with shape (bs, num_total_anchors).
-
- References:
- https://github.com/Nioolek/PPYOLOE_pytorch/blob/master/ppyoloe/assigner/tal_assigner.py
- """
- self.bs = pd_scores.shape[0]
- self.n_max_boxes = gt_bboxes.shape[1]
- device = gt_bboxes.device
-
- if self.n_max_boxes == 0:
- return (
- torch.full_like(pd_scores[..., 0], self.num_classes),
- torch.zeros_like(pd_bboxes),
- torch.zeros_like(pd_scores),
- torch.zeros_like(pd_scores[..., 0]),
- torch.zeros_like(pd_scores[..., 0]),
- )
-
- try:
- return self._forward(pd_scores, pd_bboxes, anc_points, gt_labels, gt_bboxes, mask_gt)
- except torch.cuda.OutOfMemoryError:
- # Move tensors to CPU, compute, then move back to original device
- LOGGER.warning("CUDA OutOfMemoryError in TaskAlignedAssigner, using CPU")
- cpu_tensors = [t.cpu() for t in (pd_scores, pd_bboxes, anc_points, gt_labels, gt_bboxes, mask_gt)]
- result = self._forward(*cpu_tensors)
- return tuple(t.to(device) for t in result)
-
- def _forward(self, pd_scores, pd_bboxes, anc_points, gt_labels, gt_bboxes, mask_gt):
- """
- Compute the task-aligned assignment.
-
- Args:
- pd_scores (torch.Tensor): Predicted classification scores with shape (bs, num_total_anchors, num_classes).
- pd_bboxes (torch.Tensor): Predicted bounding boxes with shape (bs, num_total_anchors, 4).
- anc_points (torch.Tensor): Anchor points with shape (num_total_anchors, 2).
- gt_labels (torch.Tensor): Ground truth labels with shape (bs, n_max_boxes, 1).
- gt_bboxes (torch.Tensor): Ground truth boxes with shape (bs, n_max_boxes, 4).
- mask_gt (torch.Tensor): Mask for valid ground truth boxes with shape (bs, n_max_boxes, 1).
-
- Returns:
- target_labels (torch.Tensor): Target labels with shape (bs, num_total_anchors).
- target_bboxes (torch.Tensor): Target bounding boxes with shape (bs, num_total_anchors, 4).
- target_scores (torch.Tensor): Target scores with shape (bs, num_total_anchors, num_classes).
- fg_mask (torch.Tensor): Foreground mask with shape (bs, num_total_anchors).
- target_gt_idx (torch.Tensor): Target ground truth indices with shape (bs, num_total_anchors).
- """
- mask_pos, align_metric, overlaps = self.get_pos_mask(
- pd_scores, pd_bboxes, gt_labels, gt_bboxes, anc_points, mask_gt
- )
-
- target_gt_idx, fg_mask, mask_pos = self.select_highest_overlaps(mask_pos, overlaps, self.n_max_boxes)
-
- # Assigned target
- target_labels, target_bboxes, target_scores = self.get_targets(gt_labels, gt_bboxes, target_gt_idx, fg_mask)
-
- # Normalize
- align_metric *= mask_pos
- pos_align_metrics = align_metric.amax(dim=-1, keepdim=True) # b, max_num_obj
- pos_overlaps = (overlaps * mask_pos).amax(dim=-1, keepdim=True) # b, max_num_obj
- norm_align_metric = (align_metric * pos_overlaps / (pos_align_metrics + self.eps)).amax(-2).unsqueeze(-1)
- target_scores = target_scores * norm_align_metric
-
- return target_labels, target_bboxes, target_scores, fg_mask.bool(), target_gt_idx
-
- def get_pos_mask(self, pd_scores, pd_bboxes, gt_labels, gt_bboxes, anc_points, mask_gt):
- """
- Get positive mask for each ground truth box.
-
- Args:
- pd_scores (torch.Tensor): Predicted classification scores with shape (bs, num_total_anchors, num_classes).
- pd_bboxes (torch.Tensor): Predicted bounding boxes with shape (bs, num_total_anchors, 4).
- gt_labels (torch.Tensor): Ground truth labels with shape (bs, n_max_boxes, 1).
- gt_bboxes (torch.Tensor): Ground truth boxes with shape (bs, n_max_boxes, 4).
- anc_points (torch.Tensor): Anchor points with shape (num_total_anchors, 2).
- mask_gt (torch.Tensor): Mask for valid ground truth boxes with shape (bs, n_max_boxes, 1).
-
- Returns:
- mask_pos (torch.Tensor): Positive mask with shape (bs, max_num_obj, h*w).
- align_metric (torch.Tensor): Alignment metric with shape (bs, max_num_obj, h*w).
- overlaps (torch.Tensor): Overlaps between predicted and ground truth boxes with shape (bs, max_num_obj, h*w).
- """
- mask_in_gts = self.select_candidates_in_gts(anc_points, gt_bboxes)
- # Get anchor_align metric, (b, max_num_obj, h*w)
- align_metric, overlaps = self.get_box_metrics(pd_scores, pd_bboxes, gt_labels, gt_bboxes, mask_in_gts * mask_gt)
- # Get topk_metric mask, (b, max_num_obj, h*w)
- mask_topk = self.select_topk_candidates(align_metric, topk_mask=mask_gt.expand(-1, -1, self.topk).bool())
- # Merge all mask to a final mask, (b, max_num_obj, h*w)
- mask_pos = mask_topk * mask_in_gts * mask_gt
-
- return mask_pos, align_metric, overlaps
-
- def get_box_metrics(self, pd_scores, pd_bboxes, gt_labels, gt_bboxes, mask_gt):
- """
- Compute alignment metric given predicted and ground truth bounding boxes.
-
- Args:
- pd_scores (torch.Tensor): Predicted classification scores with shape (bs, num_total_anchors, num_classes).
- pd_bboxes (torch.Tensor): Predicted bounding boxes with shape (bs, num_total_anchors, 4).
- gt_labels (torch.Tensor): Ground truth labels with shape (bs, n_max_boxes, 1).
- gt_bboxes (torch.Tensor): Ground truth boxes with shape (bs, n_max_boxes, 4).
- mask_gt (torch.Tensor): Mask for valid ground truth boxes with shape (bs, n_max_boxes, h*w).
-
- Returns:
- align_metric (torch.Tensor): Alignment metric combining classification and localization.
- overlaps (torch.Tensor): IoU overlaps between predicted and ground truth boxes.
- """
- na = pd_bboxes.shape[-2]
- mask_gt = mask_gt.bool() # b, max_num_obj, h*w
- overlaps = torch.zeros([self.bs, self.n_max_boxes, na], dtype=pd_bboxes.dtype, device=pd_bboxes.device)
- bbox_scores = torch.zeros([self.bs, self.n_max_boxes, na], dtype=pd_scores.dtype, device=pd_scores.device)
-
- ind = torch.zeros([2, self.bs, self.n_max_boxes], dtype=torch.long) # 2, b, max_num_obj
- ind[0] = torch.arange(end=self.bs).view(-1, 1).expand(-1, self.n_max_boxes) # b, max_num_obj
- ind[1] = gt_labels.squeeze(-1) # b, max_num_obj
- # Get the scores of each grid for each gt cls
- bbox_scores[mask_gt] = pd_scores[ind[0], :, ind[1]][mask_gt] # b, max_num_obj, h*w
-
- # (b, max_num_obj, 1, 4), (b, 1, h*w, 4)
- pd_boxes = pd_bboxes.unsqueeze(1).expand(-1, self.n_max_boxes, -1, -1)[mask_gt]
- gt_boxes = gt_bboxes.unsqueeze(2).expand(-1, -1, na, -1)[mask_gt]
- overlaps[mask_gt] = self.iou_calculation(gt_boxes, pd_boxes)
-
- align_metric = bbox_scores.pow(self.alpha) * overlaps.pow(self.beta)
- return align_metric, overlaps
-
- def iou_calculation(self, gt_bboxes, pd_bboxes):
- """
- Calculate IoU for horizontal bounding boxes.
-
- Args:
- gt_bboxes (torch.Tensor): Ground truth boxes.
- pd_bboxes (torch.Tensor): Predicted boxes.
-
- Returns:
- (torch.Tensor): IoU values between each pair of boxes.
- """
- return bbox_iou(gt_bboxes, pd_bboxes, xywh=False, CIoU=True).squeeze(-1).clamp_(0)
-
- def select_topk_candidates(self, metrics, topk_mask=None):
- """
- Select the top-k candidates based on the given metrics.
-
- Args:
- metrics (torch.Tensor): A tensor of shape (b, max_num_obj, h*w), where b is the batch size, max_num_obj is
- the maximum number of objects, and h*w represents the total number of anchor points.
- topk_mask (torch.Tensor, optional): An optional boolean tensor of shape (b, max_num_obj, topk), where
- topk is the number of top candidates to consider. If not provided, the top-k values are automatically
- computed based on the given metrics.
-
- Returns:
- (torch.Tensor): A tensor of shape (b, max_num_obj, h*w) containing the selected top-k candidates.
- """
- # (b, max_num_obj, topk)
- topk_metrics, topk_idxs = torch.topk(metrics, self.topk, dim=-1, largest=True)
- if topk_mask is None:
- topk_mask = (topk_metrics.max(-1, keepdim=True)[0] > self.eps).expand_as(topk_idxs)
- # (b, max_num_obj, topk)
- topk_idxs.masked_fill_(~topk_mask, 0)
-
- # (b, max_num_obj, topk, h*w) -> (b, max_num_obj, h*w)
- count_tensor = torch.zeros(metrics.shape, dtype=torch.int8, device=topk_idxs.device)
- ones = torch.ones_like(topk_idxs[:, :, :1], dtype=torch.int8, device=topk_idxs.device)
- for k in range(self.topk):
- # Expand topk_idxs for each value of k and add 1 at the specified positions
- count_tensor.scatter_add_(-1, topk_idxs[:, :, k : k + 1], ones)
- # Filter invalid bboxes
- count_tensor.masked_fill_(count_tensor > 1, 0)
-
- return count_tensor.to(metrics.dtype)
-
- def get_targets(self, gt_labels, gt_bboxes, target_gt_idx, fg_mask):
- """
- Compute target labels, target bounding boxes, and target scores for the positive anchor points.
-
- Args:
- gt_labels (torch.Tensor): Ground truth labels of shape (b, max_num_obj, 1), where b is the
- batch size and max_num_obj is the maximum number of objects.
- gt_bboxes (torch.Tensor): Ground truth bounding boxes of shape (b, max_num_obj, 4).
- target_gt_idx (torch.Tensor): Indices of the assigned ground truth objects for positive
- anchor points, with shape (b, h*w), where h*w is the total
- number of anchor points.
- fg_mask (torch.Tensor): A boolean tensor of shape (b, h*w) indicating the positive
- (foreground) anchor points.
-
- Returns:
- target_labels (torch.Tensor): Target labels for positive anchor points with shape (b, h*w).
- target_bboxes (torch.Tensor): Target bounding boxes for positive anchor points with shape (b, h*w, 4).
- target_scores (torch.Tensor): Target scores for positive anchor points with shape (b, h*w, num_classes).
- """
- # Assigned target labels, (b, 1)
- batch_ind = torch.arange(end=self.bs, dtype=torch.int64, device=gt_labels.device)[..., None]
- target_gt_idx = target_gt_idx + batch_ind * self.n_max_boxes # (b, h*w)
- target_labels = gt_labels.long().flatten()[target_gt_idx] # (b, h*w)
-
- # Assigned target boxes, (b, max_num_obj, 4) -> (b, h*w, 4)
- target_bboxes = gt_bboxes.view(-1, gt_bboxes.shape[-1])[target_gt_idx]
-
- # Assigned target scores
- target_labels.clamp_(0)
-
- # 10x faster than F.one_hot()
- target_scores = torch.zeros(
- (target_labels.shape[0], target_labels.shape[1], self.num_classes),
- dtype=torch.int64,
- device=target_labels.device,
- ) # (b, h*w, 80)
- target_scores.scatter_(2, target_labels.unsqueeze(-1), 1)
-
- fg_scores_mask = fg_mask[:, :, None].repeat(1, 1, self.num_classes) # (b, h*w, 80)
- target_scores = torch.where(fg_scores_mask > 0, target_scores, 0)
-
- return target_labels, target_bboxes, target_scores
-
- @staticmethod
- def select_candidates_in_gts(xy_centers, gt_bboxes, eps=1e-9):
- """
- Select positive anchor centers within ground truth bounding boxes.
-
- Args:
- xy_centers (torch.Tensor): Anchor center coordinates, shape (h*w, 2).
- gt_bboxes (torch.Tensor): Ground truth bounding boxes, shape (b, n_boxes, 4).
- eps (float, optional): Small value for numerical stability.
-
- Returns:
- (torch.Tensor): Boolean mask of positive anchors, shape (b, n_boxes, h*w).
-
- Note:
- b: batch size, n_boxes: number of ground truth boxes, h: height, w: width.
- Bounding box format: [x_min, y_min, x_max, y_max].
- """
- n_anchors = xy_centers.shape[0]
- bs, n_boxes, _ = gt_bboxes.shape
- lt, rb = gt_bboxes.view(-1, 1, 4).chunk(2, 2) # left-top, right-bottom
- bbox_deltas = torch.cat((xy_centers[None] - lt, rb - xy_centers[None]), dim=2).view(bs, n_boxes, n_anchors, -1)
- return bbox_deltas.amin(3).gt_(eps)
-
- @staticmethod
- def select_highest_overlaps(mask_pos, overlaps, n_max_boxes):
- """
- Select anchor boxes with highest IoU when assigned to multiple ground truths.
-
- Args:
- mask_pos (torch.Tensor): Positive mask, shape (b, n_max_boxes, h*w).
- overlaps (torch.Tensor): IoU overlaps, shape (b, n_max_boxes, h*w).
- n_max_boxes (int): Maximum number of ground truth boxes.
-
- Returns:
- target_gt_idx (torch.Tensor): Indices of assigned ground truths, shape (b, h*w).
- fg_mask (torch.Tensor): Foreground mask, shape (b, h*w).
- mask_pos (torch.Tensor): Updated positive mask, shape (b, n_max_boxes, h*w).
- """
- # Convert (b, n_max_boxes, h*w) -> (b, h*w)
- fg_mask = mask_pos.sum(-2)
- if fg_mask.max() > 1: # one anchor is assigned to multiple gt_bboxes
- mask_multi_gts = (fg_mask.unsqueeze(1) > 1).expand(-1, n_max_boxes, -1) # (b, n_max_boxes, h*w)
- max_overlaps_idx = overlaps.argmax(1) # (b, h*w)
-
- is_max_overlaps = torch.zeros(mask_pos.shape, dtype=mask_pos.dtype, device=mask_pos.device)
- is_max_overlaps.scatter_(1, max_overlaps_idx.unsqueeze(1), 1)
-
- mask_pos = torch.where(mask_multi_gts, is_max_overlaps, mask_pos).float() # (b, n_max_boxes, h*w)
- fg_mask = mask_pos.sum(-2)
- # Find which gt (index) each grid cell serves
- target_gt_idx = mask_pos.argmax(-2) # (b, h*w)
- return target_gt_idx, fg_mask, mask_pos
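-
- # Standalone sketch with random, synthetic predictions and targets; shapes follow the
- # docstrings above and every value is a placeholder:
- # >>> assigner = TaskAlignedAssigner(topk=10, num_classes=5)
- # >>> bs, na, n_max = 2, 400, 3
- # >>> xy = torch.rand(bs, na, 2) * 600
- # >>> pd_bboxes = torch.cat([xy, xy + 40], dim=-1)  # valid xyxy predictions
- # >>> gxy = torch.rand(bs, n_max, 2) * 560
- # >>> gt_bboxes = torch.cat([gxy, gxy + 80], dim=-1)  # valid xyxy ground truth
- # >>> out = assigner(
- # ...     torch.rand(bs, na, 5),  # pd_scores
- # ...     pd_bboxes,
- # ...     torch.rand(na, 2) * 640,  # anc_points
- # ...     torch.randint(0, 5, (bs, n_max, 1)),  # gt_labels
- # ...     gt_bboxes,
- # ...     torch.ones(bs, n_max, 1),  # mask_gt
- # ... )
- # >>> [t.shape for t in out]  # labels, boxes, scores, fg_mask, target_gt_idx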
-
-
-class RotatedTaskAlignedAssigner(TaskAlignedAssigner):
- """Assigns ground-truth objects to rotated bounding boxes using a task-aligned metric."""
-
- def iou_calculation(self, gt_bboxes, pd_bboxes):
- """Calculate IoU for rotated bounding boxes."""
- return probiou(gt_bboxes, pd_bboxes).squeeze(-1).clamp_(0)
-
- @staticmethod
- def select_candidates_in_gts(xy_centers, gt_bboxes):
- """
- Select the positive anchor center in gt for rotated bounding boxes.
-
- Args:
- xy_centers (torch.Tensor): Anchor center coordinates with shape (h*w, 2).
- gt_bboxes (torch.Tensor): Ground truth bounding boxes with shape (b, n_boxes, 5).
-
- Returns:
- (torch.Tensor): Boolean mask of positive anchors with shape (b, n_boxes, h*w).
- """
- # (b, n_boxes, 5) --> (b, n_boxes, 4, 2)
- corners = xywhr2xyxyxyxy(gt_bboxes)
- # (b, n_boxes, 1, 2)
- a, b, _, d = corners.split(1, dim=-2)
- ab = b - a
- ad = d - a
-
- # (b, n_boxes, h*w, 2)
- ap = xy_centers - a
- norm_ab = (ab * ab).sum(dim=-1)
- norm_ad = (ad * ad).sum(dim=-1)
- ap_dot_ab = (ap * ab).sum(dim=-1)
- ap_dot_ad = (ap * ad).sum(dim=-1)
- return (ap_dot_ab >= 0) & (ap_dot_ab <= norm_ab) & (ap_dot_ad >= 0) & (ap_dot_ad <= norm_ad) # is_in_box
-
-
-def make_anchors(feats, strides, grid_cell_offset=0.5):
- """Generate anchors from features."""
- anchor_points, stride_tensor = [], []
- assert feats is not None
- dtype, device = feats[0].dtype, feats[0].device
- for i, stride in enumerate(strides):
- h, w = feats[i].shape[2:] if isinstance(feats, list) else (int(feats[i][0]), int(feats[i][1]))
- sx = torch.arange(end=w, device=device, dtype=dtype) + grid_cell_offset # shift x
- sy = torch.arange(end=h, device=device, dtype=dtype) + grid_cell_offset # shift y
- sy, sx = torch.meshgrid(sy, sx, indexing="ij") if TORCH_1_11 else torch.meshgrid(sy, sx)
- anchor_points.append(torch.stack((sx, sy), -1).view(-1, 2))
- stride_tensor.append(torch.full((h * w, 1), stride, dtype=dtype, device=device))
- return torch.cat(anchor_points), torch.cat(stride_tensor)
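-
-# Quick sketch with three dummy feature maps at strides 8/16/32 (shapes chosen arbitrarily):
-# >>> feats = [torch.zeros(1, 64, 80, 80), torch.zeros(1, 64, 40, 40), torch.zeros(1, 64, 20, 20)]
-# >>> anchor_points, stride_tensor = make_anchors(feats, strides=[8, 16, 32])
-# >>> anchor_points.shape, stride_tensor.shape  # (8400, 2) and (8400, 1)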
-
-
-def dist2bbox(distance, anchor_points, xywh=True, dim=-1):
- """Transform distance(ltrb) to box(xywh or xyxy)."""
- lt, rb = distance.chunk(2, dim)
- x1y1 = anchor_points - lt
- x2y2 = anchor_points + rb
- if xywh:
- c_xy = (x1y1 + x2y2) / 2
- wh = x2y2 - x1y1
- return torch.cat([c_xy, wh], dim) # xywh bbox
- return torch.cat((x1y1, x2y2), dim) # xyxy bbox
-
-
-def bbox2dist(anchor_points, bbox, reg_max):
- """Transform bbox(xyxy) to dist(ltrb)."""
- x1y1, x2y2 = bbox.chunk(2, -1)
- return torch.cat((anchor_points - x1y1, x2y2 - anchor_points), -1).clamp_(0, reg_max - 0.01) # dist (lt, rb)
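-
-# Round-trip sketch: one box converted to ltrb distances and back (values picked arbitrarily):
-# >>> anchor = torch.tensor([[50.0, 50.0]])
-# >>> box = torch.tensor([[40.0, 30.0, 70.0, 90.0]])  # xyxy
-# >>> bbox2dist(anchor, box, reg_max=100)  # tensor([[10., 20., 20., 40.]])
-# >>> dist2bbox(bbox2dist(anchor, box, reg_max=100), anchor, xywh=False)  # original xyxy box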
-
-
-def dist2rbox(pred_dist, pred_angle, anchor_points, dim=-1):
- """
- Decode predicted rotated bounding box coordinates from anchor points and distribution.
-
- Args:
- pred_dist (torch.Tensor): Predicted rotated distance with shape (bs, h*w, 4).
- pred_angle (torch.Tensor): Predicted angle with shape (bs, h*w, 1).
- anchor_points (torch.Tensor): Anchor points with shape (h*w, 2).
- dim (int, optional): Dimension along which to split.
-
- Returns:
- (torch.Tensor): Predicted rotated bounding boxes with shape (bs, h*w, 4).
- """
- lt, rb = pred_dist.split(2, dim=dim)
- cos, sin = torch.cos(pred_angle), torch.sin(pred_angle)
- # (bs, h*w, 1)
- xf, yf = ((rb - lt) / 2).split(1, dim=dim)
- x, y = xf * cos - yf * sin, xf * sin + yf * cos
- xy = torch.cat([x, y], dim=dim) + anchor_points
- return torch.cat([xy, lt + rb], dim=dim)
diff --git a/ultralytics/utils/torch_utils.py b/ultralytics/utils/torch_utils.py
deleted file mode 100644
index 2b757fb..0000000
--- a/ultralytics/utils/torch_utils.py
+++ /dev/null
@@ -1,1010 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from __future__ import annotations
-
-import functools
-import gc
-import math
-import os
-import random
-import time
-from contextlib import contextmanager
-from copy import deepcopy
-from datetime import datetime
-from pathlib import Path
-from typing import Any
-
-import numpy as np
-import torch
-import torch.distributed as dist
-import torch.nn as nn
-import torch.nn.functional as F
-
-from ultralytics import __version__
-from ultralytics.utils import (
- DEFAULT_CFG_DICT,
- DEFAULT_CFG_KEYS,
- LOGGER,
- NUM_THREADS,
- PYTHON_VERSION,
- TORCH_VERSION,
- TORCHVISION_VERSION,
- WINDOWS,
- colorstr,
-)
-from ultralytics.utils.checks import check_version
-from ultralytics.utils.cpu import CPUInfo
-from ultralytics.utils.patches import torch_load
-
-# Version checks (all default to version>=min_version)
-TORCH_1_9 = check_version(TORCH_VERSION, "1.9.0")
-TORCH_1_10 = check_version(TORCH_VERSION, "1.10.0")
-TORCH_1_11 = check_version(TORCH_VERSION, "1.11.0")
-TORCH_1_13 = check_version(TORCH_VERSION, "1.13.0")
-TORCH_2_0 = check_version(TORCH_VERSION, "2.0.0")
-TORCH_2_1 = check_version(TORCH_VERSION, "2.1.0")
-TORCH_2_4 = check_version(TORCH_VERSION, "2.4.0")
-TORCHVISION_0_10 = check_version(TORCHVISION_VERSION, "0.10.0")
-TORCHVISION_0_11 = check_version(TORCHVISION_VERSION, "0.11.0")
-TORCHVISION_0_13 = check_version(TORCHVISION_VERSION, "0.13.0")
-TORCHVISION_0_18 = check_version(TORCHVISION_VERSION, "0.18.0")
-if WINDOWS and check_version(TORCH_VERSION, "==2.4.0"): # reject version 2.4.0 on Windows
- LOGGER.warning(
- "Known issue with torch==2.4.0 on Windows with CPU, recommend upgrading to torch>=2.4.1 to resolve "
- "https://github.com/ultralytics/ultralytics/issues/15049"
- )
-
-
-@contextmanager
-def torch_distributed_zero_first(local_rank: int):
- """Ensure all processes in distributed training wait for the local master (rank 0) to complete a task first."""
- initialized = dist.is_available() and dist.is_initialized()
- use_ids = initialized and dist.get_backend() == "nccl"
-
- if initialized and local_rank not in {-1, 0}:
- dist.barrier(device_ids=[local_rank]) if use_ids else dist.barrier()
- yield
- if initialized and local_rank == 0:
- dist.barrier(device_ids=[local_rank]) if use_ids else dist.barrier()
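-
-# Typical usage sketch: rank 0 prepares a shared resource while other ranks wait; the
-# LOCAL_RANK lookup and prepare_dataset() helper are illustrative, not part of this module:
-# >>> local_rank = int(os.environ.get("LOCAL_RANK", -1))
-# >>> with torch_distributed_zero_first(local_rank):
-# ...     dataset = prepare_dataset()  # hypothetical download/cache step run once per node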
-
-
-def smart_inference_mode():
- """Apply torch.inference_mode() decorator if torch>=1.9.0 else torch.no_grad() decorator."""
-
- def decorate(fn):
- """Apply appropriate torch decorator for inference mode based on torch version."""
- if TORCH_1_9 and torch.is_inference_mode_enabled():
- return fn # already in inference_mode, act as a pass-through
- else:
- return (torch.inference_mode if TORCH_1_9 else torch.no_grad)()(fn)
-
- return decorate
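-
-# Usage sketch: decorate any inference helper so it runs under inference_mode() (or no_grad()
-# on torch<1.9); the function below is illustrative:
-# >>> @smart_inference_mode()
-# ... def run_inference(model, im):
-# ...     return model(im)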
-
-
-def autocast(enabled: bool, device: str = "cuda"):
- """
- Get the appropriate autocast context manager based on PyTorch version and AMP setting.
-
- This function returns a context manager for automatic mixed precision (AMP) training that is compatible with both
- older and newer versions of PyTorch. It handles the differences in the autocast API between PyTorch versions.
-
- Args:
- enabled (bool): Whether to enable automatic mixed precision.
- device (str, optional): The device to use for autocast.
-
- Returns:
- (torch.amp.autocast): The appropriate autocast context manager.
-
- Notes:
- - For PyTorch versions 1.13 and newer, it uses `torch.amp.autocast`.
- - For older versions, it uses `torch.cuda.amp.autocast`.
-
- Examples:
- >>> with autocast(enabled=True):
- ... # Your mixed precision operations here
- ... pass
- """
- if TORCH_1_13:
- return torch.amp.autocast(device, enabled=enabled)
- else:
- return torch.cuda.amp.autocast(enabled)
-
-
-@functools.lru_cache
-def get_cpu_info():
- """Return a string with system CPU information, i.e. 'Apple M2'."""
- from ultralytics.utils import PERSISTENT_CACHE # avoid circular import error
-
- if "cpu_info" not in PERSISTENT_CACHE:
- try:
- PERSISTENT_CACHE["cpu_info"] = CPUInfo.name()
- except Exception:
- pass
- return PERSISTENT_CACHE.get("cpu_info", "unknown")
-
-
-@functools.lru_cache
-def get_gpu_info(index):
- """Return a string with system GPU information, i.e. 'Tesla T4, 15102MiB'."""
- properties = torch.cuda.get_device_properties(index)
- return f"{properties.name}, {properties.total_memory / (1 << 20):.0f}MiB"
-
-
-def select_device(device="", newline=False, verbose=True):
- """
- Select the appropriate PyTorch device based on the provided arguments.
-
- The function takes a string specifying the device or a torch.device object and returns a torch.device object
- representing the selected device. The function also validates the number of available devices and raises an
- exception if the requested device(s) are not available.
-
- Args:
- device (str | torch.device, optional): Device string or torch.device object. Options are 'None', 'cpu', or
- 'cuda', or '0' or '0,1,2,3'. Auto-selects the first available GPU, or CPU if no GPU is available.
- newline (bool, optional): If True, adds a newline at the end of the log string.
- verbose (bool, optional): If True, logs the device information.
-
- Returns:
- (torch.device): Selected device.
-
- Examples:
- >>> select_device("cuda:0")
- device(type='cuda', index=0)
-
- >>> select_device("cpu")
- device(type='cpu')
-
- Notes:
- Sets the 'CUDA_VISIBLE_DEVICES' environment variable for specifying which GPUs to use.
- """
- if isinstance(device, torch.device) or str(device).startswith(("tpu", "intel")):
- return device
-
- s = f"Ultralytics {__version__} 🚀 Python-{PYTHON_VERSION} torch-{TORCH_VERSION} "
- device = str(device).lower()
- for remove in "cuda:", "none", "(", ")", "[", "]", "'", " ":
- device = device.replace(remove, "") # to string, 'cuda:0' -> '0' and '(0, 1)' -> '0,1'
-
- # Auto-select GPUs
- if "-1" in device:
- from ultralytics.utils.autodevice import GPUInfo
-
- # Replace each -1 with a selected GPU or remove it
- parts = device.split(",")
- selected = GPUInfo().select_idle_gpu(count=parts.count("-1"), min_memory_fraction=0.2)
- for i in range(len(parts)):
- if parts[i] == "-1":
- parts[i] = str(selected.pop(0)) if selected else ""
- device = ",".join(p for p in parts if p)
-
- cpu = device == "cpu"
- mps = device in {"mps", "mps:0"} # Apple Metal Performance Shaders (MPS)
- if cpu or mps:
- os.environ["CUDA_VISIBLE_DEVICES"] = "-1" # force torch.cuda.is_available() = False
- elif device: # non-cpu device requested
- if device == "cuda":
- device = "0"
- if "," in device:
- device = ",".join([x for x in device.split(",") if x]) # remove sequential commas, i.e. "0,,1" -> "0,1"
- visible = os.environ.get("CUDA_VISIBLE_DEVICES", None)
- os.environ["CUDA_VISIBLE_DEVICES"] = device # set environment variable - must be before assert is_available()
- if not (torch.cuda.is_available() and torch.cuda.device_count() >= len(device.split(","))):
- LOGGER.info(s)
- install = (
- "See https://pytorch.org/get-started/locally/ for up-to-date torch install instructions if no "
- "CUDA devices are seen by torch.\n"
- if torch.cuda.device_count() == 0
- else ""
- )
- raise ValueError(
- f"Invalid CUDA 'device={device}' requested."
- f" Use 'device=cpu' or pass valid CUDA device(s) if available,"
- f" i.e. 'device=0' or 'device=0,1,2,3' for Multi-GPU.\n"
- f"\ntorch.cuda.is_available(): {torch.cuda.is_available()}"
- f"\ntorch.cuda.device_count(): {torch.cuda.device_count()}"
- f"\nos.environ['CUDA_VISIBLE_DEVICES']: {visible}\n"
- f"{install}"
- )
-
- if not cpu and not mps and torch.cuda.is_available(): # prefer GPU if available
- devices = device.split(",") if device else "0" # i.e. "0,1" -> ["0", "1"]
- space = " " * len(s)
- for i, d in enumerate(devices):
- s += f"{'' if i == 0 else space}CUDA:{d} ({get_gpu_info(i)})\n" # bytes to MB
- arg = "cuda:0"
- elif mps and TORCH_2_0 and torch.backends.mps.is_available():
- # Prefer MPS if available
- s += f"MPS ({get_cpu_info()})\n"
- arg = "mps"
- else: # revert to CPU
- s += f"CPU ({get_cpu_info()})\n"
- arg = "cpu"
-
- if arg in {"cpu", "mps"}:
- torch.set_num_threads(NUM_THREADS) # reset OMP_NUM_THREADS for cpu training
- if verbose:
- LOGGER.info(s if newline else s.rstrip())
- return torch.device(arg)
-
-
-def time_sync():
- """Return PyTorch-accurate time."""
- if torch.cuda.is_available():
- torch.cuda.synchronize()
- return time.time()
-
-
-def fuse_conv_and_bn(conv, bn):
- """
- Fuse Conv2d and BatchNorm2d layers for inference optimization.
-
- Args:
- conv (nn.Conv2d): Convolutional layer to fuse.
- bn (nn.BatchNorm2d): Batch normalization layer to fuse.
-
- Returns:
- (nn.Conv2d): The fused convolutional layer with gradients disabled.
-
- Example:
- >>> conv = nn.Conv2d(3, 16, 3)
- >>> bn = nn.BatchNorm2d(16)
- >>> fused_conv = fuse_conv_and_bn(conv, bn)
- """
- # Compute fused weights
- w_conv = conv.weight.view(conv.out_channels, -1)
- w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var)))
- conv.weight.data = torch.mm(w_bn, w_conv).view(conv.weight.shape)
-
- # Compute fused bias
- b_conv = torch.zeros(conv.out_channels, device=conv.weight.device) if conv.bias is None else conv.bias
- b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps))
- fused_bias = torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn
-
- if conv.bias is None:
- conv.register_parameter("bias", nn.Parameter(fused_bias))
- else:
- conv.bias.data = fused_bias
-
- return conv.requires_grad_(False)
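-
-# Sanity-check sketch: in eval mode the fused layer should match Conv2d followed by
-# BatchNorm2d (random weights; tolerance chosen loosely):
-# >>> conv, bn = nn.Conv2d(3, 16, 3, bias=False), nn.BatchNorm2d(16)
-# >>> _ = bn.eval()
-# >>> x = torch.rand(1, 3, 32, 32)
-# >>> fused = fuse_conv_and_bn(deepcopy(conv), bn)
-# >>> torch.allclose(bn(conv(x)), fused(x), atol=1e-5)  # expected: True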
-
-
-def fuse_deconv_and_bn(deconv, bn):
- """
- Fuse ConvTranspose2d and BatchNorm2d layers for inference optimization.
-
- Args:
- deconv (nn.ConvTranspose2d): Transposed convolutional layer to fuse.
- bn (nn.BatchNorm2d): Batch normalization layer to fuse.
-
- Returns:
- (nn.ConvTranspose2d): The fused transposed convolutional layer with gradients disabled.
-
- Example:
- >>> deconv = nn.ConvTranspose2d(16, 3, 3)
- >>> bn = nn.BatchNorm2d(3)
- >>> fused_deconv = fuse_deconv_and_bn(deconv, bn)
- """
- # Compute fused weights
- w_deconv = deconv.weight.view(deconv.out_channels, -1)
- w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var)))
- deconv.weight.data = torch.mm(w_bn, w_deconv).view(deconv.weight.shape)
-
- # Compute fused bias
- b_conv = torch.zeros(deconv.out_channels, device=deconv.weight.device) if deconv.bias is None else deconv.bias
- b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps))
- fused_bias = torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn
-
- if deconv.bias is None:
- deconv.register_parameter("bias", nn.Parameter(fused_bias))
- else:
- deconv.bias.data = fused_bias
-
- return deconv.requires_grad_(False)
-
-
-def model_info(model, detailed=False, verbose=True, imgsz=640):
- """
- Print and return detailed model information layer by layer.
-
- Args:
- model (nn.Module): Model to analyze.
- detailed (bool, optional): Whether to print detailed layer information.
- verbose (bool, optional): Whether to print model information.
- imgsz (int | list, optional): Input image size.
-
- Returns:
- n_l (int): Number of layers.
- n_p (int): Number of parameters.
- n_g (int): Number of gradients.
- flops (float): GFLOPs.
- """
- if not verbose:
- return
- n_p = get_num_params(model) # number of parameters
- n_g = get_num_gradients(model) # number of gradients
- layers = __import__("collections").OrderedDict((n, m) for n, m in model.named_modules() if len(m._modules) == 0)
- n_l = len(layers) # number of layers
- if detailed:
- h = f"{'layer':>5}{'name':>40}{'type':>20}{'gradient':>10}{'parameters':>12}{'shape':>20}{'mu':>10}{'sigma':>10}"
- LOGGER.info(h)
- for i, (mn, m) in enumerate(layers.items()):
- mn = mn.replace("module_list.", "")
- mt = m.__class__.__name__
- if len(m._parameters):
- for pn, p in m.named_parameters():
- LOGGER.info(
- f"{i:>5g}{f'{mn}.{pn}':>40}{mt:>20}{p.requires_grad!r:>10}{p.numel():>12g}{str(list(p.shape)):>20}{p.mean():>10.3g}{p.std():>10.3g}{str(p.dtype).replace('torch.', ''):>15}"
- )
- else: # layers with no learnable params
- LOGGER.info(f"{i:>5g}{mn:>40}{mt:>20}{False!r:>10}{0:>12g}{str([]):>20}{'-':>10}{'-':>10}{'-':>15}")
-
- flops = get_flops(model, imgsz) # imgsz may be int or list, i.e. imgsz=640 or imgsz=[640, 320]
- fused = " (fused)" if getattr(model, "is_fused", lambda: False)() else ""
- fs = f", {flops:.1f} GFLOPs" if flops else ""
- yaml_file = getattr(model, "yaml_file", "") or getattr(model, "yaml", {}).get("yaml_file", "")
- model_name = Path(yaml_file).stem.replace("yolo", "YOLO") or "Model"
- LOGGER.info(f"{model_name} summary{fused}: {n_l:,} layers, {n_p:,} parameters, {n_g:,} gradients{fs}")
- return n_l, n_p, n_g, flops
-
-
-def get_num_params(model):
- """Return the total number of parameters in a YOLO model."""
- return sum(x.numel() for x in model.parameters())
-
-
-def get_num_gradients(model):
- """Return the total number of parameters with gradients in a YOLO model."""
- return sum(x.numel() for x in model.parameters() if x.requires_grad)
-
-
-def model_info_for_loggers(trainer):
- """
- Return model info dict with useful model information.
-
- Args:
- trainer (ultralytics.engine.trainer.BaseTrainer): The trainer object containing model and validation data.
-
- Returns:
- (dict): Dictionary containing model parameters, GFLOPs, and inference speeds.
-
- Examples:
- YOLOv8n info for loggers
- >>> results = {
- ... "model/parameters": 3151904,
- ... "model/GFLOPs": 8.746,
- ... "model/speed_ONNX(ms)": 41.244,
- ... "model/speed_TensorRT(ms)": 3.211,
- ... "model/speed_PyTorch(ms)": 18.755,
- ...}
- """
- if trainer.args.profile: # profile ONNX and TensorRT times
- from ultralytics.utils.benchmarks import ProfileModels
-
- results = ProfileModels([trainer.last], device=trainer.device).run()[0]
- results.pop("model/name")
- else: # only return PyTorch times from most recent validation
- results = {
- "model/parameters": get_num_params(trainer.model),
- "model/GFLOPs": round(get_flops(trainer.model), 3),
- }
- results["model/speed_PyTorch(ms)"] = round(trainer.validator.speed["inference"], 3)
- return results
-
-
-def get_flops(model, imgsz=640):
- """
- Calculate FLOPs (floating point operations) for a model in billions.
-
- Attempts two calculation methods: first with a stride-based tensor for efficiency,
- then falls back to full image size if needed (e.g., for RTDETR models). Returns 0.0
- if thop library is unavailable or calculation fails.
-
- Args:
- model (nn.Module): The model to calculate FLOPs for.
- imgsz (int | list, optional): Input image size.
-
- Returns:
- (float): The model FLOPs in billions.
- """
- try:
- import thop
- except ImportError:
- thop = None # conda support without 'ultralytics-thop' installed
-
- if not thop:
- return 0.0 # if not installed return 0.0 GFLOPs
-
- try:
- model = unwrap_model(model)
- p = next(model.parameters())
- if not isinstance(imgsz, list):
- imgsz = [imgsz, imgsz] # expand if int/float
- try:
- # Method 1: Use stride-based input tensor
- stride = max(int(model.stride.max()), 32) if hasattr(model, "stride") else 32 # max stride
- im = torch.empty((1, p.shape[1], stride, stride), device=p.device) # input image in BCHW format
- flops = thop.profile(deepcopy(model), inputs=[im], verbose=False)[0] / 1e9 * 2 # stride GFLOPs
- return flops * imgsz[0] / stride * imgsz[1] / stride # imgsz GFLOPs
- except Exception:
- # Method 2: Use actual image size (required for RTDETR models)
- im = torch.empty((1, p.shape[1], *imgsz), device=p.device) # input image in BCHW format
- return thop.profile(deepcopy(model), inputs=[im], verbose=False)[0] / 1e9 * 2 # imgsz GFLOPs
- except Exception:
- return 0.0
-
-
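The stride-based shortcut works because convolutional FLOPs grow roughly linearly with input area, so profiling a tiny stride-sized input and scaling by (imgsz/stride)^2 approximates the full-size cost. A minimal standalone sketch of that arithmetic, assuming the optional thop dependency is installed (the helper name is illustrative, not part of the module):

import torch
import torch.nn as nn
from copy import deepcopy


def approx_gflops(model: nn.Module, imgsz: int = 640, stride: int = 32) -> float:
    """Estimate GFLOPs by profiling a stride-sized input and scaling to imgsz (illustrative sketch)."""
    try:
        import thop  # provided by the optional 'ultralytics-thop' package
    except ImportError:
        return 0.0
    p = next(model.parameters())
    im = torch.empty((1, p.shape[1], stride, stride), device=p.device)  # smallest valid BCHW input
    macs = thop.profile(deepcopy(model), inputs=[im], verbose=False)[0]  # multiply-accumulate count
    flops = macs / 1e9 * 2  # 1 MAC ~= 2 FLOPs, reported in billions
    return flops * (imgsz / stride) ** 2  # cost scales with input area


# e.g. approx_gflops(nn.Sequential(nn.Conv2d(3, 16, 3, padding=1)), imgsz=640)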
-def get_flops_with_torch_profiler(model, imgsz=640):
- """
- Compute model FLOPs using torch profiler (alternative to thop package, but 2-10x slower).
-
- Args:
- model (nn.Module): The model to calculate FLOPs for.
- imgsz (int | list, optional): Input image size.
-
- Returns:
- (float): The model's FLOPs in billions.
- """
- if not TORCH_2_0: # torch profiler implemented in torch>=2.0
- return 0.0
- model = unwrap_model(model)
- p = next(model.parameters())
- if not isinstance(imgsz, list):
- imgsz = [imgsz, imgsz] # expand if int/float
- try:
- # Use stride size for input tensor
- stride = (max(int(model.stride.max()), 32) if hasattr(model, "stride") else 32) * 2 # max stride
- im = torch.empty((1, p.shape[1], stride, stride), device=p.device) # input image in BCHW format
- with torch.profiler.profile(with_flops=True) as prof:
- model(im)
- flops = sum(x.flops for x in prof.key_averages()) / 1e9
-        flops = flops * imgsz[0] / stride * imgsz[1] / stride  # imgsz GFLOPs
- except Exception:
- # Use actual image size for input tensor (i.e. required for RTDETR models)
- im = torch.empty((1, p.shape[1], *imgsz), device=p.device) # input image in BCHW format
- with torch.profiler.profile(with_flops=True) as prof:
- model(im)
- flops = sum(x.flops for x in prof.key_averages()) / 1e9
- return flops
-
-
-def initialize_weights(model):
- """Initialize model weights to random values."""
- for m in model.modules():
- t = type(m)
- if t is nn.Conv2d:
- pass # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
- elif t is nn.BatchNorm2d:
- m.eps = 1e-3
- m.momentum = 0.03
- elif t in {nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU}:
- m.inplace = True
-
-
-def scale_img(img, ratio=1.0, same_shape=False, gs=32):
- """
- Scale and pad an image tensor, optionally maintaining aspect ratio and padding to gs multiple.
-
- Args:
- img (torch.Tensor): Input image tensor.
- ratio (float, optional): Scaling ratio.
- same_shape (bool, optional): Whether to maintain the same shape.
- gs (int, optional): Grid size for padding.
-
- Returns:
- (torch.Tensor): Scaled and padded image tensor.
- """
- if ratio == 1.0:
- return img
- h, w = img.shape[2:]
- s = (int(h * ratio), int(w * ratio)) # new size
- img = F.interpolate(img, size=s, mode="bilinear", align_corners=False) # resize
- if not same_shape: # pad/crop img
- h, w = (math.ceil(x * ratio / gs) * gs for x in (h, w))
- return F.pad(img, [0, w - s[1], 0, h - s[0]], value=0.447) # value = imagenet mean
-
-
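A quick usage sketch of the function above: the resized tensor is padded up to the next multiple of gs with the ImageNet-mean fill value (shapes worked out from the arithmetic in the function):

import torch

x = torch.zeros(1, 3, 640, 640)
y = scale_img(x, ratio=0.67, gs=32)  # resized to int(640*0.67)=428, padded to ceil(13.4)*32=448
print(y.shape)  # torch.Size([1, 3, 448, 448])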
-def copy_attr(a, b, include=(), exclude=()):
- """
- Copy attributes from object 'b' to object 'a', with options to include/exclude certain attributes.
-
- Args:
- a (Any): Destination object to copy attributes to.
- b (Any): Source object to copy attributes from.
- include (tuple, optional): Attributes to include. If empty, all attributes are included.
- exclude (tuple, optional): Attributes to exclude.
- """
- for k, v in b.__dict__.items():
- if (len(include) and k not in include) or k.startswith("_") or k in exclude:
- continue
- else:
- setattr(a, k, v)
-
-
-def intersect_dicts(da, db, exclude=()):
- """
- Return a dictionary of intersecting keys with matching shapes, excluding 'exclude' keys, using da values.
-
- Args:
- da (dict): First dictionary.
- db (dict): Second dictionary.
- exclude (tuple, optional): Keys to exclude.
-
- Returns:
- (dict): Dictionary of intersecting keys with matching shapes.
- """
- return {k: v for k, v in da.items() if k in db and all(x not in k for x in exclude) and v.shape == db[k].shape}
-
-
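This is the usual building block for partial transfer of pretrained weights: only keys whose shapes match are loaded, so a model with a different head still inherits its backbone. A small self-contained sketch, assuming intersect_dicts as defined above:

import torch.nn as nn

src = nn.Sequential(nn.Conv2d(3, 16, 3), nn.Conv2d(16, 80, 1))  # e.g. pretrained, 80-class head
dst = nn.Sequential(nn.Conv2d(3, 16, 3), nn.Conv2d(16, 10, 1))  # new model, 10-class head

csd = intersect_dicts(src.state_dict(), dst.state_dict())  # keeps only matching-shape tensors
dst.load_state_dict(csd, strict=False)  # backbone transferred, mismatched head left untouched
print(f"Transferred {len(csd)}/{len(dst.state_dict())} items")  # Transferred 2/4 items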
-def is_parallel(model):
- """
- Return True if model is of type DP or DDP.
-
- Args:
- model (nn.Module): Model to check.
-
- Returns:
- (bool): True if model is DataParallel or DistributedDataParallel.
- """
- return isinstance(model, (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel))
-
-
-def unwrap_model(m: nn.Module) -> nn.Module:
- """
- Unwrap compiled and parallel models to get the base model.
-
- Args:
- m (nn.Module): A model that may be wrapped by torch.compile (._orig_mod) or parallel wrappers such as
- DataParallel/DistributedDataParallel (.module).
-
- Returns:
- m (nn.Module): The unwrapped base model without compile or parallel wrappers.
- """
- while True:
- if hasattr(m, "_orig_mod") and isinstance(m._orig_mod, nn.Module):
- m = m._orig_mod
- elif hasattr(m, "module") and isinstance(m.module, nn.Module):
- m = m.module
- else:
- return m
-
-
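A tiny sketch of the unwrapping behaviour, assuming unwrap_model as defined above (DataParallel is constructed only to show the .module attribute; no GPU is needed for this):

import torch.nn as nn

base = nn.Linear(8, 4)
wrapped = nn.DataParallel(base)  # parallel wrappers expose the original model as .module
assert unwrap_model(wrapped) is base
# Models wrapped by torch.compile expose ._orig_mod and are unwrapped the same way.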
-def one_cycle(y1=0.0, y2=1.0, steps=100):
- """
- Return a lambda function for sinusoidal ramp from y1 to y2 https://arxiv.org/pdf/1812.01187.pdf.
-
- Args:
- y1 (float, optional): Initial value.
- y2 (float, optional): Final value.
- steps (int, optional): Number of steps.
-
- Returns:
- (function): Lambda function for computing the sinusoidal ramp.
- """
- return lambda x: max((1 - math.cos(x * math.pi / steps)) / 2, 0) * (y2 - y1) + y1
-
-
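A minimal numeric sketch of the returned lambda, assuming one_cycle as defined above (values follow directly from the cosine formula):

lf = one_cycle(y1=1.0, y2=0.01, steps=100)  # cosine ramp from 1.0 down to 0.01
print(round(lf(0), 3), round(lf(50), 3), round(lf(100), 3))  # 1.0 0.505 0.01
# Typically plugged into a scheduler, e.g. torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)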
-def init_seeds(seed=0, deterministic=False):
- """
- Initialize random number generator (RNG) seeds https://pytorch.org/docs/stable/notes/randomness.html.
-
- Args:
- seed (int, optional): Random seed.
- deterministic (bool, optional): Whether to set deterministic algorithms.
- """
- random.seed(seed)
- np.random.seed(seed)
- torch.manual_seed(seed)
- torch.cuda.manual_seed(seed)
- torch.cuda.manual_seed_all(seed) # for Multi-GPU, exception safe
- # torch.backends.cudnn.benchmark = True # AutoBatch problem https://github.com/ultralytics/yolov5/issues/9287
- if deterministic:
- if TORCH_2_0:
- torch.use_deterministic_algorithms(True, warn_only=True) # warn if deterministic is not possible
- torch.backends.cudnn.deterministic = True
- os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
- os.environ["PYTHONHASHSEED"] = str(seed)
- else:
- LOGGER.warning("Upgrade to torch>=2.0.0 for deterministic training.")
- else:
- unset_deterministic()
-
-
-def unset_deterministic():
- """Unset all the configurations applied for deterministic training."""
- torch.use_deterministic_algorithms(False)
- torch.backends.cudnn.deterministic = False
- os.environ.pop("CUBLAS_WORKSPACE_CONFIG", None)
- os.environ.pop("PYTHONHASHSEED", None)
-
-
-class ModelEMA:
- """
- Updated Exponential Moving Average (EMA) implementation.
-
- Keeps a moving average of everything in the model state_dict (parameters and buffers).
- For EMA details see References.
-
- To disable EMA set the `enabled` attribute to `False`.
-
- Attributes:
- ema (nn.Module): Copy of the model in evaluation mode.
- updates (int): Number of EMA updates.
- decay (function): Decay function that determines the EMA weight.
- enabled (bool): Whether EMA is enabled.
-
- References:
- - https://github.com/rwightman/pytorch-image-models
- - https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
- """
-
- def __init__(self, model, decay=0.9999, tau=2000, updates=0):
- """
- Initialize EMA for 'model' with given arguments.
-
- Args:
- model (nn.Module): Model to create EMA for.
- decay (float, optional): Maximum EMA decay rate.
- tau (int, optional): EMA decay time constant.
- updates (int, optional): Initial number of updates.
- """
- self.ema = deepcopy(unwrap_model(model)).eval() # FP32 EMA
- self.updates = updates # number of EMA updates
- self.decay = lambda x: decay * (1 - math.exp(-x / tau)) # decay exponential ramp (to help early epochs)
- for p in self.ema.parameters():
- p.requires_grad_(False)
- self.enabled = True
-
- def update(self, model):
- """
- Update EMA parameters.
-
- Args:
- model (nn.Module): Model to update EMA from.
- """
- if self.enabled:
- self.updates += 1
- d = self.decay(self.updates)
-
- msd = unwrap_model(model).state_dict() # model state_dict
- for k, v in self.ema.state_dict().items():
- if v.dtype.is_floating_point: # true for FP16 and FP32
- v *= d
- v += (1 - d) * msd[k].detach()
- # assert v.dtype == msd[k].dtype == torch.float32, f'{k}: EMA {v.dtype}, model {msd[k].dtype}'
-
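The decay lambda above ramps exponentially so that early updates track the raw model closely while late updates barely move the average; a small numeric sketch with the default decay=0.9999 and tau=2000:

import math

decay, tau = 0.9999, 2000
d = lambda updates: decay * (1 - math.exp(-updates / tau))
print(round(d(10), 4), round(d(2000), 4), round(d(20000), 4))  # 0.005 0.6321 0.9999
# Each update keeps a fraction d of the EMA value and blends in (1 - d) of the current model weights.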
- def update_attr(self, model, include=(), exclude=("process_group", "reducer")):
- """
-        Update EMA attributes by copying selected attributes from the model.
-
- Args:
- model (nn.Module): Model to update attributes from.
- include (tuple, optional): Attributes to include.
- exclude (tuple, optional): Attributes to exclude.
- """
- if self.enabled:
- copy_attr(self.ema, model, include, exclude)
-
-
-def strip_optimizer(f: str | Path = "best.pt", s: str = "", updates: dict[str, Any] = None) -> dict[str, Any]:
- """
- Strip optimizer from 'f' to finalize training, optionally save as 's'.
-
- Args:
- f (str | Path): File path to model to strip the optimizer from.
- s (str, optional): File path to save the model with stripped optimizer to. If not provided, 'f' will be
- overwritten.
- updates (dict, optional): A dictionary of updates to overlay onto the checkpoint before saving.
-
- Returns:
- (dict): The combined checkpoint dictionary.
-
- Examples:
- >>> from pathlib import Path
- >>> from ultralytics.utils.torch_utils import strip_optimizer
- >>> for f in Path("path/to/model/checkpoints").rglob("*.pt"):
- >>> strip_optimizer(f)
- """
- try:
- x = torch_load(f, map_location=torch.device("cpu"))
- assert isinstance(x, dict), "checkpoint is not a Python dictionary"
- assert "model" in x, "'model' missing from checkpoint"
- except Exception as e:
- LOGGER.warning(f"Skipping {f}, not a valid Ultralytics model: {e}")
- return {}
-
- metadata = {
- "date": datetime.now().isoformat(),
- "version": __version__,
- "license": "AGPL-3.0 License (https://ultralytics.com/license)",
- "docs": "https://docs.ultralytics.com",
- }
-
- # Update model
- if x.get("ema"):
- x["model"] = x["ema"] # replace model with EMA
- if hasattr(x["model"], "args"):
- x["model"].args = dict(x["model"].args) # convert from IterableSimpleNamespace to dict
- if hasattr(x["model"], "criterion"):
- x["model"].criterion = None # strip loss criterion
- x["model"].half() # to FP16
- for p in x["model"].parameters():
- p.requires_grad = False
-
- # Update other keys
- args = {**DEFAULT_CFG_DICT, **x.get("train_args", {})} # combine args
- for k in "optimizer", "best_fitness", "ema", "updates", "scaler": # keys
- x[k] = None
- x["epoch"] = -1
- x["train_args"] = {k: v for k, v in args.items() if k in DEFAULT_CFG_KEYS} # strip non-default keys
- # x['model'].args = x['train_args']
-
- # Save
- combined = {**metadata, **x, **(updates or {})}
- torch.save(combined, s or f) # combine dicts (prefer to the right)
- mb = os.path.getsize(s or f) / 1e6 # file size
- LOGGER.info(f"Optimizer stripped from {f},{f' saved as {s},' if s else ''} {mb:.1f}MB")
- return combined
-
-
-def convert_optimizer_state_dict_to_fp16(state_dict):
- """
- Convert the state_dict of a given optimizer to FP16, focusing on the 'state' key for tensor conversions.
-
- Args:
- state_dict (dict): Optimizer state dictionary.
-
- Returns:
- (dict): Converted optimizer state dictionary with FP16 tensors.
- """
- for state in state_dict["state"].values():
- for k, v in state.items():
- if k != "step" and isinstance(v, torch.Tensor) and v.dtype is torch.float32:
- state[k] = v.half()
-
- return state_dict
-
-
-@contextmanager
-def cuda_memory_usage(device=None):
- """
- Monitor and manage CUDA memory usage.
-
- This function checks if CUDA is available and, if so, empties the CUDA cache to free up unused memory.
- It then yields a dictionary containing memory usage information, which can be updated by the caller.
- Finally, it updates the dictionary with the amount of memory reserved by CUDA on the specified device.
-
- Args:
- device (torch.device, optional): The CUDA device to query memory usage for.
-
- Yields:
- (dict): A dictionary with a key 'memory' initialized to 0, which will be updated with the reserved memory.
- """
- cuda_info = dict(memory=0)
- if torch.cuda.is_available():
- torch.cuda.empty_cache()
- try:
- yield cuda_info
- finally:
- cuda_info["memory"] = torch.cuda.memory_reserved(device)
- else:
- yield cuda_info
-
-
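A usage sketch of the yield-a-dict pattern above: the caller allocates inside the with-block and reads the reserved-memory figure afterwards, which stays 0 on CPU-only machines:

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
with cuda_memory_usage() as info:
    x = torch.zeros(256, 1024, 1024, device=device)  # ~1 GB of float32 when on GPU
print(f"Reserved after block: {info['memory'] / 1e9:.2f} GB")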
-def profile_ops(input, ops, n=10, device=None, max_num_obj=0):
- """
- Ultralytics speed, memory and FLOPs profiler.
-
- Args:
- input (torch.Tensor | list): Input tensor(s) to profile.
- ops (nn.Module | list): Model or list of operations to profile.
- n (int, optional): Number of iterations to average.
- device (str | torch.device, optional): Device to profile on.
- max_num_obj (int, optional): Maximum number of objects for simulation.
-
- Returns:
- (list): Profile results for each operation.
-
- Examples:
- >>> from ultralytics.utils.torch_utils import profile_ops
- >>> input = torch.randn(16, 3, 640, 640)
- >>> m1 = lambda x: x * torch.sigmoid(x)
- >>> m2 = nn.SiLU()
- >>> profile_ops(input, [m1, m2], n=100) # profile over 100 iterations
- """
- try:
- import thop
- except ImportError:
- thop = None # conda support without 'ultralytics-thop' installed
-
- results = []
- if not isinstance(device, torch.device):
- device = select_device(device)
- LOGGER.info(
- f"{'Params':>12s}{'GFLOPs':>12s}{'GPU_mem (GB)':>14s}{'forward (ms)':>14s}{'backward (ms)':>14s}"
- f"{'input':>24s}{'output':>24s}"
- )
- gc.collect() # attempt to free unused memory
- torch.cuda.empty_cache()
- for x in input if isinstance(input, list) else [input]:
- x = x.to(device)
- x.requires_grad = True
- for m in ops if isinstance(ops, list) else [ops]:
- m = m.to(device) if hasattr(m, "to") else m # device
- m = m.half() if hasattr(m, "half") and isinstance(x, torch.Tensor) and x.dtype is torch.float16 else m
- tf, tb, t = 0, 0, [0, 0, 0] # dt forward, backward
- try:
- flops = thop.profile(deepcopy(m), inputs=[x], verbose=False)[0] / 1e9 * 2 if thop else 0 # GFLOPs
- except Exception:
- flops = 0
-
- try:
- mem = 0
- for _ in range(n):
- with cuda_memory_usage(device) as cuda_info:
- t[0] = time_sync()
- y = m(x)
- t[1] = time_sync()
- try:
- (sum(yi.sum() for yi in y) if isinstance(y, list) else y).sum().backward()
- t[2] = time_sync()
- except Exception: # no backward method
- # print(e) # for debug
- t[2] = float("nan")
- mem += cuda_info["memory"] / 1e9 # (GB)
- tf += (t[1] - t[0]) * 1000 / n # ms per op forward
- tb += (t[2] - t[1]) * 1000 / n # ms per op backward
- if max_num_obj: # simulate training with predictions per image grid (for AutoBatch)
- with cuda_memory_usage(device) as cuda_info:
- torch.randn(
- x.shape[0],
- max_num_obj,
- int(sum((x.shape[-1] / s) * (x.shape[-2] / s) for s in m.stride.tolist())),
- device=device,
- dtype=torch.float32,
- )
- mem += cuda_info["memory"] / 1e9 # (GB)
- s_in, s_out = (tuple(x.shape) if isinstance(x, torch.Tensor) else "list" for x in (x, y)) # shapes
- p = sum(x.numel() for x in m.parameters()) if isinstance(m, nn.Module) else 0 # parameters
- LOGGER.info(f"{p:12}{flops:12.4g}{mem:>14.3f}{tf:14.4g}{tb:14.4g}{str(s_in):>24s}{str(s_out):>24s}")
- results.append([p, flops, mem, tf, tb, s_in, s_out])
- except Exception as e:
- LOGGER.info(e)
- results.append(None)
- finally:
- gc.collect() # attempt to free unused memory
- torch.cuda.empty_cache()
- return results
-
-
-class EarlyStopping:
- """
- Early stopping class that stops training when a specified number of epochs have passed without improvement.
-
- Attributes:
- best_fitness (float): Best fitness value observed.
- best_epoch (int): Epoch where best fitness was observed.
- patience (int): Number of epochs to wait after fitness stops improving before stopping.
- possible_stop (bool): Flag indicating if stopping may occur next epoch.
- """
-
- def __init__(self, patience=50):
- """
- Initialize early stopping object.
-
- Args:
- patience (int, optional): Number of epochs to wait after fitness stops improving before stopping.
- """
- self.best_fitness = 0.0 # i.e. mAP
- self.best_epoch = 0
- self.patience = patience or float("inf") # epochs to wait after fitness stops improving to stop
- self.possible_stop = False # possible stop may occur next epoch
-
- def __call__(self, epoch, fitness):
- """
- Check whether to stop training.
-
- Args:
- epoch (int): Current epoch of training
- fitness (float): Fitness value of current epoch
-
- Returns:
- (bool): True if training should stop, False otherwise
- """
- if fitness is None: # check if fitness=None (happens when val=False)
- return False
-
- if fitness > self.best_fitness or self.best_fitness == 0: # allow for early zero-fitness stage of training
- self.best_epoch = epoch
- self.best_fitness = fitness
- delta = epoch - self.best_epoch # epochs without improvement
- self.possible_stop = delta >= (self.patience - 1) # possible stop may occur next epoch
- stop = delta >= self.patience # stop training if patience exceeded
- if stop:
- prefix = colorstr("EarlyStopping: ")
- LOGGER.info(
- f"{prefix}Training stopped early as no improvement observed in last {self.patience} epochs. "
- f"Best results observed at epoch {self.best_epoch}, best model saved as best.pt.\n"
- f"To update EarlyStopping(patience={self.patience}) pass a new patience value, "
- f"i.e. `patience=300` or use `patience=0` to disable EarlyStopping."
- )
- return stop
-
-
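A minimal training-loop sketch showing how the stopper is polled each epoch, assuming the class above in its module context (the fitness values are made up for illustration):

stopper = EarlyStopping(patience=3)
fitness_per_epoch = [0.10, 0.20, 0.25, 0.25, 0.24, 0.23, 0.22]  # plateaus after epoch 2
for epoch, fitness in enumerate(fitness_per_epoch):
    if stopper(epoch, fitness):
        print(f"Stopped at epoch {epoch}, best was epoch {stopper.best_epoch}")  # stops at 5, best 2
        break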
-def attempt_compile(
- model: torch.nn.Module,
- device: torch.device,
- imgsz: int = 640,
- use_autocast: bool = False,
- warmup: bool = False,
- mode: bool | str = "default",
-) -> torch.nn.Module:
- """
- Compile a model with torch.compile and optionally warm up the graph to reduce first-iteration latency.
-
-    This utility attempts to compile the provided model using the inductor backend with the requested compile mode.
-    If compilation is unavailable or fails, the original model is returned unchanged. An optional warmup performs a
-    single forward pass on a dummy input to prime the compiled graph and measure compile/warmup time.
-
- Args:
- model (torch.nn.Module): Model to compile.
- device (torch.device): Inference device used for warmup and autocast decisions.
- imgsz (int, optional): Square input size to create a dummy tensor with shape (1, 3, imgsz, imgsz) for warmup.
- use_autocast (bool, optional): Whether to run warmup under autocast on CUDA or MPS devices.
- warmup (bool, optional): Whether to execute a single dummy forward pass to warm up the compiled model.
- mode (bool | str, optional): torch.compile mode. True → "default", False → no compile, or a string like
- "default", "reduce-overhead", "max-autotune-no-cudagraphs".
-
- Returns:
- model (torch.nn.Module): Compiled model if compilation succeeds, otherwise the original unmodified model.
-
- Notes:
- - If the current PyTorch build does not provide torch.compile, the function returns the input model immediately.
- - Warmup runs under torch.inference_mode and may use torch.autocast for CUDA/MPS to align compute precision.
- - CUDA devices are synchronized after warmup to account for asynchronous kernel execution.
-
- Examples:
- >>> device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
- >>> # Try to compile and warm up a model with a 640x640 input
- >>> model = attempt_compile(model, device=device, imgsz=640, use_autocast=True, warmup=True)
- """
- if not hasattr(torch, "compile") or not mode:
- return model
-
- if mode is True:
- mode = "default"
- prefix = colorstr("compile:")
- LOGGER.info(f"{prefix} starting torch.compile with '{mode}' mode...")
- if mode == "max-autotune":
- LOGGER.warning(f"{prefix} mode='{mode}' not recommended, using mode='max-autotune-no-cudagraphs' instead")
- mode = "max-autotune-no-cudagraphs"
- t0 = time.perf_counter()
- try:
- model = torch.compile(model, mode=mode, backend="inductor")
- except Exception as e:
- LOGGER.warning(f"{prefix} torch.compile failed, continuing uncompiled: {e}")
- return model
- t_compile = time.perf_counter() - t0
-
- t_warm = 0.0
- if warmup:
- # Use a single dummy tensor to build the graph shape state and reduce first-iteration latency
- dummy = torch.zeros(1, 3, imgsz, imgsz, device=device)
- if use_autocast and device.type == "cuda":
- dummy = dummy.half()
- t1 = time.perf_counter()
- with torch.inference_mode():
- if use_autocast and device.type in {"cuda", "mps"}:
- with torch.autocast(device.type):
- _ = model(dummy)
- else:
- _ = model(dummy)
- if device.type == "cuda":
- torch.cuda.synchronize(device)
- t_warm = time.perf_counter() - t1
-
- total = t_compile + t_warm
- if warmup:
- LOGGER.info(f"{prefix} complete in {total:.1f}s (compile {t_compile:.1f}s + warmup {t_warm:.1f}s)")
- else:
- LOGGER.info(f"{prefix} compile complete in {t_compile:.1f}s (no warmup)")
- return model
diff --git a/ultralytics/utils/tqdm.py b/ultralytics/utils/tqdm.py
deleted file mode 100644
index b6f1fc7..0000000
--- a/ultralytics/utils/tqdm.py
+++ /dev/null
@@ -1,440 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from __future__ import annotations
-
-import os
-import sys
-import time
-from functools import lru_cache
-from typing import IO, Any
-
-
-@lru_cache(maxsize=1)
-def is_noninteractive_console() -> bool:
- """Check for known non-interactive console environments."""
- return "GITHUB_ACTIONS" in os.environ or "RUNPOD_POD_ID" in os.environ
-
-
-class TQDM:
- """
- Lightweight zero-dependency progress bar for Ultralytics.
-
- Provides clean, rich-style progress bars suitable for various environments including Weights & Biases,
- console outputs, and other logging systems. Features zero external dependencies, clean single-line output,
- rich-style progress bars with Unicode block characters, context manager support, iterator protocol support,
- and dynamic description updates.
-
- Attributes:
- iterable (object): Iterable to wrap with progress bar.
- desc (str): Prefix description for the progress bar.
- total (int): Expected number of iterations.
- disable (bool): Whether to disable the progress bar.
- unit (str): String for units of iteration.
- unit_scale (bool): Auto-scale units flag.
- unit_divisor (int): Divisor for unit scaling.
- leave (bool): Whether to leave the progress bar after completion.
- mininterval (float): Minimum time interval between updates.
- initial (int): Initial counter value.
- n (int): Current iteration count.
- closed (bool): Whether the progress bar is closed.
- bar_format (str): Custom bar format string.
- file (object): Output file stream.
-
- Methods:
- update: Update progress by n steps.
- set_description: Set or update the description.
- set_postfix: Set postfix for the progress bar.
- close: Close the progress bar and clean up.
- refresh: Refresh the progress bar display.
- clear: Clear the progress bar from display.
- write: Write a message without breaking the progress bar.
-
- Examples:
- Basic usage with iterator:
- >>> for i in TQDM(range(100)):
- ... time.sleep(0.01)
-
- With custom description:
- >>> pbar = TQDM(range(100), desc="Processing")
- >>> for i in pbar:
- ... pbar.set_description(f"Processing item {i}")
-
- Context manager usage:
- >>> with TQDM(total=100, unit="B", unit_scale=True) as pbar:
- ... for i in range(100):
- ... pbar.update(1)
-
- Manual updates:
- >>> pbar = TQDM(total=100, desc="Training")
- >>> for epoch in range(100):
- ... # Do work
- ... pbar.update(1)
- >>> pbar.close()
- """
-
- # Constants
- MIN_RATE_CALC_INTERVAL = 0.01 # Minimum time interval for rate calculation
- RATE_SMOOTHING_FACTOR = 0.3 # Factor for exponential smoothing of rates
- MAX_SMOOTHED_RATE = 1000000 # Maximum rate to apply smoothing to
- NONINTERACTIVE_MIN_INTERVAL = 60.0 # Minimum interval for non-interactive environments
-
- def __init__(
- self,
- iterable: Any = None,
- desc: str | None = None,
- total: int | None = None,
- leave: bool = True,
- file: IO[str] | None = None,
- mininterval: float = 0.1,
- disable: bool | None = None,
- unit: str = "it",
- unit_scale: bool = True,
- unit_divisor: int = 1000,
- bar_format: str | None = None, # kept for API compatibility; not used for formatting
- initial: int = 0,
- **kwargs,
- ) -> None:
- """
- Initialize the TQDM progress bar with specified configuration options.
-
- Args:
- iterable (object, optional): Iterable to wrap with progress bar.
- desc (str, optional): Prefix description for the progress bar.
- total (int, optional): Expected number of iterations.
- leave (bool, optional): Whether to leave the progress bar after completion.
- file (object, optional): Output file stream for progress display.
- mininterval (float, optional): Minimum time interval between updates (default 0.1s, 60s in GitHub Actions).
- disable (bool, optional): Whether to disable the progress bar. Auto-detected if None.
- unit (str, optional): String for units of iteration (default "it" for items).
- unit_scale (bool, optional): Auto-scale units for bytes/data units.
- unit_divisor (int, optional): Divisor for unit scaling (default 1000).
- bar_format (str, optional): Custom bar format string.
- initial (int, optional): Initial counter value.
- **kwargs (Any): Additional keyword arguments for compatibility (ignored).
-
- Examples:
- >>> pbar = TQDM(range(100), desc="Processing")
- >>> with TQDM(total=1000, unit="B", unit_scale=True) as pbar:
- ... pbar.update(1024) # Updates by 1KB
- """
- # Disable if not verbose
- if disable is None:
- try:
- from ultralytics.utils import LOGGER, VERBOSE
-
- disable = not VERBOSE or LOGGER.getEffectiveLevel() > 20
- except ImportError:
- disable = False
-
- self.iterable = iterable
- self.desc = desc or ""
- self.total = total or (len(iterable) if hasattr(iterable, "__len__") else None) or None # prevent total=0
- self.disable = disable
- self.unit = unit
- self.unit_scale = unit_scale
- self.unit_divisor = unit_divisor
- self.leave = leave
- self.noninteractive = is_noninteractive_console()
- self.mininterval = max(mininterval, self.NONINTERACTIVE_MIN_INTERVAL) if self.noninteractive else mininterval
- self.initial = initial
-
- # Kept for API compatibility (unused for f-string formatting)
- self.bar_format = bar_format
-
- self.file = file or sys.stdout
-
- # Internal state
- self.n = self.initial
- self.last_print_n = self.initial
- self.last_print_t = time.time()
- self.start_t = time.time()
- self.last_rate = 0.0
- self.closed = False
- self.is_bytes = unit_scale and unit in ("B", "bytes")
- self.scales = (
- [(1073741824, "GB/s"), (1048576, "MB/s"), (1024, "KB/s")]
- if self.is_bytes
- else [(1e9, f"G{self.unit}/s"), (1e6, f"M{self.unit}/s"), (1e3, f"K{self.unit}/s")]
- )
-
- if not self.disable and self.total and not self.noninteractive:
- self._display()
-
- def _format_rate(self, rate: float) -> str:
- """Format rate with units."""
- if rate <= 0:
- return ""
- fallback = f"{rate:.1f}B/s" if self.is_bytes else f"{rate:.1f}{self.unit}/s"
- return next((f"{rate / t:.1f}{u}" for t, u in self.scales if rate >= t), fallback)
-
- def _format_num(self, num: int | float) -> str:
- """Format number with optional unit scaling."""
- if not self.unit_scale or not self.is_bytes:
- return str(num)
-
- for unit in ("", "K", "M", "G", "T"):
- if abs(num) < self.unit_divisor:
- return f"{num:3.1f}{unit}B" if unit else f"{num:.0f}B"
- num /= self.unit_divisor
- return f"{num:.1f}PB"
-
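The byte formatting above divides by unit_divisor until the value fits the next prefix; the same idea as a standalone function (illustrative only, not the class method itself):

def human_bytes(num: float, divisor: int = 1000) -> str:
    """Format a byte count with K/M/G/T prefixes, mirroring the helper above (sketch)."""
    for unit in ("", "K", "M", "G", "T"):
        if abs(num) < divisor:
            return f"{num:3.1f}{unit}B" if unit else f"{num:.0f}B"
        num /= divisor
    return f"{num:.1f}PB"


print(human_bytes(532), human_bytes(5_432_100))  # 532B 5.4MB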
- def _format_time(self, seconds: float) -> str:
- """Format time duration."""
- if seconds < 60:
- return f"{seconds:.1f}s"
- elif seconds < 3600:
- return f"{int(seconds // 60)}:{seconds % 60:02.0f}"
- else:
- h, m = int(seconds // 3600), int((seconds % 3600) // 60)
- return f"{h}:{m:02d}:{seconds % 60:02.0f}"
-
- def _generate_bar(self, width: int = 12) -> str:
- """Generate progress bar."""
- if self.total is None:
- return "━" * width if self.closed else "─" * width
-
- frac = min(1.0, self.n / self.total)
- filled = int(frac * width)
- bar = "━" * filled + "─" * (width - filled)
- if filled < width and frac * width - filled > 0.5:
- bar = f"{bar[:filled]}╸{bar[filled + 1 :]}"
- return bar
-
- def _should_update(self, dt: float, dn: int) -> bool:
- """Check if display should update."""
- if self.noninteractive:
- return False
- return (self.total is not None and self.n >= self.total) or (dt >= self.mininterval)
-
- def _display(self, final: bool = False) -> None:
- """Display progress bar."""
- if self.disable or (self.closed and not final):
- return
-
- current_time = time.time()
- dt = current_time - self.last_print_t
- dn = self.n - self.last_print_n
-
- if not final and not self._should_update(dt, dn):
- return
-
- # Calculate rate (avoid crazy numbers)
- if dt > self.MIN_RATE_CALC_INTERVAL:
- rate = dn / dt if dt else 0.0
- # Smooth rate for reasonable values, use raw rate for very high values
- if rate < self.MAX_SMOOTHED_RATE:
- self.last_rate = self.RATE_SMOOTHING_FACTOR * rate + (1 - self.RATE_SMOOTHING_FACTOR) * self.last_rate
- rate = self.last_rate
- else:
- rate = self.last_rate
-
- # At completion, use overall rate
- if self.total and self.n >= self.total:
- overall_elapsed = current_time - self.start_t
- if overall_elapsed > 0:
- rate = self.n / overall_elapsed
-
- # Update counters
- self.last_print_n = self.n
- self.last_print_t = current_time
- elapsed = current_time - self.start_t
-
- # Remaining time
- remaining_str = ""
- if self.total and 0 < self.n < self.total and elapsed > 0:
- est_rate = rate or (self.n / elapsed)
- remaining_str = f"<{self._format_time((self.total - self.n) / est_rate)}"
-
- # Numbers and percent
- if self.total:
- percent = (self.n / self.total) * 100
- n_str = self._format_num(self.n)
- t_str = self._format_num(self.total)
- if self.is_bytes:
- # Collapse suffix only when identical (e.g. "5.4/5.4MB")
- if n_str[-2] == t_str[-2]:
-                    n_str = n_str.rstrip("KMGTPB")  # Remove unit suffix from current since it matches the total's
- else:
- percent = 0.0
- n_str, t_str = self._format_num(self.n), "?"
-
- elapsed_str = self._format_time(elapsed)
- rate_str = self._format_rate(rate) or (self._format_rate(self.n / elapsed) if elapsed > 0 else "")
-
- bar = self._generate_bar()
-
- # Compose progress line via f-strings (two shapes: with/without total)
- if self.total:
- if self.is_bytes and self.n >= self.total:
- # Completed bytes: show only final size
- progress_str = f"{self.desc}: {percent:.0f}% {bar} {t_str} {rate_str} {elapsed_str}"
- else:
- progress_str = (
- f"{self.desc}: {percent:.0f}% {bar} {n_str}/{t_str} {rate_str} {elapsed_str}{remaining_str}"
- )
- else:
- progress_str = f"{self.desc}: {bar} {n_str} {rate_str} {elapsed_str}"
-
- # Write to output
- try:
- if self.noninteractive:
- # In non-interactive environments, avoid carriage return which creates empty lines
- self.file.write(progress_str)
- else:
- # In interactive terminals, use carriage return and clear line for updating display
- self.file.write(f"\r\033[K{progress_str}")
- self.file.flush()
- except Exception:
- pass
-
- def update(self, n: int = 1) -> None:
- """Update progress by n steps."""
- if not self.disable and not self.closed:
- self.n += n
- self._display()
-
- def set_description(self, desc: str | None) -> None:
- """Set description."""
- self.desc = desc or ""
- if not self.disable:
- self._display()
-
- def set_postfix(self, **kwargs: Any) -> None:
- """Set postfix (appends to description)."""
- if kwargs:
- postfix = ", ".join(f"{k}={v}" for k, v in kwargs.items())
- base_desc = self.desc.split(" | ")[0] if " | " in self.desc else self.desc
- self.set_description(f"{base_desc} | {postfix}")
-
- def close(self) -> None:
- """Close progress bar."""
- if self.closed:
- return
-
- self.closed = True
-
- if not self.disable:
- # Final display
- if self.total and self.n >= self.total:
- self.n = self.total
- self._display(final=True)
-
- # Cleanup
- if self.leave:
- self.file.write("\n")
- else:
- self.file.write("\r\033[K")
-
- try:
- self.file.flush()
- except Exception:
- pass
-
- def __enter__(self) -> TQDM:
- """Enter context manager."""
- return self
-
- def __exit__(self, *args: Any) -> None:
- """Exit context manager and close progress bar."""
- self.close()
-
- def __iter__(self) -> Any:
- """Iterate over the wrapped iterable with progress updates."""
- if self.iterable is None:
- raise TypeError("'NoneType' object is not iterable")
-
- try:
- for item in self.iterable:
- yield item
- self.update(1)
- finally:
- self.close()
-
- def __del__(self) -> None:
- """Destructor to ensure cleanup."""
- try:
- self.close()
- except Exception:
- pass
-
- def refresh(self) -> None:
- """Refresh display."""
- if not self.disable:
- self._display()
-
- def clear(self) -> None:
- """Clear progress bar."""
- if not self.disable:
- try:
- self.file.write("\r\033[K")
- self.file.flush()
- except Exception:
- pass
-
- @staticmethod
- def write(s: str, file: IO[str] | None = None, end: str = "\n") -> None:
- """Static method to write without breaking progress bar."""
- file = file or sys.stdout
- try:
- file.write(s + end)
- file.flush()
- except Exception:
- pass
-
-
-if __name__ == "__main__":
- import time
-
- print("1. Basic progress bar with known total:")
- for i in TQDM(range(3), desc="Known total"):
- time.sleep(0.05)
-
- print("\n2. Manual updates with known total:")
- pbar = TQDM(total=300, desc="Manual updates", unit="files")
- for i in range(300):
- time.sleep(0.03)
- pbar.update(1)
- if i % 10 == 9:
- pbar.set_description(f"Processing batch {i // 10 + 1}")
- pbar.close()
-
- print("\n3. Progress bar with unknown total:")
- pbar = TQDM(desc="Unknown total", unit="items")
- for i in range(25):
- time.sleep(0.08)
- pbar.update(1)
- if i % 5 == 4:
- pbar.set_postfix(processed=i + 1, status="OK")
- pbar.close()
-
- print("\n4. Context manager with unknown total:")
- with TQDM(desc="Processing stream", unit="B", unit_scale=True, unit_divisor=1024) as pbar:
- for i in range(30):
- time.sleep(0.1)
- pbar.update(1024 * 1024 * i) # Simulate processing MB of data
-
- print("\n5. Iterator with unknown length:")
-
- def data_stream():
- """Simulate a data stream of unknown length."""
- import random
-
- for i in range(random.randint(10, 20)):
- yield f"data_chunk_{i}"
-
- for chunk in TQDM(data_stream(), desc="Stream processing", unit="chunks"):
- time.sleep(0.1)
-
- print("\n6. File processing simulation (unknown size):")
-
- def process_files():
- """Simulate processing files of unknown count."""
- return [f"file_{i}.txt" for i in range(18)]
-
- pbar = TQDM(desc="Scanning files", unit="files")
- files = process_files()
- for i, filename in enumerate(files):
- time.sleep(0.06)
- pbar.update(1)
- pbar.set_description(f"Processing {filename}")
- pbar.close()
diff --git a/ultralytics/utils/triton.py b/ultralytics/utils/triton.py
deleted file mode 100644
index 6c122f3..0000000
--- a/ultralytics/utils/triton.py
+++ /dev/null
@@ -1,118 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from __future__ import annotations
-
-from urllib.parse import urlsplit
-
-import numpy as np
-
-
-class TritonRemoteModel:
- """
- Client for interacting with a remote Triton Inference Server model.
-
- This class provides a convenient interface for sending inference requests to a Triton Inference Server
- and processing the responses. Supports both HTTP and gRPC communication protocols.
-
- Attributes:
- endpoint (str): The name of the model on the Triton server.
- url (str): The URL of the Triton server.
- triton_client: The Triton client (either HTTP or gRPC).
- InferInput: The input class for the Triton client.
- InferRequestedOutput: The output request class for the Triton client.
- input_formats (list[str]): The data types of the model inputs.
- np_input_formats (list[type]): The numpy data types of the model inputs.
- input_names (list[str]): The names of the model inputs.
- output_names (list[str]): The names of the model outputs.
- metadata: The metadata associated with the model.
-
- Methods:
- __call__: Call the model with the given inputs and return the outputs.
-
- Examples:
- Initialize a Triton client with HTTP
- >>> model = TritonRemoteModel(url="localhost:8000", endpoint="yolov8", scheme="http")
-
- Make inference with numpy arrays
- >>> outputs = model(np.random.rand(1, 3, 640, 640).astype(np.float32))
- """
-
- def __init__(self, url: str, endpoint: str = "", scheme: str = ""):
- """
- Initialize the TritonRemoteModel for interacting with a remote Triton Inference Server.
-
- Arguments may be provided individually or parsed from a collective 'url' argument of the form
-            <scheme>://<netloc>/<endpoint>/<task_name>
-
- Args:
- url (str): The URL of the Triton server.
- endpoint (str, optional): The name of the model on the Triton server.
- scheme (str, optional): The communication scheme ('http' or 'grpc').
-
- Examples:
- >>> model = TritonRemoteModel(url="localhost:8000", endpoint="yolov8", scheme="http")
- >>> model = TritonRemoteModel(url="http://localhost:8000/yolov8")
- """
- if not endpoint and not scheme: # Parse all args from URL string
- splits = urlsplit(url)
- endpoint = splits.path.strip("/").split("/", 1)[0]
- scheme = splits.scheme
- url = splits.netloc
-
- self.endpoint = endpoint
- self.url = url
-
- # Choose the Triton client based on the communication scheme
- if scheme == "http":
- import tritonclient.http as client # noqa
-
- self.triton_client = client.InferenceServerClient(url=self.url, verbose=False, ssl=False)
- config = self.triton_client.get_model_config(endpoint)
- else:
- import tritonclient.grpc as client # noqa
-
- self.triton_client = client.InferenceServerClient(url=self.url, verbose=False, ssl=False)
- config = self.triton_client.get_model_config(endpoint, as_json=True)["config"]
-
- # Sort output names alphabetically, i.e. 'output0', 'output1', etc.
- config["output"] = sorted(config["output"], key=lambda x: x.get("name"))
-
- # Define model attributes
- type_map = {"TYPE_FP32": np.float32, "TYPE_FP16": np.float16, "TYPE_UINT8": np.uint8}
- self.InferRequestedOutput = client.InferRequestedOutput
- self.InferInput = client.InferInput
- self.input_formats = [x["data_type"] for x in config["input"]]
- self.np_input_formats = [type_map[x] for x in self.input_formats]
- self.input_names = [x["name"] for x in config["input"]]
- self.output_names = [x["name"] for x in config["output"]]
- self.metadata = eval(config.get("parameters", {}).get("metadata", {}).get("string_value", "None"))
-
- def __call__(self, *inputs: np.ndarray) -> list[np.ndarray]:
- """
- Call the model with the given inputs and return inference results.
-
- Args:
- *inputs (np.ndarray): Input data to the model. Each array should match the expected shape and type
- for the corresponding model input.
-
- Returns:
- (list[np.ndarray]): Model outputs with the same dtype as the input. Each element in the list
- corresponds to one of the model's output tensors.
-
- Examples:
- >>> model = TritonRemoteModel(url="localhost:8000", endpoint="yolov8", scheme="http")
- >>> outputs = model(np.random.rand(1, 3, 640, 640).astype(np.float32))
- """
- infer_inputs = []
- input_format = inputs[0].dtype
- for i, x in enumerate(inputs):
- if x.dtype != self.np_input_formats[i]:
- x = x.astype(self.np_input_formats[i])
- infer_input = self.InferInput(self.input_names[i], [*x.shape], self.input_formats[i].replace("TYPE_", ""))
- infer_input.set_data_from_numpy(x)
- infer_inputs.append(infer_input)
-
- infer_outputs = [self.InferRequestedOutput(output_name) for output_name in self.output_names]
- outputs = self.triton_client.infer(model_name=self.endpoint, inputs=infer_inputs, outputs=infer_outputs)
-
- return [outputs.as_numpy(output_name).astype(input_format) for output_name in self.output_names]
diff --git a/ultralytics/utils/tuner.py b/ultralytics/utils/tuner.py
deleted file mode 100644
index 6b025b5..0000000
--- a/ultralytics/utils/tuner.py
+++ /dev/null
@@ -1,159 +0,0 @@
-# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
-
-from ultralytics.cfg import TASK2DATA, TASK2METRIC, get_cfg, get_save_dir
-from ultralytics.utils import DEFAULT_CFG, DEFAULT_CFG_DICT, LOGGER, NUM_THREADS, checks, colorstr
-
-
-def run_ray_tune(
- model,
- space: dict = None,
- grace_period: int = 10,
- gpu_per_trial: int = None,
- max_samples: int = 10,
- **train_args,
-):
- """
- Run hyperparameter tuning using Ray Tune.
-
- Args:
- model (YOLO): Model to run the tuner on.
- space (dict, optional): The hyperparameter search space. If not provided, uses default space.
- grace_period (int, optional): The grace period in epochs of the ASHA scheduler.
- gpu_per_trial (int, optional): The number of GPUs to allocate per trial.
- max_samples (int, optional): The maximum number of trials to run.
- **train_args (Any): Additional arguments to pass to the `train()` method.
-
- Returns:
- (ray.tune.ResultGrid): A ResultGrid containing the results of the hyperparameter search.
-
- Examples:
- >>> from ultralytics import YOLO
- >>> model = YOLO("yolo11n.pt") # Load a YOLO11n model
-
- Start tuning hyperparameters for YOLO11n training on the COCO8 dataset
- >>> result_grid = model.tune(data="coco8.yaml", use_ray=True)
- """
- LOGGER.info("💡 Learn about RayTune at https://docs.ultralytics.com/integrations/ray-tune")
- if train_args is None:
- train_args = {}
-
- try:
- checks.check_requirements("ray[tune]")
-
- import ray
- from ray import tune
- from ray.air import RunConfig
- from ray.air.integrations.wandb import WandbLoggerCallback
- from ray.tune.schedulers import ASHAScheduler
- except ImportError:
- raise ModuleNotFoundError('Ray Tune required but not found. To install run: pip install "ray[tune]"')
-
- try:
- import wandb
-
- assert hasattr(wandb, "__version__")
- except (ImportError, AssertionError):
- wandb = False
-
- checks.check_version(ray.__version__, ">=2.0.0", "ray")
- default_space = {
- # 'optimizer': tune.choice(['SGD', 'Adam', 'AdamW', 'NAdam', 'RAdam', 'RMSProp']),
- "lr0": tune.uniform(1e-5, 1e-1),
- "lrf": tune.uniform(0.01, 1.0), # final OneCycleLR learning rate (lr0 * lrf)
- "momentum": tune.uniform(0.6, 0.98), # SGD momentum/Adam beta1
- "weight_decay": tune.uniform(0.0, 0.001), # optimizer weight decay
- "warmup_epochs": tune.uniform(0.0, 5.0), # warmup epochs (fractions ok)
- "warmup_momentum": tune.uniform(0.0, 0.95), # warmup initial momentum
- "box": tune.uniform(0.02, 0.2), # box loss gain
- "cls": tune.uniform(0.2, 4.0), # cls loss gain (scale with pixels)
- "hsv_h": tune.uniform(0.0, 0.1), # image HSV-Hue augmentation (fraction)
- "hsv_s": tune.uniform(0.0, 0.9), # image HSV-Saturation augmentation (fraction)
- "hsv_v": tune.uniform(0.0, 0.9), # image HSV-Value augmentation (fraction)
- "degrees": tune.uniform(0.0, 45.0), # image rotation (+/- deg)
- "translate": tune.uniform(0.0, 0.9), # image translation (+/- fraction)
- "scale": tune.uniform(0.0, 0.9), # image scale (+/- gain)
- "shear": tune.uniform(0.0, 10.0), # image shear (+/- deg)
- "perspective": tune.uniform(0.0, 0.001), # image perspective (+/- fraction), range 0-0.001
- "flipud": tune.uniform(0.0, 1.0), # image flip up-down (probability)
- "fliplr": tune.uniform(0.0, 1.0), # image flip left-right (probability)
- "bgr": tune.uniform(0.0, 1.0), # image channel BGR (probability)
- "mosaic": tune.uniform(0.0, 1.0), # image mosaic (probability)
- "mixup": tune.uniform(0.0, 1.0), # image mixup (probability)
- "cutmix": tune.uniform(0.0, 1.0), # image cutmix (probability)
- "copy_paste": tune.uniform(0.0, 1.0), # segment copy-paste (probability)
- }
-
- # Put the model in ray store
- task = model.task
- model_in_store = ray.put(model)
-
- def _tune(config):
- """Train the YOLO model with the specified hyperparameters and return results."""
- model_to_train = ray.get(model_in_store) # get the model from ray store for tuning
- model_to_train.reset_callbacks()
- config.update(train_args)
- results = model_to_train.train(**config)
- return results.results_dict
-
- # Get search space
- if not space and not train_args.get("resume"):
- space = default_space
- LOGGER.warning("Search space not provided, using default search space.")
-
- # Get dataset
- data = train_args.get("data", TASK2DATA[task])
- space["data"] = data
- if "data" not in train_args:
- LOGGER.warning(f'Data not provided, using default "data={data}".')
-
- # Define the trainable function with allocated resources
- trainable_with_resources = tune.with_resources(_tune, {"cpu": NUM_THREADS, "gpu": gpu_per_trial or 0})
-
- # Define the ASHA scheduler for hyperparameter search
- asha_scheduler = ASHAScheduler(
- time_attr="epoch",
- metric=TASK2METRIC[task],
- mode="max",
- max_t=train_args.get("epochs") or DEFAULT_CFG_DICT["epochs"] or 100,
- grace_period=grace_period,
- reduction_factor=3,
- )
-
- # Define the callbacks for the hyperparameter search
- tuner_callbacks = [WandbLoggerCallback(project="YOLOv8-tune")] if wandb else []
-
- # Create the Ray Tune hyperparameter search tuner
- tune_dir = get_save_dir(
- get_cfg(
- DEFAULT_CFG,
- {**train_args, **{"exist_ok": train_args.pop("resume", False)}}, # resume w/ same tune_dir
- ),
- name=train_args.pop("name", "tune"), # runs/{task}/{tune_dir}
- ) # must be absolute dir
- tune_dir.mkdir(parents=True, exist_ok=True)
- if tune.Tuner.can_restore(tune_dir):
- LOGGER.info(f"{colorstr('Tuner: ')} Resuming tuning run {tune_dir}...")
- tuner = tune.Tuner.restore(str(tune_dir), trainable=trainable_with_resources, resume_errored=True)
- else:
- tuner = tune.Tuner(
- trainable_with_resources,
- param_space=space,
- tune_config=tune.TuneConfig(
- scheduler=asha_scheduler,
- num_samples=max_samples,
- trial_name_creator=lambda trial: f"{trial.trainable_name}_{trial.trial_id}",
- trial_dirname_creator=lambda trial: f"{trial.trainable_name}_{trial.trial_id}",
- ),
- run_config=RunConfig(callbacks=tuner_callbacks, storage_path=tune_dir.parent, name=tune_dir.name),
- )
-
- # Run the hyperparameter search
- tuner.fit()
-
- # Get the results of the hyperparameter search
- results = tuner.get_results()
-
- # Shut down Ray to clean up workers
- ray.shutdown()
-
- return results
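For a custom search space, the function above accepts a dict of ray.tune distributions in place of the defaults; a hedged sketch of invoking it directly, with the dataset and ranges chosen purely for illustration:

from ray import tune

from ultralytics import YOLO
from ultralytics.utils.tuner import run_ray_tune

model = YOLO("yolo11n.pt")
space = {"lr0": tune.uniform(1e-5, 1e-2), "momentum": tune.uniform(0.8, 0.98)}
result_grid = run_ray_tune(model, space=space, max_samples=5, data="coco8.yaml", epochs=10)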