init commit

2025-11-08 19:15:39 +01:00
parent ecffcb08e8
commit c7adacf53b
470 changed files with 73751 additions and 0 deletions


@@ -0,0 +1,27 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
from .tasks import (
BaseModel,
ClassificationModel,
DetectionModel,
SegmentationModel,
guess_model_scale,
guess_model_task,
load_checkpoint,
parse_model,
torch_safe_load,
yaml_model_load,
)
__all__ = (
"load_checkpoint",
"parse_model",
"yaml_model_load",
"guess_model_task",
"guess_model_scale",
"torch_safe_load",
"DetectionModel",
"SegmentationModel",
"ClassificationModel",
"BaseModel",
)
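These exports cover the task model classes and the checkpoint/config helpers. A minimal usage sketch, assuming this is the ultralytics.nn package and that a yolo11n.pt checkpoint is available locally (an illustration, not part of the committed file):

from ultralytics.nn import DetectionModel, guess_model_task, load_checkpoint

model = DetectionModel("yolo11n.yaml", ch=3, nc=80)   # build a detector from a bundled YAML config
ckpt_model, ckpt = load_checkpoint("yolo11n.pt")      # returns (model, checkpoint dict)
print(guess_model_task("yolo11n-seg.yaml"))           # -> "segment"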

3 binary files not shown.


@@ -0,0 +1,886 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
from __future__ import annotations
import ast
import json
import platform
import zipfile
from collections import OrderedDict, namedtuple
from pathlib import Path
from typing import Any
import cv2
import numpy as np
import torch
import torch.nn as nn
from PIL import Image
from ultralytics.utils import ARM64, IS_JETSON, LINUX, LOGGER, PYTHON_VERSION, ROOT, YAML, is_jetson
from ultralytics.utils.checks import check_requirements, check_suffix, check_version, check_yaml, is_rockchip
from ultralytics.utils.downloads import attempt_download_asset, is_url
def check_class_names(names: list | dict) -> dict[int, str]:
"""
Check class names and convert to dict format if needed.
Args:
names (list | dict): Class names as list or dict format.
Returns:
(dict): Class names in dict format with integer keys and string values.
Raises:
KeyError: If class indices are invalid for the dataset size.
"""
if isinstance(names, list): # names is a list
names = dict(enumerate(names)) # convert to dict
if isinstance(names, dict):
# Convert string keys to int (e.g. '0' to 0) and non-string values to str (e.g. True to 'True')
names = {int(k): str(v) for k, v in names.items()}
n = len(names)
if max(names.keys()) >= n:
raise KeyError(
f"{n}-class dataset requires class indices 0-{n - 1}, but you have invalid class indices "
f"{min(names.keys())}-{max(names.keys())} defined in your dataset YAML."
)
if isinstance(names[0], str) and names[0].startswith("n0"): # imagenet class codes, i.e. 'n01440764'
names_map = YAML.load(ROOT / "cfg/datasets/ImageNet.yaml")["map"] # human-readable names
names = {k: names_map[v] for k, v in names.items()}
return names
def default_class_names(data: str | Path | None = None) -> dict[int, str]:
"""
Apply default class names to an input YAML file or return numerical class names.
Args:
data (str | Path, optional): Path to YAML file containing class names.
Returns:
(dict): Dictionary mapping class indices to class names.
"""
if data:
try:
return YAML.load(check_yaml(data))["names"]
except Exception:
pass
return {i: f"class{i}" for i in range(999)} # return default if above errors
class AutoBackend(nn.Module):
"""
Handle dynamic backend selection for running inference using Ultralytics YOLO models.
The AutoBackend class is designed to provide an abstraction layer for various inference engines. It supports a wide
range of formats, each with specific naming conventions as outlined below:
Supported Formats and Naming Conventions:
| Format | File Suffix |
| --------------------- | ----------------- |
| PyTorch | *.pt |
| TorchScript | *.torchscript |
| ONNX Runtime | *.onnx |
| ONNX OpenCV DNN | *.onnx (dnn=True) |
| OpenVINO | *openvino_model/ |
| CoreML | *.mlpackage |
| TensorRT | *.engine |
| TensorFlow SavedModel | *_saved_model/ |
| TensorFlow GraphDef | *.pb |
| TensorFlow Lite | *.tflite |
| TensorFlow Edge TPU | *_edgetpu.tflite |
| PaddlePaddle | *_paddle_model/ |
| MNN | *.mnn |
| NCNN | *_ncnn_model/ |
| IMX | *_imx_model/ |
| RKNN | *_rknn_model/ |
Attributes:
model (torch.nn.Module): The loaded YOLO model.
device (torch.device): The device (CPU or GPU) on which the model is loaded.
task (str): The type of task the model performs (detect, segment, classify, pose).
names (dict): A dictionary of class names that the model can detect.
stride (int): The model stride, typically 32 for YOLO models.
fp16 (bool): Whether the model uses half-precision (FP16) inference.
nhwc (bool): Whether the model expects NHWC input format instead of NCHW.
pt (bool): Whether the model is a PyTorch model.
jit (bool): Whether the model is a TorchScript model.
onnx (bool): Whether the model is an ONNX model.
xml (bool): Whether the model is an OpenVINO model.
engine (bool): Whether the model is a TensorRT engine.
coreml (bool): Whether the model is a CoreML model.
saved_model (bool): Whether the model is a TensorFlow SavedModel.
pb (bool): Whether the model is a TensorFlow GraphDef.
tflite (bool): Whether the model is a TensorFlow Lite model.
edgetpu (bool): Whether the model is a TensorFlow Edge TPU model.
tfjs (bool): Whether the model is a TensorFlow.js model.
paddle (bool): Whether the model is a PaddlePaddle model.
mnn (bool): Whether the model is an MNN model.
ncnn (bool): Whether the model is an NCNN model.
imx (bool): Whether the model is an IMX model.
rknn (bool): Whether the model is an RKNN model.
triton (bool): Whether the model is a Triton Inference Server model.
Methods:
forward: Run inference on an input image.
from_numpy: Convert numpy array to tensor.
warmup: Warm up the model with a dummy input.
_model_type: Determine the model type from file path.
Examples:
>>> model = AutoBackend(model="yolo11n.pt", device="cuda")
>>> results = model(img)
"""
@torch.no_grad()
def __init__(
self,
model: str | torch.nn.Module = "yolo11n.pt",
device: torch.device = torch.device("cpu"),
dnn: bool = False,
data: str | Path | None = None,
fp16: bool = False,
fuse: bool = True,
verbose: bool = True,
):
"""
Initialize the AutoBackend for inference.
Args:
model (str | torch.nn.Module): Path to the model weights file or a module instance.
device (torch.device): Device to run the model on.
dnn (bool): Use OpenCV DNN module for ONNX inference.
data (str | Path, optional): Path to the additional data.yaml file containing class names.
fp16 (bool): Enable half-precision inference. Supported only on specific backends.
fuse (bool): Fuse Conv2D + BatchNorm layers for optimization.
verbose (bool): Enable verbose logging.
"""
super().__init__()
nn_module = isinstance(model, torch.nn.Module)
(
pt,
jit,
onnx,
xml,
engine,
coreml,
saved_model,
pb,
tflite,
edgetpu,
tfjs,
paddle,
mnn,
ncnn,
imx,
rknn,
triton,
) = self._model_type("" if nn_module else model)
fp16 &= pt or jit or onnx or xml or engine or nn_module or triton # FP16
nhwc = coreml or saved_model or pb or tflite or edgetpu or rknn # BHWC formats (vs torch BCHW)
stride, ch = 32, 3 # default stride and channels
end2end, dynamic = False, False
metadata, task = None, None
# Set device
cuda = isinstance(device, torch.device) and torch.cuda.is_available() and device.type != "cpu" # use CUDA
if cuda and not any([nn_module, pt, jit, engine, onnx, paddle]): # GPU dataloader formats
device = torch.device("cpu")
cuda = False
# Download if not local
w = attempt_download_asset(model) if pt else model # weights path
# PyTorch (in-memory or file)
if nn_module or pt:
if nn_module:
pt = True
if fuse:
if IS_JETSON and is_jetson(jetpack=5):
# Jetson Jetpack5 requires device before fuse https://github.com/ultralytics/ultralytics/pull/21028
model = model.to(device)
model = model.fuse(verbose=verbose)
model = model.to(device)
else: # pt file
from ultralytics.nn.tasks import load_checkpoint
model, _ = load_checkpoint(model, device=device, fuse=fuse) # load model, ckpt
# Common PyTorch model processing
if hasattr(model, "kpt_shape"):
kpt_shape = model.kpt_shape # pose-only
stride = max(int(model.stride.max()), 32) # model stride
names = model.module.names if hasattr(model, "module") else model.names # get class names
model.half() if fp16 else model.float()
ch = model.yaml.get("channels", 3)
for p in model.parameters():
p.requires_grad = False
self.model = model # explicitly assign for to(), cpu(), cuda(), half()
# TorchScript
elif jit:
import torchvision # noqa - https://github.com/ultralytics/ultralytics/pull/19747
LOGGER.info(f"Loading {w} for TorchScript inference...")
extra_files = {"config.txt": ""} # model metadata
model = torch.jit.load(w, _extra_files=extra_files, map_location=device)
model.half() if fp16 else model.float()
if extra_files["config.txt"]: # load metadata dict
metadata = json.loads(extra_files["config.txt"], object_hook=lambda x: dict(x.items()))
# ONNX OpenCV DNN
elif dnn:
LOGGER.info(f"Loading {w} for ONNX OpenCV DNN inference...")
check_requirements("opencv-python>=4.5.4")
net = cv2.dnn.readNetFromONNX(w)
# ONNX Runtime and IMX
elif onnx or imx:
LOGGER.info(f"Loading {w} for ONNX Runtime inference...")
check_requirements(("onnx", "onnxruntime-gpu" if cuda else "onnxruntime"))
import onnxruntime
providers = ["CPUExecutionProvider"]
if cuda:
if "CUDAExecutionProvider" in onnxruntime.get_available_providers():
providers.insert(0, "CUDAExecutionProvider")
else: # Only log warning if CUDA was requested but unavailable
LOGGER.warning("Failed to start ONNX Runtime with CUDA. Using CPU...")
device = torch.device("cpu")
cuda = False
LOGGER.info(f"Using ONNX Runtime {onnxruntime.__version__} {providers[0]}")
if onnx:
session = onnxruntime.InferenceSession(w, providers=providers)
else:
check_requirements(
["model-compression-toolkit>=2.4.1", "sony-custom-layers[torch]>=0.3.0", "onnxruntime-extensions"]
)
w = next(Path(w).glob("*.onnx"))
LOGGER.info(f"Loading {w} for ONNX IMX inference...")
import mct_quantizers as mctq
from sony_custom_layers.pytorch.nms import nms_ort # noqa
session_options = mctq.get_ort_session_options()
session_options.enable_mem_reuse = False # fix the shape mismatch from onnxruntime
session = onnxruntime.InferenceSession(w, session_options, providers=["CPUExecutionProvider"])
output_names = [x.name for x in session.get_outputs()]
metadata = session.get_modelmeta().custom_metadata_map
dynamic = isinstance(session.get_outputs()[0].shape[0], str)
fp16 = "float16" in session.get_inputs()[0].type
if not dynamic:
io = session.io_binding()
bindings = []
for output in session.get_outputs():
out_fp16 = "float16" in output.type
y_tensor = torch.empty(output.shape, dtype=torch.float16 if out_fp16 else torch.float32).to(device)
io.bind_output(
name=output.name,
device_type=device.type,
device_id=device.index if cuda else 0,
element_type=np.float16 if out_fp16 else np.float32,
shape=tuple(y_tensor.shape),
buffer_ptr=y_tensor.data_ptr(),
)
bindings.append(y_tensor)
# OpenVINO
elif xml:
LOGGER.info(f"Loading {w} for OpenVINO inference...")
check_requirements("openvino>=2024.0.0")
import openvino as ov
core = ov.Core()
device_name = "AUTO"
if isinstance(device, str) and device.startswith("intel"):
device_name = device.split(":")[1].upper() # Intel OpenVINO device
device = torch.device("cpu")
if device_name not in core.available_devices:
LOGGER.warning(f"OpenVINO device '{device_name}' not available. Using 'AUTO' instead.")
device_name = "AUTO"
w = Path(w)
if not w.is_file(): # if not *.xml
w = next(w.glob("*.xml")) # get *.xml file from *_openvino_model dir
ov_model = core.read_model(model=str(w), weights=w.with_suffix(".bin"))
if ov_model.get_parameters()[0].get_layout().empty:
ov_model.get_parameters()[0].set_layout(ov.Layout("NCHW"))
metadata = w.parent / "metadata.yaml"
if metadata.exists():
metadata = YAML.load(metadata)
batch = metadata["batch"]
dynamic = metadata.get("args", {}).get("dynamic", dynamic)
# OpenVINO inference modes are 'LATENCY', 'THROUGHPUT' (not recommended), or 'CUMULATIVE_THROUGHPUT'
inference_mode = "CUMULATIVE_THROUGHPUT" if batch > 1 and dynamic else "LATENCY"
ov_compiled_model = core.compile_model(
ov_model,
device_name=device_name,
config={"PERFORMANCE_HINT": inference_mode},
)
LOGGER.info(
f"Using OpenVINO {inference_mode} mode for batch={batch} inference on {', '.join(ov_compiled_model.get_property('EXECUTION_DEVICES'))}..."
)
input_name = ov_compiled_model.input().get_any_name()
# TensorRT
elif engine:
LOGGER.info(f"Loading {w} for TensorRT inference...")
if IS_JETSON and check_version(PYTHON_VERSION, "<=3.8.10"):
# fix error: `np.bool` was a deprecated alias for the builtin `bool` for JetPack 4 and JetPack 5 with Python <= 3.8.10
check_requirements("numpy==1.23.5")
try: # https://developer.nvidia.com/nvidia-tensorrt-download
import tensorrt as trt # noqa
except ImportError:
if LINUX:
check_requirements("tensorrt>7.0.0,!=10.1.0")
import tensorrt as trt # noqa
check_version(trt.__version__, ">=7.0.0", hard=True)
check_version(trt.__version__, "!=10.1.0", msg="https://github.com/ultralytics/ultralytics/pull/14239")
if device.type == "cpu":
device = torch.device("cuda:0")
Binding = namedtuple("Binding", ("name", "dtype", "shape", "data", "ptr"))
logger = trt.Logger(trt.Logger.INFO)
# Read file
with open(w, "rb") as f, trt.Runtime(logger) as runtime:
try:
meta_len = int.from_bytes(f.read(4), byteorder="little") # read metadata length
metadata = json.loads(f.read(meta_len).decode("utf-8")) # read metadata
dla = metadata.get("dla", None)
if dla is not None:
runtime.DLA_core = int(dla)
except UnicodeDecodeError:
f.seek(0) # engine file may lack embedded Ultralytics metadata
model = runtime.deserialize_cuda_engine(f.read()) # read engine
# Model context
try:
context = model.create_execution_context()
except Exception as e: # model is None
LOGGER.error(f"TensorRT model exported with a different version than {trt.__version__}\n")
raise e
bindings = OrderedDict()
output_names = []
fp16 = False # default updated below
dynamic = False
is_trt10 = not hasattr(model, "num_bindings")
num = range(model.num_io_tensors) if is_trt10 else range(model.num_bindings)
for i in num:
if is_trt10:
name = model.get_tensor_name(i)
dtype = trt.nptype(model.get_tensor_dtype(name))
is_input = model.get_tensor_mode(name) == trt.TensorIOMode.INPUT
if is_input:
if -1 in tuple(model.get_tensor_shape(name)):
dynamic = True
context.set_input_shape(name, tuple(model.get_tensor_profile_shape(name, 0)[1]))
if dtype == np.float16:
fp16 = True
else:
output_names.append(name)
shape = tuple(context.get_tensor_shape(name))
else: # TensorRT < 10.0
name = model.get_binding_name(i)
dtype = trt.nptype(model.get_binding_dtype(i))
is_input = model.binding_is_input(i)
if model.binding_is_input(i):
if -1 in tuple(model.get_binding_shape(i)): # dynamic
dynamic = True
context.set_binding_shape(i, tuple(model.get_profile_shape(0, i)[1]))
if dtype == np.float16:
fp16 = True
else:
output_names.append(name)
shape = tuple(context.get_binding_shape(i))
im = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device)
bindings[name] = Binding(name, dtype, shape, im, int(im.data_ptr()))
binding_addrs = OrderedDict((n, d.ptr) for n, d in bindings.items())
# CoreML
elif coreml:
check_requirements("coremltools>=8.0")
LOGGER.info(f"Loading {w} for CoreML inference...")
import coremltools as ct
model = ct.models.MLModel(w)
metadata = dict(model.user_defined_metadata)
# TF SavedModel
elif saved_model:
LOGGER.info(f"Loading {w} for TensorFlow SavedModel inference...")
import tensorflow as tf
keras = False # assume TF1 saved_model
model = tf.keras.models.load_model(w) if keras else tf.saved_model.load(w)
metadata = Path(w) / "metadata.yaml"
# TF GraphDef
elif pb: # https://www.tensorflow.org/guide/migrate#a_graphpb_or_graphpbtxt
LOGGER.info(f"Loading {w} for TensorFlow GraphDef inference...")
import tensorflow as tf
from ultralytics.engine.exporter import gd_outputs
def wrap_frozen_graph(gd, inputs, outputs):
"""Wrap frozen graphs for deployment."""
x = tf.compat.v1.wrap_function(lambda: tf.compat.v1.import_graph_def(gd, name=""), []) # wrapped
ge = x.graph.as_graph_element
return x.prune(tf.nest.map_structure(ge, inputs), tf.nest.map_structure(ge, outputs))
gd = tf.Graph().as_graph_def() # TF GraphDef
with open(w, "rb") as f:
gd.ParseFromString(f.read())
frozen_func = wrap_frozen_graph(gd, inputs="x:0", outputs=gd_outputs(gd))
try: # find metadata in SavedModel alongside GraphDef
metadata = next(Path(w).resolve().parent.rglob(f"{Path(w).stem}_saved_model*/metadata.yaml"))
except StopIteration:
pass
# TFLite or TFLite Edge TPU
elif tflite or edgetpu: # https://ai.google.dev/edge/litert/microcontrollers/python
try: # https://coral.ai/docs/edgetpu/tflite-python/#update-existing-tf-lite-code-for-the-edge-tpu
from tflite_runtime.interpreter import Interpreter, load_delegate
except ImportError:
import tensorflow as tf
Interpreter, load_delegate = tf.lite.Interpreter, tf.lite.experimental.load_delegate
if edgetpu: # TF Edge TPU https://coral.ai/software/#edgetpu-runtime
device = device[3:] if str(device).startswith("tpu") else ":0"
LOGGER.info(f"Loading {w} on device {device[1:]} for TensorFlow Lite Edge TPU inference...")
delegate = {"Linux": "libedgetpu.so.1", "Darwin": "libedgetpu.1.dylib", "Windows": "edgetpu.dll"}[
platform.system()
]
interpreter = Interpreter(
model_path=w,
experimental_delegates=[load_delegate(delegate, options={"device": device})],
)
device = "cpu" # Required, otherwise PyTorch will try to use the wrong device
else: # TFLite
LOGGER.info(f"Loading {w} for TensorFlow Lite inference...")
interpreter = Interpreter(model_path=w) # load TFLite model
interpreter.allocate_tensors() # allocate
input_details = interpreter.get_input_details() # inputs
output_details = interpreter.get_output_details() # outputs
# Load metadata
try:
with zipfile.ZipFile(w, "r") as zf:
name = zf.namelist()[0]
contents = zf.read(name).decode("utf-8")
if name == "metadata.json": # Custom Ultralytics metadata dict for Python>=3.12
metadata = json.loads(contents)
else:
metadata = ast.literal_eval(contents) # Default tflite-support metadata for Python<=3.11
except (zipfile.BadZipFile, SyntaxError, ValueError, json.JSONDecodeError):
pass
# TF.js
elif tfjs:
raise NotImplementedError("Ultralytics TF.js inference is not currently supported.")
# PaddlePaddle
elif paddle:
LOGGER.info(f"Loading {w} for PaddlePaddle inference...")
check_requirements(
"paddlepaddle-gpu"
if torch.cuda.is_available()
else "paddlepaddle==3.0.0" # pin 3.0.0 for ARM64
if ARM64
else "paddlepaddle>=3.0.0"
)
import paddle.inference as pdi # noqa
w = Path(w)
model_file, params_file = None, None
if w.is_dir():
model_file = next(w.rglob("*.json"), None)
params_file = next(w.rglob("*.pdiparams"), None)
elif w.suffix == ".pdiparams":
model_file = w.with_name("model.json")
params_file = w
if not (model_file and params_file and model_file.is_file() and params_file.is_file()):
raise FileNotFoundError(f"Paddle model not found in {w}. Both .json and .pdiparams files are required.")
config = pdi.Config(str(model_file), str(params_file))
if cuda:
config.enable_use_gpu(memory_pool_init_size_mb=2048, device_id=0)
predictor = pdi.create_predictor(config)
input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
output_names = predictor.get_output_names()
metadata = w / "metadata.yaml"
# MNN
elif mnn:
LOGGER.info(f"Loading {w} for MNN inference...")
check_requirements("MNN") # requires MNN
import os
import MNN
config = {"precision": "low", "backend": "CPU", "numThread": (os.cpu_count() + 1) // 2}
rt = MNN.nn.create_runtime_manager((config,))
net = MNN.nn.load_module_from_file(w, [], [], runtime_manager=rt, rearrange=True)
def torch_to_mnn(x):
return MNN.expr.const(x.data_ptr(), x.shape)
metadata = json.loads(net.get_info()["bizCode"])
# NCNN
elif ncnn:
LOGGER.info(f"Loading {w} for NCNN inference...")
check_requirements("git+https://github.com/Tencent/ncnn.git" if ARM64 else "ncnn", cmds="--no-deps")
import ncnn as pyncnn
net = pyncnn.Net()
net.opt.use_vulkan_compute = cuda
w = Path(w)
if not w.is_file(): # if not *.param
w = next(w.glob("*.param")) # get *.param file from *_ncnn_model dir
net.load_param(str(w))
net.load_model(str(w.with_suffix(".bin")))
metadata = w.parent / "metadata.yaml"
# NVIDIA Triton Inference Server
elif triton:
check_requirements("tritonclient[all]")
from ultralytics.utils.triton import TritonRemoteModel
model = TritonRemoteModel(w)
metadata = model.metadata
# RKNN
elif rknn:
if not is_rockchip():
raise OSError("RKNN inference is only supported on Rockchip devices.")
LOGGER.info(f"Loading {w} for RKNN inference...")
check_requirements("rknn-toolkit-lite2")
from rknnlite.api import RKNNLite
w = Path(w)
if not w.is_file(): # if not *.rknn
w = next(w.rglob("*.rknn")) # get *.rknn file from *_rknn_model dir
rknn_model = RKNNLite()
rknn_model.load_rknn(str(w))
rknn_model.init_runtime()
metadata = w.parent / "metadata.yaml"
# Any other format (unsupported)
else:
from ultralytics.engine.exporter import export_formats
raise TypeError(
f"model='{w}' is not a supported model format. Ultralytics supports: {export_formats()['Format']}\n"
f"See https://docs.ultralytics.com/modes/predict for help."
)
# Load external metadata YAML
if isinstance(metadata, (str, Path)) and Path(metadata).exists():
metadata = YAML.load(metadata)
if metadata and isinstance(metadata, dict):
for k, v in metadata.items():
if k in {"stride", "batch", "channels"}:
metadata[k] = int(v)
elif k in {"imgsz", "names", "kpt_shape", "args"} and isinstance(v, str):
metadata[k] = eval(v)
stride = metadata["stride"]
task = metadata["task"]
batch = metadata["batch"]
imgsz = metadata["imgsz"]
names = metadata["names"]
kpt_shape = metadata.get("kpt_shape")
end2end = metadata.get("args", {}).get("nms", False)
dynamic = metadata.get("args", {}).get("dynamic", dynamic)
ch = metadata.get("channels", 3)
elif not (pt or triton or nn_module):
LOGGER.warning(f"Metadata not found for 'model={w}'")
# Check names
if "names" not in locals(): # names missing
names = default_class_names(data)
names = check_class_names(names)
self.__dict__.update(locals()) # assign all variables to self
def forward(
self,
im: torch.Tensor,
augment: bool = False,
visualize: bool = False,
embed: list | None = None,
**kwargs: Any,
) -> torch.Tensor | list[torch.Tensor]:
"""
Run inference on an AutoBackend model.
Args:
im (torch.Tensor): The image tensor to perform inference on.
augment (bool): Whether to perform data augmentation during inference.
visualize (bool): Whether to visualize the output predictions.
embed (list, optional): A list of feature vectors/embeddings to return.
**kwargs (Any): Additional keyword arguments for model configuration.
Returns:
(torch.Tensor | list[torch.Tensor]): The raw output tensor(s) from the model.
"""
b, ch, h, w = im.shape # batch, channel, height, width
if self.fp16 and im.dtype != torch.float16:
im = im.half() # to FP16
if self.nhwc:
im = im.permute(0, 2, 3, 1) # torch BCHW to numpy BHWC shape(1,320,192,3)
# PyTorch
if self.pt or self.nn_module:
y = self.model(im, augment=augment, visualize=visualize, embed=embed, **kwargs)
# TorchScript
elif self.jit:
y = self.model(im)
# ONNX OpenCV DNN
elif self.dnn:
im = im.cpu().numpy() # torch to numpy
self.net.setInput(im)
y = self.net.forward()
# ONNX Runtime
elif self.onnx or self.imx:
if self.dynamic:
im = im.cpu().numpy() # torch to numpy
y = self.session.run(self.output_names, {self.session.get_inputs()[0].name: im})
else:
if not self.cuda:
im = im.cpu()
self.io.bind_input(
name="images",
device_type=im.device.type,
device_id=im.device.index if im.device.type == "cuda" else 0,
element_type=np.float16 if self.fp16 else np.float32,
shape=tuple(im.shape),
buffer_ptr=im.data_ptr(),
)
self.session.run_with_iobinding(self.io)
y = self.bindings
if self.imx:
if self.task == "detect":
# boxes, conf, cls
y = np.concatenate([y[0], y[1][:, :, None], y[2][:, :, None]], axis=-1)
elif self.task == "pose":
# boxes, conf, kpts
y = np.concatenate([y[0], y[1][:, :, None], y[2][:, :, None], y[3]], axis=-1)
# OpenVINO
elif self.xml:
im = im.cpu().numpy() # FP32
if self.inference_mode in {"THROUGHPUT", "CUMULATIVE_THROUGHPUT"}: # optimized for larger batch-sizes
n = im.shape[0] # number of images in batch
results = [None] * n # preallocate list with None to match the number of images
def callback(request, userdata):
"""Place result in preallocated list using userdata index."""
results[userdata] = request.results
# Create AsyncInferQueue, set the callback and start asynchronous inference for each input image
async_queue = self.ov.AsyncInferQueue(self.ov_compiled_model)
async_queue.set_callback(callback)
for i in range(n):
# Start async inference with userdata=i to specify the position in results list
async_queue.start_async(inputs={self.input_name: im[i : i + 1]}, userdata=i) # keep image as BCHW
async_queue.wait_all() # wait for all inference requests to complete
y = [list(r.values()) for r in results]
y = [np.concatenate(x) for x in zip(*y)]
else: # inference_mode = "LATENCY", optimized for fastest first result at batch-size 1
y = list(self.ov_compiled_model(im).values())
# TensorRT
elif self.engine:
if self.dynamic and im.shape != self.bindings["images"].shape:
if self.is_trt10:
self.context.set_input_shape("images", im.shape)
self.bindings["images"] = self.bindings["images"]._replace(shape=im.shape)
for name in self.output_names:
self.bindings[name].data.resize_(tuple(self.context.get_tensor_shape(name)))
else:
i = self.model.get_binding_index("images")
self.context.set_binding_shape(i, im.shape)
self.bindings["images"] = self.bindings["images"]._replace(shape=im.shape)
for name in self.output_names:
i = self.model.get_binding_index(name)
self.bindings[name].data.resize_(tuple(self.context.get_binding_shape(i)))
s = self.bindings["images"].shape
assert im.shape == s, f"input size {im.shape} {'>' if self.dynamic else 'not equal to'} max model size {s}"
self.binding_addrs["images"] = int(im.data_ptr())
self.context.execute_v2(list(self.binding_addrs.values()))
y = [self.bindings[x].data for x in sorted(self.output_names)]
# CoreML
elif self.coreml:
im = im[0].cpu().numpy()
im_pil = Image.fromarray((im * 255).astype("uint8"))
# im = im.resize((192, 320), Image.BILINEAR)
y = self.model.predict({"image": im_pil}) # coordinates are xywh normalized
if "confidence" in y: # NMS included
from ultralytics.utils.ops import xywh2xyxy
box = xywh2xyxy(y["coordinates"] * [[w, h, w, h]]) # xyxy pixels
cls = y["confidence"].argmax(1, keepdims=True)
y = np.concatenate((box, np.take_along_axis(y["confidence"], cls, axis=1), cls), 1)[None]
else:
y = list(y.values())
if len(y) == 2 and len(y[1].shape) != 4: # segmentation model
y = list(reversed(y)) # reversed for segmentation models (pred, proto)
# PaddlePaddle
elif self.paddle:
im = im.cpu().numpy().astype(np.float32)
self.input_handle.copy_from_cpu(im)
self.predictor.run()
y = [self.predictor.get_output_handle(x).copy_to_cpu() for x in self.output_names]
# MNN
elif self.mnn:
input_var = self.torch_to_mnn(im)
output_var = self.net.onForward([input_var])
y = [x.read() for x in output_var]
# NCNN
elif self.ncnn:
mat_in = self.pyncnn.Mat(im[0].cpu().numpy())
with self.net.create_extractor() as ex:
ex.input(self.net.input_names()[0], mat_in)
# WARNING: 'output_names' sorted as a temporary fix for https://github.com/pnnx/pnnx/issues/130
y = [np.array(ex.extract(x)[1])[None] for x in sorted(self.net.output_names())]
# NVIDIA Triton Inference Server
elif self.triton:
im = im.cpu().numpy() # torch to numpy
y = self.model(im)
# RKNN
elif self.rknn:
im = (im.cpu().numpy() * 255).astype("uint8")
im = im if isinstance(im, (list, tuple)) else [im]
y = self.rknn_model.inference(inputs=im)
# TensorFlow (SavedModel, GraphDef, Lite, Edge TPU)
else:
im = im.cpu().numpy()
if self.saved_model: # SavedModel
y = self.model(im, training=False) if self.keras else self.model.serving_default(im)
if not isinstance(y, list):
y = [y]
elif self.pb: # GraphDef
y = self.frozen_func(x=self.tf.constant(im))
else: # Lite or Edge TPU
details = self.input_details[0]
is_int = details["dtype"] in {np.int8, np.int16} # is TFLite quantized int8 or int16 model
if is_int:
scale, zero_point = details["quantization"]
im = (im / scale + zero_point).astype(details["dtype"]) # de-scale
self.interpreter.set_tensor(details["index"], im)
self.interpreter.invoke()
y = []
for output in self.output_details:
x = self.interpreter.get_tensor(output["index"])
if is_int:
scale, zero_point = output["quantization"]
x = (x.astype(np.float32) - zero_point) * scale # re-scale
if x.ndim == 3: # if task is not classification, excluding masks (ndim=4) as well
# Denormalize xywh by image size. See https://github.com/ultralytics/ultralytics/pull/1695
# xywh are normalized in TFLite/EdgeTPU to mitigate quantization error of integer models
if x.shape[-1] == 6 or self.end2end: # end-to-end model
x[:, :, [0, 2]] *= w
x[:, :, [1, 3]] *= h
if self.task == "pose":
x[:, :, 6::3] *= w
x[:, :, 7::3] *= h
else:
x[:, [0, 2]] *= w
x[:, [1, 3]] *= h
if self.task == "pose":
x[:, 5::3] *= w
x[:, 6::3] *= h
y.append(x)
# TF segment fixes: export is reversed vs ONNX export and protos are transposed
if len(y) == 2: # segment with (det, proto) output order reversed
if len(y[1].shape) != 4:
y = list(reversed(y)) # should be y = (1, 116, 8400), (1, 160, 160, 32)
if y[1].shape[-1] == 6: # end-to-end model
y = [y[1]]
else:
y[1] = np.transpose(y[1], (0, 3, 1, 2)) # should be y = (1, 116, 8400), (1, 32, 160, 160)
y = [x if isinstance(x, np.ndarray) else x.numpy() for x in y]
# for x in y:
# print(type(x), len(x)) if isinstance(x, (list, tuple)) else print(type(x), x.shape) # debug shapes
if isinstance(y, (list, tuple)):
if len(self.names) == 999 and (self.task == "segment" or len(y) == 2): # segments and names not defined
nc = y[0].shape[1] - y[1].shape[1] - 4 # y = (1, 32, 160, 160), (1, 116, 8400)
self.names = {i: f"class{i}" for i in range(nc)}
return self.from_numpy(y[0]) if len(y) == 1 else [self.from_numpy(x) for x in y]
else:
return self.from_numpy(y)
def from_numpy(self, x: np.ndarray) -> torch.Tensor:
"""
Convert a numpy array to a tensor.
Args:
x (np.ndarray): The array to be converted.
Returns:
(torch.Tensor): The converted tensor
"""
return torch.tensor(x).to(self.device) if isinstance(x, np.ndarray) else x
def warmup(self, imgsz: tuple[int, int, int, int] = (1, 3, 640, 640)) -> None:
"""
Warm up the model by running one forward pass with a dummy input.
Args:
imgsz (tuple): The shape of the dummy input tensor in the format (batch_size, channels, height, width)
"""
warmup_types = self.pt, self.jit, self.onnx, self.engine, self.saved_model, self.pb, self.triton, self.nn_module
if any(warmup_types) and (self.device.type != "cpu" or self.triton):
im = torch.empty(*imgsz, dtype=torch.half if self.fp16 else torch.float, device=self.device) # input
for _ in range(2 if self.jit else 1):
self.forward(im) # warmup
@staticmethod
def _model_type(p: str = "path/to/model.pt") -> list[bool]:
"""
Take a path to a model file and return the model type.
Args:
p (str): Path to the model file.
Returns:
(list[bool]): List of booleans indicating the model type.
Examples:
>>> model = AutoBackend(model="path/to/model.onnx")
>>> model_type = model._model_type() # returns "onnx"
"""
from ultralytics.engine.exporter import export_formats
sf = export_formats()["Suffix"] # export suffixes
if not is_url(p) and not isinstance(p, str):
check_suffix(p, sf) # checks
name = Path(p).name
types = [s in name for s in sf]
types[5] |= name.endswith(".mlmodel") # retain support for older Apple CoreML *.mlmodel formats
types[8] &= not types[9] # tflite &= not edgetpu
if any(types):
triton = False
else:
from urllib.parse import urlsplit
url = urlsplit(p)
triton = bool(url.netloc) and bool(url.path) and url.scheme in {"http", "grpc"}
return types + [triton]
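A minimal end-to-end sketch of the class above, assuming a local yolo11n.pt weights file and a dummy tensor in place of real preprocessing:

import torch
from ultralytics.nn.autobackend import AutoBackend

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
backend = AutoBackend("yolo11n.pt", device=device, fp16=False)  # *.pt selects the PyTorch branch
backend.warmup(imgsz=(1, 3, 640, 640))                          # dummy forward pass (skipped on CPU)
im = torch.zeros(1, 3, 640, 640, device=device)                 # stand-in for a preprocessed image
y = backend(im)                                                 # raw prediction tensor(s)
print(backend.stride, list(backend.names.values())[:3])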


@@ -0,0 +1,182 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
"""
Ultralytics neural network modules.
This module provides access to various neural network components used in Ultralytics models, including convolution
blocks, attention mechanisms, transformer components, and detection/segmentation heads.
Examples:
Visualize a module with Netron
>>> from ultralytics.nn.modules import Conv
>>> import torch
>>> import subprocess
>>> x = torch.ones(1, 128, 40, 40)
>>> m = Conv(128, 128)
>>> f = f"{m._get_name()}.onnx"
>>> torch.onnx.export(m, x, f)
>>> subprocess.run(f"onnxslim {f} {f} && open {f}", shell=True, check=True) # pip install onnxslim
"""
from .block import (
C1,
C2,
C2PSA,
C3,
C3TR,
CIB,
DFL,
ELAN1,
PSA,
SPP,
SPPELAN,
SPPF,
A2C2f,
AConv,
ADown,
Attention,
BNContrastiveHead,
Bottleneck,
BottleneckCSP,
C2f,
C2fAttn,
C2fCIB,
C2fPSA,
C3Ghost,
C3k2,
C3x,
CBFuse,
CBLinear,
ContrastiveHead,
GhostBottleneck,
HGBlock,
HGStem,
ImagePoolingAttn,
MaxSigmoidAttnBlock,
Proto,
RepC3,
RepNCSPELAN4,
RepVGGDW,
ResNetLayer,
SCDown,
TorchVision,
)
from .conv import (
CBAM,
ChannelAttention,
Concat,
Conv,
Conv2,
ConvTranspose,
DWConv,
DWConvTranspose2d,
Focus,
GhostConv,
Index,
LightConv,
RepConv,
SpatialAttention,
)
from .head import (
OBB,
Classify,
Detect,
LRPCHead,
Pose,
RTDETRDecoder,
Segment,
WorldDetect,
YOLOEDetect,
YOLOESegment,
v10Detect,
)
from .transformer import (
AIFI,
MLP,
DeformableTransformerDecoder,
DeformableTransformerDecoderLayer,
LayerNorm2d,
MLPBlock,
MSDeformAttn,
TransformerBlock,
TransformerEncoderLayer,
TransformerLayer,
)
__all__ = (
"Conv",
"Conv2",
"LightConv",
"RepConv",
"DWConv",
"DWConvTranspose2d",
"ConvTranspose",
"Focus",
"GhostConv",
"ChannelAttention",
"SpatialAttention",
"CBAM",
"Concat",
"TransformerLayer",
"TransformerBlock",
"MLPBlock",
"LayerNorm2d",
"DFL",
"HGBlock",
"HGStem",
"SPP",
"SPPF",
"C1",
"C2",
"C3",
"C2f",
"C3k2",
"SCDown",
"C2fPSA",
"C2PSA",
"C2fAttn",
"C3x",
"C3TR",
"C3Ghost",
"GhostBottleneck",
"Bottleneck",
"BottleneckCSP",
"Proto",
"Detect",
"Segment",
"Pose",
"Classify",
"TransformerEncoderLayer",
"RepC3",
"RTDETRDecoder",
"AIFI",
"DeformableTransformerDecoder",
"DeformableTransformerDecoderLayer",
"MSDeformAttn",
"MLP",
"ResNetLayer",
"OBB",
"WorldDetect",
"YOLOEDetect",
"YOLOESegment",
"v10Detect",
"LRPCHead",
"ImagePoolingAttn",
"MaxSigmoidAttnBlock",
"ContrastiveHead",
"BNContrastiveHead",
"RepNCSPELAN4",
"ADown",
"SPPELAN",
"CBFuse",
"CBLinear",
"AConv",
"ELAN1",
"RepVGGDW",
"CIB",
"C2fCIB",
"Attention",
"PSA",
"TorchVision",
"Index",
"A2C2f",
)
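A short sketch composing a few of the exported blocks (channel and spatial sizes are arbitrary; a hedged illustration, not part of the committed file):

import torch
from ultralytics.nn.modules import C2f, SPPF, Concat, Conv

x = torch.randn(1, 64, 80, 80)
backbone = torch.nn.Sequential(Conv(64, 128, k=3, s=2), C2f(128, 128), SPPF(128, 128))
y = backbone(x)             # -> torch.Size([1, 128, 40, 40])
z = Concat(1)([y, y])       # channel-wise concat -> torch.Size([1, 256, 40, 40])
print(y.shape, z.shape)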


@@ -0,0 +1,56 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
"""Activation modules."""
import torch
import torch.nn as nn
class AGLU(nn.Module):
"""
Unified activation function module from AGLU.
This class implements a parameterized activation function with learnable parameters lambda and kappa, based on the
AGLU (Adaptive Gated Linear Unit) approach.
Attributes:
act (nn.Softplus): Softplus activation function with negative beta.
lambd (nn.Parameter): Learnable lambda parameter initialized with uniform distribution.
kappa (nn.Parameter): Learnable kappa parameter initialized with uniform distribution.
Methods:
forward: Compute the forward pass of the Unified activation function.
Examples:
>>> import torch
>>> m = AGLU()
>>> input = torch.randn(2)
>>> output = m(input)
>>> print(output.shape)
torch.Size([2])
References:
https://github.com/kostas1515/AGLU
"""
def __init__(self, device=None, dtype=None) -> None:
"""Initialize the Unified activation function with learnable parameters."""
super().__init__()
self.act = nn.Softplus(beta=-1.0)
self.lambd = nn.Parameter(nn.init.uniform_(torch.empty(1, device=device, dtype=dtype))) # lambda parameter
self.kappa = nn.Parameter(nn.init.uniform_(torch.empty(1, device=device, dtype=dtype))) # kappa parameter
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Apply the Adaptive Gated Linear Unit (AGLU) activation function.
This forward method implements the AGLU activation function with learnable parameters lambda and kappa.
The function applies a transformation that adaptively combines linear and non-linear components.
Args:
x (torch.Tensor): Input tensor to apply the activation function to.
Returns:
(torch.Tensor): Output tensor after applying the AGLU activation function, with the same shape as the input.
"""
lam = torch.clamp(self.lambd, min=0.0001) # Clamp lambda to avoid division by zero
return torch.exp((1 / lam) * self.act((self.kappa * x) - torch.log(lam)))
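A quick sketch of the activation in use, assuming the module lives at ultralytics.nn.modules.activation (outputs depend on the randomly initialized lambda and kappa):

import torch
from ultralytics.nn.modules.activation import AGLU

act = AGLU()
x = torch.linspace(-3, 3, steps=7)
print(act(x))          # element-wise transform, same shape as the input
print(act(x).shape)    # torch.Size([7])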

File diff suppressed because it is too large.


@@ -0,0 +1,714 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
"""Convolution modules."""
from __future__ import annotations
import math
import numpy as np
import torch
import torch.nn as nn
__all__ = (
"Conv",
"Conv2",
"LightConv",
"DWConv",
"DWConvTranspose2d",
"ConvTranspose",
"Focus",
"GhostConv",
"ChannelAttention",
"SpatialAttention",
"CBAM",
"Concat",
"RepConv",
"Index",
)
def autopad(k, p=None, d=1): # kernel, padding, dilation
"""Pad to 'same' shape outputs."""
if d > 1:
k = d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k] # actual kernel-size
if p is None:
p = k // 2 if isinstance(k, int) else [x // 2 for x in k] # auto-pad
return p
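# Quick reference (illustrative comments only): autopad() returns the padding that preserves the
# spatial size at stride 1, e.g.
#   autopad(3)       -> 1
#   autopad(5)       -> 2
#   autopad(3, d=2)  -> 2        (effective kernel size 5 after dilation)
#   autopad((3, 5))  -> [1, 2]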
class Conv(nn.Module):
"""
Standard convolution module with batch normalization and activation.
Attributes:
conv (nn.Conv2d): Convolutional layer.
bn (nn.BatchNorm2d): Batch normalization layer.
act (nn.Module): Activation function layer.
default_act (nn.Module): Default activation function (SiLU).
"""
default_act = nn.SiLU() # default activation
def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True):
"""
Initialize Conv layer with given parameters.
Args:
c1 (int): Number of input channels.
c2 (int): Number of output channels.
k (int): Kernel size.
s (int): Stride.
p (int, optional): Padding.
g (int): Groups.
d (int): Dilation.
act (bool | nn.Module): Activation function.
"""
super().__init__()
self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False)
self.bn = nn.BatchNorm2d(c2)
self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity()
def forward(self, x):
"""
Apply convolution, batch normalization and activation to input tensor.
Args:
x (torch.Tensor): Input tensor.
Returns:
(torch.Tensor): Output tensor.
"""
return self.act(self.bn(self.conv(x)))
def forward_fuse(self, x):
"""
Apply convolution and activation without batch normalization.
Args:
x (torch.Tensor): Input tensor.
Returns:
(torch.Tensor): Output tensor.
"""
return self.act(self.conv(x))
class Conv2(Conv):
"""
Simplified RepConv module with Conv fusing.
Attributes:
conv (nn.Conv2d): Main 3x3 convolutional layer.
cv2 (nn.Conv2d): Additional 1x1 convolutional layer.
bn (nn.BatchNorm2d): Batch normalization layer.
act (nn.Module): Activation function layer.
"""
def __init__(self, c1, c2, k=3, s=1, p=None, g=1, d=1, act=True):
"""
Initialize Conv2 layer with given parameters.
Args:
c1 (int): Number of input channels.
c2 (int): Number of output channels.
k (int): Kernel size.
s (int): Stride.
p (int, optional): Padding.
g (int): Groups.
d (int): Dilation.
act (bool | nn.Module): Activation function.
"""
super().__init__(c1, c2, k, s, p, g=g, d=d, act=act)
self.cv2 = nn.Conv2d(c1, c2, 1, s, autopad(1, p, d), groups=g, dilation=d, bias=False) # add 1x1 conv
def forward(self, x):
"""
Apply convolution, batch normalization and activation to input tensor.
Args:
x (torch.Tensor): Input tensor.
Returns:
(torch.Tensor): Output tensor.
"""
return self.act(self.bn(self.conv(x) + self.cv2(x)))
def forward_fuse(self, x):
"""
Apply fused convolution, batch normalization and activation to input tensor.
Args:
x (torch.Tensor): Input tensor.
Returns:
(torch.Tensor): Output tensor.
"""
return self.act(self.bn(self.conv(x)))
def fuse_convs(self):
"""Fuse parallel convolutions."""
w = torch.zeros_like(self.conv.weight.data)
i = [x // 2 for x in w.shape[2:]]
w[:, :, i[0] : i[0] + 1, i[1] : i[1] + 1] = self.cv2.weight.data.clone()
self.conv.weight.data += w
self.__delattr__("cv2")
self.forward = self.forward_fuse
class LightConv(nn.Module):
"""
Light convolution module with 1x1 and depthwise convolutions.
This implementation is based on the PaddleDetection HGNetV2 backbone.
Attributes:
conv1 (Conv): 1x1 convolution layer.
conv2 (DWConv): Depthwise convolution layer.
"""
def __init__(self, c1, c2, k=1, act=nn.ReLU()):
"""
Initialize LightConv layer with given parameters.
Args:
c1 (int): Number of input channels.
c2 (int): Number of output channels.
k (int): Kernel size for depthwise convolution.
act (nn.Module): Activation function.
"""
super().__init__()
self.conv1 = Conv(c1, c2, 1, act=False)
self.conv2 = DWConv(c2, c2, k, act=act)
def forward(self, x):
"""
Apply 2 convolutions to input tensor.
Args:
x (torch.Tensor): Input tensor.
Returns:
(torch.Tensor): Output tensor.
"""
return self.conv2(self.conv1(x))
class DWConv(Conv):
"""Depth-wise convolution module."""
def __init__(self, c1, c2, k=1, s=1, d=1, act=True):
"""
Initialize depth-wise convolution with given parameters.
Args:
c1 (int): Number of input channels.
c2 (int): Number of output channels.
k (int): Kernel size.
s (int): Stride.
d (int): Dilation.
act (bool | nn.Module): Activation function.
"""
super().__init__(c1, c2, k, s, g=math.gcd(c1, c2), d=d, act=act)
class DWConvTranspose2d(nn.ConvTranspose2d):
"""Depth-wise transpose convolution module."""
def __init__(self, c1, c2, k=1, s=1, p1=0, p2=0):
"""
Initialize depth-wise transpose convolution with given parameters.
Args:
c1 (int): Number of input channels.
c2 (int): Number of output channels.
k (int): Kernel size.
s (int): Stride.
p1 (int): Padding.
p2 (int): Output padding.
"""
super().__init__(c1, c2, k, s, p1, p2, groups=math.gcd(c1, c2))
class ConvTranspose(nn.Module):
"""
Convolution transpose module with optional batch normalization and activation.
Attributes:
conv_transpose (nn.ConvTranspose2d): Transposed convolution layer.
bn (nn.BatchNorm2d | nn.Identity): Batch normalization layer.
act (nn.Module): Activation function layer.
default_act (nn.Module): Default activation function (SiLU).
"""
default_act = nn.SiLU() # default activation
def __init__(self, c1, c2, k=2, s=2, p=0, bn=True, act=True):
"""
Initialize ConvTranspose layer with given parameters.
Args:
c1 (int): Number of input channels.
c2 (int): Number of output channels.
k (int): Kernel size.
s (int): Stride.
p (int): Padding.
bn (bool): Use batch normalization.
act (bool | nn.Module): Activation function.
"""
super().__init__()
self.conv_transpose = nn.ConvTranspose2d(c1, c2, k, s, p, bias=not bn)
self.bn = nn.BatchNorm2d(c2) if bn else nn.Identity()
self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity()
def forward(self, x):
"""
Apply transposed convolution, batch normalization and activation to input.
Args:
x (torch.Tensor): Input tensor.
Returns:
(torch.Tensor): Output tensor.
"""
return self.act(self.bn(self.conv_transpose(x)))
def forward_fuse(self, x):
"""
Apply activation and convolution transpose operation to input.
Args:
x (torch.Tensor): Input tensor.
Returns:
(torch.Tensor): Output tensor.
"""
return self.act(self.conv_transpose(x))
class Focus(nn.Module):
"""
Focus module for concentrating feature information.
Slices input tensor into 4 parts and concatenates them in the channel dimension.
Attributes:
conv (Conv): Convolution layer.
"""
def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):
"""
Initialize Focus module with given parameters.
Args:
c1 (int): Number of input channels.
c2 (int): Number of output channels.
k (int): Kernel size.
s (int): Stride.
p (int, optional): Padding.
g (int): Groups.
act (bool | nn.Module): Activation function.
"""
super().__init__()
self.conv = Conv(c1 * 4, c2, k, s, p, g, act=act)
# self.contract = Contract(gain=2)
def forward(self, x):
"""
Apply Focus operation and convolution to input tensor.
Input shape is (B, C, H, W) and output shape is (B, 4C, H/2, W/2).
Args:
x (torch.Tensor): Input tensor.
Returns:
(torch.Tensor): Output tensor.
"""
return self.conv(torch.cat((x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2]), 1))
# return self.conv(self.contract(x))
class GhostConv(nn.Module):
"""
Ghost Convolution module.
Generates more features with fewer parameters by using cheap operations.
Attributes:
cv1 (Conv): Primary convolution.
cv2 (Conv): Cheap operation convolution.
References:
https://github.com/huawei-noah/Efficient-AI-Backbones
"""
def __init__(self, c1, c2, k=1, s=1, g=1, act=True):
"""
Initialize Ghost Convolution module with given parameters.
Args:
c1 (int): Number of input channels.
c2 (int): Number of output channels.
k (int): Kernel size.
s (int): Stride.
g (int): Groups.
act (bool | nn.Module): Activation function.
"""
super().__init__()
c_ = c2 // 2 # hidden channels
self.cv1 = Conv(c1, c_, k, s, None, g, act=act)
self.cv2 = Conv(c_, c_, 5, 1, None, c_, act=act)
def forward(self, x):
"""
Apply Ghost Convolution to input tensor.
Args:
x (torch.Tensor): Input tensor.
Returns:
(torch.Tensor): Output tensor with concatenated features.
"""
y = self.cv1(x)
return torch.cat((y, self.cv2(y)), 1)
class RepConv(nn.Module):
"""
RepConv module with training and deploy modes.
This module is used in RT-DETR and can fuse convolutions during inference for efficiency.
Attributes:
conv1 (Conv): 3x3 convolution.
conv2 (Conv): 1x1 convolution.
bn (nn.BatchNorm2d, optional): Batch normalization for identity branch.
act (nn.Module): Activation function.
default_act (nn.Module): Default activation function (SiLU).
References:
https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py
"""
default_act = nn.SiLU() # default activation
def __init__(self, c1, c2, k=3, s=1, p=1, g=1, d=1, act=True, bn=False, deploy=False):
"""
Initialize RepConv module with given parameters.
Args:
c1 (int): Number of input channels.
c2 (int): Number of output channels.
k (int): Kernel size.
s (int): Stride.
p (int): Padding.
g (int): Groups.
d (int): Dilation.
act (bool | nn.Module): Activation function.
bn (bool): Use batch normalization for identity branch.
deploy (bool): Deploy mode for inference.
"""
super().__init__()
assert k == 3 and p == 1
self.g = g
self.c1 = c1
self.c2 = c2
self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity()
self.bn = nn.BatchNorm2d(num_features=c1) if bn and c2 == c1 and s == 1 else None
self.conv1 = Conv(c1, c2, k, s, p=p, g=g, act=False)
self.conv2 = Conv(c1, c2, 1, s, p=(p - k // 2), g=g, act=False)
def forward_fuse(self, x):
"""
Forward pass for deploy mode.
Args:
x (torch.Tensor): Input tensor.
Returns:
(torch.Tensor): Output tensor.
"""
return self.act(self.conv(x))
def forward(self, x):
"""
Forward pass for training mode.
Args:
x (torch.Tensor): Input tensor.
Returns:
(torch.Tensor): Output tensor.
"""
id_out = 0 if self.bn is None else self.bn(x)
return self.act(self.conv1(x) + self.conv2(x) + id_out)
def get_equivalent_kernel_bias(self):
"""
Calculate equivalent kernel and bias by fusing convolutions.
Returns:
(torch.Tensor): Equivalent kernel
(torch.Tensor): Equivalent bias
"""
kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1)
kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2)
kernelid, biasid = self._fuse_bn_tensor(self.bn)
return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid
@staticmethod
def _pad_1x1_to_3x3_tensor(kernel1x1):
"""
Pad a 1x1 kernel to 3x3 size.
Args:
kernel1x1 (torch.Tensor): 1x1 convolution kernel.
Returns:
(torch.Tensor): Padded 3x3 kernel.
"""
if kernel1x1 is None:
return 0
else:
return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1])
def _fuse_bn_tensor(self, branch):
"""
Fuse batch normalization with convolution weights.
Args:
branch (Conv | nn.BatchNorm2d | None): Branch to fuse.
Returns:
kernel (torch.Tensor): Fused kernel.
bias (torch.Tensor): Fused bias.
"""
if branch is None:
return 0, 0
if isinstance(branch, Conv):
kernel = branch.conv.weight
running_mean = branch.bn.running_mean
running_var = branch.bn.running_var
gamma = branch.bn.weight
beta = branch.bn.bias
eps = branch.bn.eps
elif isinstance(branch, nn.BatchNorm2d):
if not hasattr(self, "id_tensor"):
input_dim = self.c1 // self.g
kernel_value = np.zeros((self.c1, input_dim, 3, 3), dtype=np.float32)
for i in range(self.c1):
kernel_value[i, i % input_dim, 1, 1] = 1
self.id_tensor = torch.from_numpy(kernel_value).to(branch.weight.device)
kernel = self.id_tensor
running_mean = branch.running_mean
running_var = branch.running_var
gamma = branch.weight
beta = branch.bias
eps = branch.eps
std = (running_var + eps).sqrt()
t = (gamma / std).reshape(-1, 1, 1, 1)
return kernel * t, beta - running_mean * gamma / std
def fuse_convs(self):
"""Fuse convolutions for inference by creating a single equivalent convolution."""
if hasattr(self, "conv"):
return
kernel, bias = self.get_equivalent_kernel_bias()
self.conv = nn.Conv2d(
in_channels=self.conv1.conv.in_channels,
out_channels=self.conv1.conv.out_channels,
kernel_size=self.conv1.conv.kernel_size,
stride=self.conv1.conv.stride,
padding=self.conv1.conv.padding,
dilation=self.conv1.conv.dilation,
groups=self.conv1.conv.groups,
bias=True,
).requires_grad_(False)
self.conv.weight.data = kernel
self.conv.bias.data = bias
for para in self.parameters():
para.detach_()
self.__delattr__("conv1")
self.__delattr__("conv2")
if hasattr(self, "nm"):
self.__delattr__("nm")
if hasattr(self, "bn"):
self.__delattr__("bn")
if hasattr(self, "id_tensor"):
self.__delattr__("id_tensor")
class ChannelAttention(nn.Module):
"""
Channel-attention module for feature recalibration.
Applies attention weights to channels based on global average pooling.
Attributes:
pool (nn.AdaptiveAvgPool2d): Global average pooling.
fc (nn.Conv2d): Fully connected layer implemented as 1x1 convolution.
act (nn.Sigmoid): Sigmoid activation for attention weights.
References:
https://github.com/open-mmlab/mmdetection/tree/v3.0.0rc1/configs/rtmdet
"""
def __init__(self, channels: int) -> None:
"""
Initialize Channel-attention module.
Args:
channels (int): Number of input channels.
"""
super().__init__()
self.pool = nn.AdaptiveAvgPool2d(1)
self.fc = nn.Conv2d(channels, channels, 1, 1, 0, bias=True)
self.act = nn.Sigmoid()
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Apply channel attention to input tensor.
Args:
x (torch.Tensor): Input tensor.
Returns:
(torch.Tensor): Channel-attended output tensor.
"""
return x * self.act(self.fc(self.pool(x)))
class SpatialAttention(nn.Module):
"""
Spatial-attention module for feature recalibration.
Applies attention weights to spatial dimensions based on channel statistics.
Attributes:
cv1 (nn.Conv2d): Convolution layer for spatial attention.
act (nn.Sigmoid): Sigmoid activation for attention weights.
"""
def __init__(self, kernel_size=7):
"""
Initialize Spatial-attention module.
Args:
kernel_size (int): Size of the convolutional kernel (3 or 7).
"""
super().__init__()
assert kernel_size in {3, 7}, "kernel size must be 3 or 7"
padding = 3 if kernel_size == 7 else 1
self.cv1 = nn.Conv2d(2, 1, kernel_size, padding=padding, bias=False)
self.act = nn.Sigmoid()
def forward(self, x):
"""
Apply spatial attention to input tensor.
Args:
x (torch.Tensor): Input tensor.
Returns:
(torch.Tensor): Spatial-attended output tensor.
"""
return x * self.act(self.cv1(torch.cat([torch.mean(x, 1, keepdim=True), torch.max(x, 1, keepdim=True)[0]], 1)))
class CBAM(nn.Module):
"""
Convolutional Block Attention Module.
Combines channel and spatial attention mechanisms for comprehensive feature refinement.
Attributes:
channel_attention (ChannelAttention): Channel attention module.
spatial_attention (SpatialAttention): Spatial attention module.
"""
def __init__(self, c1, kernel_size=7):
"""
Initialize CBAM with given parameters.
Args:
c1 (int): Number of input channels.
kernel_size (int): Size of the convolutional kernel for spatial attention.
"""
super().__init__()
self.channel_attention = ChannelAttention(c1)
self.spatial_attention = SpatialAttention(kernel_size)
def forward(self, x):
"""
Apply channel and spatial attention sequentially to input tensor.
Args:
x (torch.Tensor): Input tensor.
Returns:
(torch.Tensor): Attended output tensor.
"""
return self.spatial_attention(self.channel_attention(x))
class Concat(nn.Module):
"""
Concatenate a list of tensors along specified dimension.
Attributes:
d (int): Dimension along which to concatenate tensors.
"""
def __init__(self, dimension=1):
"""
Initialize Concat module.
Args:
dimension (int): Dimension along which to concatenate tensors.
"""
super().__init__()
self.d = dimension
def forward(self, x: list[torch.Tensor]):
"""
Concatenate input tensors along specified dimension.
Args:
x (list[torch.Tensor]): List of input tensors.
Returns:
(torch.Tensor): Concatenated tensor.
"""
return torch.cat(x, self.d)
class Index(nn.Module):
"""
Returns a particular index of the input.
Attributes:
index (int): Index to select from input.
"""
def __init__(self, index=0):
"""
Initialize Index module.
Args:
index (int): Index to select from input.
"""
super().__init__()
self.index = index
def forward(self, x: list[torch.Tensor]):
"""
Select and return a particular index from input.
Args:
x (list[torch.Tensor]): List of input tensors.
Returns:
(torch.Tensor): Selected tensor.
"""
return x[self.index]
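A brief shape check for the attention blocks defined above, assuming the module path ultralytics.nn.modules.conv (all three preserve the input shape):

import torch
from ultralytics.nn.modules.conv import CBAM, ChannelAttention, SpatialAttention

x = torch.randn(2, 64, 40, 40)
print(ChannelAttention(64)(x).shape)     # torch.Size([2, 64, 40, 40])
print(SpatialAttention(7)(x).shape)      # torch.Size([2, 64, 40, 40])
print(CBAM(64, kernel_size=7)(x).shape)  # torch.Size([2, 64, 40, 40])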

File diff suppressed because it is too large.


@@ -0,0 +1,805 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
"""Transformer modules."""
from __future__ import annotations
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.init import constant_, xavier_uniform_
from ultralytics.utils.torch_utils import TORCH_1_11
from .conv import Conv
from .utils import _get_clones, inverse_sigmoid, multi_scale_deformable_attn_pytorch
__all__ = (
"TransformerEncoderLayer",
"TransformerLayer",
"TransformerBlock",
"MLPBlock",
"LayerNorm2d",
"AIFI",
"DeformableTransformerDecoder",
"DeformableTransformerDecoderLayer",
"MSDeformAttn",
"MLP",
)
class TransformerEncoderLayer(nn.Module):
"""
A single layer of the transformer encoder.
This class implements a standard transformer encoder layer with multi-head attention and feedforward network,
supporting both pre-normalization and post-normalization configurations.
Attributes:
ma (nn.MultiheadAttention): Multi-head attention module.
fc1 (nn.Linear): First linear layer in the feedforward network.
fc2 (nn.Linear): Second linear layer in the feedforward network.
norm1 (nn.LayerNorm): Layer normalization after attention.
norm2 (nn.LayerNorm): Layer normalization after feedforward network.
dropout (nn.Dropout): Dropout layer for the feedforward network.
dropout1 (nn.Dropout): Dropout layer after attention.
dropout2 (nn.Dropout): Dropout layer after feedforward network.
act (nn.Module): Activation function.
normalize_before (bool): Whether to apply normalization before attention and feedforward.
"""
def __init__(
self,
c1: int,
cm: int = 2048,
num_heads: int = 8,
dropout: float = 0.0,
act: nn.Module = nn.GELU(),
normalize_before: bool = False,
):
"""
Initialize the TransformerEncoderLayer with specified parameters.
Args:
c1 (int): Input dimension.
cm (int): Hidden dimension in the feedforward network.
num_heads (int): Number of attention heads.
dropout (float): Dropout probability.
act (nn.Module): Activation function.
normalize_before (bool): Whether to apply normalization before attention and feedforward.
"""
super().__init__()
from ...utils.torch_utils import TORCH_1_9
if not TORCH_1_9:
raise ModuleNotFoundError(
"TransformerEncoderLayer() requires torch>=1.9 to use nn.MultiheadAttention(batch_first=True)."
)
self.ma = nn.MultiheadAttention(c1, num_heads, dropout=dropout, batch_first=True)
# Implementation of Feedforward model
self.fc1 = nn.Linear(c1, cm)
self.fc2 = nn.Linear(cm, c1)
self.norm1 = nn.LayerNorm(c1)
self.norm2 = nn.LayerNorm(c1)
self.dropout = nn.Dropout(dropout)
self.dropout1 = nn.Dropout(dropout)
self.dropout2 = nn.Dropout(dropout)
self.act = act
self.normalize_before = normalize_before
@staticmethod
def with_pos_embed(tensor: torch.Tensor, pos: torch.Tensor | None = None) -> torch.Tensor:
"""Add position embeddings to the tensor if provided."""
return tensor if pos is None else tensor + pos
def forward_post(
self,
src: torch.Tensor,
src_mask: torch.Tensor | None = None,
src_key_padding_mask: torch.Tensor | None = None,
pos: torch.Tensor | None = None,
) -> torch.Tensor:
"""
Perform forward pass with post-normalization.
Args:
src (torch.Tensor): Input tensor.
src_mask (torch.Tensor, optional): Mask for the src sequence.
src_key_padding_mask (torch.Tensor, optional): Mask for the src keys per batch.
pos (torch.Tensor, optional): Positional encoding.
Returns:
(torch.Tensor): Output tensor after attention and feedforward.
"""
q = k = self.with_pos_embed(src, pos)
src2 = self.ma(q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
src = src + self.dropout1(src2)
src = self.norm1(src)
src2 = self.fc2(self.dropout(self.act(self.fc1(src))))
src = src + self.dropout2(src2)
return self.norm2(src)
def forward_pre(
self,
src: torch.Tensor,
src_mask: torch.Tensor | None = None,
src_key_padding_mask: torch.Tensor | None = None,
pos: torch.Tensor | None = None,
) -> torch.Tensor:
"""
Perform forward pass with pre-normalization.
Args:
src (torch.Tensor): Input tensor.
src_mask (torch.Tensor, optional): Mask for the src sequence.
src_key_padding_mask (torch.Tensor, optional): Mask for the src keys per batch.
pos (torch.Tensor, optional): Positional encoding.
Returns:
(torch.Tensor): Output tensor after attention and feedforward.
"""
src2 = self.norm1(src)
q = k = self.with_pos_embed(src2, pos)
src2 = self.ma(q, k, value=src2, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
src = src + self.dropout1(src2)
src2 = self.norm2(src)
src2 = self.fc2(self.dropout(self.act(self.fc1(src2))))
return src + self.dropout2(src2)
def forward(
self,
src: torch.Tensor,
src_mask: torch.Tensor | None = None,
src_key_padding_mask: torch.Tensor | None = None,
pos: torch.Tensor | None = None,
) -> torch.Tensor:
"""
Forward propagate the input through the encoder module.
Args:
src (torch.Tensor): Input tensor.
src_mask (torch.Tensor, optional): Mask for the src sequence.
src_key_padding_mask (torch.Tensor, optional): Mask for the src keys per batch.
pos (torch.Tensor, optional): Positional encoding.
Returns:
(torch.Tensor): Output tensor after transformer encoder layer.
"""
if self.normalize_before:
return self.forward_pre(src, src_mask, src_key_padding_mask, pos)
return self.forward_post(src, src_mask, src_key_padding_mask, pos)
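# Minimal usage sketch (illustrative; the layer width, head count, and tensor sizes below are
# arbitrary assumptions, not values required by the class).
def _example_transformer_encoder_layer():
    """Run one encoder layer over a batch-first token sequence; the shape is preserved."""
    layer = TransformerEncoderLayer(c1=256, cm=1024, num_heads=8)
    src = torch.randn(2, 100, 256)  # [batch, tokens, channels]
    out = layer(src)  # optional src_mask/src_key_padding_mask/pos arguments omitted
    assert out.shape == src.shape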
class AIFI(TransformerEncoderLayer):
"""
AIFI transformer layer for 2D data with positional embeddings.
This class extends TransformerEncoderLayer to work with 2D feature maps by adding 2D sine-cosine positional
embeddings and handling the spatial dimensions appropriately.
"""
def __init__(
self,
c1: int,
cm: int = 2048,
num_heads: int = 8,
dropout: float = 0,
act: nn.Module = nn.GELU(),
normalize_before: bool = False,
):
"""
Initialize the AIFI instance with specified parameters.
Args:
c1 (int): Input dimension.
cm (int): Hidden dimension in the feedforward network.
num_heads (int): Number of attention heads.
dropout (float): Dropout probability.
act (nn.Module): Activation function.
normalize_before (bool): Whether to apply normalization before attention and feedforward.
"""
super().__init__(c1, cm, num_heads, dropout, act, normalize_before)
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Forward pass for the AIFI transformer layer.
Args:
x (torch.Tensor): Input tensor with shape [B, C, H, W].
Returns:
(torch.Tensor): Output tensor with shape [B, C, H, W].
"""
c, h, w = x.shape[1:]
pos_embed = self.build_2d_sincos_position_embedding(w, h, c)
# Flatten [B, C, H, W] to [B, HxW, C]
x = super().forward(x.flatten(2).permute(0, 2, 1), pos=pos_embed.to(device=x.device, dtype=x.dtype))
return x.permute(0, 2, 1).view([-1, c, h, w]).contiguous()
@staticmethod
def build_2d_sincos_position_embedding(
w: int, h: int, embed_dim: int = 256, temperature: float = 10000.0
) -> torch.Tensor:
"""
Build 2D sine-cosine position embedding.
Args:
w (int): Width of the feature map.
h (int): Height of the feature map.
embed_dim (int): Embedding dimension.
temperature (float): Temperature for the sine/cosine functions.
Returns:
(torch.Tensor): Position embedding with shape [1, h*w, embed_dim].
"""
assert embed_dim % 4 == 0, "Embed dimension must be divisible by 4 for 2D sin-cos position embedding"
grid_w = torch.arange(w, dtype=torch.float32)
grid_h = torch.arange(h, dtype=torch.float32)
grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij") if TORCH_1_11 else torch.meshgrid(grid_w, grid_h)
pos_dim = embed_dim // 4
omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
omega = 1.0 / (temperature**omega)
out_w = grid_w.flatten()[..., None] @ omega[None]
out_h = grid_h.flatten()[..., None] @ omega[None]
return torch.cat([torch.sin(out_w), torch.cos(out_w), torch.sin(out_h), torch.cos(out_h)], 1)[None]
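# Minimal usage sketch (illustrative; channel count and feature-map size are arbitrary assumptions).
def _example_aifi():
    """Apply AIFI to a [B, C, H, W] feature map; the spatial layout is restored on output."""
    layer = AIFI(c1=256, num_heads=8)
    x = torch.randn(1, 256, 20, 20)  # dummy feature map [B, C, H, W]
    out = layer(x)  # internally flattened to [B, H*W, C] with 2D sin-cos position embeddings
    assert out.shape == (1, 256, 20, 20)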
class TransformerLayer(nn.Module):
"""Transformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance)."""
def __init__(self, c: int, num_heads: int):
"""
Initialize a self-attention mechanism using linear transformations and multi-head attention.
Args:
c (int): Input and output channel dimension.
num_heads (int): Number of attention heads.
"""
super().__init__()
self.q = nn.Linear(c, c, bias=False)
self.k = nn.Linear(c, c, bias=False)
self.v = nn.Linear(c, c, bias=False)
self.ma = nn.MultiheadAttention(embed_dim=c, num_heads=num_heads)
self.fc1 = nn.Linear(c, c, bias=False)
self.fc2 = nn.Linear(c, c, bias=False)
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Apply a transformer block to the input x and return the output.
Args:
x (torch.Tensor): Input tensor.
Returns:
(torch.Tensor): Output tensor after transformer layer.
"""
x = self.ma(self.q(x), self.k(x), self.v(x))[0] + x
return self.fc2(self.fc1(x)) + x
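# Minimal usage sketch (illustrative sizes). Note the sequence-first layout expected by
# nn.MultiheadAttention with its default batch_first=False.
def _example_transformer_layer():
    """Run a single LayerNorm-free transformer layer on a [tokens, batch, channels] tensor."""
    layer = TransformerLayer(c=256, num_heads=8)
    x = torch.randn(100, 2, 256)
    assert layer(x).shape == x.shape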
class TransformerBlock(nn.Module):
"""
Vision Transformer block based on https://arxiv.org/abs/2010.11929.
This class implements a complete transformer block with optional convolution layer for channel adjustment,
learnable position embedding, and multiple transformer layers.
Attributes:
conv (Conv, optional): Convolution layer if input and output channels differ.
linear (nn.Linear): Learnable position embedding.
tr (nn.Sequential): Sequential container of transformer layers.
c2 (int): Output channel dimension.
"""
def __init__(self, c1: int, c2: int, num_heads: int, num_layers: int):
"""
Initialize a Transformer module with position embedding and specified number of heads and layers.
Args:
c1 (int): Input channel dimension.
c2 (int): Output channel dimension.
num_heads (int): Number of attention heads.
num_layers (int): Number of transformer layers.
"""
super().__init__()
self.conv = None
if c1 != c2:
self.conv = Conv(c1, c2)
self.linear = nn.Linear(c2, c2) # learnable position embedding
self.tr = nn.Sequential(*(TransformerLayer(c2, num_heads) for _ in range(num_layers)))
self.c2 = c2
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Forward propagate the input through the transformer block.
Args:
x (torch.Tensor): Input tensor with shape [b, c1, w, h].
Returns:
(torch.Tensor): Output tensor with shape [b, c2, w, h].
"""
if self.conv is not None:
x = self.conv(x)
b, _, w, h = x.shape
p = x.flatten(2).permute(2, 0, 1)
return self.tr(p + self.linear(p)).permute(1, 2, 0).reshape(b, self.c2, w, h)
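# Minimal usage sketch (illustrative sizes): when c1 != c2, a 1x1 Conv adjusts the channels first.
def _example_transformer_block():
    """Map a [B, c1, H, W] feature map to [B, c2, H, W] through stacked transformer layers."""
    block = TransformerBlock(c1=128, c2=256, num_heads=8, num_layers=2)
    x = torch.randn(1, 128, 16, 16)
    assert block(x).shape == (1, 256, 16, 16)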
class MLPBlock(nn.Module):
"""A single block of a multi-layer perceptron."""
def __init__(self, embedding_dim: int, mlp_dim: int, act=nn.GELU):
"""
Initialize the MLPBlock with specified embedding dimension, MLP dimension, and activation function.
Args:
embedding_dim (int): Input and output dimension.
mlp_dim (int): Hidden dimension.
act (nn.Module): Activation function.
"""
super().__init__()
self.lin1 = nn.Linear(embedding_dim, mlp_dim)
self.lin2 = nn.Linear(mlp_dim, embedding_dim)
self.act = act()
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Forward pass for the MLPBlock.
Args:
x (torch.Tensor): Input tensor.
Returns:
(torch.Tensor): Output tensor after MLP block.
"""
return self.lin2(self.act(self.lin1(x)))
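# Minimal usage sketch (illustrative sizes).
def _example_mlp_block():
    """Expand to the hidden MLP dimension and project back; the embedding dimension is unchanged."""
    block = MLPBlock(embedding_dim=256, mlp_dim=1024)
    x = torch.randn(4, 256)
    assert block(x).shape == (4, 256)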
class MLP(nn.Module):
"""
A simple multi-layer perceptron (also called FFN).
This class implements a configurable MLP with multiple linear layers, activation functions, and optional
sigmoid output activation.
Attributes:
num_layers (int): Number of layers in the MLP.
layers (nn.ModuleList): List of linear layers.
sigmoid (bool): Whether to apply sigmoid to the output.
act (nn.Module): Activation function.
"""
def __init__(
self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int, act=nn.ReLU, sigmoid: bool = False
):
"""
Initialize the MLP with specified input, hidden, output dimensions and number of layers.
Args:
input_dim (int): Input dimension.
hidden_dim (int): Hidden dimension.
output_dim (int): Output dimension.
num_layers (int): Number of layers.
act (nn.Module): Activation function.
sigmoid (bool): Whether to apply sigmoid to the output.
"""
super().__init__()
self.num_layers = num_layers
h = [hidden_dim] * (num_layers - 1)
self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
self.sigmoid = sigmoid
self.act = act()
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Forward pass for the entire MLP.
Args:
x (torch.Tensor): Input tensor.
Returns:
(torch.Tensor): Output tensor after MLP.
"""
for i, layer in enumerate(self.layers):
x = getattr(self, "act", nn.ReLU())(layer(x)) if i < self.num_layers - 1 else layer(x)
return x.sigmoid() if getattr(self, "sigmoid", False) else x
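# Minimal usage sketch (illustrative sizes): a 3-layer head such as a box-regression FFN.
def _example_mlp():
    """Map 256-d query features to 4 outputs per query."""
    head = MLP(input_dim=256, hidden_dim=256, output_dim=4, num_layers=3)
    x = torch.randn(2, 300, 256)  # [batch, queries, channels]
    assert head(x).shape == (2, 300, 4)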
class LayerNorm2d(nn.Module):
"""
2D Layer Normalization module inspired by Detectron2 and ConvNeXt implementations.
This class implements layer normalization for 2D feature maps, normalizing across the channel dimension
while preserving spatial dimensions.
Attributes:
weight (nn.Parameter): Learnable scale parameter.
bias (nn.Parameter): Learnable bias parameter.
eps (float): Small constant for numerical stability.
References:
https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py
https://github.com/facebookresearch/ConvNeXt/blob/main/models/convnext.py
"""
def __init__(self, num_channels: int, eps: float = 1e-6):
"""
Initialize LayerNorm2d with the given parameters.
Args:
num_channels (int): Number of channels in the input.
eps (float): Small constant for numerical stability.
"""
super().__init__()
self.weight = nn.Parameter(torch.ones(num_channels))
self.bias = nn.Parameter(torch.zeros(num_channels))
self.eps = eps
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Perform forward pass for 2D layer normalization.
Args:
x (torch.Tensor): Input tensor.
Returns:
(torch.Tensor): Normalized output tensor.
"""
u = x.mean(1, keepdim=True)
s = (x - u).pow(2).mean(1, keepdim=True)
x = (x - u) / torch.sqrt(s + self.eps)
return self.weight[:, None, None] * x + self.bias[:, None, None]
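# Minimal usage sketch (illustrative sizes): normalization is over channels, per spatial location.
def _example_layer_norm_2d():
    """Normalize an NCHW tensor across its channel dimension."""
    norm = LayerNorm2d(num_channels=64)
    x = torch.randn(1, 64, 32, 32)
    assert norm(x).shape == x.shape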
class MSDeformAttn(nn.Module):
"""
Multiscale Deformable Attention Module based on Deformable-DETR and PaddleDetection implementations.
This module implements multiscale deformable attention that can attend to features at multiple scales
with learnable sampling locations and attention weights.
Attributes:
im2col_step (int): Step size for im2col operations.
d_model (int): Model dimension.
n_levels (int): Number of feature levels.
n_heads (int): Number of attention heads.
n_points (int): Number of sampling points per attention head per feature level.
sampling_offsets (nn.Linear): Linear layer for generating sampling offsets.
attention_weights (nn.Linear): Linear layer for generating attention weights.
value_proj (nn.Linear): Linear layer for projecting values.
output_proj (nn.Linear): Linear layer for projecting output.
References:
https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/modules/ms_deform_attn.py
"""
def __init__(self, d_model: int = 256, n_levels: int = 4, n_heads: int = 8, n_points: int = 4):
"""
Initialize MSDeformAttn with the given parameters.
Args:
d_model (int): Model dimension.
n_levels (int): Number of feature levels.
n_heads (int): Number of attention heads.
n_points (int): Number of sampling points per attention head per feature level.
"""
super().__init__()
if d_model % n_heads != 0:
raise ValueError(f"d_model must be divisible by n_heads, but got {d_model} and {n_heads}")
_d_per_head = d_model // n_heads
# Better to set _d_per_head to a power of 2 which is more efficient in a CUDA implementation
assert _d_per_head * n_heads == d_model, "`d_model` must be divisible by `n_heads`"
self.im2col_step = 64
self.d_model = d_model
self.n_levels = n_levels
self.n_heads = n_heads
self.n_points = n_points
self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
self.value_proj = nn.Linear(d_model, d_model)
self.output_proj = nn.Linear(d_model, d_model)
self._reset_parameters()
def _reset_parameters(self):
"""Reset module parameters."""
constant_(self.sampling_offsets.weight.data, 0.0)
thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
grid_init = (
(grid_init / grid_init.abs().max(-1, keepdim=True)[0])
.view(self.n_heads, 1, 1, 2)
.repeat(1, self.n_levels, self.n_points, 1)
)
for i in range(self.n_points):
grid_init[:, :, i, :] *= i + 1
with torch.no_grad():
self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
constant_(self.attention_weights.weight.data, 0.0)
constant_(self.attention_weights.bias.data, 0.0)
xavier_uniform_(self.value_proj.weight.data)
constant_(self.value_proj.bias.data, 0.0)
xavier_uniform_(self.output_proj.weight.data)
constant_(self.output_proj.bias.data, 0.0)
def forward(
self,
query: torch.Tensor,
refer_bbox: torch.Tensor,
value: torch.Tensor,
value_shapes: list,
value_mask: torch.Tensor | None = None,
) -> torch.Tensor:
"""
Perform forward pass for multiscale deformable attention.
Args:
query (torch.Tensor): Query tensor with shape [bs, query_length, C].
refer_bbox (torch.Tensor): Reference bounding boxes with shape [bs, query_length, n_levels, 2],
range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area.
value (torch.Tensor): Value tensor with shape [bs, value_length, C].
value_shapes (list): List with shape [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})].
value_mask (torch.Tensor, optional): Mask tensor with shape [bs, value_length], True for non-padding
elements, False for padding elements.
Returns:
(torch.Tensor): Output tensor with shape [bs, Length_{query}, C].
References:
https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
"""
bs, len_q = query.shape[:2]
len_v = value.shape[1]
assert sum(s[0] * s[1] for s in value_shapes) == len_v
value = self.value_proj(value)
if value_mask is not None:
value = value.masked_fill(value_mask[..., None], float(0))
value = value.view(bs, len_v, self.n_heads, self.d_model // self.n_heads)
sampling_offsets = self.sampling_offsets(query).view(bs, len_q, self.n_heads, self.n_levels, self.n_points, 2)
attention_weights = self.attention_weights(query).view(bs, len_q, self.n_heads, self.n_levels * self.n_points)
attention_weights = F.softmax(attention_weights, -1).view(bs, len_q, self.n_heads, self.n_levels, self.n_points)
# N, Len_q, n_heads, n_levels, n_points, 2
num_points = refer_bbox.shape[-1]
if num_points == 2:
offset_normalizer = torch.as_tensor(value_shapes, dtype=query.dtype, device=query.device).flip(-1)
add = sampling_offsets / offset_normalizer[None, None, None, :, None, :]
sampling_locations = refer_bbox[:, :, None, :, None, :] + add
elif num_points == 4:
add = sampling_offsets / self.n_points * refer_bbox[:, :, None, :, None, 2:] * 0.5
sampling_locations = refer_bbox[:, :, None, :, None, :2] + add
else:
raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {num_points}.")
output = multi_scale_deformable_attn_pytorch(value, value_shapes, sampling_locations, attention_weights)
return self.output_proj(output)
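# Minimal usage sketch (illustrative; level shapes, query count, and channel width are arbitrary
# assumptions, with n_levels set to match the number of feature levels supplied).
def _example_ms_deform_attn():
    """Attend from 300 queries to two flattened feature levels."""
    attn = MSDeformAttn(d_model=256, n_levels=2, n_heads=8, n_points=4)
    shapes = [(32, 32), (16, 16)]  # (H, W) per level
    value = torch.randn(1, 32 * 32 + 16 * 16, 256)  # flattened multi-scale features
    query = torch.randn(1, 300, 256)
    refer_bbox = torch.rand(1, 300, 2, 2)  # normalized (x, y) reference points per level
    out = attn(query, refer_bbox, value, shapes)
    assert out.shape == (1, 300, 256)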
class DeformableTransformerDecoderLayer(nn.Module):
"""
Deformable Transformer Decoder Layer inspired by PaddleDetection and Deformable-DETR implementations.
This class implements a single decoder layer with self-attention, cross-attention using multiscale deformable
attention, and a feedforward network.
Attributes:
self_attn (nn.MultiheadAttention): Self-attention module.
dropout1 (nn.Dropout): Dropout after self-attention.
norm1 (nn.LayerNorm): Layer normalization after self-attention.
cross_attn (MSDeformAttn): Cross-attention module.
dropout2 (nn.Dropout): Dropout after cross-attention.
norm2 (nn.LayerNorm): Layer normalization after cross-attention.
linear1 (nn.Linear): First linear layer in the feedforward network.
act (nn.Module): Activation function.
dropout3 (nn.Dropout): Dropout in the feedforward network.
linear2 (nn.Linear): Second linear layer in the feedforward network.
dropout4 (nn.Dropout): Dropout after the feedforward network.
norm3 (nn.LayerNorm): Layer normalization after the feedforward network.
References:
https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/deformable_transformer.py
"""
def __init__(
self,
d_model: int = 256,
n_heads: int = 8,
d_ffn: int = 1024,
dropout: float = 0.0,
act: nn.Module = nn.ReLU(),
n_levels: int = 4,
n_points: int = 4,
):
"""
Initialize the DeformableTransformerDecoderLayer with the given parameters.
Args:
d_model (int): Model dimension.
n_heads (int): Number of attention heads.
d_ffn (int): Dimension of the feedforward network.
dropout (float): Dropout probability.
act (nn.Module): Activation function.
n_levels (int): Number of feature levels.
n_points (int): Number of sampling points.
"""
super().__init__()
# Self attention
self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
self.dropout1 = nn.Dropout(dropout)
self.norm1 = nn.LayerNorm(d_model)
# Cross attention
self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
self.dropout2 = nn.Dropout(dropout)
self.norm2 = nn.LayerNorm(d_model)
# FFN
self.linear1 = nn.Linear(d_model, d_ffn)
self.act = act
self.dropout3 = nn.Dropout(dropout)
self.linear2 = nn.Linear(d_ffn, d_model)
self.dropout4 = nn.Dropout(dropout)
self.norm3 = nn.LayerNorm(d_model)
@staticmethod
def with_pos_embed(tensor: torch.Tensor, pos: torch.Tensor | None) -> torch.Tensor:
"""Add positional embeddings to the input tensor, if provided."""
return tensor if pos is None else tensor + pos
def forward_ffn(self, tgt: torch.Tensor) -> torch.Tensor:
"""
Perform forward pass through the Feed-Forward Network part of the layer.
Args:
tgt (torch.Tensor): Input tensor.
Returns:
(torch.Tensor): Output tensor after FFN.
"""
tgt2 = self.linear2(self.dropout3(self.act(self.linear1(tgt))))
tgt = tgt + self.dropout4(tgt2)
return self.norm3(tgt)
def forward(
self,
embed: torch.Tensor,
refer_bbox: torch.Tensor,
feats: torch.Tensor,
shapes: list,
padding_mask: torch.Tensor | None = None,
attn_mask: torch.Tensor | None = None,
query_pos: torch.Tensor | None = None,
) -> torch.Tensor:
"""
Perform the forward pass through the entire decoder layer.
Args:
embed (torch.Tensor): Input embeddings.
refer_bbox (torch.Tensor): Reference bounding boxes.
feats (torch.Tensor): Feature maps.
shapes (list): Feature shapes.
padding_mask (torch.Tensor, optional): Padding mask.
attn_mask (torch.Tensor, optional): Attention mask.
query_pos (torch.Tensor, optional): Query position embeddings.
Returns:
(torch.Tensor): Output tensor after decoder layer.
"""
# Self attention
q = k = self.with_pos_embed(embed, query_pos)
tgt = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), embed.transpose(0, 1), attn_mask=attn_mask)[
0
].transpose(0, 1)
embed = embed + self.dropout1(tgt)
embed = self.norm1(embed)
# Cross attention
tgt = self.cross_attn(
self.with_pos_embed(embed, query_pos), refer_bbox.unsqueeze(2), feats, shapes, padding_mask
)
embed = embed + self.dropout2(tgt)
embed = self.norm2(embed)
# FFN
return self.forward_ffn(embed)
class DeformableTransformerDecoder(nn.Module):
"""
Deformable Transformer Decoder based on PaddleDetection implementation.
This class implements a complete deformable transformer decoder with multiple decoder layers and prediction
heads for bounding box regression and classification.
Attributes:
layers (nn.ModuleList): List of decoder layers.
num_layers (int): Number of decoder layers.
hidden_dim (int): Hidden dimension.
eval_idx (int): Index of the layer to use during evaluation.
References:
https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
"""
def __init__(self, hidden_dim: int, decoder_layer: nn.Module, num_layers: int, eval_idx: int = -1):
"""
Initialize the DeformableTransformerDecoder with the given parameters.
Args:
hidden_dim (int): Hidden dimension.
decoder_layer (nn.Module): Decoder layer module.
num_layers (int): Number of decoder layers.
eval_idx (int): Index of the layer to use during evaluation.
"""
super().__init__()
self.layers = _get_clones(decoder_layer, num_layers)
self.num_layers = num_layers
self.hidden_dim = hidden_dim
self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx
def forward(
self,
embed: torch.Tensor, # decoder embeddings
refer_bbox: torch.Tensor, # anchor
feats: torch.Tensor, # image features
shapes: list, # feature shapes
bbox_head: nn.Module,
score_head: nn.Module,
pos_mlp: nn.Module,
attn_mask: torch.Tensor | None = None,
padding_mask: torch.Tensor | None = None,
):
"""
Perform the forward pass through the entire decoder.
Args:
embed (torch.Tensor): Decoder embeddings.
refer_bbox (torch.Tensor): Reference bounding boxes.
feats (torch.Tensor): Image features.
shapes (list): Feature shapes.
bbox_head (nn.Module): Bounding box prediction head.
score_head (nn.Module): Score prediction head.
pos_mlp (nn.Module): Position MLP.
attn_mask (torch.Tensor, optional): Attention mask.
padding_mask (torch.Tensor, optional): Padding mask.
Returns:
dec_bboxes (torch.Tensor): Decoded bounding boxes.
dec_cls (torch.Tensor): Decoded classification scores.
"""
output = embed
dec_bboxes = []
dec_cls = []
last_refined_bbox = None
refer_bbox = refer_bbox.sigmoid()
for i, layer in enumerate(self.layers):
output = layer(output, refer_bbox, feats, shapes, padding_mask, attn_mask, pos_mlp(refer_bbox))
bbox = bbox_head[i](output)
refined_bbox = torch.sigmoid(bbox + inverse_sigmoid(refer_bbox))
if self.training:
dec_cls.append(score_head[i](output))
if i == 0:
dec_bboxes.append(refined_bbox)
else:
dec_bboxes.append(torch.sigmoid(bbox + inverse_sigmoid(last_refined_bbox)))
elif i == self.eval_idx:
dec_cls.append(score_head[i](output))
dec_bboxes.append(refined_bbox)
break
last_refined_bbox = refined_bbox
refer_bbox = refined_bbox.detach() if self.training else refined_bbox
return torch.stack(dec_bboxes), torch.stack(dec_cls)
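# Minimal usage sketch (illustrative; the heads below are stand-ins built from modules in this file,
# and all sizes are arbitrary assumptions rather than RT-DETR defaults).
def _example_deformable_decoder():
    """Run a two-layer deformable decoder over two feature levels."""
    hd, nq, nc, num_layers = 256, 300, 80, 2
    shapes = [(32, 32), (16, 16)]
    layer = DeformableTransformerDecoderLayer(d_model=hd, n_heads=8, n_levels=len(shapes))
    decoder = DeformableTransformerDecoder(hidden_dim=hd, decoder_layer=layer, num_layers=num_layers)
    embed = torch.randn(1, nq, hd)  # decoder query embeddings
    refer_bbox = torch.randn(1, nq, 4)  # anchor logits; sigmoid is applied inside forward()
    feats = torch.randn(1, sum(h * w for h, w in shapes), hd)  # flattened image features
    bbox_head = nn.ModuleList(MLP(hd, hd, 4, 3) for _ in range(num_layers))
    score_head = nn.ModuleList(nn.Linear(hd, nc) for _ in range(num_layers))
    pos_mlp = MLP(4, 2 * hd, hd, 2)  # maps reference boxes to query position embeddings
    dec_bboxes, dec_cls = decoder(embed, refer_bbox, feats, shapes, bbox_head, score_head, pos_mlp)
    # In training mode both outputs stack per-layer predictions: [num_layers, 1, nq, 4] and [num_layers, 1, nq, nc]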

View File

@@ -0,0 +1,164 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
import copy
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.init import uniform_
__all__ = "multi_scale_deformable_attn_pytorch", "inverse_sigmoid"
def _get_clones(module, n):
"""
Create a list of cloned modules from the given module.
Args:
module (nn.Module): The module to be cloned.
n (int): Number of clones to create.
Returns:
(nn.ModuleList): A ModuleList containing n clones of the input module.
Examples:
>>> import torch.nn as nn
>>> layer = nn.Linear(10, 10)
>>> clones = _get_clones(layer, 3)
>>> len(clones)
3
"""
return nn.ModuleList([copy.deepcopy(module) for _ in range(n)])
def bias_init_with_prob(prior_prob=0.01):
"""
Initialize conv/fc bias value according to a given probability value.
This function calculates the bias initialization value based on a prior probability using the inverse sigmoid (log-odds) transform.
It's commonly used in object detection models to initialize classification layers with a specific positive prediction
probability.
Args:
prior_prob (float, optional): Prior probability for bias initialization.
Returns:
(float): Bias initialization value calculated from the prior probability.
Examples:
>>> bias = bias_init_with_prob(0.01)
>>> print(f"Bias initialization value: {bias:.4f}")
Bias initialization value: -4.5951
"""
return float(-np.log((1 - prior_prob) / prior_prob)) # return bias_init
def linear_init(module):
"""
Initialize the weights and biases of a linear module.
This function initializes the weights of a linear module using a uniform distribution within bounds calculated
from the input dimension. If the module has a bias, it is also initialized.
Args:
module (nn.Module): Linear module to initialize.
Returns:
(nn.Module): The initialized module.
Examples:
>>> import torch.nn as nn
>>> linear = nn.Linear(10, 5)
>>> initialized_linear = linear_init(linear)
"""
bound = 1 / math.sqrt(module.weight.shape[0])
uniform_(module.weight, -bound, bound)
if hasattr(module, "bias") and module.bias is not None:
uniform_(module.bias, -bound, bound)
def inverse_sigmoid(x, eps=1e-5):
"""
Calculate the inverse sigmoid function for a tensor.
This function applies the inverse of the sigmoid function to a tensor, which is useful in various neural network
operations, particularly in attention mechanisms and coordinate transformations.
Args:
x (torch.Tensor): Input tensor with values in range [0, 1].
eps (float, optional): Small epsilon value to prevent numerical instability.
Returns:
(torch.Tensor): Tensor after applying the inverse sigmoid function.
Examples:
>>> x = torch.tensor([0.2, 0.5, 0.8])
>>> inverse_sigmoid(x)
tensor([-1.3863, 0.0000, 1.3863])
"""
x = x.clamp(min=0, max=1)
x1 = x.clamp(min=eps)
x2 = (1 - x).clamp(min=eps)
return torch.log(x1 / x2)
def multi_scale_deformable_attn_pytorch(
value: torch.Tensor,
value_spatial_shapes: torch.Tensor,
sampling_locations: torch.Tensor,
attention_weights: torch.Tensor,
) -> torch.Tensor:
"""
Implement multi-scale deformable attention in PyTorch.
This function performs deformable attention across multiple feature map scales, allowing the model to attend to
different spatial locations with learned offsets.
Args:
value (torch.Tensor): The value tensor with shape (bs, num_keys, num_heads, embed_dims).
value_spatial_shapes (torch.Tensor): Spatial shapes of the value tensor with shape (num_levels, 2).
sampling_locations (torch.Tensor): The sampling locations with shape
(bs, num_queries, num_heads, num_levels, num_points, 2).
attention_weights (torch.Tensor): The attention weights with shape
(bs, num_queries, num_heads, num_levels, num_points).
Returns:
(torch.Tensor): The output tensor with shape (bs, num_queries, embed_dims).
References:
https://github.com/IDEA-Research/detrex/blob/main/detrex/layers/multi_scale_deform_attn.py
"""
bs, _, num_heads, embed_dims = value.shape
_, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
sampling_grids = 2 * sampling_locations - 1
sampling_value_list = []
for level, (H_, W_) in enumerate(value_spatial_shapes):
# bs, H_*W_, num_heads, embed_dims ->
# bs, H_*W_, num_heads*embed_dims ->
# bs, num_heads*embed_dims, H_*W_ ->
# bs*num_heads, embed_dims, H_, W_
value_l_ = value_list[level].flatten(2).transpose(1, 2).reshape(bs * num_heads, embed_dims, H_, W_)
# bs, num_queries, num_heads, num_points, 2 ->
# bs, num_heads, num_queries, num_points, 2 ->
# bs*num_heads, num_queries, num_points, 2
sampling_grid_l_ = sampling_grids[:, :, :, level].transpose(1, 2).flatten(0, 1)
# bs*num_heads, embed_dims, num_queries, num_points
sampling_value_l_ = F.grid_sample(
value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False
)
sampling_value_list.append(sampling_value_l_)
# (bs, num_queries, num_heads, num_levels, num_points) ->
# (bs, num_heads, num_queries, num_levels, num_points) ->
# (bs, num_heads, 1, num_queries, num_levels*num_points)
attention_weights = attention_weights.transpose(1, 2).reshape(
bs * num_heads, 1, num_queries, num_levels * num_points
)
output = (
(torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights)
.sum(-1)
.view(bs, num_heads * embed_dims, num_queries)
)
return output.transpose(1, 2).contiguous()
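# Minimal usage sketch (illustrative sizes): a standalone call with two feature levels; the attention
# weights are normalized jointly over levels and points, mirroring the softmax used by the attention module.
def _example_multi_scale_deformable_attn():
    """Sample two flattened feature levels at random locations and blend them with given weights."""
    bs, num_heads, head_dims, num_queries, num_points = 1, 8, 32, 100, 4
    shapes = [(32, 32), (16, 16)]
    value = torch.randn(bs, sum(h * w for h, w in shapes), num_heads, head_dims)
    locations = torch.rand(bs, num_queries, num_heads, len(shapes), num_points, 2)  # in [0, 1]
    weights = torch.rand(bs, num_queries, num_heads, len(shapes), num_points)
    weights = weights / weights.sum(dim=(-2, -1), keepdim=True)
    out = multi_scale_deformable_attn_pytorch(value, shapes, locations, weights)
    assert out.shape == (bs, num_queries, num_heads * head_dims)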

1812
ultralytics/nn/tasks.py Normal file

File diff suppressed because it is too large

View File

@@ -0,0 +1,383 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
from __future__ import annotations
from abc import abstractmethod
from pathlib import Path
import torch
import torch.nn as nn
from PIL import Image
from ultralytics.utils import checks
from ultralytics.utils.torch_utils import smart_inference_mode
try:
import clip
except ImportError:
checks.check_requirements("git+https://github.com/ultralytics/CLIP.git")
import clip
class TextModel(nn.Module):
"""
Abstract base class for text encoding models.
This class defines the interface for text encoding models used in vision-language tasks. Subclasses must implement
the tokenize and encode_text methods to provide text tokenization and encoding functionality.
Methods:
tokenize: Convert input texts to tokens for model processing.
encode_text: Encode tokenized texts into normalized feature vectors.
"""
def __init__(self):
"""Initialize the TextModel base class."""
super().__init__()
@abstractmethod
def tokenize(self, texts):
"""Convert input texts to tokens for model processing."""
pass
@abstractmethod
def encode_text(self, texts, dtype):
"""Encode tokenized texts into normalized feature vectors."""
pass
class CLIP(TextModel):
"""
Implements OpenAI's CLIP (Contrastive Language-Image Pre-training) text encoder.
This class provides a text encoder based on OpenAI's CLIP model, which can convert text into feature vectors
that are aligned with corresponding image features in a shared embedding space.
Attributes:
model (clip.model.CLIP): The loaded CLIP model.
device (torch.device): Device where the model is loaded.
Methods:
tokenize: Convert input texts to CLIP tokens.
encode_text: Encode tokenized texts into normalized feature vectors.
Examples:
>>> import torch
>>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
>>> clip_model = CLIP(size="ViT-B/32", device=device)
>>> tokens = clip_model.tokenize(["a photo of a cat", "a photo of a dog"])
>>> text_features = clip_model.encode_text(tokens)
>>> print(text_features.shape)
"""
def __init__(self, size: str, device: torch.device) -> None:
"""
Initialize the CLIP text encoder.
This class implements the TextModel interface using OpenAI's CLIP model for text encoding. It loads
a pre-trained CLIP model of the specified size and prepares it for text encoding tasks.
Args:
size (str): Model size identifier (e.g., 'ViT-B/32').
device (torch.device): Device to load the model on.
Examples:
>>> import torch
>>> clip_model = CLIP("ViT-B/32", device=torch.device("cuda:0"))
>>> tokens = clip_model.tokenize(["a photo of a cat", "a photo of a dog"])
>>> text_features = clip_model.encode_text(tokens)
"""
super().__init__()
self.model, self.image_preprocess = clip.load(size, device=device)
self.to(device)
self.device = device
self.eval()
def tokenize(self, texts: str | list[str]) -> torch.Tensor:
"""
Convert input texts to CLIP tokens.
Args:
texts (str | list[str]): Input text or list of texts to tokenize.
Returns:
(torch.Tensor): Tokenized text tensor with shape (batch_size, context_length) ready for model processing.
Examples:
>>> model = CLIP("ViT-B/32", device="cpu")
>>> tokens = model.tokenize("a photo of a cat")
>>> print(tokens.shape) # torch.Size([1, 77])
"""
return clip.tokenize(texts).to(self.device)
@smart_inference_mode()
def encode_text(self, texts: torch.Tensor, dtype: torch.dtype = torch.float32) -> torch.Tensor:
"""
Encode tokenized texts into normalized feature vectors.
This method processes tokenized text inputs through the CLIP model to generate feature vectors, which are then
normalized to unit length. These normalized vectors can be used for text-image similarity comparisons.
Args:
texts (torch.Tensor): Tokenized text inputs, typically created using the tokenize() method.
dtype (torch.dtype, optional): Data type for output features.
Returns:
(torch.Tensor): Normalized text feature vectors with unit length (L2 norm = 1).
Examples:
>>> clip_model = CLIP("ViT-B/32", device="cuda")
>>> tokens = clip_model.tokenize(["a photo of a cat", "a photo of a dog"])
>>> features = clip_model.encode_text(tokens)
>>> features.shape
torch.Size([2, 512])
"""
txt_feats = self.model.encode_text(texts).to(dtype)
txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True)
return txt_feats
@smart_inference_mode()
def encode_image(self, image: Image.Image | torch.Tensor, dtype: torch.dtype = torch.float32) -> torch.Tensor:
"""
Encode preprocessed images into normalized feature vectors.
This method processes preprocessed image inputs through the CLIP model to generate feature vectors, which are then
normalized to unit length. These normalized vectors can be used for text-image similarity comparisons.
Args:
image (PIL.Image | torch.Tensor): Preprocessed image input. If a PIL Image is provided, it will be
converted to a tensor using the model's image preprocessing function.
dtype (torch.dtype, optional): Data type for output features.
Returns:
(torch.Tensor): Normalized image feature vectors with unit length (L2 norm = 1).
Examples:
>>> from ultralytics.nn.text_model import CLIP
>>> from PIL import Image
>>> clip_model = CLIP("ViT-B/32", device="cuda")
>>> image = Image.open("path/to/image.jpg")
>>> image_tensor = clip_model.image_preprocess(image).unsqueeze(0).to("cuda")
>>> features = clip_model.encode_image(image_tensor)
>>> features.shape
torch.Size([1, 512])
"""
if isinstance(image, Image.Image):
image = self.image_preprocess(image).unsqueeze(0).to(self.device)
img_feats = self.model.encode_image(image).to(dtype)
img_feats = img_feats / img_feats.norm(p=2, dim=-1, keepdim=True)
return img_feats
class MobileCLIP(TextModel):
"""
Implement Apple's MobileCLIP text encoder for efficient text encoding.
This class implements the TextModel interface using Apple's MobileCLIP model, providing efficient text encoding
capabilities for vision-language tasks with reduced computational requirements compared to standard CLIP models.
Attributes:
model (mobileclip.model.MobileCLIP): The loaded MobileCLIP model.
tokenizer (callable): Tokenizer function for processing text inputs.
device (torch.device): Device where the model is loaded.
config_size_map (dict): Mapping from size identifiers to model configuration names.
Methods:
tokenize: Convert input texts to MobileCLIP tokens.
encode_text: Encode tokenized texts into normalized feature vectors.
Examples:
>>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
>>> text_encoder = MobileCLIP(size="s0", device=device)
>>> tokens = text_encoder.tokenize(["a photo of a cat", "a photo of a dog"])
>>> features = text_encoder.encode_text(tokens)
"""
config_size_map = {"s0": "s0", "s1": "s1", "s2": "s2", "b": "b", "blt": "b"}
def __init__(self, size: str, device: torch.device) -> None:
"""
Initialize the MobileCLIP text encoder.
This class implements the TextModel interface using Apple's MobileCLIP model for efficient text encoding.
Args:
size (str): Model size identifier (e.g., 's0', 's1', 's2', 'b', 'blt').
device (torch.device): Device to load the model on.
Examples:
>>> import torch
>>> model = MobileCLIP("s0", device=torch.device("cpu"))
>>> tokens = model.tokenize(["a photo of a cat", "a photo of a dog"])
>>> features = model.encode_text(tokens)
"""
try:
import warnings
# Suppress 'timm.models.layers is deprecated, please import via timm.layers' warning from mobileclip usage
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=FutureWarning)
import mobileclip
except ImportError:
# Ultralytics fork preferred since the Apple MobileCLIP repo pins an incompatible torchvision version
checks.check_requirements("git+https://github.com/ultralytics/mobileclip.git")
import mobileclip
super().__init__()
config = self.config_size_map[size]
file = f"mobileclip_{size}.pt"
if not Path(file).is_file():
from ultralytics import download
download(f"https://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/{file}")
self.model = mobileclip.create_model_and_transforms(f"mobileclip_{config}", pretrained=file, device=device)[0]
self.tokenizer = mobileclip.get_tokenizer(f"mobileclip_{config}")
self.to(device)
self.device = device
self.eval()
def tokenize(self, texts: list[str]) -> torch.Tensor:
"""
Convert input texts to MobileCLIP tokens.
Args:
texts (list[str]): List of text strings to tokenize.
Returns:
(torch.Tensor): Tokenized text inputs with shape (batch_size, sequence_length).
Examples:
>>> model = MobileCLIP("s0", "cpu")
>>> tokens = model.tokenize(["a photo of a cat", "a photo of a dog"])
"""
return self.tokenizer(texts).to(self.device)
@smart_inference_mode()
def encode_text(self, texts: torch.Tensor, dtype: torch.dtype = torch.float32) -> torch.Tensor:
"""
Encode tokenized texts into normalized feature vectors.
Args:
texts (torch.Tensor): Tokenized text inputs.
dtype (torch.dtype, optional): Data type for output features.
Returns:
(torch.Tensor): Normalized text feature vectors with L2 normalization applied.
Examples:
>>> model = MobileCLIP("s0", device="cpu")
>>> tokens = model.tokenize(["a photo of a cat", "a photo of a dog"])
>>> features = model.encode_text(tokens)
>>> features.shape
torch.Size([2, 512]) # Actual dimension depends on model size
"""
text_features = self.model.encode_text(texts).to(dtype)
text_features /= text_features.norm(p=2, dim=-1, keepdim=True)
return text_features
class MobileCLIPTS(TextModel):
"""
Load a TorchScript traced version of MobileCLIP.
This class implements the TextModel interface using Apple's MobileCLIP model in TorchScript format, providing
efficient text encoding capabilities for vision-language tasks with optimized inference performance.
Attributes:
encoder (torch.jit.ScriptModule): The loaded TorchScript MobileCLIP text encoder.
tokenizer (callable): Tokenizer function for processing text inputs.
device (torch.device): Device where the model is loaded.
Methods:
tokenize: Convert input texts to MobileCLIP tokens.
encode_text: Encode tokenized texts into normalized feature vectors.
Examples:
>>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
>>> text_encoder = MobileCLIPTS(device=device)
>>> tokens = text_encoder.tokenize(["a photo of a cat", "a photo of a dog"])
>>> features = text_encoder.encode_text(tokens)
"""
def __init__(self, device: torch.device):
"""
Initialize the MobileCLIP TorchScript text encoder.
This class implements the TextModel interface using Apple's MobileCLIP model in TorchScript format for
efficient text encoding with optimized inference performance.
Args:
device (torch.device): Device to load the model on.
Examples:
>>> model = MobileCLIPTS(device=torch.device("cpu"))
>>> tokens = model.tokenize(["a photo of a cat", "a photo of a dog"])
>>> features = model.encode_text(tokens)
"""
super().__init__()
from ultralytics.utils.downloads import attempt_download_asset
self.encoder = torch.jit.load(attempt_download_asset("mobileclip_blt.ts"), map_location=device)
self.tokenizer = clip.clip.tokenize
self.device = device
def tokenize(self, texts: list[str]) -> torch.Tensor:
"""
Convert input texts to MobileCLIP tokens.
Args:
texts (list[str]): List of text strings to tokenize.
Returns:
(torch.Tensor): Tokenized text inputs with shape (batch_size, sequence_length).
Examples:
>>> model = MobileCLIPTS("cpu")
>>> tokens = model.tokenize(["a photo of a cat", "a photo of a dog"])
"""
return self.tokenizer(texts).to(self.device)
@smart_inference_mode()
def encode_text(self, texts: torch.Tensor, dtype: torch.dtype = torch.float32) -> torch.Tensor:
"""
Encode tokenized texts into normalized feature vectors.
Args:
texts (torch.Tensor): Tokenized text inputs.
dtype (torch.dtype, optional): Data type for output features.
Returns:
(torch.Tensor): Normalized text feature vectors with L2 normalization applied.
Examples:
>>> model = MobileCLIPTS(device="cpu")
>>> tokens = model.tokenize(["a photo of a cat", "a photo of a dog"])
>>> features = model.encode_text(tokens)
>>> features.shape
torch.Size([2, 512]) # Actual dimension depends on model size
"""
# NOTE: no need to do normalization here as it's embedded in the torchscript model
return self.encoder(texts).to(dtype)
def build_text_model(variant: str, device: torch.device | None = None) -> TextModel:
"""
Build a text encoding model based on the specified variant.
Args:
variant (str): Model variant in format "base:size" (e.g., "clip:ViT-B/32" or "mobileclip:s0").
device (torch.device, optional): Device to load the model on.
Returns:
(TextModel): Instantiated text encoding model.
Examples:
>>> model = build_text_model("clip:ViT-B/32", device=torch.device("cuda"))
>>> model = build_text_model("mobileclip:s0", device=torch.device("cpu"))
"""
base, size = variant.split(":")
if base == "clip":
return CLIP(size, device)
elif base == "mobileclip":
return MobileCLIPTS(device)
else:
raise ValueError(f"Unrecognized base model: '{base}'. Supported base models: 'clip', 'mobileclip'.")
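# Minimal usage sketch (illustrative; downloads CLIP weights on first use, so network access is assumed).
def _example_build_text_model():
    """Build a CLIP text encoder from a variant string and encode a prompt."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = build_text_model("clip:ViT-B/32", device=device)
    tokens = model.tokenize(["a photo of a cat"])
    features = model.encode_text(tokens)  # L2-normalized, e.g. torch.Size([1, 512])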