init commit

2025-11-08 19:15:39 +01:00
parent ecffcb08e8
commit c7adacf53b
470 changed files with 73751 additions and 0 deletions

View File

@@ -0,0 +1,5 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
from .base import add_integration_callbacks, default_callbacks, get_default_callbacks
__all__ = "add_integration_callbacks", "default_callbacks", "get_default_callbacks"

View File

@@ -0,0 +1,235 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
"""Base callbacks for Ultralytics training, validation, prediction, and export processes."""
from collections import defaultdict
from copy import deepcopy
# Trainer callbacks ----------------------------------------------------------------------------------------------------
def on_pretrain_routine_start(trainer):
"""Called before the pretraining routine starts."""
pass
def on_pretrain_routine_end(trainer):
"""Called after the pretraining routine ends."""
pass
def on_train_start(trainer):
"""Called when the training starts."""
pass
def on_train_epoch_start(trainer):
"""Called at the start of each training epoch."""
pass
def on_train_batch_start(trainer):
"""Called at the start of each training batch."""
pass
def optimizer_step(trainer):
"""Called when the optimizer takes a step."""
pass
def on_before_zero_grad(trainer):
"""Called before the gradients are set to zero."""
pass
def on_train_batch_end(trainer):
"""Called at the end of each training batch."""
pass
def on_train_epoch_end(trainer):
"""Called at the end of each training epoch."""
pass
def on_fit_epoch_end(trainer):
"""Called at the end of each fit epoch (train + val)."""
pass
def on_model_save(trainer):
"""Called when the model is saved."""
pass
def on_train_end(trainer):
"""Called when the training ends."""
pass
def on_params_update(trainer):
"""Called when the model parameters are updated."""
pass
def teardown(trainer):
"""Called during the teardown of the training process."""
pass
# Validator callbacks --------------------------------------------------------------------------------------------------
def on_val_start(validator):
"""Called when the validation starts."""
pass
def on_val_batch_start(validator):
"""Called at the start of each validation batch."""
pass
def on_val_batch_end(validator):
"""Called at the end of each validation batch."""
pass
def on_val_end(validator):
"""Called when the validation ends."""
pass
# Predictor callbacks --------------------------------------------------------------------------------------------------
def on_predict_start(predictor):
"""Called when the prediction starts."""
pass
def on_predict_batch_start(predictor):
"""Called at the start of each prediction batch."""
pass
def on_predict_batch_end(predictor):
"""Called at the end of each prediction batch."""
pass
def on_predict_postprocess_end(predictor):
"""Called after the post-processing of the prediction ends."""
pass
def on_predict_end(predictor):
"""Called when the prediction ends."""
pass
# Exporter callbacks ---------------------------------------------------------------------------------------------------
def on_export_start(exporter):
"""Called when the model export starts."""
pass
def on_export_end(exporter):
"""Called when the model export ends."""
pass
default_callbacks = {
# Run in trainer
"on_pretrain_routine_start": [on_pretrain_routine_start],
"on_pretrain_routine_end": [on_pretrain_routine_end],
"on_train_start": [on_train_start],
"on_train_epoch_start": [on_train_epoch_start],
"on_train_batch_start": [on_train_batch_start],
"optimizer_step": [optimizer_step],
"on_before_zero_grad": [on_before_zero_grad],
"on_train_batch_end": [on_train_batch_end],
"on_train_epoch_end": [on_train_epoch_end],
"on_fit_epoch_end": [on_fit_epoch_end], # fit = train + val
"on_model_save": [on_model_save],
"on_train_end": [on_train_end],
"on_params_update": [on_params_update],
"teardown": [teardown],
# Run in validator
"on_val_start": [on_val_start],
"on_val_batch_start": [on_val_batch_start],
"on_val_batch_end": [on_val_batch_end],
"on_val_end": [on_val_end],
# Run in predictor
"on_predict_start": [on_predict_start],
"on_predict_batch_start": [on_predict_batch_start],
"on_predict_postprocess_end": [on_predict_postprocess_end],
"on_predict_batch_end": [on_predict_batch_end],
"on_predict_end": [on_predict_end],
# Run in exporter
"on_export_start": [on_export_start],
"on_export_end": [on_export_end],
}
def get_default_callbacks():
"""
Get the default callbacks for Ultralytics training, validation, prediction, and export processes.
Returns:
(dict): Dictionary of default callbacks for various training events. Each key represents an event during the
training process, and the corresponding value is a list of callback functions executed when that event
occurs.
Examples:
>>> callbacks = get_default_callbacks()
>>> print(list(callbacks.keys())) # show all available callback events
['on_pretrain_routine_start', 'on_pretrain_routine_end', ...]
"""
return defaultdict(list, deepcopy(default_callbacks))
def add_integration_callbacks(instance):
"""
Add integration callbacks to the instance's callbacks dictionary.
This function loads and adds various integration callbacks to the provided instance. The specific callbacks added
depend on the type of instance provided. All instances receive HUB callbacks, while Trainer instances also receive
additional callbacks for various integrations like ClearML, Comet, DVC, MLflow, Neptune, Ray Tune, TensorBoard,
and Weights & Biases.
Args:
instance (Trainer | Predictor | Validator | Exporter): The object instance to which callbacks will be added.
The type of instance determines which callbacks are loaded.
Examples:
>>> from ultralytics.engine.trainer import BaseTrainer
>>> trainer = BaseTrainer()
>>> add_integration_callbacks(trainer)
"""
from .hub import callbacks as hub_cb
from .platform import callbacks as platform_cb
# Load Ultralytics callbacks
callbacks_list = [hub_cb, platform_cb]
# Load training callbacks
if "Trainer" in instance.__class__.__name__:
from .clearml import callbacks as clear_cb
from .comet import callbacks as comet_cb
from .dvc import callbacks as dvc_cb
from .mlflow import callbacks as mlflow_cb
from .neptune import callbacks as neptune_cb
from .raytune import callbacks as tune_cb
from .tensorboard import callbacks as tb_cb
from .wb import callbacks as wb_cb
callbacks_list.extend([clear_cb, comet_cb, dvc_cb, mlflow_cb, neptune_cb, tune_cb, tb_cb, wb_cb])
# Add the callbacks to the callbacks dictionary
for callbacks in callbacks_list:
for k, v in callbacks.items():
if v not in instance.callbacks[k]:
instance.callbacks[k].append(v)
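# Usage sketch (illustrative only, not part of this module; assumes the public ultralytics YOLO API):
#     from ultralytics import YOLO
#
#     def log_epoch(trainer):
#         print(f"finished epoch {trainer.epoch}")
#
#     model = YOLO("yolo11n.pt")  # hypothetical weights file
#     model.add_callback("on_train_epoch_end", log_epoch)  # user callback runs alongside the defaults above
#     model.train(data="coco8.yaml", epochs=1)  # hypothetical dataset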

View File

@@ -0,0 +1,154 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
from ultralytics.utils import LOGGER, SETTINGS, TESTS_RUNNING
try:
assert not TESTS_RUNNING # do not log pytest
assert SETTINGS["clearml"] is True # verify integration is enabled
import clearml
from clearml import Task
assert hasattr(clearml, "__version__") # verify package is not directory
except (ImportError, AssertionError):
clearml = None
def _log_debug_samples(files, title: str = "Debug Samples") -> None:
"""
Log files (images) as debug samples in the ClearML task.
Args:
files (list[Path]): A list of file paths in PosixPath format.
title (str): A title that groups together images with the same values.
"""
import re
if task := Task.current_task():
for f in files:
if f.exists():
it = re.search(r"_batch(\d+)", f.name)
iteration = int(it.groups()[0]) if it else 0
task.get_logger().report_image(
title=title, series=f.name.replace(it.group(), ""), local_path=str(f), iteration=iteration
)
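# Worked example (sketch): for a file named "train_batch2.jpg", the regex above yields iteration=2 and the
# series name "train.jpg" (the "_batch2" substring is stripped), so successive batches of the same plot end
# up in one ClearML debug-sample series indexed by iteration.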
def _log_plot(title: str, plot_path: str) -> None:
"""
Log an image as a plot in the plot section of ClearML.
Args:
title (str): The title of the plot.
plot_path (str): The path to the saved image file.
"""
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
img = mpimg.imread(plot_path)
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1], frameon=False, aspect="auto", xticks=[], yticks=[]) # no ticks
ax.imshow(img)
Task.current_task().get_logger().report_matplotlib_figure(
title=title, series="", figure=fig, report_interactive=False
)
def on_pretrain_routine_start(trainer) -> None:
"""Initialize and connect ClearML task at the start of pretraining routine."""
try:
if task := Task.current_task():
# WARNING: make sure the automatic pytorch and matplotlib bindings are disabled!
# We are logging these plots and model files manually in the integration
from clearml.binding.frameworks.pytorch_bind import PatchPyTorchModelIO
from clearml.binding.matplotlib_bind import PatchedMatplotlib
PatchPyTorchModelIO.update_current_task(None)
PatchedMatplotlib.update_current_task(None)
else:
task = Task.init(
project_name=trainer.args.project or "Ultralytics",
task_name=trainer.args.name,
tags=["Ultralytics"],
output_uri=True,
reuse_last_task_id=False,
auto_connect_frameworks={"pytorch": False, "matplotlib": False},
)
LOGGER.warning(
"ClearML Initialized a new task. If you want to run remotely, "
"please add clearml-init and connect your arguments before initializing YOLO."
)
task.connect(vars(trainer.args), name="General")
except Exception as e:
LOGGER.warning(f"ClearML installed but not initialized correctly, not logging this run. {e}")
def on_train_epoch_end(trainer) -> None:
"""Log debug samples for the first epoch and report current training progress."""
if task := Task.current_task():
# Log debug samples for first epoch only
if trainer.epoch == 1:
_log_debug_samples(sorted(trainer.save_dir.glob("train_batch*.jpg")), "Mosaic")
# Report the current training progress
for k, v in trainer.label_loss_items(trainer.tloss, prefix="train").items():
task.get_logger().report_scalar("train", k, v, iteration=trainer.epoch)
for k, v in trainer.lr.items():
task.get_logger().report_scalar("lr", k, v, iteration=trainer.epoch)
def on_fit_epoch_end(trainer) -> None:
"""Report model information and metrics to logger at the end of an epoch."""
if task := Task.current_task():
# Report epoch time and validation metrics
task.get_logger().report_scalar(
title="Epoch Time", series="Epoch Time", value=trainer.epoch_time, iteration=trainer.epoch
)
for k, v in trainer.metrics.items():
title = k.split("/")[0]
task.get_logger().report_scalar(title, k, v, iteration=trainer.epoch)
if trainer.epoch == 0:
from ultralytics.utils.torch_utils import model_info_for_loggers
for k, v in model_info_for_loggers(trainer).items():
task.get_logger().report_single_value(k, v)
def on_val_end(validator) -> None:
"""Log validation results including labels and predictions."""
if Task.current_task():
# Log validation labels and predictions
_log_debug_samples(sorted(validator.save_dir.glob("val*.jpg")), "Validation")
def on_train_end(trainer) -> None:
"""Log final model and training results on training completion."""
if task := Task.current_task():
# Log final results, confusion matrix and PR plots
files = [
"results.png",
"confusion_matrix.png",
"confusion_matrix_normalized.png",
*(f"{x}_curve.png" for x in ("F1", "PR", "P", "R")),
]
files = [(trainer.save_dir / f) for f in files if (trainer.save_dir / f).exists()] # filter existing files
for f in files:
_log_plot(title=f.stem, plot_path=f)
# Report final metrics
for k, v in trainer.validator.metrics.results_dict.items():
task.get_logger().report_single_value(k, v)
# Log the final model
task.update_output_model(model_path=str(trainer.best), model_name=trainer.args.name, auto_delete_file=False)
callbacks = (
{
"on_pretrain_routine_start": on_pretrain_routine_start,
"on_train_epoch_end": on_train_epoch_end,
"on_fit_epoch_end": on_fit_epoch_end,
"on_val_end": on_val_end,
"on_train_end": on_train_end,
}
if clearml
else {}
)

View File

@@ -0,0 +1,639 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
from __future__ import annotations
from collections.abc import Callable
from types import SimpleNamespace
from typing import Any
import cv2
import numpy as np
from ultralytics.utils import LOGGER, RANK, SETTINGS, TESTS_RUNNING, ops
from ultralytics.utils.metrics import ClassifyMetrics, DetMetrics, OBBMetrics, PoseMetrics, SegmentMetrics
try:
assert not TESTS_RUNNING # do not log pytest
assert SETTINGS["comet"] is True # verify integration is enabled
import comet_ml
assert hasattr(comet_ml, "__version__") # verify package is not directory
import os
from pathlib import Path
# Ensures certain logging functions only run for supported tasks
COMET_SUPPORTED_TASKS = ["detect", "segment"]
# Names of plots created by Ultralytics that are logged to Comet
CONFUSION_MATRIX_PLOT_NAMES = "confusion_matrix", "confusion_matrix_normalized"
EVALUATION_PLOT_NAMES = "F1_curve", "P_curve", "R_curve", "PR_curve"
LABEL_PLOT_NAMES = ["labels"]
SEGMENT_METRICS_PLOT_PREFIX = "Box", "Mask"
POSE_METRICS_PLOT_PREFIX = "Box", "Pose"
DETECTION_METRICS_PLOT_PREFIX = ["Box"]
RESULTS_TABLE_NAME = "results.csv"
ARGS_YAML_NAME = "args.yaml"
_comet_image_prediction_count = 0
except (ImportError, AssertionError):
comet_ml = None
def _get_comet_mode() -> str:
"""Return the Comet mode from environment variables, defaulting to 'online'."""
comet_mode = os.getenv("COMET_MODE")
if comet_mode is not None:
LOGGER.warning(
"The COMET_MODE environment variable is deprecated. "
"Please use COMET_START_ONLINE to set the Comet experiment mode. "
"To start an offline Comet experiment, use 'export COMET_START_ONLINE=0'. "
"If COMET_START_ONLINE is not set or is set to '1', an online Comet experiment will be created."
)
return comet_mode
return "online"
def _get_comet_model_name() -> str:
"""Return the Comet model name from environment variable or default to 'Ultralytics'."""
return os.getenv("COMET_MODEL_NAME", "Ultralytics")
def _get_eval_batch_logging_interval() -> int:
"""Get the evaluation batch logging interval from environment variable or use default value 1."""
return int(os.getenv("COMET_EVAL_BATCH_LOGGING_INTERVAL", 1))
def _get_max_image_predictions_to_log() -> int:
"""Get the maximum number of image predictions to log from environment variables."""
return int(os.getenv("COMET_MAX_IMAGE_PREDICTIONS", 100))
def _scale_confidence_score(score: float) -> float:
"""Scale the confidence score by a factor specified in environment variable."""
scale = float(os.getenv("COMET_MAX_CONFIDENCE_SCORE", 100.0))
return score * scale
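# Worked example (sketch): with the default COMET_MAX_CONFIDENCE_SCORE of 100.0, _scale_confidence_score(0.87)
# returns 87.0, i.e. scores are reported on a 0-100 scale by default.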
def _should_log_confusion_matrix() -> bool:
"""Determine if the confusion matrix should be logged based on environment variable settings."""
return os.getenv("COMET_EVAL_LOG_CONFUSION_MATRIX", "false").lower() == "true"
def _should_log_image_predictions() -> bool:
"""Determine whether to log image predictions based on environment variable."""
return os.getenv("COMET_EVAL_LOG_IMAGE_PREDICTIONS", "true").lower() == "true"
def _resume_or_create_experiment(args: SimpleNamespace) -> None:
"""
Resume CometML experiment or create a new experiment based on args.
Ensures that the experiment object is only created in a single process during distributed training.
Args:
args (SimpleNamespace): Training arguments containing project configuration and other parameters.
"""
if RANK not in {-1, 0}:
return
# Set the environment variable (if not set by the user) to configure the Comet experiment's online mode under the hood.
# If COMET_START_ONLINE is set by the user, it overrides the COMET_MODE value.
if os.getenv("COMET_START_ONLINE") is None:
comet_mode = _get_comet_mode()
os.environ["COMET_START_ONLINE"] = "1" if comet_mode != "offline" else "0"
try:
_project_name = os.getenv("COMET_PROJECT_NAME", args.project)
experiment = comet_ml.start(project_name=_project_name)
experiment.log_parameters(vars(args))
experiment.log_others(
{
"eval_batch_logging_interval": _get_eval_batch_logging_interval(),
"log_confusion_matrix_on_eval": _should_log_confusion_matrix(),
"log_image_predictions": _should_log_image_predictions(),
"max_image_predictions": _get_max_image_predictions_to_log(),
}
)
experiment.log_other("Created from", "ultralytics")
except Exception as e:
LOGGER.warning(f"Comet installed but not initialized correctly, not logging this run. {e}")
def _fetch_trainer_metadata(trainer) -> dict:
"""
Return metadata for YOLO training including epoch and asset saving status.
Args:
trainer (ultralytics.engine.trainer.BaseTrainer): The YOLO trainer object containing training state and config.
Returns:
(dict): Dictionary containing current epoch, step, save assets flag, and final epoch flag.
"""
curr_epoch = trainer.epoch + 1
train_num_steps_per_epoch = len(trainer.train_loader.dataset) // trainer.batch_size
curr_step = curr_epoch * train_num_steps_per_epoch
final_epoch = curr_epoch == trainer.epochs
save = trainer.args.save
save_period = trainer.args.save_period
save_interval = curr_epoch % save_period == 0
save_assets = save and save_period > 0 and save_interval and not final_epoch
return dict(curr_epoch=curr_epoch, curr_step=curr_step, save_assets=save_assets, final_epoch=final_epoch)
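# Worked example (sketch, hypothetical numbers): with a 1000-image training set, batch_size=16, epochs=100,
# args.save=True and save_period=5, at trainer.epoch=9 this returns curr_epoch=10,
# curr_step=10 * (1000 // 16) = 620, save_assets=True (10 % 5 == 0 and not the final epoch), final_epoch=False.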
def _scale_bounding_box_to_original_image_shape(
box, resized_image_shape, original_image_shape, ratio_pad
) -> list[float]:
"""
Scale bounding box from resized image coordinates to original image coordinates.
YOLO resizes images during training and the label values are normalized based on this resized shape.
This function rescales the bounding box labels to the original image shape.
Args:
box (torch.Tensor): Bounding box in normalized xywh format.
resized_image_shape (tuple): Shape of the resized image (height, width).
original_image_shape (tuple): Shape of the original image (height, width).
ratio_pad (tuple): Ratio and padding information for scaling.
Returns:
(list[float]): Scaled bounding box coordinates in xywh format with top-left corner adjustment.
"""
resized_image_height, resized_image_width = resized_image_shape
# Convert normalized xywh format predictions to xyxy in resized scale format
box = ops.xywhn2xyxy(box, h=resized_image_height, w=resized_image_width)
# Scale box predictions from resized image scale back to original image scale
box = ops.scale_boxes(resized_image_shape, box, original_image_shape, ratio_pad)
# Convert bounding box format from xyxy to xywh for Comet logging
box = ops.xyxy2xywh(box)
# Adjust xy center to correspond top-left corner
box[:2] -= box[2:] / 2
box = box.tolist()
return box
def _format_ground_truth_annotations_for_detection(img_idx, image_path, batch, class_name_map=None) -> dict | None:
"""
Format ground truth annotations for object detection.
This function processes ground truth annotations from a batch of images for object detection tasks. It extracts
bounding boxes, class labels, and other metadata for a specific image in the batch, and formats them for
visualization or evaluation.
Args:
img_idx (int): Index of the image in the batch to process.
image_path (str | Path): Path to the image file.
batch (dict): Batch dictionary containing detection data with keys:
- 'batch_idx': Tensor of batch indices
- 'bboxes': Tensor of bounding boxes in normalized xywh format
- 'cls': Tensor of class labels
- 'ori_shape': Original image shapes
- 'resized_shape': Resized image shapes
- 'ratio_pad': Ratio and padding information
class_name_map (dict, optional): Mapping from class indices to class names.
Returns:
(dict | None): Formatted ground truth annotations with the following structure:
- 'boxes': List of box coordinates [x, y, width, height]
- 'label': Label string with format "gt_{class_name}"
- 'score': Confidence score (always 1.0 for ground truth, scaled by _scale_confidence_score)
Returns None if no bounding boxes are found for the image.
"""
indices = batch["batch_idx"] == img_idx
bboxes = batch["bboxes"][indices]
if len(bboxes) == 0:
LOGGER.debug(f"Comet Image: {image_path} has no bounding boxes labels")
return None
cls_labels = batch["cls"][indices].squeeze(1).tolist()
if class_name_map:
cls_labels = [str(class_name_map[label]) for label in cls_labels]
original_image_shape = batch["ori_shape"][img_idx]
resized_image_shape = batch["resized_shape"][img_idx]
ratio_pad = batch["ratio_pad"][img_idx]
data = []
for box, label in zip(bboxes, cls_labels):
box = _scale_bounding_box_to_original_image_shape(box, resized_image_shape, original_image_shape, ratio_pad)
data.append(
{
"boxes": [box],
"label": f"gt_{label}",
"score": _scale_confidence_score(1.0),
}
)
return {"name": "ground_truth", "data": data}
def _format_prediction_annotations(image_path, metadata, class_label_map=None, class_map=None) -> dict | None:
"""
Format YOLO predictions for object detection visualization.
Args:
image_path (Path): Path to the image file.
metadata (dict): Prediction metadata containing bounding boxes and class information.
class_label_map (dict, optional): Mapping from class indices to class names.
class_map (dict, optional): Additional class mapping for label conversion.
Returns:
(dict | None): Formatted prediction annotations or None if no predictions exist.
"""
stem = image_path.stem
image_id = int(stem) if stem.isnumeric() else stem
predictions = metadata.get(image_id)
if not predictions:
LOGGER.debug(f"Comet Image: {image_path} has no bounding boxes predictions")
return None
# Apply the class mapping that was used for the predicted classes when the prediction JSON was created
if class_label_map and class_map:
class_label_map = {class_map[k]: v for k, v in class_label_map.items()}
try:
# Import pycocotools-compatible utilities (from faster_coco_eval) to decode compressed annotations, e.g. segmentation
from faster_coco_eval.core.mask import decode # noqa
except ImportError:
decode = None
data = []
for prediction in predictions:
boxes = prediction["bbox"]
score = _scale_confidence_score(prediction["score"])
cls_label = prediction["category_id"]
if class_label_map:
cls_label = str(class_label_map[cls_label])
annotation_data = {"boxes": [boxes], "label": cls_label, "score": score}
if decode is not None:
# do segmentation processing only if we are able to decode it
segments = prediction.get("segmentation", None)
if segments is not None:
segments = _extract_segmentation_annotation(segments, decode)
if segments is not None:
annotation_data["points"] = segments
data.append(annotation_data)
return {"name": "prediction", "data": data}
def _extract_segmentation_annotation(segmentation_raw: str, decode: Callable) -> list[list[Any]] | None:
"""
Extract segmentation annotation from compressed segmentations as list of polygons.
Args:
segmentation_raw (str): Raw segmentation data in compressed format.
decode (Callable): Function to decode the compressed segmentation data.
Returns:
(list[list[Any]] | None): List of polygon points or None if extraction fails.
"""
try:
mask = decode(segmentation_raw)
contours, _ = cv2.findContours(mask, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
annotations = [np.array(polygon).squeeze() for polygon in contours if len(polygon) >= 3]
return [annotation.ravel().tolist() for annotation in annotations]
except Exception as e:
LOGGER.warning(f"Comet Failed to extract segmentation annotation: {e}")
return None
def _fetch_annotations(img_idx, image_path, batch, prediction_metadata_map, class_label_map, class_map) -> list | None:
"""
Join the ground truth and prediction annotations if they exist.
Args:
img_idx (int): Index of the image in the batch.
image_path (Path): Path to the image file.
batch (dict): Batch data containing ground truth annotations.
prediction_metadata_map (dict): Map of prediction metadata by image ID.
class_label_map (dict): Mapping from class indices to class names.
class_map (dict): Additional class mapping for label conversion.
Returns:
(list | None): List of annotation dictionaries or None if no annotations exist.
"""
ground_truth_annotations = _format_ground_truth_annotations_for_detection(
img_idx, image_path, batch, class_label_map
)
prediction_annotations = _format_prediction_annotations(
image_path, prediction_metadata_map, class_label_map, class_map
)
annotations = [
annotation for annotation in [ground_truth_annotations, prediction_annotations] if annotation is not None
]
return [annotations] if annotations else None
def _create_prediction_metadata_map(model_predictions) -> dict:
"""Create metadata map for model predictions by grouping them based on image ID."""
pred_metadata_map = {}
for prediction in model_predictions:
pred_metadata_map.setdefault(prediction["image_id"], [])
pred_metadata_map[prediction["image_id"]].append(prediction)
return pred_metadata_map
def _log_confusion_matrix(experiment, trainer, curr_step, curr_epoch) -> None:
"""Log the confusion matrix to Comet experiment."""
conf_mat = trainer.validator.confusion_matrix.matrix
names = list(trainer.data["names"].values()) + ["background"]
experiment.log_confusion_matrix(
matrix=conf_mat, labels=names, max_categories=len(names), epoch=curr_epoch, step=curr_step
)
def _log_images(experiment, image_paths, curr_step: int | None, annotations=None) -> None:
"""
Log images to the experiment with optional annotations.
This function logs images to a Comet ML experiment, optionally including annotation data for visualization
such as bounding boxes or segmentation masks.
Args:
experiment (comet_ml.CometExperiment): The Comet ML experiment to log images to.
image_paths (list[Path]): List of paths to images that will be logged.
curr_step (int): Current training step/iteration for tracking in the experiment timeline.
annotations (list[list[dict]], optional): Nested list of annotation dictionaries for each image. Each
annotation contains visualization data like bounding boxes, labels, and confidence scores.
"""
if annotations:
for image_path, annotation in zip(image_paths, annotations):
experiment.log_image(image_path, name=image_path.stem, step=curr_step, annotations=annotation)
else:
for image_path in image_paths:
experiment.log_image(image_path, name=image_path.stem, step=curr_step)
def _log_image_predictions(experiment, validator, curr_step) -> None:
"""
Log predicted bounding boxes for validation images to Comet during training.
This function logs image predictions to a Comet ML experiment during model validation. It processes
validation data and formats both ground truth and prediction annotations for visualization in the Comet
dashboard. The function respects configured limits on the number of images to log.
Args:
experiment (comet_ml.CometExperiment): The Comet ML experiment to log to.
validator (BaseValidator): The validator instance containing validation data and predictions.
curr_step (int): The current training step for logging timeline.
Notes:
This function uses global state to track the number of logged predictions across calls.
It only logs predictions for supported tasks defined in COMET_SUPPORTED_TASKS.
The number of logged images is limited by the COMET_MAX_IMAGE_PREDICTIONS environment variable.
"""
global _comet_image_prediction_count
task = validator.args.task
if task not in COMET_SUPPORTED_TASKS:
return
jdict = validator.jdict
if not jdict:
return
predictions_metadata_map = _create_prediction_metadata_map(jdict)
dataloader = validator.dataloader
class_label_map = validator.names
class_map = getattr(validator, "class_map", None)
batch_logging_interval = _get_eval_batch_logging_interval()
max_image_predictions = _get_max_image_predictions_to_log()
for batch_idx, batch in enumerate(dataloader):
if (batch_idx + 1) % batch_logging_interval != 0:
continue
image_paths = batch["im_file"]
for img_idx, image_path in enumerate(image_paths):
if _comet_image_prediction_count >= max_image_predictions:
return
image_path = Path(image_path)
annotations = _fetch_annotations(
img_idx,
image_path,
batch,
predictions_metadata_map,
class_label_map,
class_map=class_map,
)
_log_images(
experiment,
[image_path],
curr_step,
annotations=annotations,
)
_comet_image_prediction_count += 1
def _log_plots(experiment, trainer) -> None:
"""
Log evaluation plots and label plots for the experiment.
This function logs various evaluation plots and confusion matrices to the experiment tracking system. It handles
different types of metrics (SegmentMetrics, PoseMetrics, DetMetrics, OBBMetrics) and logs the appropriate plots
for each type.
Args:
experiment (comet_ml.CometExperiment): The Comet ML experiment to log plots to.
trainer (ultralytics.engine.trainer.BaseTrainer): The trainer object containing validation metrics and save
directory information.
Examples:
>>> from ultralytics.utils.callbacks.comet import _log_plots
>>> _log_plots(experiment, trainer)
"""
plot_filenames = None
if isinstance(trainer.validator.metrics, SegmentMetrics):
plot_filenames = [
trainer.save_dir / f"{prefix}{plots}.png"
for plots in EVALUATION_PLOT_NAMES
for prefix in SEGMENT_METRICS_PLOT_PREFIX
]
elif isinstance(trainer.validator.metrics, PoseMetrics):
plot_filenames = [
trainer.save_dir / f"{prefix}{plots}.png"
for plots in EVALUATION_PLOT_NAMES
for prefix in POSE_METRICS_PLOT_PREFIX
]
elif isinstance(trainer.validator.metrics, (DetMetrics, OBBMetrics)):
plot_filenames = [
trainer.save_dir / f"{prefix}{plots}.png"
for plots in EVALUATION_PLOT_NAMES
for prefix in DETECTION_METRICS_PLOT_PREFIX
]
if plot_filenames is not None:
_log_images(experiment, plot_filenames, None)
confusion_matrix_filenames = [trainer.save_dir / f"{plots}.png" for plots in CONFUSION_MATRIX_PLOT_NAMES]
_log_images(experiment, confusion_matrix_filenames, None)
if not isinstance(trainer.validator.metrics, ClassifyMetrics):
label_plot_filenames = [trainer.save_dir / f"{labels}.jpg" for labels in LABEL_PLOT_NAMES]
_log_images(experiment, label_plot_filenames, None)
def _log_model(experiment, trainer) -> None:
"""Log the best-trained model to Comet.ml."""
model_name = _get_comet_model_name()
experiment.log_model(model_name, file_or_folder=str(trainer.best), file_name="best.pt", overwrite=True)
def _log_image_batches(experiment, trainer, curr_step: int) -> None:
"""Log samples of image batches for train, validation, and test."""
_log_images(experiment, trainer.save_dir.glob("train_batch*.jpg"), curr_step)
_log_images(experiment, trainer.save_dir.glob("val_batch*.jpg"), curr_step)
def _log_asset(experiment, asset_path) -> None:
"""
Logs a specific asset file to the given experiment.
This function logs an asset file, such as the run's args.yaml, to the provided
Comet experiment so it is stored alongside the run.
Args:
experiment (comet_ml.CometExperiment): The experiment instance to which the asset will be logged.
asset_path (Path): The file path of the asset to log.
"""
experiment.log_asset(asset_path)
def _log_table(experiment, table_path) -> None:
"""
Logs a table to the provided experiment.
This function is used to log a table file to the given experiment. The table
is identified by its file path.
Args:
experiment (comet_ml.CometExperiment): The experiment object where the table file will be logged.
table_path (Path): The file path of the table to be logged.
"""
experiment.log_table(str(table_path))
def on_pretrain_routine_start(trainer) -> None:
"""Create or resume a CometML experiment at the start of a YOLO pre-training routine."""
_resume_or_create_experiment(trainer.args)
def on_train_epoch_end(trainer) -> None:
"""Log metrics and save batch images at the end of training epochs."""
experiment = comet_ml.get_running_experiment()
if not experiment:
return
metadata = _fetch_trainer_metadata(trainer)
curr_epoch = metadata["curr_epoch"]
curr_step = metadata["curr_step"]
experiment.log_metrics(trainer.label_loss_items(trainer.tloss, prefix="train"), step=curr_step, epoch=curr_epoch)
def on_fit_epoch_end(trainer) -> None:
"""
Log model assets at the end of each epoch during training.
This function is called at the end of each training epoch to log metrics, learning rates, and model information
to a Comet ML experiment. It also logs model assets, confusion matrices, and image predictions based on
configuration settings.
The function retrieves the current Comet ML experiment and logs various training metrics. If it's the first epoch,
it also logs model information. On specified save intervals, it logs the model, confusion matrix (if enabled),
and image predictions (if enabled).
Args:
trainer (BaseTrainer): The YOLO trainer object containing training state, metrics, and configuration.
Examples:
>>> # Inside a training loop
>>> on_fit_epoch_end(trainer) # Log metrics and assets to Comet ML
"""
experiment = comet_ml.get_running_experiment()
if not experiment:
return
metadata = _fetch_trainer_metadata(trainer)
curr_epoch = metadata["curr_epoch"]
curr_step = metadata["curr_step"]
save_assets = metadata["save_assets"]
experiment.log_metrics(trainer.metrics, step=curr_step, epoch=curr_epoch)
experiment.log_metrics(trainer.lr, step=curr_step, epoch=curr_epoch)
if curr_epoch == 1:
from ultralytics.utils.torch_utils import model_info_for_loggers
experiment.log_metrics(model_info_for_loggers(trainer), step=curr_step, epoch=curr_epoch)
if not save_assets:
return
_log_model(experiment, trainer)
if _should_log_confusion_matrix():
_log_confusion_matrix(experiment, trainer, curr_step, curr_epoch)
if _should_log_image_predictions():
_log_image_predictions(experiment, trainer.validator, curr_step)
def on_train_end(trainer) -> None:
"""Perform operations at the end of training."""
experiment = comet_ml.get_running_experiment()
if not experiment:
return
metadata = _fetch_trainer_metadata(trainer)
curr_epoch = metadata["curr_epoch"]
curr_step = metadata["curr_step"]
plots = trainer.args.plots
_log_model(experiment, trainer)
if plots:
_log_plots(experiment, trainer)
_log_confusion_matrix(experiment, trainer, curr_step, curr_epoch)
_log_image_predictions(experiment, trainer.validator, curr_step)
_log_image_batches(experiment, trainer, curr_step)
# log results table
table_path = trainer.save_dir / RESULTS_TABLE_NAME
if table_path.exists():
_log_table(experiment, table_path)
# log arguments YAML
args_path = trainer.save_dir / ARGS_YAML_NAME
if args_path.exists():
_log_asset(experiment, args_path)
experiment.end()
global _comet_image_prediction_count
_comet_image_prediction_count = 0
callbacks = (
{
"on_pretrain_routine_start": on_pretrain_routine_start,
"on_train_epoch_end": on_train_epoch_end,
"on_fit_epoch_end": on_fit_epoch_end,
"on_train_end": on_train_end,
}
if comet_ml
else {}
)

View File

@@ -0,0 +1,202 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
from pathlib import Path
from ultralytics.utils import LOGGER, SETTINGS, TESTS_RUNNING, checks
try:
assert not TESTS_RUNNING # do not log pytest
assert SETTINGS["dvc"] is True # verify integration is enabled
import dvclive
assert checks.check_version("dvclive", "2.11.0", verbose=True)
import os
import re
# DVCLive logger instance
live = None
_processed_plots = {}
# `on_fit_epoch_end` is also called on the final validation (this probably needs to be fixed); for now the
# _training_epoch flag is how we distinguish the final evaluation of the best model from last-epoch validation
_training_epoch = False
except (ImportError, AssertionError, TypeError):
dvclive = None
def _log_images(path: Path, prefix: str = "") -> None:
"""
Log images at specified path with an optional prefix using DVCLive.
This function logs images found at the given path to DVCLive, organizing them by batch to enable slider
functionality in the UI. It processes image filenames to extract batch information and restructures the path
accordingly.
Args:
path (Path): Path to the image file to be logged.
prefix (str, optional): Optional prefix to add to the image name when logging.
Examples:
>>> from pathlib import Path
>>> _log_images(Path("runs/train/exp/val_batch0_pred.jpg"), prefix="validation")
"""
if live:
name = path.name
# Group images by batch to enable sliders in UI
if m := re.search(r"_batch(\d+)", name):
ni = m[1]
new_stem = re.sub(r"_batch(\d+)", "_batch", path.stem)
name = (Path(new_stem) / ni).with_suffix(path.suffix)
live.log_image(os.path.join(prefix, name), path)
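# Worked example (sketch): for Path("runs/train/exp/val_batch0_pred.jpg") with prefix="validation", the regex
# yields ni="0" and new_stem="val_batch_pred", so the image is logged as "validation/val_batch_pred/0.jpg",
# letting DVCLive group all batches of that plot under a single slider entry.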
def _log_plots(plots: dict, prefix: str = "") -> None:
"""
Log plot images for training progress if they have not been previously processed.
Args:
plots (dict): Dictionary containing plot information with timestamps.
prefix (str, optional): Optional prefix to add to the logged image paths.
"""
for name, params in plots.items():
timestamp = params["timestamp"]
if _processed_plots.get(name) != timestamp:
_log_images(name, prefix)
_processed_plots[name] = timestamp
def _log_confusion_matrix(validator) -> None:
"""
Log confusion matrix for a validator using DVCLive.
This function processes the confusion matrix from a validator object and logs it to DVCLive by converting
the matrix into lists of target and prediction labels.
Args:
validator (BaseValidator): The validator object containing the confusion matrix and class names. Must have
attributes: confusion_matrix.matrix, confusion_matrix.task, and names.
"""
targets = []
preds = []
matrix = validator.confusion_matrix.matrix
names = list(validator.names.values())
if validator.confusion_matrix.task == "detect":
names += ["background"]
for ti, pred in enumerate(matrix.T.astype(int)):
for pi, num in enumerate(pred):
targets.extend([names[ti]] * num)
preds.extend([names[pi]] * num)
live.log_sklearn_plot("confusion_matrix", targets, preds, name="cf.json", normalized=True)
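# Worked example (sketch): for a non-detect task with names=["cat", "dog"] and matrix=[[3, 1], [0, 2]]
# (rows=predicted, columns=true), the loops expand the counts into per-sample label lists
# targets=["cat", "cat", "cat", "dog", "dog", "dog"] and preds=["cat", "cat", "cat", "cat", "dog", "dog"],
# which live.log_sklearn_plot then renders as a normalized confusion matrix.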
def on_pretrain_routine_start(trainer) -> None:
"""Initialize DVCLive logger for training metadata during pre-training routine."""
try:
global live
live = dvclive.Live(save_dvc_exp=True, cache_images=True)
LOGGER.info("DVCLive is detected and auto logging is enabled (run 'yolo settings dvc=False' to disable).")
except Exception as e:
LOGGER.warning(f"DVCLive installed but not initialized correctly, not logging this run. {e}")
def on_pretrain_routine_end(trainer) -> None:
"""Log plots related to the training process at the end of the pretraining routine."""
_log_plots(trainer.plots, "train")
def on_train_start(trainer) -> None:
"""Log the training parameters if DVCLive logging is active."""
if live:
live.log_params(trainer.args)
def on_train_epoch_start(trainer) -> None:
"""Set the global variable _training_epoch value to True at the start of training each epoch."""
global _training_epoch
_training_epoch = True
def on_fit_epoch_end(trainer) -> None:
"""
Log training metrics, model info, and advance to next step at the end of each fit epoch.
This function is called at the end of each fit epoch during training. It logs various metrics including
training loss items, validation metrics, and learning rates. On the first epoch, it also logs model
information. Additionally, it logs training and validation plots and advances the DVCLive step counter.
Args:
trainer (BaseTrainer): The trainer object containing training state, metrics, and plots.
Notes:
This function only performs logging operations when DVCLive logging is active and during a training epoch.
The global variable _training_epoch is used to track whether the current epoch is a training epoch.
"""
global _training_epoch
if live and _training_epoch:
all_metrics = {**trainer.label_loss_items(trainer.tloss, prefix="train"), **trainer.metrics, **trainer.lr}
for metric, value in all_metrics.items():
live.log_metric(metric, value)
if trainer.epoch == 0:
from ultralytics.utils.torch_utils import model_info_for_loggers
for metric, value in model_info_for_loggers(trainer).items():
live.log_metric(metric, value, plot=False)
_log_plots(trainer.plots, "train")
_log_plots(trainer.validator.plots, "val")
live.next_step()
_training_epoch = False
def on_train_end(trainer) -> None:
"""
Log best metrics, plots, and confusion matrix at the end of training.
This function is called at the conclusion of the training process to log final metrics, visualizations, and
model artifacts if DVCLive logging is active. It captures the best model performance metrics, training plots,
validation plots, and confusion matrix for later analysis.
Args:
trainer (BaseTrainer): The trainer object containing training state, metrics, and validation results.
Examples:
>>> # Inside a custom training loop
>>> from ultralytics.utils.callbacks.dvc import on_train_end
>>> on_train_end(trainer) # Log final metrics and artifacts
"""
if live:
# At training end, log the best metrics; the trainer runs the validator on the best model internally.
all_metrics = {**trainer.label_loss_items(trainer.tloss, prefix="train"), **trainer.metrics, **trainer.lr}
for metric, value in all_metrics.items():
live.log_metric(metric, value, plot=False)
_log_plots(trainer.plots, "val")
_log_plots(trainer.validator.plots, "val")
_log_confusion_matrix(trainer.validator)
if trainer.best.exists():
live.log_artifact(trainer.best, copy=True, type="model")
live.end()
callbacks = (
{
"on_pretrain_routine_start": on_pretrain_routine_start,
"on_pretrain_routine_end": on_pretrain_routine_end,
"on_train_start": on_train_start,
"on_train_epoch_start": on_train_epoch_start,
"on_fit_epoch_end": on_fit_epoch_end,
"on_train_end": on_train_end,
}
if dvclive
else {}
)

View File

@@ -0,0 +1,110 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
import json
from time import time
from ultralytics.hub import HUB_WEB_ROOT, PREFIX, HUBTrainingSession
from ultralytics.utils import LOGGER, RANK, SETTINGS
from ultralytics.utils.events import events
def on_pretrain_routine_start(trainer):
"""Create a remote Ultralytics HUB session to log local model training."""
if RANK in {-1, 0} and SETTINGS["hub"] is True and SETTINGS["api_key"] and trainer.hub_session is None:
trainer.hub_session = HUBTrainingSession.create_session(trainer.args.model, trainer.args)
def on_pretrain_routine_end(trainer):
"""Initialize timers for upload rate limiting before training begins."""
if session := getattr(trainer, "hub_session", None):
# Start timer for upload rate limit
session.timers = {"metrics": time(), "ckpt": time()} # start timer for session rate limiting
def on_fit_epoch_end(trainer):
"""Upload training progress metrics to Ultralytics HUB at the end of each epoch."""
if session := getattr(trainer, "hub_session", None):
# Upload metrics after validation ends
all_plots = {
**trainer.label_loss_items(trainer.tloss, prefix="train"),
**trainer.metrics,
}
if trainer.epoch == 0:
from ultralytics.utils.torch_utils import model_info_for_loggers
all_plots = {**all_plots, **model_info_for_loggers(trainer)}
session.metrics_queue[trainer.epoch] = json.dumps(all_plots)
# If any metrics failed to upload previously, add them to the queue to attempt uploading again
if session.metrics_upload_failed_queue:
session.metrics_queue.update(session.metrics_upload_failed_queue)
if time() - session.timers["metrics"] > session.rate_limits["metrics"]:
session.upload_metrics()
session.timers["metrics"] = time() # reset timer
session.metrics_queue = {} # reset queue
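# Example (sketch, hypothetical numbers): if session.rate_limits["metrics"] were 300 seconds, queued epoch
# metrics would be uploaded at most once every 5 minutes; anything left in metrics_upload_failed_queue is
# merged back into the queue above and retried on a later epoch.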
def on_model_save(trainer):
"""Upload model checkpoints to Ultralytics HUB with rate limiting."""
if session := getattr(trainer, "hub_session", None):
# Upload checkpoints with rate limiting
is_best = trainer.best_fitness == trainer.fitness
if time() - session.timers["ckpt"] > session.rate_limits["ckpt"]:
LOGGER.info(f"{PREFIX}Uploading checkpoint {HUB_WEB_ROOT}/models/{session.model.id}")
session.upload_model(trainer.epoch, trainer.last, is_best)
session.timers["ckpt"] = time() # reset timer
def on_train_end(trainer):
"""Upload final model and metrics to Ultralytics HUB at the end of training."""
if session := getattr(trainer, "hub_session", None):
# Upload final model and metrics with exponential backoff
LOGGER.info(f"{PREFIX}Syncing final model...")
session.upload_model(
trainer.epoch,
trainer.best,
map=trainer.metrics.get("metrics/mAP50-95(B)", 0),
final=True,
)
session.alive = False # stop heartbeats
LOGGER.info(f"{PREFIX}Done ✅\n{PREFIX}View model at {session.model_url} 🚀")
def on_train_start(trainer):
"""Run events on train start."""
events(trainer.args, trainer.device)
def on_val_start(validator):
"""Run events on validation start."""
if not validator.training:
events(validator.args, validator.device)
def on_predict_start(predictor):
"""Run events on predict start."""
events(predictor.args, predictor.device)
def on_export_start(exporter):
"""Run events on export start."""
events(exporter.args, exporter.device)
callbacks = (
{
"on_pretrain_routine_start": on_pretrain_routine_start,
"on_pretrain_routine_end": on_pretrain_routine_end,
"on_fit_epoch_end": on_fit_epoch_end,
"on_model_save": on_model_save,
"on_train_end": on_train_end,
"on_train_start": on_train_start,
"on_val_start": on_val_start,
"on_predict_start": on_predict_start,
"on_export_start": on_export_start,
}
if SETTINGS["hub"] is True
else {}
)

View File

@@ -0,0 +1,135 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
"""
MLflow Logging for Ultralytics YOLO.
This module enables MLflow logging for Ultralytics YOLO. It logs metrics, parameters, and model artifacts.
For setting up, a tracking URI should be specified. The logging can be customized using environment variables.
Commands:
1. To set a project name:
`export MLFLOW_EXPERIMENT_NAME=<your_experiment_name>` or use the project=<project> argument
2. To set a run name:
`export MLFLOW_RUN=<your_run_name>` or use the name=<name> argument
3. To start a local MLflow server:
mlflow server --backend-store-uri runs/mlflow
It will by default start a local server at http://127.0.0.1:5000.
To specify a different URI, set the MLFLOW_TRACKING_URI environment variable.
4. To kill all running MLflow server instances:
ps aux | grep 'mlflow' | grep -v 'grep' | awk '{print $2}' | xargs kill -9
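Example (sketch, hypothetical experiment/run names and dataset):
    import os
    os.environ["MLFLOW_EXPERIMENT_NAME"] = "yolo-experiments"
    os.environ["MLFLOW_RUN"] = "baseline"
    from ultralytics import YOLO
    YOLO("yolo11n.pt").train(data="coco8.yaml", epochs=3)  # parameters, metrics and artifacts logged to MLflow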
"""
from ultralytics.utils import LOGGER, RUNS_DIR, SETTINGS, TESTS_RUNNING, colorstr
try:
import os
assert not TESTS_RUNNING or "test_mlflow" in os.environ.get("PYTEST_CURRENT_TEST", "") # do not log pytest
assert SETTINGS["mlflow"] is True # verify integration is enabled
import mlflow
assert hasattr(mlflow, "__version__") # verify package is not directory
from pathlib import Path
PREFIX = colorstr("MLflow: ")
except (ImportError, AssertionError):
mlflow = None
def sanitize_dict(x: dict) -> dict:
"""Sanitize dictionary keys by removing parentheses and converting values to floats."""
return {k.replace("(", "").replace(")", ""): float(v) for k, v in x.items()}
def on_pretrain_routine_end(trainer):
"""
Log training parameters to MLflow at the end of the pretraining routine.
This function sets up MLflow logging based on environment variables and trainer arguments. It sets the tracking URI,
experiment name, and run name, then starts the MLflow run if not already active. It finally logs the parameters
from the trainer.
Args:
trainer (ultralytics.engine.trainer.BaseTrainer): The training object with arguments and parameters to log.
Environment Variables:
MLFLOW_TRACKING_URI: The URI for MLflow tracking. If not set, defaults to 'runs/mlflow'.
MLFLOW_EXPERIMENT_NAME: The name of the MLflow experiment. If not set, defaults to trainer.args.project.
MLFLOW_RUN: The name of the MLflow run. If not set, defaults to trainer.args.name.
MLFLOW_KEEP_RUN_ACTIVE: Boolean indicating whether to keep the MLflow run active after training ends.
"""
global mlflow
uri = os.environ.get("MLFLOW_TRACKING_URI") or str(RUNS_DIR / "mlflow")
LOGGER.debug(f"{PREFIX} tracking uri: {uri}")
mlflow.set_tracking_uri(uri)
# Set experiment and run names
experiment_name = os.environ.get("MLFLOW_EXPERIMENT_NAME") or trainer.args.project or "/Shared/Ultralytics"
run_name = os.environ.get("MLFLOW_RUN") or trainer.args.name
mlflow.set_experiment(experiment_name)
mlflow.autolog()
try:
active_run = mlflow.active_run() or mlflow.start_run(run_name=run_name)
LOGGER.info(f"{PREFIX}logging run_id({active_run.info.run_id}) to {uri}")
if Path(uri).is_dir():
LOGGER.info(f"{PREFIX}view at http://127.0.0.1:5000 with 'mlflow server --backend-store-uri {uri}'")
LOGGER.info(f"{PREFIX}disable with 'yolo settings mlflow=False'")
mlflow.log_params(dict(trainer.args))
except Exception as e:
LOGGER.warning(f"{PREFIX}Failed to initialize: {e}")
LOGGER.warning(f"{PREFIX}Not tracking this run")
def on_train_epoch_end(trainer):
"""Log training metrics at the end of each train epoch to MLflow."""
if mlflow:
mlflow.log_metrics(
metrics={
**sanitize_dict(trainer.lr),
**sanitize_dict(trainer.label_loss_items(trainer.tloss, prefix="train")),
},
step=trainer.epoch,
)
def on_fit_epoch_end(trainer):
"""Log training metrics at the end of each fit epoch to MLflow."""
if mlflow:
mlflow.log_metrics(metrics=sanitize_dict(trainer.metrics), step=trainer.epoch)
def on_train_end(trainer):
"""Log model artifacts at the end of training."""
if not mlflow:
return
mlflow.log_artifact(str(trainer.best.parent)) # log save_dir/weights directory with best.pt and last.pt
for f in trainer.save_dir.glob("*"): # log all other files in save_dir
if f.suffix in {".png", ".jpg", ".csv", ".pt", ".yaml"}:
mlflow.log_artifact(str(f))
keep_run_active = os.environ.get("MLFLOW_KEEP_RUN_ACTIVE", "False").lower() == "true"
if keep_run_active:
LOGGER.info(f"{PREFIX}mlflow run still alive, remember to close it using mlflow.end_run()")
else:
mlflow.end_run()
LOGGER.debug(f"{PREFIX}mlflow run ended")
LOGGER.info(
f"{PREFIX}results logged to {mlflow.get_tracking_uri()}\n{PREFIX}disable with 'yolo settings mlflow=False'"
)
callbacks = (
{
"on_pretrain_routine_end": on_pretrain_routine_end,
"on_train_epoch_end": on_train_epoch_end,
"on_fit_epoch_end": on_fit_epoch_end,
"on_train_end": on_train_end,
}
if mlflow
else {}
)

View File

@@ -0,0 +1,134 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
from ultralytics.utils import LOGGER, SETTINGS, TESTS_RUNNING
try:
assert not TESTS_RUNNING # do not log pytest
assert SETTINGS["neptune"] is True # verify integration is enabled
import neptune
from neptune.types import File
assert hasattr(neptune, "__version__")
run = None # NeptuneAI experiment logger instance
except (ImportError, AssertionError):
neptune = None
def _log_scalars(scalars: dict, step: int = 0) -> None:
"""
Log scalars to the NeptuneAI experiment logger.
Args:
scalars (dict): Dictionary of scalar values to log to NeptuneAI.
step (int, optional): The current step or iteration number for logging.
Examples:
>>> metrics = {"mAP": 0.85, "loss": 0.32}
>>> _log_scalars(metrics, step=100)
"""
if run:
for k, v in scalars.items():
run[k].append(value=v, step=step)
def _log_images(imgs_dict: dict, group: str = "") -> None:
"""
Log images to the NeptuneAI experiment logger.
This function logs image data to Neptune.ai when a valid Neptune run is active. Images are organized
under the specified group name.
Args:
imgs_dict (dict): Dictionary of images to log, with keys as image names and values as image data.
group (str, optional): Group name to organize images under in the Neptune UI.
Examples:
>>> # Log validation images
>>> _log_images({"val_batch": img_tensor}, group="validation")
"""
if run:
for k, v in imgs_dict.items():
run[f"{group}/{k}"].upload(File(v))
def _log_plot(title: str, plot_path: str) -> None:
"""Log plots to the NeptuneAI experiment logger."""
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
img = mpimg.imread(plot_path)
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1], frameon=False, aspect="auto", xticks=[], yticks=[]) # no ticks
ax.imshow(img)
run[f"Plots/{title}"].upload(fig)
def on_pretrain_routine_start(trainer) -> None:
"""Initialize NeptuneAI run and log hyperparameters before training starts."""
try:
global run
run = neptune.init_run(
project=trainer.args.project or "Ultralytics",
name=trainer.args.name,
tags=["Ultralytics"],
)
run["Configuration/Hyperparameters"] = {k: "" if v is None else v for k, v in vars(trainer.args).items()}
except Exception as e:
LOGGER.warning(f"NeptuneAI installed but not initialized correctly, not logging this run. {e}")
def on_train_epoch_end(trainer) -> None:
"""Log training metrics and learning rate at the end of each training epoch."""
_log_scalars(trainer.label_loss_items(trainer.tloss, prefix="train"), trainer.epoch + 1)
_log_scalars(trainer.lr, trainer.epoch + 1)
if trainer.epoch == 1:
_log_images({f.stem: str(f) for f in trainer.save_dir.glob("train_batch*.jpg")}, "Mosaic")
def on_fit_epoch_end(trainer) -> None:
"""Log model info and validation metrics at the end of each fit epoch."""
if run and trainer.epoch == 0:
from ultralytics.utils.torch_utils import model_info_for_loggers
run["Configuration/Model"] = model_info_for_loggers(trainer)
_log_scalars(trainer.metrics, trainer.epoch + 1)
def on_val_end(validator) -> None:
"""Log validation images at the end of validation."""
if run:
# Log val_labels and val_pred
_log_images({f.stem: str(f) for f in validator.save_dir.glob("val*.jpg")}, "Validation")
def on_train_end(trainer) -> None:
"""Log final results, plots, and model weights at the end of training."""
if run:
# Log final results, CM matrix + PR plots
files = [
"results.png",
"confusion_matrix.png",
"confusion_matrix_normalized.png",
*(f"{x}_curve.png" for x in ("F1", "PR", "P", "R")),
]
files = [(trainer.save_dir / f) for f in files if (trainer.save_dir / f).exists()] # filter
for f in files:
_log_plot(title=f.stem, plot_path=f)
# Log the final model
run[f"weights/{trainer.args.name or trainer.args.task}/{trainer.best.name}"].upload(File(str(trainer.best)))
callbacks = (
{
"on_pretrain_routine_start": on_pretrain_routine_start,
"on_train_epoch_end": on_train_epoch_end,
"on_fit_epoch_end": on_fit_epoch_end,
"on_val_end": on_val_end,
"on_train_end": on_train_end,
}
if neptune
else {}
)

View File

@@ -0,0 +1,73 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
from ultralytics.utils import RANK, SETTINGS
def on_pretrain_routine_start(trainer):
"""Initialize and start console logging immediately at the very beginning."""
if RANK in {-1, 0}:
from ultralytics.utils.logger import DEFAULT_LOG_PATH, ConsoleLogger, SystemLogger
trainer.system_logger = SystemLogger()
trainer.console_logger = ConsoleLogger(DEFAULT_LOG_PATH)
trainer.console_logger.start_capture()
def on_pretrain_routine_end(trainer):
"""Handle pre-training routine completion event."""
pass
def on_fit_epoch_end(trainer):
"""Handle end of training epoch event and collect system metrics."""
if RANK in {-1, 0} and hasattr(trainer, "system_logger"):
system_metrics = trainer.system_logger.get_metrics()
print(system_metrics) # for debug
def on_model_save(trainer):
"""Handle model checkpoint save event."""
pass
def on_train_end(trainer):
"""Stop console capture and finalize logs."""
if logger := getattr(trainer, "console_logger", None):
logger.stop_capture()
def on_train_start(trainer):
"""Handle training start event."""
pass
def on_val_start(validator):
"""Handle validation start event."""
pass
def on_predict_start(predictor):
"""Handle prediction start event."""
pass
def on_export_start(exporter):
"""Handle model export start event."""
pass
callbacks = (
{
"on_pretrain_routine_start": on_pretrain_routine_start,
"on_pretrain_routine_end": on_pretrain_routine_end,
"on_fit_epoch_end": on_fit_epoch_end,
"on_model_save": on_model_save,
"on_train_end": on_train_end,
"on_train_start": on_train_start,
"on_val_start": on_val_start,
"on_predict_start": on_predict_start,
"on_export_start": on_export_start,
}
if SETTINGS.get("platform", False) is True # disabled for debugging
else {}
)

View File

@@ -0,0 +1,43 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
from ultralytics.utils import SETTINGS
try:
assert SETTINGS["raytune"] is True # verify integration is enabled
import ray
from ray import tune
from ray.air import session
except (ImportError, AssertionError):
tune = None
def on_fit_epoch_end(trainer):
"""
Report training metrics to Ray Tune at epoch end when a Ray session is active.
Captures metrics from the trainer object and sends them to Ray Tune with the current epoch number,
enabling hyperparameter tuning optimization. Only executes when within an active Ray Tune session.
Args:
trainer (ultralytics.engine.trainer.BaseTrainer): The Ultralytics trainer object containing metrics and epochs.
Examples:
>>> # Called automatically by the Ultralytics training loop
>>> on_fit_epoch_end(trainer)
References:
Ray Tune docs: https://docs.ray.io/en/latest/tune/index.html
"""
if ray.train._internal.session.get_session(): # check if Ray Tune session is active
metrics = trainer.metrics
session.report({**metrics, **{"epoch": trainer.epoch + 1}})
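# Usage sketch (illustrative only; assumes the public ultralytics YOLO API): this callback fires automatically
# when tuning with Ray, e.g.
#     from ultralytics import YOLO
#     YOLO("yolo11n.pt").tune(data="coco8.yaml", use_ray=True)  # hypothetical weights/dataset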
callbacks = (
{
"on_fit_epoch_end": on_fit_epoch_end,
}
if tune
else {}
)

View File

@@ -0,0 +1,131 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
from ultralytics.utils import LOGGER, SETTINGS, TESTS_RUNNING, colorstr, torch_utils
try:
assert not TESTS_RUNNING # do not log pytest
assert SETTINGS["tensorboard"] is True # verify integration is enabled
WRITER = None # TensorBoard SummaryWriter instance
PREFIX = colorstr("TensorBoard: ")
# Imports below only required if TensorBoard enabled
import warnings
from copy import deepcopy
import torch
from torch.utils.tensorboard import SummaryWriter
except (ImportError, AssertionError, TypeError, AttributeError):
# TypeError for handling 'Descriptors cannot not be created directly.' protobuf errors in Windows
# AttributeError: module 'tensorflow' has no attribute 'io' if 'tensorflow' not installed
SummaryWriter = None
def _log_scalars(scalars: dict, step: int = 0) -> None:
"""
Log scalar values to TensorBoard.
Args:
scalars (dict): Dictionary of scalar values to log to TensorBoard. Keys are scalar names and values are the
corresponding scalar values.
step (int): Global step value to record with the scalar values. Used for x-axis in TensorBoard graphs.
Examples:
Log training metrics
>>> metrics = {"loss": 0.5, "accuracy": 0.95}
>>> _log_scalars(metrics, step=100)
"""
if WRITER:
for k, v in scalars.items():
WRITER.add_scalar(k, v, step)
def _log_tensorboard_graph(trainer) -> None:
"""
Log model graph to TensorBoard.
This function attempts to visualize the model architecture in TensorBoard by tracing the model with a dummy input
tensor. It first tries a simple method suitable for YOLO models, and if that fails, falls back to a more complex
approach for models like RTDETR that may require special handling.
Args:
trainer (ultralytics.engine.trainer.BaseTrainer): The trainer object containing the model to visualize.
Must have attributes model and args with imgsz.
Notes:
This function requires TensorBoard integration to be enabled and the global WRITER to be initialized.
It handles potential warnings from the PyTorch JIT tracer and attempts to gracefully handle different
model architectures.
"""
# Input image
imgsz = trainer.args.imgsz
imgsz = (imgsz, imgsz) if isinstance(imgsz, int) else imgsz
p = next(trainer.model.parameters()) # for device, type
im = torch.zeros((1, 3, *imgsz), device=p.device, dtype=p.dtype) # input image (must be zeros, not empty)
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=UserWarning) # suppress jit trace warning
warnings.simplefilter("ignore", category=torch.jit.TracerWarning) # suppress jit trace warning
# Try simple method first (YOLO)
try:
trainer.model.eval() # place in .eval() mode to avoid BatchNorm statistics changes
WRITER.add_graph(torch.jit.trace(torch_utils.unwrap_model(trainer.model), im, strict=False), [])
LOGGER.info(f"{PREFIX}model graph visualization added ✅")
return
except Exception:
# Fallback to TorchScript export steps (RTDETR)
try:
model = deepcopy(torch_utils.unwrap_model(trainer.model))
model.eval()
model = model.fuse(verbose=False)
for m in model.modules():
if hasattr(m, "export"): # Detect, RTDETRDecoder (Segment and Pose use Detect base class)
m.export = True
m.format = "torchscript"
model(im) # dry run
WRITER.add_graph(torch.jit.trace(model, im, strict=False), [])
LOGGER.info(f"{PREFIX}model graph visualization added ✅")
except Exception as e:
LOGGER.warning(f"{PREFIX}TensorBoard graph visualization failure {e}")
def on_pretrain_routine_start(trainer) -> None:
"""Initialize TensorBoard logging with SummaryWriter."""
if SummaryWriter:
try:
global WRITER
WRITER = SummaryWriter(str(trainer.save_dir))
LOGGER.info(f"{PREFIX}Start with 'tensorboard --logdir {trainer.save_dir}', view at http://localhost:6006/")
except Exception as e:
LOGGER.warning(f"{PREFIX}TensorBoard not initialized correctly, not logging this run. {e}")
def on_train_start(trainer) -> None:
"""Log TensorBoard graph."""
if WRITER:
_log_tensorboard_graph(trainer)
def on_train_epoch_end(trainer) -> None:
"""Log scalar statistics at the end of a training epoch."""
_log_scalars(trainer.label_loss_items(trainer.tloss, prefix="train"), trainer.epoch + 1)
_log_scalars(trainer.lr, trainer.epoch + 1)
def on_fit_epoch_end(trainer) -> None:
"""Log epoch metrics at end of training epoch."""
_log_scalars(trainer.metrics, trainer.epoch + 1)
callbacks = (
{
"on_pretrain_routine_start": on_pretrain_routine_start,
"on_train_start": on_train_start,
"on_fit_epoch_end": on_fit_epoch_end,
"on_train_epoch_end": on_train_epoch_end,
}
if SummaryWriter
else {}
)
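# Usage sketch (illustrative): with the "tensorboard" setting enabled, training writes event files to
# trainer.save_dir, and the graph and scalars logged above appear in the TensorBoard UI. Paths and model
# names below are examples only.
#   >>> from ultralytics import YOLO, settings
#   >>> settings.update({"tensorboard": True})
#   >>> YOLO("yolo11n.pt").train(data="coco8.yaml", epochs=3)
# Then view the run with the command suggested by on_pretrain_routine_start, e.g.:
#   $ tensorboard --logdir runs/detect/train  # example path; use your actual save_dir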

View File

@@ -0,0 +1,191 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
from ultralytics.utils import SETTINGS, TESTS_RUNNING
from ultralytics.utils.torch_utils import model_info_for_loggers
try:
assert not TESTS_RUNNING # do not log pytest
assert SETTINGS["wandb"] is True # verify integration is enabled
import wandb as wb
assert hasattr(wb, "__version__") # verify package is not directory
_processed_plots = {}
except (ImportError, AssertionError):
wb = None
def _custom_table(x, y, classes, title="Precision Recall Curve", x_title="Recall", y_title="Precision"):
"""
Create and log a custom metric visualization to wandb.plot.pr_curve.
This function crafts a custom metric visualization that mimics the behavior of the default wandb precision-recall
curve while allowing for enhanced customization. The visual metric is useful for monitoring model performance across
different classes.
Args:
x (list): Values for the x-axis; expected to have length N.
y (list): Corresponding values for the y-axis; also expected to have length N.
classes (list): Labels identifying the class of each point; length N.
title (str, optional): Title for the plot.
x_title (str, optional): Label for the x-axis.
y_title (str, optional): Label for the y-axis.
Returns:
(wandb.Object): A wandb object suitable for logging, showcasing the crafted metric visualization.
"""
import polars as pl # scope for faster 'import ultralytics'
import polars.selectors as cs
df = pl.DataFrame({"class": classes, "y": y, "x": x}).with_columns(cs.numeric().round(3))
data = df.select(["class", "y", "x"]).rows()
fields = {"x": "x", "y": "y", "class": "class"}
string_fields = {"title": title, "x-axis-title": x_title, "y-axis-title": y_title}
return wb.plot_table(
"wandb/area-under-curve/v0",
wb.Table(data=data, columns=["class", "y", "x"]),
fields=fields,
string_fields=string_fields,
)
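# Illustrative call (not executed here): inputs are flat, equal-length lists where each point carries its
# class label; the values shown are made up purely to demonstrate the expected shape.
#   >>> _custom_table(
#   ...     x=[0.0, 0.5, 1.0, 0.0, 0.5, 1.0],
#   ...     y=[1.0, 0.8, 0.2, 0.9, 0.7, 0.1],
#   ...     classes=["person", "person", "person", "car", "car", "car"],
#   ... )  # returns a wandb table object ready to pass to wb.log()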
def _plot_curve(
x,
y,
names=None,
id="precision-recall",
title="Precision Recall Curve",
x_title="Recall",
y_title="Precision",
num_x=100,
only_mean=False,
):
"""
Log a metric curve visualization.
This function generates a metric curve based on input data and logs the visualization to wandb.
The curve can represent aggregated data (mean) or individual class data, depending on the 'only_mean' flag.
Args:
x (np.ndarray): Data points for the x-axis with length N.
y (np.ndarray): Corresponding data points for the y-axis with shape (C, N), where C is the number of classes.
names (list, optional): Names of the classes corresponding to the y-axis data; length C.
id (str, optional): Unique identifier for the logged data in wandb.
title (str, optional): Title for the visualization plot.
x_title (str, optional): Label for the x-axis.
y_title (str, optional): Label for the y-axis.
num_x (int, optional): Number of interpolated data points for visualization.
only_mean (bool, optional): Flag to indicate if only the mean curve should be plotted.
Notes:
The function leverages the '_custom_table' function to generate the actual visualization.
"""
import numpy as np
# Create new x
if names is None:
names = []
x_new = np.linspace(x[0], x[-1], num_x).round(5)
# Create arrays for logging
x_log = x_new.tolist()
y_log = np.interp(x_new, x, np.mean(y, axis=0)).round(3).tolist()
if only_mean:
table = wb.Table(data=list(zip(x_log, y_log)), columns=[x_title, y_title])
wb.run.log({title: wb.plot.line(table, x_title, y_title, title=title)})
else:
classes = ["mean"] * len(x_log)
for i, yi in enumerate(y):
x_log.extend(x_new) # add new x
y_log.extend(np.interp(x_new, x, yi)) # interpolate y to new x
classes.extend([names[i]] * len(x_new)) # add class names
wb.log({id: _custom_table(x_log, y_log, classes, title, x_title, y_title)}, commit=False)
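# Illustrative call (not executed here, requires an active wandb run): y has one row per class; with
# only_mean=False the per-class curves plus a "mean" series are routed through _custom_table above.
# Class names and values are made up for shape only.
#   >>> import numpy as np
#   >>> x = np.linspace(0, 1, 50)
#   >>> y = np.stack([np.clip(1 - x, 0, 1), np.clip(1 - x**2, 0, 1)])  # 2 classes, 50 points each
#   >>> _plot_curve(x, y, names=["person", "car"], id="curves/PR", title="Precision Recall Curve")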
def _log_plots(plots, step):
"""
Log plots to WandB at a specific step if they haven't been logged already.
This function checks each plot in the input dictionary against previously processed plots and logs
new or updated plots to WandB at the specified step.
Args:
plots (dict): Dictionary of plots to log, where keys are plot names and values are dictionaries
containing plot metadata including timestamps.
step (int): The step/epoch at which to log the plots in the WandB run.
Notes:
The function uses a shallow copy of the plots dictionary to prevent modification during iteration.
Plots are identified by their stem name (filename without extension).
Each plot is logged as a WandB Image object.
"""
for name, params in plots.copy().items(): # shallow copy to prevent plots dict changing during iteration
timestamp = params["timestamp"]
if _processed_plots.get(name) != timestamp:
wb.run.log({name.stem: wb.Image(str(name))}, step=step)
_processed_plots[name] = timestamp
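# Shape of the expected input (inferred from the loop above; values are illustrative): keys behave like
# pathlib.Path objects (name.stem is used) and each value carries a "timestamp" used for de-duplication
# against _processed_plots.
#   >>> from pathlib import Path
#   >>> plots = {Path("results.png"): {"timestamp": 1700000000.0}}
#   >>> _log_plots(plots, step=10)  # logs the image once; repeated calls with the same timestamp are skipped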
def on_pretrain_routine_start(trainer):
"""Initialize and start wandb project if module is present."""
if not wb.run:
wb.init(
project=str(trainer.args.project).replace("/", "-") if trainer.args.project else "Ultralytics",
name=str(trainer.args.name).replace("/", "-"),
config=vars(trainer.args),
)
def on_fit_epoch_end(trainer):
"""Log training metrics and model information at the end of an epoch."""
wb.run.log(trainer.metrics, step=trainer.epoch + 1)
_log_plots(trainer.plots, step=trainer.epoch + 1)
_log_plots(trainer.validator.plots, step=trainer.epoch + 1)
if trainer.epoch == 0:
wb.run.log(model_info_for_loggers(trainer), step=trainer.epoch + 1)
def on_train_epoch_end(trainer):
"""Log metrics and save images at the end of each training epoch."""
wb.run.log(trainer.label_loss_items(trainer.tloss, prefix="train"), step=trainer.epoch + 1)
wb.run.log(trainer.lr, step=trainer.epoch + 1)
if trainer.epoch == 1:
_log_plots(trainer.plots, step=trainer.epoch + 1)
def on_train_end(trainer):
"""Save the best model as an artifact and log final plots at the end of training."""
_log_plots(trainer.validator.plots, step=trainer.epoch + 1)
_log_plots(trainer.plots, step=trainer.epoch + 1)
art = wb.Artifact(type="model", name=f"run_{wb.run.id}_model")
if trainer.best.exists():
art.add_file(trainer.best)
wb.run.log_artifact(art, aliases=["best"])
# Check if we actually have plots to save
if trainer.args.plots and hasattr(trainer.validator.metrics, "curves_results"):
for curve_name, curve_values in zip(trainer.validator.metrics.curves, trainer.validator.metrics.curves_results):
x, y, x_title, y_title = curve_values
_plot_curve(
x,
y,
names=list(trainer.validator.metrics.names.values()),
id=f"curves/{curve_name}",
title=curve_name,
x_title=x_title,
y_title=y_title,
)
wb.run.finish() # required or run continues on dashboard
callbacks = (
{
"on_pretrain_routine_start": on_pretrain_routine_start,
"on_train_epoch_end": on_train_epoch_end,
"on_fit_epoch_end": on_fit_epoch_end,
"on_train_end": on_train_end,
}
if wb
else {}
)
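# Usage sketch (illustrative): with the "wandb" setting enabled, a run is created in on_pretrain_routine_start,
# metrics and plots are streamed each epoch, and the best checkpoint is uploaded as an artifact at the end of
# training. Model, dataset, project, and run names below are placeholders.
#   >>> from ultralytics import YOLO, settings
#   >>> settings.update({"wandb": True})
#   >>> YOLO("yolo11n.pt").train(data="coco8.yaml", epochs=3, project="demo", name="exp1")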