init commit
ultralytics/utils/callbacks/__init__.py (new file, 5 lines)
@@ -0,0 +1,5 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

from .base import add_integration_callbacks, default_callbacks, get_default_callbacks

__all__ = "add_integration_callbacks", "default_callbacks", "get_default_callbacks"
ultralytics/utils/callbacks/__pycache__/__init__.cpython-310.pyc (new binary file)
Binary file not shown.
ultralytics/utils/callbacks/__pycache__/base.cpython-310.pyc (new binary file)
Binary file not shown.
ultralytics/utils/callbacks/__pycache__/hub.cpython-310.pyc (new binary file)
Binary file not shown.
ultralytics/utils/callbacks/__pycache__/platform.cpython-310.pyc (new binary file)
Binary file not shown.
ultralytics/utils/callbacks/base.py (new file, 235 lines)
@@ -0,0 +1,235 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
"""Base callbacks for Ultralytics training, validation, prediction, and export processes."""

from collections import defaultdict
from copy import deepcopy

# Trainer callbacks ----------------------------------------------------------------------------------------------------


def on_pretrain_routine_start(trainer):
    """Called before the pretraining routine starts."""
    pass


def on_pretrain_routine_end(trainer):
    """Called after the pretraining routine ends."""
    pass


def on_train_start(trainer):
    """Called when the training starts."""
    pass


def on_train_epoch_start(trainer):
    """Called at the start of each training epoch."""
    pass


def on_train_batch_start(trainer):
    """Called at the start of each training batch."""
    pass


def optimizer_step(trainer):
    """Called when the optimizer takes a step."""
    pass


def on_before_zero_grad(trainer):
    """Called before the gradients are set to zero."""
    pass


def on_train_batch_end(trainer):
    """Called at the end of each training batch."""
    pass


def on_train_epoch_end(trainer):
    """Called at the end of each training epoch."""
    pass


def on_fit_epoch_end(trainer):
    """Called at the end of each fit epoch (train + val)."""
    pass


def on_model_save(trainer):
    """Called when the model is saved."""
    pass


def on_train_end(trainer):
    """Called when the training ends."""
    pass


def on_params_update(trainer):
    """Called when the model parameters are updated."""
    pass


def teardown(trainer):
    """Called during the teardown of the training process."""
    pass


# Validator callbacks --------------------------------------------------------------------------------------------------


def on_val_start(validator):
    """Called when the validation starts."""
    pass


def on_val_batch_start(validator):
    """Called at the start of each validation batch."""
    pass


def on_val_batch_end(validator):
    """Called at the end of each validation batch."""
    pass


def on_val_end(validator):
    """Called when the validation ends."""
    pass


# Predictor callbacks --------------------------------------------------------------------------------------------------


def on_predict_start(predictor):
    """Called when the prediction starts."""
    pass


def on_predict_batch_start(predictor):
    """Called at the start of each prediction batch."""
    pass


def on_predict_batch_end(predictor):
    """Called at the end of each prediction batch."""
    pass


def on_predict_postprocess_end(predictor):
    """Called after the post-processing of the prediction ends."""
    pass


def on_predict_end(predictor):
    """Called when the prediction ends."""
    pass


# Exporter callbacks ---------------------------------------------------------------------------------------------------


def on_export_start(exporter):
    """Called when the model export starts."""
    pass


def on_export_end(exporter):
    """Called when the model export ends."""
    pass

default_callbacks = {
    # Run in trainer
    "on_pretrain_routine_start": [on_pretrain_routine_start],
    "on_pretrain_routine_end": [on_pretrain_routine_end],
    "on_train_start": [on_train_start],
    "on_train_epoch_start": [on_train_epoch_start],
    "on_train_batch_start": [on_train_batch_start],
    "optimizer_step": [optimizer_step],
    "on_before_zero_grad": [on_before_zero_grad],
    "on_train_batch_end": [on_train_batch_end],
    "on_train_epoch_end": [on_train_epoch_end],
    "on_fit_epoch_end": [on_fit_epoch_end],  # fit = train + val
    "on_model_save": [on_model_save],
    "on_train_end": [on_train_end],
    "on_params_update": [on_params_update],
    "teardown": [teardown],
    # Run in validator
    "on_val_start": [on_val_start],
    "on_val_batch_start": [on_val_batch_start],
    "on_val_batch_end": [on_val_batch_end],
    "on_val_end": [on_val_end],
    # Run in predictor
    "on_predict_start": [on_predict_start],
    "on_predict_batch_start": [on_predict_batch_start],
    "on_predict_postprocess_end": [on_predict_postprocess_end],
    "on_predict_batch_end": [on_predict_batch_end],
    "on_predict_end": [on_predict_end],
    # Run in exporter
    "on_export_start": [on_export_start],
    "on_export_end": [on_export_end],
}

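# NOTE (illustrative, not part of this commit): each hook name maps to a *list* so that
# several integrations can subscribe to the same event. A trainer is assumed to fire an
# event roughly like the sketch below; `run_callbacks` and its `event` argument are
# hypothetical names used here only to show the dispatch pattern.
#
#     def run_callbacks(self, event: str):
#         """Invoke every callback registered for `event`, passing the object itself."""
#         for callback in self.callbacks.get(event, []):
#             callback(self)
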
def get_default_callbacks():
    """
    Get the default callbacks for Ultralytics training, validation, prediction, and export processes.

    Returns:
        (dict): Dictionary of default callbacks for various training events. Each key represents an event during the
            training process, and the corresponding value is a list of callback functions executed when that event
            occurs.

    Examples:
        >>> callbacks = get_default_callbacks()
        >>> print(list(callbacks.keys()))  # show all available callback events
        ['on_pretrain_routine_start', 'on_pretrain_routine_end', ...]
    """
    return defaultdict(list, deepcopy(default_callbacks))

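# NOTE (illustrative, not part of this commit): `deepcopy` gives every caller an
# independent copy of the hook lists, and `defaultdict(list)` lets unknown event names be
# appended to without a KeyError. A quick sanity check of both properties:
#
#     cbs = get_default_callbacks()
#     cbs["on_train_start"].append(lambda trainer: None)
#     assert len(default_callbacks["on_train_start"]) == 1  # module default is untouched
#     cbs["my_custom_event"].append(lambda trainer: None)   # no KeyError for new keys
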
def add_integration_callbacks(instance):
    """
    Add integration callbacks to the instance's callbacks dictionary.

    This function loads and adds various integration callbacks to the provided instance. The specific callbacks added
    depend on the type of instance provided. All instances receive HUB callbacks, while Trainer instances also receive
    additional callbacks for various integrations like ClearML, Comet, DVC, MLflow, Neptune, Ray Tune, TensorBoard,
    and Weights & Biases.

    Args:
        instance (Trainer | Predictor | Validator | Exporter): The object instance to which callbacks will be added.
            The type of instance determines which callbacks are loaded.

    Examples:
        >>> from ultralytics.engine.trainer import BaseTrainer
        >>> trainer = BaseTrainer()
        >>> add_integration_callbacks(trainer)
    """
    from .hub import callbacks as hub_cb
    from .platform import callbacks as platform_cb

    # Load Ultralytics callbacks
    callbacks_list = [hub_cb, platform_cb]

    # Load training callbacks
    if "Trainer" in instance.__class__.__name__:
        from .clearml import callbacks as clear_cb
        from .comet import callbacks as comet_cb
        from .dvc import callbacks as dvc_cb
        from .mlflow import callbacks as mlflow_cb
        from .neptune import callbacks as neptune_cb
        from .raytune import callbacks as tune_cb
        from .tensorboard import callbacks as tb_cb
        from .wb import callbacks as wb_cb

        callbacks_list.extend([clear_cb, comet_cb, dvc_cb, mlflow_cb, neptune_cb, tune_cb, tb_cb, wb_cb])

    # Add the callbacks to the callbacks dictionary
    for callbacks in callbacks_list:
        for k, v in callbacks.items():
            if v not in instance.callbacks[k]:
                instance.callbacks[k].append(v)
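# NOTE (illustrative, not part of this commit): because each integration module exposes a
# plain `{event_name: function}` dict, a user-defined integration can be merged the same
# way. A minimal sketch, assuming `trainer.callbacks` was built by get_default_callbacks():
#
#     def my_on_train_end(trainer):
#         print(f"best checkpoint saved to {trainer.best}")
#
#     my_callbacks = {"on_train_end": my_on_train_end}
#     for event, fn in my_callbacks.items():
#         if fn not in trainer.callbacks[event]:
#             trainer.callbacks[event].append(fn)
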
ultralytics/utils/callbacks/clearml.py (new file, 154 lines)
@@ -0,0 +1,154 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

from ultralytics.utils import LOGGER, SETTINGS, TESTS_RUNNING

try:
    assert not TESTS_RUNNING  # do not log pytest
    assert SETTINGS["clearml"] is True  # verify integration is enabled
    import clearml
    from clearml import Task

    assert hasattr(clearml, "__version__")  # verify package is not directory

except (ImportError, AssertionError):
    clearml = None


def _log_debug_samples(files, title: str = "Debug Samples") -> None:
    """
    Log files (images) as debug samples in the ClearML task.

    Args:
        files (list[Path]): A list of file paths in PosixPath format.
        title (str): A title that groups together images with the same values.
    """
    import re

    if task := Task.current_task():
        for f in files:
            if f.exists():
                it = re.search(r"_batch(\d+)", f.name)
                iteration = int(it.groups()[0]) if it else 0
                series = f.name.replace(it.group(), "") if it else f.name  # guard against filenames without a batch suffix
                task.get_logger().report_image(
                    title=title, series=series, local_path=str(f), iteration=iteration
                )

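# NOTE (illustrative, not part of this commit): the `_batch(\d+)` suffix is how plots from
# different batches are folded into a single slider series in the ClearML UI, e.g.:
#
#     import re
#     it = re.search(r"_batch(\d+)", "train_batch2.jpg")
#     iteration = int(it.groups()[0])                       # -> 2
#     series = "train_batch2.jpg".replace(it.group(), "")   # -> "train.jpg"
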
def _log_plot(title: str, plot_path: str) -> None:
    """
    Log an image as a plot in the plot section of ClearML.

    Args:
        title (str): The title of the plot.
        plot_path (str): The path to the saved image file.
    """
    import matplotlib.image as mpimg
    import matplotlib.pyplot as plt

    img = mpimg.imread(plot_path)
    fig = plt.figure()
    ax = fig.add_axes([0, 0, 1, 1], frameon=False, aspect="auto", xticks=[], yticks=[])  # no ticks
    ax.imshow(img)

    Task.current_task().get_logger().report_matplotlib_figure(
        title=title, series="", figure=fig, report_interactive=False
    )

def on_pretrain_routine_start(trainer) -> None:
    """Initialize and connect ClearML task at the start of pretraining routine."""
    try:
        if task := Task.current_task():
            # WARNING: make sure the automatic pytorch and matplotlib bindings are disabled!
            # We are logging these plots and model files manually in the integration
            from clearml.binding.frameworks.pytorch_bind import PatchPyTorchModelIO
            from clearml.binding.matplotlib_bind import PatchedMatplotlib

            PatchPyTorchModelIO.update_current_task(None)
            PatchedMatplotlib.update_current_task(None)
        else:
            task = Task.init(
                project_name=trainer.args.project or "Ultralytics",
                task_name=trainer.args.name,
                tags=["Ultralytics"],
                output_uri=True,
                reuse_last_task_id=False,
                auto_connect_frameworks={"pytorch": False, "matplotlib": False},
            )
            LOGGER.warning(
                "ClearML initialized a new task. If you want to run remotely, "
                "please add clearml-init and connect your arguments before initializing YOLO."
            )
        task.connect(vars(trainer.args), name="General")
    except Exception as e:
        LOGGER.warning(f"ClearML installed but not initialized correctly, not logging this run. {e}")


def on_train_epoch_end(trainer) -> None:
    """Log debug samples for the first epoch and report current training progress."""
    if task := Task.current_task():
        # Log debug samples for first epoch only
        if trainer.epoch == 1:
            _log_debug_samples(sorted(trainer.save_dir.glob("train_batch*.jpg")), "Mosaic")
        # Report the current training progress
        for k, v in trainer.label_loss_items(trainer.tloss, prefix="train").items():
            task.get_logger().report_scalar("train", k, v, iteration=trainer.epoch)
        for k, v in trainer.lr.items():
            task.get_logger().report_scalar("lr", k, v, iteration=trainer.epoch)


def on_fit_epoch_end(trainer) -> None:
    """Report model information and metrics to logger at the end of an epoch."""
    if task := Task.current_task():
        # Report epoch time and validation metrics
        task.get_logger().report_scalar(
            title="Epoch Time", series="Epoch Time", value=trainer.epoch_time, iteration=trainer.epoch
        )
        for k, v in trainer.metrics.items():
            title = k.split("/")[0]
            task.get_logger().report_scalar(title, k, v, iteration=trainer.epoch)
        if trainer.epoch == 0:
            from ultralytics.utils.torch_utils import model_info_for_loggers

            for k, v in model_info_for_loggers(trainer).items():
                task.get_logger().report_single_value(k, v)


def on_val_end(validator) -> None:
    """Log validation results including labels and predictions."""
    if Task.current_task():
        # Log validation labels and predictions
        _log_debug_samples(sorted(validator.save_dir.glob("val*.jpg")), "Validation")


def on_train_end(trainer) -> None:
    """Log final model and training results on training completion."""
    if task := Task.current_task():
        # Log final results, confusion matrix and PR plots
        files = [
            "results.png",
            "confusion_matrix.png",
            "confusion_matrix_normalized.png",
            *(f"{x}_curve.png" for x in ("F1", "PR", "P", "R")),
        ]
        files = [(trainer.save_dir / f) for f in files if (trainer.save_dir / f).exists()]  # filter existing files
        for f in files:
            _log_plot(title=f.stem, plot_path=f)
        # Report final metrics
        for k, v in trainer.validator.metrics.results_dict.items():
            task.get_logger().report_single_value(k, v)
        # Log the final model
        task.update_output_model(model_path=str(trainer.best), model_name=trainer.args.name, auto_delete_file=False)


callbacks = (
    {
        "on_pretrain_routine_start": on_pretrain_routine_start,
        "on_train_epoch_end": on_train_epoch_end,
        "on_fit_epoch_end": on_fit_epoch_end,
        "on_val_end": on_val_end,
        "on_train_end": on_train_end,
    }
    if clearml
    else {}
)
ultralytics/utils/callbacks/comet.py (new file, 639 lines)
@@ -0,0 +1,639 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

from __future__ import annotations

from collections.abc import Callable
from types import SimpleNamespace
from typing import Any

import cv2
import numpy as np

from ultralytics.utils import LOGGER, RANK, SETTINGS, TESTS_RUNNING, ops
from ultralytics.utils.metrics import ClassifyMetrics, DetMetrics, OBBMetrics, PoseMetrics, SegmentMetrics

try:
    assert not TESTS_RUNNING  # do not log pytest
    assert SETTINGS["comet"] is True  # verify integration is enabled
    import comet_ml

    assert hasattr(comet_ml, "__version__")  # verify package is not directory

    import os
    from pathlib import Path

    # Ensures certain logging functions only run for supported tasks
    COMET_SUPPORTED_TASKS = ["detect", "segment"]

    # Names of plots created by Ultralytics that are logged to Comet
    CONFUSION_MATRIX_PLOT_NAMES = "confusion_matrix", "confusion_matrix_normalized"
    EVALUATION_PLOT_NAMES = "F1_curve", "P_curve", "R_curve", "PR_curve"
    LABEL_PLOT_NAMES = ["labels"]
    SEGMENT_METRICS_PLOT_PREFIX = "Box", "Mask"
    POSE_METRICS_PLOT_PREFIX = "Box", "Pose"
    DETECTION_METRICS_PLOT_PREFIX = ["Box"]
    RESULTS_TABLE_NAME = "results.csv"
    ARGS_YAML_NAME = "args.yaml"

    _comet_image_prediction_count = 0

except (ImportError, AssertionError):
    comet_ml = None

def _get_comet_mode() -> str:
    """Return the Comet mode from environment variables, defaulting to 'online'."""
    comet_mode = os.getenv("COMET_MODE")
    if comet_mode is not None:
        LOGGER.warning(
            "The COMET_MODE environment variable is deprecated. "
            "Please use COMET_START_ONLINE to set the Comet experiment mode. "
            "To start an offline Comet experiment, use 'export COMET_START_ONLINE=0'. "
            "If COMET_START_ONLINE is not set or is set to '1', an online Comet experiment will be created."
        )
        return comet_mode

    return "online"


def _get_comet_model_name() -> str:
    """Return the Comet model name from environment variable or default to 'Ultralytics'."""
    return os.getenv("COMET_MODEL_NAME", "Ultralytics")


def _get_eval_batch_logging_interval() -> int:
    """Get the evaluation batch logging interval from environment variable or use default value 1."""
    return int(os.getenv("COMET_EVAL_BATCH_LOGGING_INTERVAL", 1))


def _get_max_image_predictions_to_log() -> int:
    """Get the maximum number of image predictions to log from environment variables."""
    return int(os.getenv("COMET_MAX_IMAGE_PREDICTIONS", 100))


def _scale_confidence_score(score: float) -> float:
    """Scale the confidence score by a factor specified in environment variable."""
    scale = float(os.getenv("COMET_MAX_CONFIDENCE_SCORE", 100.0))
    return score * scale


def _should_log_confusion_matrix() -> bool:
    """Determine if the confusion matrix should be logged based on environment variable settings."""
    return os.getenv("COMET_EVAL_LOG_CONFUSION_MATRIX", "false").lower() == "true"


def _should_log_image_predictions() -> bool:
    """Determine whether to log image predictions based on environment variable."""
    return os.getenv("COMET_EVAL_LOG_IMAGE_PREDICTIONS", "true").lower() == "true"

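# NOTE (illustrative, not part of this commit): all of the knobs above are plain
# environment variables, so a run can be configured from the shell before training:
#
#     export COMET_START_ONLINE=0                  # offline experiment
#     export COMET_EVAL_BATCH_LOGGING_INTERVAL=4   # log every 4th eval batch
#     export COMET_MAX_IMAGE_PREDICTIONS=50        # cap logged prediction images
#     export COMET_EVAL_LOG_CONFUSION_MATRIX=true  # also log confusion matrices
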
def _resume_or_create_experiment(args: SimpleNamespace) -> None:
    """
    Resume CometML experiment or create a new experiment based on args.

    Ensures that the experiment object is only created in a single process during distributed training.

    Args:
        args (SimpleNamespace): Training arguments containing project configuration and other parameters.
    """
    if RANK not in {-1, 0}:
        return

    # Set environment variable (if not set by the user) to configure the Comet experiment's online mode under the hood.
    # If COMET_START_ONLINE is set by the user, it overrides the COMET_MODE value.
    if os.getenv("COMET_START_ONLINE") is None:
        comet_mode = _get_comet_mode()
        os.environ["COMET_START_ONLINE"] = "1" if comet_mode != "offline" else "0"

    try:
        _project_name = os.getenv("COMET_PROJECT_NAME", args.project)
        experiment = comet_ml.start(project_name=_project_name)
        experiment.log_parameters(vars(args))
        experiment.log_others(
            {
                "eval_batch_logging_interval": _get_eval_batch_logging_interval(),
                "log_confusion_matrix_on_eval": _should_log_confusion_matrix(),
                "log_image_predictions": _should_log_image_predictions(),
                "max_image_predictions": _get_max_image_predictions_to_log(),
            }
        )
        experiment.log_other("Created from", "ultralytics")

    except Exception as e:
        LOGGER.warning(f"Comet installed but not initialized correctly, not logging this run. {e}")

def _fetch_trainer_metadata(trainer) -> dict:
    """
    Return metadata for YOLO training including epoch and asset saving status.

    Args:
        trainer (ultralytics.engine.trainer.BaseTrainer): The YOLO trainer object containing training state and config.

    Returns:
        (dict): Dictionary containing current epoch, step, save assets flag, and final epoch flag.
    """
    curr_epoch = trainer.epoch + 1

    train_num_steps_per_epoch = len(trainer.train_loader.dataset) // trainer.batch_size
    curr_step = curr_epoch * train_num_steps_per_epoch
    final_epoch = curr_epoch == trainer.epochs

    save = trainer.args.save
    save_period = trainer.args.save_period
    save_interval = curr_epoch % save_period == 0
    save_assets = save and save_period > 0 and save_interval and not final_epoch

    return dict(curr_epoch=curr_epoch, curr_step=curr_step, save_assets=save_assets, final_epoch=final_epoch)

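# NOTE (illustrative, not part of this commit): a worked example of the bookkeeping above.
# With a 1000-image dataset, batch_size=16, epochs=100, save=True and save_period=10, at
# trainer.epoch=19 (0-indexed):
#
#     curr_epoch = 20
#     curr_step  = 20 * (1000 // 16) = 20 * 62 = 1240
#     save_assets = True and 10 > 0 and (20 % 10 == 0) and not (20 == 100)  # -> True
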
def _scale_bounding_box_to_original_image_shape(
    box, resized_image_shape, original_image_shape, ratio_pad
) -> list[float]:
    """
    Scale bounding box from resized image coordinates to original image coordinates.

    YOLO resizes images during training and the label values are normalized based on this resized shape.
    This function rescales the bounding box labels to the original image shape.

    Args:
        box (torch.Tensor): Bounding box in normalized xywh format.
        resized_image_shape (tuple): Shape of the resized image (height, width).
        original_image_shape (tuple): Shape of the original image (height, width).
        ratio_pad (tuple): Ratio and padding information for scaling.

    Returns:
        (list[float]): Scaled bounding box coordinates in xywh format with top-left corner adjustment.
    """
    resized_image_height, resized_image_width = resized_image_shape

    # Convert normalized xywh format predictions to xyxy in resized scale format
    box = ops.xywhn2xyxy(box, h=resized_image_height, w=resized_image_width)
    # Scale box predictions from resized image scale back to original image scale
    box = ops.scale_boxes(resized_image_shape, box, original_image_shape, ratio_pad)
    # Convert bounding box format from xyxy to xywh for Comet logging
    box = ops.xyxy2xywh(box)
    # Adjust xy center to correspond to the top-left corner
    box[:2] -= box[2:] / 2
    box = box.tolist()

    return box

def _format_ground_truth_annotations_for_detection(img_idx, image_path, batch, class_name_map=None) -> dict | None:
    """
    Format ground truth annotations for object detection.

    This function processes ground truth annotations from a batch of images for object detection tasks. It extracts
    bounding boxes, class labels, and other metadata for a specific image in the batch, and formats them for
    visualization or evaluation.

    Args:
        img_idx (int): Index of the image in the batch to process.
        image_path (str | Path): Path to the image file.
        batch (dict): Batch dictionary containing detection data with keys:
            - 'batch_idx': Tensor of batch indices
            - 'bboxes': Tensor of bounding boxes in normalized xywh format
            - 'cls': Tensor of class labels
            - 'ori_shape': Original image shapes
            - 'resized_shape': Resized image shapes
            - 'ratio_pad': Ratio and padding information
        class_name_map (dict, optional): Mapping from class indices to class names.

    Returns:
        (dict | None): Formatted ground truth annotations with the following structure:
            - 'boxes': List of box coordinates [x, y, width, height]
            - 'label': Label string with format "gt_{class_name}"
            - 'score': Confidence score (always 1.0, scaled by _scale_confidence_score)
            Returns None if no bounding boxes are found for the image.
    """
    indices = batch["batch_idx"] == img_idx
    bboxes = batch["bboxes"][indices]
    if len(bboxes) == 0:
        LOGGER.debug(f"Comet Image: {image_path} has no bounding box labels")
        return None

    cls_labels = batch["cls"][indices].squeeze(1).tolist()
    if class_name_map:
        cls_labels = [str(class_name_map[label]) for label in cls_labels]

    original_image_shape = batch["ori_shape"][img_idx]
    resized_image_shape = batch["resized_shape"][img_idx]
    ratio_pad = batch["ratio_pad"][img_idx]

    data = []
    for box, label in zip(bboxes, cls_labels):
        box = _scale_bounding_box_to_original_image_shape(box, resized_image_shape, original_image_shape, ratio_pad)
        data.append(
            {
                "boxes": [box],
                "label": f"gt_{label}",
                "score": _scale_confidence_score(1.0),
            }
        )

    return {"name": "ground_truth", "data": data}

def _format_prediction_annotations(image_path, metadata, class_label_map=None, class_map=None) -> dict | None:
    """
    Format YOLO predictions for object detection visualization.

    Args:
        image_path (Path): Path to the image file.
        metadata (dict): Prediction metadata containing bounding boxes and class information.
        class_label_map (dict, optional): Mapping from class indices to class names.
        class_map (dict, optional): Additional class mapping for label conversion.

    Returns:
        (dict | None): Formatted prediction annotations or None if no predictions exist.
    """
    stem = image_path.stem
    image_id = int(stem) if stem.isnumeric() else stem

    predictions = metadata.get(image_id)
    if not predictions:
        LOGGER.debug(f"Comet Image: {image_path} has no bounding box predictions")
        return None

    # Apply the mapping that was used to map the predicted classes when the JSON was created
    if class_label_map and class_map:
        class_label_map = {class_map[k]: v for k, v in class_label_map.items()}
    try:
        # Import faster_coco_eval utilities to decompress annotations for various tasks, e.g. segmentation
        from faster_coco_eval.core.mask import decode  # noqa
    except ImportError:
        decode = None

    data = []
    for prediction in predictions:
        boxes = prediction["bbox"]
        score = _scale_confidence_score(prediction["score"])
        cls_label = prediction["category_id"]
        if class_label_map:
            cls_label = str(class_label_map[cls_label])

        annotation_data = {"boxes": [boxes], "label": cls_label, "score": score}

        if decode is not None:
            # Do segmentation processing only if we are able to decode it
            segments = prediction.get("segmentation", None)
            if segments is not None:
                segments = _extract_segmentation_annotation(segments, decode)
                if segments is not None:
                    annotation_data["points"] = segments

        data.append(annotation_data)

    return {"name": "prediction", "data": data}

def _extract_segmentation_annotation(segmentation_raw: str, decode: Callable) -> list[list[Any]] | None:
    """
    Extract segmentation annotation from compressed segmentations as list of polygons.

    Args:
        segmentation_raw (str): Raw segmentation data in compressed format.
        decode (Callable): Function to decode the compressed segmentation data.

    Returns:
        (list[list[Any]] | None): List of polygon points or None if extraction fails.
    """
    try:
        mask = decode(segmentation_raw)
        contours, _ = cv2.findContours(mask, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
        annotations = [np.array(polygon).squeeze() for polygon in contours if len(polygon) >= 3]
        return [annotation.ravel().tolist() for annotation in annotations]
    except Exception as e:
        LOGGER.warning(f"Comet Failed to extract segmentation annotation: {e}")
        return None

def _fetch_annotations(img_idx, image_path, batch, prediction_metadata_map, class_label_map, class_map) -> list | None:
    """
    Join the ground truth and prediction annotations if they exist.

    Args:
        img_idx (int): Index of the image in the batch.
        image_path (Path): Path to the image file.
        batch (dict): Batch data containing ground truth annotations.
        prediction_metadata_map (dict): Map of prediction metadata by image ID.
        class_label_map (dict): Mapping from class indices to class names.
        class_map (dict): Additional class mapping for label conversion.

    Returns:
        (list | None): List of annotation dictionaries or None if no annotations exist.
    """
    ground_truth_annotations = _format_ground_truth_annotations_for_detection(
        img_idx, image_path, batch, class_label_map
    )
    prediction_annotations = _format_prediction_annotations(
        image_path, prediction_metadata_map, class_label_map, class_map
    )

    annotations = [
        annotation for annotation in [ground_truth_annotations, prediction_annotations] if annotation is not None
    ]
    return [annotations] if annotations else None

def _create_prediction_metadata_map(model_predictions) -> dict:
    """Create metadata map for model predictions by grouping them based on image ID."""
    pred_metadata_map = {}
    for prediction in model_predictions:
        pred_metadata_map.setdefault(prediction["image_id"], [])
        pred_metadata_map[prediction["image_id"]].append(prediction)

    return pred_metadata_map

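# NOTE (illustrative, not part of this commit): the metadata map simply buckets COCO-style
# prediction dicts by image, so two detections on image 42 group as:
#
#     preds = [{"image_id": 42, "bbox": [0, 0, 10, 10], "score": 0.9, "category_id": 0},
#              {"image_id": 42, "bbox": [5, 5, 20, 20], "score": 0.4, "category_id": 1}]
#     _create_prediction_metadata_map(preds)  # -> {42: [<both prediction dicts>]}
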
def _log_confusion_matrix(experiment, trainer, curr_step, curr_epoch) -> None:
    """Log the confusion matrix to Comet experiment."""
    conf_mat = trainer.validator.confusion_matrix.matrix
    names = list(trainer.data["names"].values()) + ["background"]
    experiment.log_confusion_matrix(
        matrix=conf_mat, labels=names, max_categories=len(names), epoch=curr_epoch, step=curr_step
    )

def _log_images(experiment, image_paths, curr_step: int | None, annotations=None) -> None:
    """
    Log images to the experiment with optional annotations.

    This function logs images to a Comet ML experiment, optionally including annotation data for visualization
    such as bounding boxes or segmentation masks.

    Args:
        experiment (comet_ml.CometExperiment): The Comet ML experiment to log images to.
        image_paths (list[Path]): List of paths to images that will be logged.
        curr_step (int | None): Current training step/iteration for tracking in the experiment timeline.
        annotations (list[list[dict]], optional): Nested list of annotation dictionaries for each image. Each
            annotation contains visualization data like bounding boxes, labels, and confidence scores.
    """
    if annotations:
        for image_path, annotation in zip(image_paths, annotations):
            experiment.log_image(image_path, name=image_path.stem, step=curr_step, annotations=annotation)

    else:
        for image_path in image_paths:
            experiment.log_image(image_path, name=image_path.stem, step=curr_step)

def _log_image_predictions(experiment, validator, curr_step) -> None:
    """
    Log predicted boxes for a single image during training.

    This function logs image predictions to a Comet ML experiment during model validation. It processes
    validation data and formats both ground truth and prediction annotations for visualization in the Comet
    dashboard. The function respects configured limits on the number of images to log.

    Args:
        experiment (comet_ml.CometExperiment): The Comet ML experiment to log to.
        validator (BaseValidator): The validator instance containing validation data and predictions.
        curr_step (int): The current training step for logging timeline.

    Notes:
        This function uses global state to track the number of logged predictions across calls.
        It only logs predictions for supported tasks defined in COMET_SUPPORTED_TASKS.
        The number of logged images is limited by the COMET_MAX_IMAGE_PREDICTIONS environment variable.
    """
    global _comet_image_prediction_count

    task = validator.args.task
    if task not in COMET_SUPPORTED_TASKS:
        return

    jdict = validator.jdict
    if not jdict:
        return

    predictions_metadata_map = _create_prediction_metadata_map(jdict)
    dataloader = validator.dataloader
    class_label_map = validator.names
    class_map = getattr(validator, "class_map", None)

    batch_logging_interval = _get_eval_batch_logging_interval()
    max_image_predictions = _get_max_image_predictions_to_log()

    for batch_idx, batch in enumerate(dataloader):
        if (batch_idx + 1) % batch_logging_interval != 0:
            continue

        image_paths = batch["im_file"]
        for img_idx, image_path in enumerate(image_paths):
            if _comet_image_prediction_count >= max_image_predictions:
                return

            image_path = Path(image_path)
            annotations = _fetch_annotations(
                img_idx,
                image_path,
                batch,
                predictions_metadata_map,
                class_label_map,
                class_map=class_map,
            )
            _log_images(
                experiment,
                [image_path],
                curr_step,
                annotations=annotations,
            )
            _comet_image_prediction_count += 1

def _log_plots(experiment, trainer) -> None:
    """
    Log evaluation plots and label plots for the experiment.

    This function logs various evaluation plots and confusion matrices to the experiment tracking system. It handles
    different types of metrics (SegmentMetrics, PoseMetrics, DetMetrics, OBBMetrics) and logs the appropriate plots
    for each type.

    Args:
        experiment (comet_ml.CometExperiment): The Comet ML experiment to log plots to.
        trainer (ultralytics.engine.trainer.BaseTrainer): The trainer object containing validation metrics and save
            directory information.

    Examples:
        >>> from ultralytics.utils.callbacks.comet import _log_plots
        >>> _log_plots(experiment, trainer)
    """
    plot_filenames = None
    if isinstance(trainer.validator.metrics, SegmentMetrics):
        plot_filenames = [
            trainer.save_dir / f"{prefix}{plots}.png"
            for plots in EVALUATION_PLOT_NAMES
            for prefix in SEGMENT_METRICS_PLOT_PREFIX
        ]
    elif isinstance(trainer.validator.metrics, PoseMetrics):
        plot_filenames = [
            trainer.save_dir / f"{prefix}{plots}.png"
            for plots in EVALUATION_PLOT_NAMES
            for prefix in POSE_METRICS_PLOT_PREFIX
        ]
    elif isinstance(trainer.validator.metrics, (DetMetrics, OBBMetrics)):
        plot_filenames = [
            trainer.save_dir / f"{prefix}{plots}.png"
            for plots in EVALUATION_PLOT_NAMES
            for prefix in DETECTION_METRICS_PLOT_PREFIX
        ]

    if plot_filenames is not None:
        _log_images(experiment, plot_filenames, None)

    confusion_matrix_filenames = [trainer.save_dir / f"{plots}.png" for plots in CONFUSION_MATRIX_PLOT_NAMES]
    _log_images(experiment, confusion_matrix_filenames, None)

    if not isinstance(trainer.validator.metrics, ClassifyMetrics):
        label_plot_filenames = [trainer.save_dir / f"{labels}.jpg" for labels in LABEL_PLOT_NAMES]
        _log_images(experiment, label_plot_filenames, None)

def _log_model(experiment, trainer) -> None:
    """Log the best-trained model to Comet.ml."""
    model_name = _get_comet_model_name()
    experiment.log_model(model_name, file_or_folder=str(trainer.best), file_name="best.pt", overwrite=True)


def _log_image_batches(experiment, trainer, curr_step: int) -> None:
    """Log samples of image batches for train, validation, and test."""
    _log_images(experiment, trainer.save_dir.glob("train_batch*.jpg"), curr_step)
    _log_images(experiment, trainer.save_dir.glob("val_batch*.jpg"), curr_step)


def _log_asset(experiment, asset_path) -> None:
    """
    Log a specific asset file to the given experiment.

    This function facilitates logging an asset, such as a file, to the provided
    experiment. It enables integration with experiment tracking platforms.

    Args:
        experiment (comet_ml.CometExperiment): The experiment instance to which the asset will be logged.
        asset_path (Path): The file path of the asset to log.
    """
    experiment.log_asset(asset_path)


def _log_table(experiment, table_path) -> None:
    """
    Log a table to the provided experiment.

    This function is used to log a table file to the given experiment. The table
    is identified by its file path.

    Args:
        experiment (comet_ml.CometExperiment): The experiment object where the table file will be logged.
        table_path (Path): The file path of the table to be logged.
    """
    experiment.log_table(str(table_path))

def on_pretrain_routine_start(trainer) -> None:
    """Create or resume a CometML experiment at the start of a YOLO pre-training routine."""
    _resume_or_create_experiment(trainer.args)


def on_train_epoch_end(trainer) -> None:
    """Log metrics and save batch images at the end of training epochs."""
    experiment = comet_ml.get_running_experiment()
    if not experiment:
        return

    metadata = _fetch_trainer_metadata(trainer)
    curr_epoch = metadata["curr_epoch"]
    curr_step = metadata["curr_step"]

    experiment.log_metrics(trainer.label_loss_items(trainer.tloss, prefix="train"), step=curr_step, epoch=curr_epoch)


def on_fit_epoch_end(trainer) -> None:
    """
    Log model assets at the end of each epoch during training.

    This function is called at the end of each training epoch to log metrics, learning rates, and model information
    to a Comet ML experiment. It also logs model assets, confusion matrices, and image predictions based on
    configuration settings.

    The function retrieves the current Comet ML experiment and logs various training metrics. If it's the first epoch,
    it also logs model information. On specified save intervals, it logs the model, confusion matrix (if enabled),
    and image predictions (if enabled).

    Args:
        trainer (BaseTrainer): The YOLO trainer object containing training state, metrics, and configuration.

    Examples:
        >>> # Inside a training loop
        >>> on_fit_epoch_end(trainer)  # Log metrics and assets to Comet ML
    """
    experiment = comet_ml.get_running_experiment()
    if not experiment:
        return

    metadata = _fetch_trainer_metadata(trainer)
    curr_epoch = metadata["curr_epoch"]
    curr_step = metadata["curr_step"]
    save_assets = metadata["save_assets"]

    experiment.log_metrics(trainer.metrics, step=curr_step, epoch=curr_epoch)
    experiment.log_metrics(trainer.lr, step=curr_step, epoch=curr_epoch)
    if curr_epoch == 1:
        from ultralytics.utils.torch_utils import model_info_for_loggers

        experiment.log_metrics(model_info_for_loggers(trainer), step=curr_step, epoch=curr_epoch)

    if not save_assets:
        return

    _log_model(experiment, trainer)
    if _should_log_confusion_matrix():
        _log_confusion_matrix(experiment, trainer, curr_step, curr_epoch)
    if _should_log_image_predictions():
        _log_image_predictions(experiment, trainer.validator, curr_step)


def on_train_end(trainer) -> None:
    """Perform operations at the end of training."""
    experiment = comet_ml.get_running_experiment()
    if not experiment:
        return

    metadata = _fetch_trainer_metadata(trainer)
    curr_epoch = metadata["curr_epoch"]
    curr_step = metadata["curr_step"]
    plots = trainer.args.plots

    _log_model(experiment, trainer)
    if plots:
        _log_plots(experiment, trainer)

    _log_confusion_matrix(experiment, trainer, curr_step, curr_epoch)
    _log_image_predictions(experiment, trainer.validator, curr_step)
    _log_image_batches(experiment, trainer, curr_step)

    # Log results table
    table_path = trainer.save_dir / RESULTS_TABLE_NAME
    if table_path.exists():
        _log_table(experiment, table_path)

    # Log arguments YAML
    args_path = trainer.save_dir / ARGS_YAML_NAME
    if args_path.exists():
        _log_asset(experiment, args_path)

    experiment.end()

    global _comet_image_prediction_count
    _comet_image_prediction_count = 0


callbacks = (
    {
        "on_pretrain_routine_start": on_pretrain_routine_start,
        "on_train_epoch_end": on_train_epoch_end,
        "on_fit_epoch_end": on_fit_epoch_end,
        "on_train_end": on_train_end,
    }
    if comet_ml
    else {}
)
ultralytics/utils/callbacks/dvc.py (new file, 202 lines)
@@ -0,0 +1,202 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

from pathlib import Path

from ultralytics.utils import LOGGER, SETTINGS, TESTS_RUNNING, checks

try:
    assert not TESTS_RUNNING  # do not log pytest
    assert SETTINGS["dvc"] is True  # verify integration is enabled
    import dvclive

    assert checks.check_version("dvclive", "2.11.0", verbose=True)

    import os
    import re

    # DVCLive logger instance
    live = None
    _processed_plots = {}

    # `on_fit_epoch_end` is also called on final validation (this probably needs to be fixed). For now, this flag is
    # how we distinguish final evaluation of the best model from last-epoch validation.
    _training_epoch = False

except (ImportError, AssertionError, TypeError):
    dvclive = None

def _log_images(path: Path, prefix: str = "") -> None:
|
||||
"""
|
||||
Log images at specified path with an optional prefix using DVCLive.
|
||||
|
||||
This function logs images found at the given path to DVCLive, organizing them by batch to enable slider
|
||||
functionality in the UI. It processes image filenames to extract batch information and restructures the path
|
||||
accordingly.
|
||||
|
||||
Args:
|
||||
path (Path): Path to the image file to be logged.
|
||||
prefix (str, optional): Optional prefix to add to the image name when logging.
|
||||
|
||||
Examples:
|
||||
>>> from pathlib import Path
|
||||
>>> _log_images(Path("runs/train/exp/val_batch0_pred.jpg"), prefix="validation")
|
||||
"""
|
||||
if live:
|
||||
name = path.name
|
||||
|
||||
# Group images by batch to enable sliders in UI
|
||||
if m := re.search(r"_batch(\d+)", name):
|
||||
ni = m[1]
|
||||
new_stem = re.sub(r"_batch(\d+)", "_batch", path.stem)
|
||||
name = (Path(new_stem) / ni).with_suffix(path.suffix)
|
||||
|
||||
live.log_image(os.path.join(prefix, name), path)
|
||||
|
||||
|
||||
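# NOTE (illustrative, not part of this commit): the regrouping above turns a flat filename
# into a per-batch directory so DVCLive renders one slider per plot family, e.g.:
#
#     from pathlib import Path
#     import re
#     path = Path("val_batch2_pred.jpg")
#     ni = re.search(r"_batch(\d+)", path.name)[1]             # -> "2"
#     new_stem = re.sub(r"_batch(\d+)", "_batch", path.stem)   # -> "val_batch_pred"
#     (Path(new_stem) / ni).with_suffix(path.suffix)           # -> Path("val_batch_pred/2.jpg")
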
def _log_plots(plots: dict, prefix: str = "") -> None:
|
||||
"""
|
||||
Log plot images for training progress if they have not been previously processed.
|
||||
|
||||
Args:
|
||||
plots (dict): Dictionary containing plot information with timestamps.
|
||||
prefix (str, optional): Optional prefix to add to the logged image paths.
|
||||
"""
|
||||
for name, params in plots.items():
|
||||
timestamp = params["timestamp"]
|
||||
if _processed_plots.get(name) != timestamp:
|
||||
_log_images(name, prefix)
|
||||
_processed_plots[name] = timestamp
|
||||
|
||||
|
||||
def _log_confusion_matrix(validator) -> None:
    """
    Log confusion matrix for a validator using DVCLive.

    This function processes the confusion matrix from a validator object and logs it to DVCLive by converting
    the matrix into lists of target and prediction labels.

    Args:
        validator (BaseValidator): The validator object containing the confusion matrix and class names. Must have
            attributes: confusion_matrix.matrix, confusion_matrix.task, and names.
    """
    targets = []
    preds = []
    matrix = validator.confusion_matrix.matrix
    names = list(validator.names.values())
    if validator.confusion_matrix.task == "detect":
        names += ["background"]

    for ti, pred in enumerate(matrix.T.astype(int)):
        for pi, num in enumerate(pred):
            targets.extend([names[ti]] * num)
            preds.extend([names[pi]] * num)

    live.log_sklearn_plot("confusion_matrix", targets, preds, name="cf.json", normalized=True)

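# NOTE (illustrative, not part of this commit): DVCLive's sklearn plot expects per-sample
# label pairs, so the count matrix is unrolled. For a hypothetical 2-class matrix
# (rows = predicted, columns = true, per the transpose above) with names ["cat", "dog"]:
#
#     import numpy as np
#     matrix = np.array([[3, 1],   # 3 cats predicted cat, 1 dog predicted cat
#                        [0, 2]])  # 2 dogs predicted dog
#     # unrolling yields lists of length matrix.sum() == 6:
#     # targets = ["cat", "cat", "cat", "dog", "dog", "dog"]
#     # preds   = ["cat", "cat", "cat", "cat", "dog", "dog"]
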
def on_pretrain_routine_start(trainer) -> None:
    """Initialize DVCLive logger for training metadata during pre-training routine."""
    try:
        global live
        live = dvclive.Live(save_dvc_exp=True, cache_images=True)
        LOGGER.info("DVCLive is detected and auto logging is enabled (run 'yolo settings dvc=False' to disable).")
    except Exception as e:
        LOGGER.warning(f"DVCLive installed but not initialized correctly, not logging this run. {e}")


def on_pretrain_routine_end(trainer) -> None:
    """Log plots related to the training process at the end of the pretraining routine."""
    _log_plots(trainer.plots, "train")


def on_train_start(trainer) -> None:
    """Log the training parameters if DVCLive logging is active."""
    if live:
        live.log_params(trainer.args)


def on_train_epoch_start(trainer) -> None:
    """Set the global variable _training_epoch to True at the start of each training epoch."""
    global _training_epoch
    _training_epoch = True

def on_fit_epoch_end(trainer) -> None:
    """
    Log training metrics, model info, and advance to next step at the end of each fit epoch.

    This function is called at the end of each fit epoch during training. It logs various metrics including
    training loss items, validation metrics, and learning rates. On the first epoch, it also logs model
    information. Additionally, it logs training and validation plots and advances the DVCLive step counter.

    Args:
        trainer (BaseTrainer): The trainer object containing training state, metrics, and plots.

    Notes:
        This function only performs logging operations when DVCLive logging is active and during a training epoch.
        The global variable _training_epoch is used to track whether the current epoch is a training epoch.
    """
    global _training_epoch
    if live and _training_epoch:
        all_metrics = {**trainer.label_loss_items(trainer.tloss, prefix="train"), **trainer.metrics, **trainer.lr}
        for metric, value in all_metrics.items():
            live.log_metric(metric, value)

        if trainer.epoch == 0:
            from ultralytics.utils.torch_utils import model_info_for_loggers

            for metric, value in model_info_for_loggers(trainer).items():
                live.log_metric(metric, value, plot=False)

        _log_plots(trainer.plots, "train")
        _log_plots(trainer.validator.plots, "val")

        live.next_step()
        _training_epoch = False


def on_train_end(trainer) -> None:
    """
    Log best metrics, plots, and confusion matrix at the end of training.

    This function is called at the conclusion of the training process to log final metrics, visualizations, and
    model artifacts if DVCLive logging is active. It captures the best model performance metrics, training plots,
    validation plots, and confusion matrix for later analysis.

    Args:
        trainer (BaseTrainer): The trainer object containing training state, metrics, and validation results.

    Examples:
        >>> # Inside a custom training loop
        >>> from ultralytics.utils.callbacks.dvc import on_train_end
        >>> on_train_end(trainer)  # Log final metrics and artifacts
    """
    if live:
        # At the end, log the best metrics. This runs the validator on the best model internally.
        all_metrics = {**trainer.label_loss_items(trainer.tloss, prefix="train"), **trainer.metrics, **trainer.lr}
        for metric, value in all_metrics.items():
            live.log_metric(metric, value, plot=False)

        _log_plots(trainer.plots, "val")
        _log_plots(trainer.validator.plots, "val")
        _log_confusion_matrix(trainer.validator)

        if trainer.best.exists():
            live.log_artifact(trainer.best, copy=True, type="model")

        live.end()


callbacks = (
    {
        "on_pretrain_routine_start": on_pretrain_routine_start,
        "on_pretrain_routine_end": on_pretrain_routine_end,
        "on_train_start": on_train_start,
        "on_train_epoch_start": on_train_epoch_start,
        "on_fit_epoch_end": on_fit_epoch_end,
        "on_train_end": on_train_end,
    }
    if dvclive
    else {}
)
ultralytics/utils/callbacks/hub.py (new file, 110 lines)
@@ -0,0 +1,110 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

import json
from time import time

from ultralytics.hub import HUB_WEB_ROOT, PREFIX, HUBTrainingSession
from ultralytics.utils import LOGGER, RANK, SETTINGS
from ultralytics.utils.events import events


def on_pretrain_routine_start(trainer):
    """Create a remote Ultralytics HUB session to log local model training."""
    if RANK in {-1, 0} and SETTINGS["hub"] is True and SETTINGS["api_key"] and trainer.hub_session is None:
        trainer.hub_session = HUBTrainingSession.create_session(trainer.args.model, trainer.args)


def on_pretrain_routine_end(trainer):
    """Initialize timers for upload rate limiting before training begins."""
    if session := getattr(trainer, "hub_session", None):
        # Start timer for upload rate limit
        session.timers = {"metrics": time(), "ckpt": time()}  # start timer for session rate limiting

def on_fit_epoch_end(trainer):
    """Upload training progress metrics to Ultralytics HUB at the end of each epoch."""
    if session := getattr(trainer, "hub_session", None):
        # Upload metrics after validation ends
        all_plots = {
            **trainer.label_loss_items(trainer.tloss, prefix="train"),
            **trainer.metrics,
        }
        if trainer.epoch == 0:
            from ultralytics.utils.torch_utils import model_info_for_loggers

            all_plots = {**all_plots, **model_info_for_loggers(trainer)}

        session.metrics_queue[trainer.epoch] = json.dumps(all_plots)

        # If any metrics failed to upload previously, add them to the queue to attempt uploading again
        if session.metrics_upload_failed_queue:
            session.metrics_queue.update(session.metrics_upload_failed_queue)

        if time() - session.timers["metrics"] > session.rate_limits["metrics"]:
            session.upload_metrics()
            session.timers["metrics"] = time()  # reset timer
            session.metrics_queue = {}  # reset queue


def on_model_save(trainer):
    """Upload model checkpoints to Ultralytics HUB with rate limiting."""
    if session := getattr(trainer, "hub_session", None):
        # Upload checkpoints with rate limiting
        is_best = trainer.best_fitness == trainer.fitness
        if time() - session.timers["ckpt"] > session.rate_limits["ckpt"]:
            LOGGER.info(f"{PREFIX}Uploading checkpoint {HUB_WEB_ROOT}/models/{session.model.id}")
            session.upload_model(trainer.epoch, trainer.last, is_best)
            session.timers["ckpt"] = time()  # reset timer

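# NOTE (illustrative, not part of this commit): both uploaders follow the same
# queue-then-flush rate-limit pattern: results accumulate locally on each call and are
# only sent once the per-session interval has elapsed. The core of the pattern:
#
#     if time() - last_flush > interval:  # `interval` comes from session.rate_limits
#         flush_queue()                   # e.g. session.upload_metrics()
#         last_flush = time()
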
def on_train_end(trainer):
    """Upload final model and metrics to Ultralytics HUB at the end of training."""
    if session := getattr(trainer, "hub_session", None):
        # Upload final model and metrics with exponential backoff
        LOGGER.info(f"{PREFIX}Syncing final model...")
        session.upload_model(
            trainer.epoch,
            trainer.best,
            map=trainer.metrics.get("metrics/mAP50-95(B)", 0),
            final=True,
        )
        session.alive = False  # stop heartbeats
        LOGGER.info(f"{PREFIX}Done ✅\n{PREFIX}View model at {session.model_url} 🚀")


def on_train_start(trainer):
    """Run events on train start."""
    events(trainer.args, trainer.device)


def on_val_start(validator):
    """Run events on validation start."""
    if not validator.training:
        events(validator.args, validator.device)


def on_predict_start(predictor):
    """Run events on predict start."""
    events(predictor.args, predictor.device)


def on_export_start(exporter):
    """Run events on export start."""
    events(exporter.args, exporter.device)


callbacks = (
    {
        "on_pretrain_routine_start": on_pretrain_routine_start,
        "on_pretrain_routine_end": on_pretrain_routine_end,
        "on_fit_epoch_end": on_fit_epoch_end,
        "on_model_save": on_model_save,
        "on_train_end": on_train_end,
        "on_train_start": on_train_start,
        "on_val_start": on_val_start,
        "on_predict_start": on_predict_start,
        "on_export_start": on_export_start,
    }
    if SETTINGS["hub"] is True
    else {}
)
ultralytics/utils/callbacks/mlflow.py (new file, 135 lines)
@@ -0,0 +1,135 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
"""
MLflow Logging for Ultralytics YOLO.

This module enables MLflow logging for Ultralytics YOLO. It logs metrics, parameters, and model artifacts.
For setting up, a tracking URI should be specified. The logging can be customized using environment variables.

Commands:
    1. To set a project name:
        `export MLFLOW_EXPERIMENT_NAME=<your_experiment_name>` or use the project=<project> argument

    2. To set a run name:
        `export MLFLOW_RUN=<your_run_name>` or use the name=<name> argument

    3. To start a local MLflow server:
        mlflow server --backend-store-uri runs/mlflow
        By default this starts a local server at http://127.0.0.1:5000.
        To specify a different URI, set the MLFLOW_TRACKING_URI environment variable.

    4. To kill all running MLflow server instances:
        ps aux | grep 'mlflow' | grep -v 'grep' | awk '{print $2}' | xargs kill -9
"""

from ultralytics.utils import LOGGER, RUNS_DIR, SETTINGS, TESTS_RUNNING, colorstr

try:
    import os

    assert not TESTS_RUNNING or "test_mlflow" in os.environ.get("PYTEST_CURRENT_TEST", "")  # do not log pytest
    assert SETTINGS["mlflow"] is True  # verify integration is enabled
    import mlflow

    assert hasattr(mlflow, "__version__")  # verify package is not directory
    from pathlib import Path

    PREFIX = colorstr("MLflow: ")

except (ImportError, AssertionError):
    mlflow = None

def sanitize_dict(x: dict) -> dict:
|
||||
"""Sanitize dictionary keys by removing parentheses and converting values to floats."""
|
||||
return {k.replace("(", "").replace(")", ""): float(v) for k, v in x.items()}
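
# Illustrative example (values assumed): parentheses are stripped from keys and values are cast to float, e.g.
#   sanitize_dict({"metrics/mAP50(B)": "0.5"}) -> {"metrics/mAP50B": 0.5}
# keeping metric names free of characters that MLflow may reject.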


def on_pretrain_routine_end(trainer):
    """
    Log training parameters to MLflow at the end of the pretraining routine.

    This function sets up MLflow logging based on environment variables and trainer arguments. It sets the tracking URI,
    experiment name, and run name, then starts the MLflow run if not already active. It finally logs the parameters
    from the trainer.

    Args:
        trainer (ultralytics.engine.trainer.BaseTrainer): The training object with arguments and parameters to log.

    Environment Variables:
        MLFLOW_TRACKING_URI: The URI for MLflow tracking. If not set, defaults to 'runs/mlflow'.
        MLFLOW_EXPERIMENT_NAME: The name of the MLflow experiment. If not set, defaults to trainer.args.project.
        MLFLOW_RUN: The name of the MLflow run. If not set, defaults to trainer.args.name.
        MLFLOW_KEEP_RUN_ACTIVE: Boolean indicating whether to keep the MLflow run active after training ends.
    """
    global mlflow

    uri = os.environ.get("MLFLOW_TRACKING_URI") or str(RUNS_DIR / "mlflow")
    LOGGER.debug(f"{PREFIX}tracking uri: {uri}")
    mlflow.set_tracking_uri(uri)

    # Set experiment and run names
    experiment_name = os.environ.get("MLFLOW_EXPERIMENT_NAME") or trainer.args.project or "/Shared/Ultralytics"
    run_name = os.environ.get("MLFLOW_RUN") or trainer.args.name
    mlflow.set_experiment(experiment_name)

    mlflow.autolog()
    try:
        active_run = mlflow.active_run() or mlflow.start_run(run_name=run_name)
        LOGGER.info(f"{PREFIX}logging run_id({active_run.info.run_id}) to {uri}")
        if Path(uri).is_dir():
            LOGGER.info(f"{PREFIX}view at http://127.0.0.1:5000 with 'mlflow server --backend-store-uri {uri}'")
        LOGGER.info(f"{PREFIX}disable with 'yolo settings mlflow=False'")
        mlflow.log_params(dict(trainer.args))
    except Exception as e:
        LOGGER.warning(f"{PREFIX}Failed to initialize: {e}")
        LOGGER.warning(f"{PREFIX}Not tracking this run")


def on_train_epoch_end(trainer):
    """Log training metrics at the end of each train epoch to MLflow."""
    if mlflow:
        mlflow.log_metrics(
            metrics={
                **sanitize_dict(trainer.lr),
                **sanitize_dict(trainer.label_loss_items(trainer.tloss, prefix="train")),
            },
            step=trainer.epoch,
        )


def on_fit_epoch_end(trainer):
    """Log training metrics at the end of each fit epoch to MLflow."""
    if mlflow:
        mlflow.log_metrics(metrics=sanitize_dict(trainer.metrics), step=trainer.epoch)


def on_train_end(trainer):
    """Log model artifacts at the end of training."""
    if not mlflow:
        return
    mlflow.log_artifact(str(trainer.best.parent))  # log save_dir/weights directory with best.pt and last.pt
    for f in trainer.save_dir.glob("*"):  # log all other files in save_dir
        if f.suffix in {".png", ".jpg", ".csv", ".pt", ".yaml"}:
            mlflow.log_artifact(str(f))
    keep_run_active = os.environ.get("MLFLOW_KEEP_RUN_ACTIVE", "False").lower() == "true"
    if keep_run_active:
        LOGGER.info(f"{PREFIX}mlflow run still alive, remember to close it using mlflow.end_run()")
    else:
        mlflow.end_run()
        LOGGER.debug(f"{PREFIX}mlflow run ended")

    LOGGER.info(
        f"{PREFIX}results logged to {mlflow.get_tracking_uri()}\n{PREFIX}disable with 'yolo settings mlflow=False'"
    )


callbacks = (
    {
        "on_pretrain_routine_end": on_pretrain_routine_end,
        "on_train_epoch_end": on_train_epoch_end,
        "on_fit_epoch_end": on_fit_epoch_end,
        "on_train_end": on_train_end,
    }
    if mlflow
    else {}
)
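
For reference, a minimal end-to-end way to exercise this logger; the weights and dataset names below are illustrative placeholders, and this assumes mlflow is installed and the 'mlflow' setting is enabled:

import os

os.environ["MLFLOW_EXPERIMENT_NAME"] = "yolo-experiments"  # optional, else trainer.args.project is used
os.environ["MLFLOW_RUN"] = "baseline"  # optional, else trainer.args.name is used

from ultralytics import YOLO

model = YOLO("yolo11n.pt")  # placeholder weights
model.train(data="coco8.yaml", epochs=3)  # params, metrics, and artifacts flow through the hooks above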
134
ultralytics/utils/callbacks/neptune.py
Normal file
@@ -0,0 +1,134 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

from ultralytics.utils import LOGGER, SETTINGS, TESTS_RUNNING

try:
    assert not TESTS_RUNNING  # do not log pytest
    assert SETTINGS["neptune"] is True  # verify integration is enabled

    import neptune
    from neptune.types import File

    assert hasattr(neptune, "__version__")

    run = None  # NeptuneAI experiment logger instance

except (ImportError, AssertionError):
    neptune = None


def _log_scalars(scalars: dict, step: int = 0) -> None:
    """
    Log scalars to the NeptuneAI experiment logger.

    Args:
        scalars (dict): Dictionary of scalar values to log to NeptuneAI.
        step (int, optional): The current step or iteration number for logging.

    Examples:
        >>> metrics = {"mAP": 0.85, "loss": 0.32}
        >>> _log_scalars(metrics, step=100)
    """
    if run:
        for k, v in scalars.items():
            run[k].append(value=v, step=step)


def _log_images(imgs_dict: dict, group: str = "") -> None:
    """
    Log images to the NeptuneAI experiment logger.

    This function logs image data to Neptune.ai when a valid Neptune run is active. Images are organized
    under the specified group name.

    Args:
        imgs_dict (dict): Dictionary of images to log, with keys as image names and values as image data.
        group (str, optional): Group name to organize images under in the Neptune UI.

    Examples:
        >>> # Log validation images
        >>> _log_images({"val_batch": img_tensor}, group="validation")
    """
    if run:
        for k, v in imgs_dict.items():
            run[f"{group}/{k}"].upload(File(v))


def _log_plot(title: str, plot_path: str) -> None:
    """Log plots to the NeptuneAI experiment logger."""
    import matplotlib.image as mpimg
    import matplotlib.pyplot as plt

    img = mpimg.imread(plot_path)
    fig = plt.figure()
    ax = fig.add_axes([0, 0, 1, 1], frameon=False, aspect="auto", xticks=[], yticks=[])  # no ticks
    ax.imshow(img)
    run[f"Plots/{title}"].upload(fig)


def on_pretrain_routine_start(trainer) -> None:
    """Initialize NeptuneAI run and log hyperparameters before training starts."""
    try:
        global run
        run = neptune.init_run(
            project=trainer.args.project or "Ultralytics",
            name=trainer.args.name,
            tags=["Ultralytics"],
        )
        run["Configuration/Hyperparameters"] = {k: "" if v is None else v for k, v in vars(trainer.args).items()}
    except Exception as e:
        LOGGER.warning(f"NeptuneAI installed but not initialized correctly, not logging this run. {e}")


def on_train_epoch_end(trainer) -> None:
    """Log training metrics and learning rate at the end of each training epoch."""
    _log_scalars(trainer.label_loss_items(trainer.tloss, prefix="train"), trainer.epoch + 1)
    _log_scalars(trainer.lr, trainer.epoch + 1)
    if trainer.epoch == 1:
        _log_images({f.stem: str(f) for f in trainer.save_dir.glob("train_batch*.jpg")}, "Mosaic")


def on_fit_epoch_end(trainer) -> None:
    """Log model info and validation metrics at the end of each fit epoch."""
    if run and trainer.epoch == 0:
        from ultralytics.utils.torch_utils import model_info_for_loggers

        run["Configuration/Model"] = model_info_for_loggers(trainer)
    _log_scalars(trainer.metrics, trainer.epoch + 1)


def on_val_end(validator) -> None:
    """Log validation images at the end of validation."""
    if run:
        # Log val_labels and val_pred
        _log_images({f.stem: str(f) for f in validator.save_dir.glob("val*.jpg")}, "Validation")


def on_train_end(trainer) -> None:
    """Log final results, plots, and model weights at the end of training."""
    if run:
        # Log final results, confusion matrix + PR plots
        files = [
            "results.png",
            "confusion_matrix.png",
            "confusion_matrix_normalized.png",
            *(f"{x}_curve.png" for x in ("F1", "PR", "P", "R")),
        ]
        files = [(trainer.save_dir / f) for f in files if (trainer.save_dir / f).exists()]  # filter
        for f in files:
            _log_plot(title=f.stem, plot_path=f)
        # Log the final model
        run[f"weights/{trainer.args.name or trainer.args.task}/{trainer.best.name}"].upload(File(str(trainer.best)))


callbacks = (
    {
        "on_pretrain_routine_start": on_pretrain_routine_start,
        "on_train_epoch_end": on_train_epoch_end,
        "on_fit_epoch_end": on_fit_epoch_end,
        "on_val_end": on_val_end,
        "on_train_end": on_train_end,
    }
    if neptune
    else {}
)
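
A hedged note on enabling and reading this integration; the settings toggle and namespace layout below are illustrative, inferred from the hooks above rather than documented behavior:

# Enable the integration once (illustrative, mirrors the mlflow toggle used elsewhere in this commit):
#   yolo settings neptune=True
# Credentials are read by neptune.init_run() from the standard environment
# variables NEPTUNE_API_TOKEN and, optionally, NEPTUNE_PROJECT.
#
# Namespace layout produced by the hooks above (illustrative):
#   Configuration/Hyperparameters    -> trainer args
#   Configuration/Model              -> model info logged at epoch 0
#   train/*, val/*, metrics/*        -> scalar series via _log_scalars()
#   Mosaic/*, Validation/*           -> images via _log_images()
#   Plots/*                          -> final figures via _log_plot()
#   weights/<run name>/<best.pt>     -> final model upload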
73
ultralytics/utils/callbacks/platform.py
Normal file
@@ -0,0 +1,73 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

from ultralytics.utils import RANK, SETTINGS


def on_pretrain_routine_start(trainer):
    """Initialize and start console logging immediately at the very beginning."""
    if RANK in {-1, 0}:
        from ultralytics.utils.logger import DEFAULT_LOG_PATH, ConsoleLogger, SystemLogger

        trainer.system_logger = SystemLogger()
        trainer.console_logger = ConsoleLogger(DEFAULT_LOG_PATH)
        trainer.console_logger.start_capture()


def on_pretrain_routine_end(trainer):
    """Handle pre-training routine completion event."""
    pass


def on_fit_epoch_end(trainer):
    """Handle end of training epoch event and collect system metrics."""
    if RANK in {-1, 0} and hasattr(trainer, "system_logger"):
        system_metrics = trainer.system_logger.get_metrics()
        print(system_metrics)  # for debug


def on_model_save(trainer):
    """Handle model checkpoint save event."""
    pass


def on_train_end(trainer):
    """Stop console capture and finalize logs."""
    if logger := getattr(trainer, "console_logger", None):
        logger.stop_capture()


def on_train_start(trainer):
    """Handle training start event."""
    pass


def on_val_start(validator):
    """Handle validation start event."""
    pass


def on_predict_start(predictor):
    """Handle prediction start event."""
    pass


def on_export_start(exporter):
    """Handle model export start event."""
    pass


callbacks = (
    {
        "on_pretrain_routine_start": on_pretrain_routine_start,
        "on_pretrain_routine_end": on_pretrain_routine_end,
        "on_fit_epoch_end": on_fit_epoch_end,
        "on_model_save": on_model_save,
        "on_train_end": on_train_end,
        "on_train_start": on_train_start,
        "on_val_start": on_val_start,
        "on_predict_start": on_predict_start,
        "on_export_start": on_export_start,
    }
    if SETTINGS.get("platform", False) is True  # disabled for debugging
    else {}
)
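
ConsoleLogger lives in ultralytics.utils.logger, which is not part of this diff; as a rough mental model only (an assumption-based sketch, not the actual implementation), start_capture()/stop_capture() can be thought of as tee-ing stdout to a file:

import sys


class TeeLogger:
    """Minimal illustrative stand-in for the start_capture()/stop_capture() pattern."""

    def __init__(self, path):
        self.path, self._stdout, self._file = path, None, None

    def write(self, s):
        self._stdout.write(s)  # keep printing to the real console
        self._file.write(s)  # and persist a copy to the log file

    def flush(self):
        self._stdout.flush()
        self._file.flush()

    def start_capture(self):
        self._stdout, self._file = sys.stdout, open(self.path, "a", encoding="utf-8")
        sys.stdout = self  # route subsequent prints through this object

    def stop_capture(self):
        sys.stdout = self._stdout  # restore the original stream
        self._file.close()


# Usage (illustrative):
#   logger = TeeLogger("train_console.log")
#   logger.start_capture()
#   print("captured to console and file")
#   logger.stop_capture()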
43
ultralytics/utils/callbacks/raytune.py
Normal file
@@ -0,0 +1,43 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

from ultralytics.utils import SETTINGS

try:
    assert SETTINGS["raytune"] is True  # verify integration is enabled
    import ray
    from ray import tune
    from ray.air import session

except (ImportError, AssertionError):
    tune = None


def on_fit_epoch_end(trainer):
    """
    Report training metrics to Ray Tune at epoch end when a Ray session is active.

    Captures metrics from the trainer object and sends them to Ray Tune with the current epoch number,
    enabling hyperparameter tuning optimization. Only executes when within an active Ray Tune session.

    Args:
        trainer (ultralytics.engine.trainer.BaseTrainer): The Ultralytics trainer object containing metrics and epochs.

    Examples:
        >>> # Called automatically by the Ultralytics training loop
        >>> on_fit_epoch_end(trainer)

    References:
        Ray Tune docs: https://docs.ray.io/en/latest/tune/index.html
    """
    if ray.train._internal.session.get_session():  # check if Ray Tune session is active
        metrics = trainer.metrics
        session.report({**metrics, **{"epoch": trainer.epoch + 1}})


callbacks = (
    {
        "on_fit_epoch_end": on_fit_epoch_end,
    }
    if tune
    else {}
)
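
For context, a hedged sketch of the tuning side that consumes these session.report() calls; the trainable and search space below are illustrative placeholders, not the Ultralytics tuner:

from ray import tune


def trainable(config):
    # In real use this would launch an Ultralytics training run whose
    # on_fit_epoch_end hook (above) reports metrics to Ray each epoch.
    ...


tuner = tune.Tuner(
    trainable,
    param_space={"lr0": tune.loguniform(1e-5, 1e-1)},  # illustrative hyperparameter
    tune_config=tune.TuneConfig(metric="metrics/mAP50-95(B)", mode="max", num_samples=8),
)
results = tuner.fit()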
131
ultralytics/utils/callbacks/tensorboard.py
Normal file
@@ -0,0 +1,131 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

from ultralytics.utils import LOGGER, SETTINGS, TESTS_RUNNING, colorstr, torch_utils

try:
    assert not TESTS_RUNNING  # do not log pytest
    assert SETTINGS["tensorboard"] is True  # verify integration is enabled
    WRITER = None  # TensorBoard SummaryWriter instance
    PREFIX = colorstr("TensorBoard: ")

    # Imports below only required if TensorBoard enabled
    import warnings
    from copy import deepcopy

    import torch
    from torch.utils.tensorboard import SummaryWriter

except (ImportError, AssertionError, TypeError, AttributeError):
    # TypeError for handling 'Descriptors cannot not be created directly.' protobuf errors in Windows
    # AttributeError: module 'tensorflow' has no attribute 'io' if 'tensorflow' not installed
    SummaryWriter = None


def _log_scalars(scalars: dict, step: int = 0) -> None:
    """
    Log scalar values to TensorBoard.

    Args:
        scalars (dict): Dictionary of scalar values to log to TensorBoard. Keys are scalar names and values are the
            corresponding scalar values.
        step (int): Global step value to record with the scalar values. Used for x-axis in TensorBoard graphs.

    Examples:
        Log training metrics
        >>> metrics = {"loss": 0.5, "accuracy": 0.95}
        >>> _log_scalars(metrics, step=100)
    """
    if WRITER:
        for k, v in scalars.items():
            WRITER.add_scalar(k, v, step)


def _log_tensorboard_graph(trainer) -> None:
    """
    Log model graph to TensorBoard.

    This function attempts to visualize the model architecture in TensorBoard by tracing the model with a dummy input
    tensor. It first tries a simple method suitable for YOLO models, and if that fails, falls back to a more complex
    approach for models like RTDETR that may require special handling.

    Args:
        trainer (ultralytics.engine.trainer.BaseTrainer): The trainer object containing the model to visualize.
            Must have attributes model and args with imgsz.

    Notes:
        This function requires TensorBoard integration to be enabled and the global WRITER to be initialized.
        It handles potential warnings from the PyTorch JIT tracer and attempts to gracefully handle different
        model architectures.
    """
    # Input image
    imgsz = trainer.args.imgsz
    imgsz = (imgsz, imgsz) if isinstance(imgsz, int) else imgsz
    p = next(trainer.model.parameters())  # for device, type
    im = torch.zeros((1, 3, *imgsz), device=p.device, dtype=p.dtype)  # input image (must be zeros, not empty)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=UserWarning)  # suppress jit trace warning
        warnings.simplefilter("ignore", category=torch.jit.TracerWarning)  # suppress jit trace warning

        # Try simple method first (YOLO)
        try:
            trainer.model.eval()  # place in .eval() mode to avoid BatchNorm statistics changes
            WRITER.add_graph(torch.jit.trace(torch_utils.unwrap_model(trainer.model), im, strict=False), [])
            LOGGER.info(f"{PREFIX}model graph visualization added ✅")
            return

        except Exception:
            # Fallback to TorchScript export steps (RTDETR)
            try:
                model = deepcopy(torch_utils.unwrap_model(trainer.model))
                model.eval()
                model = model.fuse(verbose=False)
                for m in model.modules():
                    if hasattr(m, "export"):  # Detect, RTDETRDecoder (Segment and Pose use Detect base class)
                        m.export = True
                        m.format = "torchscript"
                model(im)  # dry run
                WRITER.add_graph(torch.jit.trace(model, im, strict=False), [])
                LOGGER.info(f"{PREFIX}model graph visualization added ✅")
            except Exception as e:
                LOGGER.warning(f"{PREFIX}TensorBoard graph visualization failure {e}")


def on_pretrain_routine_start(trainer) -> None:
    """Initialize TensorBoard logging with SummaryWriter."""
    if SummaryWriter:
        try:
            global WRITER
            WRITER = SummaryWriter(str(trainer.save_dir))
            LOGGER.info(f"{PREFIX}Start with 'tensorboard --logdir {trainer.save_dir}', view at http://localhost:6006/")
        except Exception as e:
            LOGGER.warning(f"{PREFIX}TensorBoard not initialized correctly, not logging this run. {e}")


def on_train_start(trainer) -> None:
    """Log TensorBoard graph."""
    if WRITER:
        _log_tensorboard_graph(trainer)


def on_train_epoch_end(trainer) -> None:
    """Log scalar statistics at the end of a training epoch."""
    _log_scalars(trainer.label_loss_items(trainer.tloss, prefix="train"), trainer.epoch + 1)
    _log_scalars(trainer.lr, trainer.epoch + 1)


def on_fit_epoch_end(trainer) -> None:
    """Log epoch metrics at end of training epoch."""
    _log_scalars(trainer.metrics, trainer.epoch + 1)


callbacks = (
    {
        "on_pretrain_routine_start": on_pretrain_routine_start,
        "on_train_start": on_train_start,
        "on_fit_epoch_end": on_fit_epoch_end,
        "on_train_epoch_end": on_train_epoch_end,
    }
    if SummaryWriter
    else {}
)
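
As a quick sanity check of the scalar-logging path (illustrative values; assumes a torch install with TensorBoard support):

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter("runs/demo")  # same role as the global WRITER above
for epoch, loss in enumerate([0.9, 0.6, 0.4], start=1):
    writer.add_scalar("train/box_loss", loss, epoch)  # mirrors _log_scalars()
writer.close()
# View with: tensorboard --logdir runs/demo (http://localhost:6006/)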
191
ultralytics/utils/callbacks/wb.py
Normal file
@@ -0,0 +1,191 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

from ultralytics.utils import SETTINGS, TESTS_RUNNING
from ultralytics.utils.torch_utils import model_info_for_loggers

try:
    assert not TESTS_RUNNING  # do not log pytest
    assert SETTINGS["wandb"] is True  # verify integration is enabled
    import wandb as wb

    assert hasattr(wb, "__version__")  # verify package is not directory
    _processed_plots = {}

except (ImportError, AssertionError):
    wb = None


def _custom_table(x, y, classes, title="Precision Recall Curve", x_title="Recall", y_title="Precision"):
    """
    Create and log a custom metric visualization to wandb.plot.pr_curve.

    This function crafts a custom metric visualization that mimics the behavior of the default wandb precision-recall
    curve while allowing for enhanced customization. The visual metric is useful for monitoring model performance across
    different classes.

    Args:
        x (list): Values for the x-axis; expected to have length N.
        y (list): Corresponding values for the y-axis; also expected to have length N.
        classes (list): Labels identifying the class of each point; length N.
        title (str, optional): Title for the plot.
        x_title (str, optional): Label for the x-axis.
        y_title (str, optional): Label for the y-axis.

    Returns:
        (wandb.Object): A wandb object suitable for logging, showcasing the crafted metric visualization.
    """
    import polars as pl  # scope for faster 'import ultralytics'
    import polars.selectors as cs

    df = pl.DataFrame({"class": classes, "y": y, "x": x}).with_columns(cs.numeric().round(3))
    data = df.select(["class", "y", "x"]).rows()

    fields = {"x": "x", "y": "y", "class": "class"}
    string_fields = {"title": title, "x-axis-title": x_title, "y-axis-title": y_title}
    return wb.plot_table(
        "wandb/area-under-curve/v0",
        wb.Table(data=data, columns=["class", "y", "x"]),
        fields=fields,
        string_fields=string_fields,
    )


def _plot_curve(
    x,
    y,
    names=None,
    id="precision-recall",
    title="Precision Recall Curve",
    x_title="Recall",
    y_title="Precision",
    num_x=100,
    only_mean=False,
):
    """
    Log a metric curve visualization.

    This function generates a metric curve based on input data and logs the visualization to wandb.
    The curve can represent aggregated data (mean) or individual class data, depending on the 'only_mean' flag.

    Args:
        x (np.ndarray): Data points for the x-axis with length N.
        y (np.ndarray): Corresponding data points for the y-axis with shape (C, N), where C is the number of classes.
        names (list, optional): Names of the classes corresponding to the y-axis data; length C.
        id (str, optional): Unique identifier for the logged data in wandb.
        title (str, optional): Title for the visualization plot.
        x_title (str, optional): Label for the x-axis.
        y_title (str, optional): Label for the y-axis.
        num_x (int, optional): Number of interpolated data points for visualization.
        only_mean (bool, optional): Flag to indicate if only the mean curve should be plotted.

    Notes:
        The function leverages the '_custom_table' function to generate the actual visualization.
    """
    import numpy as np

    # Create new x
    if names is None:
        names = []
    x_new = np.linspace(x[0], x[-1], num_x).round(5)

    # Create arrays for logging
    x_log = x_new.tolist()
    y_log = np.interp(x_new, x, np.mean(y, axis=0)).round(3).tolist()

    if only_mean:
        table = wb.Table(data=list(zip(x_log, y_log)), columns=[x_title, y_title])
        wb.run.log({title: wb.plot.line(table, x_title, y_title, title=title)})
    else:
        classes = ["mean"] * len(x_log)
        for i, yi in enumerate(y):
            x_log.extend(x_new)  # add new x
            y_log.extend(np.interp(x_new, x, yi))  # interpolate y to new x
            classes.extend([names[i]] * len(x_new))  # add class names
        wb.log({id: _custom_table(x_log, y_log, classes, title, x_title, y_title)}, commit=False)
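
# Illustrative shape of the long-format table fed to _custom_table() when only_mean=False
# (values assumed): each (class, y, x) row is one curve point, with the "mean" rows
# prepended before the per-class curves.
#   classes = ["mean", "mean", ..., "person", "person", ..., "car", ...]
#   x_log   = [0.0, 0.01, ...] repeated once per class (x_new tiled)
#   y_log   = [interpolated mean curve ..., interpolated per-class curves ...]  # aligned with x_log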


def _log_plots(plots, step):
    """
    Log plots to WandB at a specific step if they haven't been logged already.

    This function checks each plot in the input dictionary against previously processed plots and logs
    new or updated plots to WandB at the specified step.

    Args:
        plots (dict): Dictionary of plots to log, where keys are plot names and values are dictionaries
            containing plot metadata including timestamps.
        step (int): The step/epoch at which to log the plots in the WandB run.

    Notes:
        The function uses a shallow copy of the plots dictionary to prevent modification during iteration.
        Plots are identified by their stem name (filename without extension).
        Each plot is logged as a WandB Image object.
    """
    for name, params in plots.copy().items():  # shallow copy to prevent plots dict changing during iteration
        timestamp = params["timestamp"]
        if _processed_plots.get(name) != timestamp:
            wb.run.log({name.stem: wb.Image(str(name))}, step=step)
            _processed_plots[name] = timestamp


def on_pretrain_routine_start(trainer):
    """Initialize and start wandb project if module is present."""
    if not wb.run:
        wb.init(
            project=str(trainer.args.project).replace("/", "-") if trainer.args.project else "Ultralytics",
            name=str(trainer.args.name).replace("/", "-"),
            config=vars(trainer.args),
        )


def on_fit_epoch_end(trainer):
    """Log training metrics and model information at the end of an epoch."""
    wb.run.log(trainer.metrics, step=trainer.epoch + 1)
    _log_plots(trainer.plots, step=trainer.epoch + 1)
    _log_plots(trainer.validator.plots, step=trainer.epoch + 1)
    if trainer.epoch == 0:
        wb.run.log(model_info_for_loggers(trainer), step=trainer.epoch + 1)


def on_train_epoch_end(trainer):
    """Log metrics and save images at the end of each training epoch."""
    wb.run.log(trainer.label_loss_items(trainer.tloss, prefix="train"), step=trainer.epoch + 1)
    wb.run.log(trainer.lr, step=trainer.epoch + 1)
    if trainer.epoch == 1:
        _log_plots(trainer.plots, step=trainer.epoch + 1)


def on_train_end(trainer):
    """Save the best model as an artifact and log final plots at the end of training."""
    _log_plots(trainer.validator.plots, step=trainer.epoch + 1)
    _log_plots(trainer.plots, step=trainer.epoch + 1)
    art = wb.Artifact(type="model", name=f"run_{wb.run.id}_model")
    if trainer.best.exists():
        art.add_file(trainer.best)
        wb.run.log_artifact(art, aliases=["best"])
    # Check if we actually have plots to save
    if trainer.args.plots and hasattr(trainer.validator.metrics, "curves_results"):
        for curve_name, curve_values in zip(trainer.validator.metrics.curves, trainer.validator.metrics.curves_results):
            x, y, x_title, y_title = curve_values
            _plot_curve(
                x,
                y,
                names=list(trainer.validator.metrics.names.values()),
                id=f"curves/{curve_name}",
                title=curve_name,
                x_title=x_title,
                y_title=y_title,
            )
    wb.run.finish()  # required or run continues on dashboard


callbacks = (
    {
        "on_pretrain_routine_start": on_pretrain_routine_start,
        "on_train_epoch_end": on_train_epoch_end,
        "on_fit_epoch_end": on_fit_epoch_end,
        "on_train_end": on_train_end,
    }
    if wb
    else {}
)
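
# To turn this logger on (illustrative; mirrors the 'yolo settings mlflow=False' toggle above):
#   yolo settings wandb=True
# then authenticate once with `wandb login` so that wb.init() can start a run.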