init commit
This commit is contained in:
5
ultralytics/models/yolo/world/__init__.py
Normal file
5
ultralytics/models/yolo/world/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
|
||||
|
||||
from .train import WorldTrainer
|
||||
|
||||
__all__ = ["WorldTrainer"]
|
||||
Binary file not shown.
BIN
ultralytics/models/yolo/world/__pycache__/train.cpython-310.pyc
Normal file
BIN
ultralytics/models/yolo/world/__pycache__/train.cpython-310.pyc
Normal file
Binary file not shown.
Binary file not shown.
179
ultralytics/models/yolo/world/train.py
Normal file
179
ultralytics/models/yolo/world/train.py
Normal file
@@ -0,0 +1,179 @@
|
||||
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import itertools
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
|
||||
from ultralytics.data import build_yolo_dataset
|
||||
from ultralytics.models.yolo.detect import DetectionTrainer
|
||||
from ultralytics.nn.tasks import WorldModel
|
||||
from ultralytics.utils import DEFAULT_CFG, LOGGER, RANK
|
||||
from ultralytics.utils.torch_utils import unwrap_model
|
||||
|
||||
|
||||
def on_pretrain_routine_end(trainer) -> None:
    """
    Push the evaluation class names into the EMA model once the pretrain routine has finished.

    Only runs when RANK is -1 or 0. Each class name taken from the test loader's dataset is truncated at its
    first "/" before being handed to the model's `set_classes`.

    Args:
        trainer: Trainer object exposing `test_loader.dataset.data["names"]` and `ema.ema`.
    """
    if RANK not in {-1, 0}:
        return

    # Set class names for evaluation, keeping only the part of each name before the first "/"
    raw_names = trainer.test_loader.dataset.data["names"].values()
    names = [raw_name.split("/", 1)[0] for raw_name in raw_names]
    ema_model = unwrap_model(trainer.ema.ema)
    ema_model.set_classes(names, cache_clip_model=False)
|
||||
|
||||
|
||||
class WorldTrainer(DetectionTrainer):
    """
    A trainer class for fine-tuning YOLO World models on close-set datasets.

    This trainer extends the DetectionTrainer to support training YOLO World models, which combine visual and textual
    features for improved object detection and understanding. It handles text embedding generation and caching to
    accelerate training with multi-modal data.

    Attributes:
        text_embeddings (dict[str, torch.Tensor] | None): Cached text embeddings for category names to accelerate
            training. None until `set_text_embeddings` has been called.
        model (WorldModel): The YOLO World model being trained.
        data (dict[str, Any]): Dataset configuration containing class information.
        args (Any): Training arguments and configuration.

    Methods:
        get_model: Return WorldModel initialized with specified config and weights.
        build_dataset: Build YOLO Dataset for training or validation.
        set_text_embeddings: Set text embeddings for datasets to accelerate training.
        generate_text_embeddings: Generate text embeddings for a list of text samples.
        preprocess_batch: Preprocess a batch of images and text for YOLOWorld training.

    Examples:
        Initialize and train a YOLO World model
        >>> from ultralytics.models.yolo.world import WorldTrainer
        >>> args = dict(model="yolov8s-world.pt", data="coco8.yaml", epochs=3)
        >>> trainer = WorldTrainer(overrides=args)
        >>> trainer.train()
    """

    def __init__(self, cfg=DEFAULT_CFG, overrides: dict[str, Any] | None = None, _callbacks=None):
        """
        Initialize a WorldTrainer object with given arguments.

        Args:
            cfg (dict[str, Any]): Configuration for the trainer.
            overrides (dict[str, Any], optional): Configuration overrides.
            _callbacks (list[Any], optional): List of callback functions.

        Raises:
            AssertionError: If `overrides` requests `compile`.
        """
        if overrides is None:
            overrides = {}
        # Look the model name up with .get(): indexing a missing 'model' key while formatting the message
        # would raise KeyError and hide the real problem (compile being requested).
        assert not overrides.get("compile"), (
            f"Training with 'model={overrides.get('model')}' requires 'compile=False'"
        )
        super().__init__(cfg, overrides, _callbacks)
        self.text_embeddings = None

    def get_model(self, cfg=None, weights: str | None = None, verbose: bool = True) -> WorldModel:
        """
        Return WorldModel initialized with specified config and weights.

        Also registers `on_pretrain_routine_end` as an "on_pretrain_routine_end" callback on this trainer.

        Args:
            cfg (dict[str, Any] | str, optional): Model configuration. When a dict is given, its "yaml_file" entry
                is used.
            weights (str, optional): Path to pretrained weights; loaded into the model when truthy.
            verbose (bool): Whether to display model info. Only honoured when RANK is -1.

        Returns:
            (WorldModel): Initialized WorldModel.
        """
        # NOTE: This `nc` here is the max number of different text samples in one image, rather than the actual `nc`.
        # NOTE: Following the official config, nc hard-coded to 80 for now.
        model = WorldModel(
            cfg["yaml_file"] if isinstance(cfg, dict) else cfg,
            ch=self.data["channels"],
            nc=min(self.data["nc"], 80),
            verbose=verbose and RANK == -1,
        )
        if weights:
            model.load(weights)
        self.add_callback("on_pretrain_routine_end", on_pretrain_routine_end)

        return model

    def build_dataset(self, img_path: str, mode: str = "train", batch: int | None = None):
        """
        Build YOLO Dataset for training or validation.

        Rectangular batching is enabled only for `mode == "val"` and multi-modal loading only for
        `mode == "train"`. In train mode the text embeddings of the new dataset are cached on the trainer.

        Args:
            img_path (str): Path to the folder containing images.
            mode (str): `train` mode or `val` mode, users are able to customize different augmentations for each mode.
            batch (int, optional): Size of batches, this is for `rect`.

        Returns:
            (Any): YOLO dataset configured for training or validation.
        """
        # Grid size: the model's largest stride, but never below 32 (also the fallback when no model is set)
        gs = max(int(unwrap_model(self.model).stride.max() if self.model else 0), 32)
        dataset = build_yolo_dataset(
            self.args, img_path, batch, self.data, mode=mode, rect=mode == "val", stride=gs, multi_modal=mode == "train"
        )
        if mode == "train":
            self.set_text_embeddings([dataset], batch)  # cache text embeddings to accelerate training
        return dataset

    def set_text_embeddings(self, datasets: list[Any], batch: int | None) -> None:
        """
        Set text embeddings for datasets to accelerate training by caching category names.

        This method generates (or loads from cache) text embeddings for the category names of every dataset and
        stores the merged mapping in `self.text_embeddings`.

        Args:
            datasets (list[Any]): List of datasets from which to extract category names.
            batch (int | None): Batch size used for processing.

        Notes:
            Datasets without a 'category_names' attribute are skipped. For each remaining dataset the embeddings
            are cached in the parent directory of that dataset's own `img_path`.
        """
        text_embeddings = {}
        for dataset in datasets:
            if not hasattr(dataset, "category_names"):
                continue
            text_embeddings.update(
                self.generate_text_embeddings(
                    list(dataset.category_names), batch, cache_dir=Path(dataset.img_path).parent
                )
            )
        self.text_embeddings = text_embeddings

    def generate_text_embeddings(self, texts: list[str], batch: int, cache_dir: Path) -> dict[str, torch.Tensor]:
        """
        Generate text embeddings for a list of text samples.

        An existing cache file in `cache_dir` is reused only when its keys match `texts` exactly (order ignored).
        Otherwise the embeddings are recomputed with the model's `get_text_pe` and the cache file is overwritten.

        Args:
            texts (list[str]): List of text samples to encode.
            batch (int): Batch size for processing.
            cache_dir (Path): Directory to save/load cached embeddings.

        Returns:
            (dict[str, torch.Tensor]): Dictionary mapping text samples to their embeddings.

        Raises:
            AssertionError: If embeddings must be computed but `self.model` is None.
        """
        model = "clip:ViT-B/32"
        # The cache file name embeds the text-model identifier with ':' and '/' replaced by '_'
        cache_path = cache_dir / f"text_embeddings_{model.replace(':', '_').replace('/', '_')}.pt"
        if cache_path.exists():
            LOGGER.info(f"Reading existed cache from '{cache_path}'")
            txt_map = torch.load(cache_path, map_location=self.device)
            if sorted(txt_map.keys()) == sorted(texts):
                return txt_map
        LOGGER.info(f"Caching text embeddings to '{cache_path}'")
        assert self.model is not None
        txt_feats = unwrap_model(self.model).get_text_pe(texts, batch, cache_clip_model=False)
        # Drop the leading dimension so that each text is paired with its own embedding
        txt_map = dict(zip(texts, txt_feats.squeeze(0)))
        torch.save(txt_map, cache_path)
        return txt_map

    def preprocess_batch(self, batch: dict[str, Any]) -> dict[str, Any]:
        """
        Preprocess a batch of images and text for YOLOWorld training.

        Runs the DetectionTrainer preprocessing, then looks up the cached embedding of every text in
        `batch["texts"]` and stores them under `batch["txt_feats"]`, reshaped to
        (len(batch["texts"]), -1, embedding_dim).

        Args:
            batch (dict[str, Any]): Batch dictionary containing a "texts" entry (one sequence of texts per sample).

        Returns:
            (dict[str, Any]): The preprocessed batch with the added "txt_feats" entry.
        """
        batch = DetectionTrainer.preprocess_batch(self, batch)

        # Add text features
        texts = list(itertools.chain.from_iterable(batch["texts"]))
        txt_feats = torch.stack([self.text_embeddings[text] for text in texts]).to(
            self.device, non_blocking=self.device.type == "cuda"
        )
        batch["txt_feats"] = txt_feats.reshape(len(batch["texts"]), -1, txt_feats.shape[-1])
        return batch
|
||||
201
ultralytics/models/yolo/world/train_world.py
Normal file
201
ultralytics/models/yolo/world/train_world.py
Normal file
@@ -0,0 +1,201 @@
|
||||
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from ultralytics.data import YOLOConcatDataset, build_grounding, build_yolo_dataset
|
||||
from ultralytics.data.utils import check_det_dataset
|
||||
from ultralytics.models.yolo.world import WorldTrainer
|
||||
from ultralytics.utils import DATASETS_DIR, DEFAULT_CFG, LOGGER
|
||||
from ultralytics.utils.torch_utils import unwrap_model
|
||||
|
||||
|
||||
class WorldTrainerFromScratch(WorldTrainer):
    """
    Trainer for training a YOLO-World model from scratch on open-set data.

    Extends WorldTrainer so that the training set may be a mixture of YOLO detection datasets (given as yaml
    files) and grounding datasets (given as dicts with `img_path` and `json_file`), while validation runs on a
    single YOLO detection dataset.

    Attributes:
        data (dict): Data configuration assembled by `get_dataset`, holding the "train" and "val" sources plus
            "nc", "names", "path" and "channels" taken from the validation dataset.
        training_data (dict): Maps each YOLO training dataset's train path to its dataset configuration.

    Methods:
        build_dataset: Build the validation dataset or the (possibly concatenated) mixed training dataset.
        get_dataset: Assemble the train/val data configuration from the user-supplied data dictionary.
        plot_training_labels: Skip label plotting for YOLO-World training.
        final_eval: Point the validator at the validation dataset and run the final evaluation.

    Examples:
        >>> from ultralytics.models.yolo.world.train_world import WorldTrainerFromScratch
        >>> from ultralytics import YOLOWorld
        >>> data = dict(
        ...     train=dict(
        ...         yolo_data=["Objects365.yaml"],
        ...         grounding_data=[
        ...             dict(
        ...                 img_path="flickr30k/images",
        ...                 json_file="flickr30k/final_flickr_separateGT_train.json",
        ...             ),
        ...             dict(
        ...                 img_path="GQA/images",
        ...                 json_file="GQA/final_mixed_train_no_coco.json",
        ...             ),
        ...         ],
        ...     ),
        ...     val=dict(yolo_data=["lvis.yaml"]),
        ... )
        >>> model = YOLOWorld("yolov8s-worldv2.yaml")
        >>> model.train(data=data, trainer=WorldTrainerFromScratch)
    """

    def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
        """
        Create a WorldTrainerFromScratch.

        All arguments are forwarded to WorldTrainer; a missing `overrides` is replaced by an empty dict.

        Args:
            cfg (dict): Configuration dictionary with default parameters for model training.
            overrides (dict, optional): Dictionary of parameter overrides to customize the configuration.
            _callbacks (list, optional): List of callback functions to be executed during different stages of training.

        Examples:
            >>> from ultralytics.models.yolo.world.train_world import WorldTrainerFromScratch
            >>> from ultralytics import YOLOWorld
            >>> data = dict(
            ...     train=dict(
            ...         yolo_data=["Objects365.yaml"],
            ...         grounding_data=[
            ...             dict(
            ...                 img_path="flickr30k/images",
            ...                 json_file="flickr30k/final_flickr_separateGT_train.json",
            ...             ),
            ...         ],
            ...     ),
            ...     val=dict(yolo_data=["lvis.yaml"]),
            ... )
            >>> model = YOLOWorld("yolov8s-worldv2.yaml")
            >>> model.train(data=data, trainer=WorldTrainerFromScratch)
        """
        super().__init__(cfg, {} if overrides is None else overrides, _callbacks)

    def build_dataset(self, img_path, mode="train", batch=None):
        """
        Build the dataset for the requested mode.

        Outside train mode a single YOLO dataset is built from `img_path` without rectangular batching. In train
        mode every entry of `img_path` becomes its own dataset: string entries are YOLO datasets (configured from
        `self.training_data`), other entries are grounding datasets described by "img_path" / "json_file". The
        text embeddings of all training datasets are then cached on the trainer.

        Args:
            img_path (list[str] | str): Path to the folder containing images or list of paths.
            mode (str): 'train' mode or 'val' mode, allowing customized augmentations for each mode.
            batch (int, optional): Size of batches, used for rectangular training/validation.

        Returns:
            (YOLOConcatDataset | Dataset): A concatenated dataset when more than one training dataset was built,
                otherwise the single dataset.
        """
        model_stride = int(unwrap_model(self.model).stride.max()) if self.model else 0
        gs = max(model_stride, 32)

        if mode != "train":
            return build_yolo_dataset(self.args, img_path, batch, self.data, mode=mode, rect=False, stride=gs)

        datasets = []
        for source in img_path:
            if isinstance(source, str):
                built = build_yolo_dataset(
                    self.args, source, batch, self.training_data[source], stride=gs, multi_modal=True
                )
            else:
                # assign `nc` from validation set to max number of text samples for training consistency
                built = build_grounding(
                    self.args,
                    source["img_path"],
                    source["json_file"],
                    batch,
                    stride=gs,
                    max_samples=self.data["nc"],
                )
            datasets.append(built)

        self.set_text_embeddings(datasets, batch)  # cache text embeddings to accelerate training
        if len(datasets) > 1:
            return YOLOConcatDataset(datasets)
        return datasets[0]

    def get_dataset(self):
        """
        Assemble the train/val data configuration from `self.args.data`.

        Every "yolo_data" entry is resolved with `check_det_dataset`, optional "grounding_data" entries are
        appended to their split, and relative grounding paths that do not exist are resolved against
        DATASETS_DIR. The result is stored in `self.data`, and `self.training_data` is filled with the
        configuration of each YOLO training dataset keyed by its train path.

        Returns:
            (dict): Data configuration with "train" (list of sources), "val" (single source), and "nc", "names",
                "path", "channels" copied from the validation dataset.

        Raises:
            AssertionError: If train or validation datasets are not found, if validation has a number of YOLO
                datasets other than one, or if a grounding entry is not a dict.
        """
        data_yaml = self.args.data
        assert data_yaml.get("train", False), "train dataset not found"  # object365.yaml
        assert data_yaml.get("val", False), "validation dataset not found"  # lvis.yaml

        data = {}
        for split_name, split_cfg in data_yaml.items():
            data[split_name] = [check_det_dataset(entry) for entry in split_cfg.get("yolo_data", [])]
        assert len(data["val"]) == 1, f"Only support validating on 1 dataset for now, but got {len(data['val'])}."

        val_split = "minival" if "lvis" in data["val"][0]["val"] else "val"
        for val_cfg in data["val"]:
            if val_cfg.get("minival") is not None:  # for lvis dataset
                val_cfg["minival"] = str(val_cfg["path"] / val_cfg["minival"])

        final_data = {}
        for s in ("train", "val"):
            split_key = "train" if s == "train" else val_split
            final_data[s] = [cfg[split_key] for cfg in data[s]]

            # save grounding data if there's one
            grounding_data = data_yaml[s].get("grounding_data")
            if grounding_data is None:
                continue
            if not isinstance(grounding_data, list):
                grounding_data = [grounding_data]
            for g in grounding_data:
                assert isinstance(g, dict), f"Grounding data should be provided in dict format, but got {type(g)}"
                for k in ("img_path", "json_file"):
                    path = Path(g[k])
                    if not (path.exists() or path.is_absolute()):
                        g[k] = str((DATASETS_DIR / g[k]).resolve())  # path relative to DATASETS_DIR
            final_data[s] += grounding_data

        # assign the first val dataset as currently only one validation set is supported
        val_data = data["val"][0]
        data["val"] = val_data
        final_data["val"] = final_data["val"][0]
        # NOTE: to make training work properly, set `nc` and `names`
        final_data["nc"] = val_data["nc"]
        final_data["names"] = val_data["names"]
        # NOTE: add path with lvis path
        final_data["path"] = val_data["path"]
        final_data["channels"] = val_data["channels"]
        self.data = final_data

        single_cls = self.args.single_cls
        if single_cls:  # consistent with base trainer
            LOGGER.info("Overriding class names with single class.")
            self.data["names"] = {0: "object"}
            self.data["nc"] = 1

        self.training_data = {}
        for train_cfg in data["train"]:
            if self.args.single_cls:
                train_cfg["names"] = {0: "object"}
                train_cfg["nc"] = 1
            self.training_data[train_cfg["train"]] = train_cfg
        return final_data

    def plot_training_labels(self):
        """Skip label plotting for YOLO-World training."""

    def final_eval(self):
        """
        Run the final evaluation on the validation dataset.

        Sets the validator's `data` to the first "yolo_data" entry of the validation configuration and its
        `split` to "minival" when that entry is a string containing "lvis" (otherwise "val"), then delegates to
        the parent implementation.

        Returns:
            The value returned by the parent class's `final_eval`.
        """
        val = self.args.data["val"]["yolo_data"][0]
        is_lvis = isinstance(val, str) and "lvis" in val
        self.validator.args.data = val
        self.validator.args.split = "minival" if is_lvis else "val"
        return super().final_eval()
|
||||
Reference in New Issue
Block a user