init commit
This commit is contained in:
408
ultralytics/utils/logger.py
Normal file
408
ultralytics/utils/logger.py
Normal file
@@ -0,0 +1,408 @@
|
||||
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
|
||||
|
||||
import logging
|
||||
import queue
|
||||
import shutil
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from ultralytics.utils import MACOS, RANK
|
||||
from ultralytics.utils.checks import check_requirements
|
||||
|
||||
# Initialize default log file
|
||||
DEFAULT_LOG_PATH = Path("train.log")
|
||||
if RANK in {-1, 0} and DEFAULT_LOG_PATH.exists():
|
||||
DEFAULT_LOG_PATH.unlink(missing_ok=True)
|
||||
|
||||
|
||||
class ConsoleLogger:
|
||||
"""
|
||||
Console output capture with API/file streaming and deduplication.
|
||||
|
||||
Captures stdout/stderr output and streams it to either an API endpoint or local file, with intelligent
|
||||
deduplication to reduce noise from repetitive console output.
|
||||
|
||||
Attributes:
|
||||
destination (str | Path): Target destination for streaming (URL or Path object).
|
||||
is_api (bool): Whether destination is an API endpoint (True) or local file (False).
|
||||
original_stdout: Reference to original sys.stdout for restoration.
|
||||
original_stderr: Reference to original sys.stderr for restoration.
|
||||
log_queue (queue.Queue): Thread-safe queue for buffering log messages.
|
||||
active (bool): Whether console capture is currently active.
|
||||
worker_thread (threading.Thread): Background thread for processing log queue.
|
||||
last_line (str): Last processed line for deduplication.
|
||||
last_time (float): Timestamp of last processed line.
|
||||
last_progress_line (str): Last progress bar line for progress deduplication.
|
||||
last_was_progress (bool): Whether the last line was a progress bar.
|
||||
|
||||
Examples:
|
||||
Basic file logging:
|
||||
>>> logger = ConsoleLogger("training.log")
|
||||
>>> logger.start_capture()
|
||||
>>> print("This will be logged")
|
||||
>>> logger.stop_capture()
|
||||
|
||||
API streaming:
|
||||
>>> logger = ConsoleLogger("https://api.example.com/logs")
|
||||
>>> logger.start_capture()
|
||||
>>> # All output streams to API
|
||||
>>> logger.stop_capture()
|
||||
"""
|
||||
|
||||
def __init__(self, destination):
|
||||
"""
|
||||
Initialize with API endpoint or local file path.
|
||||
|
||||
Args:
|
||||
destination (str | Path): API endpoint URL (http/https) or local file path for streaming output.
|
||||
"""
|
||||
self.destination = destination
|
||||
self.is_api = isinstance(destination, str) and destination.startswith(("http://", "https://"))
|
||||
if not self.is_api:
|
||||
self.destination = Path(destination)
|
||||
|
||||
# Console capture
|
||||
self.original_stdout = sys.stdout
|
||||
self.original_stderr = sys.stderr
|
||||
self.log_queue = queue.Queue(maxsize=1000)
|
||||
self.active = False
|
||||
self.worker_thread = None
|
||||
|
||||
# State tracking
|
||||
self.last_line = ""
|
||||
self.last_time = 0.0
|
||||
self.last_progress_line = "" # Track last progress line for deduplication
|
||||
self.last_was_progress = False # Track if last line was a progress bar
|
||||
|
||||
def start_capture(self):
|
||||
"""Start capturing console output and redirect stdout/stderr to custom capture objects."""
|
||||
if self.active:
|
||||
return
|
||||
|
||||
self.active = True
|
||||
sys.stdout = self._ConsoleCapture(self.original_stdout, self._queue_log)
|
||||
sys.stderr = self._ConsoleCapture(self.original_stderr, self._queue_log)
|
||||
|
||||
# Hook Ultralytics logger
|
||||
try:
|
||||
handler = self._LogHandler(self._queue_log)
|
||||
logging.getLogger("ultralytics").addHandler(handler)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
self.worker_thread = threading.Thread(target=self._stream_worker, daemon=True)
|
||||
self.worker_thread.start()
|
||||
|
||||
def stop_capture(self):
|
||||
"""Stop capturing console output and restore original stdout/stderr."""
|
||||
if not self.active:
|
||||
return
|
||||
|
||||
self.active = False
|
||||
sys.stdout = self.original_stdout
|
||||
sys.stderr = self.original_stderr
|
||||
self.log_queue.put(None)
|
||||
|
||||
def _queue_log(self, text):
|
||||
"""Queue console text with deduplication and timestamp processing."""
|
||||
if not self.active:
|
||||
return
|
||||
|
||||
current_time = time.time()
|
||||
|
||||
# Handle carriage returns and process lines
|
||||
if "\r" in text:
|
||||
text = text.split("\r")[-1]
|
||||
|
||||
lines = text.split("\n")
|
||||
if lines and lines[-1] == "":
|
||||
lines.pop()
|
||||
|
||||
for line in lines:
|
||||
line = line.rstrip()
|
||||
|
||||
# Skip lines with only thin progress bars (partial progress)
|
||||
if "─" in line: # Has thin lines but no thick lines
|
||||
continue
|
||||
|
||||
# Deduplicate completed progress bars only if they match the previous progress line
|
||||
if " ━━" in line:
|
||||
progress_core = line.split(" ━━")[0].strip()
|
||||
if progress_core == self.last_progress_line and self.last_was_progress:
|
||||
continue
|
||||
self.last_progress_line = progress_core
|
||||
self.last_was_progress = True
|
||||
else:
|
||||
# Skip empty line after progress bar
|
||||
if not line and self.last_was_progress:
|
||||
self.last_was_progress = False
|
||||
continue
|
||||
self.last_was_progress = False
|
||||
|
||||
# General deduplication
|
||||
if line == self.last_line and current_time - self.last_time < 0.1:
|
||||
continue
|
||||
|
||||
self.last_line = line
|
||||
self.last_time = current_time
|
||||
|
||||
# Add timestamp if needed
|
||||
if not line.startswith("[20"):
|
||||
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
line = f"[{timestamp}] {line}"
|
||||
|
||||
# Queue with overflow protection
|
||||
if not self._safe_put(f"{line}\n"):
|
||||
continue # Skip if queue handling fails
|
||||
|
||||
def _safe_put(self, item):
|
||||
"""Safely put item in queue with overflow handling."""
|
||||
try:
|
||||
self.log_queue.put_nowait(item)
|
||||
return True
|
||||
except queue.Full:
|
||||
try:
|
||||
self.log_queue.get_nowait() # Drop oldest
|
||||
self.log_queue.put_nowait(item)
|
||||
return True
|
||||
except queue.Empty:
|
||||
return False
|
||||
|
||||
def _stream_worker(self):
|
||||
"""Background worker for streaming logs to destination."""
|
||||
while self.active:
|
||||
try:
|
||||
log_text = self.log_queue.get(timeout=1)
|
||||
if log_text is None:
|
||||
break
|
||||
self._write_log(log_text)
|
||||
except queue.Empty:
|
||||
continue
|
||||
|
||||
def _write_log(self, text):
|
||||
"""Write log to API endpoint or local file destination."""
|
||||
try:
|
||||
if self.is_api:
|
||||
import requests # scoped as slow import
|
||||
|
||||
payload = {"timestamp": datetime.now().isoformat(), "message": text.strip()}
|
||||
requests.post(str(self.destination), json=payload, timeout=5)
|
||||
else:
|
||||
self.destination.parent.mkdir(parents=True, exist_ok=True)
|
||||
with self.destination.open("a", encoding="utf-8") as f:
|
||||
f.write(text)
|
||||
except Exception as e:
|
||||
print(f"Platform logging error: {e}", file=self.original_stderr)
|
||||
|
||||
class _ConsoleCapture:
|
||||
"""Lightweight stdout/stderr capture."""
|
||||
|
||||
__slots__ = ("original", "callback")
|
||||
|
||||
def __init__(self, original, callback):
|
||||
self.original = original
|
||||
self.callback = callback
|
||||
|
||||
def write(self, text):
|
||||
self.original.write(text)
|
||||
self.callback(text)
|
||||
|
||||
def flush(self):
|
||||
self.original.flush()
|
||||
|
||||
class _LogHandler(logging.Handler):
|
||||
"""Lightweight logging handler."""
|
||||
|
||||
__slots__ = ("callback",)
|
||||
|
||||
def __init__(self, callback):
|
||||
super().__init__()
|
||||
self.callback = callback
|
||||
|
||||
def emit(self, record):
|
||||
self.callback(self.format(record) + "\n")
|
||||
|
||||
|
||||
class SystemLogger:
|
||||
"""
|
||||
Log dynamic system metrics for training monitoring.
|
||||
|
||||
Captures real-time system metrics including CPU, RAM, disk I/O, network I/O, and NVIDIA GPU statistics for
|
||||
training performance monitoring and analysis.
|
||||
|
||||
Attributes:
|
||||
pynvml: NVIDIA pynvml module instance if successfully imported, None otherwise.
|
||||
nvidia_initialized (bool): Whether NVIDIA GPU monitoring is available and initialized.
|
||||
net_start: Initial network I/O counters for calculating cumulative usage.
|
||||
disk_start: Initial disk I/O counters for calculating cumulative usage.
|
||||
|
||||
Examples:
|
||||
Basic usage:
|
||||
>>> logger = SystemLogger()
|
||||
>>> metrics = logger.get_metrics()
|
||||
>>> print(f"CPU: {metrics['cpu']}%, RAM: {metrics['ram']}%")
|
||||
>>> if metrics["gpus"]:
|
||||
... gpu0 = metrics["gpus"]["0"]
|
||||
... print(f"GPU0: {gpu0['usage']}% usage, {gpu0['temp']}°C")
|
||||
|
||||
Training loop integration:
|
||||
>>> system_logger = SystemLogger()
|
||||
>>> for epoch in range(epochs):
|
||||
... # Training code here
|
||||
... metrics = system_logger.get_metrics()
|
||||
... # Log to database/file
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the system logger."""
|
||||
import psutil # scoped as slow import
|
||||
|
||||
self.pynvml = None
|
||||
self.nvidia_initialized = self._init_nvidia()
|
||||
self.net_start = psutil.net_io_counters()
|
||||
self.disk_start = psutil.disk_io_counters()
|
||||
|
||||
def _init_nvidia(self):
|
||||
"""Initialize NVIDIA GPU monitoring with pynvml."""
|
||||
try:
|
||||
assert not MACOS
|
||||
check_requirements("nvidia-ml-py>=12.0.0")
|
||||
self.pynvml = __import__("pynvml")
|
||||
self.pynvml.nvmlInit()
|
||||
return True
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def get_metrics(self):
|
||||
"""
|
||||
Get current system metrics.
|
||||
|
||||
Collects comprehensive system metrics including CPU usage, RAM usage, disk I/O statistics,
|
||||
network I/O statistics, and GPU metrics (if available). Example output:
|
||||
|
||||
```python
|
||||
metrics = {
|
||||
"cpu": 45.2,
|
||||
"ram": 78.9,
|
||||
"disk": {"read_mb": 156.7, "write_mb": 89.3, "used_gb": 256.8},
|
||||
"network": {"recv_mb": 157.2, "sent_mb": 89.1},
|
||||
"gpus": {
|
||||
0: {"usage": 95.6, "memory": 85.4, "temp": 72, "power": 285},
|
||||
1: {"usage": 94.1, "memory": 82.7, "temp": 70, "power": 278},
|
||||
},
|
||||
}
|
||||
```
|
||||
|
||||
- cpu (float): CPU usage percentage (0-100%)
|
||||
- ram (float): RAM usage percentage (0-100%)
|
||||
- disk (dict):
|
||||
- read_mb (float): Cumulative disk read in MB since initialization
|
||||
- write_mb (float): Cumulative disk write in MB since initialization
|
||||
- used_gb (float): Total disk space used in GB
|
||||
- network (dict):
|
||||
- recv_mb (float): Cumulative network received in MB since initialization
|
||||
- sent_mb (float): Cumulative network sent in MB since initialization
|
||||
- gpus (dict): GPU metrics by device index (e.g., 0, 1) containing:
|
||||
- usage (int): GPU utilization percentage (0-100%)
|
||||
- memory (float): CUDA memory usage percentage (0-100%)
|
||||
- temp (int): GPU temperature in degrees Celsius
|
||||
- power (int): GPU power consumption in watts
|
||||
|
||||
Returns:
|
||||
metrics (dict): System metrics containing 'cpu', 'ram', 'disk', 'network', 'gpus' with respective usage data.
|
||||
"""
|
||||
import psutil # scoped as slow import
|
||||
|
||||
net = psutil.net_io_counters()
|
||||
disk = psutil.disk_io_counters()
|
||||
memory = psutil.virtual_memory()
|
||||
disk_usage = shutil.disk_usage("/")
|
||||
|
||||
metrics = {
|
||||
"cpu": round(psutil.cpu_percent(), 3),
|
||||
"ram": round(memory.percent, 3),
|
||||
"disk": {
|
||||
"read_mb": round((disk.read_bytes - self.disk_start.read_bytes) / (1 << 20), 3),
|
||||
"write_mb": round((disk.write_bytes - self.disk_start.write_bytes) / (1 << 20), 3),
|
||||
"used_gb": round(disk_usage.used / (1 << 30), 3),
|
||||
},
|
||||
"network": {
|
||||
"recv_mb": round((net.bytes_recv - self.net_start.bytes_recv) / (1 << 20), 3),
|
||||
"sent_mb": round((net.bytes_sent - self.net_start.bytes_sent) / (1 << 20), 3),
|
||||
},
|
||||
"gpus": {},
|
||||
}
|
||||
|
||||
# Add GPU metrics (NVIDIA only)
|
||||
if self.nvidia_initialized:
|
||||
metrics["gpus"].update(self._get_nvidia_metrics())
|
||||
|
||||
return metrics
|
||||
|
||||
def _get_nvidia_metrics(self):
|
||||
"""Get NVIDIA GPU metrics including utilization, memory, temperature, and power."""
|
||||
gpus = {}
|
||||
if not self.nvidia_initialized or not self.pynvml:
|
||||
return gpus
|
||||
try:
|
||||
device_count = self.pynvml.nvmlDeviceGetCount()
|
||||
for i in range(device_count):
|
||||
handle = self.pynvml.nvmlDeviceGetHandleByIndex(i)
|
||||
util = self.pynvml.nvmlDeviceGetUtilizationRates(handle)
|
||||
memory = self.pynvml.nvmlDeviceGetMemoryInfo(handle)
|
||||
temp = self.pynvml.nvmlDeviceGetTemperature(handle, self.pynvml.NVML_TEMPERATURE_GPU)
|
||||
power = self.pynvml.nvmlDeviceGetPowerUsage(handle) // 1000
|
||||
|
||||
gpus[str(i)] = {
|
||||
"usage": round(util.gpu, 3),
|
||||
"memory": round((memory.used / memory.total) * 100, 3),
|
||||
"temp": temp,
|
||||
"power": power,
|
||||
}
|
||||
except Exception:
|
||||
pass
|
||||
return gpus
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
print("SystemLogger Real-time Metrics Monitor")
|
||||
print("Press Ctrl+C to stop\n")
|
||||
|
||||
logger = SystemLogger()
|
||||
|
||||
try:
|
||||
while True:
|
||||
metrics = logger.get_metrics()
|
||||
|
||||
# Clear screen (works on most terminals)
|
||||
print("\033[H\033[J", end="")
|
||||
|
||||
# Display system metrics
|
||||
print(f"CPU: {metrics['cpu']:5.1f}%")
|
||||
print(f"RAM: {metrics['ram']:5.1f}%")
|
||||
print(f"Disk Read: {metrics['disk']['read_mb']:8.1f} MB")
|
||||
print(f"Disk Write: {metrics['disk']['write_mb']:7.1f} MB")
|
||||
print(f"Disk Used: {metrics['disk']['used_gb']:8.1f} GB")
|
||||
print(f"Net Recv: {metrics['network']['recv_mb']:9.1f} MB")
|
||||
print(f"Net Sent: {metrics['network']['sent_mb']:9.1f} MB")
|
||||
|
||||
# Display GPU metrics if available
|
||||
if metrics["gpus"]:
|
||||
print("\nGPU Metrics:")
|
||||
for gpu_id, gpu_data in metrics["gpus"].items():
|
||||
print(
|
||||
f" GPU {gpu_id}: {gpu_data['usage']:3}% | "
|
||||
f"Mem: {gpu_data['memory']:5.1f}% | "
|
||||
f"Temp: {gpu_data['temp']:2}°C | "
|
||||
f"Power: {gpu_data['power']:3}W"
|
||||
)
|
||||
else:
|
||||
print("\nGPU: No NVIDIA GPUs detected")
|
||||
|
||||
time.sleep(1)
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print("\n\nStopped monitoring.")
|
||||
Reference in New Issue
Block a user