init commit

2025-11-08 19:15:39 +01:00
parent ecffcb08e8
commit c7adacf53b
470 changed files with 73751 additions and 0 deletions
--- a/ultralytics/utils/logger.py
+++ b/ultralytics/utils/logger.py
@@ -0,0 +1,408 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+import logging
+import queue
+import shutil
+import sys
+import threading
+import time
+from datetime import datetime
+from pathlib import Path
+
+from ultralytics.utils import MACOS, RANK
+from ultralytics.utils.checks import check_requirements
+
+# Default log file: when RANK is -1 or 0, any copy left over from a previous run is removed at import time
+DEFAULT_LOG_PATH = Path("train.log")
+if RANK in (-1, 0):
+    if DEFAULT_LOG_PATH.exists():
+        DEFAULT_LOG_PATH.unlink(missing_ok=True)
+
+
+class ConsoleLogger:
+    """
+    Console output capture with API/file streaming and deduplication.
+
+    Captures stdout/stderr output and streams it to either an API endpoint or local file, with intelligent
+    deduplication to reduce noise from repetitive console output.
+
+    Attributes:
+        destination (str | Path): Target destination for streaming (URL or Path object).
+        is_api (bool): Whether destination is an API endpoint (True) or local file (False).
+        original_stdout: Reference to original sys.stdout for restoration.
+        original_stderr: Reference to original sys.stderr for restoration.
+        log_queue (queue.Queue): Bounded thread-safe queue (1000 entries) buffering formatted log lines.
+        active (bool): Whether console capture is currently active.
+        worker_thread (threading.Thread | None): Background thread draining the queue, None before the first start.
+        last_line (str): Last processed line for deduplication.
+        last_time (float): Timestamp of last processed line.
+        last_progress_line (str): Last progress bar line for progress deduplication.
+        last_was_progress (bool): Whether the last line was a progress bar.
+
+    Examples:
+        Basic file logging:
+        >>> logger = ConsoleLogger("training.log")
+        >>> logger.start_capture()
+        >>> print("This will be logged")
+        >>> logger.stop_capture()
+
+        API streaming:
+        >>> logger = ConsoleLogger("https://api.example.com/logs")
+        >>> logger.start_capture()
+        >>> # All output streams to API
+        >>> logger.stop_capture()
+    """
+
+    def __init__(self, destination):
+        """
+        Initialize with API endpoint or local file path.
+
+        Args:
+            destination (str | Path): API endpoint URL (http/https) or local file path for streaming output.
+        """
+        self.destination = destination
+        self.is_api = isinstance(destination, str) and destination.startswith(("http://", "https://"))
+        if not self.is_api:
+            self.destination = Path(destination)
+
+        # Console capture
+        self.original_stdout = sys.stdout
+        self.original_stderr = sys.stderr
+        self.log_queue = queue.Queue(maxsize=1000)
+        self.active = False
+        self.worker_thread = None
+        self._log_handler = None  # handler attached to the "ultralytics" logger while capturing
+
+        # State tracking
+        self.last_line = ""
+        self.last_time = 0.0
+        self.last_progress_line = ""  # Track last progress line for deduplication
+        self.last_was_progress = False  # Track if last line was a progress bar
+
+    def start_capture(self):
+        """Start capturing console output and redirect stdout/stderr to custom capture objects."""
+        if self.active:
+            return
+
+        self.active = True
+        sys.stdout = self._ConsoleCapture(self.original_stdout, self._queue_log)
+        sys.stderr = self._ConsoleCapture(self.original_stderr, self._queue_log)
+
+        # Hook Ultralytics logger (best effort); keep a reference so stop_capture() can detach it again
+        try:
+            self._log_handler = self._LogHandler(self._queue_log)
+            logging.getLogger("ultralytics").addHandler(self._log_handler)
+        except Exception:
+            self._log_handler = None
+
+        self.worker_thread = threading.Thread(target=self._stream_worker, daemon=True)
+        self.worker_thread.start()
+
+    def stop_capture(self):
+        """
+        Stop capturing console output and restore original stdout/stderr.
+
+        Lines that are already queued are still written by the worker thread, which exits once it reaches the
+        sentinel queued here. Callers that need the output flushed can join `worker_thread` afterwards.
+        """
+        if not self.active:
+            return
+
+        self.active = False
+        sys.stdout = self.original_stdout
+        sys.stderr = self.original_stderr
+
+        # Detach the logging hook, otherwise every start/stop cycle leaves one more handler on the logger
+        if self._log_handler is not None:
+            logging.getLogger("ultralytics").removeHandler(self._log_handler)
+            self._log_handler = None
+
+        # Sentinel: the worker keeps running until it consumes this, so the put cannot block forever on a full queue
+        self.log_queue.put(None)
+
+    def _queue_log(self, text):
+        """
+        Queue console text with deduplication and timestamp processing.
+
+        Args:
+            text (str): Raw text as written to stdout/stderr or formatted by the logging handler.
+        """
+        if not self.active:
+            return
+
+        current_time = time.time()
+
+        # Handle carriage returns: only the text after the last "\r" is kept
+        if "\r" in text:
+            text = text.split("\r")[-1]
+
+        lines = text.split("\n")
+        if lines and lines[-1] == "":
+            lines.pop()
+
+        for line in lines:
+            line = line.rstrip()
+
+            # Skip any line that still contains thin bar segments; it is treated as a partial progress update
+            if "─" in line:
+                continue
+
+            # Deduplicate completed progress bars only if they match the previous progress line
+            if " ━━" in line:
+                progress_core = line.split(" ━━")[0].strip()
+                if progress_core == self.last_progress_line and self.last_was_progress:
+                    continue
+                self.last_progress_line = progress_core
+                self.last_was_progress = True
+            else:
+                # Skip empty line after progress bar
+                if not line and self.last_was_progress:
+                    self.last_was_progress = False
+                    continue
+                self.last_was_progress = False
+
+            # General deduplication: identical line repeated within 0.1 s
+            if line == self.last_line and current_time - self.last_time < 0.1:
+                continue
+
+            self.last_line = line
+            self.last_time = current_time
+
+            # Add timestamp if needed
+            if not line.startswith("[20"):
+                timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                line = f"[{timestamp}] {line}"
+
+            # Queue with overflow protection; a line that cannot be queued is simply dropped
+            self._safe_put(f"{line}\n")
+
+    def _safe_put(self, item):
+        """
+        Safely put item in queue with overflow handling.
+
+        Args:
+            item (str): Formatted log line to enqueue.
+
+        Returns:
+            (bool): True if the item was queued, False if it had to be dropped.
+        """
+        try:
+            self.log_queue.put_nowait(item)
+            return True
+        except queue.Full:
+            try:
+                self.log_queue.get_nowait()  # Drop oldest
+                self.log_queue.put_nowait(item)
+                return True
+            except (queue.Empty, queue.Full):
+                # Another thread emptied or refilled the queue in between; give up on this item
+                return False
+
+    def _stream_worker(self):
+        """Background worker for streaming logs to destination until the stop sentinel (None) is consumed."""
+        # Exit only on the sentinel so lines queued before stop_capture() are still written and no stale
+        # sentinel is left behind to terminate the worker of a later capture session.
+        for log_text in iter(self.log_queue.get, None):
+            self._write_log(log_text)
+
+    def _write_log(self, text):
+        """
+        Write log to API endpoint or local file destination.
+
+        Args:
+            text (str): Formatted log line, including its trailing newline.
+        """
+        try:
+            if self.is_api:
+                import requests  # scoped as slow import
+
+                payload = {"timestamp": datetime.now().isoformat(), "message": text.strip()}
+                requests.post(str(self.destination), json=payload, timeout=5)
+            else:
+                self.destination.parent.mkdir(parents=True, exist_ok=True)
+                with self.destination.open("a", encoding="utf-8") as f:
+                    f.write(text)
+        except Exception as e:
+            # Report on the real stderr so the error is not fed back into the capture pipeline
+            print(f"Platform logging error: {e}", file=self.original_stderr)
+
+    class _ConsoleCapture:
+        """Lightweight stdout/stderr capture that forwards writes to the original stream and to a callback."""
+
+        __slots__ = ("original", "callback")
+
+        def __init__(self, original, callback):
+            """Store the wrapped stream and the callable that receives every written text chunk."""
+            self.original = original
+            self.callback = callback
+
+        def write(self, text):
+            """Write text to the original stream, pass it to the callback and return the stream's result."""
+            written = self.original.write(text)
+            self.callback(text)
+            return written
+
+        def flush(self):
+            """Flush the original stream."""
+            self.original.flush()
+
+        def __getattr__(self, name):
+            """Delegate any other stream attribute (isatty, encoding, fileno, ...) to the original stream."""
+            # object.__getattribute__ avoids infinite recursion if the `original` slot is not set yet
+            return getattr(object.__getattribute__(self, "original"), name)
+
+    class _LogHandler(logging.Handler):
+        """Lightweight logging handler that forwards formatted records to a callback."""
+
+        __slots__ = ("callback",)
+
+        def __init__(self, callback):
+            """Store the callable that receives each formatted record followed by a newline."""
+            super().__init__()
+            self.callback = callback
+
+        def emit(self, record):
+            """Format the record and hand it to the callback; failures go through logging's handleError()."""
+            try:
+                self.callback(self.format(record) + "\n")
+            except Exception:
+                self.handleError(record)
+
+
+class SystemLogger:
+    """
+    Log dynamic system metrics for training monitoring.
+
+    Captures real-time system metrics including CPU, RAM, disk I/O, network I/O, and NVIDIA GPU statistics for
+    training performance monitoring and analysis.
+
+    Attributes:
+        pynvml: NVIDIA pynvml module instance if successfully imported, None otherwise.
+        nvidia_initialized (bool): Whether NVIDIA GPU monitoring is available and initialized.
+        net_start: Initial network I/O counters for calculating cumulative usage (None if psutil reports none).
+        disk_start: Initial disk I/O counters for calculating cumulative usage (None if psutil reports none).
+
+    Examples:
+        Basic usage:
+        >>> logger = SystemLogger()
+        >>> metrics = logger.get_metrics()
+        >>> print(f"CPU: {metrics['cpu']}%, RAM: {metrics['ram']}%")
+        >>> if metrics["gpus"]:
+        ...     gpu0 = metrics["gpus"]["0"]
+        ...     print(f"GPU0: {gpu0['usage']}% usage, {gpu0['temp']}°C")
+
+        Training loop integration:
+        >>> system_logger = SystemLogger()
+        >>> for epoch in range(epochs):
+        ...     # Training code here
+        ...     metrics = system_logger.get_metrics()
+        ...     # Log to database/file
+    """
+
+    def __init__(self):
+        """Initialize the system logger and record the baseline network and disk I/O counters."""
+        import psutil  # scoped as slow import
+
+        self.pynvml = None
+        self.nvidia_initialized = self._init_nvidia()
+        self.net_start = psutil.net_io_counters()
+        self.disk_start = psutil.disk_io_counters()
+
+    def _init_nvidia(self):
+        """
+        Initialize NVIDIA GPU monitoring with pynvml.
+
+        Returns:
+            (bool): True if pynvml was imported and nvmlInit() succeeded, False otherwise (always False on macOS).
+        """
+        # Explicit check instead of `assert`, which is stripped when Python runs with -O
+        if MACOS:
+            return False
+        try:
+            check_requirements("nvidia-ml-py>=12.0.0")
+            import pynvml  # scoped: only importable once the requirement above is satisfied
+
+            self.pynvml = pynvml
+            pynvml.nvmlInit()
+            return True
+        except Exception:
+            return False
+
+    @staticmethod
+    def _delta_mb(current, start, field):
+        """
+        Return the growth of one I/O counter between two psutil snapshots in MB.
+
+        Args:
+            current: Latest psutil counters object, or None if unavailable.
+            start: Baseline psutil counters object, or None if unavailable.
+            field (str): Name of the byte counter attribute to compare, e.g. 'read_bytes'.
+
+        Returns:
+            (float): Difference in MB rounded to 3 decimals, or 0.0 when either snapshot is missing.
+        """
+        if current is None or start is None:
+            return 0.0
+        return round((getattr(current, field) - getattr(start, field)) / (1 << 20), 3)
+
+    def get_metrics(self):
+        """
+        Get current system metrics.
+
+        Collects comprehensive system metrics including CPU usage, RAM usage, disk I/O statistics,
+        network I/O statistics, and GPU metrics (if available). Example output:
+
+        ```python
+        metrics = {
+            "cpu": 45.2,
+            "ram": 78.9,
+            "disk": {"read_mb": 156.7, "write_mb": 89.3, "used_gb": 256.8},
+            "network": {"recv_mb": 157.2, "sent_mb": 89.1},
+            "gpus": {
+                "0": {"usage": 95.6, "memory": 85.4, "temp": 72, "power": 285},
+                "1": {"usage": 94.1, "memory": 82.7, "temp": 70, "power": 278},
+            },
+        }
+        ```
+
+        - cpu (float): CPU usage percentage (0-100%)
+        - ram (float): RAM usage percentage (0-100%)
+        - disk (dict):
+            - read_mb (float): Cumulative disk read in MB since initialization (0.0 if counters are unavailable)
+            - write_mb (float): Cumulative disk write in MB since initialization (0.0 if counters are unavailable)
+            - used_gb (float): Total disk space used in GB on the filesystem containing "/"
+        - network (dict):
+            - recv_mb (float): Cumulative network received in MB since initialization (0.0 if unavailable)
+            - sent_mb (float): Cumulative network sent in MB since initialization (0.0 if unavailable)
+        - gpus (dict): GPU metrics keyed by device index as a string (e.g., "0", "1") containing:
+            - usage (int): GPU utilization percentage (0-100%)
+            - memory (float): CUDA memory usage percentage (0-100%)
+            - temp (int): GPU temperature in degrees Celsius
+            - power (int): GPU power consumption in watts
+
+        Returns:
+            metrics (dict): System metrics containing 'cpu', 'ram', 'disk', 'network', 'gpus' with respective usage data.
+        """
+        import psutil  # scoped as slow import
+
+        # Either counters call may return None (no disks / no network interfaces), handled by _delta_mb()
+        net = psutil.net_io_counters()
+        disk = psutil.disk_io_counters()
+        memory = psutil.virtual_memory()
+        disk_usage = shutil.disk_usage("/")
+
+        metrics = {
+            "cpu": round(psutil.cpu_percent(), 3),
+            "ram": round(memory.percent, 3),
+            "disk": {
+                "read_mb": self._delta_mb(disk, self.disk_start, "read_bytes"),
+                "write_mb": self._delta_mb(disk, self.disk_start, "write_bytes"),
+                "used_gb": round(disk_usage.used / (1 << 30), 3),
+            },
+            "network": {
+                "recv_mb": self._delta_mb(net, self.net_start, "bytes_recv"),
+                "sent_mb": self._delta_mb(net, self.net_start, "bytes_sent"),
+            },
+            "gpus": {},
+        }
+
+        # Add GPU metrics (NVIDIA only)
+        if self.nvidia_initialized:
+            metrics["gpus"].update(self._get_nvidia_metrics())
+
+        return metrics
+
+    def _get_nvidia_metrics(self):
+        """
+        Get NVIDIA GPU metrics including utilization, memory, temperature, and power.
+
+        Returns:
+            (dict): Mapping of device index (as a string) to a dict with 'usage', 'memory', 'temp' and 'power'.
+                Devices whose queries fail are omitted; the dict is empty when NVML is unavailable.
+        """
+        gpus = {}
+        if not self.nvidia_initialized or not self.pynvml:
+            return gpus
+        try:
+            device_count = self.pynvml.nvmlDeviceGetCount()
+        except Exception:
+            return gpus
+
+        for i in range(device_count):
+            # Query each device independently so one failing GPU does not hide the others
+            try:
+                handle = self.pynvml.nvmlDeviceGetHandleByIndex(i)
+                util = self.pynvml.nvmlDeviceGetUtilizationRates(handle)
+                memory = self.pynvml.nvmlDeviceGetMemoryInfo(handle)
+                temp = self.pynvml.nvmlDeviceGetTemperature(handle, self.pynvml.NVML_TEMPERATURE_GPU)
+                power = self.pynvml.nvmlDeviceGetPowerUsage(handle) // 1000  # NVML reports milliwatts
+
+                gpus[str(i)] = {
+                    "usage": round(util.gpu, 3),
+                    "memory": round((memory.used / memory.total) * 100, 3),
+                    "temp": temp,
+                    "power": power,
+                }
+            except Exception:
+                continue
+        return gpus
+
+
+if __name__ == "__main__":
+    # Manual check: redraw a plain-text dashboard of the collected metrics once per second until Ctrl+C
+    print("SystemLogger Real-time Metrics Monitor")
+    print("Press Ctrl+C to stop\n")
+
+    monitor = SystemLogger()
+
+    try:
+        while True:
+            metrics = monitor.get_metrics()
+
+            # System metrics, one entry per output line
+            report = [
+                f"CPU: {metrics['cpu']:5.1f}%",
+                f"RAM: {metrics['ram']:5.1f}%",
+                f"Disk Read: {metrics['disk']['read_mb']:8.1f} MB",
+                f"Disk Write: {metrics['disk']['write_mb']:7.1f} MB",
+                f"Disk Used: {metrics['disk']['used_gb']:8.1f} GB",
+                f"Net Recv: {metrics['network']['recv_mb']:9.1f} MB",
+                f"Net Sent: {metrics['network']['sent_mb']:9.1f} MB",
+            ]
+
+            # GPU section: one line per device, or a notice when none were reported
+            if not metrics["gpus"]:
+                report.append("\nGPU: No NVIDIA GPUs detected")
+            else:
+                report.append("\nGPU Metrics:")
+                report.extend(
+                    f"  GPU {gpu_id}: {gpu_data['usage']:3}% | "
+                    f"Mem: {gpu_data['memory']:5.1f}% | "
+                    f"Temp: {gpu_data['temp']:2}°C | "
+                    f"Power: {gpu_data['power']:3}W"
+                    for gpu_id, gpu_data in metrics["gpus"].items()
+                )
+
+            # Escape sequence clears the screen (works on most terminals) before the report is printed
+            print("\033[H\033[J" + "\n".join(report))
+
+            time.sleep(1)
+
+    except KeyboardInterrupt:
+        print("\n\nStopped monitoring.")