# Source code for composer.callbacks.system_metrics_monitor

# Copyright 2022 MosaicML Composer authors
# SPDX-License-Identifier: Apache-2.0

"""System metrics monitor callback."""

from __future__ import annotations

import logging
import os

import psutil

from composer.core import Callback, Event, State
from composer.loggers import Logger
from composer.utils import dist
from composer.utils.import_helpers import MissingConditionalImportError

log = logging.getLogger(__name__)

__all__ = ['SystemMetricsMonitor']


class SystemMetricsMonitor(Callback):
    """Track system metrics.

    At the start of every training, evaluation and prediction batch this
    callback samples host metrics through ``psutil`` (CPU utilization, virtual
    memory, disk usage of the filesystem root, network I/O counters) and, when
    ``gpu_available`` is ``True``, per-device metrics through ``pynvml``
    (memory total/free/used, utilization rates, temperature). The samples are
    gathered with :func:`dist.all_gather_object`, merged into a single flat
    dictionary and passed to ``logger.log_metrics``.

    Logged keys:

    * ``device{global_rank}_memory_total`` / ``_memory_free`` / ``_memory_used``
    * ``device{global_rank}_gpu_percentage`` / ``_memory_percentage``
    * ``device{global_rank}_gpu_temperature``
    * ``cpu_percentage``
    * ``cpu_memory_{field}`` for every field of ``psutil.virtual_memory()``
    * ``disk_memory_{field}`` for every field of ``psutil.disk_usage(os.sep)``
    * ``network_{field}`` for every field of ``psutil.net_io_counters()``

    Args:
        gpu_available (bool, optional): Whether to also collect GPU metrics via
            ``pynvml``. Default: ``False``.

    Raises:
        MissingConditionalImportError: If ``gpu_available`` is ``True`` and
            ``pynvml`` cannot be imported.
    """

    def __init__(self, gpu_available: bool = False) -> None:
        super().__init__()
        self.gpu_available = gpu_available
        if self.gpu_available:
            try:
                import pynvml
            except ImportError as e:
                raise MissingConditionalImportError(extra_deps_group='pynvml',
                                                    conda_package='pynvml',
                                                    conda_channel='conda-forge') from e
            # NVML must be initialized before any device handle is requested
            # in ``compute_system_metrics``.
            pynvml.nvmlInit()

    def run_event(self, event: Event, state: State, logger: Logger) -> None:
        """Collect, gather and log system metrics on batch-start events.

        Args:
            event (Event): The event being dispatched. Only ``BATCH_START``,
                ``EVAL_BATCH_START`` and ``PREDICT_BATCH_START`` trigger logging;
                every other event is ignored.
            state (State): The training state (unused).
            logger (Logger): Logger that receives the merged metrics via
                ``log_metrics``.
        """
        monitored_events = (
            Event.BATCH_START,
            Event.EVAL_BATCH_START,
            Event.PREDICT_BATCH_START,
        )
        if event not in monitored_events:
            return

        local_metrics = self.compute_system_metrics()
        # Presumably yields one metrics dict per participating rank -- the
        # result is only iterated here, each element being treated as a dict.
        gathered_metrics = dist.all_gather_object(local_metrics)

        # Flatten into one dict. On key collisions the later entry wins.
        # NOTE(review): the host-wide keys (``cpu_*``, ``disk_*``, ``network_*``)
        # carry no rank in their name, so only the last gathered value of each
        # survives the merge; ``device{global_rank}_*`` keys embed the rank.
        merged_metrics = {}
        for rank_metrics in gathered_metrics:
            merged_metrics.update(rank_metrics)
        logger.log_metrics(merged_metrics)

    def compute_system_metrics(self) -> dict:
        """Sample the metrics visible to the current process.

        Returns:
            dict: A flat mapping from metric name to value. GPU entries are
            included only when ``self.gpu_available`` is truthy.
        """
        system_metrics = {}

        # Get metrics for this device if available
        if self.gpu_available:
            import pynvml

            local_rank = dist.get_local_rank()
            global_rank = dist.get_global_rank()
            # The NVML handle is looked up by *local* rank while the metric
            # names use the *global* rank.
            # NOTE(review): assumes local rank i corresponds to NVML device
            # index i -- TODO confirm for setups that remap visible devices.
            handle = pynvml.nvmlDeviceGetHandleByIndex(local_rank)
            prefix = f'device{global_rank}'

            memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
            system_metrics[f'{prefix}_memory_total'] = memory.total
            system_metrics[f'{prefix}_memory_free'] = memory.free
            system_metrics[f'{prefix}_memory_used'] = memory.used

            device_utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
            system_metrics[f'{prefix}_gpu_percentage'] = device_utilization.gpu
            system_metrics[f'{prefix}_memory_percentage'] = device_utilization.memory

            temperature = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
            system_metrics[f'{prefix}_gpu_temperature'] = temperature

        # Get metrics for the system
        system_metrics['cpu_percentage'] = psutil.cpu_percent()

        # Each psutil result below is a namedtuple; every field becomes one
        # metric under the corresponding prefix.
        for field, value in psutil.virtual_memory()._asdict().items():
            system_metrics[f'cpu_memory_{field}'] = value

        # ``os.sep`` is the filesystem root on POSIX systems.
        for field, value in psutil.disk_usage(os.sep)._asdict().items():
            system_metrics[f'disk_memory_{field}'] = value

        for field, value in psutil.net_io_counters()._asdict().items():
            system_metrics[f'network_{field}'] = value

        return system_metrics