Source code for composer.callbacks.system_metrics_monitor

# Copyright 2022 MosaicML Composer authors
# SPDX-License-Identifier: Apache-2.0

"""System metrics monitor callback."""

from __future__ import annotations

import logging
import os

import psutil
import torch

from composer.core import Callback, Event, State
from composer.loggers import Logger
from composer.utils import dist
from composer.utils.import_helpers import MissingConditionalImportError

log = logging.getLogger(__name__)

__all__ = ['SystemMetricsMonitor']

_GPU_METRICS = [
    'gpu_percentage',
    'memory_percentage',
    'gpu_temperature_C',
    'gpu_power_usage_W',
]
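# Per-device metrics collected on every rank in compute_system_metrics(); run_event()
# logs either each rank's value (log_all_data=True) or only the min/max across ranks.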


class SystemMetricsMonitor(Callback):
    """Logs GPU/CPU metrics.

    GPU Metrics:
        gpu_percentage: Occupancy rate, percent of time over the sampling period during
            which one or more kernels was executing on the GPU.
        memory_percentage: Percent of time over the sampling period during which global
            memory was being read or written.
        gpu_temperature_C: Temperature of the device, in Celsius.
        gpu_power_usage_W: Power usage of the device, in Watts.

    By default, only the maximum and minimum values for these metrics, alongside their
    respective ranks in the key names, are logged on the :attr:`.Event.BATCH_START`,
    :attr:`.Event.EVAL_BATCH_START`, and :attr:`.Event.PREDICT_BATCH_START` events for
    every batch. If ``log_all_data`` is set to True, all values for these metrics across
    all ranks are logged on the above events for every batch.

    Example:
        .. doctest::

            >>> from composer import Trainer
            >>> from composer.callbacks import SystemMetricsMonitor
            >>> # constructing trainer object with this callback
            >>> trainer = Trainer(
            ...     model=model,
            ...     train_dataloader=train_dataloader,
            ...     eval_dataloader=eval_dataloader,
            ...     optimizers=optimizer,
            ...     max_duration='1ep',
            ...     callbacks=[SystemMetricsMonitor()],
            ... )

    Args:
        log_all_data (bool, optional): If True, log the data for all ranks, not just the
            min/max across ranks. Defaults to False.
    """

    def __init__(self, log_all_data: bool = False) -> None:
        super().__init__()
        self.gpu_available = torch.cuda.is_available()
        self.log_all_data = log_all_data
        if self.gpu_available:
            try:
                import pynvml
            except ImportError as e:
                raise MissingConditionalImportError(
                    extra_deps_group='pynvml',
                    conda_package='pynvml',
                    conda_channel='conda-forge',
                ) from e
            pynvml.nvmlInit()

    def run_event(self, event: Event, state: State, logger: Logger):
        # only run on the following events
        if event in [
            Event.BATCH_START,
            Event.EVAL_BATCH_START,
            Event.PREDICT_BATCH_START,
        ]:
            local_node_system_metrics = self.compute_system_metrics()
            all_system_metrics = dist.all_gather_object(local_node_system_metrics)
            system_metrics = {}

            if self.log_all_data:
                for rank, metrics in enumerate(all_system_metrics):
                    for key, value in metrics.items():
                        if key in _GPU_METRICS:
                            system_metrics[f'{key}_rank_{rank}'] = value
                        else:
                            system_metrics[key] = value
            else:
                system_metrics = self.compute_gpu_min_max_metrics(all_system_metrics, state)
                for rank, metrics in enumerate(all_system_metrics):
                    for key, value in metrics.items():
                        if key not in _GPU_METRICS:
                            system_metrics[key] = value

            logger.log_metrics(system_metrics)

    def compute_system_metrics(self):
        system_metrics = {}

        # Get metrics for this device if available
        if self.gpu_available:
            import pynvml
            local_rank = dist.get_local_rank()
            handle = pynvml.nvmlDeviceGetHandleByIndex(local_rank)
            device_utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
            system_metrics['gpu_percentage'] = device_utilization.gpu
            system_metrics['memory_percentage'] = device_utilization.memory
            temperature = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
            system_metrics['gpu_temperature_C'] = temperature
            power = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0  # convert from mW to W
            system_metrics['gpu_power_usage_W'] = power

        # Get metrics for the host system
        cpu_percent = psutil.cpu_percent()
        system_metrics['cpu_percentage'] = cpu_percent
        system_memory = psutil.virtual_memory()._asdict()
        for k, v in system_memory.items():
            system_metrics[f'cpu_memory_{k}'] = v
        disk_usage = psutil.disk_usage(os.sep)._asdict()
        for k, v in disk_usage.items():
            system_metrics[f'disk_memory_{k}'] = v
        network_usage = psutil.net_io_counters()._asdict()
        for k, v in network_usage.items():
            system_metrics[f'network_{k}'] = v
        return system_metrics

    def compute_gpu_min_max_metrics(self, all_metrics, state):
        min_max_metrics = {}

        if self.gpu_available:
            for key in _GPU_METRICS:
                values = torch.tensor([metrics_for_cur_rank[key] for metrics_for_cur_rank in all_metrics])
                values = state.device.tensor_to_device(values)
                min_rank = int(torch.argmin(values).item())
                max_rank = int(torch.argmax(values).item())
                min_max_metrics[f'min_{key}_rank_{min_rank}'] = values[min_rank].item()
                min_max_metrics[f'max_{key}_rank_{max_rank}'] = values[max_rank].item()

        return min_max_metrics
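

# --------------------------------------------------------------------------- #
# Illustrative sketch (not part of the module): the shape of the metric dict
# passed to logger.log_metrics() on each batch, assuming a hypothetical 2-GPU
# run. Values are placeholders; the exact psutil field names vary by platform.
#
# Default (min/max) mode:
#   {
#       'min_gpu_percentage_rank_0': 31, 'max_gpu_percentage_rank_1': 92,
#       'min_gpu_temperature_C_rank_0': 44, 'max_gpu_temperature_C_rank_1': 67,
#       'cpu_percentage': 12.5,
#       'cpu_memory_percent': 38.1,      # from psutil.virtual_memory()
#       'disk_memory_percent': 61.0,     # from psutil.disk_usage(os.sep)
#       'network_bytes_sent': 123456,    # from psutil.net_io_counters()
#   }
#
# With log_all_data=True, every rank's GPU values are logged instead:
#   {
#       'gpu_percentage_rank_0': 31, 'gpu_percentage_rank_1': 92,
#       'memory_percentage_rank_0': 18, 'memory_percentage_rank_1': 27,
#       ...
#   }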