Source code for composer.profiler.torch_profiler

# Copyright 2022 MosaicML Composer authors
# SPDX-License-Identifier: Apache-2.0

"""Profiler to collect :mod:`torch` performance metrics during training."""

from __future__ import annotations

import json
import logging
import os
import textwrap
from typing import TYPE_CHECKING, Optional, OrderedDict

import torch.cuda
import torch.profiler
from packaging import version
from torch.profiler.profiler import ProfilerAction as TorchProfilerAction

from composer.core.callback import Callback
from composer.loggers import Logger
from composer.profiler.profiler_action import ProfilerAction
from composer.utils import (
    FORMAT_NAME_WITH_DIST_AND_TIME_TABLE,
    FORMAT_NAME_WITH_DIST_TABLE,
    dist,
    ensure_folder_is_empty,
    format_name_with_dist,
    format_name_with_dist_and_time,
)

if TYPE_CHECKING:
    from composer.core import State

__all__ = ['TorchProfiler']

log = logging.getLogger(__name__)


class TorchProfiler(Callback):  # noqa: D101

    __doc__ = f"""Profile the execution using the :class:`PyTorch Profiler <torch.profiler.profile>`.

    Profiling results are stored in TensorBoard format in the directory specified by ``folder``.

    .. note::

        The Composer :class:`.Trainer` automatically creates an instance of this :class:`.TorchProfiler` callback
        whenever any of the PyTorch Profiler arguments (``torch_prof_record_shapes``, ``torch_prof_profile_memory``,
        ``torch_prof_with_stack``, or ``torch_prof_with_flops``) are enabled.

        When using the Composer :class:`.Trainer`, one does not need to directly create an instance of this
        :class:`.TorchProfiler` callback.

    To view profiling results, run::

        pip install tensorboard torch_tb_profiler
        tensorboard --logdir path/to/torch/trace_folder

    .. note::

        See :doc:`profiler` for additional usage details on :class:`torch.profiler.profile`.

    .. note::

        Enabling shape and stack tracing results in additional overhead. When ``record_shapes=True`` is specified,
        the profiler will temporarily hold references to tensors, which may prevent certain optimizations that
        depend on the reference count and can introduce extra tensor copies.

    Args:
        folder (str, optional): Format string for the folder containing the Torch Profiler trace files.
            Defaults to ``'{{run_name}}/torch_traces'``.

            The following format variables are available:

            {textwrap.indent(FORMAT_NAME_WITH_DIST_TABLE, prefix='            ')}

            For example, if the ``run_name`` is ``'awesome_training_run'``, and the default ``folder`` of
            ``'{{run_name}}/torch_traces'`` is used, Torch Profiler traces will be stored in
            ``'awesome_training_run/torch_traces'``.
        filename (str, optional): A format string describing how to name Torch Profiler trace files.
            Defaults to ``'rank{{rank}}.{{batch}}.pt.trace.json'``.

            At the end of each batch where :meth:`~composer.profiler.Profiler.get_action` returns
            :attr:`~composer.profiler._profiler_action.ProfilerAction.ACTIVE_AND_SAVE`, trace files are saved
            approximately to ``{{folder.format(...)}}/{{filename.format(...)}}``.

            The following format variables are available:

            {textwrap.indent(FORMAT_NAME_WITH_DIST_AND_TIME_TABLE, prefix='            ')}

            Consider the following scenario, where:

            *   The :attr:`~.State.run_name` is ``'awesome-training-run'``.
            *   The default ``folder='{{run_name}}/torch_traces'`` is used.
            *   The default ``filename='rank{{rank}}.{{batch}}.pt.trace.json'`` is used.
            *   The current epoch count is ``1``.
            *   The current batch count is ``42``.

            Each rank (process) will save traces to::

                awesome-training-run/torch_traces/rank0.42.pt.trace.json
                awesome-training-run/torch_traces/rank1.42.pt.trace.json
                awesome-training-run/torch_traces/rank2.42.pt.trace.json
                ...

        remote_file_name (str, optional): Format string for a Torch Profiler trace file's remote file name.
            Defaults to ``'{{run_name}}/torch_traces/rank{{rank}}.{{batch}}.pt.trace.json'``.

            Whenever a trace file is saved, it is also uploaded as a file according to this format string.
            The same format variables as for ``filename`` are available.

            .. seealso:: :doc:`Uploading Files</trainer/file_uploading>` for notes on file uploading.

            Leading slashes (``'/'``) will be stripped.

            To disable uploading trace files, set this parameter to ``None``.
        memory_filename (str, optional): A format string describing how to name Torch Profiler memory trace files.
            Defaults to None. An example ``memory_filename`` is ``'rank{{rank}}.{{batch}}.pt.trace.memory.html'``.
            At the end of each batch where :meth:`~composer.profiler.Profiler.get_action` returns
            :attr:`~composer.profiler._profiler_action.ProfilerAction.ACTIVE_AND_SAVE`, trace files are saved
            approximately to ``{{folder.format(...)}}/{{memory_filename.format(...)}}``.

            The following format variables are available:

            {textwrap.indent(FORMAT_NAME_WITH_DIST_AND_TIME_TABLE, prefix='            ')}

            Consider the following scenario, where:

            *   The :attr:`~.State.run_name` is ``'awesome-training-run'``.
            *   The default ``folder='{{run_name}}/torch_traces'`` is used.
            *   The example ``memory_filename='rank{{rank}}.{{batch}}.pt.trace.memory.html'`` is used.
            *   The current epoch count is ``1``.
            *   The current batch count is ``42``.

            Each rank (process) will save traces to::

                awesome-training-run/torch_traces/rank0.42.pt.trace.memory.html
                awesome-training-run/torch_traces/rank1.42.pt.trace.memory.html
                awesome-training-run/torch_traces/rank2.42.pt.trace.memory.html
                ...

        memory_remote_file_name (str, optional): Format string for a Torch Profiler memory trace file's remote file name.
            Defaults to ``'{{run_name}}/torch_memory_traces/rank{{rank}}.{{batch}}.pt.trace.memory.html'``.

            Whenever a trace file is saved, it is also uploaded as a file according to this format string.
            The same format variables as for ``filename`` are available.

            .. seealso:: :doc:`Uploading Files</trainer/file_uploading>` for notes on file uploading.

            Leading slashes (``'/'``) will be stripped.

            To disable uploading trace files, set this parameter to ``None``.
        overwrite (bool, optional): Whether to overwrite existing Torch Profiler traces. Defaults to False.

            If False, then the trace folder as determined by ``folder`` must be empty.
        use_gzip (bool, optional): Whether to use gzip for the trace. Defaults to False.

            If True, ``'.gz'`` will be appended to ``filename`` and ``remote_file_name``
            (if they do not already end in ``'.gz'``).
        record_shapes (bool, optional): Whether to record tensor shapes. Defaults to False.
        profile_memory (bool, optional): Whether to profile memory. Defaults to True.
        with_stack (bool, optional): Whether to record stack info. Defaults to False.
        with_flops (bool, optional): Whether to estimate flops for operators. Defaults to True.
        num_traces_to_keep (int, optional): The number of trace files to keep locally. Defaults to -1.

            If set to -1, then all trace files are kept locally.

            After a trace has been saved and uploaded, the oldest traces are removed until ``num_traces_to_keep``
            traces remain. This parameter only controls how many traces are kept locally; traces are not deleted
            from remote file systems.

            It can be useful to set this parameter to ``0`` when using a remote file uploader such as the
            :class:`.RemoteUploaderDownloader`. This combination will minimize local disk usage by deleting
            trace files immediately after they have been uploaded to the object store.

    Attributes:
        saved_traces (list[tuple[Timestamp, list[pathlib.Path]]]): The trace timestamps and filepaths.

            This list contains tuples of the save timestamp and the trace filepaths.
            This list will have at most ``num_traces_to_keep`` entries. The latest trace will be at the end.

            The index of a filepath in each list corresponds to the global rank of the process that wrote that file.
            Each filepath is valid only on the process's (rank's) node.
    """
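
    # Lifecycle: ``init`` builds and enters a :class:`torch.profiler.profile` whose ``schedule`` and
    # ``on_trace_ready`` hooks defer to the Composer Profiler (see ``scheduler_fn`` and ``handler_fn`` below);
    # ``batch_start`` logs the profiler's current action, ``batch_end`` attaches the global rank as metadata
    # and steps the profiler, and ``close`` logs summary tables before exiting the profiler.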
""" def __init__( self, folder: str = '{run_name}/torch_traces', filename: str = 'rank{rank}.{batch}.pt.trace.json', remote_file_name: Optional[str] = '{run_name}/torch_traces/rank{rank}.{batch}.pt.trace.json', memory_filename: Optional[str] = None, memory_remote_file_name: Optional[str] = ( '{run_name}/torch_memory_traces/' 'rank{rank}.{batch}.pt.trace.memory.html' ), overwrite: bool = False, use_gzip: bool = False, record_shapes: bool = False, profile_memory: bool = True, with_stack: bool = False, with_flops: bool = True, num_traces_to_keep: int = -1, ) -> None: self.overwrite = overwrite self.folder = folder if use_gzip: if not filename.endswith('.gz'): filename += '.gz' self.filename = filename if use_gzip: if remote_file_name is not None and not remote_file_name.endswith('.gz'): remote_file_name += '.gz' self.remote_file_name = remote_file_name if memory_filename is not None: assert memory_filename.endswith('.html'), f'memory_filename must end with .html, got {memory_filename}' self.memory_filename = memory_filename if memory_remote_file_name is not None: assert memory_remote_file_name.endswith( '.html', ), f'memory_remote_file_name must end with .html, got {memory_remote_file_name}' self.memory_remote_file_name = memory_remote_file_name self.record_shapes = record_shapes self.profile_memory = profile_memory self.with_stack = with_stack self.with_flops = with_flops self.num_traces_to_keep = num_traces_to_keep self.saved_traces = OrderedDict() self.profiler: Optional[torch.profiler.profile] = None def init(self, state: State, logger: Logger) -> None: if state.profiler is None: raise RuntimeError(( 'The Composer Profiler was not enabled, which is required to use the ' f'{type(self).__name__}. To enable, set the `prof_schedule` argument of the Trainer.' )) folder_name = format_name_with_dist(self.folder, state.run_name) os.makedirs(folder_name, exist_ok=True) if not self.overwrite: ensure_folder_is_empty(folder_name) dist.barrier() def scheduler_fn(torch_profiler_step: int) -> TorchProfilerAction: del torch_profiler_step # the torch profiler step is unused. Using the composer timestamp instead. 
            assert state.profiler is not None
            composer_profiler_action = state.profiler.schedule(state)
            if composer_profiler_action == ProfilerAction.ACTIVE_AND_SAVE:
                return TorchProfilerAction.RECORD_AND_SAVE
            if composer_profiler_action == ProfilerAction.ACTIVE:
                return TorchProfilerAction.RECORD
            if composer_profiler_action == ProfilerAction.WARMUP:
                return TorchProfilerAction.WARMUP
            assert composer_profiler_action == ProfilerAction.SKIP, f'unexpected action: {composer_profiler_action}'
            return TorchProfilerAction.NONE

        def handler_fn(prof: torch.profiler.profiler.profile):

            assert state.profiler is not None

            timestamp = state.timestamp

            log.info(f'PyTorch Chrome trace profiler enabled: {self.filename if self.filename else False}')
            trace_file_name = os.path.join(
                folder_name,
                format_name_with_dist_and_time(self.filename, run_name=state.run_name, timestamp=timestamp),
            )
            trace_file_dirname = os.path.dirname(trace_file_name)
            if trace_file_dirname:
                os.makedirs(trace_file_dirname, exist_ok=True)
            prof.export_chrome_trace(trace_file_name)
            state.profiler.record_chrome_json_trace_file(trace_file_name)
            if self.remote_file_name is not None:
                trace_remote_file_name = format_name_with_dist_and_time(
                    self.remote_file_name,
                    run_name=state.run_name,
                    timestamp=timestamp,
                )
                trace_remote_file_name = trace_remote_file_name.lstrip('/')
                logger.upload_file(
                    remote_file_name=trace_remote_file_name,
                    file_path=trace_file_name,
                    overwrite=self.overwrite,
                )

            log.info(
                f'PyTorch memory timeline profiler enabled: {self.memory_filename if self.memory_filename else False}',
            )
            if self.memory_filename is not None:
                if version.parse(torch.__version__) > version.parse('2.1.0.dev'):  # type: ignore
                    # memory timeline profiling is only supported in torch v2.1.0-rc1 or higher
                    memory_trace_file_name = os.path.join(
                        folder_name,
                        format_name_with_dist_and_time(
                            self.memory_filename,
                            run_name=state.run_name,
                            timestamp=timestamp,
                        ),
                    )
                    log.debug(f'Saving memory trace to {memory_trace_file_name}')
                    memory_trace_file_dirname = os.path.dirname(memory_trace_file_name)
                    if memory_trace_file_dirname:
                        os.makedirs(memory_trace_file_dirname, exist_ok=True)
                    from composer.profiler.utils import export_memory_timeline_html
                    export_memory_timeline_html(
                        prof,
                        memory_trace_file_name,
                        torch.cuda.current_device(),  # type: ignore
                    )
                    log.debug(f'Uploaded memory trace to {self.memory_remote_file_name}')
                    if self.memory_remote_file_name is not None:
                        memory_trace_remote_file_name = format_name_with_dist_and_time(
                            self.memory_remote_file_name,
                            run_name=state.run_name,
                            timestamp=timestamp,
                        )
                        memory_trace_remote_file_name = memory_trace_remote_file_name.lstrip('/')
                        log.debug(
                            f'Uploading memory trace to {memory_trace_remote_file_name} from {memory_trace_file_name}',
                        )
                        logger.upload_file(
                            remote_file_name=memory_trace_remote_file_name,
                            file_path=memory_trace_file_name,
                            overwrite=self.overwrite,
                        )
                else:
                    log.warning('Memory timeline is supported after PyTorch 2.1.0. Skipping memory trace.')
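
            # When num_traces_to_keep is non-negative, rotate out the oldest locally saved traces;
            # each rank removes only the file it wrote, and remote uploads are left untouched.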
            if self.num_traces_to_keep >= 0:
                while len(self.saved_traces) > self.num_traces_to_keep:
                    # self.saved_traces is an ordered dict, so the zeroth item will be the oldest checkpoint
                    timestamp, filepaths = next(iter(self.saved_traces.items()))
                    if dist.get_global_rank() < len(filepaths):
                        # Remove this rank's checkpoint
                        os.remove(filepaths[dist.get_global_rank()])
                    del self.saved_traces[timestamp]

        self.profiler = torch.profiler.profile(
            schedule=scheduler_fn,
            on_trace_ready=handler_fn,
            record_shapes=self.record_shapes,
            profile_memory=self.profile_memory,
            with_stack=self.with_stack,
            with_flops=self.with_flops,
        )
        self.profiler.__enter__()

    def batch_end(self, state: State, logger: Logger) -> None:
        del state, logger  # unused
        assert self.profiler is not None
        self.profiler.add_metadata_json('global_rank', json.dumps(dist.get_global_rank()))
        self.profiler.step()

    def batch_start(self, state: State, logger: Logger) -> None:
        del state  # unused
        assert self.profiler is not None
        logger.log_traces({'profiler/state': self.profiler.current_action.name})

    def close(self, state: State, logger: Logger) -> None:
        del state, logger  # unused
        if self.profiler is not None and self.profiler.profiler is not None:
            log.info(self.profiler.key_averages().table(sort_by='cpu_time_total', row_limit=20))
            if self.profile_memory:
                log.info(self.profiler.key_averages().table(sort_by='self_cpu_memory_usage', row_limit=20))
            if torch.profiler.ProfilerActivity.CUDA in self.profiler.activities:
                log.info(self.profiler.key_averages().table(sort_by='cuda_time_total', row_limit=20))
                if self.profile_memory:
                    log.info(self.profiler.key_averages().table(sort_by='self_cuda_memory_usage', row_limit=20))
            self.profiler.__exit__(None, None, None)
            self.profiler = None
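

# Illustrative usage sketch: a minimal example of enabling this callback through the Composer
# Profiler. It assumes a Composer version where ``Trainer`` accepts a ``profiler`` argument and
# where the ``torch_prof_*`` options referenced in the docstring above are exposed on
# ``composer.profiler.Profiler``; verify the installed version's signatures before relying on the
# exact parameter names. ``model`` and ``train_dataloader`` are hypothetical placeholders.
def _example_usage(model, train_dataloader):  # pragma: no cover - documentation sketch only
    from composer import Trainer
    from composer.profiler import Profiler, cyclic_schedule

    trainer = Trainer(
        model=model,
        train_dataloader=train_dataloader,
        max_duration='2ep',
        profiler=Profiler(
            # Warm up for one batch, then record four batches per cycle.
            schedule=cyclic_schedule(wait=0, warmup=1, active=4, repeat=1),
            trace_handlers=[],
            # Enabling any torch_prof_* option causes the Trainer to attach a TorchProfiler callback.
            torch_prof_with_stack=True,
        ),
    )
    trainer.fit()
    # View the resulting traces (written under '{run_name}/torch_traces' by default) with:
    #   pip install tensorboard torch_tb_profiler
    #   tensorboard --logdir <run_name>/torch_traces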