# Copyright 2022 MosaicML Composer authors
# SPDX-License-Identifier: Apache-2.0
"""Utilities for working with training checkpoints."""
from __future__ import annotations
import contextlib
import fnmatch
import logging
import os
import shutil
import tarfile
import tempfile
import textwrap
import warnings
from importlib import import_module
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Optional, Union
import torch
from packaging import version
from torch.distributed import checkpoint as dist_cp
from torch.distributed._tensor import DeviceMesh
from torch.distributed.checkpoint.metadata import Metadata
from torch.distributed.checkpoint.optimizer import load_sharded_optimizer_state_dict
from torch.distributed.checkpoint.planner import LoadPlan, LoadPlanner
from torch.distributed.checkpoint.storage import StorageReader
from torch.distributed.distributed_c10d import ProcessGroup
from composer.utils import dist, reproducibility
from composer.utils.compression import get_compressor, is_compressed_pt
from composer.utils.file_helpers import (
FORMAT_NAME_WITH_DIST_AND_TIME_TABLE,
extract_path_from_symlink,
format_name_with_dist,
format_name_with_dist_and_time,
get_file,
is_tar,
is_uri,
maybe_create_object_store_from_uri,
parse_uri,
)
from composer.utils.misc import ParallelismType, partial_format
from composer.utils.object_store import ObjectStore
from composer.utils.retrying import retry
if TYPE_CHECKING:
from composer.core import AlgorithmPass, State
from composer.loggers import Logger, LoggerDestination
log = logging.getLogger(__name__)
__all__ = ['get_save_filename', 'load_checkpoint', 'save_checkpoint', 'download_checkpoint']
_COMPOSER_STATES_FILENAME = 'composer_states.pt'
_TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME = f'__{dist.get_global_rank()}_0.distcp'
_TORCH_DISTRIBUTED_CHECKPOINTS_METADATA_FILENAME = '.metadata'
def _get_checkpoint_validation_function(
) -> Optional[Callable[[Union[Path, str], Optional[list[tuple[int, int]]]], bool]]:
"""Get the validation function specified by the environment variable `CHECKPOINT_VALIDATION_FUNCTION`.
Returns:
Callable[[Union[Path, str], Optional[list[tuple[int, int]]]], bool]: The checkpoint validation function, which takes
a checkpoint path and, optionally, a list of (offset, length) specs to check, and returns True for a valid checkpoint and False otherwise.
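Example:
    A minimal sketch of a custom validation function (the module name, function name, and check are
    illustrative, not part of Composer)::

        # my_validation.py
        import os

        def validate_checkpoint(path, specs=None):
            # Treat any non-empty file as valid; a real implementation could also
            # verify the given (offset, length) specs.
            return os.path.getsize(path) > 0

        # In the training process, before checkpoints are loaded or saved:
        os.environ['CHECKPOINT_VALIDATION_FUNCTION'] = 'my_validation.validate_checkpoint'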
"""
name = os.environ.get('CHECKPOINT_VALIDATION_FUNCTION', None)
if name is None:
return None
splits = name.split('.')
module_name, fn_name = '.'.join(splits[:-1]), splits[-1]
module = import_module(module_name)
fn = getattr(module, fn_name)
log.debug(f'Checkpoint validation function {name} has been found.')
return fn
def _ensure_valid_checkpoint(checkpoint_filepath: Union[Path, str],
specs: Optional[list[tuple[int, int]]] = None) -> Union[Path, str]:
"""Ensures that the checkpoint at checkpoint_filepath is valid.
using the function specified by the CHECKPOINT_VALIDATION_FUNCTION environment variable.
If CHECKPOINT_VALIDATION_FUNCTION is not set, we skip validation.
Args:
checkpoint_filepath (Union[Path,str]): The path to the checkpoint file.
specs (Optional[list[tuple[int,int]]]): A list of offsets and lengths to check. Defaults to None.
Raises:
ValueError if checkpoint file is invalid.
"""
# Get the validation function by name.
validate = _get_checkpoint_validation_function()
# No function name has been specified.
if validate is None:
return checkpoint_filepath
# Validate the checkpoint.
if not validate(checkpoint_filepath, specs):
raise ValueError(f'Checkpoint at {checkpoint_filepath} is invalid.')
log.debug(f'Checkpoint at {checkpoint_filepath} is valid.')
return checkpoint_filepath
def _torch_load_with_validation(checkpoint_filepath: Union[Path, str], map_location: str) -> Any:
"""Validates and loads a torch checkpoint.
Args:
checkpoint_filepath (Union[Path,str]): The path to the checkpoint file.
map_location (str): The location to load the checkpoint to.
"""
return torch.load(_ensure_valid_checkpoint(checkpoint_filepath), map_location=map_location)
def _format_path_with_rank_zero(path: str) -> str:
"""Formats ``path`` with the rank zero values."""
return path.format(
rank=0,
local_rank=0,
node_rank=0,
)
def _get_write_mode(name: str) -> str:
"""Get the write mode to use with :func:`tarfile.open`."""
if name.endswith('.tar'):
return 'w'
if name.endswith('.tar.gz') or name.endswith('.tgz'):
return 'w:gz'
if name.endswith('.tar.bz2'):
return 'w:bz2'
if name.endswith('.tar.lzma'):
return 'w:xz'
raise ValueError(f'{name} does not end with a valid tarfile extension.')
def _is_rng_key(key: str, value: tuple) -> bool:
"""Check if the key is an RNG key.
We expect the RNG key to be of the form 'rng.{rank}.cuda|torch|python|numpy'.
This function ensures that we don't accidentally pick up other keys.
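For example (doctest-style illustration)::

    >>> _is_rng_key('rng.0.cuda', ('rng', 0, 'cuda'))
    True
    >>> _is_rng_key('state.model.weight', ('state', 0))
    False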
"""
starts_with_rng = key.startswith('rng')
ends_with_expected = key.endswith(('cuda', 'torch', 'python', 'numpy'))
three_parts = isinstance(value, tuple) and len(value) == 3
if starts_with_rng and ends_with_expected and three_parts:
return True
return False
def _get_num_ranks_that_saved_rng(metadata: Metadata):
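"""Return the number of distinct ranks whose RNG state is recorded in the checkpoint ``metadata``."""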
rng_inds = []
for field_name, field_value in metadata.planner_data.items():
if _is_rng_key(field_name, field_value):
_, rng_rank_index, _ = field_value
rng_inds.append(rng_rank_index)
rng_inds = set(rng_inds)
return len(rng_inds)
class FileSystemReaderWithValidation(dist_cp.FileSystemReader):
"""FileSystemReader that validates checkpoint files prior to reading."""
def __init__(self, path: str):
if _get_checkpoint_validation_function() is None:
log.info('No checkpoint validation function found when loading sharded checkpoints.')
super().__init__(path)
def read_data(self, plan: LoadPlan, planner: LoadPlanner):
"""Reads data file.
Raises:
ValueError if the data file is invalid.
"""
path_to_specs: dict[str, list[tuple[int, int]]] = {}
for read_item in plan.items:
item_md = self.storage_data[read_item.storage_index]
path = os.path.join(self.path, item_md.relative_path)
path_to_specs.setdefault(path, []).append((item_md.offset, item_md.length))
for path, spec in path_to_specs.items():
_ensure_valid_checkpoint(path, spec)
return super().read_data(plan, planner)
def read_metadata(self) -> Metadata:
"""Reads metadata file.
Raises:
ValueError if the metadata file is invalid.
"""
metadata_file_path = os.path.join(self.path, '.metadata')
_ensure_valid_checkpoint(metadata_file_path)
return super().read_metadata()
@retry(num_attempts=5)
def download_object_or_file(
object_name: str,
file_destination: Union[str, Path],
object_store: Union[ObjectStore, LoggerDestination],
):
if isinstance(object_store, ObjectStore):
object_store.download_object(
object_name=object_name,
filename=file_destination,
)
else:
object_store.download_file(
remote_file_name=object_name,
destination=str(file_destination),
)
# A subclass of FileSystemReaderWithValidation that downloads files from the object store before reading them from the local filesystem.
class DistCPObjectStoreReader(FileSystemReaderWithValidation):
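"""Reads sharded (``.distcp``) checkpoint files that live in an object store.

Illustrative usage sketch (the bucket/prefix and local path are assumptions, and this must run within an
initialized distributed environment, since the constructor coordinates downloads across ranks)::

    reader = DistCPObjectStoreReader(
        source_path='s3://my-bucket/checkpoints/ep1-ba42/',
        destination_path='/tmp/local-checkpoints',
    )
    state_dict = {'state': state.state_dict()}
    dist_cp_load(state_dict=state_dict, storage_reader=reader)
"""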
def __init__(
self,
source_path: str,
destination_path: str,
object_store: Optional[Union[ObjectStore, LoggerDestination]] = None,
device_mesh: Optional[DeviceMesh] = None,
):
if object_store is None:
if not is_uri(source_path):
raise ValueError('When object_store is None, source_path must be a URI.')
object_store = maybe_create_object_store_from_uri(source_path)
_, _, source_path = parse_uri(source_path)
self.source_path = source_path
self.destination_path = destination_path
self.object_store = object_store
self.device_mesh = device_mesh
# Download metadata file.
Path(self.destination_path).mkdir(parents=True, exist_ok=True)
metadata_destination = os.path.join(self.destination_path, '.metadata')
if dist.get_local_rank() == 0:
metadata_path = str(Path(source_path) / Path('.metadata'))
assert object_store is not None
download_object_or_file(metadata_path, metadata_destination, object_store)
dist.barrier()
# FileSystemReader takes a root directory in its constructor: the directory where the metadata
# is expected to be stored. This is also the parent directory for any shard-file relative paths
# specified in the metadata file.
super().__init__(destination_path)
def read_data(self, plan: LoadPlan, planner: LoadPlanner):
# Download files if not using HSDP or if on first replica with HSDP enabled
first_replica = True
if self.device_mesh is not None and self.device_mesh.mesh_dim_names is not None and ParallelismType.DATA_PARALLEL_REPLICATE.value in self.device_mesh.mesh_dim_names:
hsdp_index = self.device_mesh.mesh_dim_names.index(ParallelismType.DATA_PARALLEL_REPLICATE.value)
first_replica = self.device_mesh.get_local_rank(mesh_dim=hsdp_index) == 0
# 1. Collect the relative paths to download for all ranks for deduplication
relative_file_paths = set()
for plan_item in plan.items:
relative_file_paths.add(self.storage_data[plan_item.storage_index].relative_path)
all_file_paths = dist.all_gather_object(relative_file_paths)
# 2. Download to the destination all files this rank needs if on first replica
download_error = False
if first_replica:
log.debug(f'Rank {dist.get_global_rank()} starting to download files.')
# Get the lowest rank in the current node
local_rank_0 = dist.get_global_rank() - dist.get_local_rank()
try:
for plan_item in plan.items:
relative_file_path = self.storage_data[plan_item.storage_index].relative_path
# Check if the file is scheduled to be downloaded by a lower rank on the same node
# i.e., if rank 0 and rank 1 on the same node need the same file,
# only rank 0 should download it, not rank 1.
is_downloaded = any(
relative_file_path in all_file_paths[i] for i in range(local_rank_0, dist.get_global_rank())
)
# Download the shard file to the relative path it's associated to and save that relative path
# to the root directory specified to the FileSystem reader constructor.
file_destination = str(Path(self.destination_path) / Path(relative_file_path))
# The file could have already been downloaded, as different plan items can point to the same file.
if not is_downloaded and not os.path.exists(file_destination):
log.debug(f'Downloading {relative_file_path} to {file_destination}.')
object_name = str(Path(self.source_path) / Path(relative_file_path))
assert self.object_store is not None
download_object_or_file(object_name, file_destination, self.object_store)
log.debug(f'Finished downloading {relative_file_path} to {file_destination}.')
except Exception as e:
log.error(f'Exception {type(e)} raised during downloading: {str(e)}')
download_error = True
# PyTorch captures any exception raised inside this function,
# and calls dist.all_gather_object(exception) before re-raising it.
# If that all_gather_object call fails, the exception is never surfaced to the user,
# so we raise the exception from all ranks to ensure the user sees it.
download_error_tensor = dist.get_device(None).tensor_to_device(torch.tensor(1 if download_error else 0))
error_by_rank = dist.all_gather(download_error_tensor)
failed_ranks = []
for rank, error in enumerate(list(error_by_rank)):
if error > 0:
failed_ranks.append(rank)
download_error = True
if download_error:
raise RuntimeError(
f'Ranks {failed_ranks} failed to download.',
'To see the full error please look at the logs for that rank, which are logged via log.error.',
)
# 3. Wait for all ranks to finish.
log.debug(f'Rank {dist.get_global_rank()} finished downloading all files.')
dist.barrier()
log.debug('Done waiting for all ranks to finish downloading files.')
# 4. Broadcast files to all other replicas if HSDP
if self.device_mesh is not None and self.device_mesh.mesh_dim_names is not None and ParallelismType.DATA_PARALLEL_REPLICATE.value in self.device_mesh.mesh_dim_names:
# Broadcast file to all replicas
replicate_index = self.device_mesh.mesh_dim_names.index(ParallelismType.DATA_PARALLEL_REPLICATE.value)
shard_index = self.device_mesh.mesh_dim_names.index(ParallelismType.DATA_PARALLEL_SHARD.value)
replicate_process_group = self.device_mesh.get_group(replicate_index)
shard_process_group = self.device_mesh.get_group(shard_index)
shard_size = self.device_mesh.size(shard_index)
rank_in_first_replica = dist.get_global_rank() % shard_size
sender = dist.get_global_rank() == rank_in_first_replica
receiver = dist.get_global_rank() != rank_in_first_replica
# Send list of files to all ranks
file_list = [[
file_name for file_name in sorted(os.listdir(self.destination_path)) if file_name.endswith('.distcp')
]]
dist.broadcast_object_list(file_list, src=rank_in_first_replica, group=replicate_process_group)
file_list = file_list[0]
log.debug(f'list of files to broadcast: {file_list}')
# Send each file to the appropriate rank
for file_name in file_list:
if dist.get_local_rank() == 0 or (
dist.get_global_rank(shard_process_group) == 0 # pyright: ignore[reportGeneralTypeIssues]
): # Only 1 rank per node needs to transfer file
full_path = os.path.join(self.destination_path, file_name)
log.debug(f'Transferring {full_path=}')
file_object = [None]
if sender:
with open(full_path, 'rb') as f:
file_object = [{'content': f.read()}]
dist.broadcast_object_list(
file_object,
src=dist.get_global_rank() % shard_size,
group=replicate_process_group,
)
received_file_object = file_object[0]
assert received_file_object is not None
if receiver and not os.path.exists(full_path) and dist.get_local_rank() == 0:
with open(full_path, 'wb') as f:
f.write(received_file_object['content'])
log.debug(f'Rank {dist.get_global_rank()} finished transferring files to all ranks.')
dist.barrier()
log.debug(
f'Done waiting for all ranks to finish transferring files. Local checkpoint files: {sorted(os.listdir(self.destination_path))}',
)
# 5. Piggyback off of the FileSystemReader to read all the files now that they are downloaded.
return super().read_data(plan, planner)
class PartialFilePath:
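"""A checkpoint ``filename`` format string, optionally nested under a ``folder``, formatted against a :class:`~composer.core.State`.

Illustrative sketch (the run name and timestamp values are assumptions)::

    # With run_name='my-run' at epoch 1, batch 42, on rank 0:
    PartialFilePath('ep{epoch}-ba{batch}-rank{rank}', folder='ckpts/{run_name}').format(state)
    # -> 'ckpts/my-run/ep1-ba42-rank0'
"""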
def __init__(self, filename: str, folder: Optional[str] = None):
self.folder = folder
self.filename = filename
def format(self, state: State, keep_placeholders: bool = False) -> str:
if self.folder:
if keep_placeholders:
return os.path.join(
self.folder,
self.filename,
)
else:
return os.path.join(
format_name_with_dist(self.folder, state.run_name),
format_name_with_dist_and_time(self.filename, state.run_name, state.timestamp),
)
else:
if keep_placeholders:
return self.filename
else:
return format_name_with_dist_and_time(
self.filename,
state.run_name,
state.timestamp,
)
def load_checkpoint(
path: str,
state: State,
logger: Logger,
object_store: Optional[Union[ObjectStore, LoggerDestination]] = None,
load_weights_only: bool = False,
strict_model_weights: bool = True,
progress_bar: bool = True,
ignore_keys: Optional[Union[list[str], Callable[[dict], None]]] = None,
exclude_algorithms: Optional[list[str]] = None,
algorithm_passes: Optional[list[AlgorithmPass]] = None,
):
"""Load a checkpoint from a local file, URI, or cloud object store into ``state``.
Args:
path (str): The path format string to an existing checkpoint file.
It can be a path to a file on the local disk, a URL, or if ``object_store`` is set, the object name
for a checkpoint in a cloud bucket.
When using FSDP with sharded checkpointing, checkpoint files are sharded by rank, and ``load_path``
should be set to the directory containing sharded checkpoint files.
state (State): The :class:`~composer.core.State` to load the checkpoint into.
logger (Logger): The :class:`~composer.logger.Logger` to log any information.
object_store (Union[ObjectStore, LoggerDestination], optional): If the ``path`` is in an object store
(i.e. AWS S3 or Google Cloud Storage), an instance of
:class:`~.ObjectStore` or :class:`~.LoggerDestination` which will be used
to retrieve the checkpoint. Otherwise, if the checkpoint is a local filepath, set to ``None``.
(default: ``None``)
load_weights_only (bool, optional): Whether or not to only restore the model weights from the checkpoint without
restoring the associated state. (default: ``False``)
strict_model_weights (bool, optional): Whether or not to force that the checkpointed weights must exactly
match the model weights. (default: ``True``)
progress_bar (bool, optional): Whether or not to show a progress bar when downloading checkpoints.
Ignored if the checkpoint is a local file path. (default: ``True``)
ignore_keys (list[str] | (dict) -> None, optional): A list of paths for the ``state_dict`` of the checkpoint,
which, when provided, will be ignored from the state_dict before a checkpoint is loaded. Each path is a list
of strings specifying the keys to index into ``state_dict`` joined together with `/` as a separator (as PyTorch
uses `.` in parameter names). If a prefix is provided, all children are also ignored (see Example 2).
See :mod:`composer.core.state` for the structure of state_dict.
Example 1: ``ignore_keys = ["state/model/layer1.weights", "state/model/layer1.bias"]`` would ignore
layer 1 weights and bias.
Example 2: ``ignore_keys = ["state/model/*"]`` would ignore the entire model, which would have the same
effect as the previous example if there was only 1 layer.
Example 3: ``ignore_keys = ["state/model/layer*.weights"]`` would ignore all weights in the model.
Example 4: ``ignore_keys = ["state/rank_zero_seed", "rng"]`` would reset all randomness when
loading the checkpoint.
If a callable, it should take one argument which is the state_dict. The callable is free to arbitrarily modify
the state_dict before it is loaded.
(default: ``None``)
exclude_algorithms (list[str], optional): A list of algorithm names to exclude from loading.
By default, algorithms with `required_on_load=True` which were enabled when training the loaded
checkpoint are automatically applied unless they conflict with a user specified algorithm. These
algorithms often change the model, and not applying them could result in certain layers not having
weights loaded.
Example 1: ``exclude_algorithms = ["BlurPool"]`` would exclude BlurPool from loading.
Example 2: ``exclude_algorithms = ["FusedLayerNorm", "Alibi"]`` would exclude FusedLayerNorm and Alibi from loading.
(default: ``None``)
algorithm_passes (list[AlgorithmPass], optional): A list of algorithm passes to apply to autoloaded algorithms
to sort them into the correct order. (default: ``None``)
Returns:
Optional[list[dict[str, Any]]]: The RNG state dicts, indexed by global rank, if they were stored in the
checkpoint; otherwise, ``None``.
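Example:
    A minimal sketch of resuming from a local, monolithic checkpoint (the path and the already-constructed
    ``state`` and ``logger`` are assumptions)::

        rng_state_dicts = load_checkpoint(
            path='checkpoints/ep1-ba42-rank{rank}.pt',
            state=state,
            logger=logger,
        )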
"""
path = partial_format(path, run_name=state.run_name)
log.debug(f'Loading checkpoint from formatted path: {path}')
if state.fsdp_sharded_state_dict_enabled:
rng_state_dicts = load_sharded_checkpoint(
source_path=path,
state=state,
logger=logger,
object_store=object_store,
load_weights_only=load_weights_only,
strict_model_weights=strict_model_weights,
progress_bar=progress_bar,
ignore_keys=ignore_keys,
exclude_algorithms=exclude_algorithms,
algorithm_passes=algorithm_passes,
)
else:
# Download the checkpoint to the node-local folder
# Each node gets one unique folder to store checkpoints, shared amongst all local ranks on that node.
# If FSDP sharded state dicts are enabled, then EVERY rank gets a unique checkpoint folder.
tempdir_ctx = tempfile.TemporaryDirectory() if dist.get_local_rank() == 0 else contextlib.nullcontext(None)
with tempdir_ctx as tempdir:
try:
# Get the path to the proper checkpoint folder corresponding to the current rank's node.
# If fsdp_sharded_state_dict_enabled then just use that rank's unique tempdir.
node_checkpoint_folder = _get_local_rank_zero_path(tempdir)
composer_states_filepath, extracted_checkpoint_folder, extracted_rank_n = download_checkpoint(
path=path,
node_checkpoint_folder=node_checkpoint_folder,
object_store=object_store,
progress_bar=progress_bar,
)
rng_state_dicts = _restore_checkpoint(
state,
logger,
composer_states_filepath,
extracted_rank_n,
extracted_checkpoint_folder,
load_weights_only=load_weights_only,
strict_model_weights=strict_model_weights,
ignore_keys=ignore_keys,
exclude_algorithms=exclude_algorithms,
algorithm_passes=algorithm_passes,
)
finally:
# Wait for all ranks to finish restoring the checkpoint before releasing the tempdir, since tempdir can
# be a shared resource between nodes.
dist.barrier()
log.info('%s loaded from %s', 'Model weights' if load_weights_only else 'Trainer checkpoint', path)
# Verify all ranks resumed on same step
step_to_resume_from = state.timestamp.batch.value
max_step_to_resume_from = state.device.tensor_to_device(
torch.tensor(state.timestamp.batch.value, dtype=torch.int64),
)
min_step_to_resume_from = state.device.tensor_to_device(
torch.tensor(state.timestamp.batch.value, dtype=torch.int64),
)
dist.all_reduce(max_step_to_resume_from, reduce_operation='MAX')
dist.all_reduce(min_step_to_resume_from, reduce_operation='MIN')
if max_step_to_resume_from.data != min_step_to_resume_from.data:
raise RuntimeError(
textwrap.dedent(
f'Timestamp mismatch error: batch to resume from {step_to_resume_from} is not the same on all ranks. '
'This usually occurs when at least one rank fails to save the last checkpoint '
'while using sharded checkpointing + autoresume. '
'Please manually resume by disabling autoresume and explicitly setting load_path '
'to the most recent checkpoints that all ranks have saved. '
'E.g. for the 10th batch: trainer = Trainer(autoresume=False, load_path="/path/to/checkpoint/ba10-rank{rank}.pt", ...). '
'Remember to keep the {rank} placeholder!',
),
)
return rng_state_dicts
def dist_cp_load(
state_dict: dict[str, Any],
storage_reader: StorageReader,
load_planner: Optional[LoadPlanner] = None,
):
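"""Load ``state_dict`` via ``torch.distributed.checkpoint``, handling torch version differences.

On torch 2.4.x this uses ``dist_cp.load`` and, for checkpoints saved before torch 2.4, falls back to a
backward-compatible state-dict traversal; on other torch versions it uses ``dist_cp.load_state_dict``.
"""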
if (
version.parse(torch.__version__) >= version.parse('2.4.0') and
version.parse(torch.__version__) < version.parse('2.5.0')
):
from torch.distributed.checkpoint.utils import CheckpointException
try:
dist_cp.load(
state_dict=state_dict,
storage_reader=storage_reader,
planner=load_planner,
)
except CheckpointException as e:
checkpoint_metadata = storage_reader.read_metadata().state_dict_metadata
if 'state.metadata' in checkpoint_metadata and 'state.metadata.composer_env_info.composer_version' not in checkpoint_metadata:
# Torch 2.4 changed how the state dict is flattened, which broke backward compatibility.
# Torch issue: https://github.com/pytorch/pytorch/issues/133923.
# We override traverse_state_dict so that the load planner uses the old way of
# flattening the state dict.
log.debug('Trying to load a checkpoint saved before torch 2.4')
import torch.distributed.checkpoint._nested_dict as nested_dict
import torch.distributed.checkpoint._sharded_tensor_utils as sharded_tensor_util
from torch.distributed.checkpoint._traverse import traverse_state_dict as traverse_2_4_0
from composer.trainer._patch_pytorch import traverse_state_dict as backward_compatible_traverse
nested_dict.traverse_state_dict = backward_compatible_traverse
sharded_tensor_util.traverse_state_dict = backward_compatible_traverse
dist_cp.load(
state_dict=state_dict,
storage_reader=storage_reader,
planner=load_planner,
)
# Revert the override
nested_dict.traverse_state_dict = traverse_2_4_0
sharded_tensor_util.traverse_state_dict = traverse_2_4_0
else:
raise e
else:
dist_cp.load_state_dict(
state_dict=state_dict,
storage_reader=storage_reader,
planner=load_planner,
no_dist=(not dist.is_initialized()),
)
def load_sharded_checkpoint(
source_path: str,
state: State,
logger: Logger,
object_store: Optional[Union[ObjectStore, LoggerDestination]] = None,
load_weights_only: bool = False,
strict_model_weights: bool = False,
progress_bar: bool = True,
ignore_keys: Optional[Union[list[str], Callable[[dict], None]]] = None,
exclude_algorithms: Optional[list[str]] = None,
algorithm_passes: Optional[list[AlgorithmPass]] = None,
) -> Union[list[dict], None]:
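"""Load a sharded (FSDP) checkpoint from ``source_path`` into ``state``.

The arguments mirror :func:`load_checkpoint`, which dispatches here when sharded state dicts are enabled,
and the RNG state dicts stored in the checkpoint (if any) are returned.

Illustrative sketch (the local checkpoint directory and the already-constructed, FSDP-enabled ``state``
and ``logger`` are assumptions)::

    rng_state_dicts = load_sharded_checkpoint(
        source_path='/path/to/checkpoints/ep1-ba42',  # directory of .distcp shard files
        state=state,
        logger=logger,
    )

For checkpoints stored in an object store, also pass ``object_store``.
"""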
using_multinode = dist.get_world_size() != dist.get_local_world_size()
if not version.parse(torch.__version__) >= version.parse('2.0.1') and using_multinode:
raise ValueError(
f'Sharded checkpoint loading on >1 node requires torch version >= 2.0.1. You have torch version {torch.__version__}',
)
if state.fsdp_config is None:
raise ValueError('Loading a sharded checkpoint requires passing an FSDP config to Trainer.')
# Check to make sure source_path is a directory.
if object_store is None:
if os.path.islink(source_path):
source_path = os.path.join(os.path.dirname(source_path), os.readlink(source_path))
if os.path.exists(source_path):
if not os.path.isdir(source_path):
raise ValueError(f'load_path must be a directory when using sharded state dict. Got {source_path}')
else:
raise FileNotFoundError(f'{source_path} not found!')
download_dir_context = tempfile.TemporaryDirectory if object_store is not None else contextlib.nullcontext
with download_dir_context() as temp_download_dir:
if object_store is not None:
# Get the tempfile made on local rank 0.
local_rank0_index = dist.get_global_rank() - dist.get_local_rank()
rank0_download_tempdir = str(dist.all_gather_object(temp_download_dir)[local_rank0_index])
if source_path.endswith('.symlink'):
source_path = extract_path_from_symlink(source_path, object_store=object_store)
storage_reader = DistCPObjectStoreReader(
source_path=source_path,
destination_path=str(Path(rank0_download_tempdir) / Path('checkpoints')),
object_store=object_store,
device_mesh=state.device_mesh,
)
else:
storage_reader = FileSystemReaderWithValidation(source_path)
# We need no_grad because we overwrite tensor values with set_() when we do elastic loading and we don't want the set_ op recorded in the computation graph.
with torch.no_grad():
# 1. Load metadata first for backwards compatibility check
# We need to check if the "optimizers" is at the root of the state dict to determine
# how to load the optimizer state.
try:
metadata = storage_reader.read_metadata()
except AttributeError as e:
if '_MEM_FORMAT_ENCODING' in str(e):
raise ValueError(
'Unable to read checkpoint metadata. The checkpoint was likely saved with a '
'newer version of torch. Upgrade your torch version to load this checkpoint.',
)
else:
raise
# Retrieve all top-level keys of the metadata.
top_level_keys = [v[0] for v in metadata.planner_data.values()]
optimizers_at_root = 'optimizers' in top_level_keys
# 2. Load model and metadata
if load_weights_only:
state_dict: dict[str, Any] = {'state': {'model': state.get_model_state_dict()}}
else:
cur_state_dict = state.state_dict()
# If 'optimizers' is at root-level, we load it separately.
if optimizers_at_root:
cur_state_dict.pop('optimizers')
num_rng_ranks = _get_num_ranks_that_saved_rng(storage_reader.read_metadata())
state_dict: dict[str, Any] = {
'state': cur_state_dict,
'rng': reproducibility.get_rng_state()[:num_rng_ranks],
}
if ignore_keys:
# Filter provided list of key paths
if not callable(ignore_keys):
ignore_keys = glob_filter(ignore_keys)
# Call function to modify state_dict
ignore_keys(state_dict)
# Ensure state exists
state_dict['state'] = state_dict.get('state', {})
dist_cp_load(
state_dict=state_dict,
storage_reader=storage_reader,
load_planner=state.fsdp_config.load_planner,
)
log.info('Loaded state dict')
state.load_state_dict(
state_dict['state'],
logger,
strict=strict_model_weights,
exclude_algorithms=exclude_algorithms,
algorithm_passes=algorithm_passes,
)
# 3. Optionally load optimizer
# If 'optimizers' was not at root-level, then it will already be loaded
if optimizers_at_root and not load_weights_only:
optim_state = load_sharded_optimizer_state_dict(
model_state_dict=state.state_dict()['model'],
optimizer_key='optimizers',
storage_reader=storage_reader,
)
state._legacy_load_optim_state(optim_state)
return state_dict.get('rng', None)
def _get_local_rank_zero_path(path: Optional[str]) -> str:
"""Broadcasts the ``path`` from the LOCAL rank zero to all LOCAL ranks."""
local_rank_zero = dist.get_global_rank() - dist.get_local_rank()
paths = dist.all_gather_object(path)
local_rank_zero_path = paths[local_rank_zero]
assert local_rank_zero_path is not None, 'local rank zero provides the path'
return local_rank_zero_path
def download_checkpoint(
path: str,
node_checkpoint_folder: str,
object_store: Optional[Union[ObjectStore, LoggerDestination]],
progress_bar: bool,
) -> tuple[str, Optional[str], bool]:
"""Download the checkpoint stored at ``path``, potentially in ``object_store``, to ``node_checkpoint_folder``.
Returns a tuple of (``composer_states_filepath``, ``extracted_checkpoint_folder``, ``extracted_rank_n``).
* The ``composer_states_filepath`` is the path to the composer states, which can be passed into
:meth:`torch.load`.
* The ``extracted_checkpoint_folder`` is the path to the checkpoint folder.
* The ``extracted_rank_n`` is a boolean flag indicating whether a tarball was extracted on global
rank greater than 0.
"""
log.debug('Downloading checkpoint to folder %s', node_checkpoint_folder)
rank_zero_checkpoint_filepath = os.path.join(node_checkpoint_folder, 'rank0_checkpoint')
extracted_checkpoint_folder = None
extracted_rank_n = False
if is_tar(path):
extracted_checkpoint_folder = os.path.join(node_checkpoint_folder, 'checkpoint')
composer_states_filepath = os.path.join(extracted_checkpoint_folder, _COMPOSER_STATES_FILENAME)
else:
# It's not an archive; it's just the composer state dict,
# and only rank zero has this file unless fsdp_sharded_state_dict_enabled is set,
# in which case every rank has its own file.
extracted_checkpoint_folder = None
composer_states_filepath = rank_zero_checkpoint_filepath
if is_compressed_pt(path):
original_path = path
path = os.path.splitext(path)[0]
compressor = get_compressor(original_path)
with open(path, 'wb') as out_file:
with compressor.decompress(original_path) as in_file:
shutil.copyfileobj(in_file, out_file)
try:
if dist.get_local_rank() == 0:
# If the checkpoint is not sharded, then local rank 0 on each node needs to download the
# global rank 0 checkpoint
path = _format_path_with_rank_zero(path)
get_file(
destination=rank_zero_checkpoint_filepath,
path=path,
object_store=object_store,
progress_bar=progress_bar,
)
if extracted_checkpoint_folder is not None:
try:
with tarfile.open(rank_zero_checkpoint_filepath) as tarball:
tarball.extractall(extracted_checkpoint_folder)
except FileNotFoundError:
# Not re-raising the file-not-found error as that is irrelevant;
# the underlying issue is that the checkpoint file does not exist on the disk
# or could not be downloaded
raise RuntimeError(f'Checkpoint {path} does not exist')
finally:
# Use busy wait to avoid timeouts on large downloads for non-sharded checkpoints
signal_file_path = os.path.join(
node_checkpoint_folder,
dist.get_node_signal_file_name(),
)
if dist.get_local_rank() == 0:
with open(signal_file_path, 'wb') as f:
f.write(b'local_rank0_completed')
# Avoid the collective call until the local rank zero has finished trying to download the
# checkpoint so that we don't timeout for large downloads. This syncs all processes on the
# node
with dist.local_rank_zero_download_and_wait(signal_file_path):
# Then, wait to ensure every node has finished downloading the checkpoint
dist.barrier()
if dist.get_local_rank() == 0:
os.remove(signal_file_path)
dist.barrier()
return composer_states_filepath, extracted_checkpoint_folder, extracted_rank_n
def _flatten_keys(obj: Any, paths: list[str], existing_path: str):
"""Recursively flatten the keys of a dictionary or list into a set of paths."""
# Store path when we reach end, which is either non-dict or empty dict
if isinstance(obj, list) and len(obj) > 0:
for i, elm in enumerate(obj):
_flatten_keys(elm, paths, f'{existing_path}/{i}')
elif isinstance(obj, dict) and len(obj) > 0:
for k, v in obj.items():
_flatten_keys(v, paths, f'{existing_path}/{k}')
# Remove leading /
paths.append(existing_path.lstrip('/'))
def _remove_paths(obj: Union[list, dict[str, Any]], exclude_paths: list[list[str]]):
# Build str(key) to key map to undo cast from glob filtering. Despite typing, some state_dict
# keys are not strings, so we need to cast them back to their original type.
str_key_to_key = {}
if isinstance(obj, dict):
for key in obj.keys():
str_key_to_key[str(key)] = key
# First determine the keys which will be recursed on and which will be removed entirely
# Group the `exclude_paths` by the key
keys_to_recurse = {}
keys_to_remove = []
for exclude_path_parts in exclude_paths:
key = exclude_path_parts[0]
# Cast list indices to int
if isinstance(obj, list):
key = int(key)
# Un-str dict keys if necessary
if key in str_key_to_key:
key = str_key_to_key[key]
if len(exclude_path_parts) == 1:
keys_to_remove.append(key)
else:
if key not in keys_to_recurse:
keys_to_recurse[key] = []
keys_to_recurse[key].append(exclude_path_parts[1:])
# Recurse first, so in the case of a list, the indexing is consistent
for key, paths_to_recurse in keys_to_recurse.items():
_remove_paths(obj[key], paths_to_recurse)
# Sort the keys in reverse order, so in the case of a list, the indexing is consistent
keys_to_remove.sort(reverse=True)
# Remove the keys
for key in keys_to_remove:
del obj[key]
def glob_filter(exclude_globs: list[str]) -> Callable[[dict], None]:
"""Provides a function which deletes all subparts of a dictionary based on a list of paths."""
def filter_func(state_dict: dict) -> None:
# Flatten dictionary into paths
paths = []
_flatten_keys(state_dict, paths, '/')
filtered_paths = []
for exclude_glob in exclude_globs:
filtered_paths_from_glob = fnmatch.filter(paths, exclude_glob)
if len(filtered_paths_from_glob) == 0:
warnings.warn(
f'No parts from loaded checkpoint state_dict were ignored by load_ignore_keys {exclude_glob}',
)
filtered_paths.extend(filtered_paths_from_glob)
filtered_paths = list(set(filtered_paths))
if filtered_paths:
filtered_paths_str = ', '.join(filtered_paths)
log.info(f'Ignoring the following paths from the loaded checkpoint state_dict: {filtered_paths_str}')
# Loop through all paths to exclude
paths_to_remove = [path.split('/') for path in filtered_paths if len(path) > 0]
_remove_paths(state_dict, paths_to_remove)
return filter_func
def safe_torch_load(
composer_states_filepath: Union[Path, str],
map_location: str = 'cpu',
load_monolith_rank0_only: bool = False,
) -> dict[str, Any]:
"""Load a torch checkpoint, catching errors due to backwards compatibility issues.
Args:
composer_states_filepath: The path to the checkpoint file.
map_location: The location to load the checkpoint to.
load_monolith_rank0_only: Whether the checkpoint is a monolith FSDP checkpoint.
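Returns:
    The loaded checkpoint state dict.

Example (a minimal sketch; the checkpoint filename is an assumption)::

    state_dict = safe_torch_load('ep1-ba42-rank0.pt')
    model_state = state_dict['state']['model']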
"""
try:
if load_monolith_rank0_only:
log.info(
'Loading monolith FSDP checkpoint. Only rank 0 will load and broadcast non-weight/optimizer state.',
)
state_dict_list = [None]
model = None
optimizer = None
if dist.get_global_rank() == 0:
state_dict_list[0] = _torch_load_with_validation(composer_states_filepath, map_location=map_location)
# Don't broadcast model/optimizer state if they exist
if 'model' in state_dict_list[0]['state']:
model = state_dict_list[0]['state']['model']
state_dict_list[0]['state']['model'] = None
if 'optimizers' in state_dict_list[0]['state']:
optimizer = state_dict_list[0]['state']['optimizers']
state_dict_list[0]['state']['optimizers'] = None
log.debug('Broadcasting state_dict to all ranks.')
dist.broadcast_object_list(state_dict_list, src=0)
state_dict: dict[str, Any] = state_dict_list[0] # type: ignore
if dist.get_global_rank() == 0:
if model is not None:
state_dict['state']['model'] = model
if optimizer is not None:
state_dict['state']['optimizers'] = optimizer
return state_dict
else:
return _torch_load_with_validation(composer_states_filepath, map_location=map_location)
except TypeError as e:
if 'Accuracy.__new__() missing 1 required positional argument' in str(e):
raise Exception(
'As of v0.10.0, torchmetrics introduces a new required argument to Accuracy which '
'breaks backwards compatibility. Unfortunately, this means that older checkpoints '
'cannot be loaded with the metrics. In order to successfully load this model, please '
'pass `load_ignore_keys = ["state/train_metrics/*", "state/eval_metrics/*"]`.',
) from e
raise e
except FileNotFoundError as e:
if 'No such file or directory' in str(e) and dist.get_local_rank() != 0:
local_rank_zero = dist.get_global_rank() - dist.get_local_rank()
raise FileNotFoundError(
f'No such file or directory: {e.filename}. '
f'This likely implies a download failed on local rank 0, which is global rank {local_rank_zero}. '
f'Please check the logs for global rank {local_rank_zero} to debug the checkpoint download issue.',
) from e
raise e
def _restore_checkpoint(
state: State,
logger: Logger,
composer_states_filepath: str,
extracted_rank_n: bool,
extracted_checkpoint_folder: Optional[str],
load_weights_only: bool,
strict_model_weights: bool,
ignore_keys: Optional[Union[list[str], Callable[[dict], None]]],
exclude_algorithms: Optional[list[str]],
algorithm_passes: Optional[list[AlgorithmPass]],
) -> Optional[list[dict[str, Any]]]:
"""Restore a checkpoint into ``state`` and returns the rng state dicts (if ``load_weights_only`` is False)."""
# Now, all ranks load the checkpoint that local rank zero downloaded
state_dict = safe_torch_load(
composer_states_filepath=composer_states_filepath,
load_monolith_rank0_only=state.load_monolith_rank0_only,
)
if ignore_keys:
# Filter provided list of key paths
if not callable(ignore_keys):
ignore_keys = glob_filter(ignore_keys)
# Call function to modify state_dict
ignore_keys(state_dict)
# Ensure state exists
state_dict['state'] = state_dict.get('state', {})
log.debug(f"Loaded checkpoint with keys {state_dict.keys()} and state keys {state_dict['state'].keys()}")
if load_weights_only:
state.load_model_state(
state_dict['state'],
logger,
strict=strict_model_weights,
exclude_algorithms=exclude_algorithms,
algorithm_passes=algorithm_passes,
)
else:
state.load_state_dict(
state_dict['state'],
logger,
exclude_algorithms=exclude_algorithms,
algorithm_passes=algorithm_passes,
)
return state_dict.get('rng', None)
def get_save_filename(
state: State,
filename: str = 'ep{epoch}-ba{batch}-rank{rank}',
) -> str:
"""Gets full filename of save filename.
Args:
state (State): The :class:`~composer.core.State` to load the checkpoint into.
filename (filename): The name of the save file.
Returns:
Full filename of save file.
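For example, with the default ``filename`` at epoch 1, batch 42, this returns ``'ep1-ba42-rank0'`` on rank 0
for a non-sharded checkpoint; for sharded checkpoints, the per-rank shard filename (e.g. ``__0_0.distcp``)
is placed under the formatted ``sharded_ckpt_prefix_dir`` instead.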
"""
if not state.fsdp_sharded_state_dict_enabled:
return PartialFilePath(filename).format(state)
# Sharded checkpoints get their own little folder.
assert state.fsdp_config is not None
remote_prefix = state.fsdp_config.sharded_ckpt_prefix_dir
assert remote_prefix is not None
save_dirpath = Path(Path(filename).parent) / Path(remote_prefix)
save_dirpath = format_name_with_dist_and_time(str(save_dirpath), state.run_name, state.timestamp)
# New name is now Trainer.save_folder / sharded_ckpt_prefix_dir / __{dist.get_global_rank()}_0.distcp
# e.g. path/to/my/checkpoints/ep1-ba2/__1_0.distcp
ckpt_filename = _TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME
return str(Path(save_dirpath) / Path(ckpt_filename))
def _save_checkpoint(
state: State,
save_filename: str,
*,
weights_only: bool = False,
ignore_keys: Optional[Union[list[str], Callable[[dict], None]]] = None,
) -> Union[str, None]: # noqa: D103
if weights_only:
state_dict = {
'state': {
'model': state.get_model_state_dict(),
'integrations': state._get_integrations_state_dict(),
'metadata': state._get_state_metadata(),
},
}
else:
state_dict = {
'state': state.state_dict(),
'rng': reproducibility.get_rng_state(),
}
if ignore_keys:
# Filter provided list of key paths
if not callable(ignore_keys):
ignore_keys = glob_filter(ignore_keys)
# Call function to modify state_dict
ignore_keys(state_dict)
# Ensure state exists
state_dict['state'] = state_dict.get('state', {})
if state.fsdp_sharded_state_dict_enabled and not weights_only:
# Only rank 0 saves RNG
if dist.get_global_rank() > 0:
state_dict.pop('rng')
# To load optimizer states with 2.0 <= torch < 2.2.3, the optimizer state must be at the top
# level of the state dict because the load_sharded_optimizer_state_dict function
# requires a top level state dict key for the optimizer.
# See https://github.com/pytorch/pytorch/blob/v2.0.1/torch/distributed/checkpoint/optimizer.py#L271
# for more info.
if version.parse(torch.__version__) < version.parse('2.2.3'):
state_dict['optimizers'] = state_dict['state'].pop('optimizers')
log.debug('State dict created.')
dirname = os.path.dirname(save_filename)
if dirname:
os.makedirs(dirname, exist_ok=True)
# Only some ranks are meant to save checkpoint and produce a file
expect_file = False
# Save sharded checkpoint
if state.fsdp_sharded_state_dict_enabled:
if state.fsdp_config is None:
raise ValueError('Saving a sharded checkpoint requires passing an FSDP config to Trainer.')
log.debug(f'Saving sharded checkpoints to {save_filename}...')
process_group = None
device_mesh = state.device_mesh
if device_mesh is not None and device_mesh.mesh_dim_names is not None and ParallelismType.DATA_PARALLEL_REPLICATE.value in device_mesh.mesh_dim_names:
# If hybrid shard, only rank in first replica saves
hsdp_index = device_mesh.mesh_dim_names.index(ParallelismType.DATA_PARALLEL_REPLICATE.value)
expect_file = device_mesh.get_local_rank(mesh_dim=hsdp_index) == 0
if expect_file:
process_group = device_mesh.get_group(1) # Shard process_group for first replica
assert isinstance(process_group, ProcessGroup) # For type checker
log.debug(f'Saving on global_rank={dist.get_global_rank()}, {expect_file=}')
else:
expect_file = True
if expect_file:
if version.parse(torch.__version__) >= version.parse('2.3.0'):
save_planner = state.fsdp_config.save_planner
if save_planner is None:
if version.parse(torch.__version__) < version.parse('2.4.0'):
# Dedup is only broken on <2.4
from composer.trainer._patch_pytorch import SavePlannerWithDedupFix
save_planner = SavePlannerWithDedupFix()
else:
from torch.distributed.checkpoint.default_planner import DefaultSavePlanner
save_planner = DefaultSavePlanner(dedup_save_to_lowest_rank=True)
dist_cp.save(
state_dict=state_dict,
storage_writer=dist_cp.FileSystemWriter(dirname),
planner=save_planner,
process_group=process_group,
)
else:
dist_cp.save_state_dict(
state_dict=state_dict,
storage_writer=dist_cp.FileSystemWriter(dirname),
planner=state.fsdp_config.save_planner,
process_group=process_group,
)
log.debug('Finished pytorch save state dict')
# Save monolith checkpoint
elif dist.get_global_rank() == 0:
expect_file = True
log.debug(f'Saving monolithic checkpoint to {save_filename}')
_write_checkpoint_file(state_dict, save_filename)
log.debug(f'Global rank 0 done saving checkpoint to disk at {save_filename}.')
else:
log.debug(f'Only rank 0 is saving a checkpoint, so rank {dist.get_global_rank()} skips checkpointing.')
dist.barrier() # Ensure all ranks saved their files
if expect_file:
assert os.path.exists(save_filename), 'Expected file to have been saved.'
return save_filename
else:
# no file saved
return None
def _write_checkpoint_file(state_dict: dict[str, Any], filename: str) -> None:
"""Write the given checkpoint state to the given path. Compressing if indicated to do so by the file extension."""
if is_tar(filename):
log.debug('Writing checkpoint tar file %s', filename)
write_mode = _get_write_mode(filename)
with tempfile.TemporaryDirectory(prefix='checkpoint') as tmpdir:
with open(os.path.join(tmpdir, _COMPOSER_STATES_FILENAME), 'wb') as f:
torch.save(state_dict, f)
with tarfile.open(filename, write_mode) as tarball:
tarball.add(tmpdir, arcname='')
elif is_compressed_pt(filename):
log.debug('Writing compressed checkpoint %s', filename)
compressor = get_compressor(filename)
with compressor.compress(filename) as f:
torch.save(state_dict, f)
else:
log.debug('Writing uncompressed checkpoint %s', filename)
with open(filename, 'wb') as f:
torch.save(state_dict, f)
def save_checkpoint(
state: State,
filename: str = 'ep{epoch}-ba{batch}-rank{rank}',
*,
weights_only: bool = False,
ignore_keys: Optional[Union[list[str], Callable[[dict], None]]] = None,
) -> Union[str, None]: # noqa: D103
# Clear the cache in case we are near the memory limit to give some space for NCCL.
torch.cuda.empty_cache()
save_filename = get_save_filename(state, filename)
return _save_checkpoint(state, save_filename, weights_only=weights_only, ignore_keys=ignore_keys)
save_checkpoint.__doc__ = f"""Checkpoint the training ``state``.
Args:
state (State): The training state.
filename (str): A format string describing how to name checkpoints.
(default: ``'ep{{epoch}}-ba{{batch}}-rank{{rank}}'``)
The following format variables are available:
{textwrap.indent(FORMAT_NAME_WITH_DIST_AND_TIME_TABLE, prefix=' ')}
.. note::
* By default, only the rank zero process will save a checkpoint file.
* To write to compressed tar files, set the file extension to ``'.tar.gz'``, ``'.tgz'``,
``'.tar.bz2'``, or ``'.tar.lzma'`` (depending on the desired compression algorithm).
* To write to compressed pt files, set the file extension to ``'.pt.bz2'``, ``'.pt.gz'``,
``'.pt.lz4'``, ``'.pt.lzma'``, ``'.pt.lzo'``, ``'.pt.xz'``, ``'.pt.zst'`` (depending on the
desired algorithm). You must have the corresponding CLI tool installed. ``lz4`` is a good
choice for a modest space saving while being very fast to compress.
.. warning::
Using compression will block the training loop while checkpoints are being compressed and the
compressibility of checkpoints can vary significantly depending on your setup. As such, we
recommend saving checkpoints without compression by default.
If you have the ``lz4`` command available on your system, you may want to try saving as ``.pt.lz4``
as the overhead is minimal (usually less than a second) and the saved space can sometimes
be significant (1% - 40%).
Consider the following scenario, where:
* The default ``name='ep{{epoch}}-ba{{batch}}-rank{{rank}}'`` is used.
* The current epoch count is ``1``.
* The current batch count is ``42``.
The rank zero process will save the checkpoint to ``'ep1-ba42-rank0'``.
weights_only (bool, optional): If ``True``, save only the model weights instead of the entire training state.
(default: ``False``)
Returns:
Optional[str]: The path to the checkpoint file saved by this process, or ``None`` if this process did not save a file.
.. note::
When doing multi-node training, the filepaths are valid only on each process's node;
Composer does not move checkpoint files between nodes.
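Example:
    A minimal sketch (assumes an already-constructed training ``state``)::

        checkpoint_path = save_checkpoint(state, filename='ep{{epoch}}-ba{{batch}}-rank{{rank}}.pt')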
"""