Source code for composer.algorithms.stochastic_depth.stochastic_depth

# Copyright 2021 MosaicML. All Rights Reserved.

from __future__ import annotations

import functools
import logging
from typing import Optional, Sequence, Type, Union

import torch
from torch.optim import Optimizer
from torchvision.models.resnet import Bottleneck

from composer.algorithms.stochastic_depth.sample_stochastic_layers import SampleStochasticBottleneck
from composer.algorithms.stochastic_depth.stochastic_layers import StochasticBottleneck
from composer.core import Algorithm, Event, State
from composer.core.time import Time, TimeUnit
from composer.loggers import Logger
from composer.utils import module_surgery

log = logging.getLogger(__name__)

_VALID_LAYER_DISTRIBUTIONS = ("uniform", "linear")

_STOCHASTIC_LAYER_MAPPING = {
    'block': {
        'ResNetBottleneck': (Bottleneck, StochasticBottleneck)
    },
    'sample': {
        'ResNetBottleneck': (Bottleneck, SampleStochasticBottleneck)
    }
}
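

# A minimal lookup sketch: the nested mapping above is keyed first by stochastic
# method, then by target layer name, and yields the module class to be replaced
# together with its stochastic replacement class.
def _example_mapping_lookup():
    target_layer, stochastic_layer = _STOCHASTIC_LAYER_MAPPING['block']['ResNetBottleneck']
    assert target_layer is Bottleneck
    assert stochastic_layer is StochasticBottleneck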


def apply_stochastic_depth(model: torch.nn.Module,
                           target_layer_name: str,
                           stochastic_method: str = 'block',
                           drop_rate: float = 0.2,
                           drop_distribution: str = 'linear',
                           use_same_gpu_seed: bool = True,
                           optimizers: Optional[Union[Optimizer, Sequence[Optimizer]]] = None) -> torch.nn.Module:
    """Applies Stochastic Depth (`Huang et al, 2016 <https://arxiv.org/abs/1603.09382>`_) to the specified model.

    The algorithm replaces the specified target layer with a stochastic version of the layer. The stochastic layer
    will randomly drop either samples or the layer itself depending on the stochastic method specified. The
    block-wise version follows the original paper. The sample-wise version follows the implementation used for
    EfficientNet in the `Tensorflow/TPU repo <https://github.com/tensorflow/tpu>`_.

    .. note::

        Stochastic Depth only works on instances of :class:`torchvision.models.resnet.ResNet` for now.

    Args:
        model (torch.nn.Module): model containing modules to be replaced with stochastic versions.
        target_layer_name (str): Block to replace with a stochastic block equivalent. The name must be registered
            in the ``_STOCHASTIC_LAYER_MAPPING`` dictionary with the target layer class and the stochastic layer
            class. Currently, only :class:`torchvision.models.resnet.Bottleneck` is supported.
        stochastic_method (str, optional): The version of stochastic depth to use. ``"block"`` randomly drops
            blocks during training. ``"sample"`` randomly drops samples within a block during training.
            Default: ``"block"``.
        drop_rate (float, optional): The base probability of dropping a layer or sample. Must be between 0.0 and
            1.0. Default: ``0.2``.
        drop_distribution (str, optional): How ``drop_rate`` is distributed across layers. Value must be one of
            ``"uniform"`` or ``"linear"``. ``"uniform"`` assigns the same ``drop_rate`` across all layers.
            ``"linear"`` linearly increases the drop rate across layer depth, starting with 0 drop rate and ending
            with ``drop_rate``. Default: ``"linear"``.
        use_same_gpu_seed (bool, optional): Set to ``True`` to have the same layers dropped across GPUs when using
            multi-GPU training. Set to ``False`` to have each GPU drop a different set of layers. Only used with
            the ``"block"`` stochastic method. Default: ``True``.
        optimizers (torch.optim.Optimizer | Sequence[torch.optim.Optimizer], optional): Existing optimizers bound
            to ``model.parameters()``. All optimizers that have already been constructed with ``model.parameters()``
            must be specified here so that they optimize the correct parameters. If the optimizer(s) are constructed
            *after* calling this function, they will automatically see the new model parameters, and it is safe to
            omit this argument.

    Returns:
        The modified model.

    Example:
        .. testcode::

            import composer.functional as cf
            from torchvision import models

            model = models.resnet50()
            cf.apply_stochastic_depth(model, target_layer_name='ResNetBottleneck')
    """
    _validate_stochastic_hparams(target_layer_name=target_layer_name,
                                 stochastic_method=stochastic_method,
                                 drop_rate=drop_rate,
                                 drop_distribution=drop_distribution)
    transforms = {}
    target_layer, stochastic_layer = _STOCHASTIC_LAYER_MAPPING[stochastic_method][target_layer_name]
    module_count = module_surgery.count_module_instances(model, target_layer)
    shared_kwargs = {'drop_rate': drop_rate, 'drop_distribution': drop_distribution, 'module_count': module_count}
    if stochastic_method == 'block':
        rand_generator = torch.Generator()  # Random number generator for each layer
        stochastic_from_target_layer = functools.partial(stochastic_layer.from_target_layer,
                                                         **shared_kwargs,
                                                         use_same_gpu_seed=use_same_gpu_seed,
                                                         rand_generator=rand_generator)
    elif stochastic_method == 'sample':
        stochastic_from_target_layer = functools.partial(stochastic_layer.from_target_layer, **shared_kwargs)
    else:
        raise ValueError(f"stochastic_method {stochastic_method} is not supported."
                         f" Must be one of {list(_STOCHASTIC_LAYER_MAPPING.keys())}")
    transforms[target_layer] = stochastic_from_target_layer
    module_surgery.replace_module_classes(model, optimizers=optimizers, policies=transforms)
    return model
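

# A usage sketch for the functional form (assuming torchvision is installed):
# swap every Bottleneck in a ResNet-50 for its block-wise stochastic equivalent
# and count the replacements with module surgery.
def _example_apply_functional():
    from torchvision import models

    model = models.resnet50()
    apply_stochastic_depth(model, target_layer_name='ResNetBottleneck', stochastic_method='block', drop_rate=0.2)
    # ResNet-50 contains 16 Bottleneck blocks (3 + 4 + 6 + 3), all of which get replaced
    assert module_surgery.count_module_instances(model, StochasticBottleneck) == 16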
class StochasticDepth(Algorithm):
    """Applies Stochastic Depth (`Huang et al, 2016 <https://arxiv.org/abs/1603.09382>`_) to the specified model.

    The algorithm replaces the specified target layer with a stochastic version of the layer. The stochastic layer
    will randomly drop either samples or the layer itself depending on the stochastic method specified. The
    block-wise version follows the original paper. The sample-wise version follows the implementation used for
    EfficientNet in the `Tensorflow/TPU repo <https://github.com/tensorflow/tpu>`_.

    Runs on :attr:`~composer.core.event.Event.INIT`, as well as :attr:`~composer.core.event.Event.BATCH_START` if
    ``drop_warmup > 0``.

    .. note::

        Stochastic Depth only works on instances of :class:`torchvision.models.resnet.ResNet` for now.

    Args:
        target_layer_name (str): Block to replace with a stochastic block equivalent. The name must be registered
            in the ``_STOCHASTIC_LAYER_MAPPING`` dictionary with the target layer class and the stochastic layer
            class. Currently, only :class:`torchvision.models.resnet.Bottleneck` is supported.
        stochastic_method (str, optional): The version of stochastic depth to use. ``"block"`` randomly drops
            blocks during training. ``"sample"`` randomly drops samples within a block during training.
            Default: ``"block"``.
        drop_rate (float, optional): The base probability of dropping a layer or sample. Must be between 0.0 and
            1.0. Default: ``0.2``.
        drop_distribution (str, optional): How ``drop_rate`` is distributed across layers. Value must be one of
            ``"uniform"`` or ``"linear"``. ``"uniform"`` assigns the same ``drop_rate`` across all layers.
            ``"linear"`` linearly increases the drop rate across layer depth, starting with 0 drop rate and ending
            with ``drop_rate``. Default: ``"linear"``.
        drop_warmup (str | Time | float, optional): A :class:`Time` object, time-string, or float on ``[0.0, 1.0]``
            representing the fraction of the training duration over which to linearly increase the drop probability
            to ``drop_rate``. Default: ``0.0``.
        use_same_gpu_seed (bool, optional): Set to ``True`` to have the same layers dropped across GPUs when using
            multi-GPU training. Set to ``False`` to have each GPU drop a different set of layers. Only used with
            the ``"block"`` stochastic method. Default: ``True``.
    """

    def __init__(self,
                 target_layer_name: str,
                 stochastic_method: str = 'block',
                 drop_rate: float = 0.2,
                 drop_distribution: str = 'linear',
                 drop_warmup: Union[float, Time, str] = 0.0,
                 use_same_gpu_seed: bool = True):
        if drop_rate == 0.0:
            log.warning('Stochastic Depth will have no effect when drop_rate set to 0')

        if stochastic_method == "sample" and not use_same_gpu_seed:
            log.warning('use_same_gpu_seed=false has no effect when using the "sample" method')

        self.target_layer_name = target_layer_name
        self.stochastic_method = stochastic_method
        self.drop_rate = drop_rate
        self.drop_distribution = drop_distribution
        if isinstance(drop_warmup, str):
            drop_warmup = Time.from_timestring(drop_warmup)
        if isinstance(drop_warmup, float):
            drop_warmup = Time(drop_warmup, TimeUnit.DURATION)
        self.drop_warmup = drop_warmup
        self.use_same_gpu_seed = use_same_gpu_seed
        _validate_stochastic_hparams(stochastic_method=self.stochastic_method,
                                     target_layer_name=self.target_layer_name,
                                     drop_rate=self.drop_rate,
                                     drop_distribution=self.drop_distribution,
                                     drop_warmup=str(self.drop_warmup))

    @property
    def find_unused_parameters(self) -> bool:
        """DDP parameter to notify that parameters may not have gradients if a layer is dropped during the forward
        pass."""
        return self.stochastic_method == "block"
    def match(self, event: Event, state: State) -> bool:
        """Run on :attr:`~composer.core.event.Event.INIT`, as well as
        :attr:`~composer.core.event.Event.BATCH_START` if ``drop_warmup > 0``.

        Args:
            event (Event): The current event.
            state (State): The current state.

        Returns:
            bool: True if this algorithm should run now.
        """
        return (event == Event.INIT) or (event == Event.BATCH_START and self.drop_warmup > 0.0)
    def apply(self, event: Event, state: State, logger: Logger) -> Optional[int]:
        """Applies StochasticDepth modification to the state's model.

        Args:
            event (Event): the current event.
            state (State): the current trainer state.
            logger (Logger): the training logger.
        """
        assert state.model is not None
        target_layer, stochastic_layer = _STOCHASTIC_LAYER_MAPPING[self.stochastic_method][self.target_layer_name]

        if event == Event.INIT:
            if module_surgery.count_module_instances(state.model, target_layer) == 0:
                log.warning(f'No {self.target_layer_name} found in model! Algorithm will function as a no-op.')

            apply_stochastic_depth(state.model,
                                   optimizers=state.optimizers,
                                   target_layer_name=self.target_layer_name,
                                   stochastic_method=self.stochastic_method,
                                   drop_rate=self.drop_rate,
                                   drop_distribution=self.drop_distribution,
                                   use_same_gpu_seed=self.use_same_gpu_seed)
            num_stochastic_layers = module_surgery.count_module_instances(state.model, stochastic_layer)
            logger.data_epoch({'stochastic_depth/num_stochastic_layers': num_stochastic_layers})

        elif event == Event.BATCH_START:
            if state.get_elapsed_duration() < self.drop_warmup:
                current_drop_rate = float(state.get_elapsed_duration() / self.drop_warmup) * self.drop_rate
                _update_drop_rate(state.model, stochastic_layer, current_drop_rate, self.drop_distribution)
            else:
                current_drop_rate = self.drop_rate
            logger.data_batch({'stochastic_depth/drop_rate': current_drop_rate})
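

# Two illustrative sketches. The first constructs the algorithm for use with the
# Composer trainer's ``algorithms`` argument (trainer wiring is assumed, not
# shown in this module). The second mirrors the warm-up arithmetic that
# ``apply`` performs on BATCH_START: with drop_warmup=0.1dur and drop_rate=0.2,
# the effective drop rate ramps linearly from 0 to 0.2 over the first 10% of
# training and stays at 0.2 afterwards.
def _example_construction() -> StochasticDepth:
    # e.g. Trainer(model=..., algorithms=[_example_construction()], ...)
    return StochasticDepth(target_layer_name='ResNetBottleneck',
                           stochastic_method='block',
                           drop_rate=0.2,
                           drop_distribution='linear',
                           drop_warmup='0.1dur')


def _example_warmup_drop_rate(elapsed_duration: float, drop_warmup: float = 0.1, drop_rate: float = 0.2) -> float:
    # Scale drop_rate by the fraction of warm-up completed, then hold it constant;
    # _example_warmup_drop_rate(0.05) == 0.1 and _example_warmup_drop_rate(0.5) == 0.2.
    if elapsed_duration < drop_warmup:
        return (elapsed_duration / drop_warmup) * drop_rate
    return drop_rate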
def _validate_stochastic_hparams(target_layer_name: str,
                                 stochastic_method: str,
                                 drop_rate: float,
                                 drop_distribution: str,
                                 drop_warmup: str = "0dur"):
    """Helper function to validate the Stochastic Depth hyperparameter values."""
    if stochastic_method and (stochastic_method not in _STOCHASTIC_LAYER_MAPPING):
        raise ValueError(f"stochastic_method {stochastic_method} is not supported."
                         f" Must be one of {list(_STOCHASTIC_LAYER_MAPPING.keys())}")

    if target_layer_name and (target_layer_name not in _STOCHASTIC_LAYER_MAPPING[stochastic_method]):
        raise ValueError(f"target_layer_name {target_layer_name} is not supported with {stochastic_method}."
                         f" Must be one of {list(_STOCHASTIC_LAYER_MAPPING[stochastic_method].keys())}")

    if drop_rate and (drop_rate < 0 or drop_rate > 1):
        raise ValueError(f"drop_rate must be between 0 and 1: {drop_rate}")

    if drop_distribution and (drop_distribution not in _VALID_LAYER_DISTRIBUTIONS):
        raise ValueError(f"drop_distribution '{drop_distribution}' is"
                         f" not supported. Must be one of {list(_VALID_LAYER_DISTRIBUTIONS)}")

    if stochastic_method == "sample" and Time.from_timestring(drop_warmup).value != 0:
        raise ValueError("drop_warmup cannot be used with the 'sample' stochastic_method")


def _update_drop_rate(module: torch.nn.Module, stochastic_block: Type[torch.nn.Module], drop_rate: float,
                      drop_distribution: str):
    """Recursively updates a module's drop_rate attributes with a new value."""
    if len(list(module.children())) == 0 and len(list(module.parameters())) > 0:
        # Leaf module with parameters; nothing stochastic left to update below it.
        return
    else:
        for child in module.children():
            if isinstance(child, stochastic_block):
                if drop_distribution == 'uniform':
                    current_drop_rate = drop_rate
                elif drop_distribution == 'linear':
                    current_drop_rate = ((child.module_id + 1) / child.module_count) * drop_rate  # type: ignore
                else:
                    raise ValueError(f"drop_distribution '{drop_distribution}' is"
                                     f" not supported. Must be one of {list(_VALID_LAYER_DISTRIBUTIONS)}")
                child.drop_rate = torch.tensor(current_drop_rate)
            _update_drop_rate(child, stochastic_block, drop_rate, drop_distribution)
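

# A worked sketch of the 'linear' drop distribution used in ``_update_drop_rate``:
# with module_count=4 and drop_rate=0.2, successive blocks receive drop rates
# 0.05, 0.10, 0.15, and 0.20, so deeper blocks are dropped more often than
# shallower ones.
def _example_linear_drop_rates(module_count: int = 4, drop_rate: float = 0.2):
    return [((module_id + 1) / module_count) * drop_rate for module_id in range(module_count)]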