Source code for composer.algorithms.layer_freezing.layer_freezing

# Copyright 2022 MosaicML Composer authors
# SPDX-License-Identifier: Apache-2.0

"""Core Layer Freezing classes and functions."""

from __future__ import annotations

import logging
import textwrap
import warnings
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union

import torch
from torch.optim import Optimizer

from composer.core import Algorithm, Event, State
from composer.loggers import Logger
from composer.utils import ensure_tuple

log = logging.getLogger(__name__)

__all__ = ['LayerFreezing', 'freeze_layers']


def freeze_layers(
    model: torch.nn.Module,
    optimizers: Union[Optimizer, Sequence[Optimizer]],
    current_duration: float,
    freeze_start: float = 0.5,
    freeze_level: float = 1.0,
) -> Tuple[int, float]:
    """Progressively freeze the layers of the network in-place during training, starting with the earlier layers.

    Example:
        .. testcode::

            from composer.algorithms.layer_freezing import freeze_layers

            freeze_depth, freeze_percentage = freeze_layers(
                model=model,
                optimizers=optimizer,
                current_duration=0.5,
                freeze_start=0.0,
                freeze_level=1.0,
            )

    Args:
        model (torch.nn.Module): The model being trained.
        optimizers (torch.optim.Optimizer | Sequence[torch.optim.Optimizer]): The optimizers used during training.
        current_duration (float): The fraction, in ``[0, 1)``, of training that has elapsed.
        freeze_start (float, optional): The fraction of the training process in ``[0, 1)`` to run
            before freezing begins. Default: ``0.5``.
        freeze_level (float, optional): The maximum fraction of layers on ``[0, 1)`` to freeze.
            Default: ``1.0``.

    Return:
        (int, float): The number of layers frozen, and the percentage of the total model frozen.
    """
    # Flatten out the layers
    flat_children = []
    _get_layers(model, flat_children)
    # Determine how many layers to freeze
    freeze_percentage = _freeze_schedule(
        current_duration=current_duration,
        freeze_start=freeze_start,
        freeze_level=freeze_level,
    )
    freeze_depth = int(freeze_percentage * len(flat_children[0:-1]))
    # Freeze the parameters in the chosen layers
    for i, child in enumerate(flat_children[0:-1]):
        if i < freeze_depth:
            for p in child.parameters():
                _remove_param_from_optimizers(p, optimizers)
                # Do not compute gradients for this param.
                p.requires_grad = False
    # Log results
    log.info(
        textwrap.dedent(
            f"""\
            Applied Layer Freezing with freeze_start={freeze_start},
            freeze_level={freeze_level}. Froze {freeze_depth} layers in the model which
            equates to {freeze_percentage * 100}% of all layers.""",
        ),
    )
    return freeze_depth, freeze_percentage
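
A minimal standalone sketch of calling ``freeze_layers`` directly, outside the Trainer. The toy model, the SGD optimizer, and the specific duration values are illustrative assumptions; the expected results follow from the schedule and layer-flattening logic in this module.

    import torch

    from composer.algorithms.layer_freezing import freeze_layers

    # A toy model with four parameterized layers; sizes are arbitrary.
    model = torch.nn.Sequential(
        torch.nn.Linear(8, 8),
        torch.nn.Linear(8, 8),
        torch.nn.Linear(8, 8),
        torch.nn.Linear(8, 2),
    )
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

    # Three quarters of the way through training, with freezing starting at the
    # halfway point, the schedule's freeze fraction is 0.5. That freezes 1 of the
    # 3 candidate layers (the final layer is never a freeze candidate).
    freeze_depth, freeze_percentage = freeze_layers(
        model=model,
        optimizers=optimizer,
        current_duration=0.75,
        freeze_start=0.5,
        freeze_level=1.0,
    )

    # The earliest layer is now frozen: gradients are disabled and its parameters
    # have been removed from the optimizer's param groups.
    assert freeze_depth == 1
    assert all(not p.requires_grad for p in model[0].parameters())
    remaining = [p for group in optimizer.param_groups for p in group['params']]
    assert all(p is not q for p in model[0].parameters() for q in remaining)
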
class LayerFreezing(Algorithm):
    """Progressively freeze the layers of the network during training, starting with the earlier layers.

    Freezing starts after the fraction of training specified by ``freeze_start`` has elapsed. The fraction of layers
    frozen increases linearly until it reaches ``freeze_level`` at the end of training.

    This freezing schedule is most similar to
    `FreezeOut <https://arxiv.org/abs/1706.04983>`_ and
    `Freeze Training <https://arxiv.org/abs/1706.05806>`_.

    Runs on :attr:`.Event.EPOCH_END`.

    Example:
        .. testcode::

            from composer.algorithms import LayerFreezing
            from composer.trainer import Trainer

            layer_freezing_algorithm = LayerFreezing(freeze_start=0.0, freeze_level=1.0)
            trainer = Trainer(
                model=model,
                train_dataloader=train_dataloader,
                eval_dataloader=eval_dataloader,
                max_duration="1ep",
                algorithms=[layer_freezing_algorithm],
                optimizers=[optimizer],
            )

    Args:
        freeze_start (float): The fraction of training to run before freezing begins. Default: ``0.5``.
        freeze_level (float): The maximum fraction of layers to freeze. Default: ``1.0``.
    """

    def __init__(self, freeze_start: float = 0.5, freeze_level: float = 1.0):
        self.freeze_start = freeze_start
        self.freeze_level = freeze_level

    @property
    def find_unused_parameters(self) -> bool:
        """Override in order to tell DDP that some parameters will not have gradients computed for them after layer
        freezing is applied."""
        return True

    def match(self, event: Event, state: State) -> bool:
        del state  # unused
        return event == Event.EPOCH_END

    def apply(self, event: Event, state: State, logger: Logger) -> Optional[int]:
        del event  # unused
        optimizers = state.optimizers
        assert optimizers is not None
        elapsed_duration = state.get_elapsed_duration()
        assert elapsed_duration is not None, 'elapsed duration should be set on Event.EPOCH_END'
        freeze_depth, freeze_percentage = freeze_layers(
            model=state.model,
            optimizers=optimizers,
            current_duration=float(elapsed_duration),
            freeze_start=self.freeze_start,
            freeze_level=self.freeze_level,
        )
        logger.log_metrics({
            'layer_freezing/layers_frozen': freeze_depth,
            'layer_freezing/percentage_frozen': freeze_percentage,
        })

    def state_dict(self) -> Dict[str, Any]:
        warnings.warn((
            'Checkpoints with layer freezing cannot reliably be used to resume training. '
            'See: https://github.com/mosaicml/composer/issues/1002'
        ))
        return {}

    def load_state_dict(self, state: Dict[str, Any]) -> None:
        warnings.warn((
            'Checkpoints with layer freezing cannot reliably be used to resume training. '
            'See: https://github.com/mosaicml/composer/issues/1002'
        ))

def _freeze_schedule(current_duration: float, freeze_start: float, freeze_level: float) -> float:
    """Implements a linear schedule for freezing.

    The schedule begins with no freezing and linearly increases the fraction of layers frozen, reaching the fraction
    specified by ``freeze_level`` at the end of training. The start of freezing is given as a fraction of the total
    training duration and is set with ``freeze_start``.

    Args:
        current_duration (float): The elapsed training duration.
        freeze_start (float): The fraction of training to run before freezing begins.
        freeze_level (float): The maximum fraction of layers to freeze.
    """
    # No freezing before the elapsed duration reaches `freeze_start`.
    if current_duration <= freeze_start:
        return 0.0
    # Calculate the total time over which freezing can occur.
    total_freezing_time = 1.0 - freeze_start
    # Calculate the amount of freezing time that has elapsed.
    freezing_time_elapsed = current_duration - freeze_start
    # Calculate the fraction of the freezing time elapsed.
    freezing_time_elapsed_frac = freezing_time_elapsed / total_freezing_time
    # Scale this fraction by the amount of freezing to do.
    return freeze_level * freezing_time_elapsed_frac


def _get_layers(module: torch.nn.Module, flat_children: List[torch.nn.Module]):
    """Helper function to get all submodules.

    Does a depth-first search to flatten out modules which contain parameters.

    Args:
        module (torch.nn.Module): Current module to search.
        flat_children (List[torch.nn.Module]): List containing modules.
    """
    # Check if the given module has no children and has parameters.
    if (len(list(module.children())) == 0 and len(list(module.parameters())) > 0):
        flat_children.append(module)
    else:
        # Otherwise, continue the search over its children.
        for child in module.children():
            _get_layers(child, flat_children)


def _remove_param_from_optimizers(p: torch.nn.Parameter, optimizers: Union[Optimizer, Sequence[Optimizer]]):
    """Helper function to freeze the training of a parameter.

    To freeze a parameter, it must be removed from the optimizer, otherwise momentum and weight decay may still be
    applied.

    Args:
        p (torch.nn.Parameter): The parameter being frozen.
        optimizers (torch.optim.Optimizer | Sequence[torch.optim.Optimizer]): The optimizers used during training.
    """
    # Search over the params in each optimizer to find and remove the given param.
    # This is necessary due to the way params are stored.
    for optimizer in ensure_tuple(optimizers):
        for group in optimizer.param_groups:
            group['params'] = list(filter(lambda x: id(x) != id(p), group['params']))
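
For reference, a self-contained sketch of the linear ramp that ``_freeze_schedule`` implements, using assumed values of ``freeze_start=0.5`` and ``freeze_level=1.0``. ``linear_freeze_fraction`` is a hypothetical stand-in written for illustration, not part of Composer's API.

    def linear_freeze_fraction(current_duration, freeze_start=0.5, freeze_level=1.0):
        # Zero before freeze_start, then a linear ramp that reaches freeze_level
        # at the end of training (current_duration == 1.0).
        if current_duration <= freeze_start:
            return 0.0
        return freeze_level * (current_duration - freeze_start) / (1.0 - freeze_start)

    assert linear_freeze_fraction(0.25) == 0.0  # before freeze_start: nothing frozen
    assert linear_freeze_fraction(0.75) == 0.5  # halfway through the ramp
    assert linear_freeze_fraction(1.0) == 1.0   # end of training: freeze_level reached
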