Source code for composer.algorithms.hparams

# Copyright 2021 MosaicML. All Rights Reserved.

import textwrap
from dataclasses import asdict, dataclass
from typing import Optional

import yahp as hp

from composer.algorithms.algorithm_hparams import AlgorithmHparams
from composer.algorithms.alibi import Alibi
from composer.algorithms.augmix import AugMix
from composer.algorithms.blurpool import BlurPool
from composer.algorithms.channels_last import ChannelsLast
from composer.algorithms.colout import ColOut
from composer.algorithms.cutmix import CutMix
from composer.algorithms.cutout import CutOut
from composer.algorithms.factorize import Factorize
from composer.algorithms.ghost_batchnorm import GhostBatchNorm
from composer.algorithms.label_smoothing import LabelSmoothing
from composer.algorithms.layer_freezing import LayerFreezing
from composer.algorithms.mixup import MixUp
from composer.algorithms.no_op_model import NoOpModel
from composer.algorithms.progressive_resizing import ProgressiveResizing
from composer.algorithms.randaugment import RandAugment
from composer.algorithms.sam import SAM
from composer.algorithms.scale_schedule import ScaleSchedule
from composer.algorithms.selective_backprop import SelectiveBackprop
from composer.algorithms.seq_length_warmup import SeqLengthWarmup
from composer.algorithms.squeeze_excite import SqueezeExcite
from composer.algorithms.stochastic_depth import StochasticDepth
from composer.algorithms.stochastic_depth.stochastic_depth import (_STOCHASTIC_LAYER_MAPPING,
                                                                   _validate_stochastic_hparams)
from composer.algorithms.swa import SWA


@dataclass
class AlibiHparams(AlgorithmHparams):
    """See :class:`Alibi`"""

    position_embedding_attribute: str = hp.required("attribute name of position embeddings within the model. "
                                                    "For example in HuggingFace's GPT2, the position "
                                                    "embeddings are 'transformer.wpe'")
    attention_module_name: str = hp.required("module/class that will have its self-attention "
                                             "function replaced. For example, in HuggingFace's "
                                             "GPT, the self-attention module is "
                                             "'transformers.models.gpt2.modeling_gpt2.GPT2Attention'")
    attr_to_replace: str = hp.required("model attribute that self-attention function will "
                                       "replace. For example, in HuggingFace's "
                                       "GPT2, the self-attention function is '_attn'")
    alibi_attention: str = hp.required("new self-attention function in which ALiBi is "
                                       "implemented. Used to replace "
                                       "'{attention_module}.{attr_to_replace}'")
    mask_replacement_function: Optional[str] = hp.optional(
        "function to replace model's attention mask. This is "
        "sometimes necessary for evaluating on sequence "
        "lengths longer than the model was initialized to accommodate.",
        default=None)
    heads_per_layer: Optional[int] = hp.optional(
        'Number of attention heads per layer. If '
        '"None", will attempt to determine from model.config.n_head.',
        default=None)
    max_sequence_length: int = hp.optional('Maximum allowable sequence length', default=8192)
    train_sequence_length_scaling: float = hp.optional(
        'Amount by which to scale training sequence length. One batch of training data '
        'will be reshaped from size (sequence_length, batch) to '
        '(sequence_length*train_sequence_length_scaling, batch/train_sequence_length_scaling)',
        default=0.25)

    def initialize_object(self) -> "Alibi":
        return Alibi(**asdict(self))

@dataclass
class AugMixHparams(AlgorithmHparams):
    """See :class:`AugMix`"""

    severity: int = hp.optional(doc="Intensity of each augmentation. Ranges from 0 (none) to 10 (maximum)", default=3)
    depth: int = hp.optional(doc="Number of augmentations to compose in a row", default=-1)
    width: int = hp.optional(doc="Number of parallel augmentation sequences to combine", default=3)
    alpha: float = hp.optional(doc="Mixing parameter for clean vs. augmented images.", default=1.0)
    augmentation_set: str = hp.optional(
        doc="Set of augmentations to sample from. 'all', 'safe' (only augmentations that don't "
        "appear on CIFAR10C/ImageNet10C), or 'original'",
        default="all")

    def initialize_object(self) -> AugMix:
        return AugMix(**asdict(self))

@dataclass
class BlurPoolHparams(AlgorithmHparams):
    """See :class:`BlurPool`"""

    replace_convs: bool = hp.optional('Replace Conv2d with BlurConv2d if stride > 1', default=True)
    replace_maxpools: bool = hp.optional('Replace MaxPool2d with BlurMaxPool2d', default=True)
    blur_first: bool = hp.optional('Blur input before convolution', default=True)

    def initialize_object(self) -> "BlurPool":
        return BlurPool(**asdict(self))

@dataclass
class ChannelsLastHparams(AlgorithmHparams):
    """ChannelsLast has no hyperparameters, so this class has no member variables."""

    def initialize_object(self) -> ChannelsLast:
        return ChannelsLast()

@dataclass
class ColOutHparams(AlgorithmHparams):
    """See :class:`ColOut`"""

    p_row: float = hp.optional(doc="Fraction of rows to drop", default=0.15)
    p_col: float = hp.optional(doc="Fraction of cols to drop", default=0.15)
    batch: bool = hp.optional(doc="Run ColOut at the batch level", default=True)

    def initialize_object(self) -> ColOut:
        return ColOut(**asdict(self))

@dataclass
class CutMixHparams(AlgorithmHparams):
    """See :class:`CutMix`"""

    num_classes: int = hp.required('Number of classes in the task labels.')
    alpha: float = hp.optional('Strength of interpolation, should be >= 0. No interpolation if alpha=0.', default=1.0)
    uniform_sampling: bool = hp.optional('Mix pixels with uniform probability', default=False)

    def initialize_object(self) -> CutMix:
        return CutMix(**asdict(self))

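# Illustrative usage sketch (not part of the original module): each *Hparams dataclass is
# constructed with its required and optional fields, then converted into the corresponding
# algorithm via ``initialize_object``. The ``num_classes`` value below is an assumed
# example value, not a prescribed setting.
#
#   cutmix_hparams = CutMixHparams(num_classes=1000)  # num_classes is required; alpha and
#                                                     # uniform_sampling fall back to defaults
#   cutmix = cutmix_hparams.initialize_object()       # returns a CutMix algorithm instance
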
@dataclass
class CutOutHparams(AlgorithmHparams):
    """See :class:`CutOut`"""

    num_holes: int = hp.optional('Number of holes to cut out', default=1)
    length: float = hp.optional('Relative or absolute side length of the square hole to cut out', default=0.5)
    uniform_sampling: bool = hp.optional('Mask pixels with uniform probability', default=False)

    def initialize_object(self) -> CutOut:
        return CutOut(**asdict(self))

@dataclass
class FactorizeHparams(AlgorithmHparams):
    """See :class:`Factorize`"""

    factorize_convs: bool = hp.optional(
        doc='Whether to factorize convolutional layers',
        default=True,
    )
    factorize_linears: bool = hp.optional(
        doc='Whether to factorize linear layers',
        default=True,
    )
    min_channels: int = hp.optional(
        doc='Minimum number of channels in a Conv2d module for it to be factorized.',
        default=512,
    )
    latent_channels: float = hp.optional(
        doc='Number or relative fraction of channels in factorized convolution latent representations',
        default=0.25,
    )
    min_features: int = hp.optional(
        doc='Minimum number of features in a Linear module for it to be factorized.',
        default=512,
    )
    latent_features: float = hp.optional(
        doc='Number or relative fraction of features in factorized linear latent representations',
        default=0.25,
    )

    def initialize_object(self) -> Factorize:
        return Factorize(**asdict(self))

@dataclass
class GhostBatchNormHparams(AlgorithmHparams):
    """See :class:`GhostBatchNorm`"""

    ghost_batch_size: int = hp.optional(doc='Size of sub-batches to normalize over', default=32)

    def initialize_object(self) -> GhostBatchNorm:
        return GhostBatchNorm(**asdict(self))

@dataclass
class LabelSmoothingHparams(AlgorithmHparams):
    """See :class:`LabelSmoothing`"""

    smoothing: float = hp.optional(doc='smoothing factor between 0 and 1', default=0.1)

    def initialize_object(self) -> LabelSmoothing:
        return LabelSmoothing(**asdict(self))

@dataclass
class LayerFreezingHparams(AlgorithmHparams):
    """See :class:`LayerFreezing`"""

    freeze_start: float = hp.optional(doc='The percentage of epochs to run before freezing begins.', default=0.5)
    freeze_level: float = hp.optional(doc='Scale factor for the percentage of the network to freeze.', default=1.0)

    def initialize_object(self) -> LayerFreezing:
        return LayerFreezing(**asdict(self))

@dataclass
class MixUpHparams(AlgorithmHparams):
    """See :class:`MixUp`"""

    alpha: float = hp.optional('Strength of interpolation, should be >= 0. No interpolation if alpha=0.', default=0.2)
    interpolate_loss: bool = hp.optional('Use index labels and interpolate the loss instead of the labels.',
                                         default=False)

    def initialize_object(self) -> MixUp:
        return MixUp(**asdict(self))

@dataclass
class NoOpModelHparams(AlgorithmHparams):
    """See :class:`NoOpModel`"""

    def initialize_object(self) -> NoOpModel:
        return NoOpModel()

@dataclass
class ProgressiveResizingHparams(AlgorithmHparams):
    """See :class:`ProgressiveResizing`"""

    mode: str = hp.optional(doc="Type of scaling to perform", default="resize")
    initial_scale: float = hp.optional(doc="Initial scale factor", default=0.5)
    finetune_fraction: float = hp.optional(doc="Fraction of training to reserve for finetuning on full-sized inputs",
                                           default=0.2)
    resize_targets: bool = hp.optional(doc="Also resize targets", default=False)

    def initialize_object(self) -> ProgressiveResizing:
        return ProgressiveResizing(**asdict(self))

@dataclass
class RandAugmentHparams(AlgorithmHparams):
    """See :class:`RandAugment`"""

    severity: int = hp.optional(doc="Intensity of each augmentation. Ranges from 0 (none) to 10 (maximum)", default=9)
    depth: int = hp.optional(doc="Number of augmentations to compose in a row", default=2)
    augmentation_set: str = hp.optional(
        doc="Set of augmentations to sample from. 'all', 'safe' (only augmentations that don't "
        "appear on CIFAR10C/ImageNet10C), or 'original'",
        default="all")

    def initialize_object(self) -> "RandAugment":
        return RandAugment(**asdict(self))

@dataclass
class SAMHparams(AlgorithmHparams):
    """See :class:`SAM`"""

    rho: float = hp.optional(doc='The neighborhood size parameter of SAM. Must be greater than 0.', default=0.05)
    epsilon: float = hp.optional(doc='A small value added to gradient norm for numerical stability.', default=1.0e-12)
    interval: int = hp.optional(doc='SAM will run once per `interval` steps. A value of 1 will cause '
                                'SAM to run every step. Steps on which SAM runs take roughly twice '
                                'as much time to complete.',
                                default=1)

    def initialize_object(self) -> SAM:
        return SAM(**asdict(self))

@dataclass
class ScaleScheduleHparams(AlgorithmHparams):
    """See :class:`ScaleSchedule`"""

    ratio: float = hp.optional('Ratio to scale the schedule.', default=1.0)

    def initialize_object(self) -> "ScaleSchedule":
        return ScaleSchedule(**asdict(self))

@dataclass
class SelectiveBackpropHparams(AlgorithmHparams):
    """See :class:`SelectiveBackprop`"""

    start: float = hp.optional(doc="SB interval start, as fraction of training duration", default=0.5)
    end: float = hp.optional(doc="SB interval end, as fraction of training duration", default=0.9)
    keep: float = hp.optional(doc="fraction of minibatch to select and keep for gradient computation", default=0.5)
    scale_factor: float = hp.optional(doc="scale for downsampling input for selection forward pass", default=0.5)
    interrupt: int = hp.optional(doc="interrupt SB with a vanilla minibatch step every 'interrupt' batches", default=2)

    def initialize_object(self) -> SelectiveBackprop:
        return SelectiveBackprop(**asdict(self))

@dataclass
class SeqLengthWarmupHparams(AlgorithmHparams):
    """See :class:`SeqLengthWarmup`"""

    duration: float = hp.optional("Fraction of total training time over which to apply sequence length warmup.",
                                  default=0.3)
    min_seq_length: int = hp.optional("Starting sequence length.", default=8)
    max_seq_length: int = hp.optional("Ending sequence length.", default=1024)
    step_size: int = hp.optional("Sequence length step size", default=8)
    truncate: bool = hp.optional("Truncate tensors or reshape extra tokens to new examples.", default=True)

    def initialize_object(self) -> "SeqLengthWarmup":
        return SeqLengthWarmup(**asdict(self))

@dataclass
class StochasticDepthHparams(AlgorithmHparams):
    """See :class:`StochasticDepth`"""

    target_layer_name: str = hp.required(
        f'Reference name of layer to replace. "block" method can be {list(_STOCHASTIC_LAYER_MAPPING["block"].keys())}.'
        f' "sample" method can be {list(_STOCHASTIC_LAYER_MAPPING["sample"].keys())}.')
    stochastic_method: str = hp.optional('The version of stochastic depth to use. One of ["sample", "block"].',
                                         default='block')
    drop_rate: float = hp.optional('The probability of dropping a block or sample.', default=0.2)
    drop_distribution: str = hp.optional(
        '"Uniform" keeps the drop rate the same across blocks. "linear" linearly'
        ' increases the drop rate with block depth until it reaches `drop_rate`.',
        default='linear')
    use_same_gpu_seed: bool = hp.optional(
        'Whether or not to drop the same blocks across GPUs. Only used with "block" method.', default=True)
    drop_warmup: str = hp.optional(textwrap.dedent("""\
        Time string to represent the amount of training to warmup the `drop_rate`.
        Only use with "block" stochastic method."""),
                                   default="0dur")

    def initialize_object(self) -> StochasticDepth:
        return StochasticDepth(**asdict(self))

    def validate(self):
        super().validate()
        _validate_stochastic_hparams(target_layer_name=self.target_layer_name,
                                     stochastic_method=self.stochastic_method,
                                     drop_rate=self.drop_rate,
                                     drop_distribution=self.drop_distribution,
                                     drop_warmup=self.drop_warmup)

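# Illustrative sketch (not part of the original module): unlike most of these classes,
# StochasticDepthHparams also exposes ``validate``, which checks the field combination
# before the algorithm is built. The ``target_layer_name`` below is a hypothetical
# placeholder; valid names are the keys of ``_STOCHASTIC_LAYER_MAPPING`` for the chosen
# ``stochastic_method``.
#
#   sd_hparams = StochasticDepthHparams(target_layer_name="SomeBlock")  # hypothetical layer name
#   sd_hparams.validate()                       # checks name/method/rate consistency
#   stochastic_depth = sd_hparams.initialize_object()
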
@dataclass
class SqueezeExciteHparams(AlgorithmHparams):
    """See :class:`SqueezeExcite`"""

    latent_channels: float = hp.optional(
        doc='Dimensionality of hidden layer within the added MLP.',
        default=64,
    )
    min_channels: int = hp.optional(
        doc='Minimum number of channels in a Conv2d layer '
        'for a squeeze-excite block to be placed after it.',
        default=128,
    )

    def initialize_object(self) -> SqueezeExcite:
        return SqueezeExcite(**asdict(self))

@dataclass
class SWAHparams(AlgorithmHparams):
    """See :class:`~.composer.algorithms.swa.SWA`"""

    swa_start: str = hp.optional(
        doc='Time string denoting the amount of training '
        'completed before stochastic weight averaging begins. Currently only units of '
        'duration (e.g. "0.7dur") and epoch (e.g. "50ep") are supported.',
        default="0.7dur",
    )
    swa_end: str = hp.optional(
        doc='Time string denoting the amount of training completed before the baseline '
        '(non-averaged) model is replaced with the stochastic weight averaged model. '
        'Currently only units of duration (e.g. "0.97dur") and epoch (e.g. "88ep") are supported.',
        default="0.97dur")
    update_interval: str = hp.optional(doc='Time string denoting how often the averaged model is updated. For example, '
                                       '"1ep" means the averaged model will be updated once per epoch, and '
                                       '"10ba" means the averaged model will be updated every 10 batches.',
                                       default="1ep")
    schedule_swa_lr: bool = hp.optional(doc='Flag to determine whether to apply an SWA-specific LR schedule during the '
                                        'period in which SWA is active.',
                                        default=False)
    anneal_strategy: str = hp.optional(doc='SWA learning rate annealing schedule strategy. '
                                       '"linear" for linear annealing, "cos" for cosine annealing.',
                                       default='linear')
    anneal_steps: int = hp.optional(
        doc='Number of SWA model updates over which to anneal the SWA learning rate. Note '
        'that updates are determined by the ``update_interval`` argument.',
        default=10,
    )
    swa_lr: Optional[float] = hp.optional(
        doc='The final learning rate to anneal towards with this scheduler. '
        'Set to None for no annealing.',
        default=None,
    )

    def initialize_object(self):
        return SWA(**asdict(self))