# Copyright 2021 MosaicML. All Rights Reserved.
import textwrap
from dataclasses import asdict, dataclass
from typing import Optional
import yahp as hp
from composer.algorithms.algorithm_hparams import AlgorithmHparams
from composer.algorithms.alibi import Alibi
from composer.algorithms.augmix import AugMix
from composer.algorithms.blurpool import BlurPool
from composer.algorithms.channels_last import ChannelsLast
from composer.algorithms.colout import ColOut
from composer.algorithms.cutmix import CutMix
from composer.algorithms.cutout import CutOut
from composer.algorithms.factorize import Factorize
from composer.algorithms.ghost_batchnorm import GhostBatchNorm
from composer.algorithms.label_smoothing import LabelSmoothing
from composer.algorithms.layer_freezing import LayerFreezing
from composer.algorithms.mixup import MixUp
from composer.algorithms.no_op_model import NoOpModel
from composer.algorithms.progressive_resizing import ProgressiveResizing
from composer.algorithms.randaugment import RandAugment
from composer.algorithms.sam import SAM
from composer.algorithms.scale_schedule import ScaleSchedule
from composer.algorithms.selective_backprop import SelectiveBackprop
from composer.algorithms.seq_length_warmup import SeqLengthWarmup
from composer.algorithms.squeeze_excite import SqueezeExcite
from composer.algorithms.stochastic_depth import StochasticDepth
from composer.algorithms.stochastic_depth.stochastic_depth import (_STOCHASTIC_LAYER_MAPPING,
_validate_stochastic_hparams)
from composer.algorithms.swa import SWA
@dataclass
class AlibiHparams(AlgorithmHparams):
    """See :class:`Alibi`"""

    position_embedding_attribute: str = hp.required("attribute name of position embeddings within the model. "
                                                    "For example in HuggingFace's GPT2, the position "
                                                    "embeddings are 'transformer.wpe'")
    attention_module_name: str = hp.required("module/class that will have its self-attention "
                                             "function replaced. For example, in HuggingFace's "
                                             "GPT, the self-attention module is "
                                             "'transformers.models.gpt2.modeling_gpt2.GPT2Attention'")
    attr_to_replace: str = hp.required("model attribute that self-attention function will "
                                       "replace. For example, in HuggingFace's "
                                       "GPT2, the self-attention function is '_attn'")
    alibi_attention: str = hp.required("new self-attention function in which ALiBi is "
                                       "implemented. Used to replace "
                                       "'{attention_module}.{attr_to_replace}'")
    mask_replacement_function: Optional[str] = hp.optional(
        "function to replace model's attention mask. This is "
        "sometimes necessary for evaluating on sequence "
        " lengths longer than the model was initialized to accommodate.",
        default=None)
    heads_per_layer: Optional[int] = hp.optional(
        'Number of attention heads per layer. If '
        '"None", will attempt to determine from model.config.n_head.',
        default=None)
    max_sequence_length: int = hp.optional('Maximum allowable sequence length', default=8192)
    train_sequence_length_scaling: float = hp.optional(
        'Amount by which to scale training sequence length. One batch of training data '
        'will be reshaped from size (sequence_length, batch) to '
        '(sequence_length*train_sequence_length_scaling, batch/train_sequence_length_scaling)',
        default=0.25)

    def initialize_object(self) -> "Alibi":
        """Construct an :class:`Alibi` algorithm instance from these hyperparameters."""
        return Alibi(**asdict(self))
@dataclass
class AugMixHparams(AlgorithmHparams):
    """See :class:`AugMix`"""

    severity: int = hp.optional(doc="Intensity of each augmentation. Ranges from 0 (none) to 10 (maximum)", default=3)
    depth: int = hp.optional(doc="Number of augmentations to compose in a row", default=-1)
    width: int = hp.optional(doc="Number of parallel augmentation sequences to combine", default=3)
    alpha: float = hp.optional(doc="Mixing parameter for clean vs. augmented images.", default=1.0)
    augmentation_set: str = hp.optional(
        doc=
        "Set of augmentations to sample from. 'all', 'safe' (only augmentations that don't appear on CIFAR10C/ImageNet10C), or 'original'",
        default="all")

    def initialize_object(self) -> AugMix:
        """Construct an :class:`AugMix` algorithm instance from these hyperparameters."""
        return AugMix(**asdict(self))
@dataclass
class BlurPoolHparams(AlgorithmHparams):
    """See :class:`BlurPool`"""

    replace_convs: bool = hp.optional('Replace Conv2d with BlurConv2d if stride > 1', default=True)
    replace_maxpools: bool = hp.optional('Replace MaxPool2d with BlurMaxPool2d', default=True)
    blur_first: bool = hp.optional('Blur input before convolution', default=True)

    def initialize_object(self) -> "BlurPool":
        """Construct a :class:`BlurPool` algorithm instance from these hyperparameters."""
        return BlurPool(**asdict(self))
@dataclass
class ChannelsLastHparams(AlgorithmHparams):
    """ChannelsLast has no hyperparameters, so this class has no member variables."""

    def initialize_object(self) -> ChannelsLast:
        """Construct a :class:`ChannelsLast` algorithm instance (takes no arguments)."""
        return ChannelsLast()
@dataclass
class ColOutHparams(AlgorithmHparams):
    """See :class:`ColOut`"""

    p_row: float = hp.optional(doc="Fraction of rows to drop", default=0.15)
    p_col: float = hp.optional(doc="Fraction of cols to drop", default=0.15)
    batch: bool = hp.optional(doc="Run ColOut at the batch level", default=True)

    def initialize_object(self) -> ColOut:
        """Construct a :class:`ColOut` algorithm instance from these hyperparameters."""
        return ColOut(**asdict(self))
@dataclass
class CutMixHparams(AlgorithmHparams):
    """See :class:`CutMix`"""

    num_classes: int = hp.required('Number of classes in the task labels.')
    alpha: float = hp.optional('Strength of interpolation, should be >= 0. No interpolation if alpha=0.', default=1.0)
    uniform_sampling: bool = hp.optional('Mix pixels with uniform probability', default=False)

    def initialize_object(self) -> CutMix:
        """Construct a :class:`CutMix` algorithm instance from these hyperparameters."""
        return CutMix(**asdict(self))
@dataclass
class CutOutHparams(AlgorithmHparams):
    """See :class:`CutOut`"""

    num_holes: int = hp.optional('Number of holes to cut out', default=1)
    length: float = hp.optional('Relative or absolute side length of the square hole to cut out', default=0.5)
    uniform_sampling: bool = hp.optional('Mask pixels with uniform probability', default=False)

    def initialize_object(self) -> CutOut:
        """Construct a :class:`CutOut` algorithm instance from these hyperparameters."""
        return CutOut(**asdict(self))
@dataclass
class FactorizeHparams(AlgorithmHparams):
    """See :class:`Factorize`"""

    factorize_convs: bool = hp.optional(
        doc='Whether to factorize convolutional layers',
        default=True,
    )
    factorize_linears: bool = hp.optional(
        doc='Whether to factorize linear layers',
        default=True,
    )
    min_channels: int = hp.optional(
        doc=('Minimum number of channels in a Conv2d module' + ' for it to be factorized.'),
        default=512,
    )
    latent_channels: float = hp.optional(
        doc='Number or relative fraction of channels in factorized convolution latent representations',
        default=0.25,
    )
    min_features: int = hp.optional(
        doc=('Minimum number of features in a Linear module' + ' for it to be factorized.'),
        default=512,
    )
    latent_features: float = hp.optional(
        doc='Number or relative fraction of features in factorized linear latent representations',
        default=0.25,
    )

    def initialize_object(self) -> Factorize:
        """Construct a :class:`Factorize` algorithm instance from these hyperparameters."""
        return Factorize(**asdict(self))
@dataclass
class GhostBatchNormHparams(AlgorithmHparams):
    """See :class:`GhostBatchNorm`"""

    ghost_batch_size: int = hp.optional(doc='Size of sub-batches to normalize over', default=32)

    def initialize_object(self) -> GhostBatchNorm:
        """Construct a :class:`GhostBatchNorm` algorithm instance from these hyperparameters."""
        return GhostBatchNorm(**asdict(self))
@dataclass
class LabelSmoothingHparams(AlgorithmHparams):
    """See :class:`LabelSmoothing`"""

    smoothing: float = hp.optional(doc='smoothing factor between 0 and 1', default=0.1)

    def initialize_object(self) -> LabelSmoothing:
        """Construct a :class:`LabelSmoothing` algorithm instance from these hyperparameters."""
        return LabelSmoothing(**asdict(self))
@dataclass
class LayerFreezingHparams(AlgorithmHparams):
    """See :class:`LayerFreezing`"""

    freeze_start: float = hp.optional(doc='The percentage of epochs to run before freezing begins.', default=0.5)
    freeze_level: float = hp.optional(doc='Scale factor for the percentage of the network to freeze.', default=1.0)

    def initialize_object(self) -> LayerFreezing:
        """Construct a :class:`LayerFreezing` algorithm instance from these hyperparameters."""
        return LayerFreezing(**asdict(self))
@dataclass
class MixUpHparams(AlgorithmHparams):
    """See :class:`MixUp`"""

    alpha: float = hp.optional('Strength of interpolation, should be >= 0. No interpolation if alpha=0.', default=0.2)
    interpolate_loss: bool = hp.optional('Use index labels and interpolate the loss instead of the labels.',
                                         default=False)

    def initialize_object(self) -> MixUp:
        """Construct a :class:`MixUp` algorithm instance from these hyperparameters."""
        return MixUp(**asdict(self))
@dataclass
class NoOpModelHparams(AlgorithmHparams):
    """See :class:`NoOpModel`. Has no hyperparameters."""

    def initialize_object(self) -> NoOpModel:
        """Construct a :class:`NoOpModel` algorithm instance (takes no arguments)."""
        return NoOpModel()
@dataclass
class ProgressiveResizingHparams(AlgorithmHparams):
    """See :class:`ProgressiveResizing`"""

    mode: str = hp.optional(doc="Type of scaling to perform", default="resize")
    initial_scale: float = hp.optional(doc="Initial scale factor", default=0.5)
    finetune_fraction: float = hp.optional(doc="Fraction of training to reserve for finetuning on full-sized inputs",
                                           default=0.2)
    resize_targets: bool = hp.optional(doc="Also resize targets", default=False)

    def initialize_object(self) -> ProgressiveResizing:
        """Construct a :class:`ProgressiveResizing` algorithm instance from these hyperparameters."""
        return ProgressiveResizing(**asdict(self))
@dataclass
class RandAugmentHparams(AlgorithmHparams):
    """See :class:`RandAugment`"""

    severity: int = hp.optional(doc="Intensity of each augmentation. Ranges from 0 (none) to 10 (maximum)", default=9)
    depth: int = hp.optional(doc="Number of augmentations to compose in a row", default=2)
    augmentation_set: str = hp.optional(
        doc=
        "Set of augmentations to sample from. 'all', 'safe' (only augmentations that don't appear on CIFAR10C/ImageNet10C), or 'original'",
        default="all")

    def initialize_object(self) -> "RandAugment":
        """Construct a :class:`RandAugment` algorithm instance from these hyperparameters."""
        return RandAugment(**asdict(self))
@dataclass
class SAMHparams(AlgorithmHparams):
    """See :class:`SAM`"""

    rho: float = hp.optional(doc='The neighborhood size parameter of SAM. Must be greater than 0.', default=0.05)
    epsilon: float = hp.optional(doc='A small value added to gradient norm for numerical stability.', default=1.0e-12)
    interval: int = hp.optional(doc='SAM will run once per `interval` steps. A value of 1 will cause'
                                'SAM to run every step. Steps on which SAM runs take roughly twice'
                                'as much time to complete.',
                                default=1)

    def initialize_object(self) -> SAM:
        """Construct a :class:`SAM` algorithm instance from these hyperparameters."""
        return SAM(**asdict(self))
@dataclass
class ScaleScheduleHparams(AlgorithmHparams):
    """See :class:`ScaleSchedule`"""

    ratio: float = hp.optional('Ratio to scale the schedule.', default=1.0)

    def initialize_object(self) -> "ScaleSchedule":
        """Construct a :class:`ScaleSchedule` algorithm instance from these hyperparameters."""
        return ScaleSchedule(**asdict(self))
@dataclass
class SelectiveBackpropHparams(AlgorithmHparams):
    """See :class:`SelectiveBackprop`"""

    start: float = hp.optional(doc="SB interval start, as fraction of training duration", default=0.5)
    end: float = hp.optional(doc="SB interval end, as fraction of training duration", default=0.9)
    keep: float = hp.optional(doc="fraction of minibatch to select and keep for gradient computation", default=0.5)
    scale_factor: float = hp.optional(doc="scale for downsampling input for selection forward pass", default=0.5)
    interrupt: int = hp.optional(doc="interrupt SB with a vanilla minibatch step every 'interrupt' batches", default=2)

    def initialize_object(self) -> SelectiveBackprop:
        """Construct a :class:`SelectiveBackprop` algorithm instance from these hyperparameters."""
        return SelectiveBackprop(**asdict(self))
@dataclass
class SeqLengthWarmupHparams(AlgorithmHparams):
    """See :class:`SeqLengthWarmup`"""

    duration: float = hp.optional("Fraction of total training time to apply sequential length warmup learning.",
                                  default=0.3)
    min_seq_length: int = hp.optional("Starting sequence length.", default=8)
    max_seq_length: int = hp.optional("End sequence length", default=1024)
    step_size: int = hp.optional("Sequence length step size", default=8)
    truncate: bool = hp.optional("Truncate tensors or reshape extra tokens to new examples.", default=True)

    def initialize_object(self) -> "SeqLengthWarmup":
        """Construct a :class:`SeqLengthWarmup` algorithm instance from these hyperparameters."""
        return SeqLengthWarmup(**asdict(self))
@dataclass
class StochasticDepthHparams(AlgorithmHparams):
    """See :class:`StochasticDepth`"""

    target_layer_name: str = hp.required(
        f'Reference name of layer to replace. "block" method can be {list(_STOCHASTIC_LAYER_MAPPING["block"].keys())}.'
        f' "sample" method can be {list(_STOCHASTIC_LAYER_MAPPING["sample"].keys())}.')
    stochastic_method: str = hp.optional('The version of stochastic depth to use. One of ["sample", "block"].',
                                         default='block')
    drop_rate: float = hp.optional('The probability of dropping a block or sample.', default=0.2)
    drop_distribution: str = hp.optional(
        '"Uniform" keeps the drop rate the same across blocks. "linear" linearly'
        ' increases the drop rate with block depth until it reaches `drop_rate`.',
        default='linear')
    use_same_gpu_seed: bool = hp.optional(
        'Whether or not to drop the same blocks across GPUs. Only used with "block" method.', default=True)
    drop_warmup: str = hp.optional(textwrap.dedent("""\
        Time string to represent the amount of training to warmup the `drop_rate`.
        Only use with "block" stochastic method."""),
                                   default="0dur")

    def initialize_object(self) -> StochasticDepth:
        """Construct a :class:`StochasticDepth` algorithm instance from these hyperparameters."""
        return StochasticDepth(**asdict(self))

    def validate(self):
        """Validate hyperparameter combinations beyond per-field checks.

        Delegates to ``_validate_stochastic_hparams`` to reject invalid
        layer-name/method/rate/distribution/warmup combinations.
        """
        super().validate()
        _validate_stochastic_hparams(target_layer_name=self.target_layer_name,
                                     stochastic_method=self.stochastic_method,
                                     drop_rate=self.drop_rate,
                                     drop_distribution=self.drop_distribution,
                                     drop_warmup=self.drop_warmup)
@dataclass
class SqueezeExciteHparams(AlgorithmHparams):
    """See :class:`SqueezeExcite`"""

    latent_channels: float = hp.optional(
        doc='Dimensionality of hidden layer within the added MLP.',
        default=64,
    )
    min_channels: int = hp.optional(
        doc='Minimum number of channels in a Conv2d layer'
        ' for a squeeze-excite block to be placed after it.',
        default=128,
    )

    def initialize_object(self) -> SqueezeExcite:
        """Construct a :class:`SqueezeExcite` algorithm instance from these hyperparameters."""
        return SqueezeExcite(**asdict(self))
@dataclass
class SWAHparams(AlgorithmHparams):
    """See :class:`~.composer.algorithms.swa.SWA`"""

    swa_start: str = hp.optional(
        doc='Time string denoting the amount of training '
        'completed before stochastic weight averaging begins. Currently only units of '
        'duration (e.g. "0.7dur") and epoch (e.g "50ep") are supported.',
        default="0.7dur",
    )
    swa_end: str = hp.optional(
        doc='Time string denoting amount of training completed before the baseline '
        '(non-averaged) model is replaced with the stochastic weight averaged model. '
        'Currently only units of duration (e.g. "0.97dur") and epoch (e.g "88ep") are supported.',
        default="0.97dur")
    update_interval: str = hp.optional(doc='Time string denoting how often the averaged model is updated. For example, '
                                       '"1ep" means the averaged model will be updated once per epoch, and '
                                       '"10ba" means the averaged model will be updated every 10 batches.',
                                       default="1ep")
    schedule_swa_lr: bool = hp.optional(doc='Flag to determine whether to apply an SWA-specific LR schedule during the '
                                        'period in which SWA is active.',
                                        default=False)
    anneal_strategy: str = hp.optional(doc='SWA learning rate annealing schedule strategy. '
                                       '"linear" for linear annealing, "cos" for cosine annealing.',
                                       default='linear')
    anneal_steps: int = hp.optional(
        doc='Number of SWA model updates over which to anneal SWA learning rate. Note '
        'that updates are determined by the ``update_interval`` argument.',
        default=10,
    )
    swa_lr: Optional[float] = hp.optional(
        doc='The final learning rate to anneal towards with this scheduler. '
        'Set to None for no annealing.',
        default=None,
    )

    def initialize_object(self) -> SWA:
        """Construct a :class:`SWA` algorithm instance from these hyperparameters."""
        return SWA(**asdict(self))