# Copyright 2022 MosaicML Composer authors
# SPDX-License-Identifier: Apache-2.0
"""Stateless learning rate schedulers.
Stateless schedulers solve some of the problems associated with PyTorch's built-in schedulers provided in
:mod:`torch.optim.lr_scheduler`. The primary design goal of the schedulers provided in this module is to allow
schedulers to interface directly with Composer's :mod:`~composer.core.time` abstraction. This means that schedulers can
be configured using arbitrary but explicit time units.
See :class:`~.ComposerScheduler` for more information on stateless schedulers.
"""
import inspect
import logging
import math
import textwrap
import warnings
from typing import TYPE_CHECKING, Union
from torch.optim.lr_scheduler import LambdaLR, LRScheduler
from composer.core import State, Time, TimeUnit
if TYPE_CHECKING:
from typing import Protocol
else:
# subclasses of Protocol cannot be instantiated in Python 3.8
Protocol = object
log = logging.getLogger(__name__)
__all__ = [
'ComposerScheduler',
'compile_composer_scheduler',
'StepScheduler',
'MultiStepScheduler',
'ConstantScheduler',
'LinearScheduler',
'ExponentialScheduler',
'CosineAnnealingScheduler',
'CosineAnnealingWarmRestartsScheduler',
'PolynomialScheduler',
'MultiStepWithWarmupScheduler',
'ConstantWithWarmupScheduler',
'LinearWithWarmupScheduler',
'CosineAnnealingWithWarmupScheduler',
'PolynomialWithWarmupScheduler',
]
class ComposerScheduler(Protocol):
r"""Specification for a stateless scheduler function.
While this specification is provided as a Python class, an ordinary function can implement this interface as long
as it matches the signature of this interface's :meth:`~.ComposerScheduler.__call__` method.
For example, a scheduler that halves the learning rate after 10 epochs could be written as:
.. code:: python
def ten_epoch_decay_scheduler(state: State) -> float:
if state.timestamp.epoch < 10:
return 1.0
return 0.5
# ten_epoch_decay_scheduler is a valid ComposerScheduler
trainer = Trainer(
schedulers=[ten_epoch_decay_scheduler],
...
)
In order to allow schedulers to be configured, schedulers may also be written as callable classes:
.. code:: python
class VariableEpochDecayScheduler(ComposerScheduler):
    def __init__(self, num_epochs: int):
        self.num_epochs = num_epochs
    def __call__(self, state: State) -> float:
        if state.timestamp.epoch < self.num_epochs:
            return 1.0
        return 0.5
ten_epoch_decay_scheduler = VariableEpochDecayScheduler(num_epochs=10)
# ten_epoch_decay_scheduler is also a valid ComposerScheduler
trainer = Trainer(
schedulers=[ten_epoch_decay_scheduler],
...
)
The constructions of ``ten_epoch_decay_scheduler`` in each of the examples above are equivalent. Note that neither
scheduler uses the ``scale_schedule_ratio`` parameter. As long as this parameter is not used when initializing
:class:`.Trainer`, it is not required that any schedulers implement that parameter.
.. automethod:: __call__
"""
def __call__(self, state: State, ssr: float = 1.0) -> float:
r"""Calculate the current learning rate multiplier :math:`\alpha`.
A scheduler function should be a pure function that returns a multiplier to apply to the optimizer's provided
learning rate, given the current trainer state, and optionally a "scale schedule ratio" (SSR). A typical
implementation will read ``state.timestamp``, and possibly other fields like ``state.max_duration``, to determine
the trainer's latest temporal progress.
.. note::
All instances of :class:`~.ComposerScheduler` output a `multiplier` for the learning rate, rather than the
learning rate directly. By convention, we use the symbol :math:`\alpha` to refer to this multiplier. This
means that the learning rate :math:`\eta` at time :math:`t` can be represented as
:math:`\eta(t) = \eta_i \times \alpha(t)`, where :math:`\eta_i` represents the learning rate used to
initialize the optimizer.
.. note::
It is possible to use multiple schedulers, in which case their effects will stack multiplicatively.
The ``ssr`` param indicates that the schedule should be "stretched" by that ratio. In symbolic terms, where
:math:`\alpha_\sigma(t)` represents the scheduler output at time :math:`t` using scale schedule ratio
:math:`\sigma`:
.. math::
\alpha_{\sigma}(t) = \alpha(t / \sigma)
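For example, a minimal sketch of an SSR-aware version of the ``ten_epoch_decay_scheduler`` shown above
(the 10-epoch threshold is illustrative):
.. code:: python
    def ten_epoch_decay_scheduler(state: State, ssr: float = 1.0) -> float:
        # With ssr=2.0, the decay point moves from epoch 10 to epoch 20.
        if state.timestamp.epoch < int(10 * ssr):
            return 1.0
        return 0.5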
Args:
state (State): The current Composer Trainer state.
ssr (float): The scale schedule ratio. In general, the learning rate computed by this
scheduler at time :math:`t` with an SSR of 1.0 should be the same as that computed by
this scheduler at time :math:`t \times s` with an SSR of :math:`s`. Default = ``1.0``.
Returns:
alpha (float): A multiplier to apply to the optimizer's provided learning rate.
"""
raise NotImplementedError
def _convert_time(time: Union[str, Time[int], Time[float]], state: State, ssr: float = 1.0) -> Time[int]:
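    """Convert ``time`` into a concrete, integer-valued :class:`Time`, scaled by ``ssr``.
    For example (hypothetical values): with ``max_duration='10ep'``, ``'0.5dur'`` resolves via
    ``state.dataloader_len`` to 5 epochs' worth of batches, and an ``ssr`` of 0.5 would halve
    that again to 2.5 epochs' worth.
    """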
if isinstance(time, str):
time = Time.from_timestring(time)
if time.unit == TimeUnit.SECOND:
raise ValueError('Wall clock time not an allowed time unit.')
assert state.max_duration is not None, 'max_duration should be set whenever schedulers are invoked'
if time.unit == TimeUnit.DURATION:
if state.max_duration.unit == TimeUnit.EPOCH:
if state.dataloader_len is None:
raise RuntimeError('Cannot convert time, as state.dataloader_len is None.')
return Time(int(time.value * int(state.dataloader_len) * state.max_duration.value), TimeUnit.BATCH)
return Time(int(time.value * state.max_duration.value), state.max_duration.unit)
elif time.unit == TimeUnit.EPOCH:
# Epochs do not provide sufficient granularity for SSR scaling
# e.g. if max_duration = 1ep, then any ssr < 1 would result in a new duration of 0.
# so, convert the time into batches
if state.dataloader_len is None:
raise RuntimeError('Cannot convert time, as state.dataloader_len is None.')
time = Time(value=time.value * int(state.dataloader_len), unit=TimeUnit.BATCH)
return Time(value=int(time.value * ssr), unit=time.unit)
def compile_composer_scheduler(scheduler: ComposerScheduler, state: State, ssr: float = 1.0) -> LRScheduler:
"""Converts a stateless scheduler into a PyTorch scheduler object.
While the resulting scheduler provides a ``.step()`` interface similar to other PyTorch schedulers, the scheduler is
also given a bound reference to the current :class:`~composer.core.State`. This means that any internal state updated
by ``.step()`` can be ignored, and the scheduler can instead simply use the bound state to recalculate the current
learning rate.
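For example, a minimal sketch (assuming a trainer ``state`` whose single optimizer is already set):
.. code:: python
    pytorch_scheduler = compile_composer_scheduler(CosineAnnealingScheduler(), state)
    # Each .step() recomputes the multiplier from the bound state, so the wrapper
    # carries no schedule state of its own.
    pytorch_scheduler.step()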
Args:
scheduler (ComposerScheduler): A stateless scheduler, provided as a :class:`~.ComposerScheduler` object.
state (State): The Composer Trainer's state.
Returns:
compiled_scheduler (LRScheduler): The scheduler, in a form compatible with PyTorch scheduler interfaces.
"""
optimizers = state.optimizers
if len(optimizers) != 1:
raise NotImplementedError('Providing functional schedulers is unsupported with multiple optimizers.')
optimizer = optimizers[0]
scheduler_sig = inspect.signature(scheduler)
def scheduler_fn(epoch: int) -> float:
del epoch # unused. Provided by the pytorch LambdaLR
# if the ssr is 1.0, don't pass it to the scheduler. This allows users to pass in lambdas that only take
# one parameter -- the state
if len(scheduler_sig.parameters) == 1:
if ssr == 1.0:
return scheduler(state)
else:
raise ValueError(
textwrap.dedent(
f"""\
Scheduler {scheduler} does not support `scale_schedule_ratio`.
To use `scale_schedule_ratio`, the scheduler must take two arguments (state, ssr)""",
),
)
return scheduler(state, ssr)
lambda_scheduler = LambdaLR(optimizer, scheduler_fn)
return lambda_scheduler
class StepScheduler(ComposerScheduler):
r"""Decays the learning rate discretely at fixed intervals.
.. seealso::
This scheduler is based on :class:`~torch.optim.lr_scheduler.StepLR` from PyTorch.
Decays the learning rate by a factor of ``gamma`` periodically, with a frequency determined by ``step_size``.
Specifically, the learning rate multiplier :math:`\alpha` can be expressed as:
.. math::
\alpha(t) = \gamma ^ {\text{floor}(t / \rho)}
Where :math:`\rho` represents the time between changes to the learning rate (the step size), and
:math:`\gamma` represents the multiplicative decay factor.
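For example, a minimal sketch that decays the learning rate by 10x every two epochs:
.. code:: python
    scheduler = StepScheduler(step_size='2ep', gamma=0.1)
    # alpha = 0.1 ** floor(t / 2ep): 1.0 for epochs 0-1, 0.1 for epochs 2-3, 0.01 for epochs 4-5, ...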
Args:
step_size (str | Time): Time between changes to the learning rate.
gamma (float): Multiplicative decay factor. Default = ``0.1``.
"""
def __init__(self, step_size: Union[str, Time], gamma: float = 0.1):
self.step_size = step_size
self.gamma = gamma
def __call__(self, state: State, ssr: float = 1.0):
step_size = _convert_time(self.step_size, state, ssr=ssr)
current_time = state.timestamp.get(step_size.unit)
steps = int(current_time / step_size)
return self.gamma**steps
class MultiStepScheduler(ComposerScheduler):
r"""Decays the learning rate discretely at fixed milestones.
.. seealso::
This scheduler is based on :class:`~torch.optim.lr_scheduler.MultiStepLR` from PyTorch.
Decays the learning rate by a factor of ``gamma`` whenever a time milestone in ``milestones`` is reached.
Specifically, the learning rate multiplier :math:`\alpha` can be expressed as:
.. math::
\alpha(t) = \gamma ^ x
Where :math:`x` represents the number of milestones that have been reached, and :math:`\gamma` represents the
multiplicative decay factor.
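For example, a minimal sketch that decays at two explicit milestones:
.. code:: python
    scheduler = MultiStepScheduler(milestones=['30ep', '60ep'], gamma=0.1)
    # alpha = 1.0 before epoch 30, 0.1 from epoch 30, and 0.01 from epoch 60 onward.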
Args:
milestones (list[str | Time]): Times at which the learning rate should change.
gamma (float): Multiplicative decay factor. Default = ``0.1``.
"""
def __init__(self, milestones: list[Union[str, Time]], gamma: float = 0.1):
self.milestones = milestones
self.gamma = gamma
def __call__(self, state: State, ssr: float = 1.0):
milestones = [_convert_time(milestone, state, ssr=ssr) for milestone in self.milestones]
factor = 1.0
for milestone in milestones:
if state.timestamp >= milestone:
factor *= self.gamma
return factor
class ConstantScheduler(ComposerScheduler):
r"""Maintains a fixed learning rate.
This scheduler is based on :class:`~torch.optim.lr_scheduler.ConstantLR` from PyTorch.
The default settings for this scheduler simply maintain a learning rate factor of 1 for the entire training
duration. However, both the factor and the duration of this scheduler can be configured.
Specifically, the learning rate multiplier :math:`\alpha` can be expressed as:
.. math::
\alpha(t) = \begin{cases} \alpha, & \text{if } t < t_{max} \\ 1.0 & \text{otherwise} \end{cases}
Where :math:`\alpha` represents the learning rate multiplier to maintain while this scheduler is active, and
:math:`t_{max}` represents the duration of this scheduler.
Args:
alpha (float): Learning rate multiplier to maintain while this scheduler is active. Default = ``1.0``.
t_max (str | Time): Duration of this scheduler. Default = ``"1dur"``.
"""
def __init__(self, alpha: float = 1.0, t_max: Union[str, Time] = '1dur') -> None:
self.alpha = alpha
self.t_max = t_max
def __call__(self, state: State, ssr: float = 1.0) -> float:
t_max = _convert_time(self.t_max, state, ssr=ssr)
if state.timestamp < t_max:
return self.alpha
return 1.0
class LinearScheduler(ComposerScheduler):
r"""Adjusts the learning rate linearly.
.. seealso::
This scheduler is based on :class:`~torch.optim.lr_scheduler.LinearLR` from PyTorch.
.. warning::
Note that the defaults for this scheduler differ from the defaults for
:class:`~torch.optim.lr_scheduler.LinearLR`. The PyTorch scheduler, by default, linearly increases the learning
rate multiplier from 1.0 / 3 to 1.0, whereas this implementation, by default, linearly decreases the multiplier
from 1.0 to 0.0.
Linearly adjusts the learning rate multiplier from ``alpha_i`` to ``alpha_f`` over ``t_{max}`` time.
Specifically, the learning rate multiplier :math:`\alpha` can be expressed as:
.. math::
\alpha(t) = \alpha_i + (\alpha_f - \alpha_i) \times \tau
Given :math:`\tau`, the fraction of time elapsed (clipped to the interval :math:`[0, 1]`), as:
.. math::
\tau = t / t_{max}
Where :math:`\alpha_i` represents the initial learning rate multiplier, :math:`\alpha_f` represents
the learning rate multiplier to decay to, and :math:`t_{max}` represents the duration of this scheduler.
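For example, a minimal sketch of linear decay to zero over the full training duration:
.. code:: python
    scheduler = LinearScheduler(alpha_i=1.0, alpha_f=0.0, t_max='1dur')
    # alpha falls linearly from 1.0 to 0.0; e.g. alpha = 0.75 once 25% of training has elapsed.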
Args:
alpha_i (float): Initial learning rate multiplier. Default = ``1.0``.
alpha_f (float): Final learning rate multiplier. Default = ``0.0``.
t_max (str | Time): The duration of this scheduler. Default = ``"1dur"``.
"""
def __init__(self, alpha_i: float = 1.0, alpha_f: float = 0.0, t_max: Union[str, Time] = '1dur'):
self.alpha_i = alpha_i
self.alpha_f = alpha_f
self.t_max = Time.from_timestring(t_max) if isinstance(t_max, str) else t_max
def __call__(self, state: State, ssr: float = 1.0):
t_max = _convert_time(self.t_max, state, ssr=ssr)
current_time = state.timestamp.get(t_max.unit)
frac_of_total = min(1.0, (current_time / t_max).value)
current_factor = self.alpha_i + frac_of_total * (self.alpha_f - self.alpha_i)
return current_factor
class ExponentialScheduler(ComposerScheduler):
r"""Decays the learning rate exponentially.
.. seealso::
This scheduler is based on :class:`~torch.optim.lr_scheduler.ExponentialLR` from PyTorch.
Exponentially decays the learning rate such that it decays by a factor of ``gamma`` every ``decay_period`` time.
Specifically, the learning rate multiplier :math:`\alpha` can be expressed as:
.. math::
\alpha(t) = \gamma ^ {t / \rho}
Where :math:`\rho` represents the decay period, and :math:`\gamma` represents the multiplicative decay factor.
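For example, a minimal sketch that halves the learning rate every epoch:
.. code:: python
    scheduler = ExponentialScheduler(gamma=0.5, decay_period='1ep')
    # alpha = 0.5 ** (t / 1ep): 1.0 at the start, ~0.71 after half an epoch, 0.5 after one epoch.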
Args:
decay_period (str | Time): Decay period. Default = ``"1ep"``.
gamma (float): Multiplicative decay factor.
"""
def __init__(self, gamma: float, decay_period: Union[str, Time] = '1ep'):
self.gamma = gamma
self.decay_period = decay_period
def __call__(self, state: State, ssr: float = 1.0):
decay_period = _convert_time(self.decay_period, state, ssr)
current_time_in_decay_units = state.timestamp.get(decay_period.unit)
return self.gamma**float(current_time_in_decay_units / decay_period)
def _cosine_anneal(x: float, min_y: float = 0.0, max_y: float = 1.0) -> float:
"""Implements a cosine decay curve.
Curve is cos(x) on domain [0, pi], stretched to the domain [0, 1] and range [min_y, max_y]. Additionally, param x is
clipped to the interval [0, 1].
"""
x = min(max(x, 0.0), 1.0)
return min_y + (max_y - min_y) * (1 + math.cos(x * math.pi)) / 2
class CosineAnnealingScheduler(ComposerScheduler):
r"""Decays the learning rate according to the decreasing part of a cosine curve.
.. seealso::
This scheduler is based on :class:`~torch.optim.lr_scheduler.CosineAnnealingLR` from PyTorch.
Specifically, the learning rate multiplier :math:`\alpha` can be expressed as:
.. math::
\alpha(t) = \alpha_f + (1 - \alpha_f) \times \frac{1}{2} (1 + \cos(\pi \times \tau))
Given :math:`\tau`, the fraction of time elapsed (clipped to the interval :math:`[0, 1]`), as:
.. math::
\tau = t / t_{max}
Where :math:`t_{max}`
represents the duration of this scheduler, and :math:`\alpha_f` represents the learning rate multiplier to decay to.
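For example, a minimal sketch of a full-duration cosine decay to zero:
.. code:: python
    scheduler = CosineAnnealingScheduler(t_max='1dur', alpha_f=0.0)
    # alpha = 0.5 * (1 + cos(pi * tau)): 1.0 at the start, 0.5 at the halfway point, 0.0 at the end.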
Args:
t_max (str | Time): The duration of this scheduler. Default = ``"1dur"``.
alpha_f (float): Learning rate multiplier to decay to. Default = ``0.0``.
"""
def __init__(self, t_max: Union[str, Time] = '1dur', alpha_f: float = 0.0):
self.t_max = t_max
self.alpha_f = alpha_f
def __call__(self, state: State, ssr: float = 1.0):
t_max = _convert_time(self.t_max, state, ssr=ssr)
current_time = state.timestamp.get(t_max.unit)
frac_of_total = (current_time / t_max).value
return _cosine_anneal(x=frac_of_total, min_y=self.alpha_f)
class CosineAnnealingWarmRestartsScheduler(ComposerScheduler):
r"""Cyclically decays the learning rate according to the decreasing part of a cosine curve.
.. seealso::
This scheduler is based on :class:`~torch.optim.lr_scheduler.CosineAnnealingWarmRestarts` from PyTorch.
This scheduler resembles a regular cosine annealing curve, as seen in :class:`~.CosineAnnealingScheduler`, except
that after the curve first completes ``t_0`` time, the curve resets to the start. The durations of subsequent cycles
are each multiplied by ``t_mult``.
Specifically, the learning rate multiplier :math:`\alpha` can be expressed as:
.. math::
\alpha(t) = \alpha_f + (1 - \alpha_f) \times \frac{1}{2}(1 + \cos(\pi \times \tau_i))
Given :math:`\tau_i`, the fraction of time elapsed through the :math:`i^\text{th}` cycle, as:
.. math::
\tau_i = (t - \sum_{j=0}^{i-1} t_0 t_{mult}^j) / (t_0 t_{mult}^i)
Where :math:`t_0`
represents the period of the first cycle, :math:`t_{mult}` represents the multiplier for the duration of successive
cycles, and :math:`\alpha_f` represents the learning rate multiplier to decay to.
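For example, a minimal sketch with doubling cycle lengths:
.. code:: python
    scheduler = CosineAnnealingWarmRestartsScheduler(t_0='10ep', t_mult=2.0)
    # Cosine cycles of 10, 20, then 40 epochs; alpha resets to 1.0 at epochs 10, 30, and 70.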
Args:
t_0 (str | Time): The period of the first cycle.
t_mult (float): The multiplier for the duration of successive cycles. Default = ``1.0``.
alpha_f (float): Learning rate multiplier to decay to. Default = ``0.0``.
"""
def __init__(self, t_0: Union[str, Time], t_mult: float = 1.0, alpha_f: float = 0.0):
self.t_0 = t_0
self.t_mult = t_mult
self.alpha_f = alpha_f
def __call__(self, state: State, ssr: float = 1.0):
t_0 = _convert_time(self.t_0, state, ssr=ssr)
current_interval_len = t_0
current_interval_end = t_0
while current_interval_end <= state.timestamp.get(current_interval_end.unit):
if current_interval_len.value == 0:
raise ValueError(
'Interval between restarts for cosine annealing/warm restarts scheduler has decayed to 0.',
)
current_interval_len = Time(
value=int(self.t_mult * current_interval_len.value),
unit=current_interval_len.unit,
)
current_interval_end += current_interval_len
current_interval_start = current_interval_end - current_interval_len
frac_of_current_interval = ((state.timestamp.get(t_0.unit) - current_interval_start) /
current_interval_len).value
return _cosine_anneal(x=frac_of_current_interval, min_y=self.alpha_f)
class PolynomialScheduler(ComposerScheduler):
r"""Sets the learning rate to be proportional to a power of the fraction of training time left.
Specifically, the learning rate multiplier :math:`\alpha` can be expressed as:
.. math::
\alpha(t) = \alpha_f + (1 - \alpha_f) \times (1 - \tau) ^ {\kappa}
Given :math:`\tau`, the fraction of time elapsed (clipped to the interval :math:`[0, 1]`), as:
.. math::
\tau = t / t_{max}
Where :math:`\kappa`
represents the exponent to be used for the proportionality relationship, :math:`t_{max}` represents the duration of
this scheduler, and :math:`\alpha_f` represents the learning rate multiplier to decay to.
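For example, a minimal sketch of quadratic decay over the full training duration:
.. code:: python
    scheduler = PolynomialScheduler(power=2.0, t_max='1dur', alpha_f=0.0)
    # alpha = (1 - tau) ** 2: 1.0 at the start, 0.25 at the halfway point, 0.0 at the end.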
Args:
power (float): The exponent to be used for the proportionality relationship.
t_max (str | Time): The duration of this scheduler. Default = ``"1dur"``.
alpha_f (float): Learning rate multiplier to decay to. Default = ``0.0``.
"""
def __init__(self, power: float, t_max: Union[str, Time] = '1dur', alpha_f: float = 0.0):
self.t_max = t_max
self.power = power
self.alpha_f = alpha_f
def __call__(self, state: State, ssr: float = 1.0):
t_max = _convert_time(self.t_max, state, ssr=ssr)
current_time = state.timestamp.get(t_max.unit)
frac_of_total = (current_time / t_max).value
frac_of_total = min(1.0, frac_of_total)
coeff = (1 - frac_of_total)**self.power
current_factor = self.alpha_f + coeff * (1.0 - self.alpha_f)
return current_factor
def _raise_if_max_duration_exceeds_t_max(t_max: Union[str, Time], state: State):
assert state.max_duration is not None, 'max_duration should be set whenever schedulers are invoked'
max_dur = state.max_duration
if isinstance(t_max, str):
t_max = Time.from_timestring(t_max)
if isinstance(max_dur, str):
max_dur = Time.from_timestring(max_dur)
max_dur_exceeds_t_max = False
if t_max.unit == max_dur.unit:
if t_max.value >= max_dur.value:
# Time units are comparable, and t_max is valid.
return
else:
max_dur_exceeds_t_max = True
elif (t_max.unit == TimeUnit.BATCH and max_dur.unit == TimeUnit.EPOCH and state.dataloader_len is not None):
if t_max.value >= max_dur.value * int(state.dataloader_len):
# Batches are comparable to epochs through the dataloader length, and t_max is valid.
return
else:
max_dur_exceeds_t_max = True
elif (t_max.unit == TimeUnit.EPOCH and max_dur.unit == TimeUnit.BATCH and state.dataloader_len is not None):
if t_max.value * int(state.dataloader_len) >= max_dur.value:
# Batches are comparable to epochs through the dataloader length, and t_max is valid.
return
else:
max_dur_exceeds_t_max = True
if max_dur_exceeds_t_max:
# None of the checks above passed. Time units are comparable, but t_max is invalid since it's less than max_dur.
raise ValueError(
f't_max {t_max} must be greater than or equal to max_duration {max_dur}. Otherwise, the LR schedule will '
'not be defined for the entire training duration.',
)
if t_max.unit != max_dur.unit:
# Units are not comparable, so we cannot check if t_max is valid. Log this and return.
log.debug(
f'Since max_duration {max_dur} with units {max_dur.unit} and t_max {t_max} with units {t_max.unit} are not '
'comparable, make sure that your LR schedule is defined at all points in the training duration.',
)
def _raise_if_warmup_and_max_incompatible(t_warmup: Time[int], t_max: Time[int]):
"""Checks that t_warmup and t_max have the same units.
_convert_time should be called on both `t_warmup` and `t_max` before this function is called. As a result, t_warmup and t_max will not
be TimeUnit.EPOCH.
"""
assert t_warmup.unit != TimeUnit.EPOCH and t_max.unit != TimeUnit.EPOCH, 't_warmup and t_max cannot be in units of EPOCH'
if isinstance(t_warmup, str):
t_warmup = Time.from_timestring(t_warmup)
if isinstance(t_max, str):
t_max = Time.from_timestring(t_max)
units_same = t_warmup.unit == t_max.unit
if not units_same:
raise ValueError(
f'Cannot use warmup scheduler with t_max {t_max} with units {t_max.unit} and t_warmup {t_warmup} with '
f'units {t_warmup.unit}. t_warmup and t_max must use the same units.',
)
class MultiStepWithWarmupScheduler(ComposerScheduler):
r"""Decays the learning rate discretely at fixed milestones, with an initial warmup.
.. seealso::
This scheduler is based on :class:`~.MultiStepScheduler`, with an added warmup.
Starts with a linear warmup over ``t_warmup`` time, then decays the learning rate by a factor of ``gamma``
whenever a time milestone in ``milestones`` is reached.
Specifically, the learning rate multiplier :math:`\alpha` can be expressed as:
.. math::
\alpha(t) = \begin{cases}
t / t_{warmup}, & \text{if } t < t_{warmup} \\
\gamma ^ x & \text{otherwise}
\end{cases}
Where :math:`t_{warmup}` represents the warmup time, :math:`x` represents the amount of milestones that have been
reached, and :math:`\gamma` represents the multiplicative decay factor.
.. warning::
All milestones should be greater than ``t_warmup``; otherwise, they will have no effect on the computed learning
rate multiplier until the warmup has completed.
.. warning::
By default, initial warmup time is **not** scaled according to any provided scale schedule ratio.
To change this behavior, set ``scale_warmup=True``.
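For example, a minimal sketch that warms up for one epoch and then decays at two milestones:
.. code:: python
    scheduler = MultiStepWithWarmupScheduler(t_warmup='1ep', milestones=['30ep', '60ep'], gamma=0.1)
    # alpha ramps linearly from 0.0 to 1.0 over epoch 0, then steps down to 0.1 at
    # epoch 30 and to 0.01 at epoch 60.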
Args:
t_warmup (str | Time): Warmup time.
milestones (list[str | Time]): Times at which the learning rate should change.
gamma (float): Multiplicative decay factor. Default = ``0.1``.
scale_warmup (bool): If ``True``, the SSR also scales the warmup period. Default = ``False``.
"""
def __init__(
self,
t_warmup: Union[str, Time],
milestones: list[Union[str, Time]],
gamma: float = 0.1,
scale_warmup: bool = False,
):
self.t_warmup = t_warmup
self.milestones = milestones
self.gamma = gamma
self.scale_warmup = scale_warmup
self.warmup_scheduler = LinearScheduler(alpha_i=0.0, alpha_f=1.0, t_max=t_warmup)
self.step_scheduler = MultiStepScheduler(milestones=milestones, gamma=gamma)
def __call__(self, state: State, ssr: float = 1.0):
assert state.max_duration is not None, 'max_duration should be set whenever schedulers are invoked'
t_warmup = _convert_time(self.t_warmup, state)
if t_warmup.value == 0:
warnings.warn(
textwrap.dedent(
"""\
The warmup duration is 0. If you specified warmup as a fraction of total
training duration, take note that the warmup duration is calculated in the
same unit as the trainer's max_duration parameter.""",
),
)
if state.timestamp < t_warmup:
if self.scale_warmup:
return self.warmup_scheduler(state, ssr)
return self.warmup_scheduler(state)
return self.step_scheduler(state, ssr)
class ConstantWithWarmupScheduler(ComposerScheduler):
r"""Maintains a fixed learning rate, with an initial warmup.
This scheduler is based on :class:`~torch.optim.lr_scheduler.ConstantLR` from PyTorch, with an added warmup.
Starts with a linear warmup over ``t_warmup`` time, then maintains a fixed learning rate factor for the remaining
training duration. Both the factor and the duration of this scheduler can be configured.
Specifically, the learning rate multiplier :math:`\alpha` can be expressed as:
.. math::
\alpha(t) = \begin{cases}
t / t_{warmup}, & \text{if } t < t_{warmup} \\
\alpha, & \text{if } t < t_{max} \\
1.0 & \text{otherwise} \end{cases}
Where :math:`\alpha` represents the learning rate multiplier to maintain while this scheduler is active, and
:math:`t_{max}` represents the duration of this scheduler.
.. warning::
By default, initial warmup time is **not** scaled according to any provided scale schedule ratio.
To change this behavior, set ``scale_warmup=True``.
Args:
t_warmup (str | Time): Warmup time.
alpha (float): Learning rate multiplier to maintain while this scheduler is active. Default = ``1.0``.
t_max (str | Time): Duration of this scheduler. Default = ``"1dur"``.
scale_warmup (bool): If ``True``, the SSR also scales the warmup period. Default = ``False``.
"""
def __init__(
self,
t_warmup: Union[str, Time],
alpha: float = 1.0,
t_max: Union[str, Time] = '1dur',
scale_warmup: bool = False,
) -> None:
self.t_warmup = t_warmup
self.alpha = alpha
self.t_max = t_max
self.scale_warmup = scale_warmup
self.scheduler = LinearWithWarmupScheduler(
t_warmup=t_warmup,
alpha_i=alpha,
alpha_f=alpha,
t_max=t_max,
scale_warmup=scale_warmup,
)
def __call__(self, state: State, ssr: float = 1.0) -> float:
return self.scheduler(state, ssr)
class LinearWithWarmupScheduler(ComposerScheduler):
r"""Adjusts the learning rate linearly, with an initial warmup.
.. seealso::
This scheduler is based on :class:`~.LinearScheduler`, with an added warmup.
Linearly adjusts the learning rate multiplier from ``alpha_i`` to ``alpha_f`` over ``t_{max}`` time.
Specifically, the learning rate multiplier :math:`\alpha` can be expressed as:
.. math::
\alpha(t) = \begin{cases}
t / t_{warmup}, & \text{if } t < t_{warmup} \\
\alpha_i + (\alpha_f - \alpha_i) \times \tau_w & \text{otherwise}
\end{cases}
Given :math:`\tau_w`, the fraction of post-warmup time elapsed (clipped to the interval :math:`[0, 1]`), as:
.. math::
\tau_w = (t - t_{warmup}) / t_{max}
Where :math:`t_{warmup}` represents the warmup time, :math:`\alpha_i` represents the initial learning rate multiplier,
:math:`\alpha_f` represents the learning rate multiplier to decay to, and :math:`t_{max}` represents the duration
of this scheduler.
.. warning::
By default, the initial warmup time is **not** scaled according to any provided scale schedule ratio! However, the duration of
the scheduler is still scaled accordingly. As a result, the scheduler's post-warmup "slope" will be
slightly distorted from what would otherwise be expected. To scale the entire schedule, set ``scale_warmup=True``.
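For example, a minimal sketch that warms up over 100 batches and then decays linearly to zero:
.. code:: python
    scheduler = LinearWithWarmupScheduler(t_warmup='100ba', alpha_i=1.0, alpha_f=0.0)
    # alpha ramps from 0.0 to 1.0 over the first 100 batches, then falls linearly to 0.0
    # over the remainder of training.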
Args:
t_warmup (str | Time): Warmup time.
alpha_i (float): Initial learning rate multiplier. Default = ``1.0``.
alpha_f (float): Final learning rate multiplier. Default = ``0.0``.
t_max (str | Time): The duration of this scheduler. Default = ``"1dur"``.
scale_warmup (bool): If ``True``, the SSR also scales the warmup period. Default = ``False``.
"""
def __init__(
self,
t_warmup: Union[str, Time],
alpha_i: float = 1.0,
alpha_f: float = 0.0,
t_max: Union[str, Time] = '1dur',
scale_warmup: bool = False,
):
self.t_warmup = t_warmup
self.alpha_i = alpha_i
self.alpha_f = alpha_f
self.t_max = t_max
self.scale_warmup = scale_warmup
self.warmup_scheduler = LinearScheduler(alpha_i=0.0, alpha_f=alpha_i, t_max=t_warmup)
def __call__(self, state: State, ssr: float = 1.0):
assert state.max_duration is not None, 'max_duration should be set whenever schedulers are invoked'
t_warmup = _convert_time(self.t_warmup, state)
t_max = _convert_time(self.t_max, state, ssr=ssr)
_raise_if_warmup_and_max_incompatible(t_warmup, t_max)
_raise_if_max_duration_exceeds_t_max(t_max, state)
if t_warmup.value == 0:
warnings.warn(
textwrap.dedent(
"""\
The warmup duration is 0. If you specified warmup as a fraction of total
training duration, take note that the warmup duration is calculated in the
same unit as the trainer's max_duration parameter.""",
),
)
if state.timestamp < t_warmup:
if self.scale_warmup:
return self.warmup_scheduler(state, ssr)
return self.warmup_scheduler(state)
current_time = state.timestamp.get(t_warmup.unit)
frac_of_total = ((current_time - t_warmup) / (t_max - t_warmup)).value if (t_max > t_warmup) else 0.0
frac_of_total = min(1.0, frac_of_total)
current_factor = self.alpha_i + frac_of_total * (self.alpha_f - self.alpha_i)
return current_factor
class CosineAnnealingWithWarmupScheduler(ComposerScheduler):
r"""Decays the learning rate according to the decreasing part of a cosine curve, with an initial warmup.
.. seealso::
This scheduler is based on :class:`~.CosineAnnealingScheduler`, with an added warmup.
Specifically, the learning rate multiplier :math:`\alpha` can be expressed as:
.. math::
\alpha(t) = \begin{cases}
t / t_{warmup}, & \text{if } t < t_{warmup} \\
\alpha_f + (1 - \alpha_f) \times \frac{1}{2} (1 + \cos(\pi \times \tau_w)) & \text{otherwise}
\end{cases}
Given :math:`\tau_w`, the fraction of post-warmup time elapsed (clipped to the interval :math:`[0, 1]`), as:
.. math::
\tau_w = (t - t_{warmup}) / t_{max}
Where :math:`t_{warmup}` represents the warmup time, :math:`t_{max}` represents the duration of this scheduler, and
:math:`\alpha_f` represents the learning rate multiplier to decay to.
.. warning::
By default, initial warmup time is **not** scaled according to any provided scale schedule ratio.
To change this behavior, set ``scale_warmup=True``.
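For example, a minimal sketch of the common warmup-then-cosine-decay recipe:
.. code:: python
    scheduler = CosineAnnealingWithWarmupScheduler(t_warmup='100ba', alpha_f=0.0)
    # alpha ramps from 0.0 to 1.0 over the first 100 batches, then follows a cosine curve
    # from 1.0 down to 0.0 over the rest of training.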
Args:
t_warmup (str | Time): Warmup time.
t_max (str | Time): The duration of this scheduler. Default = ``"1dur"``.
alpha_f (float): Learning rate multiplier to decay to. Default = ``0.0``.
scale_warmup (bool): If ``True``, the SSR also scales the warmup period. Default = ``False``.
"""
def __init__(
self,
t_warmup: Union[str, Time],
t_max: Union[str, Time] = '1dur',
alpha_f: float = 0.0,
scale_warmup: bool = False,
):
self.t_warmup = t_warmup
self.t_max = t_max
self.alpha_f = alpha_f
self.scale_warmup = scale_warmup
self.warmup_scheduler = LinearScheduler(alpha_i=0.0, alpha_f=1.0, t_max=t_warmup)
def __call__(self, state: State, ssr: float = 1.0):
assert state.max_duration is not None, 'max_duration should be set whenever schedulers are invoked'
t_warmup = _convert_time(self.t_warmup, state)
t_max = _convert_time(self.t_max, state, ssr=ssr)
_raise_if_warmup_and_max_incompatible(t_warmup, t_max)
_raise_if_max_duration_exceeds_t_max(t_max, state)
if t_warmup.value == 0:
warnings.warn(
textwrap.dedent(
"""\
The warmup duration is 0. If you specified warmup as a fraction of total
training duration, take note that the warmup duration is calculated in the
same unit as the trainer's max_duration parameter.""",
),
)
if state.timestamp < t_warmup:
if self.scale_warmup:
return self.warmup_scheduler(state, ssr)
return self.warmup_scheduler(state)
current_time = state.timestamp.get(t_warmup.unit)
frac_of_total = ((current_time - t_warmup) / (t_max - t_warmup)).value if (t_max > t_warmup) else 0.0
frac_of_total = min(1.0, frac_of_total)
return _cosine_anneal(x=frac_of_total, min_y=self.alpha_f)
class PolynomialWithWarmupScheduler(ComposerScheduler):
r"""Decays the learning rate according to a power of the fraction of training time left, with an initial warmup.
.. seealso::
This scheduler is based on :class:`~.PolynomialScheduler`, with an added warmup.
Specifically, the learning rate multiplier :math:`\alpha` can be expressed as:
.. math::
\alpha(t) = \begin{cases}
t / t_{warmup}, & \text{if } t < t_{warmup} \\
\alpha_f + (1 - \alpha_f) \times (1 - \tau_w) ^ {\kappa} & \text{otherwise}
\end{cases}
Given :math:`\tau_w`, the fraction of post-warmup time elapsed (clipped to the interval :math:`[0, 1]`), as:
.. math::
\tau_w = (t - t_{warmup}) / t_{max}
Where :math:`\kappa` represents the exponent to be used for the proportionality relationship,
:math:`t_{warmup}` represents the warmup time, :math:`t_{max}` represents the duration of this scheduler, and
:math:`\alpha_f` represents the learning rate multiplier to decay to.
.. warning::
By default, initial warmup time is **not** scaled according to any provided scale schedule ratio.
To change this behavior, set ``scale_warmup=True``.
Args:
t_warmup (str | Time): Warmup time.
power (float): The exponent to be used for the proportionality relationship. Default = ``2.0``.
t_max (str | Time): The duration of this scheduler. Default = ``"1dur"``.
alpha_f (float): Learning rate multiplier to decay to. Default = ``0.0``.
scale_warmup (bool): If ``True``, the SSR also scales the warmup period. Default = ``False``.
"""
def __init__(
self,
t_warmup: Union[str, Time],
power: float = 2.0,
t_max: Union[str, Time] = '1dur',
alpha_f: float = 0.0,
scale_warmup: bool = False,
):
self.t_warmup = t_warmup
self.power = power
self.t_max = t_max
self.alpha_f = alpha_f
self.scale_warmup = scale_warmup
self.warmup_scheduler = LinearScheduler(alpha_i=0.0, alpha_f=1.0, t_max=t_warmup)
def __call__(self, state: State, ssr: float = 1.0):
assert state.max_duration is not None, 'max_duration should be set whenever schedulers are invoked'
t_warmup = _convert_time(self.t_warmup, state)
t_max = _convert_time(self.t_max, state, ssr=ssr)
_raise_if_warmup_and_max_incompatible(t_warmup, t_max)
_raise_if_max_duration_exceeds_t_max(t_max, state)
if t_warmup.value == 0:
warnings.warn(
textwrap.dedent(
"""\
The warmup duration is 0. If you specified warmup as a fraction of total
training duration, take note that the warmup duration is calculated in the
same unit as the trainer's max_duration parameter.""",
),
)
if state.timestamp < t_warmup:
if self.scale_warmup:
return self.warmup_scheduler(state, ssr)
return self.warmup_scheduler(state)
current_time = state.timestamp.get(t_warmup.unit)
frac_of_total = ((current_time - t_warmup) / (t_max - t_warmup)).value if (t_max > t_warmup) else 0.0
frac_of_total = min(1.0, frac_of_total)
coeff = (1 - frac_of_total)**self.power
current_factor = self.alpha_f + coeff * (1.0 - self.alpha_f)
return current_factor