Source code for composer.algorithms.mixup.mixup

# Copyright 2021 MosaicML. All Rights Reserved.

"""Core MixUp classes and functions."""

from __future__ import annotations

import logging
from typing import Optional, Tuple

import numpy as np
import torch

from composer.core import Algorithm, Event, State
from composer.loggers import Logger
from composer.loss.utils import ensure_targets_one_hot

log = logging.getLogger(__name__)

__all__ = ["MixUp", "mixup_batch"]


def mixup_batch(input: torch.Tensor,
                target: torch.Tensor,
                mixing: Optional[float] = None,
                alpha: float = 0.2,
                indices: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, torch.Tensor, float]:
    """Create new samples using convex combinations of pairs of samples.

    This is done by taking a convex combination of ``input`` with a randomly
    permuted copy of ``input``. The permutation takes place along the sample
    axis (dim 0).

    The relative weight of the original ``input`` versus the permuted copy is
    defined by the ``mixing`` parameter. This parameter should be chosen
    from a ``Beta(alpha, alpha)`` distribution for some parameter ``alpha > 0``.
    Note that the same ``mixing`` is used for the whole batch.

    Args:
        input (torch.Tensor): input tensor of shape ``(minibatch, ...)``, where
            ``...`` indicates zero or more dimensions.
        target (torch.Tensor): target tensor of shape ``(minibatch, ...)``, where
            ``...`` indicates zero or more dimensions.
        mixing (float, optional): coefficient used to interpolate between the two
            examples. If provided, must be in :math:`[0, 1]`. If ``None``, value is
            drawn from a ``Beta(alpha, alpha)`` distribution. Default: ``None``.
        alpha (float, optional): parameter for the Beta distribution over
            ``mixing``. Ignored if ``mixing`` is provided. Default: ``0.2``.
        indices (torch.Tensor, optional): Permutation of the samples to use.
            Default: ``None``.

    Returns:
        input_mixed (torch.Tensor): batch of inputs after mixup has been applied
        target_perm (torch.Tensor): the labels of the mixed-in examples
        mixing (float): the amount of mixing used

    Example:
        .. testcode::

            import torch
            from composer.functional import mixup_batch

            N, C, H, W = 2, 3, 4, 5
            num_classes = 10
            X = torch.randn(N, C, H, W)
            y = torch.randint(num_classes, size=(N,))
            X_mixed, y_perm, mixing = mixup_batch(X, y, alpha=0.2)
    """
    if mixing is None:
        mixing = _gen_mixing_coef(alpha)

    # Create permuted versions of x and y in preparation for interpolation.
    # Use the given indices if there are any.
    if indices is None:
        permuted_idx = _gen_indices(input.shape[0])
    else:
        permuted_idx = indices
    x_permuted = input[permuted_idx]
    permuted_target = target[permuted_idx]

    # Interpolate between the inputs
    x_mixup = (1 - mixing) * input + mixing * x_permuted

    return x_mixup, permuted_target, mixing
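
# --- Editor's note: illustrative sketch, not part of the original module. ---
# ``mixup_batch`` returns the *permuted* labels rather than interpolated ones,
# so a caller that wants soft targets (e.g. for a soft cross-entropy loss) must
# interpolate the one-hot labels itself, as MixUp does at BEFORE_LOSS. The
# helper below shows one way to do that; the name
# ``interpolate_targets_example`` and the explicit ``num_classes`` argument are
# assumptions for illustration only.
def interpolate_targets_example(target: torch.Tensor, permuted_target: torch.Tensor, mixing: float,
                                num_classes: int) -> torch.Tensor:
    """Builds soft labels ``(1 - mixing) * y + mixing * y_perm`` from class indices."""
    one_hot = torch.nn.functional.one_hot(target, num_classes).float()
    one_hot_perm = torch.nn.functional.one_hot(permuted_target, num_classes).float()
    return (1 - mixing) * one_hot + mixing * one_hot_perm
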
class MixUp(Algorithm):
    """`MixUp <https://arxiv.org/abs/1710.09412>`_ trains the network on convex combinations of pairs of
    examples and targets rather than individual examples and targets.

    This is done by taking a convex combination of a given batch X with a
    randomly permuted copy of X. The mixing coefficient is drawn from a
    ``Beta(alpha, alpha)`` distribution.

    Training in this fashion sometimes reduces generalization error.

    Args:
        alpha (float, optional): the pseudocount for the Beta distribution used to sample
            mixing parameters. As ``alpha`` grows, the two samples in each pair tend to be
            weighted more equally. As ``alpha`` approaches 0 from above, the combination
            approaches only using one element of the pair. Default: ``0.2``.
        interpolate_loss (bool, optional): Interpolates the loss rather than the labels.
            A useful trick when using a cross entropy loss. Will produce incorrect behavior
            if the loss is not a linear function of the targets. Default: ``False``.

    Example:
        .. testcode::

            from composer.algorithms import MixUp

            algorithm = MixUp(alpha=0.2)
            trainer = Trainer(
                model=model,
                train_dataloader=train_dataloader,
                eval_dataloader=eval_dataloader,
                max_duration="1ep",
                algorithms=[algorithm],
                optimizers=[optimizer]
            )
    """

    def __init__(self, alpha: float = 0.2, interpolate_loss: bool = False):
        self.alpha = alpha
        self.interpolate_loss = interpolate_loss
        self.mixing = 0.0
        self.indices = torch.Tensor()
        self.permuted_target = torch.Tensor()

    def match(self, event: Event, state: State) -> bool:
        if self.interpolate_loss:
            return event in [Event.BEFORE_FORWARD, Event.BEFORE_BACKWARD]
        else:
            return event in [Event.BEFORE_FORWARD, Event.BEFORE_LOSS]

    def apply(self, event: Event, state: State, logger: Logger) -> None:
        input, target = state.batch_pair

        if event == Event.BEFORE_FORWARD:
            if not isinstance(input, torch.Tensor):
                raise NotImplementedError("Multiple tensors for inputs not supported yet.")
            if not isinstance(target, torch.Tensor):
                raise NotImplementedError("Multiple tensors for targets not supported yet.")

            self.mixing = _gen_mixing_coef(self.alpha)
            self.indices = _gen_indices(input.shape[0])

            new_input, self.permuted_target, _ = mixup_batch(
                input,
                target,
                mixing=self.mixing,
                indices=self.indices,
            )

            state.batch = (new_input, target)

        if not self.interpolate_loss and event == Event.BEFORE_LOSS:
            # Interpolate the targets
            if not isinstance(state.outputs, torch.Tensor):
                raise NotImplementedError("Multiple output tensors not supported yet")
            if not isinstance(target, torch.Tensor):
                raise NotImplementedError("Multiple target tensors not supported yet")
            # Make sure that the targets are dense/one-hot
            target = ensure_targets_one_hot(state.outputs, target)
            permuted_target = ensure_targets_one_hot(state.outputs, self.permuted_target)
            # Interpolate to get the new target
            mixed_up_target = (1 - self.mixing) * target + self.mixing * permuted_target
            # Create the new batch
            state.batch = (input, mixed_up_target)

        if self.interpolate_loss and event == Event.BEFORE_BACKWARD:
            # Grab the loss function
            if hasattr(state.model, "loss"):
                loss_fn = state.model.loss
            elif hasattr(state.model, "module") and hasattr(state.model.module, "loss"):
                if isinstance(state.model.module, torch.nn.Module):
                    loss_fn = state.model.module.loss
                else:
                    raise TypeError("state.model.module must be a torch module")
            else:
                raise AttributeError("Loss must be accessible via model.loss or model.module.loss")

            # Verify that the loss is callable
            if not callable(loss_fn):
                raise TypeError("Loss must be callable")

            # Interpolate the loss
            new_loss = loss_fn(state.outputs, (input, self.permuted_target))
            if not isinstance(state.loss, torch.Tensor):
                raise NotImplementedError("Multiple losses not supported yet")
            if not isinstance(new_loss, torch.Tensor):
                raise NotImplementedError("Multiple losses not supported yet")
            state.loss = (1 - self.mixing) * state.loss + self.mixing * new_loss
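
# --- Editor's note: illustrative sketch, not part of the original module. ---
# ``interpolate_loss=True`` relies on the loss being linear in the targets: for
# such a loss, mixing the two losses is equivalent to mixing the labels. The
# check below demonstrates this for a soft cross entropy written as
# ``-(y * log_softmax(out)).sum(dim=1).mean()``; the function name and the
# small random tensors are assumptions for illustration only.
def _check_loss_interpolation_example(mixing: float = 0.3) -> None:
    torch.manual_seed(0)
    logits = torch.randn(4, 10)
    y = torch.nn.functional.one_hot(torch.randint(10, (4,)), 10).float()
    y_perm = y[torch.randperm(4)]

    def soft_xent(out: torch.Tensor, tgt: torch.Tensor) -> torch.Tensor:
        return -(tgt * torch.log_softmax(out, dim=1)).sum(dim=1).mean()

    # Loss of the mixed labels vs. the mixture of the two losses.
    mixed_labels_loss = soft_xent(logits, (1 - mixing) * y + mixing * y_perm)
    mixed_losses = (1 - mixing) * soft_xent(logits, y) + mixing * soft_xent(logits, y_perm)
    assert torch.allclose(mixed_labels_loss, mixed_losses)
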
def _gen_mixing_coef(alpha: float) -> float:
    """Samples ``min(z, 1-z), z ~ Beta(alpha, alpha)``."""
    # First check that alpha is non-negative.
    assert alpha >= 0
    # Draw the mixing parameter from a beta distribution.
    # The check here is needed because the beta distribution requires alpha > 0,
    # but alpha = 0 is fine for mixup.
    if alpha == 0:
        mixing_lambda = 0.0
    else:
        mixing_lambda = np.random.beta(alpha, alpha)
    # For the symmetric beta distribution, we can always use 0 <= lambda <= 0.5;
    # this way the "main" label is always the original one, which keeps
    # the training accuracy meaningful.
    return min(mixing_lambda, 1. - mixing_lambda)


def _gen_indices(num_samples: int) -> torch.Tensor:
    """Generates a random permutation of the batch indices."""
    return torch.randperm(num_samples)
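
# --- Editor's note: illustrative sketch, not part of the original module. ---
# A small, self-contained demonstration of the functional API and of the helper
# behavior: ``_gen_mixing_coef`` always returns a value in ``[0, 0.5]``, so the
# original example dominates the mixture. Guarded so it only runs when the
# module is executed directly; the tensor shapes are assumptions for
# illustration only.
if __name__ == "__main__":
    X = torch.randn(8, 3, 32, 32)
    y = torch.randint(10, size=(8,))

    X_mixed, y_perm, mixing = mixup_batch(X, y, alpha=0.2)
    assert X_mixed.shape == X.shape
    assert 0.0 <= mixing <= 0.5  # mixing comes from the truncated Beta sample
    print(f"mixing coefficient: {mixing:.3f}")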