Source code for composer.datasets.glue

# Copyright 2022 MosaicML Composer authors
# SPDX-License-Identifier: Apache-2.0

"""GLUE (General Language Understanding Evaluation) dataset (Wang et al, 2019).

The GLUE benchmark datasets consist of nine sentence- or sentence-pair language
understanding tasks designed to cover a diverse range of dataset sizes, text genres, and
degrees of difficulty.

Note that the GLUE diagnostic dataset, which is designed to evaluate and analyze model
performance with respect to a wide range of linguistic phenomena found in natural
language, is not included here.

Please refer to the `GLUE`_ benchmark for more details.

.. _GLUE: https://gluebenchmark.com/
"""

import logging
from dataclasses import dataclass
from typing import cast

import yahp as hp
from torch.utils.data import DataLoader

from composer.core.types import Dataset
from composer.datasets.dataloader import DataLoaderHparams
from composer.datasets.hparams import DatasetHparams, SyntheticHparamsMixin
from composer.datasets.synthetic_lm import generate_synthetic_tokenizer, synthetic_hf_dataset_builder
from composer.utils import MissingConditionalImportError, dist

# Public API of this module.
__all__ = ["GLUEHparams"]

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Maps each supported GLUE task name to the names of its text columns, as a
# ``(first_column, second_column)`` pair. The second entry is ``None`` for tasks
# that have a single text column. The keys double as the set of valid values for
# ``GLUEHparams.task``, and are passed unchanged to ``datasets.load_dataset``.
_task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
}


@dataclass
class GLUEHparams(DatasetHparams, SyntheticHparamsMixin):
    """Sets up a generic GLUE dataset loader.

    Args:
        task (str): the GLUE task to train on. Must be one of the lowercase task names checked by
            :meth:`validate`: ``'cola'``, ``'mnli'``, ``'mrpc'``, ``'qnli'``, ``'qqp'``, ``'rte'``,
            ``'sst2'``, and ``'stsb'``.
        tokenizer_name (str): The name of the HuggingFace tokenizer to preprocess text with. See `HuggingFace
            documentation <https://huggingface.co/models>`_.
        split (str): Whether to use ``'train'``, ``'validation'``, or ``'test'`` split.
        max_seq_length (int, optional): Optionally, the ability to set a custom sequence length for the
            training dataset. Default: ``256``.
        max_network_retries (int, optional): Number of times to retry HTTP requests if they fail.
            Default: ``10``.

    Returns:
        DataLoader: A PyTorch :class:`~torch.utils.data.DataLoader` object (from :meth:`initialize_object`).
    """

    # The help text lists the exact (lowercase) values that ``validate`` accepts.
    task: str = hp.optional(
        "The GLUE task to train on, choose one from: cola, mnli, mrpc, qnli, qqp, rte, sst2, and stsb.",
        default=None)
    tokenizer_name: str = hp.optional("The name of the HuggingFace tokenizer to preprocess text with.", default=None)
    split: str = hp.optional("Whether to use 'train', 'validation' or 'test' split.", default=None)
    max_seq_length: int = hp.optional(
        default=256, doc='Optionally, the ability to set a custom sequence length for the training dataset.')
    max_network_retries: int = hp.optional(default=10,
                                           doc="Optionally, the number of times to retry HTTP requests if they fail.")

    def validate(self):
        """Check that the hyperparameters describe a loadable GLUE dataset.

        Logs a warning (but does not fail) when ``max_seq_length`` is not a multiple of 8.

        Raises:
            ValueError: If ``task`` is not a key of ``_task_to_keys``, or if ``tokenizer_name`` or
                ``split`` is ``None``.
        """
        if self.task not in _task_to_keys:
            raise ValueError(f"The task must be a valid GLUE task, options are {', '.join(_task_to_keys)}.")

        if (self.max_seq_length % 8) != 0:
            log.warning("For best hardware acceleration, it is recommended that sequence lengths be multiples of 8.")

        if self.tokenizer_name is None:
            raise ValueError("A tokenizer name must be specified to tokenize the dataset.")

        if self.split is None:
            raise ValueError("A dataset split must be specified.")

    def initialize_object(self, batch_size: int, dataloader_hparams: DataLoaderHparams) -> DataLoader:
        """Build a dataloader over the tokenized GLUE (or synthetic) dataset.

        Args:
            batch_size (int): Batch size forwarded to ``dataloader_hparams.initialize_object``.
            dataloader_hparams (DataLoaderHparams): Dataloader settings; ``num_workers`` is also used as the
                number of processes for the tokenization ``map`` step.

        Returns:
            DataLoader: The object returned by ``dataloader_hparams.initialize_object``.

        Raises:
            MissingConditionalImportError: If ``datasets`` or ``transformers`` cannot be imported.
            ValueError: If :meth:`validate` rejects the hyperparameters.
        """
        # TODO (Moin): I think this code is copied verbatim in a few different places. Move this into a function.
        try:
            import datasets
            import transformers
        except ImportError as e:
            raise MissingConditionalImportError(extra_deps_group="nlp", conda_package="transformers") from e

        self.validate()

        text_column_names = _task_to_keys[self.task]
        first_column, second_column = text_column_names

        if self.use_synthetic:
            column_names = [i for i in text_column_names if i is not None]

            # we just use the max sequence length in tokens to upper bound the sequence length in characters
            dataset = synthetic_hf_dataset_builder(num_samples=self.synthetic_num_unique_samples,
                                                   chars_per_sample=self.max_seq_length,
                                                   column_names=column_names)

            # flatten the columnar dataset into one column
            tokenizer = generate_synthetic_tokenizer(tokenizer_family=self.tokenizer_name, dataset=dataset)
        else:
            tokenizer = transformers.AutoTokenizer.from_pretrained(self.tokenizer_name)  #type: ignore (thirdparty)

            log.info(f"Loading {self.task.upper()} on rank {dist.get_global_rank()}")
            download_config = datasets.utils.DownloadConfig(max_retries=self.max_network_retries)
            dataset = datasets.load_dataset("glue", self.task, split=self.split, download_config=download_config)

        log.info(f"Starting tokenization step by preprocessing over {dataloader_hparams.num_workers} threads!")

        def tokenize_function(inp):
            # truncates sentences to max_length or pads them to max_length
            first_half = inp[first_column]
            # ``second_column`` is None for single-sentence tasks, in which case no text pair is passed
            second_half = inp[second_column] if second_column in inp else None
            return tokenizer(
                text=first_half,
                text_pair=second_half,
                padding="max_length",
                max_length=self.max_seq_length,
                truncation=True,
            )

        columns_to_remove = ["idx"] + [i for i in text_column_names if i is not None]

        # narrows the type for static checkers; ``map`` below is called with ``Dataset`` keyword arguments
        assert isinstance(dataset, datasets.Dataset)

        # A worker count of 0 means "tokenize in the current process", which ``map`` spells as ``num_proc=None``.
        num_proc = None if dataloader_hparams.num_workers == 0 else dataloader_hparams.num_workers

        # The fingerprint keys the on-disk cache. It must include every setting that changes the tokenized
        # output, including ``max_seq_length``; otherwise a run with a different sequence length would load
        # a cache padded/truncated to the previous length.
        fingerprint = f"{self.task}-{self.tokenizer_name}-tokenization-{self.split}-{self.max_seq_length}"

        dataset = dataset.map(
            tokenize_function,
            batched=True,
            num_proc=num_proc,
            batch_size=1000,
            remove_columns=columns_to_remove,
            new_fingerprint=fingerprint,
            load_from_cache_file=True,
        )

        data_collator = transformers.default_data_collator
        sampler = dist.get_sampler(cast(Dataset, dataset), drop_last=self.drop_last, shuffle=self.shuffle)

        return dataloader_hparams.initialize_object(
            dataset=dataset,  #type: ignore (thirdparty)
            batch_size=batch_size,
            sampler=sampler,
            drop_last=self.drop_last,
            collate_fn=data_collator)