# Copyright 2021 MosaicML. All Rights Reserved.

"""Synthetic language modeling datasets used for testing, profiling, and debugging."""

from __future__ import annotations

import json
import random
import string
from os.path import join
from tempfile import TemporaryDirectory
from typing import TYPE_CHECKING, NamedTuple, Optional

from composer.utils import MissingConditionalImportError

if TYPE_CHECKING:
    import tokenizers.models as tokenizers_models
    from datasets import Dataset
    from tokenizers import decoders, normalizers, pre_tokenizers
    from transformers import PreTrainedTokenizer

__all__ = ["SyntheticTokenizerParams", "generate_synthetic_tokenizer", "synthetic_hf_dataset_builder"]


class SyntheticTokenizerParams(NamedTuple):
    """Bundle of the tokenizer components, special tokens, and classes needed to build a synthetic tokenizer."""
    tokenizer_model: tokenizers_models.Model
    normalizer: normalizers.Normalizer
    pre_tokenizer: pre_tokenizers.PreTokenizer
    decoder: decoders.Decoder
    initial_alphabet: list
    special_tokens: list
    pad_token: str
    trainer_cls: type
    tokenizer_cls: type


def _generate_bert_tokenizer_params(dataset) -> SyntheticTokenizerParams:
    try:
        import tokenizers.models as tokenizers_models
        import tokenizers.trainers as tokenizers_trainer
        from tokenizers import decoders, normalizers, pre_tokenizers
        from transformers import BertTokenizer
    except ImportError as e:
        raise MissingConditionalImportError(extra_deps_group="nlp", conda_package="transformers") from e

    unk_token = "[UNK]"
    pad_token = "[PAD]"
    # build the initial alphabet from the unique characters that appear in the dataset samples
    initial_alphabet = "".join([i for i in dataset])
    initial_alphabet = list(set(initial_alphabet))
    return SyntheticTokenizerParams(
        tokenizer_model=tokenizers_models.WordPiece(unk_token=unk_token),  # type: ignore
        normalizer=normalizers.BertNormalizer(),
        pre_tokenizer=pre_tokenizers.BertPreTokenizer(),
        decoder=decoders.WordPiece(),
        initial_alphabet=initial_alphabet,
        special_tokens=[pad_token, unk_token, "[SEP]", "[CLS]", "[MASK]"],
        pad_token=pad_token,
        trainer_cls=tokenizers_trainer.WordPieceTrainer,
        tokenizer_cls=BertTokenizer,
    )


def _generate_gpt2_tokenizer_params() -> SyntheticTokenizerParams:
    try:
        import tokenizers.models as tokenizers_models
        import tokenizers.trainers as tokenizers_trainer
        from tokenizers import decoders, normalizers, pre_tokenizers
        from transformers import GPT2Tokenizer
    except ImportError as e:
        raise MissingConditionalImportError(extra_deps_group="nlp", conda_package="transformers") from e

    unk_token = None
    pad_token = "<pad>"
    return SyntheticTokenizerParams(
        tokenizer_model=tokenizers_models.BPE(unk_token=unk_token),
        normalizer=normalizers.Lowercase(),
        pre_tokenizer=pre_tokenizers.ByteLevel(),
        decoder=decoders.ByteLevel(),
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        special_tokens=[pad_token, "<|endoftext|>"],
        pad_token=pad_token,
        trainer_cls=tokenizers_trainer.BpeTrainer,
        tokenizer_cls=GPT2Tokenizer,
    )


def generate_synthetic_tokenizer(tokenizer_family: str,
                                 dataset: Optional[Dataset] = None,
                                 vocab_size: int = 256) -> PreTrainedTokenizer:
    """Generates a synthetic tokenizer based on a tokenizer family.

    Args:
        tokenizer_family (str): Which tokenizer family to emulate. One of ``['gpt2', 'bert']``.
        dataset (Optional[datasets.Dataset]): Optionally, the dataset to train the tokenizer on.
            If ``None``, a synthetic dataset is generated via :func:`synthetic_hf_dataset_builder`.
            Default: ``None``.
        vocab_size (int): The size of the tokenizer vocabulary. Default: ``256``.
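
    Example:
        A minimal usage sketch, assuming the ``nlp`` extra dependencies
        (``transformers``, ``tokenizers``, and ``datasets``) are installed::

            tokenizer = generate_synthetic_tokenizer(tokenizer_family="bert")
            encoded = tokenizer("a short synthetic sentence")
            input_ids = encoded["input_ids"]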
"""
try:
from tokenizers import Tokenizer
from transformers import PreTrainedTokenizer
except ImportError as e:
raise MissingConditionalImportError(extra_deps_group="nlp", conda_package="transformers") from e
# generate a synthetic dataset with reasonable defaults is none is provided
if dataset is None:
num_samples = 100
chars_per_sample = 128
column_names = ['sentence']
dataset = synthetic_hf_dataset_builder(num_samples=num_samples,
chars_per_sample=chars_per_sample,
column_names=column_names)
# change a columnar dataset into a list
flattened_columns = [dataset[key] for key in dataset.column_names if key != 'idx']
# flatten the list of lists into a single list
flattened_dataset = []
for sublist in flattened_columns:
for item in sublist:
flattened_dataset.append(item)
if "bert" in tokenizer_family:
tokenizer_params = _generate_bert_tokenizer_params(flattened_dataset)
elif "gpt2" in tokenizer_family:
tokenizer_params = _generate_gpt2_tokenizer_params()
else:
raise ValueError(f"Synthetic tokenizers for tokenizer family {tokenizer_family} are currently unsupported.")
tokenizer = Tokenizer(tokenizer_params.tokenizer_model)
tokenizer.enable_padding(direction="right",
pad_id=0,
pad_type_id=0,
pad_token=tokenizer_params.pad_token,
pad_to_multiple_of=8)
# The 'type: ignore' is because the underlying Rust package has improper type annotations. PyRight throws:
# Cannot assign member "normalizer" for type "Tokenizer". Property "normalizer" has no defined setter
tokenizer.normalizer = tokenizer_params.normalizer # type: ignore
tokenizer.pre_tokenizer = tokenizer_params.pre_tokenizer # type: ignore
tokenizer.decoder = tokenizer_params.decoder # type: ignore
tokenizer_trainer = tokenizer_params.trainer_cls(
vocab_size=vocab_size,
initial_alphabet=tokenizer_params.initial_alphabet,
special_tokens=tokenizer_params.special_tokens,
)
tokenizer.train_from_iterator(flattened_dataset, trainer=tokenizer_trainer)
# save the tokenizer config
with TemporaryDirectory() as tmp_path:
tmp_tokenizer_dir = str(tmp_path)
tmp_tokenizer_file = join(tmp_tokenizer_dir, "tokenizer.json")
tokenizer.save(tmp_tokenizer_file) #type: ignore (thirdparty)
# save the vocabulary and potential merges file
tokenizer_params.tokenizer_model.save(tmp_tokenizer_dir) # type: ignore
# the .from_pretrained method doesn't load our padding for some reason, so we save it as a special kwarg
tmp_tokenizer_config = join(tmp_tokenizer_dir, "tokenizer_config.json")
with open(tmp_tokenizer_config, "w") as f:
json.dump({"pad_token": tokenizer_params.pad_token}, f)
# instantiate the new tokenizer
if not issubclass(tokenizer_params.tokenizer_cls, PreTrainedTokenizer):
raise ValueError(f"{tokenizer_params.tokenizer_cls} should sub-class transformers.PreTrainedTokenizer.")
tokenizer = tokenizer_params.tokenizer_cls.from_pretrained(tmp_tokenizer_dir)
return tokenizer


def synthetic_hf_dataset_builder(num_samples: int, chars_per_sample: int, column_names: list):
    """Creates a synthetic :class:`~datasets.Dataset` and passes it to the preprocessing scripts.

    Args:
        num_samples (int): How many samples to use in the synthetic dataset.
        chars_per_sample (int): How many characters each synthetic text sample should contain.
        column_names (list): The column names that the dataset should use.

    Returns:
        datasets.Dataset: The synthetic HF Dataset object.
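
    Example:
        A minimal usage sketch; the column name below is an arbitrary placeholder::

            dataset = synthetic_hf_dataset_builder(num_samples=10, chars_per_sample=64, column_names=['sentence'])
            first_sample = dataset[0]['sentence']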
"""
try:
import datasets
except ImportError as e:
raise MissingConditionalImportError(extra_deps_group="nlp", conda_package="transformers") from e
if column_names is None or len(column_names) == 0:
raise ValueError("There must be at least one column name provided for the final dataset.")
data = {}
for column_name in column_names:
data[column_name] = [_generate_synthetic_text_sample(chars_per_sample) for _ in range(num_samples)]
data['idx'] = list(range(num_samples))
hf_synthetic_dataset = datasets.Dataset.from_dict(data)
return hf_synthetic_dataset


def _generate_synthetic_text_sample(chars_per_sample, min_word_length=3, max_word_length=10):
    character_set = {
        "letters": {
            "weight": 10,
            "choices": string.ascii_letters
        },
        "digits": {
            "weight": 5,
            "choices": string.digits
        },
        "punctuation": {
            "weight": 1,
            "choices": string.punctuation
        }
    }
    # weight each character class by repeating its choices, then sample uniformly from the pooled string
    valid_chars = ''.join([(i['choices'] * i['weight']) for i in character_set.values()])

    # build space-separated "words" of random length until the sample is long enough, then truncate
    sample = ''
    while len(sample) < chars_per_sample:
        sample_len = random.randint(min_word_length, max_word_length)
        sample += ''.join([random.choice(valid_chars) for _ in range(sample_len)])
        sample += ' '
    sample = sample[:chars_per_sample]
    return sample
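

if __name__ == "__main__":
    # A small, self-contained smoke-test sketch of the helpers above: it builds a tiny
    # synthetic dataset and trains a BERT-style tokenizer on it. The sample counts and
    # vocabulary size are arbitrary illustrative values, and running this requires the
    # optional NLP dependencies (transformers, tokenizers, datasets) to be installed.
    demo_dataset = synthetic_hf_dataset_builder(num_samples=10, chars_per_sample=64, column_names=['sentence'])
    demo_tokenizer = generate_synthetic_tokenizer(tokenizer_family='bert', dataset=demo_dataset, vocab_size=128)
    print(demo_tokenizer('a quick synthetic example')['input_ids'])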