# Source code for streaming.base.format

# Copyright 2022-2024 MosaicML Streaming authors
# SPDX-License-Identifier: Apache-2.0

"""Individual dataset writer for every format."""

from typing import Any, Optional

from streaming.base.format.base import FileInfo, Reader
from streaming.base.format.index import get_index_basename
from streaming.base.format.json import JSONReader, JSONWriter
from streaming.base.format.mds import MDSReader, MDSWriter
from streaming.base.format.xsv import (CSVReader, CSVWriter, TSVReader, TSVWriter, XSVReader,
                                       XSVWriter)

# Names re-exported by ``from streaming.base.format import *``.
__all__ = [
    'CSVWriter',
    'FileInfo',
    'get_index_basename',
    'JSONWriter',
    'MDSWriter',
    'Reader',
    'reader_from_json',
    'TSVWriter',
    'XSVWriter',
]

# Registry used by ``reader_from_json``: maps the ``format`` entry of a JSON object to the
# reader class whose ``from_json`` builds the reader.
_readers = dict(
    csv=CSVReader,
    json=JSONReader,
    mds=MDSReader,
    tsv=TSVReader,
    xsv=XSVReader,
)


def reader_from_json(dirname: str, split: Optional[str], obj: dict[str, Any]) -> Reader:
    """Initialize the reader from JSON object.

    Args:
        dirname (str): Local directory containing shards.
        split (str, optional): Which dataset split to use, if any.
        obj (dict[str, Any]): JSON object to load. Must contain a ``version`` entry equal to
            ``2`` and a ``format`` entry naming one of the registered readers.

    Returns:
        Reader: Loaded Reader of `format` type.

    Raises:
        AssertionError: If ``obj['version']`` is not ``2`` (not checked when Python runs with
            assertions disabled, e.g. ``-O``).
        KeyError: If ``version`` or ``format`` is missing from ``obj``, or if ``format`` does
            not name a registered reader.
    """
    version = obj['version']
    assert version == 2, f'Unsupported version: {version!r} (expected 2).'

    fmt = obj['format']
    try:
        cls = _readers[fmt]
    except KeyError:
        # Keep the KeyError type callers may already catch, but say what went wrong.
        supported = ', '.join(sorted(_readers))
        raise KeyError(
            f'Unsupported format: {fmt!r} (expected one of: {supported}).') from None
    return cls.from_json(dirname, split, obj)