
    +gd                     d    d dl mZ ddlmZmZ ddlmZmZ  edee          Zdede	de	d	efd
Z
dS )    )TypeVar   )Dataset _split_by_node_map_style_dataset)IterableDataset_split_by_node_iterable_datasetDatasetTypedatasetrank
world_sizereturnc                 t    t          | t                    rt          | ||          S t          | ||          S )a  
    Split a dataset for the node at rank `rank` in a pool of nodes of size `world_size`.

    For map-style datasets:

    Each node is assigned a chunk of data, e.g. rank 0 is given the first chunk of the dataset.
    To maximize data loading throughput, chunks are made of contiguous data on disk if possible.

    For iterable datasets:

    If the dataset has a number of shards that is a factor of `world_size` (i.e. if `dataset.n_shards % world_size == 0`),
    then the shards are evenly assigned across the nodes, which is the most optimized.
    Otherwise, each node keeps 1 example out of `world_size`, skipping the other examples.

    Args:
        dataset ([`Dataset`] or [`IterableDataset`]):
            The dataset to split by node.
        rank (`int`):
            Rank of the current node.
        world_size (`int`):
            Total number of nodes.

    Returns:
        [`Dataset`] or [`IterableDataset`]: The dataset to be used on the node at rank `rank`.
    )r   r   )
isinstancer   r   r   )r
   r   r   s      4lib/python3.11/site-packages/datasets/distributed.pysplit_dataset_by_noder   
   sA    4 '7## Z/dzZZZZ.wTjYYYY    N)typingr   arrow_datasetr   r   iterable_datasetr   r   r	   intr    r   r   <module>r      s          D D D D D D D D N N N N N N N N gmWo>>Z; Zc Zs Z{ Z Z Z Z Z Zr   