
    +gd                         d dl mZ d dlZdedefdZdededee         fdZdededee         fd	Z	d
ee         defdZ
dej        j        dedefdZdS )    )ListN
gen_kwargsreturnc                    d |                                  D             }t          t          |                                                    dk    rGt	          dd                    d |                                 D                       z   dz   dz             t          |                                d	          }t          d|          S )
zFReturn the number of possible shards according to the input gen_kwargsc                 ^    i | ]*\  }}t          |t                    |t          |          +S  
isinstancelistlen).0keyvalues      7lib/python3.11/site-packages/datasets/utils/sharding.py
<dictcomp>z3_number_of_shards_in_gen_kwargs.<locals>.<dictcomp>
   s5    eeeeZX]_cMdMdeS#e**eee       zSharding is ambiguous for this dataset: we found several data sources lists of different lengths, and we don't know over which list we should parallelize:

c              3   ,   K   | ]\  }}d | d| V  dS )z	- key z has length Nr   )r   r   lengths      r   	<genexpr>z2_number_of_shards_in_gen_kwargs.<locals>.<genexpr>   s7      jj[S&@s@@@@jjjjjjr   zW
To fix this, check the 'gen_kwargs' and make sure to use lists only for data sources, zqand use tuples otherwise. In the end there should only be one single list, or several lists with the same length.r   )default)itemsr   setvaluesRuntimeErrorjoinmax)r   lists_lengths
max_lengths      r   _number_of_shards_in_gen_kwargsr!      s     fez7G7G7I7IeeeM
3}##%%&&''!++I))jjTaTgTgTiTijjjjjk mm F	F
 
 	
 ]))++Q777Jq*r   
num_shardsmax_num_jobsc                     g }t          |          D ]R}| |z  || |z  k     z   }|dk    r n:|r|d         j        nd}t          |||z             }|                    |           S|S )a  
    Get the range of shard indices per job.
    If num_shards<max_num_jobs, then num_shards jobs are given a range of one shard.
    The shards indices order is preserved: e.g. all the first shards are given the first job.
    Moreover all the jobs are given approximately the same number of shards.

    Example:

    ```python
    >>> _distribute_shards(2, max_num_jobs=4)
    [range(0, 1), range(1, 2)]
    >>> _distribute_shards(10, max_num_jobs=3)
    [range(0, 4), range(4, 7), range(7, 10)]
    ```
    r   )rangestopappend)r"   r#   shards_indices_per_group	group_idxnum_shards_to_addstartshard_indicess          r   _distribute_shardsr.      s       "<(( 7 7	&,6)zT`G`:ab!!E5MT(,11STeU->%>?? ''6666##r   c                      t                     }|dk    rt                     gS t          ||           fdt          t	                              D             S )z2Split the gen_kwargs into `max_num_job` gen_kwargsr   )r"   r#   c                 T    g | ]#fd                                  D             $S )c                 p    i | ]1\  }|t          t                    rfd          D             n2S )c                      g | ]
}|         S r   r   )r   	shard_idxr   s     r   
<listcomp>z;_split_gen_kwargs.<locals>.<listcomp>.<dictcomp>.<listcomp>>   s    [[[9eI&[[[r   r
   r   )r   r   r   r*   shard_indices_per_groups     @r   r   z0_split_gen_kwargs.<locals>.<listcomp>.<dictcomp>=   sf        C eT**[[[[8OPY8Z[[[[  r   )r   )r   r*   r   r6   s    @r   r4   z%_split_gen_kwargs.<locals>.<listcomp><   s`     
 
 
      #-"2"2"4"4	  
 
 
r   )r!   dictr.   r&   r   )r   r#   r"   r6   s   `  @r   _split_gen_kwargsr8   4   s     1<<JQZ  !!"4
Ye"f"f"f
 
 
 
 
 #3'>#?#?@@
 
 
 	
r   gen_kwargs_listc                 ,      fd d         D             S )Nc                     i | ]@t          d                   t                    rfdD             nd                   AS )r   c                 *    g | ]}|         D ]}|S r   r   )r   r   r   r   s      r   r4   z0_merge_gen_kwargs.<locals>.<dictcomp>.<listcomp>I   s*    SSS
:c?SS%eSSSSr   r5   )r   r   r9   s    @r   r   z%_merge_gen_kwargs.<locals>.<dictcomp>H   so         	oa(-t44%SSSSoSSSSQ$  r   r   r   )r9   s   `r   _merge_gen_kwargsr=   G   s5        #1%	   r   rngc                    d |                                 D             }i }|D ]<}t          t          |                    ||<   |                     ||                    =t	          |          }|                                D ]>\  }t          t                    r$fd|t                             D             ||<   ?|S )z.Return a shuffled copy of the input gen_kwargsc                 V    h | ]&}t          |t                    t          |          'S r   r	   )r   r   s     r   	<setcomp>z&_shuffle_gen_kwargs.<locals>.<setcomp>V   s.    YYYESWAXAXY#e**YYYr   c                      g | ]
}|         S r   r   )r   ir   s     r   r4   z'_shuffle_gen_kwargs.<locals>.<listcomp>_   s    #S#S#SE!H#S#S#Sr   )r   r   r&   shuffler7   r   r
   r   )r>   r   
list_sizesindices_per_sizesizeshuffled_kwargsr   r   s          @r   _shuffle_gen_kwargsrI   P   s     ZY**;*;*=*=YYYJ , ,!%eDkk!2!2$T*++++:&&O%++-- T T
UeT"" 	T#S#S#S#S6Fs5zz6R#S#S#SOC r   )typingr   numpynpr7   intr!   r&   r.   r8   r=   random	GeneratorrI   r   r   r   <module>rP      s                  &$3 $c $d5k $ $ $ $6
$ 
c 
d4j 
 
 
 
&tDz d    RY0 d t      r   