
    +gdU                       d dl Z d dlZd dlZd dlmZ d dl mZ d dlmZ d dlmZm	Z	 d dl
mZmZmZmZmZmZmZ d dlZd dlZddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddl m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z.m/Z/m0Z0m1Z1  e,e2          Z3dXdee4e5f         dee         defdZ6deee4ef                  dee4e5f         fdZ7dee4e5f         deee4ef                  fdZ8 G d de          Z9 G d d          Z: G d de:          Z; G d  d!e;          Z< G d" d#e:          Z= G d$ d%e:          Z> G d& d'e:          Z? G d( d)e:          Z@d*ee4         fd+ZA G d, d-e:          ZB G d. d/e?          ZC G d0 d1e:          ZD G d2 d3e:          ZE G d4 d5e:          ZF G d6 d7e:          ZG G d8 d9e:          ZHd:eId;ed<ee4ee4eJdf         f         deIfd=ZKdeId;ed<ee4ee4eJdf         f         deIfd>ZL G d? d@e:          ZMdA ZNe G dB dC                      ZOe G dD dE                      ZPdF ZQ G dG dHe          ZR	 	 	 dYdIeeR         dJee&         dKee(         dLeSdeRf
dMZT	 	 	 	 	 dZdOeeR         dPeeeU                  dQeeS         dJee&         dKee(         dRee4         deRfdSZVdTeRdUeSdVeSdeRfdWZWdS )[    N)Counter)deepcopy)	dataclass)cycleislice)AnyCallableDictIteratorListOptionalUnion   )config)DatasetInfoMixin)Features)FeatureType_align_features!_check_if_features_can_be_aligned)_reset_fsspec_lock)PythonFormatterget_format_type_from_alias)DatasetInfo)
NamedSplit)
table_cast)
get_logger)_merge_gen_kwargs_number_of_shards_in_gen_kwargs_shuffle_gen_kwargs_split_gen_kwargsbatchtry_featuresreturnc                    t           j                            |           }|P	 t          |t          j        |j                            }n'# t          t           j        t           j        f$ r Y nw xY wt          j
        |j                  S N)paTablefrom_pydictr   schematype	TypeErrorArrowInvalidArrowNotImplementedErrorr   from_arrow_schema)r!   r"   pa_tables      9lib/python3.11/site-packages/datasets/iterable_dataset.py_infer_features_from_batchr1      s    x##E**H	!(BIl6G,H,HIIHH2?B,GH 	 	 	D	%ho666s   'A !A/.A/examplesc                 r     d  D             } fd|D             }t          t          ||                    S )Nc                     i | ]
}|D ]}|d S r%    .0examplecols      r0   
<dictcomp>z&_examples_to_batch.<locals>.<dictcomp>*   s'    AAA'AA#CAAAA    c                 .    g | ]fd D             S )c                 :    g | ]}|                               S r5   )getr6   s     r0   
<listcomp>z1_examples_to_batch.<locals>.<listcomp>.<listcomp>,   s%    888Gw{{3888r;   r5   )r7   r9   r2   s    @r0   r?   z&_examples_to_batch.<locals>.<listcomp>,   s/    IIIS8888x888IIIr;   )dictzip)r2   colsarrayss   `  r0   _examples_to_batchrD   '   sJ     BAXAAADIIIIDIIIFD&!!"""r;   c              #      K   t          | t          t          |                                        }t          |          D ]$fd|                                 D             V  %dS )z3Convert a batch (dict of examples) to examples listc                 (    i | ]\  }}||         S r5   r5   )r7   r9   arrayis      r0   r:   z&_batch_to_examples.<locals>.<dictcomp>4   s#    ===esE!H===r;   N)lennextiterrangeitems)r!   
n_examplesrH   s     @r0   _batch_to_examplesrO   0   ss      U4U,,-..J: > >====u{{}}=======> >r;   c                   *    e Zd ZdZd Zd Zd Zd ZdS )HasNextIteratorzyIterator with an hasnext() function. Taken from https://stackoverflow.com/questions/1966591/has-next-in-python-iterators.c                 <    t          |          | _        d | _        d S r%   )rK   it_hasnext)selfrS   s     r0   __init__zHasNextIterator.__init__:   s    r((r;   c                     | S r%   r5   rU   s    r0   __iter__zHasNextIterator.__iter__>   s    r;   c                 Z    | j         r| j        }nt          | j                  }d | _         |S r%   )rT   _thenextrJ   rS   )rU   results     r0   __next__zHasNextIterator.__next__A   s.    = 	#]FF$']]Fr;   c                     | j         9	 t          | j                  | _        d| _         n# t          $ r
 d| _         Y nw xY w| j         S )NTF)rT   rJ   rS   r[   StopIterationrX   s    r0   hasnextzHasNextIterator.hasnextI   s[    = % $TW !% ! & & & %& }s   * >>N)__name__
__module____qualname____doc__rV   rY   r]   r`   r5   r;   r0   rQ   rQ   7   s\         D  D          r;   rQ   c                       e Zd ZdZd Zdej        j        dd fdZde	de	dd fdZ
de	de	dee	         fd	Zede	fd
            ZdS )_BaseExamplesIterablez?Base class for the examples iterable used by an IterableDatasetc                 @    t          t          |            d          )zWAn examples iterable should yield tuples (example_key, example) of type (int/str, dict)z doesn't implement __iter__ yetNotImplementedErrorr*   rX   s    r0   rY   z_BaseExamplesIterable.__iter__W       !T$ZZ"P"P"PQQQr;   	generatorr#   c                 @    t          t          |            d          )z
        Either shuffle the shards/sources of the dataset, or propagate the shuffling to the underlying iterable.
        If the order of the shards must stay fixed (when using .skip or .take for example), then this method returns self.
        z+ doesn't implement shuffle_data_sources yetrh   rU   rk   s     r0   shuffle_data_sourcesz*_BaseExamplesIterable.shuffle_data_sources[   s!    
 "T$ZZ"\"\"\]]]r;   	worker_idnum_workersc                 @    t          t          |            d          )ZEither keep only the requested shard, or propagate the request to the underlying iterable.z) doesn't implement shard_data_sources yetrh   rU   ro   rp   s      r0   shard_data_sourcesz(_BaseExamplesIterable.shard_data_sourcesb   s    !T$ZZ"Z"Z"Z[[[r;   c                 H    t          t          || j        |                    S r%   )listrL   n_shardsrs   s      r0   split_shard_indices_by_workerz3_BaseExamplesIterable.split_shard_indices_by_workerf   s    E)T]K@@AAAr;   c                 @    t          t          |            d          )Nz doesn't implement n_shards yetrh   rX   s    r0   rw   z_BaseExamplesIterable.n_shardsi   rj   r;   N)ra   rb   rc   rd   rY   nprandom	Generatorrn   intrt   r   rx   propertyrw   r5   r;   r0   rf   rf   T   s        IIR R R^bi.A ^F] ^ ^ ^ ^\C \c \F] \ \ \ \Bs B BQUVYQZ B B B B R# R R R XR R Rr;   rf   c                   x    e Zd ZdedefdZd Zdej        j	        dd fdZ
ded	edd fd
Zedefd            ZdS )ExamplesIterablegenerate_examples_fnkwargsc                 "    || _         || _        d S r%   r   r   )rU   r   r   s      r0   rV   zExamplesIterable.__init__o   s    $8!r;   c              #   :   K    | j         di | j        E d {V  d S )Nr5   r   rX   s    r0   rY   zExamplesIterable.__iter__s   s9      ,4,;;t{;;;;;;;;;;;r;   rk   r#   c                 8    t          | j        | j        |          S r%   )#ShuffledDataSourcesExamplesIterabler   r   rm   s     r0   rn   z%ExamplesIterable.shuffle_data_sourcesv   s    243Ldk[deeer;   ro   rp   c                     t          | j        | j                  |                     ||          }t	          fd|D                       }t          | j        |          S )Keep only the requested shard.)max_num_jobsc                      g | ]
}|         S r5   r5   )r7   rH   gen_kwargs_lists     r0   r?   z7ExamplesIterable.shard_data_sources.<locals>.<listcomp>}   s    1\1\1\/!2D1\1\1\r;   )r    r   rw   rx   r   r   r   )rU   ro   rp   shard_indicesrequested_gen_kwargsr   s        @r0   rt   z#ExamplesIterable.shard_data_sourcesy   se    +DKdmTTT::9kRR01\1\1\1\m1\1\1\]] 9;OPPPr;   c                 *    t          | j                  S r%   )r   r   rX   s    r0   rw   zExamplesIterable.n_shards   s    .t{;;;r;   N)ra   rb   rc   r	   r@   rV   rY   rz   r{   r|   rn   r}   rt   r~   rw   r5   r;   r0   r   r   n   s        X t    < < <fbi.A fFX f f f fQC Qc QFX Q Q Q Q <# < < < X< < <r;   r   c                   Z     e Zd Zdededej        j        f fdZd Z	de
de
dd	fd
Z xZS )r   r   r   rk   c                 t    t                                          ||           t          |          | _        d S r%   )superrV   r   rk   )rU   r   r   rk   	__class__s       r0   rV   z,ShuffledDataSourcesExamplesIterable.__init__   s1    -v666!),,r;   c              #      K   t          | j                  }t          || j                  } | j        di |E d{V  dS )z*Shuffle the kwargs order to shuffle shardsNr5   )r   rk   r   r   r   )rU   rngkwargs_with_shuffled_shardss      r0   rY   z,ShuffledDataSourcesExamplesIterable.__iter__   sY      t~&&&9#t{&K&K#,4,KK/JKKKKKKKKKKKr;   ro   rp   r#   r   c                     t          | j                  }t          || j                  }t	          | j        |                              ||          S )r   )r   rk   r   r   r   r   rt   )rU   ro   rp   r   r   s        r0   rt   z6ShuffledDataSourcesExamplesIterable.shard_data_sources   sN    t~&&&9#t{&K&K# 9;VWWjj{
 
 	
r;   )ra   rb   rc   r	   r@   rz   r{   r|   rV   rY   r}   rt   __classcell__r   s   @r0   r   r      s        -X -t -PRPYPc - - - - - -L L L
C 
c 
FX 
 
 
 
 
 
 
 
r;   r   c                       e Zd Zdedee         fdZd Zdej	        j
        dd fdZded	edd fd
Zedefd            ZdS )SelectColumnsIterableex_iterablecolumn_namesc                 "    || _         || _        d S r%   r   r   )rU   r   r   s      r0   rV   zSelectColumnsIterable.__init__   s    &(r;   c              #   T   K   | j         D ]\  }|fd| j        D             fV  d S )Nc                 "    i | ]}||         S r5   r5   )r7   crows     r0   r:   z2SelectColumnsIterable.__iter__.<locals>.<dictcomp>   s    ===a3q6===r;   r   )rU   idxr   s     @r0   rY   zSelectColumnsIterable.__iter__   sS      ( 	> 	>HC====4+<========	> 	>r;   rk   r#   c                 \    t          | j                            |          | j                  S r%   )r   r   rn   r   rm   s     r0   rn   z*SelectColumnsIterable.shuffle_data_sources   s'    $T%5%J%J9%U%UW[Whiiir;   ro   rp   c                 ^    t          | j                            ||          | j                  S r%   )r   r   rt   r   rs   s      r0   rt   z(SelectColumnsIterable.shard_data_sources   s*    $T%5%H%HT_%`%`bfbstttr;   c                     | j         j        S r%   r   rw   rX   s    r0   rw   zSelectColumnsIterable.n_shards       ((r;   N)ra   rb   rc   rf   r   strrV   rY   rz   r{   r|   rn   r}   rt   r~   rw   r5   r;   r0   r   r      s        )$9 )c ) ) ) )> > >jbi.A jF] j j j juC uc uF] u u u u )# ) ) ) X) ) )r;   r   c                   |    e Zd ZdededefdZd Zdej        j	        dd fdZ
d	ed
edd fdZedefd            ZdS )StepExamplesIterabler   stepoffsetc                 0    || _         || _        || _        d S r%   )r   r   r   )rU   r   r   r   s       r0   rV   zStepExamplesIterable.__init__   s    &	r;   c              #      K   t          | j                  }	 t          t          || j                            }t          |          | j        k    r|| j                 V  nd S Mr%   )rK   r   rv   r   r   rI   r   )rU   ex_iteratorr!   s      r0   rY   zStepExamplesIterable.__iter__   sg      4+,,	TY7788E5zzDK''DK(((((	r;   rk   r#   c                 j    t          | j                            |          | j        | j                  S Nr   r   )r   r   rn   r   r   rm   s     r0   rn   z)StepExamplesIterable.shuffle_data_sources   s7    #11)<<49UYU`
 
 
 	
r;   ro   rp   c                 l    t          | j                            ||          | j        | j                  S r   )r   r   rt   r   r   rs   s      r0   rt   z'StepExamplesIterable.shard_data_sources   s9    #//	;GGdi`d`k
 
 
 	
r;   c                     | j         j        S r%   r   rX   s    r0   rw   zStepExamplesIterable.n_shards   r   r;   N)ra   rb   rc   rf   r}   rV   rY   rz   r{   r|   rn   rt   r~   rw   r5   r;   r0   r   r      s        $9  c    
  
bi.A 
F\ 
 
 
 


C 
c 
F\ 
 
 
 

 )# ) ) ) X) ) )r;   r   c                       e Zd Z	 ddee         dee         fdZd Zd Z	de
j        j        dd fd	Zedefd
            Zdededd fdZdS )#CyclingMultiSourcesExamplesIterablefirst_exhaustedex_iterablesstopping_strategyc                 h    || _         || _        |dk    rt          j        nt          j        | _        d S )Nall_exhausted)r   r   rz   allanybool_strategy_func)rU   r   r   s      r0   rV   z,CyclingMultiSourcesExamplesIterable.__init__   s8     )!2 .?/-Q-Q"&&XZX^r;   c                 ^    t          t          t          | j                                      S r%   )r   rL   rI   r   rX   s    r0   _give_indice_iteratorz9CyclingMultiSourcesExamplesIterable._give_indice_iterator   s#    U3t01122333r;   c              #     K   d | j         D             }|                                 }t          j        t	          | j                   d          }|D ]}	 t          ||                   V  ||                                         s:d||<   |                     |          r d S t          | j         |                   ||<   o# t          $ r! d||<   |                     |          rY  d S Y w xY wd S )Nc                 ,    g | ]}t          |          S r5   )rQ   r7   r   s     r0   r?   z@CyclingMultiSourcesExamplesIterable.__iter__.<locals>.<listcomp>   s     WWWk_[11WWWr;   FT)
r   r   rz   fullrI   rJ   r`   r   rQ   r_   )rU   	iteratorsindices_iteratoris_exhaustedrH   s        r0   rY   z,CyclingMultiSourcesExamplesIterable.__iter__   s*     WWTEVWWW	5577ws4#455u==! 	 	A9Q<((((( !|++-- I&*LO..|<< #243DQ3G#H#HIaL     #'Q**<88 EEE 	 	s   AC "C  $C+*C+rk   r#   c                 T    fd| j         D             }t          || j                  S )z*Shuffle each underlying examples iterable.c                 :    g | ]}|                               S r5   rn   r7   r   rk   s     r0   r?   zLCyclingMultiSourcesExamplesIterable.shuffle_data_sources.<locals>.<listcomp>   '    iii88CCiiir;   )r   r   r   rU   rk   r   s    ` r0   rn   z8CyclingMultiSourcesExamplesIterable.shuffle_data_sources   s3    iiiiW[Whiii2<AWXXXr;   c                 >    t          d | j        D                       S )Nc              3   $   K   | ]}|j         V  d S r%   rw   r   s     r0   	<genexpr>z?CyclingMultiSourcesExamplesIterable.n_shards.<locals>.<genexpr>   %      MMK;'MMMMMMr;   minr   rX   s    r0   rw   z,CyclingMultiSourcesExamplesIterable.n_shards   "    MM4;LMMMMMMr;   ro   rp   c                 V    t          fd| j        D             | j                  S )rr   c                 <    g | ]}|                               S r5   rt   r7   iterablerp   ro   s     r0   r?   zJCyclingMultiSourcesExamplesIterable.shard_data_sources.<locals>.<listcomp>  )    cccXX((K@@cccr;   r   )r   r   r   rs   s    ``r0   rt   z6CyclingMultiSourcesExamplesIterable.shard_data_sources   s?    2cccccQUQbccc"4
 
 
 	
r;   N)r   )ra   rb   rc   r   rf   r   r   rV   r   rY   rz   r{   r|   rn   r~   r}   rw   rt   r5   r;   r0   r   r      s        \m_ _ !67_LTUXM_ _ _ _4 4 4  :Ybi.A YFk Y Y Y Y
 N# N N N XN
C 
c 
Fk 
 
 
 
 
 
r;   r   c                       e Zd ZdZdee         fdZd Zdej	        j
        dd fdZedefd            Zd	ed
edd fdZdS )2VerticallyConcatenatedMultiSourcesExamplesIterablea  
    VerticallyConcatenatedMultiSourcesExamplesIterable simply chains the input iterables.
    It doesn't require the examples iterables to always yield the same columns.
    Instead, this is handled by the `IterableDataset` class or `TypedExamplesIterable`.

    For information, `IterableDataset` merges the features of all the datasets to concatenate into one.
    We use `IterableDataset._resolve_features` to obtain the features of all the datasets to concatenate.

    Then for each example, `IterableDataset` and `TypedExamplesIterable` automatically fill missing columns with None.
    This is done with `_apply_feature_types_on_example`.
    r   c                     || _         d S r%   r   rU   r   s     r0   rV   z;VerticallyConcatenatedMultiSourcesExamplesIterable.__init__      (r;   c              #   .   K   | j         D ]
}|E d {V  d S r%   r   )rU   r   s     r0   rY   z;VerticallyConcatenatedMultiSourcesExamplesIterable.__iter__  s=      , 	# 	#K"""""""""	# 	#r;   rk   r#   c                     t                    }t          | j                  }|                    |           fd|D             }t	          |          S )zTShuffle the list of examples iterable, as well as each underlying examples iterable.c                 :    g | ]}|                               S r5   r   r   s     r0   r?   z[VerticallyConcatenatedMultiSourcesExamplesIterable.shuffle_data_sources.<locals>.<listcomp>#  s'    ddd88CCdddr;   )r   rv   r   shuffler   )rU   rk   r   r   s    `  r0   rn   zGVerticallyConcatenatedMultiSourcesExamplesIterable.shuffle_data_sources  s\     y!!D-..L!!!ddddWcdddA,OOOr;   c                 >    t          d | j        D                       S )Nc              3   $   K   | ]}|j         V  d S r%   r   r   s     r0   r   zNVerticallyConcatenatedMultiSourcesExamplesIterable.n_shards.<locals>.<genexpr>(  r   r;   r   rX   s    r0   rw   z;VerticallyConcatenatedMultiSourcesExamplesIterable.n_shards&  r   r;   ro   rp   c                 H    t          fd| j        D                       S )rr   c                 <    g | ]}|                               S r5   r   r   s     r0   r?   zYVerticallyConcatenatedMultiSourcesExamplesIterable.shard_data_sources.<locals>.<listcomp>/  r   r;   )r   r   rs   s    ``r0   rt   zEVerticallyConcatenatedMultiSourcesExamplesIterable.shard_data_sources*  s7     BcccccQUQbccc
 
 	
r;   Nra   rb   rc   rd   r   rf   rV   rY   rz   r{   r|   rn   r~   r}   rw   rt   r5   r;   r0   r   r     s        
 
)T*?%@ ) ) ) )# # #P,P	=P P P P N# N N N XN

+.
	=
 
 
 
 
 
r;   r   r   c                     t          |           t          d                                 D                       s!fdD             }t          d| d          dS )zBCheck the column names to make sure they don't contain duplicates.c              3   "   K   | ]
}|d k    V  dS )r   Nr5   )r7   counts     r0   r   z&_check_column_names.<locals>.<genexpr>6  s&      88euz888888r;   c                 ,    g | ]}|         d k    |S )r   r5   )r7   r9   counters     r0   r?   z'_check_column_names.<locals>.<listcomp>7  s'    IIIcq8H8Hc8H8H8Hr;   zAThe examples iterables can't have duplicated columns but columns z are duplicated.N)r   r   values
ValueError)r   duplicated_columnsr   s     @r0   _check_column_namesr   3  s~    l##G88w~~'7'788888 
IIIIWIIItPbttt
 
 	

 
r;   c                       e Zd ZdZdee         fdZd Zdej	        j
        dd fdZedefd            Zd	ed
edd fdZdS )4HorizontallyConcatenatedMultiSourcesExamplesIterablea-  
    HorizontallyConcatenatedMultiSourcesExamplesIterable merges examples together for the input list of iterables.
    It also checks that there are no duplicate columns (otherwise we don't know which one to keep).
    This check is done once when yielding the first example.

    However it doesn't fill missing columns with None.
    Instead, this is handled by the `IterableDataset` class or `TypedExamplesIterable`.

    For information, `IterableDataset` merges the features of all the datasets to concatenate into one.
    We use `IterableDataset._resolve_features` to obtain the features of all the datasets to concatenate.

    Then for each example, `IterableDataset` and `TypedExamplesIterable` automatically fill missing columns with None.
    This is done with `_apply_feature_types_on_example`.
    r   c                     || _         d S r%   r   r   s     r0   rV   z=HorizontallyConcatenatedMultiSourcesExamplesIterable.__init__M  r   r;   c              #     K   d | j         D             }t          j                    D ]}g }g }t          |          D ]d}	 t	          |          \  }}|                    |           |                    |           @# t          $ r |                    |           Y aw xY w|ra|dk    rt          d |D                        i }|D ]}|	                    |           d
                    d |D                       }	|	|fV   d S d S )Nc                 ,    g | ]}t          |          S r5   )rK   r   s     r0   r?   zQHorizontallyConcatenatedMultiSourcesExamplesIterable.__iter__.<locals>.<listcomp>Q  s     OOOk[))OOOr;   r   c                     g | ]	}|D ]}|
S r5   r5   )r7   r8   column_names      r0   r?   zQHorizontallyConcatenatedMultiSourcesExamplesIterable.__iter__.<locals>.<listcomp>^  s'    (h(h(h`g(h(hQ\(h(h(h(hr;   _c              3   4   K   | ]}t          |          V  d S r%   r   r7   keys     r0   r   zPHorizontallyConcatenatedMultiSourcesExamplesIterable.__iter__.<locals>.<genexpr>b  (      "<"<3s88"<"<"<"<"<"<r;   )r   	itertoolsr   rv   rJ   appendr_   remover   updatejoin)
rU   ex_iteratorsrH   keysr2   r   r  r8   new_examplenew_keys
             r0   rY   z=HorizontallyConcatenatedMultiSourcesExamplesIterable.__iter__P  sd     OOT=NOOO"" 	 	ADH#L11 5 55#'#4#4LCKK$$$OOG,,,,$ 5 5 5 ''444445 	66'(h(hH(h(h(hiii ' 0 0G&&w////(("<"<t"<"<"<<<{*****'	 	s   <A<<BBrk   r#   c                     | S )z^Doesn't shuffle the wrapped examples iterable since it would break the alignment between them.r5   rm   s     r0   rn   zIHorizontallyConcatenatedMultiSourcesExamplesIterable.shuffle_data_sourcesg  s	     r;   c                     dS Nr   r5   rX   s    r0   rw   z=HorizontallyConcatenatedMultiSourcesExamplesIterable.n_shardsm  s    qr;   ro   rp   c                 H    t          fd| j        D                       S )rr   c                 <    g | ]}|                               S r5   r   r   s     r0   r?   z[HorizontallyConcatenatedMultiSourcesExamplesIterable.shard_data_sources.<locals>.<listcomp>v  r   r;   )r   r   rs   s    ``r0   rt   zGHorizontallyConcatenatedMultiSourcesExamplesIterable.shard_data_sourcesq  s7     DcccccQUQbccc
 
 	
r;   Nr   r5   r;   r0   r   r   =  s         )T*?%@ ) ) ) )  .,	?    #    X

+.
	?
 
 
 
 
 
r;   r   c                       e Zd Z	 	 ddej        j        deee                  dee	         f fdZ
e	 	 ddej        j        d	ed
eee                  dee         fd            Zd Zdej        j        dd fdZdededd fdZ xZS )+RandomlyCyclingMultiSourcesExamplesIterableNr   rk   probabilitiesr   c                     t                                          ||           t          |          | _        || _        d S r%   )r   rV   r   rk   r  )rU   r   rk   r  r   r   s        r0   rV   z4RandomlyCyclingMultiSourcesExamplesIterable.__init__{  s=     	'8999!),,*r;     r   num_sourcespr#   c              #      K   |*	 d |                      d||          D             E d{V  )	 d |                     |||          D             E d{V  ))z]Get an infinite iterator that randomly samples the index of the source to pick examples from.NTc              3   4   K   | ]}t          |          V  d S r%   r}   r7   rH   s     r0   r   zSRandomlyCyclingMultiSourcesExamplesIterable._iter_random_indices.<locals>.<genexpr>  (      aaqCFFaaaaaar;   r   sizec              3   4   K   | ]}t          |          V  d S r%   r  r  s     r0   r   zSRandomlyCyclingMultiSourcesExamplesIterable._iter_random_indices.<locals>.<genexpr>  r  r;   )r  r  )integerschoice)r   r  random_batch_sizer  s       r0   _iter_random_indicesz@RandomlyCyclingMultiSourcesExamplesIterable._iter_random_indices  s       9baaCLLKN_L,`,`aaaaaaaaaabbaaCJJ{IZ^_J,`,`aaaaaaaaaabr;   c                     t          | j                  }|                     |t          | j                  | j                  S )N)r  )r   rk   r$  rI   r   r  )rU   r   s     r0   r   zARandomlyCyclingMultiSourcesExamplesIterable._give_indice_iterator  s:    t~&&((c$2C.D.DHZ([[[r;   c                 X    fd| j         D             }t          || j                  S )z;Shuffle the data sources of each wrapped examples iterable.c                 :    g | ]}|                               S r5   r   r   s     r0   r?   zTRandomlyCyclingMultiSourcesExamplesIterable.shuffle_data_sources.<locals>.<listcomp>  r   r;   )rk   r  )r   r  r  r   s    ` r0   rn   z@RandomlyCyclingMultiSourcesExamplesIterable.shuffle_data_sources  sA    iiiiW[Whiii:IT=O
 
 
 	
r;   ro   rp   c                 l    t          fd| j        D             | j        | j        | j                  S )rr   c                 <    g | ]}|                               S r5   r   r   s     r0   r?   zRRandomlyCyclingMultiSourcesExamplesIterable.shard_data_sources.<locals>.<listcomp>  r   r;   )r  r   rk   r  r   rs   s    ``r0   rt   z>RandomlyCyclingMultiSourcesExamplesIterable.shard_data_sources  sE    :cccccQUQbcccN"	
 
 	
r;   )Nr   )r  N)ra   rb   rc   rz   r{   r|   r   r   floatr   rV   staticmethodr}   r   r$  r   rn   rt   r   r   s   @r0   r  r  z  sJ       
 04+<	+ 	+ 9&	+  U,		+
 $C=	+ 	+ 	+ 	+ 	+ 	+  #'	b bY bb DK 	b
 
#b b b \b\ \ \

bi.A 
Fs 
 
 
 

C 
c 
Fs 
 
 
 
 
 
 
 
r;   r  c                       e Zd Z	 	 	 	 	 	 	 ddedededeee                  ded	ee	         d
edeee                  dee
         fdZd Zdej        j        dd fdZde	de	dd fdZede	fd            ZdS )MappedExamplesIterableFNr  r   functionwith_indicesinput_columnsbatched
batch_sizedrop_last_batchremove_columns	fn_kwargsc
                     || _         || _        || _        || _        || _        || _        || _        || _        |	pi | _        d S r%   )	r   r.  r1  r2  r3  r4  r/  r0  r5  )
rU   r   r.  r/  r0  r1  r2  r3  r4  r5  s
             r0   rV   zMappedExamplesIterable.__init__  sP     ' $.,(*"br;   c              #     K   t          | j                  }d| j        r|D ]\  }}| j        | j        dk    r|nt	          || j        dz
            }||fgd |D             z   }t          | \  }}| j        r-| j        &| j        dk    rt          |          | j        k     r d S t          |          }|| j	        gnfd| j	        D             }	| j
        r;|	                    fdt          t          |                    D                        t          |                               | j        |	i | j                   | j        r| j        D ]}
|
= rht%          t                              fdD             }|r;t'          d| dfd	|D              d
 dt                              d	          d                    d |D                       }t+          t-                              D ]\  }}||fV  |dz   z  d S |D ]\  }}t          |          }|| j	        gnfd| j	        D             }	| j
        r|	                               t          |          }|                     | j        |	i | j                   | j        r| j        D ]}
||
= ||fV  dz  d S )Nr   r   c                     g | ]	\  }}||f
S r5   r5   r7   r  r8   s      r0   r?   z3MappedExamplesIterable.__iter__.<locals>.<listcomp>  !    7j7j7j<3PWg7j7j7jr;   c                      g | ]
}|         S r5   r5   r7   r9   inputss     r0   r?   z3MappedExamplesIterable.__iter__.<locals>.<listcomp>      LwLwLw]`VTW[LwLwLwr;   c                     g | ]}|z   S r5   r5   r7   rH   current_idxs     r0   r?   z3MappedExamplesIterable.__iter__.<locals>.<listcomp>      )a)a)aa+/)a)a)ar;   c                 l    g | ]0}t          |                   t                             k    .|1S r5   rI   )r7   r9   	first_coltransformed_batchs     r0   r?   z3MappedExamplesIterable.__iter__.<locals>.<listcomp>  sK          0566#>OPY>Z:[:[[[ [[[r;   z!Column lengths mismatch: columns z have length c                 :    g | ]}t          |                   S r5   rD  )r7   r9   rF  s     r0   r?   z3MappedExamplesIterable.__iter__.<locals>.<listcomp>  sB      XI  XI  XIx{X[\mnq\rXsXs  XI  XI  XIr;   z while z has length .r   c              3   4   K   | ]}t          |          V  d S r%   r   r  s     r0   r   z2MappedExamplesIterable.__iter__.<locals>.<genexpr>  r  r;   c                      g | ]
}|         S r5   r5   r<  s     r0   r?   z3MappedExamplesIterable.__iter__.<locals>.<listcomp>  r>  r;   )rK   r   r1  r2  r   rA   r3  rI   rD   r0  r/  r  rL   r@   r  r.  r5  r4  rJ   r   r  	enumeraterO   )rU   iteratorr  r8   iterator_batchkey_examples_listr
  r2   r!   function_argsr   bad_colsr  	batch_idxtransformed_examplerA  rE  r=  rF  s                  @@@@r0   rY   zMappedExamplesIterable.__iter__  s9     ())< ?	! ( ,- ,-W .$/Q2F2F H$/A*=>> 
 '*7^$47j7j[i7j7j7j$j!!$&7!8h(3!++H77FF*844,0,>,FLwLwLwLwdhdvLwLwLw$ c!(()a)a)a)a5M^I_I_C`C`)a)a)abbb$(KK!!(()X)X)XYYY& 1!0 1 1-a00$ 
 $T*;%<%< = =I         #4     H
   ( M  M  M  XI  XI  XI  XI  @H  XI  XI  XI  M  M  R[  M  M  il  m~  H  mI  iJ  iJ  M  M  M   (("<"<t"<"<"<<<*34FGX4Y4Y*Z*Z + +&Iw!7*****y1},Y,- ,-\ !) ! !W w-- ,0,>,FLwLwLwLwdhdvLwLwLw$ 6!((555&*7mm##**=4=-+Z4>+Z+Z[[[& 3!0 3 3/22.....q !! !r;   rk   r#   c           
          t          | j                            |          | j        | j        | j        | j        | j        | j        | j	                  S )&Shuffle the wrapped examples iterable.r.  r/  r0  r1  r2  r4  r5  )
r-  r   rn   r.  r/  r0  r1  r2  r4  r5  rm   s     r0   rn   z+MappedExamplesIterable.shuffle_data_sources  sS    %11)<<]*,L.n	
 	
 	
 		
r;   ro   rp   c           
          t          | j                            ||          | j        | j        | j        | j        | j        | j        | j	                  S )r   rU  )
r-  r   rt   r.  r/  r0  r1  r2  r4  r5  rs   s      r0   rt   z)MappedExamplesIterable.shard_data_sources  sU    %//	;GG]*,L.n	
 	
 	
 		
r;   c                     | j         j        S r%   r   rX   s    r0   rw   zMappedExamplesIterable.n_shards   r   r;   )FNFr  FNN)ra   rb   rc   rf   r	   boolr   r   r   r}   r@   rV   rY   rz   r{   r|   rn   rt   r~   rw   r5   r;   r0   r-  r-    sB       
 #-1$( %.2$() )*) ) 	)
  S	*) ) SM) ) !c+) D>) ) ) ),B! B! B!H
bi.A 
F^ 
 
 
 

C 
c 
F^ 
 
 
 
 )# ) ) ) X) ) )r;   r-  c                       e Zd Z	 	 	 	 ddedededeee                  ded	ee	         fd
Z
d Zdee	         dd fdZde	de	dd fdZede	fd            ZdS )FilteredExamplesIterableFNr  r   r.  r/  r0  r1  r2  c                 Z    || _         || _        || _        || _        || _        || _        d S r%   )r   r.  r1  r2  r/  r0  )rU   r   r.  r/  r0  r1  r2  s          r0   rV   z!FilteredExamplesIterable.__init__&  s7     ' $(*r;   c           	   #     K   t          | j                  }d| j        r|D ]\  }}| j        | j        dk    r|nt	          || j        dz
            }||fgd |D             z   }t          | \  }}t          |          }|| j        gnfd| j        D             }	| j        r;|		                    fdt          t          |                    D                         | j        |	 }
t          t          ||
                    D ]\  }\  }}|r|V  |dz   z  d S |D ]d\  }}t          |          | j        gnfd| j        D             }	| j        r|		                                | j        |	 }|r||fV  dz  ed S )Nr   r   c                     g | ]	\  }}||f
S r5   r5   r9  s      r0   r?   z5FilteredExamplesIterable.__iter__.<locals>.<listcomp>A  r:  r;   c                      g | ]
}|         S r5   r5   r<  s     r0   r?   z5FilteredExamplesIterable.__iter__.<locals>.<listcomp>F  r>  r;   c                     g | ]}|z   S r5   r5   r@  s     r0   r?   z5FilteredExamplesIterable.__iter__.<locals>.<listcomp>H  rB  r;   c                      g | ]
}|         S r5   r5   r<  s     r0   r?   z5FilteredExamplesIterable.__iter__.<locals>.<listcomp>S  r>  r;   )rK   r   r1  r2  r   rA   rD   r0  r/  r  rL   rI   r.  rK  r@   )rU   rL  r  r8   rM  rN  r
  r2   r!   rO  maskrQ  key_exampleto_keeprA  r=  s                 @@r0   rY   z!FilteredExamplesIterable.__iter__6  s8     ())<  	! ( - -W .$/Q2F2F H$/A*=>> 
 '*7^$47j7j[i7j7j7j$j!!$&7!8h*844,0,>,FLwLwLwLwdhdvLwLwLw$ c!(()a)a)a)a5M^I_I_C`C`)a)a)abbb$t}m49B3GXZ^C_C_9`9` * *5I5W *))))y1},)- -, !) 	! 	!Wg,0,>,FLwLwLwLwdhdvLwLwLw$ 6!((555'$-7 'w,&&&q 	! 	!r;   seedr#   c                     t          | j                            |          | j        | j        | j        | j        | j                  S )rT  r.  r/  r0  r1  r2  )rZ  r   rn   r.  r/  r0  r1  r2  )rU   rd  s     r0   rn   z-FilteredExamplesIterable.shuffle_data_sources[  sH    '11$77]*,L
 
 
 	
r;   ro   rp   c                     t          | j                            ||          | j        | j        | j        | j        | j                  S )r   rf  )rZ  r   rt   r.  r/  r0  r1  r2  rs   s      r0   rt   z+FilteredExamplesIterable.shard_data_sourcesf  sJ    '//	;GG]*,L
 
 
 	
r;   c                     | j         j        S r%   r   rX   s    r0   rw   z!FilteredExamplesIterable.n_shardsq  r   r;   )FNFr  )ra   rb   rc   rf   r	   rX  r   r   r   r}   rV   rY   rn   rt   r~   rw   r5   r;   r0   rZ  rZ  %  s
       
 #-1$(+ +*+ + 	+
  S	*+ + SM+ + + + #! #! #!J	
# 	
;U 	
 	
 	
 	
	
C 	
c 	
F` 	
 	
 	
 	
 )# ) ) ) X) ) )r;   rZ  c            	           e Zd Zdededej        j        fdZe	ddej        j        dede
e         fd            Zd	 Zdej        j        dd fd
Zdededd fdZedefd            ZdS )BufferShuffledExamplesIterabler   buffer_sizerk   c                 0    || _         || _        || _        d S r%   )r   rk  rk   )rU   r   rk  rk   s       r0   rV   z'BufferShuffledExamplesIterable.__init__w  s    &&"r;   r  r   r#   c              #   Z   K   	 d |                      d||          D             E d {V  ))NTc              3   4   K   | ]}t          |          V  d S r%   r  r  s     r0   r   zFBufferShuffledExamplesIterable._iter_random_indices.<locals>.<genexpr>  s(      ]]1A]]]]]]r;   r   r  )r!  )r   rk  r#  s      r0   r$  z3BufferShuffledExamplesIterable._iter_random_indices|  sK      	^]]QJ[(\(\]]]]]]]]]]	^r;   c              #   L  K   | j         }t          | j                  }|                     ||          }g }| j        D ]I}t          |          |k    rt          |          }||         V  |||<   4|                    |           J|                    |           |E d {V  d S r%   )	rk  r   rk   r$  r   rI   rJ   r  r   )rU   rk  r   r   
mem_bufferxrH   s          r0   rY   z'BufferShuffledExamplesIterable.__iter__  s      &t~&&44S+FF
! 	% 	%A:+--)** m### !
1!!!$$$$Jr;   c                 `    t          | j                            |          | j        |          S )zFShuffle the wrapped examples iterable as well as the shuffling buffer.rk  rk   )rj  r   rn   rk  rm   s     r0   rn   z3BufferShuffledExamplesIterable.shuffle_data_sources  s5    -11)<<$JZfo
 
 
 	
r;   ro   rp   c                 l    t          | j                            ||          | j        | j                  S )r   rs  )rj  r   rt   rk  rk   rs   s      r0   rt   z1BufferShuffledExamplesIterable.shard_data_sources  s:    -//	;GG(n
 
 
 	
r;   c                     | j         j        S r%   r   rX   s    r0   rw   z'BufferShuffledExamplesIterable.n_shards  r   r;   N)r  )ra   rb   rc   rf   r}   rz   r{   r|   rV   r+  r   r$  rY   rn   rt   r~   rw   r5   r;   r0   rj  rj  v  s       #$9 # #XZXaXk # # # #
 ^ ^")"5 ^C ^dlmpdq ^ ^ ^ \^  "
bi.A 
Ff 
 
 
 

C 
c 
Ff 
 
 
 
 )# ) ) ) X) ) )r;   rj  c                   d    e Zd ZdedefdZd Zdej        j	        dd fdZ
edefd            Zd	S )
SkipExamplesIterabler   nc                 "    || _         || _        d S r%   r   rx  rU   r   rx  s      r0   rV   zSkipExamplesIterable.__init__      &r;   c              #   L   K   t          | j        | j        d           E d {V  d S r%   r   r   rx  rX   s    r0   rY   zSkipExamplesIterable.__iter__  s5      $*DFD99999999999r;   rk   r#   c                     | S )zeDoesn't shuffle the wrapped examples iterable since it would skip examples from other shards instead.r5   rm   s     r0   rn   z)SkipExamplesIterable.shuffle_data_sources      r;   c                     | j         j        S r%   r   rX   s    r0   rw   zSkipExamplesIterable.n_shards  r   r;   N)ra   rb   rc   rf   r}   rV   rY   rz   r{   r|   rn   r~   rw   r5   r;   r0   rw  rw    s        $9 c    : : :bi.A F\     )# ) ) ) X) ) )r;   rw  c                       e Zd ZdedefdZd Zdej        j	        dd fdZ
ed             Zd	ed
edd fdZedefd            ZdS )TakeExamplesIterabler   rx  c                 "    || _         || _        d S r%   rz  r{  s      r0   rV   zTakeExamplesIterable.__init__  r|  r;   c              #   J   K   t          | j        | j                  E d {V  d S r%   r~  rX   s    r0   rY   zTakeExamplesIterable.__iter__  s3      $*DF33333333333r;   rk   r#   c                     | S )zeDoesn't shuffle the wrapped examples iterable since it would take examples from other shards instead.r5   rm   s     r0   rn   z)TakeExamplesIterable.shuffle_data_sources  r  r;   c                 j    | |z  }| |z  }|g|z  }t          |          D ]}||xx         dz  cc<   |S r  )rL   )numrx  quotient	remainderr\   rH   s         r0   split_numberz!TakeExamplesIterable.split_number  sR    !8!G	ay!! 	 	A1IIINIIIIr;   ro   rp   c                     t          | j                            ||          |                     | j        |          |                   S )r   )rx  )r  r   rt   r  rx  rs   s      r0   rt   z'TakeExamplesIterable.shard_data_sources  sI    #//	;GG44Y?
 
 
 	
r;   c                     | j         j        S r%   r   rX   s    r0   rw   zTakeExamplesIterable.n_shards  r   r;   N)ra   rb   rc   rf   r}   rV   rY   rz   r{   r|   rn   r+  r  rt   r~   rw   r5   r;   r0   r  r    s        $9 c    4 4 4bi.A F\       \
C 
c 
F\ 
 
 
 
 )# ) ) ) X) ) )r;   r  r8   featurestoken_per_repo_idc                     t          |           } |D ]}|| vrd | |<   |                    |           }|                    ||          }|S Nr  )r@   encode_exampledecode_example)r8   r  r  r   encoded_exampledecoded_examples         r0   _apply_feature_types_on_exampler    sf     7mmG ( (g%%#'GK --g66O--oQb-ccOr;   c                     t          |           } t          | t          t          |                                        }|D ]}|| vr	d g|z  | |<   |                    |           }|                    ||          }|S r  )r@   rI   rJ   rK   encode_batchdecode_batch)r!   r  r  rN   r   encoded_batchdecoded_batchs          r0   _apply_feature_types_on_batchr    s     KKEU4U,,-..J 5 5e##"&*!4E+))%00M))-K\)]]Mr;   c                       e Zd Zdededeeeeedf         f         fdZ	d Z
dej        j        dd fd	Zd
ededd fdZedefd            ZdS )TypedExamplesIterabler   r  r  Nc                 0    || _         || _        || _        d S r%   )r   r  r  )rU   r   r  r  s       r0   rV   zTypedExamplesIterable.__init__  s!     ' !2r;   c              #   d   K   | j         D ]%\  }}|t          || j        | j                  fV  &d S r  )r   r  r  r  )rU   r  r8   s      r0   rY   zTypedExamplesIterable.__iter__  sb       !, 	 	LC6$:P       	 	r;   rk   r#   c                 j    t          | j                            |          | j        | j                  S )rT  r  r  )r  r   rn   r  r  rm   s     r0   rn   z*TypedExamplesIterable.shuffle_data_sources	  s8    $11)<<]"4
 
 
 	
r;   ro   rp   c                 l    t          | j                            ||          | j        | j                  S )r   r  )r  r   rt   r  r  rs   s      r0   rt   z(TypedExamplesIterable.shard_data_sources  s:    $//	;GG]"4
 
 
 	
r;   c                     | j         j        S r%   r   rX   s    r0   rw   zTypedExamplesIterable.n_shards  r   r;   )ra   rb   rc   rf   r   r
   r   r   rX  rV   rY   rz   r{   r|   rn   r}   rt   r~   rw   r5   r;   r0   r  r    s        3*3 3  U3d?%; ;<	3 3 3 3  
bi.A 
F] 
 
 
 

C 
c 
F] 
 
 
 
 )# ) ) ) X) ) )r;   r  c                       fd}|S )Nc               ;      K   t                      } di | D ]G\  }}|                    |          }t          t          |                    D ]\  }}| d| |fV  Hd S )Nr   r5   )r   format_batchrK  rO   )r   python_formatterr  tabler!   rH   r8   generate_tables_fns          r0   wrapperz7_generate_examples_from_tables_wrapper.<locals>.wrapper  s      *,,,,66v66 	, 	,JC$11%88E'(:5(A(ABB , ,
7llqllG+++++,	, 	,r;   r5   )r  r  s   ` r0   &_generate_examples_from_tables_wrapperr    s#    , , , , , Nr;   c                   H    e Zd ZU ej        j        ed<   dZee	         ed<   dS )ShufflingConfigrk   N_original_seed)
ra   rb   rc   rz   r{   r|   __annotations__r  r   r}   r5   r;   r0   r  r  )  s7         y""""$(NHSM(((((r;   r  c                   $    e Zd ZU eed<   eed<   dS )DistributedConfigrank
world_sizeN)ra   rb   rc   r}   r  r5   r;   r0   r  r  /  s"         
IIIOOOOOr;   r  c                     t           j        r>ddl}|j        j        j        | j        vr$| xj        |j        j        j        fz  c_        dS dS dS )zNAdd torch.utils.data.IterableDataset as a parent class if 'torch' is availabler   N)r   TORCH_AVAILABLEtorch.utils.datautilsdataIterableDataset	__bases__)clstorchs     r0   ._maybe_add_torch_iterable_dataset_parent_classr  5  sc     A;+3=@@MMek.>@@MMMM	A A A@r;   c                   ,   e Zd ZdZ	 	 	 	 	 	 dBdedee         dee         dee         dee	         dee
         d	eeeeeedf         f                  fd
Zd Zd ZdCdZd Zedefd            ZdefdZd ZdefdZd ZdDdedefdZe	 	 dEdedee         dee         dd fd            Z	 dFdee         dd fdZ 	 	 	 	 	 	 	 	 	 dGd!ee         d"ed#eeee!e         f                  d$edee         ded%eeee!e         f                  dee         d&ee         dd fd'Z"	 	 	 	 	 dHd!ee         d#eeee!e         f                  d$edee         dd f
d(Z#	 dIdee$j%        j&                 d)edd fd*Z'd+efd,Z(dJd-Z)dJd.Z*edee!e                  fd/            Z+d0ed1ee,e$j-        f         dd fd2Z.d3ed4edd fd5Z/d6eeef         dd fd7Z0d8eee!e         f         dd fd9Z1d8eee!e         f         dd fd:Z2d1ed;e3dd fd<Z4dedd fd=Z5d>ed?edd fd@Z6dA Z7dS )Kr  z A Dataset backed by an iterable.Nr   infosplitformat_type	shufflingdistributedr  c                 F   |r#|j         dk    r|r|j        t          d          ||                                nt	                      }t          j        | ||           || _        || _        || _	        || _
        d| _        |pi | _        t          | j                   d S )Nr   zThe dataset doesn't have a fixed random seed across nodes to shuffle and split the list of dataset shards by node. Please pass e.g. `seed=42` in `.shuffle()` to make all the nodes use the same seed. )r  r  r   )r  r  RuntimeErrorcopyr   r   rV   _ex_iterable_format_type
_shuffling_distributed_epoch_token_per_repo_idr  r   )rU   r   r  r  r  r  r  r  s           r0   rV   zIterableDataset.__init__A  s      	;1A55)5	H`Hhg  
 #.tyy{{{KMM!$T????''#'EVE\Z\6t~FFFFFr;   c                     | j         S r%   )__dict__rX   s    r0   __getstate__zIterableDataset.__getstate__\  s
    }r;   c                 <    || _         t          | j                   d S r%   )r  r  r   )rU   ds     r0   __setstate__zIterableDataset.__setstate___  s    6t~FFFFFr;      c                 `    t          t          |                     |                              S r%   )rD   rv   take)rU   rx  s     r0   _headzIterableDataset._headd  s"    !$tyy||"4"4555r;   c                 ,   | j         r| j        dk    r| j         j        S | j         rat          | j         j                                      dd          | j        z
  }|dk     rd|z   n|}t
          j                            |          S t          d          )Nr   l            zThis dataset is not shuffled)	r  r  rk   r   r!  rz   r{   default_rngr   )rU   effective_seeds     r0   _effective_generatorz$IterableDataset._effective_generatorg  s    ? 	=t{a//?,,_ 	=%do&?@@II!WUUX\XccN;IA;M;Mg77SaN9((888;<<<r;   r#   c                     | j         r6| j        j        | j         j        z  dk    r| j        j        | j         j        z  S | j        j        S )Nr   )r  r  rw   r  rX   s    r0   rw   zIterableDataset.n_shardsr  sM     	N!2!;d>O>Z!Z^_!_!_$-1B1MMM ))r;   c           
   #      K   t                       dd l}|j        j                                        }|                                 rv|j        |j        k     rft          	                    d|j         d|j         d|j        |j        z
   d           t          	                    d|j         d|j         d           | j
        rd	| j
        j         d
nd}| j                            |j        |j                  }|rt                              | d|j         dt!          |           d|j         d           |                    |j        |j                  D ]/\  }}| j        rt'          || j        | j                  V  +|V  0t                              | d|j         dt!          |           d|j         d           d S t                              | d|j         d|j         d|j         d           d S )Nr   zToo many dataloader workers: z (max is dataset.n_shards=z). Stopping z dataloader workers.zTo parallelize data loading, we give each process some shards (or data sources) to process. Therefore it's unnecessary to have a number of workers greater than dataset.n_shards=zJ. To enable more parallelism, please split the dataset in more files than rH  znode#  zdataloader worker#z, ': Starting to iterate over /z shards.r  z, ': Finished iterating over z9, ': Stopping... Number of dataset shards < num_workers (<z).)r   r  r  r  get_worker_info_is_main_processrw   rp   loggerwarningr  r  r  rx   iddebugrI   rt   r  r  r  )rU   r   r  worker_info_log_prefixshards_indicesr  r8   s           r0   _iter_pytorchzIterableDataset._iter_pytorchx  s     k&6688  "" 		{';k>U'U'UNNa0G a acncw a a'3k6JJa a a   NNshsh|s s[f[os s s   <@;LT7d/47777RT*HHYdYpqq 	LL  U  U+.  U  U`cdr`s`s  U  U  wB  wK  U  U  U   !, > >{~{Of g g " "W= "9$BY       "MMMMLL  T  T+.  T  T_bcq_r_r  T  T  vA  vJ  T  T  T     LL  n  n+.  n  n  |G  |P  n  n  S^  Sj  n  n  n    r;   c                     | j         r| j         j        dk    rdS dt          j        v r1dd l}|j        j                                        }||j        dk    rdS dS )Nr   Fr  T)	r  r  sysmodulesr  r  r  r  r  )rU   r  r  s      r0   r  z IterableDataset._is_main_process  sm     	!2!7!!;!;5ck!!####+*::<<K&;>A+=+=utr;   c           	      t   | j         r-| j                            |                                           }n| j        }| j        r| j        j        }| j        j        }|j        |z  dk    rc|                                 r8|j        |z  }|dk    rdnd}t          
                    d| d| d| d           |                    ||          }nl|                                 rFt          
                    d	| d
           t                              d| d|j         d|            t          |||          }|S )Nr   r   sr  z
Assigning z shardz (or data sourcez) of the dataset to each node.zAssigning 1 out of zS examples of the dataset to each node. The others are skipped during the iteration.zIt is more optimized to distribute the dataset shards (or data sources) across nodes. You can do that by using a dataset with number of shards that is a factor of world_size=z. The current dataset has z which is not a factor of r   )r  r  rn   r  r  r  r  rw   r  r  r  rt   r  r   )rU   r   r  r  n_shards_per_nodeplurals         r0   "_prepare_ex_iterable_for_iterationz2IterableDataset._prepare_ex_iterable_for_iteration  s   ? 	,+@@AZAZA\A\]]KK+K 	^$)D*5J#j0A55((** (3(<
(J%$5$9$9SSrFNN|%6||f||V\|||   *<<T:NN((** NN Nj  N  N  N   KKps}p p3>3Gp pcmp p  
 3;ZX\]]]r;   c              #   r  K   |                                  }dt          j        v r`dd l}|j        j                                        }t          | |j        j        j                  r|| 	                    |          E d {V  d S |D ]/\  }}| j
        rt          || j
        | j                  V  +|V  0d S )Nr  r   r  )r  r  r  r  r  r  r  
isinstancer  r  r  r  r  )rU   r   r  r  r  r8   s         r0   rY   zIterableDataset.__iter__  s      ==??ck!!####+*::<<K$ 0 @AA kF]--k:::::::::' 	 	LC}  6T]d>U       	 	r;   Fr2  r3  c              #   @  K   t          |                                           }|D ]w\  }}|gd t          ||dz
            D             z   }|rt          |          |k     r dS t	          |          }| j        rt          || j        | j                  V  s|V  xdS )a  Iterate through the batches of size `batch_size`.

        Args:
            batch_size (:obj:`int`): size of each batch to yield.
            drop_last_batch (:obj:`bool`, default `False`): Whether a last batch smaller than the batch_size should be
                dropped
        c                     g | ]\  }}|S r5   r5   r9  s      r0   r?   z(IterableDataset.iter.<locals>.<listcomp>  s    #a#a#aWG#a#a#ar;   r   Nr  )rK   r  r   rI   rD   r  r  r  )rU   r2  r3  rL  r  r8   r2   r!   s           r0   rK   zIterableDataset.iter  s       ??AABB$ 	 	LCy#a#axQ[^_Q_@`@`#a#a#aaH 3x==:#=#=&x00E}  4E4=\`\sttttttt	 	r;   rk   r  
gen_kwargsc                 P    ddl m}  || ||d                                          S )a  Create an Iterable Dataset from a generator.

        Args:
            generator (`Callable`):
                A generator function that `yields` examples.
            features (`Features`, *optional*):
                Dataset features.
            gen_kwargs(`dict`, *optional*):
                Keyword arguments to be passed to the `generator` callable.
                You can define a sharded iterable dataset by passing the list of shards in `gen_kwargs`.
                This can be used to improve shuffling and when iterating over the dataset with multiple workers.

        Returns:
            `IterableDataset`

        Example:

        ```py
        >>> def gen():
        ...     yield {"text": "Good", "label": 0}
        ...     yield {"text": "Bad", "label": 1}
        ...
        >>> ds = IterableDataset.from_generator(gen)
        ```

        ```py
        >>> def gen(shards):
        ...     for shard in shards:
        ...         with open(shard) as f:
        ...             for line in f:
        ...                 yield {"line": line}
        ...
        >>> shards = [f"data{i}.txt" for i in range(32)]
        >>> ds = IterableDataset.from_generator(gen, gen_kwargs={"shards": shards})
        >>> ds = ds.shuffle(seed=42, buffer_size=10_000)  # shuffles the shards order + uses a shuffle buffer
        >>> from torch.utils.data import DataLoader
        >>> dataloader = DataLoader(ds.with_format("torch"), num_workers=4)  # give each worker a subset of 32/4=8 shards
        ```
        r   )GeneratorDatasetInputStreamT)rk   r  r  	streaming)io.generatorr  read)rk   r  r  r  s       r0   from_generatorzIterableDataset.from_generator  sH    Z 	>=====**!	
 
 

 $&&	r;   r*   c           
          t          |          }t          | j        | j                                        | j        |t	          j        | j                  t	          j        | j                  | j	                  S )aG  
        Return a dataset with the specified format.
        This method only supports the "torch" format for now.

        Args:

            type (`str`, optional, default None): if set to "torch", the returned dataset
                will be a subclass of torch.utils.data.IterableDataset to be used in a DataLoader
        r   r  r  r  r  r  r  )
r   r  r  _infor  _splitr   r  r  r  )rU   r*   s     r0   with_formatzIterableDataset.with_format+  sl     *$//
 )""+mDO44d&788"5
 
 
 	
r;   r  r.  r/  r0  r1  r4  r5  c
                    t          |t                    r|g}t          |t                    r|g}|d }|	i }	t          | j        j        &t          | j        | j        j        | j                  n| j        ||||||||		  	        }
| j        	                                }||_        t          |
|| j        | j        t          j        | j                  t          j        | j                  | j                  S )a  
        Apply a function to all the examples in the iterable dataset (individually or in batches) and update them.
        If your function returns a column that already exists, then it overwrites it.
        The function is applied on-the-fly on the examples when iterating over the dataset.

        You can specify whether the function should be batched or not with the `batched` parameter:

        - If batched is `False`, then the function takes 1 example in and should return 1 example.
          An example is a dictionary, e.g. `{"text": "Hello there !"}`.
        - If batched is `True` and `batch_size` is 1, then the function takes a batch of 1 example as input and can return a batch with 1 or more examples.
          A batch is a dictionary, e.g. a batch of 1 example is {"text": ["Hello there !"]}.
        - If batched is `True` and `batch_size` is `n` > 1, then the function takes a batch of `n` examples as input and can return a batch with `n` examples, or with an arbitrary number of examples.
          Note that the last batch may have less than `n` examples.
          A batch is a dictionary, e.g. a batch of `n` examples is `{"text": ["Hello there !"] * n}`.

        Args:
            function (`Callable`, *optional*, defaults to `None`):
                Function applied on-the-fly on the examples when you iterate on the dataset.
                It must have one of the following signatures:

                - `function(example: Dict[str, Any]) -> Dict[str, Any]` if `batched=False` and `with_indices=False`
                - `function(example: Dict[str, Any], idx: int) -> Dict[str, Any]` if `batched=False` and `with_indices=True`
                - `function(batch: Dict[str, List]) -> Dict[str, List]` if `batched=True` and `with_indices=False`
                - `function(batch: Dict[str, List], indices: List[int]) -> Dict[str, List]` if `batched=True` and `with_indices=True`

                For advanced usage, the function can also return a `pyarrow.Table`.
                Moreover if your function returns nothing (`None`), then `map` will run your function and return the dataset unchanged.
                If no function is provided, default to identity function: `lambda x: x`.
            with_indices (`bool`, defaults to `False`):
                Provide example indices to `function`. Note that in this case the signature of `function` should be `def function(example, idx[, rank]): ...`.
            input_columns (`Optional[Union[str, List[str]]]`, defaults to `None`):
                The columns to be passed into `function`
                as positional arguments. If `None`, a dict mapping to all formatted columns is passed as one argument.
            batched (`bool`, defaults to `False`):
                Provide batch of examples to `function`.
            batch_size (`int`, *optional*, defaults to `1000`):
                Number of examples per batch provided to `function` if `batched=True`.
                `batch_size <= 0` or `batch_size == None` then provide the full dataset as a single batch to `function`.
            drop_last_batch (`bool`, defaults to `False`):
                Whether a last batch smaller than the batch_size should be
                dropped instead of being processed by the function.
            remove_columns (`[List[str]]`, *optional*, defaults to `None`):
                Remove a selection of columns while doing the mapping.
                Columns will be removed before updating the examples with the output of `function`, i.e. if `function` is adding
                columns with names in `remove_columns`, these columns will be kept.
            features (`[Features]`, *optional*, defaults to `None`):
                Feature types of the resulting dataset.
            fn_kwargs (`Dict`, *optional*, default `None`):
                Keyword arguments to be passed to `function`.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="train", streaming=True)
        >>> def add_prefix(example):
        ...     example["text"] = "Review: " + example["text"]
        ...     return example
        >>> ds = ds.map(add_prefix)
        >>> list(ds.take(3))
        [{'label': 1,
         'text': 'Review: the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'},
         {'label': 1,
         'text': 'Review: the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth .'},
         {'label': 1, 'text': 'Review: effective but too-tepid biopic'}]
        ```
        Nc                     | S r%   r5   )rq  s    r0   <lambda>z%IterableDataset.map.<locals>.<lambda>  s     r;   r  )r.  r/  r0  r1  r2  r3  r4  r5  r  )r  r   r-  r  r  r  r  r  r  r  r  r  r  r   r  r  )rU   r.  r/  r0  r1  r2  r3  r4  r  r5  r   r  s               r0   mapzIterableDataset.mapG  s   ^ mS)) 	,*OMnc** 	.,-N"{HI,z". "$"3TZ5H\`\stttt"%'!+)
 
 
 y~~ #+)mDO44d&788"5
 
 
 	
r;   c           
         t          |t                    r|g}t          j        | j                  }d|_        t          | j        j        &t          | j        | j        j        | j	                  n| j        |||||          }t          ||| j        | j        t          j        | j                  t          j        | j                  | j	                  S )a  Apply a filter function to all the elements so that the dataset only includes examples according to the filter function.
        The filtering is done on-the-fly when iterating over the dataset.

        Args:
            function (`Callable`):
                Callable with one of the following signatures:

                - `function(example: Dict[str, Any]) -> bool` if `with_indices=False, batched=False`
                - `function(example: Dict[str, Any], indices: int) -> bool` if `with_indices=True, batched=False`
                - `function(example: Dict[str, List]) -> List[bool]` if `with_indices=False, batched=True`
                - `function(example: Dict[str, List], indices: List[int]) -> List[bool]` if `with_indices=True, batched=True`

                If no function is provided, defaults to an always True function: `lambda x: True`.
            with_indices (`bool`, defaults to `False`):
                Provide example indices to `function`. Note that in this case the signature of `function` should be `def function(example, idx): ...`.
            input_columns (`str` or `List[str]`, *optional*):
                The columns to be passed into `function` as
                positional arguments. If `None`, a dict mapping to all formatted columns is passed as one argument.
            batched (`bool`, defaults to `False`):
                Provide batch of examples to `function`.
            batch_size (`int`, *optional*, default `1000`):
                Number of examples per batch provided to `function` if `batched=True`.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="train", streaming=True)
        >>> ds = ds.filter(lambda x: x["label"] == 0)
        >>> list(ds.take(3))
        [{'label': 0, 'movie_review': 'simplistic , silly and tedious .'},
         {'label': 0,
         'movie_review': "it's so laddish and juvenile , only teenage boys could possibly find it funny ."},
         {'label': 0,
         'movie_review': 'exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable .'}]
        ```
        Nr  rf  r  )r  r   r  r   r  r  rZ  r  r  r  r  r  r  r  r  )rU   r.  r/  r0  r1  r2  r  r   s           r0   filterzIterableDataset.filter  s    Z mS)) 	,*OM }TZ(( /z". "$"3TZ5H\`\stttt"%'!	
 	
 	
 #+)mDO44d&788"5
 
 
 	
r;   rk  c           
      |   | t           j                            |          }nt          |          }t	          ||          }t          t          | j        ||                              |          | j	        
                                | j        | j        |t          j        | j                  | j                  S )aC  
        Randomly shuffles the elements of this dataset.

        This dataset fills a buffer with `buffer_size` elements, then randomly samples elements from this buffer,
        replacing the selected elements with new elements. For perfect shuffling, a buffer size greater than or
        equal to the full size of the dataset is required.

        For instance, if your dataset contains 10,000 elements but `buffer_size` is set to 1000, then `shuffle` will
        initially select a random element from only the first 1000 elements in the buffer. Once an element is
        selected, its space in the buffer is replaced by the next (i.e. 1,001-st) element,
        maintaining the 1000 element buffer.

        If the dataset is made of several shards, it also does shuffle the order of the shards.
        However if the order has been fixed by using [`~datasets.IterableDataset.skip`] or [`~datasets.IterableDataset.take`]
        then the order of the shards is kept unchanged.

        Args:
            seed (`int`, *optional*, defaults to `None`):
                Random seed that will be used to shuffle the dataset.
                It is used to sample from the shuffle buffe and also to shuffle the data shards.
            generator (`numpy.random.Generator`, *optional*):
                Numpy random Generator to use to compute the permutation of the dataset rows.
                If `generator=None` (default), uses `np.random.default_rng` (the default BitGenerator (PCG64) of NumPy).
            buffer_size (`int`, defaults to `1000`):
                Size of the buffer.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="train", streaming=True)
        >>> list(ds.take(3))
        [{'label': 1,
         'text': 'the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'},
         {'label': 1,
         'text': 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth .'},
         {'label': 1, 'text': 'effective but too-tepid biopic'}]
        >>> shuffled_ds = ds.shuffle(seed=42)
        >>> list(shuffled_ds.take(3))
        [{'label': 1,
         'text': "a sports movie with action that's exciting on the field and a story you care about off it ."},
         {'label': 1,
         'text': 'at its best , the good girl is a refreshingly adult take on adultery . . .'},
         {'label': 1,
         'text': "sam jones became a very lucky filmmaker the day wilco got dropped from their record label , proving that one man's ruin may be another's fortune ."}]
        ```
        N)rk   r  rs  r  )rz   r{   r  r   r  r  rj  r  rn   r  r  r  r  r  r  )rU   rd  rk   rk  r  s        r0   r   zIterableDataset.shuffle   s    d 	--d33II ++I#iMMM	6!{i  ""9--""+)d&788"5

 

 

 
	
r;   epochc                     || _         d S r%   )r  )rU   r  s     r0   	set_epochzIterableDataset.set_epochC  s    r;   c           
          t          | j        |          }t          || j                                        | j        | j        t	          j        | j                  t	          j        | j	                  | j
                  S )a  
        Create a new [`IterableDataset`] that skips the first `n` elements.

        Args:
            n (`int`):
                Number of elements to skip.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="train", streaming=True)
        >>> list(ds.take(3))
        [{'label': 1,
         'text': 'the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'},
         {'label': 1,
         'text': 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth .'},
         {'label': 1, 'text': 'effective but too-tepid biopic'}]
        >>> ds = ds.skip(1)
        >>> list(ds.take(3))
        [{'label': 1,
         'text': 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth .'},
         {'label': 1, 'text': 'effective but too-tepid biopic'},
         {'label': 1,
         'text': 'if you sometimes like to go to the movies to have fun , wasabi is a good place to start .'}]
        ```
        r  )rw  r  r  r  r  r  r  r   r  r  r  rU   rx  r   s      r0   skipzIterableDataset.skipF  so    8 +4+<a@@#""+)mDO44d&788"5
 
 
 	
r;   c           
          t          | j        |          }t          || j                                        | j        | j        t	          j        | j                  t	          j        | j	                  | j
                  S )ay  
        Create a new [`IterableDataset`] with only the first `n` elements.

        Args:
            n (`int`):
                Number of elements to take.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="train", streaming=True)
        >>> small_ds = ds.take(2)
        >>> list(small_ds)
        [{'label': 1,
         'text': 'the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'},
         {'label': 1,
         'text': 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth .'}]
        ```
        r  )r  r  r  r  r  r  r  r   r  r  r  r  s      r0   r  zIterableDataset.takem  so    * +4+<a@@#""+)mDO44d&788"5
 
 
 	
r;   c                 t    | j         j        +t          | j         j                                                  ndS )a  Names of the columns in the dataset.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="validation", streaming=True)
        >>> ds.column_names
        ['text', 'label']
        ```
        N)r  r  rv   r
  rX   s    r0   r   zIterableDataset.column_names  s3     48:3F3RtDJ',,..///X\\r;   namecolumnc                 @    fd}|                      |d          S )zAdd column to Dataset.

        Args:
            name (str): Column name.
            column (list or np.array): Column data to be added.

        Returns:
            `IterableDataset`
        c                 L    | v rt          d d d          |         iS )NzError when adding 	: column  is already in the dataset.r   )r8   r   r  r  s     r0   add_column_fnz1IterableDataset.add_column.<locals>.add_column_fn  s=    w !fd!f!fT!f!f!fggg&+&&r;   T)r/  )r  )rU   r  r  r  s    `` r0   
add_columnzIterableDataset.add_column  s:    	' 	' 	' 	' 	' 	'
 xxDx999r;   original_column_namenew_column_namec                    fd}| j         j        r| j         j                                        nd}|                     |g          }|8t	          fd|                                D                       |j         _        |S )ax  
        Rename a column in the dataset, and move the features associated to the original column under the new column
        name.

        Args:
            original_column_name (`str`):
                Name of the column to rename.
            new_column_name (`str`):
                New name for the column.

        Returns:
            `IterableDataset`: A copy of the dataset with a renamed column.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="train", streaming=True)
        >>> next(iter(ds))
        {'label': 1,
         'text': 'the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}
        >>> ds = ds.rename_column("text", "movie_review")
        >>> next(iter(ds))
        {'label': 1,
         'movie_review': 'the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}
        ```
        c           	          | vrt          d d d d          | v rt          d d d d          |          iS )NError when renaming  to r  z is not in the dataset.r  r  )r8   r  r  s    r0   rename_column_fnz7IterableDataset.rename_column.<locals>.rename_column_fn  s    #722  M+?  M  M_  M  M_s  M  M  M   '))  L+?  L  L_  L  L_n  L  L  L   $W-A%BCCr;   Nr4  c                 ,    i | ]\  }}|k    rn||S r5   r5   )r7   r9   featurer  r  s      r0   r:   z1IterableDataset.rename_column.<locals>.<dictcomp>  s>       $W (+.B'B'BOOW  r;   )r  r  r  r  r   rM   )rU   r  r  r#  original_featuresds_iterables    ``   r0   rename_columnzIterableDataset.rename_column  s    :		D 		D 		D 		D 		D 		D ;?*:MWDJ/44666SWhh/AU@VhWW()1    (9(?(?(A(A  * *K& r;   column_mappingc                 $   fd}| j         j        r| j         j                                        nd}|                     |t	                              }|7t          fd|                                D                       |j         _        |S )aa  
        Rename several columns in the dataset, and move the features associated to the original columns under
        the new column names.

        Args:
            column_mapping (`Dict[str, str]`): A mapping of columns to rename to their new names

        Returns:
            `IterableDataset`: A copy of the dataset with renamed columns
        c                     t           fdD                       rbt          dt                     dt                                                     dt	                    t	                     z
   d          t           fd                                D                       rtt          dt                     dt                                                     dt	                     t	                                                    z
   d           fd                                D             S )	Nc              3       K   | ]}|vV  	d S r%   r5   r7   r9   r8   s     r0   r   zLIterableDataset.rename_columns.<locals>.rename_columns_fn.<locals>.<genexpr>  s(      @@#3g%@@@@@@r;   r!  r"  z
: columns z are not in the dataset.c              3       K   | ]}|v V  	d S r%   r5   r.  s     r0   r   zLIterableDataset.rename_columns.<locals>.rename_columns_fn.<locals>.<genexpr>  s'      EEc3'>EEEEEEr;   z are already in the dataset.c                 (    i | ]\  }}||         S r5   r5   )r7   r  r  r8   s      r0   r:   zMIterableDataset.rename_columns.<locals>.rename_columns_fn.<locals>.<dictcomp>  s6       9(/  )=!>  r;   )r   r   rv   r   setrM   )r8   r*  s   `r0   rename_columns_fnz9IterableDataset.rename_columns.<locals>.rename_columns_fn  s   @@@@@@@@@   k4+?+?  k  kT.J_J_JaJaEbEb  k  knq  sA  oB  oB  EH  IP  EQ  EQ  oQ  k  k  k   EEEE^-B-B-D-DEEEEE   x4+?+?  x  xT.J_J_JaJaEbEb  x  xnqrynznz  ~A  BP  BW  BW  BY  BY  ~Z  ~Z  oZ  x  x  x     =K=Q=Q=S=S   r;   Nr$  c                 X    i | ]&\  }}|                                 v r|         n||'S r5   )r
  )r7   r9   r&  r*  s      r0   r:   z2IterableDataset.rename_columns.<locals>.<dictcomp>  sP       $W ,/.2E2E2G2G+G+GN3''SRY  r;   )r  r  r  r  rv   r   rM   )rU   r*  r2  r'  r(  s    `   r0   rename_columnszIterableDataset.rename_columns  s    	 	 	 	 	 ;?*:MWDJ/44666SWhh0nAUAUhVV()1   (9(?(?(A(A  * *K& r;   r   c                     | j         j        r| j         j                                        nd}|                     |          }|I|                                |j         _        |                                D ]\  }}||v r|j         j        |= |S )aD  
        Remove one or several column(s) in the dataset and the features associated to them.
        The removal is done on-the-fly on the examples when iterating over the dataset.


        Args:
            column_names (`Union[str, List[str]]`):
                Name of the column(s) to remove.

        Returns:
            `IterableDataset`: A copy of the dataset object without the columns to remove.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="train", streaming=True)
        >>> next(iter(ds))
        {'text': 'the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 'label': 1}
        >>> ds = ds.remove_columns("label")
        >>> next(iter(ds))
        {'text': 'the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}
        ```
        Nr$  )r  r  r  r  rM   )rU   r   r'  r(  r9   r   s         r0   r4  zIterableDataset.remove_columns  s    2 ;?*:MWDJ/44666SWhhlh;;():)?)?)A)AK&+1133 8 8Q,&&#)237r;   c           
         t          |t                    r|g}| j        rt          j        | j                  | j        j        r|D ]O}|| j        j        vr?t          d| dt          | j        j                                                   d          Pt          fd|D                       _        t          | j        |          }t          || j        | j        | j        | j        | j                  S )a<  Select one or several column(s) in the dataset and the features
        associated to them. The selection is done on-the-fly on the examples
        when iterating over the dataset.


        Args:
            column_names (`Union[str, List[str]]`):
                Name of the column(s) to select.

        Returns:
            `IterableDataset`: A copy of the dataset object with selected columns.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="train", streaming=True)
        >>> next(iter(ds))
        {'text': 'the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 'label': 1}
        >>> ds = ds.select_columns("text")
        >>> next(iter(ds))
        {'text': 'the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}
        ```
        NzColumn name z- not in the dataset. Columns in the dataset: rH  c                 ,    i | ]}|j         |         S r5   r  )r7   r   r  s     r0   r:   z2IterableDataset.select_columns.<locals>.<dictcomp>N  s"    )T)T)T!!T]1-=)T)T)Tr;   r  )r  r   r  r  r   r  r   rv   r
  r   r   r  r  r  r  r  r  r  )rU   r   r   r   r  s       @r0   select_columnszIterableDataset.select_columns(  s.   2 lC(( 	*(>L: 
	V=,,Dz".#/  K"$**===(C; C C#DJ$7$<$<$>$>??C C C   > !))T)T)T)T|)T)T)T U U+D,=|LL#+)o)"5
 
 
 	
r;   r&  c           
      R   | j                                         }||j        |<   	 |                                 n# t          $ r
 d|_        Y nw xY wt          | j        || j        | j        t          j	        | j
                  t          j	        | j                  | j                  S )a  Cast column to feature for decoding.

        Args:
            column (`str`):
                Column name.
            feature (`Feature`):
                Target feature.

        Returns:
            `IterableDataset`

        Example:

        ```py
        >>> from datasets import load_dataset, Audio
        >>> ds = load_dataset("PolyAI/minds14", name="en-US", split="train", streaming=True)
        >>> ds.features
        {'audio': Audio(sampling_rate=8000, mono=True, decode=True, id=None),
         'english_transcription': Value(dtype='string', id=None),
         'intent_class': ClassLabel(num_classes=14, names=['abroad', 'address', 'app_error', 'atm_limit', 'balance', 'business_loan',  'card_issues', 'cash_deposit', 'direct_debit', 'freeze', 'high_value_payment', 'joint_account', 'latest_transactions', 'pay_bill'], id=None),
         'lang_id': ClassLabel(num_classes=14, names=['cs-CZ', 'de-DE', 'en-AU', 'en-GB', 'en-US', 'es-ES', 'fr-FR', 'it-IT', 'ko-KR',  'nl-NL', 'pl-PL', 'pt-PT', 'ru-RU', 'zh-CN'], id=None),
         'path': Value(dtype='string', id=None),
         'transcription': Value(dtype='string', id=None)}
        >>> ds = ds.cast_column("audio", Audio(sampling_rate=16000))
        >>> ds.features
        {'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None),
         'english_transcription': Value(dtype='string', id=None),
         'intent_class': ClassLabel(num_classes=14, names=['abroad', 'address', 'app_error', 'atm_limit', 'balance', 'business_loan',  'card_issues', 'cash_deposit', 'direct_debit', 'freeze', 'high_value_payment', 'joint_account', 'latest_transactions', 'pay_bill'], id=None),
         'lang_id': ClassLabel(num_classes=14, names=['cs-CZ', 'de-DE', 'en-AU', 'en-GB', 'en-US', 'es-ES', 'fr-FR', 'it-IT', 'ko-KR',  'nl-NL', 'pl-PL', 'pt-PT', 'ru-RU', 'zh-CN'], id=None),
         'path': Value(dtype='string', id=None),
         'transcription': Value(dtype='string', id=None)}
        ```
        Nr  r  r  r  r   task_templatesr  r  r  r  r   r  r  r  )rU   r  r&  r  s       r0   cast_columnzIterableDataset.cast_column[  s    D z   'f	'IIKKKK 	' 	' 	'"&D	')+)mDO44d&788"5
 
 
 	
s   : AAc           
      L   | j                                         }||_        	 |                                 n# t          $ r
 d|_        Y nw xY wt          | j        || j        | j        t          j	        | j
                  t          j	        | j                  | j                  S )a  
        Cast the dataset to a new set of features.

        Args:
            features ([`Features`]):
                New features to cast the dataset to.
                The name of the fields in the features must match the current column names.
                The type of the data must also be convertible from one type to the other.
                For non-trivial conversion, e.g. `string` <-> `ClassLabel` you should use [`~Dataset.map`] to update the Dataset.

        Returns:
            `IterableDataset`: A copy of the dataset with casted features.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="train", streaming=True)
        >>> ds.features
        {'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None),
         'text': Value(dtype='string', id=None)}
        >>> new_features = ds.features.copy()
        >>> new_features["label"] = ClassLabel(names=["bad", "good"])
        >>> new_features["text"] = Value("large_string")
        >>> ds = ds.cast(new_features)
        >>> ds.features
        {'label': ClassLabel(num_classes=2, names=['bad', 'good'], id=None),
         'text': Value(dtype='large_string', id=None)}
        ```
        Nr  r;  rU   r  r  s      r0   castzIterableDataset.cast  s    D z   	'IIKKKK 	' 	' 	'"&D	')+)mDO44d&788"5
 
 
 	
s   7 A
Ar   r   c           
         t          | j        ||          }t          || j                                        | j        | j        t	          j        | j                  t	          j        | j	                  | j
                  S )Nr   r  )r   r  r  r  r  r  r  r   r  r  r  )rU   r   r   r   s       r0   _stepzIterableDataset._step  ss    *4+<4PVWWW#""+)mDO44d&788"5
 
 
 	
r;   c           
         | j         | S t          | j        t                    r| j        j         }n!t	          |                                           }| j                                        }||_         t          | j        || j	        | j
        t          j        | j                  t          j        | j                  | j                  S )Nr  )r  r  r  r  r1   r  r  r  r  r  r  r   r  r  r  r?  s      r0   _resolve_featuresz!IterableDataset._resolve_features  s    =$K)+@AA 	@(1HH1$**,,??Hy~~ )+)mDO44d&788"5
 
 
 	
r;   )NNNNNN)r  )F)NNr%   )	NFNFr  FNNN)NFNFr  )NNr  )r#   r  )8ra   rb   rc   rd   rf   r   r   r   r   r  r  r
   r   rX  rV   r  r  r  r  r~   r}   rw   r  r  r  rY   rK   r+  r	   r   r@   r  r  r   r  r
  rz   r{   r|   r   r  r  r  r   rv   rG   r  r)  r4  r4  r9  r   r=  r@  rB  rD  r5   r;   r0   r  r  >  s       **
 '+&*%)/337IMG G*G {#G 
#	G
 c]G O,G /0G $DeCtO.D)D$EFG G G G6  G G G
6 6 6 6	= 	= 	= *# * * * X*
%)> % % % %N	 	 	4I    >  , s T    ,  (,%)3 338$3 TN3 
	3 3 3 \3n #
 
sm
 

 
 
 
< (,"9=$( %:>'+$(n
 n
8$n
 n
  c49n 56	n

 n
 SMn
 n
 !sDI~!67n
 8$n
 D>n
 
n
 n
 n
 n
d (,9=$(G
 G
8$G
  c49n 56	G

 G
 SMG
 
G
 G
 G
 G
T ^bA
 A
$,RY-@$AA
WZA
	A
 A
 A
 A
Fs    %
 %
 %
 %
N
 
 
 
@ ]htCy1 ] ] ] X]:s :E$.,A :FW : : : :$1# 1 1Pa 1 1 1 1f#T#s(^ #@Q # # # #J 5d3i+@  EV        D1
5d3i+@ 1
EV 1
 1
 1
 1
f1
# 1
 1
@Q 1
 1
 1
 1
f1
1
 
1
 1
 1
 1
f

# 

s 

/@ 

 

 

 


 
 
 
 
r;   r  dsetsr  r  axisc                    d | D             } |dk    rt          d | D                        nt          d | D                        t          d t          d | D                       D                       }d | D             }|dk    rt	          |          }nt          |          }|t          j        d	 | D                       }n|                                }||_	        d
 | D             }t          ||||          S )a  
    Converts a list of `IterableDataset` with the same schema into a single `IterableDataset`.
    Missing data are filled with None values.

    <Added version="2.4.0"/>

    Args:
        dsets (`List[datasets.IterableDataset]`): List of Datasets to concatenate.
        info (`DatasetInfo`, optional): Dataset information, like description, citation, etc.
        split (`NamedSplit`, optional): Name of the dataset split.
        axis (``{0, 1}``, default ``0``, meaning over rows):
            Axis to concatenate over, where ``0`` means over rows (vertically) and ``1`` means over columns
            (horizontally).

            *New in version 1.6.0*

    Example:

    ```py
    >>> ds3 = _concatenate_iterable_datasets([ds1, ds2])
    ```
    c                 6    g | ]}|                                 S r5   rD  r7   r  s     r0   r?   z2_concatenate_iterable_datasets.<locals>.<listcomp>  s$    222qQ  ""222r;   r   c                     g | ]	}|j         
S r5   r8  r7   dsets     r0   r?   z2_concatenate_iterable_datasets.<locals>.<listcomp>  s    *K*K*KT4=*K*K*Kr;   c                 &    g | ]}|j         D ]}|S r5   r8  )r7   rM  col_names      r0   r?   z2_concatenate_iterable_datasets.<locals>.<listcomp>  s'    VVV$VVHXVVVVr;   c                 H    i | ]}|                                 D ]\  }}||	 S r5   rM   r7   r  kvs       r0   r:   z2_concatenate_iterable_datasets.<locals>.<dictcomp>  s;    nnn(]e]k]k]m]mnnUYUVXYAnnnnr;   c                     g | ]	}|j         
S r5   r8  rL  s     r0   r?   z2_concatenate_iterable_datasets.<locals>.<listcomp>  s    .O.O.Ot}.O.O.Or;   c                     g | ]	}|j         
S r5   r  rJ  s     r0   r?   z2_concatenate_iterable_datasets.<locals>.<listcomp>  s    222qAN222r;   Nc                     g | ]	}|j         
S r5   r  rJ  s     r0   r?   z2_concatenate_iterable_datasets.<locals>.<listcomp>  s    &=&=&=!qv&=&=&=r;   c                 R    i | ]$}|j                                         D ]\  }}||	%S r5   r  rM   r7   datasetrepo_idtokens       r0   r:   z2_concatenate_iterable_datasets.<locals>.<dictcomp>  s;    vvvGSZSmSsSsSuSuvv%%vvvvr;   r   r  r  r  )r   r   r   r   r   r   r   
from_merger  r  r  )rE  r  r  rF  r  r   r   r  s           r0   _concatenate_iterable_datasetsrb    s:   8 32E222E qyy)*K*KU*K*K*KLLLLVV%VVVWWW nno.O.O.O.O.OPPnnn H 32E222LqyyHVVJ<XX |%&=&=u&=&=&=>>yy{{DMvvuvvv{U^oppppr;   r   datasetsr  rd  r   c                    d | D             } t          d | D                        t          d t          d | D                       D                       }d | D             }|t          ||          }n2t          j                            |          }	t          ||	||          }|t          j	        d	 | D                       }n|
                                }||_        d
 | D             }
t          ||||
          S )a  
    Interleave several iterable datasets (sources) into a single iterable dataset.
    The new iterable dataset alternates between the sources to yield examples.
    If `probabilities = None` (default) the iterable dataset will cycles through the sources in order for each next example in the iteration.
    If `probabilities` is not `None, the iterable dataset will sample a random source according to the provided probabilities for each next examples in the iteration.

    <Added version="2.4.0"/>

    Args:
        datasets (`List[IterableDataset]`): list of datasets to interleave
        probabilities (`List[float]`, optional, default None): If specified, the new iterable dataset samples
            examples from one source at a time according to these probabilities.
        seed (`int`, optional, default None): The random seed used to choose a source for each example.
        stopping_strategy (Optional `str`, defaults to `first_exhausted`):
            Two strategies are proposed right now.
            By default, `first_exhausted` is an undersampling strategy, i.e the dataset construction is stopped as soon as one dataset has ran out of samples.
            If the strategy is `all_exhausted`,  we use an oversampling strategy, i.e the dataset construction is stopped as soon as every samples of every dataset has been added at least once.
            Note that if the strategy is `all_exhausted`, the interleaved dataset size can get enormous:
            - with no probabilities, the resulting dataset will have max_length_datasets*nb_dataset samples.
            - with given probabilities, the resulting dataset will have more samples if some datasets have really low probability of visiting.

    Output:
        `datasets.IterableDataset`
    c                 6    g | ]}|                                 S r5   rI  rJ  s     r0   r?   z1_interleave_iterable_datasets.<locals>.<listcomp>=  s$    888!##%%888r;   c                     g | ]	}|j         
S r5   r8  rL  s     r0   r?   z1_interleave_iterable_datasets.<locals>.<listcomp>@  s    &J&J&Jt}&J&J&Jr;   c                 H    i | ]}|                                 D ]\  }}||	 S r5   rQ  rR  s       r0   r:   z1_interleave_iterable_datasets.<locals>.<dictcomp>E  s;    qqq(`h`n`n`p`pqqX\XY[\Aqqqqr;   c                     g | ]	}|j         
S r5   r8  rL  s     r0   r?   z1_interleave_iterable_datasets.<locals>.<listcomp>E  s    .R.R.Rt}.R.R.Rr;   c                     g | ]	}|j         
S r5   rW  rJ  s     r0   r?   z1_interleave_iterable_datasets.<locals>.<listcomp>H  s    555qAN555r;   Nr   )rk   r  r   c                     g | ]	}|j         
S r5   rY  rJ  s     r0   r?   z1_interleave_iterable_datasets.<locals>.<listcomp>U  s    &@&@&@!qv&@&@&@r;   c                 R    i | ]$}|j                                         D ]\  }}||	%S r5   r[  r\  s       r0   r:   z1_interleave_iterable_datasets.<locals>.<dictcomp>Z  sN       "WE_EeEeEgEg 3A7E   r;   r`  )r   r   r   r   rz   r{   r  r  r   ra  r  r  r  )rc  r  rd  r  r  r   r  r   r   rk   r  s              r0   _interleave_iterable_datasetsrl    sJ   @ 98x888H &&J&J&J&J&JKKK qqo.R.R.R.R.RSSqqq H 65H555L 9,ZklllI))$//	AI]^o
 
 

 |%&@&@x&@&@&@AAyy{{DM &.   {U^oppppr;   r]  r  r  c           	      $   | j         r!|| j         j        z  }|| j         j        z  |z   }t          ||          }t	          | j        | j                                        | j        | j	        t          j
        | j                  || j                  S )a  
    Split an iterable dataset for the node at rank `rank` in a pool of nodes of size `world_size`.

    If the dataset has a number of shards that is a factor of `world_size` (i.e. if `dataset.n_shards % world_size == 0`),
    then the shards are evenly assigned across the nodes, which is the most optimized.
    Otherwise, each node keeps 1 example out of `world_size`, skipping the other examples.

    Args:
        dataset ([`IterableDataset`]):
            The iterable dataset to split by node.
        rank (`int`):
            Rank of the current node.
        world_size (`int`):
            Total number of nodes.

    Returns:
        [`IterableDataset`]: The iterable dataset to be used on the node at rank `rank`.
    )r  r  r  )r  r  r  r  r  r  r  r  r  r  r   r  r  )r]  r  r  r  s       r0   _split_by_node_iterable_datasetrn  a  s    &  ='"6"AA
G055<#*EEEK(]!!n(- 233!4   r;   r%   )NNr   )NNNNr   )Xr  r  r  collectionsr   r   dataclassesr   r   r   typingr   r	   r
   r   r   r   r   numpyrz   pyarrowr&   r  r   arrow_datasetr   r  r   features.featuresr   r   r   filesystemsr   
formattingr   r   r  r   splitsr   r  r   utils.loggingr   utils.shardingr   r   r   r    ra   r  r   rv   r1   rD   rO   rQ   rf   r   r   r   r   r   r   r   r   r  r-  rZ  rj  rw  r  r@   rX  r  r  r  r  r  r  r  r  r}   rb  r*  rl  rn  r5   r;   r0   <module>r{     s        



             ! ! ! ! ! ! # # # # # # # # G G G G G G G G G G G G G G G G G G               + + + + + +       ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ + + + + + + C C C C C C C C                   % % % % % % v v v v v v v v v v v v 
H		7 7d39o 7XhEW 7ck 7 7 7 7#d38n!5 #$sDy/ # # # #>d39o >$tCH~2F > > > >    h   :R R R R R R R R4< < < < <, < < <.
 
 
 
 
*: 
 
 
() ) ) ) )1 ) ) )() ) ) ) )0 ) ) )<:
 :
 :
 :
 :
*? :
 :
 :
z(
 (
 (
 (
 (
9N (
 (
 (
V
d3i 
 
 
 
:
 :
 :
 :
 :
;P :
 :
 :
z.
 .
 .
 .
 .
2U .
 .
 .
bw) w) w) w) w)2 w) w) w)tN) N) N) N) N)4 N) N) N)b,) ,) ,) ,) ,)%: ,) ,) ,)^) ) ) ) )0 ) ) )") ) ) ) )0 ) ) )B%:>sE#tUY/DZ?Z:[	   #8<S%TSWBX=X8Y	    %) %) %) %) %)1 %) %) %)P   ) ) ) ) ) ) ) )
        
A A A`
 `
 `
 `
 `
& `
 `
 `
J #'"&	9q 9q 9q
;
9q J9q 	9q
 9q 9q 9q 9q| ,0"&"&'8Aq Aq?#AqDK(Aq 3-Aq ;
	Aq
 JAq  }Aq Aq Aq Aq AqH_ C UX ]l      r;   